## description

- `TripType` : a categorical id representing the type of shopping trip the customer made. This is the ground truth that you are predicting. TripType_999 is an "other" category.

- `VisitNumber` : an id corresponding to a single trip by a single customer

- `Weekday` : the weekday of the trip

- `Upc` : the UPC number of the product purchased

- `ScanCount` : the number of the given item that was purchased. A negative value indicates a product return.

- `DepartmentDescription` : a high-level description of the item's department

- `FinelineNumber` : a more refined category for each of the products, created by Walmart

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
data = pd.read_csv('../data/train_v1.csv', index_col = 0)

In [3]:
data.head()

Unnamed: 0,TripType,VisitNumber,Weekday,ScanCount,DepartmentDescription,FinelineNumber,standard_upc,company_code,product_code
0,999,5,Friday,-1,FINANCIAL SERVICES,1000.0,681131529297,681131,52929
1,30,7,Friday,1,SHOES,8931.0,605388159809,605388,15980
2,30,7,Friday,1,PERSONAL CARE,4504.0,74108110992,74108,11099
3,26,8,Friday,2,PAINT AND ACCESSORIES,3565.0,22384035102,22384,3510
4,26,8,Friday,2,PAINT AND ACCESSORIES,1017.0,20066137441,20066,13744


In [4]:
print("VIsitNumber_len", len(data['VisitNumber'].unique()))
print("ScanCount_len", len(data['ScanCount'].unique()))
print("DepartmentDescription_len", len(data['DepartmentDescription'].unique()))
print("FinelineNumber_len", len(data['FinelineNumber'].unique()))
print("standard_upc_len", len(data['standard_upc'].unique()))
print("company_code_len", len(data['company_code'].unique()))
print("product_code_len", len(data['product_code'].unique()))


VIsitNumber_len 95674
ScanCount_len 39
DepartmentDescription_len 69
FinelineNumber_len 5196
standard_upc_len 97715
company_code_len 5917
product_code_len 54285


In [5]:
def convert_weekday(weekday):
    if weekday == "Saturday" or weekday == "Sunday":
        return 1
    else:
        return 0
    
data['Weekday'] = data['Weekday'].apply(convert_weekday)

In [6]:
def convert_code(companycode):
    if companycode == "missing":
        return 999
    else:
        return int(companycode)
    
data['company_code'] = data['company_code'].apply(convert_code)
data['standard_upc'] = data['standard_upc'].apply(convert_code)
data['product_code'] = data['product_code'].apply(convert_code)

In [7]:
data['DepartmentDescription'] = data['DepartmentDescription'].fillna(value = 999)
data['FinelineNumber'] = data['FinelineNumber'].fillna(value = 999)

In [8]:
data.head()

Unnamed: 0,TripType,VisitNumber,Weekday,ScanCount,DepartmentDescription,FinelineNumber,standard_upc,company_code,product_code
0,999,5,0,-1,FINANCIAL SERVICES,1000.0,681131529297,681131,52929
1,30,7,0,1,SHOES,8931.0,605388159809,605388,15980
2,30,7,0,1,PERSONAL CARE,4504.0,74108110992,74108,11099
3,26,8,0,2,PAINT AND ACCESSORIES,3565.0,22384035102,22384,3510
4,26,8,0,2,PAINT AND ACCESSORIES,1017.0,20066137441,20066,13744


In [9]:
from sklearn.tree import DecisionTreeClassifier

In [10]:
data[data['FinelineNumber'].isin([999])]

Unnamed: 0,TripType,VisitNumber,Weekday,ScanCount,DepartmentDescription,FinelineNumber,standard_upc,company_code,product_code
25,26,8,0,1,999,999.0,999,999,999
548,27,259,0,3,999,999.0,999,999,999
549,27,259,0,1,999,999.0,999,999,999
959,999,409,0,-1,999,999.0,999,999,999
1116,39,479,0,1,999,999.0,999,999,999
1134,999,484,0,-2,999,999.0,999,999,999
1135,999,484,0,-2,999,999.0,999,999,999
1155,44,496,0,1,PHARMACY RX,999.0,999,999,999
1216,5,521,0,1,PHARMACY RX,999.0,999,999,999
1373,5,585,0,1,PHARMACY RX,999.0,999,999,999


In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [12]:
y = list(data['TripType'].unique())
le.fit(y)
le.classes_

array([  3,   4,   5,   6,   7,   8,   9,  12,  14,  15,  18,  19,  20,
        21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,
        34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44, 999])

In [13]:
tmp = list(data['DepartmentDescription'].unique())
tmp_num = [i for i in range(69)]

In [14]:
def convert_description(description):
    tmp = ['FINANCIAL SERVICES',
 'SHOES',
 'PERSONAL CARE',
 'PAINT AND ACCESSORIES',
 'DSD GROCERY',
 'MEAT - FRESH & FROZEN',
 'DAIRY',
 'PETS AND SUPPLIES',
 'HOUSEHOLD CHEMICALS/SUPP',
 999,
 'IMPULSE MERCHANDISE',
 'PRODUCE',
 'CANDY, TOBACCO, COOKIES',
 'GROCERY DRY GOODS',
 'BOYS WEAR',
 'FABRICS AND CRAFTS',
 'JEWELRY AND SUNGLASSES',
 'MENS WEAR',
 'ACCESSORIES',
 'HOME MANAGEMENT',
 'FROZEN FOODS',
 'SERVICE DELI',
 'INFANT CONSUMABLE HARDLINES',
 'PRE PACKED DELI',
 'COOK AND DINE',
 'PHARMACY OTC',
 'LADIESWEAR',
 'COMM BREAD',
 'BAKERY',
 'HOUSEHOLD PAPER GOODS',
 'CELEBRATION',
 'HARDWARE',
 'BEAUTY',
 'AUTOMOTIVE',
 'BOOKS AND MAGAZINES',
 'SEAFOOD',
 'OFFICE SUPPLIES',
 'LAWN AND GARDEN',
 'SHEER HOSIERY',
 'WIRELESS',
 'BEDDING',
 'BATH AND SHOWER',
 'HORTICULTURE AND ACCESS',
 'HOME DECOR',
 'TOYS',
 'INFANT APPAREL',
 'LADIES SOCKS',
 'PLUS AND MATERNITY',
 'ELECTRONICS',
 'GIRLS WEAR, 4-6X  AND 7-14',
 'BRAS & SHAPEWEAR',
 'LIQUOR,WINE,BEER',
 'SLEEPWEAR/FOUNDATIONS',
 'CAMERAS AND SUPPLIES',
 'SPORTING GOODS',
 'PLAYERS AND ELECTRONICS',
 'PHARMACY RX',
 'MENSWEAR',
 'OPTICAL - FRAMES',
 'SWIMWEAR/OUTERWEAR',
 'OTHER DEPARTMENTS',
 'MEDIA AND GAMING',
 'FURNITURE',
 'OPTICAL - LENSES',
 'SEASONAL',
 'LARGE HOUSEHOLD GOODS',
 '1-HR PHOTO',
 'CONCEPT STORES',
 'HEALTH AND BEAUTY AIDS']
    
    return tmp.index(description)

data['DepartmentDescription'] = data['DepartmentDescription'].apply(convert_description)

In [15]:
print("VIsitNumber_len", len(data['VisitNumber'].unique()))
print("ScanCount_len", len(data['ScanCount'].unique()))
print("DepartmentDescription_len", len(data['DepartmentDescription'].unique()))
print("FinelineNumber_len", len(data['FinelineNumber'].unique()))
print("standard_upc_len", len(data['standard_upc'].unique()))
print("company_code_len", len(data['company_code'].unique()))
print("product_code_len", len(data['product_code'].unique()))

VIsitNumber_len 95674
ScanCount_len 39
DepartmentDescription_len 69
FinelineNumber_len 5196
standard_upc_len 97715
company_code_len 5917
product_code_len 54187


In [16]:
data['FinelineNumber'] = data['FinelineNumber'].astype(int)

In [85]:
data.head()

Unnamed: 0,TripType,VisitNumber,Weekday,ScanCount,DepartmentDescription,FinelineNumber,company_code,product_code
0,999,5,0,-1,0,0,0,0
1,30,7,0,1,1,1,1,1
2,30,7,0,1,2,2,2,2
3,26,8,0,2,3,3,3,3
4,26,8,0,2,3,4,4,4


In [86]:
data[data['FinelineNumber'].isin([8931])]

Unnamed: 0,TripType,VisitNumber,Weekday,ScanCount,DepartmentDescription,FinelineNumber,company_code,product_code


In [61]:
tmp = data.groupby('TripType')["DepartmentDescription"].apply(set)

In [77]:
for x,y in tmp.items():
    

<zip at 0x112d5d948>

In [78]:
tmp[3]&tmp[4]&tmp[5]&tmp[6]&tmp[7]&tmp[8]&tmp[9]&tmp[12]&tmp[14]&tmp[15]&tmp[18]&tmp[19]&tmp[20]&tmp[21]&tmp[22]&tmp[23]&tmp[24]&tmp[25]&tmp[26]&tmp[27]&tmp[28]&tmp[29]&tmp[30]&tmp[31]&tmp[32]&tmp[33]&tmp[34]&tmp[35]&tmp[36]&tmp[37]&tmp[38]&tmp[39]&tmp[40]&tmp[41]&tmp[42]&tmp[43]&tmp[44]&tmp[999]

{10, 29, 30, 36}

In [81]:
tmp_list[10], tmp_list[29],tmp_list[30],tmp_list[36]

('IMPULSE MERCHANDISE',
 'HOUSEHOLD PAPER GOODS',
 'CELEBRATION',
 'OFFICE SUPPLIES')

In [79]:
tmp_list = ['FINANCIAL SERVICES',
 'SHOES',
 'PERSONAL CARE',
 'PAINT AND ACCESSORIES',
 'DSD GROCERY',
 'MEAT - FRESH & FROZEN',
 'DAIRY',
 'PETS AND SUPPLIES',
 'HOUSEHOLD CHEMICALS/SUPP',
 999,
 'IMPULSE MERCHANDISE',
 'PRODUCE',
 'CANDY, TOBACCO, COOKIES',
 'GROCERY DRY GOODS',
 'BOYS WEAR',
 'FABRICS AND CRAFTS',
 'JEWELRY AND SUNGLASSES',
 'MENS WEAR',
 'ACCESSORIES',
 'HOME MANAGEMENT',
 'FROZEN FOODS',
 'SERVICE DELI',
 'INFANT CONSUMABLE HARDLINES',
 'PRE PACKED DELI',
 'COOK AND DINE',
 'PHARMACY OTC',
 'LADIESWEAR',
 'COMM BREAD',
 'BAKERY',
 'HOUSEHOLD PAPER GOODS',
 'CELEBRATION',
 'HARDWARE',
 'BEAUTY',
 'AUTOMOTIVE',
 'BOOKS AND MAGAZINES',
 'SEAFOOD',
 'OFFICE SUPPLIES',
 'LAWN AND GARDEN',
 'SHEER HOSIERY',
 'WIRELESS',
 'BEDDING',
 'BATH AND SHOWER',
 'HORTICULTURE AND ACCESS',
 'HOME DECOR',
 'TOYS',
 'INFANT APPAREL',
 'LADIES SOCKS',
 'PLUS AND MATERNITY',
 'ELECTRONICS',
 'GIRLS WEAR, 4-6X  AND 7-14',
 'BRAS & SHAPEWEAR',
 'LIQUOR,WINE,BEER',
 'SLEEPWEAR/FOUNDATIONS',
 'CAMERAS AND SUPPLIES',
 'SPORTING GOODS',
 'PLAYERS AND ELECTRONICS',
 'PHARMACY RX',
 'MENSWEAR',
 'OPTICAL - FRAMES',
 'SWIMWEAR/OUTERWEAR',
 'OTHER DEPARTMENTS',
 'MEDIA AND GAMING',
 'FURNITURE',
 'OPTICAL - LENSES',
 'SEASONAL',
 'LARGE HOUSEHOLD GOODS',
 '1-HR PHOTO',
 'CONCEPT STORES',
 'HEALTH AND BEAUTY AIDS']

In [70]:
tmp[3].intersection(tmp[4:])

TypeError: unhashable type: 'set'

In [57]:
data.sort_values(by = ["VisitNumber"], ascending=True)

Unnamed: 0,TripType,VisitNumber,Weekday,ScanCount,DepartmentDescription,FinelineNumber,company_code,product_code
0,999,5,0,-1,0,0,0,0
1,30,7,0,1,1,1,1,1
2,30,7,0,1,2,2,2,2
25,26,8,0,1,9,19,17,22
24,26,8,0,3,7,18,16,21
23,26,8,0,1,3,17,15,20
22,26,8,0,1,3,17,15,19
21,26,8,0,1,8,16,14,18
20,26,8,0,2,3,15,11,17
18,26,8,0,2,6,13,12,12


In [18]:
global tmp
tmp = list(data['company_code'].unique())
def convert_comcode(companycode):
    global tmp
    return tmp.index(companycode)

data['company_code'] = data['company_code'].apply(convert_comcode)

In [23]:
global tmp
tmp = list(data['product_code'].unique())
def convert_comcode(productcode):
    global tmp
    return tmp.index(productcode)

data['product_code'] = data['product_code'].apply(convert_comcode)

In [25]:
global tmp
tmp = list(data['FinelineNumber'].unique())
def convert_comcode(finelinenumber):
    global tmp
    return tmp.index(finelinenumber)

data['FinelineNumber'] = data['FinelineNumber'].apply(convert_comcode)

In [26]:
data.head()

Unnamed: 0,TripType,VisitNumber,Weekday,ScanCount,DepartmentDescription,FinelineNumber,standard_upc,company_code,product_code
0,999,5,0,-1,0,0,681131529297,0,0
1,30,7,0,1,1,1,605388159809,1,1
2,30,7,0,1,2,2,74108110992,2,2
3,26,8,0,2,3,3,22384035102,3,3
4,26,8,0,2,3,4,20066137441,4,4


In [31]:
data = data.drop(columns = ['standard_upc'])

In [32]:
data.head()

Unnamed: 0,TripType,VisitNumber,Weekday,ScanCount,DepartmentDescription,FinelineNumber,company_code,product_code
0,999,5,0,-1,0,0,0,0
1,30,7,0,1,1,1,1,1
2,30,7,0,1,2,2,2,2
3,26,8,0,2,3,3,3,3
4,26,8,0,2,3,4,4,4


In [55]:
data.shape

(647054, 8)

In [54]:
len(data['product_code'].unique())

54187

In [33]:
data.to_csv('../data/train_v2.csv', sep = ',')

In [34]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

In [46]:
data['Weekday'] = data['Weekday'].astype("category")
data['DepartmentDescription'] = data['DepartmentDescription'].astype("category")
data['FinelineNumber'] = data['FinelineNumber'].astype("category")
data['company_code'] = data['company_code'].astype("category")
data['product_code'] = data['product_code'].astype("category")

In [47]:
tmp_X = data.drop(columns = ['TripType', 'VisitNumber', 'ScanCount'])

In [53]:
ohe.transform(tmp_X).toarray().shape

(647054, 65371)

In [None]:
tmp = list(data['product_code'].unique())

def convert_comcode(productcode):
    return tmp.index(prouctcode)

data['product_code'] = data['product_code'].apply(convert_comcode)

In [None]:
ggplot(data = data) +
geom_histogram(mapping = aes( x = ))

In [7]:
tmp_list = list(data['FinelineNumber'].unique())

In [8]:
tmp_list.sort()

In [9]:
tmp_list

[0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 7.0,
 10.0,
 12.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 27.0,
 28.0,
 30.0,
 31.0,
 33.0,
 34.0,
 35.0,
 37.0,
 38.0,
 42.0,
 43.0,
 44.0,
 46.0,
 47.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 58.0,
 59.0,
 60.0,
 61.0,
 63.0,
 65.0,
 66.0,
 68.0,
 70.0,
 71.0,
 72.0,
 73.0,
 74.0,
 75.0,
 77.0,
 82.0,
 83.0,
 84.0,
 85.0,
 86.0,
 87.0,
 89.0,
 90.0,
 91.0,
 94.0,
 96.0,
 101.0,
 103.0,
 104.0,
 105.0,
 107.0,
 108.0,
 109.0,
 110.0,
 111.0,
 112.0,
 113.0,
 114.0,
 115.0,
 116.0,
 117.0,
 118.0,
 119.0,
 121.0,
 123.0,
 125.0,
 127.0,
 128.0,
 130.0,
 132.0,
 134.0,
 135.0,
 138.0,
 139.0,
 140.0,
 142.0,
 143.0,
 145.0,
 149.0,
 150.0,
 151.0,
 153.0,
 155.0,
 157.0,
 160.0,
 164.0,
 165.0,
 166.0,
 167.0,
 170.0,
 172.0,
 175.0,
 177.0,
 179.0,
 185.0,
 188.0,
 192.0,
 201.0,
 202.0,
 203.0,
 204.0,
 205.0,
 206.0,
 208.0,
 209.0,
 210.0,
 211.0,
 212.0,
 213.0,
 214.0,
 215.0,
 219.0,
 22

In [10]:
data['DepartmentDescription'].unique()[2]

'PERSONAL CARE'

In [11]:
tmp_1 = data[data['FinelineNumber'] <= 1000]

In [12]:
data['DepartmentDescription']

0               FINANCIAL SERVICES
1                            SHOES
2                    PERSONAL CARE
3            PAINT AND ACCESSORIES
4            PAINT AND ACCESSORIES
5            PAINT AND ACCESSORIES
6            PAINT AND ACCESSORIES
7            PAINT AND ACCESSORIES
8            PAINT AND ACCESSORIES
9            PAINT AND ACCESSORIES
10                     DSD GROCERY
11           PAINT AND ACCESSORIES
12           MEAT - FRESH & FROZEN
13           PAINT AND ACCESSORIES
14           PAINT AND ACCESSORIES
15           PAINT AND ACCESSORIES
16           PAINT AND ACCESSORIES
17           PAINT AND ACCESSORIES
18                           DAIRY
19               PETS AND SUPPLIES
20           PAINT AND ACCESSORIES
21        HOUSEHOLD CHEMICALS/SUPP
22           PAINT AND ACCESSORIES
23           PAINT AND ACCESSORIES
24               PETS AND SUPPLIES
25                             NaN
26             IMPULSE MERCHANDISE
27                         PRODUCE
28                  

In [13]:
Description_count = data.groupby(['DepartmentDescription']).size().reset_index(name = 'count')

In [14]:
Description_count.sort_values(by = 'count', ascending=False)[:10]

Unnamed: 0,DepartmentDescription,count
24,GROCERY DRY GOODS,70402
17,DSD GROCERY,68332
57,PRODUCE,51115
16,DAIRY,43820
50,PERSONAL CARE,41969
32,IMPULSE MERCHANDISE,28712
30,HOUSEHOLD CHEMICALS/SUPP,24880
52,PHARMACY OTC,23306
21,FROZEN FOODS,21101
31,HOUSEHOLD PAPER GOODS,16274


In [15]:
FinelineNumber = data.groupby(['FinelineNumber']).size().reset_index(name = 'count')

In [16]:
FinelineNumber.sort_values(by = 'count', ascending=False)[:10]

Unnamed: 0,FinelineNumber,count
3368,5501.0,8244
1089,1508.0,5121
130,135.0,4538
653,808.0,4401
0,0.0,3837
5091,9546.0,2997
1031,1407.0,2848
2905,4606.0,2753
110,115.0,2723
186,203.0,2639


In [17]:
Companycode = data.groupby(['company_code']).size().reset_index(name = 'count')
Companycode.sort_values(by = 'count', ascending=False)[:10]

Unnamed: 0,company_code,count
2195,78742,48001
3868,681131,23357
3489,40,21115
3549,605388,16218
827,37000,14031
1313,49000,11890
585,28400,10915
743,33383,6829
788,35000,6229
761,34000,5952
