In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.externals import joblib

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import confusion_matrix

import xgboost
%matplotlib inline

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [3]:
train.shape

(647054, 7)

In [None]:
# http://blog.8thandwalton.com/2014/06/supplier-glossary-fineline/

In [4]:
train.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,30,7,Friday,60538815980,1,SHOES,8931
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017


In [5]:
test.head()

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,1,Friday,72503389714,1,SHOES,3002
1,1,Friday,1707710732,1,DAIRY,1526
2,1,Friday,89470001026,1,DAIRY,1431
3,1,Friday,88491211470,1,GROCERY DRY GOODS,3555
4,2,Friday,2840015224,1,DSD GROCERY,4408


In [None]:
#Space for EDA
#train[pd.isnull(train.FinelineNumber)]

In [6]:
train_y = train.loc[:, ['VisitNumber', 'TripType']]
train_y.drop_duplicates('VisitNumber', inplace=True)

In [7]:
train_y.head() #This will end up being y labels

Unnamed: 0,VisitNumber,TripType
0,5,999
1,7,30
3,8,26
26,9,8
29,10,8


In [8]:
train.drop(['TripType'], axis = 1, inplace = True)

In [61]:
test_y_cols = test.VisitNumber
test_y_cols.drop_duplicates(inplace = True)
#test_y_cols = test_y_cols.values

In [18]:
train.Upc.fillna(-100, inplace=True)
train.DepartmentDescription.fillna('Unknown', inplace=True)
train.FinelineNumber.fillna(-100, inplace=True)

test.Upc.fillna(-100, inplace=True)
test.DepartmentDescription.fillna('Unknown', inplace=True)
test.FinelineNumber.fillna(-100, inplace=True)

In [19]:
train['FinelineNumber'] = train['FinelineNumber'].astype('int')
test['FinelineNumber'] = test['FinelineNumber'].astype('int')

In [20]:
train['DeptItems'] = train.DepartmentDescription +' ' + train.FinelineNumber.astype('str')
test['DeptItems'] = test.DepartmentDescription +' ' + test.FinelineNumber.astype('str')

In [21]:
full_df = pd.concat((train, test))

In [22]:
full_df.head()

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,DeptItems
0,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000,FINANCIAL SERVICES 1000
1,7,Friday,60538815980,1,SHOES,8931,SHOES 8931
2,7,Friday,7410811099,1,PERSONAL CARE,4504,PERSONAL CARE 4504
3,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565,PAINT AND ACCESSORIES 3565
4,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017,PAINT AND ACCESSORIES 1017


In [None]:
#full_df[full_df.VisitNumber==191319] #Useful to check

In [23]:
visit_days = full_df.loc[:,['VisitNumber','Weekday']]

In [24]:
visit_days.drop_duplicates('VisitNumber', inplace = True)

In [None]:
#visit_days.set_index('VisitNumber', inplace = True)
#visit_days.sort_index(inplace = True)
#visit_days.reset_index(inplace = True)

In [25]:
visit_days.sort(columns=['VisitNumber'], inplace=True)
visit_days.head()

Unnamed: 0,VisitNumber,Weekday
0,1,Friday
4,2,Friday
8,3,Friday
10,4,Friday
0,5,Friday


In [26]:
visit_days = pd.concat((visit_days, pd.get_dummies(visit_days.Weekday)), axis = 1)
visit_days.drop(['Weekday'], axis = 1, inplace = True)

In [27]:
visit_days.head()

Unnamed: 0,VisitNumber,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,1,1,0,0,0,0,0,0
4,2,1,0,0,0,0,0,0
8,3,1,0,0,0,0,0,0
10,4,1,0,0,0,0,0,0
0,5,1,0,0,0,0,0,0


In [28]:
full_df.head(10)

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,DeptItems
0,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000,FINANCIAL SERVICES 1000
1,7,Friday,60538815980,1,SHOES,8931,SHOES 8931
2,7,Friday,7410811099,1,PERSONAL CARE,4504,PERSONAL CARE 4504
3,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565,PAINT AND ACCESSORIES 3565
4,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017,PAINT AND ACCESSORIES 1017
5,8,Friday,2006618783,2,PAINT AND ACCESSORIES,1017,PAINT AND ACCESSORIES 1017
6,8,Friday,2006613743,1,PAINT AND ACCESSORIES,1017,PAINT AND ACCESSORIES 1017
7,8,Friday,7004802737,1,PAINT AND ACCESSORIES,2802,PAINT AND ACCESSORIES 2802
8,8,Friday,2238495318,1,PAINT AND ACCESSORIES,4501,PAINT AND ACCESSORIES 4501
9,8,Friday,2238400200,-1,PAINT AND ACCESSORIES,3565,PAINT AND ACCESSORIES 3565


In [29]:
full_df_departments = pd.pivot_table(full_df, values='ScanCount', index='VisitNumber',columns='DepartmentDescription', aggfunc=np.sum)

In [30]:
full_df_departments.fillna(0, inplace=True)
full_df_departments.head()

DepartmentDescription,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,BOOKS AND MAGAZINES,BOYS WEAR,BRAS & SHAPEWEAR,...,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,Unknown,WIRELESS
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
full_df_departments['Totals'] = 0

In [32]:
for department in full_df_departments.columns[:-1]:
    full_df_departments['Totals'] += full_df_departments[department]

In [33]:
full_df_departments.reset_index(inplace = True)

In [34]:
departments_and_time = pd.merge(full_df_departments, visit_days, on='VisitNumber')

In [35]:
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday','Saturday', 'Sunday']

for c in days:
    departments_and_time[c] = departments_and_time[c] * departments_and_time['Totals']

In [36]:
departments_and_time.set_index('VisitNumber', inplace = True)

In [37]:
X = departments_and_time.loc[train_y.VisitNumber,:].values
y = train_y[['TripType']].values

In [38]:
print X.shape
print y.shape

(95674, 77)
(95674, 1)


In [42]:
enc = LabelEncoder()
y = enc.fit_transform(y.ravel())

In [50]:
X_train, X_val, y_train, y_val = train_test_split(X, y , test_size = 3000, random_state = 1)

In [51]:
xgb = xgboost.XGBClassifier(max_depth = 13, n_estimators = 100,
                        objective='multi:softprob', subsample = .9, colsample_bytree=.8)

xgb.fit(X_train, y_train, eval_set = [(X_val, y_val)], eval_metric = 'mlogloss', early_stopping_rounds=5)


Will train until validation_0 error hasn't decreased in 5 rounds.
[0]	validation_0-mlogloss:2.955780
[1]	validation_0-mlogloss:2.625777
[2]	validation_0-mlogloss:2.399244
[3]	validation_0-mlogloss:2.219026
[4]	validation_0-mlogloss:2.080320
[5]	validation_0-mlogloss:1.962622
[6]	validation_0-mlogloss:1.862833
[7]	validation_0-mlogloss:1.776057
[8]	validation_0-mlogloss:1.701525
[9]	validation_0-mlogloss:1.634656
[10]	validation_0-mlogloss:1.577555
[11]	validation_0-mlogloss:1.528673
[12]	validation_0-mlogloss:1.482949
[13]	validation_0-mlogloss:1.443228
[14]	validation_0-mlogloss:1.403397
[15]	validation_0-mlogloss:1.368354
[16]	validation_0-mlogloss:1.336370
[17]	validation_0-mlogloss:1.309022
[18]	validation_0-mlogloss:1.283050
[19]	validation_0-mlogloss:1.256295
[20]	validation_0-mlogloss:1.233279
[21]	validation_0-mlogloss:1.210621
[22]	validation_0-mlogloss:1.192036
[23]	validation_0-mlogloss:1.172504
[24]	validation_0-mlogloss:1.153572
[25]	validation_0-mlogloss:1.136187
[26]	val

XGBClassifier(base_score=0.5, colsample_bytree=0.8, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=13,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', seed=0, silent=True, subsample=0.9)

In [52]:
X_test = departments_and_time.loc[test_y_cols,:].values

In [53]:
y_probas = xgb.predict_proba(X_test)


In [55]:
y_probas.shape

(95674, 38)

In [None]:
#test = pd.read_csv('./test.csv')

In [69]:
col_names = ['TripType_' + str(c) for c in enc.classes_]
submission = pd.DataFrame(np.round(y_probas, 3), index=test_y_cols, columns = col_names)

In [70]:
submission.head()

Unnamed: 0_level_0,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,TripType_15,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.001,0.001,0.001,0.001,0.007,0.022,0.018,0.001,0.001,0.002,...,0.001,0.002,0.11,0.123,0.001,0.008,0.007,0.004,0.002,0.093
2,0.001,0.001,0.001,0.002,0.012,0.025,0.006,0.002,0.001,0.021,...,0.004,0.001,0.029,0.101,0.001,0.002,0.012,0.006,0.001,0.012
3,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.996
4,0.001,0.001,0.002,0.001,0.005,0.081,0.827,0.0,0.0,0.001,...,0.001,0.0,0.001,0.001,0.0,0.0,0.001,0.001,0.0,0.029
6,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.996


In [71]:
submission.reset_index(inplace = True)

submission.to_csv('Walmart_submission_XGB_Simple_DepartmentsAndTotal-1.csv', index=False)

In [73]:
y_pred = xgb.predict(X_train)

In [76]:
#cm = confusion_matrix(y_train,y_pred)
#plt.imshow(cm,cmap=plt.cm.Blues)

In [82]:
#Generate confusion matrix to look at in excel

cm_df = pd.DataFrame(cm, index = enc.classes_, columns=enc.classes_)
cm_df.to_csv('Walmart_Confusion_Matrix.csv')