In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.externals import joblib

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import confusion_matrix

import xgboost
%matplotlib inline

In [2]:
# Load the raw training and test tables from the current working directory.
train, test = pd.read_csv('./train.csv'), pd.read_csv('./test.csv')

In [3]:
# (rows, columns) of the raw training frame.
train.shape

(647054, 7)

In [4]:
# Reference (presumably background on the "fineline" concept behind the FinelineNumber column):
# http://blog.8thandwalton.com/2014/06/supplier-glossary-fineline/

In [5]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,30,7,Friday,60538815980,1,SHOES,8931
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017


In [6]:
# Preview the first rows of the raw test frame.
test.head()

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,1,Friday,72503389714,1,SHOES,3002
1,1,Friday,1707710732,1,DAIRY,1526
2,1,Friday,89470001026,1,DAIRY,1431
3,1,Friday,88491211470,1,GROCERY DRY GOODS,3555
4,2,Friday,2840015224,1,DSD GROCERY,4408


In [7]:
#Space for EDA
#train[pd.isnull(train.FinelineNumber)]

In [8]:
# Label table: keep the first (VisitNumber, TripType) row seen for each visit
# and index the result by VisitNumber.
train_y = (
    train[['VisitNumber', 'TripType']]
    .drop_duplicates('VisitNumber')
    .set_index('VisitNumber')
)

In [9]:
train_y.head()  # One TripType per VisitNumber; this becomes the y labels (~96,000 trips)

Unnamed: 0_level_0,TripType
VisitNumber,Unnamed: 1_level_1
5,999
7,30
8,26
9,8
10,8


In [10]:
# Unique VisitNumbers of the test set, in order of first appearance.
# drop_duplicates() returns a new Series; the previous in-place call operated
# on a Series taken directly from `test`, i.e. it mutated a view of the frame's
# own column (SettingWithCopy territory) instead of an independent object.
test_y_cols = test.VisitNumber.drop_duplicates()
#test_y_cols = test_y_cols.values

In [11]:
# Re-inspect the training frame before building aggregate features.
train.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,30,7,Friday,60538815980,1,SHOES,8931
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017


In [12]:
# Rows whose ScanCount is negative, then the per-visit sum of those counts
# (the "negative feature count").
is_negative_scan = train['ScanCount'] < 0
train_negatives = train[is_negative_scan]
train_negatives_agg = train_negatives.groupby('VisitNumber')[['ScanCount']].sum()

In [13]:
# Number of training visits that contain at least one negative-ScanCount row.
train_negatives_agg.shape

(11077, 1)

In [15]:
# Rows with a missing Upc, then the per-visit sum of their ScanCount
# (the "unknown feature count").
has_no_upc = train['Upc'].isnull()
train_uncategorized = train[has_no_upc]
train_uncategorized_agg = train_uncategorized.groupby('VisitNumber')[['ScanCount']].sum()

In [16]:
# Number of training visits that contain at least one row with a missing Upc.
train_uncategorized_agg.shape

(2754, 1)

In [18]:
# Test-set counterparts of the train aggregates.
# Negative feature count: per-visit sum of ScanCount over negative-count rows.
# The aggregation must run on `test_negatives`; grouping the full `test` frame
# (as before) summed *all* scan counts and made the filter above a no-op.
test_negatives = test[test.ScanCount < 0]
test_negatives_agg = test_negatives.groupby(['VisitNumber']).agg({'ScanCount': np.sum})

# Unknown feature count: per-visit sum of ScanCount over rows with a missing Upc.
test_uncategorized = test[pd.isnull(test.Upc)]
test_uncategorized_agg = test_uncategorized.groupby(['VisitNumber']).agg({'ScanCount': np.sum})

In [19]:
# Remove the label column from the feature frame in place (labels live in train_y).
del train['TripType']

In [20]:
# Sentinel values for missing entries, applied identically to train and test.
# Filling through the DataFrame itself (column -> value mapping) avoids the
# chained `frame.col.fillna(..., inplace=True)` pattern, which mutates a
# temporary Series and is not guaranteed to write back to the parent frame.
missing_fill = {
    'Upc': -100,
    'DepartmentDescription': 'Unknown',
    'FinelineNumber': -100,
}
for frame in (train, test):
    frame.fillna(value=missing_fill, inplace=True)

In [21]:
# Cast FinelineNumber to integer in both frames so that its string form
# (used for the DeptItems key) carries no decimal part.
for frame in (train, test):
    frame['FinelineNumber'] = frame['FinelineNumber'].astype(int)

In [22]:
# Combined key "<DepartmentDescription> <FinelineNumber>" for every row.
for frame in (train, test):
    fineline_text = frame['FinelineNumber'].astype(str)
    frame['DeptItems'] = frame['DepartmentDescription'] + ' ' + fineline_text

In [32]:
# Stack train and test rows into one frame.
# Do not pre-filter to ScanCount > 0 before concatenating: some visit numbers
# would then be missing from the combined frame.
full_df = pd.concat([train, test])

In [33]:
# Show the combined frame's dimensions, then its first rows.
# print() with one parenthesised argument behaves the same on Python 2 and 3;
# the bare print statement is a SyntaxError on Python 3.
print(full_df.shape)
full_df.head()


(1300700, 7)


Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,DeptItems
0,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000,FINANCIAL SERVICES 1000
1,7,Friday,60538815980,1,SHOES,8931,SHOES 8931
2,7,Friday,7410811099,1,PERSONAL CARE,4504,PERSONAL CARE 4504
3,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565,PAINT AND ACCESSORIES 3565
4,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017,PAINT AND ACCESSORIES 1017


In [None]:
#full_df[full_df.VisitNumber==191319] #Useful to check

In [46]:
# One Weekday per visit: keep the first row seen for each VisitNumber and
# index the result by VisitNumber.
visit_days = (
    full_df[['VisitNumber', 'Weekday']]
    .drop_duplicates('VisitNumber')
    .set_index('VisitNumber')
)

In [49]:
# Number of distinct visits across train and test combined.
visit_days.shape

(191348, 1)

In [None]:
#visit_days.set_index('VisitNumber', inplace = True)
#visit_days.sort_index(inplace = True)
#visit_days.reset_index(inplace = True)

In [45]:
#visit_days.sort(columns=['VisitNumber'], inplace=True)
#visit_days.head()

In [50]:
# One-hot encode the Weekday column; the resulting columns are named
# 'Weekday_<day>' (e.g. 'Weekday_Friday') and the VisitNumber index is kept.
visit_days = pd.get_dummies(visit_days)
#visit_days.drop(['Weekday'], axis = 1, inplace = True)

In [51]:
# Preview the one-hot weekday indicators per visit.
visit_days.head()

Unnamed: 0_level_0,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,1,0,0,0,0,0,0
7,1,0,0,0,0,0,0
8,1,0,0,0,0,0,0
9,1,0,0,0,0,0,0
10,1,0,0,0,0,0,0


In [53]:
# (rows, columns) of the combined train + test frame.
full_df.shape

(1300700, 7)

In [59]:
# Preview the combined frame before pivoting.
full_df.head()

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,DeptItems
0,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000,FINANCIAL SERVICES 1000
1,7,Friday,60538815980,1,SHOES,8931,SHOES 8931
2,7,Friday,7410811099,1,PERSONAL CARE,4504,PERSONAL CARE 4504
3,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565,PAINT AND ACCESSORIES 3565
4,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017,PAINT AND ACCESSORIES 1017


In [None]:
# Pivot on positive scan counts only, so a chi-squared test can be run on the result.
# Rows are visits, columns are DeptItems keys, cells hold the summed ScanCount;
# visit/item combinations that never occur are filled with 0.
positive_scans = full_df[full_df['ScanCount'] > 0]
full_df_departments = pd.pivot_table(
    positive_scans,
    index='VisitNumber',
    columns='DeptItems',
    values='ScanCount',
    aggfunc='sum',
).fillna(0)

In [None]:
# Preview the visit x DeptItems count matrix.
full_df_departments.head()

In [None]:
# Add a 'Totals' column (appended as the last column), initialised to zero.
full_df_departments['Totals'] = 0

In [None]:
# Per-visit total: row-wise sum of every column except the trailing 'Totals'
# column itself. Assigning the result (rather than accumulating with += in a
# Python loop over the columns) means re-running the cell cannot double the
# totals, and the work is done in a single vectorised reduction.
full_df_departments['Totals'] = full_df_departments.iloc[:, :-1].sum(axis=1)

In [None]:
# Move VisitNumber from the index back into a regular column.
full_df_departments = full_df_departments.reset_index()

In [None]:
# Attach the weekday indicators to the per-visit department counts.
# VisitNumber is a regular column of full_df_departments (after reset_index)
# but the *index* of visit_days (after set_index), so join the left column
# against the right index explicitly; `on='VisitNumber'` requires the key to
# be a column on both sides in the pandas versions this notebook targets.
departments_and_time = pd.merge(
    full_df_departments,
    visit_days,
    left_on='VisitNumber',
    right_index=True,
)

In [None]:
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday','Saturday', 'Sunday']

# Scale each weekday indicator by the visit's total item count.
# pd.get_dummies named the indicator columns 'Weekday_<day>', so the bare day
# name is not a valid column label and has to be prefixed.
# NOTE: this multiplies in place, so running the cell twice scales twice.
for day in days:
    day_column = 'Weekday_' + day
    departments_and_time[day_column] = departments_and_time[day_column] * departments_and_time['Totals']

In [None]:
# Index the feature table by VisitNumber so rows can be selected per visit.
departments_and_time = departments_and_time.set_index('VisitNumber')

In [None]:
# Feature matrix for the training visits, row-aligned with train_y.
# train_y is indexed by VisitNumber (there is no 'VisitNumber' column), so the
# visit ids come from train_y.index. reindex() is used instead of .loc because
# visits without any positive ScanCount row are absent from the pivot; reindex
# returns an all-NaN row for them, whereas .loc raises KeyError for missing
# labels on current pandas.
# NOTE(review): decide whether those NaN rows should be zero-filled instead.
X = departments_and_time.reindex(train_y.index).values
y = train_y[['TripType']].values

In [None]:
# Sanity check: X and y must have the same number of rows.
# Single-argument print() works on both Python 2 and Python 3.
print(X.shape)
print(y.shape)

In [None]:
# Map the raw TripType codes to consecutive integer class ids; `enc` is kept
# so the original codes can be recovered from enc.classes_ later.
trip_type_codes = y.ravel()
enc = LabelEncoder()
y = enc.fit_transform(trip_type_codes)

In [None]:
# Hold out 3000 visits for validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=3000,
    random_state=1,
)

In [None]:
# Multi-class gradient-boosted trees producing per-class probabilities.
xgb_params = dict(
    max_depth=13,
    n_estimators=100,
    objective='multi:softprob',
    subsample=.9,
    colsample_bytree=.8,
)
xgb = xgboost.XGBClassifier(**xgb_params)

# Track multi-class log loss on the held-out split and stop once it has not
# improved for 5 consecutive rounds.
validation_sets = [(X_val, y_val)]
xgb.fit(
    X_train,
    y_train,
    eval_set=validation_sets,
    eval_metric='mlogloss',
    early_stopping_rounds=5,
)


In [None]:
# Feature matrix for the test visits, one row per id in test_y_cols, same order.
# reindex() guarantees a row for every requested visit: visits without any
# positive ScanCount row are absent from the pivot and come back as all-NaN
# rows, whereas .loc raises KeyError for missing labels on current pandas.
X_test = departments_and_time.reindex(test_y_cols.values).values

In [None]:
# Per-class probabilities for every test visit (one row per row of X_test).
y_probas = xgb.predict_proba(X_test)


In [None]:
# Expected: (number of test visits, number of TripType classes).
y_probas.shape

In [None]:
#test = pd.read_csv('./test.csv')

In [None]:
# One probability column per original TripType code, named 'TripType_<code>';
# rows are labelled with the test VisitNumbers and values rounded to 3 decimals.
col_names = ['TripType_' + str(trip_type) for trip_type in enc.classes_]
rounded_probas = np.round(y_probas, 3)
submission = pd.DataFrame(rounded_probas, columns=col_names, index=test_y_cols)

In [None]:
# Preview the submission table before writing it to disk.
submission.head()

In [None]:
# Turn the VisitNumber index into a leading column, then write the file
# without the (now meaningless) positional index.
submission = submission.reset_index()
submission.to_csv('Walmart_submission_XGB_Simple_DepartmentsAndTotal-1.csv', index=False)

In [None]:
# Predicted class ids on the training split itself (in-sample predictions).
y_pred = xgb.predict(X_train)

In [None]:
#cm = confusion_matrix(y_train,y_pred)
#plt.imshow(cm,cmap=plt.cm.Blues)

In [None]:
#Generate confusion matrix to look at in excel

# Compute the matrix here: the only other line that builds `cm` is commented
# out, so referencing it directly raised NameError. `labels` pins the matrix
# to one row/column per encoded class, so its shape always matches
# enc.classes_ even if a class appears in neither y_train nor y_pred.
class_ids = np.arange(len(enc.classes_))
cm = confusion_matrix(y_train, y_pred, labels=class_ids)

# Rows are true TripType codes, columns are predicted TripType codes.
cm_df = pd.DataFrame(cm, index = enc.classes_, columns=enc.classes_)
cm_df.to_csv('Walmart_Confusion_Matrix.csv')