# http://blog.8thandwalton.com/2014/06/supplier-glossary-fineline/

In [88]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.externals import joblib

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.metrics import confusion_matrix

import xgboost
%matplotlib inline

In [2]:
# Load the raw scan records. train carries the TripType label; test does not.
train = pd.read_csv('./train.csv') # Last VisitNumber in this file is 191347
test = pd.read_csv('./test.csv') # Last VisitNumber in this file is 191348

In [3]:
#test.TripType = -1 #May not be needed it will just add Nans

In [4]:
full_df = pd.concat((train, test))

In [5]:
# Per-visit ScanCount aggregates. Each *_agg frame is indexed by VisitNumber
# and holds a single 'ScanCount' column.

# Rows with a negative scan count -> summed negative count per visit.
full_df_negatives = full_df[full_df.ScanCount.lt(0)]
full_df_negatives_agg = full_df_negatives.groupby('VisitNumber')[['ScanCount']].sum()

# Rows with no Upc recorded -> summed scan count of unidentified items per visit.
full_df_uncategorized = full_df[full_df.Upc.isnull()]
full_df_uncategorized_agg = full_df_uncategorized.groupby('VisitNumber')[['ScanCount']].sum()

# Rows with a positive scan count -> total purchased items per visit.
full_df_totals = full_df[full_df.ScanCount.gt(0)]
full_df_totals_agg = full_df_totals.groupby('VisitNumber')[['ScanCount']].sum()

In [6]:
# Replace missing values with explicit sentinels: -100 for the numeric codes and
# 'Unknown' for the department name. Other columns (e.g. TripType, which is NaN
# for test rows) are left untouched.
# The result is assigned back rather than calling fillna(inplace=True) on an
# attribute-accessed column: that form mutates a temporary Series and does not
# reliably write through to the frame (it is a no-op under pandas copy-on-write).
full_df = full_df.fillna({
    'Upc': -100,
    'DepartmentDescription': 'Unknown',
    'FinelineNumber': -100,
})

In [47]:
# One row per visit (first occurrence wins), indexed by VisitNumber, with the
# Weekday column one-hot encoded into Weekday_<name> indicator columns.
visit_days = pd.get_dummies(
    full_df[['VisitNumber', 'Weekday']]
    .drop_duplicates(subset='VisitNumber')
    .set_index('VisitNumber')
)

In [8]:
# Fineline numbers are cast to integers so they render without a decimal
# part, then combined with the department name into one "DEPT fineline" key.
full_df['FinelineNumber'] = full_df.FinelineNumber.astype(int)
fineline_as_text = full_df['FinelineNumber'].astype(str)
full_df['DeptItems'] = full_df['DepartmentDescription'] + ' ' + fineline_as_text

In [9]:
# Visit x DeptItems matrix holding the summed ScanCount of each item key per visit.
# Only rows with a positive ScanCount are pivoted so that every cell is non-negative,
# which the chi-squared feature scoring applied later needs. Visits that have no
# positive scans therefore do not appear in this frame.
full_deptitems_df = pd.pivot_table(full_df[full_df.ScanCount>0], values='ScanCount', index='VisitNumber',columns='DeptItems', aggfunc=np.sum)
# Filled in place; presumably to avoid a second copy of this very wide frame.
full_deptitems_df.fillna(0, inplace=True)

In [10]:
full_deptitems_df.shape

(183097, 11048)

In [11]:
# Label table: one row per visit, indexed by VisitNumber, with its TripType
# (NaN for visits that came from the test file).
y_df = (
    full_df[['VisitNumber', 'TripType']]
    .drop_duplicates(subset='VisitNumber')
    .set_index('VisitNumber')
)

In [12]:
full_deptitems_df.shape

(183097, 11048)

In [13]:
# Left-join the DeptItems pivot onto the per-visit labels. Visits missing from the
# pivot (it was built from positive-ScanCount rows only) get NaN in every feature column.
# This join needs a very large amount of memory, and zero-filling the joined frame
# raised a memory error, so the NaNs are replaced later on the NumPy arrays instead.
y_df = y_df.join(full_deptitems_df)

In [14]:
del full_deptitems_df

In [15]:
# Split the joined frame into labelled (train) and unlabelled (test) feature matrices.
# The mask is computed once and reused for all three extractions.
is_labelled = y_df.TripType.notnull()

X_train = y_df[is_labelled].drop('TripType', axis=1).values
X_test = y_df[~is_labelled].drop('TripType', axis=1).values
# Read the label column directly with .loc instead of y_df[mask]['TripType'],
# which would first copy every feature column of the labelled rows just to
# discard them; the resulting array is identical.
y_train = y_df.loc[is_labelled, 'TripType'].values


In [16]:
y_df = y_df[['TripType']] #Removing Unneccessary Columns

In [17]:
X_train = np.nan_to_num(X_train) #Splitting this into 2 cells works

In [18]:
#X_test = np.nan_to_num(X_test) #memory error here


In [19]:
# Keep the 7000 DeptItems columns with the highest chi-squared score against
# the trip type. The selector is fitted on the training rows only and kept
# so the same columns can be taken from the test matrix.
chi_sq_best = SelectKBest(chi2, k=7000)
X_train = chi_sq_best.fit_transform(X_train, y_train)

In [20]:
# Zero-fill the test matrix, then keep the columns chosen by the selector that was
# fitted on the training data.
# NOTE(review): the two steps are separate rebinding statements, presumably so the
# NaN-containing array can be freed before transform allocates its output (an earlier
# attempt at zero-filling X_test ran out of memory) - keep this order.
X_test = np.nan_to_num(X_test)
X_test = chi_sq_best.transform(X_test)

In [23]:
# Show both shapes as the cell's output value. A bare expression is used instead of
# the Python 2 print statement, which is a SyntaxError on Python 3.
X_train.shape, X_test.shape

(95674, 7000) (95674, 7000)


In [56]:
# Coarser view: summed ScanCount per visit and department (one column per
# DepartmentDescription). Built this late to keep peak memory down.
# All rows are used here, so returns (negative counts) net against purchases.
X_df = pd.pivot_table(
    full_df,
    index='VisitNumber',
    columns='DepartmentDescription',
    values='ScanCount',
    aggfunc=np.sum,
).fillna(0)
X_df.shape

(191348, 69)

In [57]:
# Attach the per-visit aggregates and weekday indicators to the department
# counts. Every aggregate frame names its column 'ScanCount'; rsuffix only
# applies when that name is already present, so the totals keep the plain
# name and the later two become 'ScanCountUncategorized' / 'ScanCountNegatives'.
# Visits missing from an aggregate get 0.
X_df = (
    X_df
    .join(full_df_totals_agg, rsuffix='Totals')
    .join(full_df_uncategorized_agg, rsuffix='Uncategorized')
    .join(full_df_negatives_agg, rsuffix='Negatives')
    .join(visit_days)
    .fillna(0)
)


In [62]:
y_df = y_df.join(X_df)

In [63]:
# Same labelled / unlabelled split as before, now for the department-level
# and aggregate features.
has_trip_type = y_df.TripType.notnull()
labelled_rows = y_df[has_trip_type]
unlabelled_rows = y_df[~has_trip_type]

X_train2 = labelled_rows.drop('TripType', axis=1).values
X_test2 = unlabelled_rows.drop('TripType', axis=1).values
y_train2 = labelled_rows['TripType'].values

In [66]:
# Show both shapes as the cell's output value. A bare expression is used instead of
# the Python 2 print statement, which is a SyntaxError on Python 3.
X_train.shape, X_train2.shape

(95674, 7000) (95674, 79)


In [67]:
X_train = np.concatenate((X_train, X_train2), axis = 1)

In [70]:
X_test = np.concatenate((X_test, X_test2), axis = 1)

In [71]:
# Map the raw TripType values to consecutive class ids; enc.classes_ keeps
# the original values for naming the submission columns later.
enc = LabelEncoder().fit(y_train)
y_train = enc.transform(y_train)

In [72]:
# Hold out 4000 rows for validation (an integer test_size is an absolute row count).
# random_state pins the shuffle so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 4000, random_state = 1)

In [73]:
# Gradient-boosted multi-class classifier that outputs per-class probabilities
# ('multi:softprob'): up to 100 trees of depth 12, with row subsampling of 0.80
# and column subsampling of 0.5 per tree.
xgb = xgboost.XGBClassifier(max_depth = 12, n_estimators = 100,
                        objective='multi:softprob', subsample = .80, colsample_bytree=.5)

# Monitor multi-class log loss on the hold-out split; training stops early once
# that score has not improved for 5 consecutive rounds.
xgb.fit(X_train, y_train, eval_set = [(X_val, y_val)], eval_metric = 'mlogloss', early_stopping_rounds=5)


Will train until validation_0 error hasn't decreased in 5 rounds.
[0]	validation_0-mlogloss:3.054260
[1]	validation_0-mlogloss:2.766119
[2]	validation_0-mlogloss:2.594240
[3]	validation_0-mlogloss:2.480443
[4]	validation_0-mlogloss:2.364547
[5]	validation_0-mlogloss:2.275330
[6]	validation_0-mlogloss:2.187580
[7]	validation_0-mlogloss:2.119275
[8]	validation_0-mlogloss:2.044838
[9]	validation_0-mlogloss:1.981150
[10]	validation_0-mlogloss:1.933918
[11]	validation_0-mlogloss:1.875427
[12]	validation_0-mlogloss:1.831581
[13]	validation_0-mlogloss:1.791988
[14]	validation_0-mlogloss:1.759920
[15]	validation_0-mlogloss:1.727565
[16]	validation_0-mlogloss:1.703464
[17]	validation_0-mlogloss:1.672175
[18]	validation_0-mlogloss:1.653139
[19]	validation_0-mlogloss:1.626189
[20]	validation_0-mlogloss:1.602445
[21]	validation_0-mlogloss:1.580067
[22]	validation_0-mlogloss:1.566135
[23]	validation_0-mlogloss:1.547138
[24]	validation_0-mlogloss:1.538353
[25]	validation_0-mlogloss:1.528180
[26]	val

XGBClassifier(base_score=0.5, colsample_bytree=0.5, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=12,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', seed=0, silent=True, subsample=0.8)

In [74]:
# Per-class probabilities for the unlabelled (test) visits.
# NOTE(review): the model was fitted with early stopping. Confirm whether this
# xgboost version predicts with every tree or only up to the best iteration, and
# pass an explicit tree limit here if it uses them all.
y_probas = xgb.predict_proba(X_test)


In [79]:
#y_df[pd.isnull(y_df.TripType)].index

In [85]:
# Column headers 'TripType_<label>', in the encoder's class order.
# The classes are cast to int first - presumably because TripType became float once
# the label-less test rows were concatenated in; confirm.
col_names = ['TripType_' + str(c) for c in enc.classes_.astype('int')]
# One row per unlabelled visit, indexed by its VisitNumber.
# NOTE(review): rounding to 3 decimals turns every probability below 0.0005 into
# exactly 0. If the submission is scored with the same log loss used for validation,
# a zero on the true class is penalised very heavily - consider keeping more precision.
submission = pd.DataFrame(np.round(y_probas, 3), index=y_df[pd.isnull(y_df.TripType)].index, columns = col_names)

In [86]:
# Turn the VisitNumber index back into a regular column so it is written as
# the first field, then save without the positional row index.
submission = submission.reset_index()
submission.to_csv('Walmart_submission_XGB_7000Features-1.36.csv', index=False)

In [None]:
#cm = confusion_matrix(y_train,y_pred)
#cm_df = pd.DataFrame(cm, index = enc.classes_, columns=enc.classes_)
#cm_df.to_csv('Walmart_Confusion_Matrix.csv')

In [None]:
#*******************

In [4]:
train_y.head() #This will end up being y labels 96000 trips

Unnamed: 0_level_0,TripType
VisitNumber,Unnamed: 1_level_1
5,999
7,30
8,26
9,8
10,8


In [5]:
# Unique visit numbers of the test set, as a pandas Series in first-seen order.
# drop_duplicates() returns a new Series; the previous in-place call operated on the
# column object pulled straight out of `test`, which risks altering (or, on newer
# pandas, failing to detach from) the VisitNumber column of the frame itself.
test_y_cols = test.VisitNumber.drop_duplicates()
#test_y_cols = test_y_cols.values

In [6]:
train.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,30,7,Friday,60538815980,1,SHOES,8931
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017


In [7]:
# Training rows with a negative scan count (these could also be pivoted into
# their own 'negative fineline counts'), and their per-visit sum.
train_negatives = train.loc[train.ScanCount.lt(0)]
train_negatives_agg = train_negatives.groupby('VisitNumber')[['ScanCount']].sum()

In [8]:
# Training rows with no Upc recorded, and their per-visit ScanCount sum.
train_uncategorized = train.loc[train.Upc.isnull()]
train_uncategorized_agg = train_uncategorized.groupby('VisitNumber')[['ScanCount']].sum()

In [10]:
train.drop(['TripType'], axis = 1, inplace = True)

In [11]:
# Replace missing values with explicit sentinels in both frames: -100 for the
# numeric codes and 'Unknown' for the department name.
# The result is assigned back rather than calling fillna(inplace=True) on an
# attribute-accessed column: that form mutates a temporary Series and does not
# reliably write through to the frame (it is a no-op under pandas copy-on-write).
missing_value_fill = {
    'Upc': -100,
    'DepartmentDescription': 'Unknown',
    'FinelineNumber': -100,
}

train = train.fillna(missing_value_fill)
test = test.fillna(missing_value_fill)

In [12]:
# Store fineline numbers as integers in both frames.
for frame in (train, test):
    frame['FinelineNumber'] = frame['FinelineNumber'].astype(int)

In [13]:
# Combined "DEPARTMENT fineline" key for every row of both frames.
for frame in (train, test):
    frame['DeptItems'] = frame['DepartmentDescription'] + ' ' + frame['FinelineNumber'].astype(str)

In [14]:
full_df = pd.concat((train, test)) #Cannot Concant with ScanCount > 0.. some visit numbers will not be present

In [15]:
# Parenthesised so the line is valid on both Python 2 and Python 3; with a single
# argument the printed text is the same on either.
print(full_df.shape)
full_df.head()


(1300700, 7)


Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,DeptItems
0,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000,FINANCIAL SERVICES 1000
1,7,Friday,60538815980,1,SHOES,8931,SHOES 8931
2,7,Friday,7410811099,1,PERSONAL CARE,4504,PERSONAL CARE 4504
3,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565,PAINT AND ACCESSORIES 3565
4,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017,PAINT AND ACCESSORIES 1017


full_df[full_df.VisitNumber==191319] #Useful to check

In [16]:
# One row per visit (first occurrence wins), indexed by VisitNumber, with the
# Weekday column one-hot encoded into Weekday_<name> indicator columns.
visit_days = pd.get_dummies(
    full_df[['VisitNumber', 'Weekday']]
    .drop_duplicates(subset='VisitNumber')
    .set_index('VisitNumber')
)
visit_days.head()

In [17]:
# visit_days is already one-hot encoded by the cell that builds it, so re-encoding
# it would change nothing; just preview the indicator columns.
visit_days.head()

Unnamed: 0_level_0,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5,1,0,0,0,0,0,0
7,1,0,0,0,0,0,0
8,1,0,0,0,0,0,0
9,1,0,0,0,0,0,0
10,1,0,0,0,0,0,0


In [29]:
full_df.head()

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,DeptItems
0,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000,FINANCIAL SERVICES 1000
1,7,Friday,60538815980,1,SHOES,8931,SHOES 8931
2,7,Friday,7410811099,1,PERSONAL CARE,4504,PERSONAL CARE 4504
3,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565,PAINT AND ACCESSORIES 3565
4,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017,PAINT AND ACCESSORIES 1017


In [18]:
# Coarser view: summed positive ScanCount per visit and department (one
# column per DepartmentDescription); combinations that never occur are 0.
full_df_by_departments = pd.pivot_table(
    full_df[full_df.ScanCount.gt(0)],
    index='VisitNumber',
    columns='DepartmentDescription',
    values='ScanCount',
    aggfunc=np.sum,
).fillna(0)


In [28]:
full_df_by_departments['Totals'] = 0

In [29]:
# Total items per visit = row-wise sum over every department column.
# The Totals column is excluded by name (not by position) and assigned in one
# vectorised step, so re-running this cell gives the same result instead of
# adding the department counts onto the existing totals a second time.
department_columns = [c for c in full_df_by_departments.columns if c != 'Totals']
full_df_by_departments['Totals'] = full_df_by_departments[department_columns].sum(axis=1)

In [21]:
# Visit x DeptItems matrix holding the summed ScanCount of each item key per visit.
# Only rows with a positive ScanCount are pivoted so that every cell is non-negative,
# which the chi-squared feature scoring applied later needs. Visits that have no
# positive scans therefore do not appear in this frame.
full_df_departments = pd.pivot_table(full_df[full_df.ScanCount>0], values='ScanCount', index='VisitNumber',columns='DeptItems', aggfunc=np.sum)
# Filled in place; presumably to avoid a second copy of this very wide frame.
full_df_departments.fillna(0, inplace=True)

In [22]:
full_df_departments.head() #Will be incomplete because some ScanCounts **had** only negative values

DeptItems,1-HR PHOTO 110,1-HR PHOTO 120,1-HR PHOTO 130,1-HR PHOTO 141,1-HR PHOTO 150,1-HR PHOTO 160,1-HR PHOTO 1628,1-HR PHOTO 170,1-HR PHOTO 180,1-HR PHOTO 190,...,WIRELESS 870,WIRELESS 880,WIRELESS 890,WIRELESS 9,WIRELESS 940,WIRELESS 950,WIRELESS 965,WIRELESS 970,WIRELESS 990,WIRELESS 9998
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
#One approach would be to just assign test_y_cols as -1 for for trip type
#This is the y we have in the trip data
train_y.index


Int64Index([     5,      7,      8,      9,     10,     11,     12,     15,
                17,     19,
            ...
            191329, 191331, 191335, 191337, 191342, 191343, 191344, 191345,
            191346, 191347],
           dtype='int64', name=u'VisitNumber', length=95674)

In [37]:
#This is the y trip indices we would like to predict
test_y_cols.values

array([     1,      2,      3, ..., 191340, 191341, 191348])

In [35]:
train_wide = full_df_departments.loc[train_y.index]
train_wide.fillna(0, inplace = True)

In [36]:
# Separate the feature matrix from the label vector.
# NOTE(review): train_wide as built in the preceding cell is a row selection of the
# DeptItems pivot, which has no TripType column, so both lines would fail as written.
# Presumably train_y was joined onto train_wide in a cell that has since been removed;
# confirm and restore that step.
X_train = train_wide.drop('TripType', axis = 1).values
y_train = train_wide['TripType'].values

In [37]:
# Show both shapes as the cell's output value. A bare expression is used instead of
# the Python 2 print statement, which is a SyntaxError on Python 3.
X_train.shape, y_train.shape

(95674, 11048) (95674,)


In [38]:
chi_sq_best = SelectKBest(score_func=chi2, k = 7000)
chi_sq_best.fit(X_train,y_train)

In [39]:
# The selector is already fitted in the cell that creates it; display it rather than
# paying for a second, identical fit over the full training matrix.
chi_sq_best

SelectKBest(k=7000, score_func=<function chi2 at 0x7f6074c22398>)

In [40]:
# Number of features whose chi-squared p-value is NaN (646 when this was run).
# NOTE(review): presumably these are DeptItems columns that are all zero across the
# training rows (item keys seen only in test visits), for which the statistic is
# undefined - confirm by checking the column sums of X_train.
np.sum(pd.isnull(chi_sq_best.pvalues_))

646

In [41]:
np.sum(chi_sq_best.pvalues_ < .00001) #Used 7000

7339

In [42]:
X_subset = chi_sq_best.transform(X_train) ##This takes into account Dept and Fineline number associations
X_subset.shape

(95674, 7000)

In [79]:
# print train_y.head()
# print train_uncategorized_agg.head()
# print train_negatives_agg.head()
# print totals.head()

In [44]:
train_negatives_agg.head()

Unnamed: 0_level_0,ScanCount
VisitNumber,Unnamed: 1_level_1
5,-1
8,-2
132,-2
133,-1
182,-1


In [45]:
# Start from the training labels and left-join each per-visit aggregate.
# rsuffix only renames an incoming column whose name is already present.
# (The suffix names should be tidied up the next time this is run.)
X_additional_aggregates = (
    train_y
    .join(train_uncategorized_agg, rsuffix='Uncategorized')
    .join(train_negatives_agg, rsuffix='Negatives')
    .join(visit_days, rsuffix='Days')
    .join(totals, rsuffix='Totals')
)

In [None]:
# Remove the label column from the feature frame. axis=1 is required: without it
# drop() looks for a *row* labelled 'TripType' and fails.
X_additional_aggregates = X_additional_aggregates.drop('TripType', axis=1)

In [50]:
X_additional_aggregates.fillna(0,inplace=True)

In [51]:
X_additional_aggregates.shape #Temporary 10 additional features, will probably want just by department agragates

(95674, 10)

In [55]:
# Weight each weekday indicator by the visit's total item count, so the
# weekday columns carry "items bought on that day" instead of a 0/1 flag.
weekday_columns = [name for name in X_additional_aggregates.columns if 'Weekday' in name]
for name in weekday_columns:
    X_additional_aggregates[name] = X_additional_aggregates[name].mul(X_additional_aggregates['Totals'])

In [57]:
X_additional_aggregates.drop('Totals', axis = 1, inplace = True)

In [71]:
X_train = np.concatenate((X_subset, X_additional_aggregates), axis = 1)

In [72]:
X_train.shape, y_train.shape

((95674, 7009), (95674,))

In [76]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 4000, random_state = 1)

In [None]:
xgb = xgboost.XGBClassifier(max_depth = 12, n_estimators = 100,
                        objective='multi:softprob', subsample = .85, colsample_bytree=.6)

xgb.fit(X_train, y_train, eval_set = [(X_val, y_val)], eval_metric = 'mlogloss', early_stopping_rounds=5)


Will train until validation_0 error hasn't decreased in 5 rounds.
[0]	validation_0-mlogloss:3.276859
[1]	validation_0-mlogloss:3.138642
[2]	validation_0-mlogloss:3.055697
[3]	validation_0-mlogloss:2.958053
[4]	validation_0-mlogloss:2.892581
[5]	validation_0-mlogloss:2.840245
[6]	validation_0-mlogloss:2.788929


In [92]:
#del full_df_departments
#del X_subset

In [1]:
# Coarser view: summed ScanCount per visit and department (one column per
# DepartmentDescription); combinations that never occur are 0.
full_df_departments = pd.pivot_table(
    full_df,
    index='VisitNumber',
    columns='DepartmentDescription',
    values='ScanCount',
    aggfunc=np.sum,
).fillna(0)



NameError: name 'pd' is not defined

In [None]:
departments_and_time.set_index('VisitNumber', inplace = True)

In [None]:
# Feature rows for the labelled visits, in the same order as the labels.
# train_y is indexed by VisitNumber (it has no VisitNumber column), so the visit ids
# are taken from its index, matching how train_y is used elsewhere in this notebook.
X = departments_and_time.loc[train_y.index,:].values
# Kept as a single-column 2-D array, as before.
y = train_y[['TripType']].values

In [None]:
y_probas = xgb.predict_proba(X_test)


In [None]:
y_probas.shape

In [None]:
#test = pd.read_csv('./test.csv')

In [None]:
col_names = ['TripType_' + str(c) for c in enc.classes_]
submission = pd.DataFrame(np.round(y_probas, 3), index=test_y_cols, columns = col_names)

In [None]:
submission.head()

In [None]:
submission.reset_index(inplace = True)

submission.to_csv('Walmart_submission_XGB_Simple_DepartmentsAndTotal-1.csv', index=False)

In [None]:
y_pred = xgb.predict(X_train)

In [None]:
#cm = confusion_matrix(y_train,y_pred)
#plt.imshow(cm,cmap=plt.cm.Blues)

In [None]:
# Generate a confusion matrix of the training predictions to inspect in Excel.

# cm must be computed here: its only other assignment in the notebook is commented
# out, so this cell previously failed with a NameError.
# Rows are true classes and columns are predicted classes; this assumes every class
# in enc.classes_ occurs in y_train or y_pred so the matrix matches the labels below.
cm = confusion_matrix(y_train, y_pred)
cm_df = pd.DataFrame(cm, index = enc.classes_, columns=enc.classes_)
cm_df.to_csv('Walmart_Confusion_Matrix.csv')