In [1]:
import numpy as np
import pandas as pd

In [2]:
train_data = pd.read_csv('train.csv',
                         dtype = {'TripType': np.int64,
                                  'VisitNumber': np.int64,
                                  'Weekday': str,
                                  'Upc': str,
                                  'ScanCount': np.int64,
                                  'DepartmentDescription': str,
                                  'FinelineNumber': str})
train_data.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,30,7,Friday,60538815980,1,SHOES,8931
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017


In [3]:
train_data.dtypes

TripType                  int64
VisitNumber               int64
Weekday                  object
Upc                      object
ScanCount                 int64
DepartmentDescription    object
FinelineNumber           object
dtype: object

In [4]:
train_data.isnull().sum()

TripType                    0
VisitNumber                 0
Weekday                     0
Upc                      4129
ScanCount                   0
DepartmentDescription    1361
FinelineNumber           4129
dtype: int64

In [5]:
train_data['UpcLength'] = train_data['Upc'].str.len()
train_data['UpcStart'] = train_data['Upc'].str.get(0)

In [6]:
train_data['DepartmentDescription'].astype('category').describe()

count                645693
unique                   68
top       GROCERY DRY GOODS
freq                  70402
Name: DepartmentDescription, dtype: object

In [7]:
train_data['FinelineNumber'].astype('category').describe()

count     642925
unique      5195
top         5501
freq        8244
Name: FinelineNumber, dtype: object

In [8]:
groups = train_data.groupby(['VisitNumber', 'TripType', 'Weekday'])
product_count = groups.size()
products_missing_upc = groups['Upc'].apply(lambda x: sum(x.isnull()))
products_missing_department = groups['DepartmentDescription'].apply(lambda x: sum(x.isnull()))
products_missing_fineline = groups['FinelineNumber'].apply(lambda x: sum(x.isnull()))
department_count = groups['DepartmentDescription'].apply(lambda x: len(set(x)))
fineline_count = groups['FinelineNumber'].apply(lambda x: len(set(x)))
items_bought = groups['ScanCount'].apply(lambda x: sum(x[x>0]))
items_returned = groups['ScanCount'].apply(lambda x: -sum(x[x<0]))
products_bought = groups['ScanCount'].apply(lambda x: x[x>0].size)
products_returned = groups['ScanCount'].apply(lambda x: x[x<0].size)
most_items_bought = groups['ScanCount'].apply(lambda x: max(max(x), 0))
most_items_returned = groups['ScanCount'].apply(lambda x: -min(min(x), 0))

In [9]:
departments = train_data['DepartmentDescription'].unique()
department_presence = {}
for department in departments:
    department_key = 'department_' + str(department)
    department_presence[department_key] = groups['DepartmentDescription'].apply(lambda x: list(x).count(department))

In [10]:
upc_length_unique = train_data['UpcLength'].unique()
upc_length = {}
for i in np.sort(upc_length_unique[~np.isnan(upc_length_unique)]):
    upc_length_key = 'upc_length_' + str(i)
    upc_length[upc_length_key] = groups['UpcLength'].apply(lambda x: list(x).count(i))

In [11]:
upc_start_unique = train_data['UpcStart'].unique()
upc_start = {}
for i in np.sort(upc_start_unique[~pd.isnull(upc_start_unique)]):
    upc_start_key = 'upc_start_' + str(i)
    upc_start[upc_start_key] = groups['UpcStart'].apply(lambda x: list(x).count(i))

In [12]:
columns = {'product_count': product_count, 
           'department_count': department_count,
           'fineline_count': fineline_count,
           'products_missing_upc': products_missing_upc, 
           'products_missing_department': products_missing_department,
           'products_missing_fineline': products_missing_fineline,
           'products_bought': products_bought,
           'products_returned': products_returned,
           'items_bought': items_bought,
           'items_returned': items_returned,
           'most_items_bought': most_items_bought,
           'most_items_returned': most_items_returned}
columns.update(department_presence)
columns.update(upc_length)
columns.update(upc_start)
final_train_data = pd.DataFrame(columns)
final_train_data = final_train_data.reset_index()

In [13]:
final_train_data['weekend'] = final_train_data['Weekday'].isin(['Saturday', 'Sunday'])
bought = final_train_data['products_bought'] > 0
returned = final_train_data['products_returned'] > 0
final_train_data['bought_only'] = bought & -returned
final_train_data['returned_only'] = -bought & returned
final_train_data['bought_and_returned'] = bought & returned
final_train_data = pd.get_dummies(final_train_data)
final_train_data = final_train_data.astype(int)
final_train_data.head()

Unnamed: 0,VisitNumber,TripType,department_1-HR PHOTO,department_ACCESSORIES,department_AUTOMOTIVE,department_BAKERY,department_BATH AND SHOWER,department_BEAUTY,department_BEDDING,department_BOOKS AND MAGAZINES,...,bought_only,returned_only,bought_and_returned,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
0,5,999,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
1,7,30,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
2,8,26,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
3,9,8,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,10,8,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0


In [14]:
test_data = pd.read_csv('test.csv',
                         dtype = {'VisitNumber': np.int64,
                                  'Weekday': str,
                                  'Upc': str,
                                  'ScanCount': np.int64,
                                  'DepartmentDescription': str,
                                  'FinelineNumber': str})
test_data.head()

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,1,Friday,72503389714,1,SHOES,3002
1,1,Friday,1707710732,1,DAIRY,1526
2,1,Friday,89470001026,1,DAIRY,1431
3,1,Friday,88491211470,1,GROCERY DRY GOODS,3555
4,2,Friday,2840015224,1,DSD GROCERY,4408


In [15]:
test_data['UpcLength'] = test_data['Upc'].str.len()
test_data['UpcStart'] = test_data['Upc'].str.get(0)

In [16]:
groups = test_data.groupby(['VisitNumber', 'Weekday'])
product_count = groups.size()
products_missing_upc = groups['Upc'].apply(lambda x: sum(x.isnull()))
products_missing_department = groups['DepartmentDescription'].apply(lambda x: sum(x.isnull()))
products_missing_fineline = groups['FinelineNumber'].apply(lambda x: sum(x.isnull()))
department_count = groups['DepartmentDescription'].apply(lambda x: len(set(x)))
fineline_count = groups['FinelineNumber'].apply(lambda x: len(set(x)))
items_bought = groups['ScanCount'].apply(lambda x: sum(x[x>0]))
items_returned = groups['ScanCount'].apply(lambda x: -sum(x[x<0]))
products_bought = groups['ScanCount'].apply(lambda x: x[x>0].size)
products_returned = groups['ScanCount'].apply(lambda x: x[x<0].size)
most_items_bought = groups['ScanCount'].apply(lambda x: max(max(x), 0))
most_items_returned = groups['ScanCount'].apply(lambda x: -min(min(x), 0))

In [17]:
department_presence = {}
for department in departments:
    department_key = 'department_' + str(department)
    department_presence[department_key] = groups['DepartmentDescription'].apply(lambda x: list(x).count(department))

In [18]:
upc_length = {}
for i in np.sort(upc_length_unique[~np.isnan(upc_length_unique)]):
    upc_length_key = 'upc_length_' + str(i)
    upc_length[upc_length_key] = groups['UpcLength'].apply(lambda x: list(x).count(i))

In [19]:
upc_start = {}
for i in np.sort(upc_start_unique[~pd.isnull(upc_start_unique)]):
    upc_start_key = 'upc_start_' + str(i)
    upc_start[upc_start_key] = groups['UpcStart'].apply(lambda x: list(x).count(i))

In [20]:
columns = {'product_count': product_count, 
           'department_count': department_count,
           'fineline_count': fineline_count,
           'products_missing_upc': products_missing_upc, 
           'products_missing_department': products_missing_department,
           'products_missing_fineline': products_missing_fineline,
           'products_bought': products_bought,
           'products_returned': products_returned,
           'items_bought': items_bought,
           'items_returned': items_returned,
           'most_items_bought': most_items_bought,
           'most_items_returned': most_items_returned}
columns.update(department_presence)
columns.update(upc_length)
columns.update(upc_start)
final_test_data = pd.DataFrame(columns)
final_test_data = final_test_data.reset_index()

In [21]:
final_test_data['weekend'] = final_test_data['Weekday'].isin(['Saturday', 'Sunday'])
bought = final_test_data['products_bought'] > 0
returned = final_test_data['products_returned'] > 0
final_test_data['bought_only'] = bought & -returned
final_test_data['returned_only'] = -bought & returned
final_test_data['bought_and_returned'] = bought & returned
final_test_data = pd.get_dummies(final_test_data)
final_test_data = final_test_data.astype(int)
final_test_data.head()

Unnamed: 0,VisitNumber,department_1-HR PHOTO,department_ACCESSORIES,department_AUTOMOTIVE,department_BAKERY,department_BATH AND SHOWER,department_BEAUTY,department_BEDDING,department_BOOKS AND MAGAZINES,department_BOYS WEAR,...,bought_only,returned_only,bought_and_returned,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
0,1,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
1,2,0,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,6,0,0,0,0,0,0,0,0,2,...,0,0,1,1,0,0,0,0,0,0


In [27]:
list(final_train_data.drop(['VisitNumber', 'TripType'], 1).columns) == list(final_test_data.drop('VisitNumber', 1).columns)

True

In [28]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

In [29]:
train_features = final_train_data.drop(['VisitNumber', 'TripType'], 1).values
train_target = final_train_data['TripType'].values
test_features = final_test_data.drop('VisitNumber', 1).values
test_labels = final_test_data['VisitNumber'].values

In [30]:
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)

In [31]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [32]:
abc_tuned_parameters = {'learning_rate': [0.01, 0.1, 1.0, 10.0, 100.0]}
abc = GridSearchCV(AdaBoostClassifier(random_state=2), abc_tuned_parameters, cv=skf, scoring='neg_log_loss')
abc.fit(train_features, train_target)
abc.cv_results_

{'mean_fit_time': array([ 11.50288087,  11.40365791,  11.36808312,  10.61351198,   2.80020684]),
 'mean_score_time': array([ 1.43327081,  1.44201857,  1.42591971,  1.58328068,  0.68216628]),
 'mean_test_score': array([-2.63332501, -3.14435459, -3.54796036, -3.48363401, -4.21261813]),
 'mean_train_score': array([-2.63220229, -3.14258322, -3.54771205, -3.48407133, -4.2125195 ]),
 'param_learning_rate': masked_array(data = [0.01 0.1 1.0 10.0 100.0],
              mask = [False False False False False],
        fill_value = ?),
 'params': ({'learning_rate': 0.01},
  {'learning_rate': 0.1},
  {'learning_rate': 1.0},
  {'learning_rate': 10.0},
  {'learning_rate': 100.0}),
 'rank_test_score': array([1, 2, 4, 3, 5]),
 'split0_test_score': array([-2.6367509 , -3.13548174, -3.54924006, -3.5230011 , -4.43147862]),
 'split0_train_score': array([-2.63278616, -3.13356549, -3.54673656, -3.52157559, -4.42743569]),
 'split1_test_score': array([-2.63583212, -3.13809147, -3.5426739 , -3.51291792, -3.9938

In [33]:
dtc_tuned_parameters = {'max_features': ["auto", None]}
dtc = GridSearchCV(DecisionTreeClassifier(), dtc_tuned_parameters, cv=skf, scoring='neg_log_loss')
dtc.fit(train_features, train_target)
dtc.cv_results_

{'mean_fit_time': array([ 0.3960824 ,  2.36718559]),
 'mean_score_time': array([ 0.05000836,  0.0488798 ]),
 'mean_test_score': array([-14.60213785, -13.17504545]),
 'mean_train_score': array([-0.06551722, -0.06551722]),
 'param_max_features': masked_array(data = ['auto' None],
              mask = [False False],
        fill_value = ?),
 'params': ({'max_features': 'auto'}, {'max_features': None}),
 'rank_test_score': array([2, 1]),
 'split0_test_score': array([-14.64360036, -13.17534283]),
 'split0_train_score': array([-0.0650156, -0.0650156]),
 'split1_test_score': array([-14.46710962, -13.35764745]),
 'split1_train_score': array([-0.06538183, -0.06538183]),
 'split2_test_score': array([-14.80559316, -12.97418597]),
 'split2_train_score': array([-0.06642838, -0.06642838]),
 'split3_test_score': array([-14.49221808, -13.19295783]),
 'split3_train_score': array([-0.06524309, -0.06524309]),
 'std_fit_time': array([ 0.00483311,  0.02081822]),
 'std_score_time': array([ 0.00114143,  0.00

In [34]:
gbc_tuned_parameters = {'max_depth': [1, 3, 5]}
gbc = GridSearchCV(GradientBoostingClassifier(random_state=3), gbc_tuned_parameters, cv=skf, scoring='neg_log_loss')
gbc.fit(train_features, train_target)
gbc.cv_results_

{'mean_fit_time': array([  604.96244514,  1537.41164178,  2730.91880566]),
 'mean_score_time': array([ 1.74463266,  3.31623209,  3.98152214]),
 'mean_test_score': array([-1.18944818, -0.92850169, -0.91345318]),
 'mean_train_score': array([-1.17652046, -0.81650617, -0.55691493]),
 'param_max_depth': masked_array(data = [1 3 5],
              mask = [False False False],
        fill_value = ?),
 'params': ({'max_depth': 1}, {'max_depth': 3}, {'max_depth': 5}),
 'rank_test_score': array([3, 2, 1]),
 'split0_test_score': array([-1.19847887, -0.93153067, -0.91373174]),
 'split0_train_score': array([-1.17412471, -0.81252733, -0.55954654]),
 'split1_test_score': array([-1.18973976, -0.934813  , -0.92763509]),
 'split1_train_score': array([-1.17539963, -0.81526948, -0.55958427]),
 'split2_test_score': array([-1.1785933 , -0.92106814, -0.89876791]),
 'split2_train_score': array([-1.17853974, -0.81924957, -0.55536556]),
 'split3_test_score': array([-1.19097404, -0.92659004, -0.91367369]),
 'spli

In [35]:
lr_tuned_parameters = {'C': [0.01, 0.1, 1.0, 10.0, 100.0]}
lr = GridSearchCV(LogisticRegression(random_state=4), lr_tuned_parameters, cv=skf, scoring='neg_log_loss')
lr.fit(train_features, train_target)
lr.cv_results_

  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)
  np.exp(prob, prob)


{'mean_fit_time': array([  33.5163874 ,   51.60751903,   82.71872503,  130.88320577,
         180.13146788]),
 'mean_score_time': array([ 0.06659031,  0.06216478,  0.06377643,  0.06091189,  0.06074685]),
 'mean_test_score': array([-1.35202104, -1.18208179, -1.16240774, -1.16683369, -1.17256691]),
 'mean_train_score': array([-1.33670425, -1.14877198, -1.11143156, -1.10430162, -1.10275899]),
 'param_C': masked_array(data = [0.01 0.1 1.0 10.0 100.0],
              mask = [False False False False False],
        fill_value = ?),
 'params': ({'C': 0.01}, {'C': 0.1}, {'C': 1.0}, {'C': 10.0}, {'C': 100.0}),
 'rank_test_score': array([5, 4, 1, 2, 3]),
 'split0_test_score': array([-1.36405076, -1.19064384, -1.16980657, -1.17390215, -1.17954232]),
 'split0_train_score': array([-1.33284535, -1.14516835, -1.10782355, -1.10070329, -1.09914631]),
 'split1_test_score': array([-1.34961273, -1.18204964, -1.16253674, -1.16741701, -1.17281837]),
 'split1_train_score': array([-1.33752619, -1.14872844, -1.

In [36]:
rfc_tuned_parameters = {'max_depth': [1, 5, 10, None]}
rfc = GridSearchCV(RandomForestClassifier(random_state=5), rfc_tuned_parameters, cv=skf, scoring='neg_log_loss')
rfc.fit(train_features, train_target)
rfc.cv_results_

{'mean_fit_time': array([ 0.29782265,  0.58417875,  0.90527368,  2.14747155]),
 'mean_score_time': array([ 0.09610689,  0.10289866,  0.11530662,  0.16815704]),
 'mean_test_score': array([-2.7779433 , -2.14289792, -1.68994732, -4.33642646]),
 'mean_train_score': array([-2.77736742, -2.13567952, -1.61324482, -0.26125066]),
 'param_max_depth': masked_array(data = [1 5 10 None],
              mask = [False False False False],
        fill_value = ?),
 'params': ({'max_depth': 1},
  {'max_depth': 5},
  {'max_depth': 10},
  {'max_depth': None}),
 'rank_test_score': array([3, 2, 1, 4]),
 'split0_test_score': array([-2.77958797, -2.15772784, -1.69016593, -4.35925619]),
 'split0_train_score': array([-2.77514156, -2.1451049 , -1.60067406, -0.26046047]),
 'split1_test_score': array([-2.77904016, -2.15441681, -1.69562854, -4.37251837]),
 'split1_train_score': array([-2.77810036, -2.14138433, -1.61272159, -0.25975056]),
 'split2_test_score': array([-2.7744201 , -2.13861698, -1.66764386, -4.22680101

In [37]:
predict_test_target = gbc.predict_proba(test_features)
test_target = pd.DataFrame(predict_test_target)

In [39]:
trip_types = gbc.best_estimator_.classes_
trip_types = ['TripType_' + str(t) for t in list(trip_types)]
test_target.columns = trip_types

In [41]:
test_target['VisitNumber'] = pd.Series(test_labels)

In [42]:
cols = test_target.columns.tolist()
cols = cols[-1:] + cols[:-1]
test_target[cols].to_csv('test_predictions.csv', index=False)