In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

% matplotlib inline

In [2]:
aisles_df = pd.read_csv("data/aisles.csv")
departments_df = pd.read_csv("data/departments.csv")
products_df = pd.read_csv("data/products.csv")
orders_df = pd.read_csv("data/orders.csv")
prior_df = pd.read_csv("data/order_products__prior.csv")
train_df = pd.read_csv("data/order_products__train.csv")
products_df_merged = (products_df
                      .merge(departments_df, on="department_id")
                      .merge(aisles_df, on="aisle_id"))

In [3]:
orders_df['absolute_date'] = orders_df.groupby("user_id").days_since_prior_order.cumsum().fillna(0)

In [4]:
orders_df['max_order_number'] = orders_df.groupby("user_id").order_number.transform(max)
orders_df['max_absolute_date'] = orders_df.groupby("user_id").absolute_date.transform(max)

In [5]:
orders_df['reverse_date'] = orders_df.max_absolute_date - orders_df.absolute_date
orders_df['reverse_order_number'] = orders_df.max_order_number - orders_df.order_number

In [6]:
train_df = train_df.merge(orders_df[["order_id", "user_id"]], on="order_id")

In [7]:
prior_df = prior_df.merge(orders_df, on="order_id")

In [8]:
prior_df['order_dow_angle'] = (prior_df.order_dow / 
                                     (prior_df.order_dow.max() + 1) * 2 * np.pi)
prior_df['order_hod_angle'] = (prior_df.order_hour_of_day / 
                                             (prior_df.order_dow.max() + 1) * 2 * np.pi)

In [9]:
prior_df['order_dow_sin'] = np.sin(prior_df.order_dow_angle)
prior_df['order_dow_cos'] = np.cos(prior_df.order_dow_angle)
prior_df['order_hod_sin'] = np.sin(prior_df.order_hod_angle)
prior_df['order_hod_cos'] = np.cos(prior_df.order_hod_angle)
prior_df['num_products'] = 1
prior_df['num_products_dw_8'] = np.exp(-np.log(2)/8 * prior_df.reverse_date)
prior_df['num_products_dw_16'] = np.exp(-np.log(2)/16 * prior_df.reverse_date)
prior_df['num_products_dw_32'] = np.exp(-np.log(2)/32 * prior_df.reverse_date)
prior_df['num_products_dw_64'] = np.exp(-np.log(2)/64 * prior_df.reverse_date)
prior_df['num_products_dw_128'] = np.exp(-np.log(2)/128 * prior_df.reverse_date)
prior_df['num_products_ow_2'] = np.exp(-np.log(2)/2 * prior_df.reverse_order_number)
prior_df['num_products_ow_4'] = np.exp(-np.log(2)/4 * prior_df.reverse_order_number)
prior_df['num_products_ow_8'] = np.exp(-np.log(2)/8 * prior_df.reverse_order_number)
prior_df['num_products_ow_16'] = np.exp(-np.log(2)/16 * prior_df.reverse_order_number)
prior_df['num_products_ow_32'] = np.exp(-np.log(2)/32 * prior_df.reverse_order_number)

In [11]:
prior_product_stats = prior_df.groupby("product_id").agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': np.sum})
prior_product_stats['order_dow_angle'] = np.arctan2(prior_product_stats.order_dow_sin, prior_product_stats.order_dow_cos)
prior_product_stats['order_hod_angle'] = np.arctan2(prior_product_stats.order_hod_sin, prior_product_stats.order_hod_cos)

In [12]:
prior_product_stats.order_dow_sin = np.sin(prior_product_stats.order_dow_angle)
prior_product_stats.order_dow_cos = np.cos(prior_product_stats.order_dow_angle)
prior_product_stats.order_hod_sin = np.sin(prior_product_stats.order_hod_angle)
prior_product_stats.order_hod_cos = np.cos(prior_product_stats.order_hod_angle)
prior_product_stats.drop(['order_dow_angle', 'order_hod_angle'], axis=1, inplace=True)
prior_product_stats.reset_index(inplace=True)
prior_product_stats.columns = ['product_id', 'product_dow_sin', 'product_dow_cos', 'product_hod_sin', 'product_hod_cos', 'product_num_purchases']

In [14]:
prior_indorder_stats = prior_df.groupby(["user_id", "order_id"]).agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': np.sum,
                                                                      'absolute_date': np.max,
                                                                     'order_number': np.max}).reset_index()

In [15]:
prior_individual_stats = prior_indorder_stats.groupby("user_id").agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': [np.sum, np.mean],
                                                                'absolute_date': np.max,      
                                                                'order_number': np.max})

In [17]:
prior_individual_stats.columns = ['order_dow_sin', 'order_dow_cos',
                                  'order_hod_sin', 'order_hod_cos', 
                                   'num_products', 'mean_products', 'max_absolute_date', "max_order_number"]

In [18]:
prior_individual_stats['order_dow_angle'] = np.arctan2(prior_individual_stats.order_dow_sin, prior_individual_stats.order_dow_cos)
prior_individual_stats['order_hod_angle'] = np.arctan2(prior_individual_stats.order_hod_sin, prior_individual_stats.order_hod_cos)
prior_individual_stats.order_dow_sin = np.sin(prior_individual_stats.order_dow_angle)
prior_individual_stats.order_dow_cos = np.cos(prior_individual_stats.order_dow_angle)
prior_individual_stats.order_hod_sin = np.sin(prior_individual_stats.order_hod_angle)
prior_individual_stats.order_hod_cos = np.cos(prior_individual_stats.order_hod_angle)
prior_individual_stats.drop(['order_dow_angle', 'order_hod_angle'], axis=1, inplace=True)

In [19]:
prior_individual_stats = prior_individual_stats.reset_index()

In [21]:
prior_individual_stats.columns = ['user_id', 'user_dow_sin',
                                  'user_dow_cos', 'user_hod_sin',
                                  'user_hod_cos', 'user_num_products',
                                  'user_mean_products', 'user_num_days',
                                 'user_num_orders']
prior_individual_stats['user_days_per_order'] = prior_individual_stats.user_num_days / prior_individual_stats.user_num_orders

In [22]:
prior_indorder_stats = prior_indorder_stats[['order_id', 'num_products']]
prior_indorder_stats.columns = ['order_id', 'num_products_in_order']

In [23]:
prior_indprod_stats = (prior_df.merge(prior_indorder_stats[['order_id', 'num_products_in_order']], on='order_id')
                       .merge(prior_individual_stats[['user_id', 'user_num_orders', 'user_num_days']], on='user_id', how='left'))
prior_indprod_stats['add_to_cart_proportion'] = prior_indprod_stats['add_to_cart_order'] / prior_indprod_stats['num_products_in_order']
prior_indprod_stats['indprod_inorder_1'] = 1 * (prior_indprod_stats.order_number == prior_indprod_stats.user_num_orders)
prior_indprod_stats['indprod_inorder_2'] = 1 * (prior_indprod_stats.order_number == prior_indprod_stats.user_num_orders - 1)
prior_indprod_stats['indprod_inorder_3'] = 1 * (prior_indprod_stats.order_number == prior_indprod_stats.user_num_orders - 2)
prior_indprod_stats = prior_indprod_stats.groupby(["user_id", "product_id"]).agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': np.sum,
                                                               'num_products_dw_8': np.sum,
                                                                'num_products_dw_16': np.sum,
                                                                'num_products_dw_32': np.sum,
                                                                'num_products_dw_64': np.sum,
                                                                'num_products_dw_128': np.sum,
                                                                'num_products_ow_2': np.sum,           
                                                                'num_products_ow_4': np.sum,
                                                                'num_products_ow_8': np.sum,                  
                                                                'num_products_ow_16': np.sum,
                                                                'num_products_ow_32': np.sum,                                   
                                                           'add_to_cart_order': np.mean,
                                                           'add_to_cart_proportion': np.mean,
                                                           'indprod_inorder_1': np.sum,
                                                           'indprod_inorder_2': np.sum,
                                                           'indprod_inorder_3': np.sum,
                                                            'user_num_orders': np.mean,
                                                            'user_num_days': np.mean,
                                                            'reverse_date': np.min,
                                                            'reverse_order_number': np.min}).reset_index()

In [24]:
prior_indprod_stats['order_dow_angle'] = np.arctan2(prior_indprod_stats.order_dow_sin, prior_indprod_stats.order_dow_cos)
prior_indprod_stats['order_hod_angle'] = np.arctan2(prior_indprod_stats.order_hod_sin, prior_indprod_stats.order_hod_cos)
prior_indprod_stats['proportion_orders'] = prior_indprod_stats.num_products / (prior_indprod_stats.user_num_orders)
prior_indprod_stats['days_per_order'] = prior_indprod_stats.user_num_days / (prior_indprod_stats.num_products)

In [25]:
prior_indprod_stats.order_dow_sin = np.sin(prior_indprod_stats.order_dow_angle)
prior_indprod_stats.order_dow_cos = np.cos(prior_indprod_stats.order_dow_angle)
prior_indprod_stats.order_hod_sin = np.sin(prior_indprod_stats.order_hod_angle)
prior_indprod_stats.order_hod_cos = np.cos(prior_indprod_stats.order_hod_angle)
prior_indprod_stats.drop(['order_dow_angle', 'order_hod_angle', 'user_num_orders', 'user_num_days'], axis=1, inplace=True)

In [27]:
prior_indprod_stats.columns = ['user_id', 'product_id', 'indprod_dow_sin', 
                               'indprod_dow_cos', 'indprod_hod_sin',
                               'indprod_hod_cos', 'indprod_num_products',
                               'indprod_num_products_dw_8', 'indprod_num_products_dw_16', 
                               'indprod_num_products_dw_32', 'indprod_num_products_dw_64', 
                               'indprod_num_products_dw_128', 'indprod_num_products_ow_2',
                               'indprod_num_products_ow_4', 'indprod_num_products_ow_8',
                               'indprod_num_products_ow_16', 'indprod_num_products_ow_32',
                               'indprod_add_to_cart_order', 'indprod_add_to_cart_proportion',
                               'indprod_inorder_1', 'indprod_inorder_2', 'indprod_inorder_3',
                               'indprod_days_since_last', 'indprod_orders_since_last',
                                'indprod_proportion_orders', 'indprod_days_per_order']

In [28]:
# use indprod means to add more product stats
product_order_proportions = (prior_indprod_stats[['user_id', 'product_id', 
                                                  'indprod_proportion_orders', 'indprod_days_per_order']]
                             .groupby("product_id")
                             .agg({'indprod_proportion_orders': np.mean,
                                  'indprod_days_per_order': np.mean})).reset_index()
product_order_proportions.columns = ['product_id', 'product_proportion_orders', 'product_days_per_order']
prior_product_stats = prior_product_stats.merge(product_order_proportions, on='product_id')

In [30]:
prior_all_stats = prior_indprod_stats.merge(prior_individual_stats, on="user_id", how="left").merge(prior_product_stats, on='product_id', how="left")

In [31]:
orders_df_last = orders_df[orders_df.eval_set != "prior"].copy()
orders_df_last['order_dow_angle'] = (orders_df_last.order_dow / 
                                     (orders_df_last.order_dow.max() + 1) * 2 * np.pi - np.pi)
orders_df_last['order_hod_angle'] = (orders_df_last.order_hour_of_day / 
                                             (orders_df_last.order_dow.max() + 1) * 2 * np.pi - np.pi)
orders_df_last['order_dow_sin'] = np.sin(orders_df_last.order_dow_angle)
orders_df_last['order_dow_cos'] = np.cos(orders_df_last.order_dow_angle)
orders_df_last['order_hod_sin'] = np.sin(orders_df_last.order_hod_angle)
orders_df_last['order_hod_cos'] = np.cos(orders_df_last.order_hod_angle)

In [32]:
orders_df_last.drop(["order_number", "order_dow", "order_hour_of_day", "order_dow_angle", "order_hod_angle"], axis=1, inplace=True)

In [33]:
prior_all_stats = prior_all_stats.merge(orders_df_last, on="user_id", how="inner")

In [34]:
products_df_merged = products_df_merged.join(pd.get_dummies(products_df_merged.aisle))

In [35]:
products_df_merged.drop(['product_name', 'aisle_id', 'department_id', 'department', 'aisle'], axis=1, inplace=True)

In [36]:
prior_all_stats = prior_all_stats.merge(products_df_merged, on="product_id")

In [37]:
prior_all_stats = prior_all_stats.merge(train_df[['user_id', 'product_id', 'reordered']], how="left", on=["user_id", "product_id"])
prior_all_stats.reordered = prior_all_stats.reordered.fillna(0)                  


In [38]:
# split into train, validation, and test sets
prior_all_stats['validation_set'] = 0
prior_all_stats['prediction'] = 0
valid_users = prior_all_stats.loc[prior_all_stats.eval_set == "train", "user_id"].unique()
valid_users = pd.Series(valid_users).sample(frac=.1)


In [39]:
prior_all_stats.loc[prior_all_stats.user_id.isin(valid_users), 'validation_set'] = 1


In [40]:

prior_train = prior_all_stats.loc[(prior_all_stats.eval_set == "train") & (prior_all_stats.validation_set == 0)]
prior_valid = prior_all_stats.loc[prior_all_stats.validation_set == 1]
prior_test = prior_all_stats.loc[prior_all_stats.eval_set == "test"]

X_train = prior_train.drop(["prediction", "eval_set", "validation_set", "order_id", "reordered", "user_id", "product_id"], axis=1).as_matrix()
y_train = prior_train.reordered.as_matrix()
X_valid = prior_valid.drop(["prediction", "eval_set", "validation_set", "order_id", "reordered", "user_id", "product_id"], axis=1).as_matrix()
y_valid = prior_valid.reordered.as_matrix()
X_test = prior_test.drop(["prediction", "eval_set", "validation_set", "order_id", "reordered", "user_id", "product_id"], axis=1).as_matrix()

In [41]:
import xgboost as xgb

In [42]:
# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'auc'
params['eta'] = 0.1
params['max_depth'] = 6
params['nthread'] = 12
# params['scale_pos_weight'] = (1 - y_train.mean())/(y_train.mean())

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, verbose_eval=10)

[0]	train-auc:0.823477	valid-auc:0.821979
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[10]	train-auc:0.827432	valid-auc:0.825496
[20]	train-auc:0.828911	valid-auc:0.826925
[30]	train-auc:0.830239	valid-auc:0.828104
[40]	train-auc:0.831434	valid-auc:0.829168
[50]	train-auc:0.832322	valid-auc:0.830005
[60]	train-auc:0.833089	valid-auc:0.830559
[70]	train-auc:0.833614	valid-auc:0.830936
[80]	train-auc:0.834052	valid-auc:0.831192
[90]	train-auc:0.834441	valid-auc:0.831436
[100]	train-auc:0.834738	valid-auc:0.831576
[110]	train-auc:0.835027	valid-auc:0.831715
[120]	train-auc:0.835298	valid-auc:0.831839
[130]	train-auc:0.835565	valid-auc:0.83198
[140]	train-auc:0.835762	valid-auc:0.832061
[150]	train-auc:0.836032	valid-auc:0.832205
[160]	train-auc:0.836243	valid-auc:0.832291
[170]	train-auc:0.836451	valid-auc:0.83236
[180]	train-auc:0.836649	valid-auc:0.832438
[190]	train-auc:0.83686	valid-auc:

In [43]:
bst.save_model('dump_addtime.model')

In [45]:
y_predicted = bst.predict(d_valid)

In [46]:
prior_valid = prior_valid.copy()

In [74]:
best_reorder_cutoff = 0
best_cutoff_f1 = 0
best_none_cutoff = 0
for reorder_cutoff in np.arange(.1, .4, .005):
    prior_valid.loc[:,'prediction'] = 1 * (y_predicted > reorder_cutoff)
    prior_valid['p_not'] = 1 - y_predicted
    prior_valid['hit'] = (prior_valid.reordered * prior_valid.prediction)
    prior_valid_agg = prior_valid.groupby("user_id").agg({'reordered': np.sum, 
                                                      'prediction': np.sum, 
                                                      'hit': np.sum,
                                                         'p_not': np.prod})
    for none_cutoff in np.arange(0, .1, .002):
        prior_valid_agg['putnone'] = (prior_valid_agg.p_not > none_cutoff) | (prior_valid_agg.prediction == 0)
        #prior_valid_agg['putnone'] = (prior_valid_agg.prediction == 0)
        prior_valid_agg['truenone'] = (prior_valid_agg.reordered == 0)
        prior_valid_agg['r'] = prior_valid_agg.reordered
        prior_valid_agg['p'] = prior_valid_agg.prediction
        prior_valid_agg['h'] = prior_valid_agg.hit
        prior_valid_agg.loc[prior_valid_agg.putnone & prior_valid_agg.truenone, "h"] = 1
        prior_valid_agg.loc[prior_valid_agg.putnone, 'p'] = prior_valid_agg.loc[prior_valid_agg.putnone, 'p'] + 1
        prior_valid_agg.loc[prior_valid_agg.truenone, 'r'] = prior_valid_agg.loc[prior_valid_agg.truenone, 'r'] + 1
        #prior_valid_agg.loc[(prior_valid_agg.reordered == 0), 'reordered'] = 1
        #prior_valid_agg.loc[(prior_valid_agg.prediction == 0), 'prediction'] = 1
        prior_valid_agg['precision'] = (prior_valid_agg['h']) / (prior_valid_agg['p'])
        prior_valid_agg['recall'] = (prior_valid_agg['h']) / (prior_valid_agg['r'])
        prior_valid_agg['f1'] = 2 * prior_valid_agg['precision'] * prior_valid_agg['recall'] / (prior_valid_agg['precision'] + prior_valid_agg['recall'] + .000001)
        if prior_valid_agg['f1'].mean() > best_cutoff_f1:
            best_cutoff_f1 = prior_valid_agg['f1'].mean()
            best_reorder_cutoff = reorder_cutoff
            best_none_cutoff = none_cutoff
print("best reorder cutoff:", best_reorder_cutoff)
print("best none cutoff:", best_none_cutoff)
print("best f1:", best_cutoff_f1)

0.185
0.068
0.389738163363


In [49]:
d_test = xgb.DMatrix(X_test)
y_test = bst.predict(d_test)

In [76]:
prior_test = prior_test.copy()
prior_test['prediction'] = 1 * (y_test > best_reorder_cutoff)
prior_test['p_not'] = 1 - y_test

In [77]:
writenone_df = prior_test.groupby('order_id').agg({'p_not': np.prod, 'prediction': np.sum}).reset_index()

In [78]:
writenone_df['putnone'] = (writenone_df.p_not > best_none_cutoff) | (writenone_df.prediction == 0)
writenone_df['nonestring'] = ''
writenone_df.loc[writenone_df.putnone, 'nonestring'] = 'None'

In [80]:
prediction_df = prior_test[prior_test['prediction'] == 1].copy()

In [81]:
prediction_df = prediction_df[['order_id', 'product_id']]

In [82]:
prediction_lists = prediction_df.groupby('order_id').agg(lambda x: " ".join(x.astype(str))).reset_index()

In [83]:
prediction_lists = prediction_lists.merge(writenone_df[['order_id', 'nonestring']], on='order_id', how='right')

In [84]:
prediction_lists['products'] = prediction_lists.product_id.fillna('')

In [85]:
prediction_lists['products'] = prediction_lists.products + " " + prediction_lists.nonestring

In [86]:
prediction_lists = prediction_lists[['order_id', 'products']]

In [88]:
prediction_lists.to_csv("submissions/nonecutoff_xgb.csv", index=False)