In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

% matplotlib inline

In [2]:
aisles_df = pd.read_csv("data/aisles.csv")
departments_df = pd.read_csv("data/departments.csv")
products_df = pd.read_csv("data/products.csv")
orders_df = pd.read_csv("data/orders.csv")
prior_df = pd.read_csv("data/order_products__prior.csv")
train_df = pd.read_csv("data/order_products__train.csv")
products_df_merged = (products_df
                      .merge(departments_df, on="department_id")
                      .merge(aisles_df, on="aisle_id"))

In [3]:
train_df = train_df.merge(orders_df[["order_id", "user_id"]], on="order_id")

In [4]:
prior_df = prior_df.merge(orders_df, on="order_id")

In [5]:
prior_df['order_dow_angle'] = (prior_df.order_dow / 
                                     (prior_df.order_dow.max() + 1) * 2 * np.pi)
prior_df['order_hod_angle'] = (prior_df.order_hour_of_day / 
                                             (prior_df.order_dow.max() + 1) * 2 * np.pi)

In [6]:
prior_df['order_dow_sin'] = np.sin(prior_df.order_dow_angle)
prior_df['order_dow_cos'] = np.cos(prior_df.order_dow_angle)
prior_df['order_hod_sin'] = np.sin(prior_df.order_hod_angle)
prior_df['order_hod_cos'] = np.cos(prior_df.order_hod_angle)
prior_df['num_products'] = 1


In [7]:
prior_product_stats = prior_df.groupby("product_id").agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': np.sum})
prior_product_stats['order_dow_angle'] = np.arctan2(prior_product_stats.order_dow_sin, prior_product_stats.order_dow_cos)
prior_product_stats['order_hod_angle'] = np.arctan2(prior_product_stats.order_hod_sin, prior_product_stats.order_hod_cos)

In [8]:
prior_product_stats.order_dow_sin = np.sin(prior_product_stats.order_dow_angle)
prior_product_stats.order_dow_cos = np.cos(prior_product_stats.order_dow_angle)
prior_product_stats.order_hod_sin = np.sin(prior_product_stats.order_hod_angle)
prior_product_stats.order_hod_cos = np.cos(prior_product_stats.order_hod_angle)
prior_product_stats.drop(['order_dow_angle', 'order_hod_angle'], axis=1, inplace=True)
prior_product_stats.reset_index(inplace=True)
prior_product_stats.columns = ['product_id', 'product_dow_sin', 'product_dow_cos', 'product_hod_sin', 'product_hod_cos', 'product_num_purchases']

In [9]:
prior_indorder_stats = prior_df.groupby(["user_id", "order_id"]).agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': np.sum}).reset_index()

In [10]:
prior_indorder_stats['num_orders'] = 1
prior_individual_stats = prior_indorder_stats.groupby("user_id").agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': [np.sum, np.mean],
                                                                'num_orders': np.sum})

In [11]:
prior_individual_stats.columns = ['order_dow_sin', 'order_dow_cos', 'order_hod_sin', 'order_hod_cos', 'num_products', 'mean_products', "num_orders"]

In [12]:
prior_individual_stats['order_dow_angle'] = np.arctan2(prior_individual_stats.order_dow_sin, prior_individual_stats.order_dow_cos)
prior_individual_stats['order_hod_angle'] = np.arctan2(prior_individual_stats.order_hod_sin, prior_individual_stats.order_hod_cos)
prior_individual_stats.order_dow_sin = np.sin(prior_individual_stats.order_dow_angle)
prior_individual_stats.order_dow_cos = np.cos(prior_individual_stats.order_dow_angle)
prior_individual_stats.order_hod_sin = np.sin(prior_individual_stats.order_hod_angle)
prior_individual_stats.order_hod_cos = np.cos(prior_individual_stats.order_hod_angle)
prior_individual_stats.drop(['order_dow_angle', 'order_hod_angle'], axis=1, inplace=True)

In [13]:
prior_individual_stats = prior_individual_stats.reset_index()

In [14]:
prior_individual_stats.columns = ['user_id', 'user_dow_sin', 'user_dow_cos', 'user_hod_sin', 'user_hod_cos', 'user_num_products', 'user_mean_products', 'user_num_orders']

In [15]:
prior_indorder_stats = prior_indorder_stats[['order_id', 'num_products']]
prior_indorder_stats.columns = ['order_id', 'num_products_in_order']

In [16]:
prior_indprod_stats = (prior_df.merge(prior_indorder_stats[['order_id', 'num_products_in_order']], on='order_id')
                       .merge(prior_individual_stats[['user_id', 'user_num_orders']], on='user_id', how='left'))
prior_indprod_stats['add_to_cart_proportion'] = prior_indprod_stats['add_to_cart_order'] / prior_indprod_stats['num_products_in_order']
prior_indprod_stats['indprod_inorder_1'] = 1 * (prior_indprod_stats.order_number == prior_indprod_stats.user_num_orders)
prior_indprod_stats['indprod_inorder_2'] = 1 * (prior_indprod_stats.order_number == prior_indprod_stats.user_num_orders - 1)
prior_indprod_stats['indprod_inorder_3'] = 1 * (prior_indprod_stats.order_number == prior_indprod_stats.user_num_orders - 2)
prior_indprod_stats = prior_indprod_stats.groupby(["user_id", "product_id"]).agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': np.sum,
                                                           'add_to_cart_order': np.mean,
                                                           'add_to_cart_proportion': np.mean,
                                                           'indprod_inorder_1': np.sum,
                                                           'indprod_inorder_2': np.sum,
                                                           'indprod_inorder_3': np.sum,
                                                            'user_num_orders': np.mean}).reset_index()

In [17]:
prior_indprod_stats['order_dow_angle'] = np.arctan2(prior_indprod_stats.order_dow_sin, prior_indprod_stats.order_dow_cos)
prior_indprod_stats['order_hod_angle'] = np.arctan2(prior_indprod_stats.order_hod_sin, prior_indprod_stats.order_hod_cos)
prior_indprod_stats['proportion_orders'] = prior_indprod_stats.num_products / prior_indprod_stats.user_num_orders

In [18]:
prior_indprod_stats.order_dow_sin = np.sin(prior_indprod_stats.order_dow_angle)
prior_indprod_stats.order_dow_cos = np.cos(prior_indprod_stats.order_dow_angle)
prior_indprod_stats.order_hod_sin = np.sin(prior_indprod_stats.order_hod_angle)
prior_indprod_stats.order_hod_cos = np.cos(prior_indprod_stats.order_hod_angle)
prior_indprod_stats.drop(['order_dow_angle', 'order_hod_angle', 'user_num_orders'], axis=1, inplace=True)

In [19]:
prior_indprod_stats.columns = ['user_id', 'product_id', 'indprod_dow_sin', 
                               'indprod_dow_cos', 'indprod_hod_sin',
                               'indprod_hod_cos', 'indprod_num_products', 
                               'indprod_add_to_cart_order', 'indprod_add_to_cart_proportion',
                               'indprod_inorder_1', 'indprod_inorder_2', 'indprod_inorder_3',
                                'indprod_proportion_orders']

In [20]:
prior_all_stats = prior_indprod_stats.merge(prior_individual_stats, on="user_id", how="left").merge(prior_product_stats, on='product_id', how="left")

In [21]:
orders_df_last = orders_df[orders_df.eval_set != "prior"].copy()
orders_df_last['order_dow_angle'] = (orders_df_last.order_dow / 
                                     (orders_df_last.order_dow.max() + 1) * 2 * np.pi - np.pi)
orders_df_last['order_hod_angle'] = (orders_df_last.order_hour_of_day / 
                                             (orders_df_last.order_dow.max() + 1) * 2 * np.pi - np.pi)
orders_df_last['order_dow_sin'] = np.sin(orders_df_last.order_dow_angle)
orders_df_last['order_dow_cos'] = np.cos(orders_df_last.order_dow_angle)
orders_df_last['order_hod_sin'] = np.sin(orders_df_last.order_hod_angle)
orders_df_last['order_hod_cos'] = np.cos(orders_df_last.order_hod_angle)

In [22]:
orders_df_last.drop(["order_number", "order_dow", "order_hour_of_day", "order_dow_angle", "order_hod_angle"], axis=1, inplace=True)

In [23]:
prior_all_stats = prior_all_stats.merge(orders_df_last, on="user_id", how="inner")

In [24]:
products_df_merged = products_df_merged.join(pd.get_dummies(products_df_merged.aisle))

In [25]:
products_df_merged.drop(['product_name', 'aisle_id', 'department_id', 'department', 'aisle'], axis=1, inplace=True)

In [26]:
prior_all_stats = prior_all_stats.merge(products_df_merged, on="product_id")

In [27]:
prior_all_stats = prior_all_stats.merge(train_df[['user_id', 'product_id', 'reordered']], how="left", on=["user_id", "product_id"])
prior_all_stats.reordered = prior_all_stats.reordered.fillna(0)                  


In [28]:
# split into train, validation, and test sets

In [29]:
prior_all_stats['validation_set'] = 0
prior_all_stats['prediction'] = 0
valid_users = prior_all_stats.loc[prior_all_stats.eval_set == "train", "user_id"].unique()
valid_users = pd.Series(valid_users).sample(frac=.1)


In [30]:
prior_all_stats.loc[prior_all_stats.user_id.isin(valid_users), 'validation_set'] = 1


In [31]:

prior_train = prior_all_stats.loc[(prior_all_stats.eval_set == "train") & (prior_all_stats.validation_set == 0)]
prior_valid = prior_all_stats.loc[prior_all_stats.validation_set == 1]
prior_test = prior_all_stats.loc[prior_all_stats.eval_set == "test"]

X_train = prior_train.drop(["prediction", "eval_set", "validation_set", "order_id", "reordered", "user_id", "product_id"], axis=1).as_matrix()
y_train = prior_train.reordered.as_matrix()
X_valid = prior_valid.drop(["prediction", "eval_set", "validation_set", "order_id", "reordered", "user_id", "product_id"], axis=1).as_matrix()
y_valid = prior_valid.reordered.as_matrix()
X_test = prior_test.drop(["prediction", "eval_set", "validation_set", "order_id", "reordered", "user_id", "product_id"], axis=1).as_matrix()

In [32]:
prior_valid.shape

(849206, 169)

In [33]:
prior_train.shape

(7625455, 169)

In [34]:
X_train.shape

(7625455, 162)

In [35]:
import xgboost as xgb

In [38]:
%%time
# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'auc'
params['eta'] = 0.1
params['max_depth'] = 6
params['nthread'] = 12

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 200, watchlist, early_stopping_rounds=50, verbose_eval=10)

#bst.fit(X_train, y_train, eval_set=[(X_trian, y_trian), (X_valid, y_valid)], early_stopping_rounds=50, eval_metric='logloss', verbose=True)

[0]	train-auc:0.806724	valid-auc:0.805657
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[10]	train-auc:0.814913	valid-auc:0.813414
[20]	train-auc:0.816949	valid-auc:0.815511
[30]	train-auc:0.818605	valid-auc:0.817013
[40]	train-auc:0.81999	valid-auc:0.818377
[50]	train-auc:0.821101	valid-auc:0.81939
[60]	train-auc:0.82206	valid-auc:0.820274
[70]	train-auc:0.822695	valid-auc:0.820791
[80]	train-auc:0.823138	valid-auc:0.821133
[90]	train-auc:0.823495	valid-auc:0.821377
[100]	train-auc:0.823829	valid-auc:0.821598
[110]	train-auc:0.824147	valid-auc:0.821837
[120]	train-auc:0.824398	valid-auc:0.82198
[130]	train-auc:0.82464	valid-auc:0.822108
[140]	train-auc:0.824877	valid-auc:0.822243
[150]	train-auc:0.825063	valid-auc:0.82235
[160]	train-auc:0.825237	valid-auc:0.822436
[170]	train-auc:0.825433	valid-auc:0.822538
[180]	train-auc:0.825601	valid-auc:0.822607
[190]	train-auc:0.825792	valid-auc:0.8

In [39]:
bst.dump_model('dump.raw.txt')

In [40]:
y_predicted = bst.predict(d_valid)

In [41]:
prior_valid = prior_valid.copy()

In [67]:
best_cutoff = 0
best_cutoff_f1 = 0
for cutoff in np.arange(.01, 1, .01):
    prior_valid.loc[:,'prediction'] = 1 * (y_predicted > cutoff)
    prior_valid['hit'] = (prior_valid.reordered * prior_valid.prediction)
    prior_valid_agg = prior_valid.groupby("user_id").agg({'reordered': np.sum, 
                                                      'prediction': np.sum, 
                                                      'hit': np.sum})
    prior_valid_agg.loc[(prior_valid_agg.reordered == 0) & (prior_valid_agg.prediction == 0), 'hit'] = 1
    prior_valid_agg.loc[(prior_valid_agg.reordered == 0), 'reordered'] = 1
    prior_valid_agg.loc[(prior_valid_agg.prediction == 0), 'prediction'] = 1
    prior_valid_agg['precision'] = (prior_valid_agg['hit']) / (prior_valid_agg['prediction'])
    prior_valid_agg['recall'] = (prior_valid_agg['hit']) / (prior_valid_agg['reordered'])
    prior_valid_agg['f1'] = 2 * prior_valid_agg['precision'] * prior_valid_agg['recall'] / (prior_valid_agg['precision'] + prior_valid_agg['recall'] + .0001)
    if prior_valid_agg['f1'].mean() > best_cutoff_f1:
        best_cutoff_f1 = prior_valid_agg['f1'].mean()
        best_cutoff = cutoff
print(best_cutoff)
print(best_cutoff_f1)

0.19
0.375459789116


In [51]:
prior_valid.loc[:,'prediction'] = 1 * (y_predicted > .93)

In [52]:
prior_valid['hit'] = (prior_valid.reordered * prior_valid.prediction)

In [53]:
prior_valid_agg = prior_valid.groupby("user_id").agg({'reordered': np.sum, 
                                                      'prediction': np.sum, 
                                                      'hit': np.sum})
prior_valid_agg.loc[(prior_valid_agg.reordered == 0) & (prior_valid_agg.prediction == 0), 'hit'] = 1
prior_valid_agg.loc[(prior_valid_agg.reordered == 0), 'reordered'] = 1
prior_valid_agg.loc[(prior_valid_agg.prediction == 0), 'prediction'] = 1

In [54]:
prior_valid_agg.hit.describe()

count    13121.000000
mean         0.064629
std          0.245880
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: hit, dtype: float64

In [55]:
prior_valid_agg['precision'] = (prior_valid_agg['hit']) / (prior_valid_agg['prediction'])
prior_valid_agg['recall'] = (prior_valid_agg['hit']) / (prior_valid_agg['reordered'])

In [61]:
prior_valid_agg.recall.describe()

count    13121.000000
mean         0.064629
std          0.245880
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: recall, dtype: float64

In [65]:
#prior_valid_agg.recall = prior_valid_agg.recall.fillna(0)
#prior_valid_agg.precision = prior_valid_agg.precision.fillna(0)
prior_valid_agg['f1'] = 2 * prior_valid_agg['precision'] * prior_valid_agg['recall'] / (prior_valid_agg['precision'] + prior_valid_agg['recall'] + .00001)

In [66]:
prior_valid_agg['f1'].mean()

0.06462889718933007

In [68]:
d_test = xgb.DMatrix(X_test)
y_test = bst.predict(d_test)

In [69]:
prior_test = prior_test.copy()
prior_test['prediction'] = 1 * (y_test > best_cutoff)

In [70]:
prediction_df = prior_test[prior_test['prediction'] == 1].copy()

In [71]:
prediction_df = prediction_df[['order_id', 'product_id']]

In [72]:
prediction_lists = prediction_df.groupby('order_id').agg(lambda x: " ".join(x.astype(str))).reset_index()

In [73]:
prediction_lists = prediction_lists.merge(orders_df.loc[orders_df.eval_set == "test", ['order_id']], on='order_id', how="right")

In [74]:
prediction_lists['products'] = prediction_lists.product_id.fillna("None")

In [75]:
prediction_lists = prediction_lists[['order_id', 'products']]

In [78]:
prediction_lists.to_csv("submissions/aisles_xgb.csv", index=False)

In [77]:
prediction_lists.shape

(75000, 2)

In [125]:
prediction_lists.head()

Unnamed: 0,order_id,products
0,17,13107
1,34,39180 47029
2,137,23794 24852 38689 41787
3,182,5479 9337 13629 21903 24009 27104 30391 33000 ...
4,257,4605 13870 21137 24852 27104 27966 29837 30233...
