In [4]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

% matplotlib inline

In [5]:
# Read the raw CSV tables from the local data/ directory.
orders_df = pd.read_csv("data/orders.csv")
prior_df = pd.read_csv("data/order_products__prior.csv")
train_df = pd.read_csv("data/order_products__train.csv")
products_df = pd.read_csv("data/products.csv")
departments_df = pd.read_csv("data/departments.csv")
aisles_df = pd.read_csv("data/aisles.csv")

# Product table joined with its department row and then its aisle row.
products_df_merged = pd.merge(
    pd.merge(products_df, departments_df, on="department_id"),
    aisles_df,
    on="aisle_id",
)

In [6]:
# Attach the user_id of each prior order line, looked up through order_id.
# Guarded so that re-running the cell does not merge a second time and leave
# suffixed user_id_x / user_id_y columns behind.
if "user_id" not in prior_df.columns:
    prior_df = prior_df.merge(orders_df[["order_id", "user_id"]], on="order_id")

In [7]:
# Attach the user_id of each train order line, looked up through order_id.
# Guarded so that re-running the cell does not merge a second time and leave
# suffixed user_id_x / user_id_y columns behind.
if "user_id" not in train_df.columns:
    train_df = train_df.merge(orders_df[["order_id", "user_id"]], on="order_id")

In [8]:
# user_id of every order whose eval_set is "train".
train_users = orders_df.loc[orders_df["eval_set"] == "train", "user_id"]

In [9]:
# Draw a 10,000-user subsample to keep feature building and training fast.
# random_state is fixed so the same users are drawn after a kernel restart.
train_users_small = train_users.sample(n=10000, random_state=42)

In [10]:
# Keep only the rows of each table that belong to the sampled users.
train_df_small = train_df.loc[train_df["user_id"].isin(train_users_small)]
prior_df_small = prior_df.loc[prior_df["user_id"].isin(train_users_small)]
orders_df_small = orders_df.loc[orders_df["user_id"].isin(train_users_small)]

In [13]:
# Bring the order-level columns (order_dow, order_hour_of_day, ...) onto every
# sampled prior order line.  Guarded so that re-running the cell does not merge
# twice, which would produce suffixed duplicates and break the
# prior_df_small.order_dow access used by the feature cells.
if "order_dow" not in prior_df_small.columns:
    prior_df_small = prior_df_small.merge(orders_df_small, on=["order_id", "user_id"])

In [352]:
# Display the column labels currently present on prior_df.
prior_df.columns

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered', 'user_id'], dtype='object')

In [116]:
# Turn day-of-week and hour-of-day into angles so they can be treated as
# cyclic quantities.
# NOTE(review): the hour-of-day angle is divided by order_dow.max() + 1 (the
# day-of-week period) rather than by an hour-of-day period, so distinct hours
# can map to the same angle.  This looks like a copy-paste slip.  The full-data
# cell and both last-order cells use the same divisor, so change them all
# together (and retrain) or not at all.
dow_period = prior_df_small["order_dow"].max() + 1
prior_df_small["order_dow_angle"] = prior_df_small["order_dow"] / dow_period * 2 * np.pi
prior_df_small["order_hod_angle"] = prior_df_small["order_hour_of_day"] / dow_period * 2 * np.pi

In [118]:
# Sine/cosine components of each angle, plus a constant 1 per order line so
# that summing num_products counts lines.
for angle_col, sin_col, cos_col in (
    ("order_dow_angle", "order_dow_sin", "order_dow_cos"),
    ("order_hod_angle", "order_hod_sin", "order_hod_cos"),
):
    prior_df_small[sin_col] = np.sin(prior_df_small[angle_col])
    prior_df_small[cos_col] = np.cos(prior_df_small[angle_col])
prior_df_small["num_products"] = 1


In [119]:
# Per-product sums of the sin/cos components and of the line counter.
product_sum_cols = ["order_dow_sin", "order_dow_cos",
                    "order_hod_sin", "order_hod_cos",
                    "num_products"]
prior_product_stats = prior_df_small.groupby("product_id")[product_sum_cols].sum()

# Direction of each summed (cos, sin) vector.
prior_product_stats["order_dow_angle"] = np.arctan2(
    prior_product_stats["order_dow_sin"], prior_product_stats["order_dow_cos"]
)
prior_product_stats["order_hod_angle"] = np.arctan2(
    prior_product_stats["order_hod_sin"], prior_product_stats["order_hod_cos"]
)

In [120]:
# Replace the summed components by the sin/cos of the resulting angle, then
# drop the helper angle columns and expose product_id as a regular column.
for angle_col, sin_col, cos_col in (
    ("order_dow_angle", "order_dow_sin", "order_dow_cos"),
    ("order_hod_angle", "order_hod_sin", "order_hod_cos"),
):
    prior_product_stats[sin_col] = np.sin(prior_product_stats[angle_col])
    prior_product_stats[cos_col] = np.cos(prior_product_stats[angle_col])
prior_product_stats = (
    prior_product_stats
    .drop(['order_dow_angle', 'order_hod_angle'], axis=1)
    .reset_index()
)
# Positional rename: relies on the column order produced by the aggregation.
product_feature_names = ['product_id', 'product_dow_sin', 'product_dow_cos',
                         'product_hod_sin', 'product_hod_cos',
                         'product_num_purchases']
prior_product_stats.columns = product_feature_names

Unnamed: 0,product_id,product_dow_sin,product_dow_cos,product_hod_sin,product_hod_cos,product_num_purchases
0,1,0.883183,0.469029,-0.968168,0.250301,66
1,2,-0.813695,-0.581291,-0.974928,-0.222521,3
2,3,-0.221393,0.975185,0.794824,0.606841,27
3,4,-0.304167,0.952619,0.686125,0.727484,20
4,7,0.433884,0.900969,0.433884,0.900969,2


In [122]:
# Per (user, order) sums of the sin/cos components and of the line counter.
order_sum_cols = ["order_dow_sin", "order_dow_cos",
                  "order_hod_sin", "order_hod_cos",
                  "num_products"]
prior_individual_stats = prior_df_small.groupby(["user_id", "order_id"])[order_sum_cols].sum()

In [123]:
# Collapse the per-order rows to one row per user.  Each order contributes a
# 1 to num_orders; num_products is both summed and averaged over orders.
per_order_stats = prior_individual_stats.reset_index()
per_order_stats["num_orders"] = 1
user_agg_spec = {
    "order_dow_sin": "sum",
    "order_dow_cos": "sum",
    "order_hod_sin": "sum",
    "order_hod_cos": "sum",
    "num_products": ["sum", "mean"],
    "num_orders": "sum",
}
prior_individual_stats = per_order_stats.groupby("user_id").agg(user_agg_spec)

In [124]:
# Flatten the aggregated column labels positionally (num_products yields two
# columns: its sum and its mean).
flat_user_columns = ['order_dow_sin', 'order_dow_cos',
                     'order_hod_sin', 'order_hod_cos',
                     'num_products', 'mean_products', "num_orders"]
prior_individual_stats.columns = flat_user_columns

In [125]:
# Replace each summed (sin, cos) pair by the sin/cos of the angle of the
# summed vector.  The angle is kept in a temporary instead of a helper column.
for sin_col, cos_col in (
    ("order_dow_sin", "order_dow_cos"),
    ("order_hod_sin", "order_hod_cos"),
):
    summed_angle = np.arctan2(prior_individual_stats[sin_col], prior_individual_stats[cos_col])
    prior_individual_stats[sin_col] = np.sin(summed_angle)
    prior_individual_stats[cos_col] = np.cos(summed_angle)

In [126]:
# Move user_id out of the index into a regular first column.
prior_individual_stats = prior_individual_stats.reset_index(drop=False)

In [127]:
# Positional rename to the user_-prefixed feature names.
user_feature_names = ['user_id', 'user_dow_sin', 'user_dow_cos',
                      'user_hod_sin', 'user_hod_cos', 'user_num_products',
                      'user_mean_products', 'user_num_orders']
prior_individual_stats.columns = user_feature_names

Unnamed: 0,user_id,user_dow_sin,user_dow_cos,user_hod_sin,user_hod_cos,user_num_products,user_mean_products,user_num_orders
0,48,-0.988459,0.151491,0.013915,-0.999903,113,11.3,10
1,79,0.161085,0.986941,0.978891,-0.204385,93,13.285714,7
2,99,0.678658,-0.734454,-0.82262,-0.568592,211,11.722222,18
3,102,-0.954726,0.297487,0.593517,-0.804821,59,9.833333,6
4,103,0.674559,-0.738221,0.846743,-0.532002,77,4.529412,17


In [128]:
# Per (user, product) sums of the sin/cos components and of the line counter.
indprod_sum_cols = ["order_dow_sin", "order_dow_cos",
                    "order_hod_sin", "order_hod_cos",
                    "num_products"]
prior_indprod_stats = prior_df_small.groupby(["user_id", "product_id"])[indprod_sum_cols].sum()

In [129]:
# Add each user's order count, the angle of every summed (sin, cos) pair, and
# the user-product line count divided by that user's number of orders.
user_order_counts = prior_individual_stats[['user_id', 'user_num_orders']]
prior_indprod_stats = prior_indprod_stats.reset_index().merge(
    user_order_counts, on='user_id', how='left'
)
prior_indprod_stats['order_dow_angle'] = np.arctan2(
    prior_indprod_stats['order_dow_sin'], prior_indprod_stats['order_dow_cos']
)
prior_indprod_stats['order_hod_angle'] = np.arctan2(
    prior_indprod_stats['order_hod_sin'], prior_indprod_stats['order_hod_cos']
)
prior_indprod_stats['proportion_orders'] = (
    prior_indprod_stats['num_products'] / prior_indprod_stats['user_num_orders']
)

In [130]:
# Overwrite the summed components with the sin/cos of their angle, then drop
# the helper columns that are not features.
for angle_col, sin_col, cos_col in (
    ("order_dow_angle", "order_dow_sin", "order_dow_cos"),
    ("order_hod_angle", "order_hod_sin", "order_hod_cos"),
):
    prior_indprod_stats[sin_col] = np.sin(prior_indprod_stats[angle_col])
    prior_indprod_stats[cos_col] = np.cos(prior_indprod_stats[angle_col])
prior_indprod_stats = prior_indprod_stats.drop(
    ['order_dow_angle', 'order_hod_angle', 'user_num_orders'], axis=1
)

In [131]:
# Positional rename to the indprod_-prefixed feature names.
indprod_feature_names = [
    'user_id', 'product_id',
    'indprod_dow_sin', 'indprod_dow_cos',
    'indprod_hod_sin', 'indprod_hod_cos',
    'indprod_num_products', 'indprod_proportion_orders',
]
prior_indprod_stats.columns = indprod_feature_names

In [133]:
# One row per (user, product): user-product features, then the user's
# features, then the product's features (both left joins).
prior_all_stats = pd.merge(prior_indprod_stats, prior_individual_stats, on="user_id", how="left")
prior_all_stats = pd.merge(prior_all_stats, prior_product_stats, on='product_id', how="left")

In [136]:
# Cyclic timing features of each sampled user's "train" order.
# NOTE(review): the hour-of-day angle is divided by order_dow.max() + 1 (the
# day-of-week period), not by an hour-of-day period; this looks like a
# copy-paste slip.  These angles are also shifted by -pi, unlike the angles
# built from the prior orders.  The test-set cell uses the same formulas, so
# change both together (and retrain) or not at all.
orders_df_last = orders_df_small.loc[orders_df_small["eval_set"] == "train"].copy()
dow_period = orders_df_last["order_dow"].max() + 1
orders_df_last["order_dow_angle"] = orders_df_last["order_dow"] / dow_period * 2 * np.pi - np.pi
orders_df_last["order_hod_angle"] = orders_df_last["order_hour_of_day"] / dow_period * 2 * np.pi - np.pi
for angle_col, sin_col, cos_col in (
    ("order_dow_angle", "order_dow_sin", "order_dow_cos"),
    ("order_hod_angle", "order_hod_sin", "order_hod_cos"),
):
    orders_df_last[sin_col] = np.sin(orders_df_last[angle_col])
    orders_df_last[cos_col] = np.cos(orders_df_last[angle_col])

In [137]:
# Drop the identifier, raw timing and helper angle columns; what is left is
# merged onto the feature table by user_id.
orders_df_last = orders_df_last.drop(
    ["order_id", "order_number", "eval_set", "order_dow",
     "order_hour_of_day", "order_dow_angle", "order_hod_angle"],
    axis=1,
)

In [139]:
# Attach the train-order features to every (user, product) row of that user.
prior_all_stats = pd.merge(prior_all_stats, orders_df_last, on="user_id", how="inner")

In [143]:
# Label each (user, product) row with the reordered flag of the matching train
# order line; rows with no matching line get 0.
train_labels = train_df_small[['user_id', 'product_id', 'reordered']]
prior_all_stats = pd.merge(prior_all_stats, train_labels, how="left", on=["user_id", "product_id"])
prior_all_stats["reordered"] = prior_all_stats["reordered"].fillna(0)


In [145]:
import xgboost as xgb

In [281]:
# Hold out a random fifth of the (user, product) rows for validation.
# A seeded generator makes the split reproducible across kernel restarts.
# NOTE(review): rows are drawn independently of user_id, so a user's rows can
# land in both sets; consider splitting by user if per-user metrics matter.
split_rng = np.random.RandomState(42)
n_rows = prior_all_stats.shape[0]
valid_idx = split_rng.choice(n_rows, size=n_rows // 5, replace=False)

# valid_idx holds row positions, so build the flag positionally instead of
# assuming the index labels are exactly 0..n_rows-1.
validation_flag = np.zeros(n_rows, dtype=int)
validation_flag[valid_idx] = 1
prior_all_stats['validation_set'] = validation_flag
prior_train = prior_all_stats[prior_all_stats.validation_set == 0].copy()
prior_valid = prior_all_stats[prior_all_stats.validation_set == 1].copy()

# Split flag, label and identifier columns are not model features.
non_feature_cols = ["validation_set", "reordered", "user_id", "product_id"]
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
X_train = prior_train.drop(non_feature_cols, axis=1).values
y_train = prior_train["reordered"].values
X_valid = prior_valid.drop(non_feature_cols, axis=1).values
y_valid = prior_valid["reordered"].values


In [284]:
%%time
# Booster configuration: binary logistic objective evaluated with log loss.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.3,
    'max_depth': 6,
}

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

# The last watchlist entry (valid) is the one used for early stopping.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Up to 400 rounds, stopping once valid-logloss has not improved for 50
# rounds; progress is printed every 10 rounds.
bst = xgb.train(
    params,
    d_train,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)

[0]	train-logloss:0.513069	valid-logloss:0.51295
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.260582	valid-logloss:0.260595
[20]	train-logloss:0.255849	valid-logloss:0.257595
[30]	train-logloss:0.253307	valid-logloss:0.25676
[40]	train-logloss:0.251246	valid-logloss:0.256295
[50]	train-logloss:0.249471	valid-logloss:0.255996
[60]	train-logloss:0.2478	valid-logloss:0.255801
[70]	train-logloss:0.245737	valid-logloss:0.255317
[80]	train-logloss:0.244181	valid-logloss:0.255203
[90]	train-logloss:0.242611	valid-logloss:0.255192
[100]	train-logloss:0.241278	valid-logloss:0.255113
[110]	train-logloss:0.239591	valid-logloss:0.254933
[120]	train-logloss:0.238307	valid-logloss:0.254982
[130]	train-logloss:0.237076	valid-logloss:0.254961
[140]	train-logloss:0.235825	valid-logloss:0.255109
[150]	train-logloss:0.234311	valid-logloss:0.255054
[160]	train-logloss:0.232951	val

In [286]:
# Predicted reorder probability for every validation row.
# NOTE(review): the booster was trained with early stopping; depending on the
# xgboost version, predict may use all trained rounds rather than the best
# round - confirm which applies here.
y_predicted = bst.predict(data=d_valid)

In [294]:
# Binary prediction: 1 where the predicted probability exceeds 0.25, else 0.
prior_valid['prediction'] = (y_predicted > .25).astype(int)

In [295]:
# A hit is a row that is both truly reordered and predicted as reordered.
prior_valid['hit'] = prior_valid['reordered'].mul(prior_valid['prediction'])

In [297]:
# Per-user mean of the label, the prediction and the hit indicator.
prior_valid_agg = prior_valid.groupby("user_id")[['reordered', 'prediction', 'hit']].mean()

In [347]:
# Per-user precision and recall.  The small offset keeps the ratios defined
# when a denominator is zero; when the numerator and denominator means are
# both zero the ratio evaluates to 1.
offset = 0.000001
smoothed_hits = prior_valid_agg['hit'] + offset
prior_valid_agg['precision'] = smoothed_hits / (prior_valid_agg['prediction'] + offset)
prior_valid_agg['recall'] = smoothed_hits / (prior_valid_agg['reordered'] + offset)

In [348]:
# Per-user F1: harmonic mean of precision and recall.
user_precision = prior_valid_agg['precision']
user_recall = prior_valid_agg['recall']
prior_valid_agg['f1'] = 2 * user_precision * user_recall / (user_precision + user_recall)

In [349]:
# Display the mean of the per-user F1 values over all users in the validation rows.
prior_valid_agg['f1'].mean()

0.46126558686784841

In [353]:
%%time
# Product-level features recomputed on the full prior data (all users).

# Bring the order-level columns onto every prior order line.  Guarded so that
# re-running the cell does not merge twice, which would leave suffixed
# order_dow_x / order_dow_y columns and break the column access below.
if "order_dow" not in prior_df.columns:
    prior_df = prior_df.merge(orders_df, on=["order_id", "user_id"])

# Day-of-week and hour-of-day as angles, then their sin/cos components.
# NOTE(review): the hour-of-day angle is divided by order_dow.max() + 1 (the
# day-of-week period), not by an hour-of-day period; this looks like a
# copy-paste slip.  The sampled-data cell that feeds model training uses the
# same divisor, so change both together (and retrain) or not at all.
dow_period = prior_df["order_dow"].max() + 1
prior_df["order_dow_angle"] = prior_df["order_dow"] / dow_period * 2 * np.pi
prior_df["order_hod_angle"] = prior_df["order_hour_of_day"] / dow_period * 2 * np.pi
for angle_col, sin_col, cos_col in (
    ("order_dow_angle", "order_dow_sin", "order_dow_cos"),
    ("order_hod_angle", "order_hod_sin", "order_hod_cos"),
):
    prior_df[sin_col] = np.sin(prior_df[angle_col])
    prior_df[cos_col] = np.cos(prior_df[angle_col])
# Constant 1 per order line so that summing it counts lines.
prior_df["num_products"] = 1

# Per-product sums, then each summed (sin, cos) pair is replaced by the
# sin/cos of the angle of the summed vector.
product_sum_cols = ["order_dow_sin", "order_dow_cos",
                    "order_hod_sin", "order_hod_cos",
                    "num_products"]
prior_product_stats = prior_df.groupby("product_id")[product_sum_cols].sum()
for sin_col, cos_col in (
    ("order_dow_sin", "order_dow_cos"),
    ("order_hod_sin", "order_hod_cos"),
):
    summed_angle = np.arctan2(prior_product_stats[sin_col], prior_product_stats[cos_col])
    prior_product_stats[sin_col] = np.sin(summed_angle)
    prior_product_stats[cos_col] = np.cos(summed_angle)

# product_id becomes a regular column; the rename is positional.
prior_product_stats = prior_product_stats.reset_index()
prior_product_stats.columns = ['product_id', 'product_dow_sin', 'product_dow_cos',
                               'product_hod_sin', 'product_hod_cos',
                               'product_num_purchases']

In [354]:
%%time
# User-level features recomputed on the full prior data.

# Step 1: per (user, order) sums of the sin/cos components and line counter.
order_sum_cols = ["order_dow_sin", "order_dow_cos",
                  "order_hod_sin", "order_hod_cos",
                  "num_products"]
per_order_stats = prior_df.groupby(["user_id", "order_id"])[order_sum_cols].sum().reset_index()
per_order_stats["num_orders"] = 1

# Step 2: collapse to one row per user; num_products is summed and averaged.
user_agg_spec = {
    "order_dow_sin": "sum",
    "order_dow_cos": "sum",
    "order_hod_sin": "sum",
    "order_hod_cos": "sum",
    "num_products": ["sum", "mean"],
    "num_orders": "sum",
}
prior_individual_stats = per_order_stats.groupby("user_id").agg(user_agg_spec)
prior_individual_stats.columns = ['order_dow_sin', 'order_dow_cos',
                                  'order_hod_sin', 'order_hod_cos',
                                  'num_products', 'mean_products', "num_orders"]

# Step 3: replace each summed (sin, cos) pair by the sin/cos of its angle.
for sin_col, cos_col in (
    ("order_dow_sin", "order_dow_cos"),
    ("order_hod_sin", "order_hod_cos"),
):
    summed_angle = np.arctan2(prior_individual_stats[sin_col], prior_individual_stats[cos_col])
    prior_individual_stats[sin_col] = np.sin(summed_angle)
    prior_individual_stats[cos_col] = np.cos(summed_angle)

# Step 4: user_id becomes a regular column; the rename is positional.
prior_individual_stats = prior_individual_stats.reset_index()
prior_individual_stats.columns = ['user_id', 'user_dow_sin', 'user_dow_cos',
                                  'user_hod_sin', 'user_hod_cos',
                                  'user_num_products', 'user_mean_products',
                                  'user_num_orders']


CPU times: user 8.96 s, sys: 2.76 s, total: 11.7 s
Wall time: 11.9 s


In [355]:
%%time
# User-product features recomputed on the full prior data.
indprod_sum_cols = ["order_dow_sin", "order_dow_cos",
                    "order_hod_sin", "order_hod_cos",
                    "num_products"]
prior_indprod_stats = prior_df.groupby(["user_id", "product_id"])[indprod_sum_cols].sum()

# Add each user's order count and the user-product line count divided by it.
user_order_counts = prior_individual_stats[['user_id', 'user_num_orders']]
prior_indprod_stats = prior_indprod_stats.reset_index().merge(
    user_order_counts, on='user_id', how='left'
)
prior_indprod_stats['proportion_orders'] = (
    prior_indprod_stats['num_products'] / prior_indprod_stats['user_num_orders']
)

# Replace each summed (sin, cos) pair by the sin/cos of its angle.
for sin_col, cos_col in (
    ("order_dow_sin", "order_dow_cos"),
    ("order_hod_sin", "order_hod_cos"),
):
    summed_angle = np.arctan2(prior_indprod_stats[sin_col], prior_indprod_stats[cos_col])
    prior_indprod_stats[sin_col] = np.sin(summed_angle)
    prior_indprod_stats[cos_col] = np.cos(summed_angle)

# The order count was only needed for the ratio; the rename is positional.
prior_indprod_stats = prior_indprod_stats.drop(['user_num_orders'], axis=1)
prior_indprod_stats.columns = ['user_id', 'product_id',
                               'indprod_dow_sin', 'indprod_dow_cos',
                               'indprod_hod_sin', 'indprod_hod_cos',
                               'indprod_num_products',
                               'indprod_proportion_orders']


CPU times: user 27.6 s, sys: 6.71 s, total: 34.3 s
Wall time: 35.3 s


In [356]:
%%time
# One row per (user, product): user-product features, then the user's
# features, then the product's features (both left joins).
prior_all_stats = pd.merge(prior_indprod_stats, prior_individual_stats, on="user_id", how="left")
prior_all_stats = pd.merge(prior_all_stats, prior_product_stats, on='product_id', how="left")
 

CPU times: user 7.95 s, sys: 5.42 s, total: 13.4 s
Wall time: 15 s


In [357]:
%%time
# Cyclic timing features of each user's "test" order.
# NOTE(review): the hour-of-day angle is divided by order_dow.max() + 1 (the
# day-of-week period), not by an hour-of-day period; this looks like a
# copy-paste slip.  The cell that builds the same features for the "train"
# orders uses the same formulas, so change both together (and retrain) or not
# at all.
orders_df_last = orders_df.loc[orders_df["eval_set"] == "test"].copy()
dow_period = orders_df_last["order_dow"].max() + 1
orders_df_last["order_dow_angle"] = orders_df_last["order_dow"] / dow_period * 2 * np.pi - np.pi
orders_df_last["order_hod_angle"] = orders_df_last["order_hour_of_day"] / dow_period * 2 * np.pi - np.pi
for angle_col, sin_col, cos_col in (
    ("order_dow_angle", "order_dow_sin", "order_dow_cos"),
    ("order_hod_angle", "order_hod_sin", "order_hod_cos"),
):
    orders_df_last[sin_col] = np.sin(orders_df_last[angle_col])
    orders_df_last[cos_col] = np.cos(orders_df_last[angle_col])

# Drop the identifier, raw timing and helper angle columns.
orders_df_last = orders_df_last.drop(
    ["order_id", "order_number", "eval_set", "order_dow",
     "order_hour_of_day", "order_dow_angle", "order_hod_angle"],
    axis=1,
)

CPU times: user 412 ms, sys: 458 ms, total: 871 ms
Wall time: 1.14 s


In [358]:
%%time
# Attach the test-order features; the inner join keeps only users that have a
# row in orders_df_last.
prior_all_stats = pd.merge(prior_all_stats, orders_df_last, on="user_id", how="inner")

CPU times: user 1.35 s, sys: 1.84 s, total: 3.19 s
Wall time: 3.27 s


In [360]:
# Preview the first rows of the assembled test feature table.
prior_all_stats.head()

Unnamed: 0,user_id,product_id,indprod_dow_sin,indprod_dow_cos,indprod_hod_sin,indprod_hod_cos,indprod_num_products,indprod_proportion_orders,user_dow_sin,user_dow_cos,...,product_dow_sin,product_dow_cos,product_hod_sin,product_hod_cos,product_num_purchases,days_since_prior_order,order_dow_sin,order_dow_cos,order_hod_sin,order_hod_cos
0,3,248,0.433884,-0.900969,-0.974928,-0.222521,1,0.083333,0.593363,0.804935,...,0.16253,0.986704,0.96723,0.253903,6371,11.0,0.974928,0.222521,-0.781831,-0.62349
1,3,1005,0.433884,-0.900969,0.974928,-0.222521,1,0.083333,0.593363,0.804935,...,0.830565,-0.556922,0.705568,-0.708643,463,11.0,0.974928,0.222521,-0.781831,-0.62349
2,3,1819,0.480901,0.876775,0.935414,-0.353553,3,0.25,0.593363,0.804935,...,0.424031,0.905648,0.981058,-0.193716,2424,11.0,0.974928,0.222521,-0.781831,-0.62349
3,3,7503,0.433884,-0.900969,0.974928,-0.222521,1,0.083333,0.593363,0.804935,...,0.407654,0.913137,0.925818,-0.37797,12474,11.0,0.974928,0.222521,-0.781831,-0.62349
4,3,8021,0.433884,-0.900969,-0.974928,-0.222521,1,0.083333,0.593363,0.804935,...,0.119705,0.992809,0.984529,-0.175222,27864,11.0,0.974928,0.222521,-0.781831,-0.62349


In [361]:
# Feature matrix for the test rows.  Identifier columns are not features.
# "prediction" is also dropped (ignored when absent) so that re-running this
# cell after the prediction column has been added does not feed it to the
# model as an extra feature.
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
X_test = prior_all_stats.drop(
    ["user_id", "product_id", "prediction"], axis=1, errors="ignore"
).values


In [362]:
# Score every test (user, product) row with the trained booster.
d_test = xgb.DMatrix(data=X_test)
y_test = bst.predict(data=d_test)

In [423]:
# Binary prediction: 1 where the predicted probability exceeds 0.3, else 0.
# NOTE(review): the validation cell thresholds at .25 while this one uses .3,
# so the validation F1 was not measured at the submitted threshold - confirm
# this is intended.
prior_all_stats['prediction'] = (y_test > .3).astype(int)

In [425]:
# Keep only the rows predicted as reordered (independent copy).
prediction_df = prior_all_stats.loc[prior_all_stats['prediction'] == 1].copy()

In [426]:
# Only the user and product identifiers are needed from here on.
prediction_df = prediction_df.loc[:, ['user_id', 'product_id']]

In [427]:
# Preview the first predicted (user_id, product_id) pairs.
prediction_df.head()

Unnamed: 0,user_id,product_id
5,3,9387
11,3,17668
15,3,21903
23,3,39190
30,3,47766


In [430]:
def _join_product_ids(ids):
    """Return the values of ``ids`` as strings joined by single spaces."""
    return " ".join(ids.astype(str))


# One row per user holding that user's predicted product ids as one string.
prediction_lists = prediction_df.groupby('user_id').agg(_join_product_ids).reset_index()

In [432]:
# Display (rows, columns) of prediction_lists: one row per user with at least one predicted product.
prediction_lists.shape

(68677, 2)

In [394]:
#prediction_lists.columns = ['order_id', 'products']

In [433]:
# Right join onto the test orders so every test order gets a row, including
# users for whom no product was predicted (their product_id is then missing).
test_orders = orders_df.loc[orders_df["eval_set"] == "test", ['order_id', 'user_id']]
prediction_lists = pd.merge(prediction_lists, test_orders, on='user_id', how="right")

In [434]:
# Orders with no predicted product get the literal string "None".
prediction_lists['products'] = prediction_lists['product_id'].fillna("None")

In [435]:
# Reduce to the two columns that are written to the submission file.
prediction_lists = prediction_lists.loc[:, ['order_id', 'products']]

In [436]:
# Write the submission file without the index column.  The output directory is
# created first so the write does not fail on a fresh checkout.
submission_path = "submissions/first_xgb.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
prediction_lists.to_csv(submission_path, index=False)

In [437]:
# Display (rows, columns) of the submission table.
prediction_lists.shape

(75000, 2)

In [438]:
# Preview the first rows of the submission table.
prediction_lists.head()

Unnamed: 0,order_id,products
0,2774568,9387 17668 21903 39190 47766
1,1528013,21903 38293
2,1376945,8309 8670 14947 27959 28465 33731 34658 35640 ...
3,1356845,248 5746 5876 7076 7120 8239 10863 11520 12206...
4,2161313,196 11266 12427 14715 37710
