In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

% matplotlib inline

In [2]:
aisles_df = pd.read_csv("data/aisles.csv")
departments_df = pd.read_csv("data/departments.csv")
products_df = pd.read_csv("data/products.csv")
orders_df = pd.read_csv("data/orders.csv")
prior_df = pd.read_csv("data/order_products__prior.csv")
train_df = pd.read_csv("data/order_products__train.csv")
products_df_merged = (products_df
                      .merge(departments_df, on="department_id")
                      .merge(aisles_df, on="aisle_id"))

In [3]:
orders_df['absolute_date'] = orders_df.groupby("user_id").days_since_prior_order.cumsum().fillna(0)

In [4]:
orders_df['max_order_number'] = orders_df.groupby("user_id").order_number.transform(max)
orders_df['max_absolute_date'] = orders_df.groupby("user_id").absolute_date.transform(max)

In [5]:
orders_df['reverse_date'] = orders_df.max_absolute_date - orders_df.absolute_date
orders_df['reverse_order_number'] = orders_df.max_order_number - orders_df.order_number

In [6]:
train_df = train_df.merge(orders_df[["order_id", "user_id"]], on="order_id")

In [7]:
prior_df = prior_df.merge(orders_df, on="order_id")

In [8]:
prior_df['order_dow_angle'] = (prior_df.order_dow / 
                                     (prior_df.order_dow.max() + 1) * 2 * np.pi)
prior_df['order_hod_angle'] = (prior_df.order_hour_of_day / 
                                             (prior_df.order_dow.max() + 1) * 2 * np.pi)

In [9]:
prior_df['order_dow_sin'] = np.sin(prior_df.order_dow_angle)
prior_df['order_dow_cos'] = np.cos(prior_df.order_dow_angle)
prior_df['order_hod_sin'] = np.sin(prior_df.order_hod_angle)
prior_df['order_hod_cos'] = np.cos(prior_df.order_hod_angle)
prior_df['num_products'] = 1
prior_df['num_products_dw_12'] = np.exp(-np.log(2)/12 * prior_df.reverse_date)
prior_df['num_products_dw_36'] = np.exp(-np.log(2)/36 * prior_df.reverse_date)
prior_df['num_products_dw_108'] = np.exp(-np.log(2)/108 * prior_df.reverse_date)
prior_df['num_products_ow_3'] = np.exp(-np.log(2)/3 * prior_df.reverse_order_number)
prior_df['num_products_ow_10'] = np.exp(-np.log(2)/10 * prior_df.reverse_order_number)
prior_df['num_products_ow_30'] = np.exp(-np.log(2)/30 * prior_df.reverse_order_number)

In [10]:
prior_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,...,order_dow_cos,order_hod_sin,order_hod_cos,num_products,num_products_dw_12,num_products_dw_36,num_products_dw_108,num_products_ow_3,num_products_ow_10,num_products_ow_30
0,2,33120,1,1,202279,prior,3,5,9,8.0,...,-0.222521,0.974928,-0.222521,1,0.000145,0.052556,0.374577,0.25,0.659754,0.870551
1,2,28985,2,1,202279,prior,3,5,9,8.0,...,-0.222521,0.974928,-0.222521,1,0.000145,0.052556,0.374577,0.25,0.659754,0.870551
2,2,9327,3,0,202279,prior,3,5,9,8.0,...,-0.222521,0.974928,-0.222521,1,0.000145,0.052556,0.374577,0.25,0.659754,0.870551
3,2,45918,4,1,202279,prior,3,5,9,8.0,...,-0.222521,0.974928,-0.222521,1,0.000145,0.052556,0.374577,0.25,0.659754,0.870551
4,2,30035,5,0,202279,prior,3,5,9,8.0,...,-0.222521,0.974928,-0.222521,1,0.000145,0.052556,0.374577,0.25,0.659754,0.870551


In [11]:
prior_product_stats = prior_df.groupby("product_id").agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': np.sum})
prior_product_stats['order_dow_angle'] = np.arctan2(prior_product_stats.order_dow_sin, prior_product_stats.order_dow_cos)
prior_product_stats['order_hod_angle'] = np.arctan2(prior_product_stats.order_hod_sin, prior_product_stats.order_hod_cos)

In [12]:
prior_product_stats.order_dow_sin = np.sin(prior_product_stats.order_dow_angle)
prior_product_stats.order_dow_cos = np.cos(prior_product_stats.order_dow_angle)
prior_product_stats.order_hod_sin = np.sin(prior_product_stats.order_hod_angle)
prior_product_stats.order_hod_cos = np.cos(prior_product_stats.order_hod_angle)
prior_product_stats.drop(['order_dow_angle', 'order_hod_angle'], axis=1, inplace=True)
prior_product_stats.reset_index(inplace=True)
prior_product_stats.columns = ['product_id', 'product_dow_sin', 'product_dow_cos', 'product_hod_sin', 'product_hod_cos', 'product_num_purchases']

In [13]:
prior_df.columns

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered', 'user_id',
       'eval_set', 'order_number', 'order_dow', 'order_hour_of_day',
       'days_since_prior_order', 'absolute_date', 'max_order_number',
       'max_absolute_date', 'reverse_date', 'reverse_order_number',
       'order_dow_angle', 'order_hod_angle', 'order_dow_sin', 'order_dow_cos',
       'order_hod_sin', 'order_hod_cos', 'num_products', 'num_products_dw_12',
       'num_products_dw_36', 'num_products_dw_108', 'num_products_ow_3',
       'num_products_ow_10', 'num_products_ow_30'],
      dtype='object')

In [14]:
prior_indorder_stats = prior_df.groupby(["user_id", "order_id"]).agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': np.sum,
                                                                      'absolute_date': np.max,
                                                                     'order_number': np.max}).reset_index()

In [15]:
prior_individual_stats = prior_indorder_stats.groupby("user_id").agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': [np.sum, np.mean],
                                                                'absolute_date': np.max,      
                                                                'order_number': np.max})

In [18]:
prior_individual_stats.head()

Unnamed: 0_level_0,order_dow_sin,order_dow_cos,order_hod_sin,order_hod_cos,num_products,mean_products,max_absolute_date,max_order_number
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,17.292765,-21.135334,23.04986,31.156441,59,5.9,176.0,10
2,137.684051,-15.027917,67.978276,-97.021756,195,13.928571,198.0,14
3,26.495502,35.942886,31.231272,-14.341462,88,7.333333,133.0,12
4,-11.906919,-6.727365,-1.039639,2.076064,18,3.6,55.0,5
5,16.324118,2.066376,-9.325879,-19.766889,37,9.25,40.0,4


In [17]:
prior_individual_stats.columns = ['order_dow_sin', 'order_dow_cos',
                                  'order_hod_sin', 'order_hod_cos', 
                                   'num_products', 'mean_products', 'max_absolute_date', "max_order_number"]

In [19]:
prior_individual_stats['order_dow_angle'] = np.arctan2(prior_individual_stats.order_dow_sin, prior_individual_stats.order_dow_cos)
prior_individual_stats['order_hod_angle'] = np.arctan2(prior_individual_stats.order_hod_sin, prior_individual_stats.order_hod_cos)
prior_individual_stats.order_dow_sin = np.sin(prior_individual_stats.order_dow_angle)
prior_individual_stats.order_dow_cos = np.cos(prior_individual_stats.order_dow_angle)
prior_individual_stats.order_hod_sin = np.sin(prior_individual_stats.order_hod_angle)
prior_individual_stats.order_hod_cos = np.cos(prior_individual_stats.order_hod_angle)
prior_individual_stats.drop(['order_dow_angle', 'order_hod_angle'], axis=1, inplace=True)

In [20]:
prior_individual_stats = prior_individual_stats.reset_index()

In [21]:
prior_individual_stats.columns

Index(['user_id', 'order_dow_sin', 'order_dow_cos', 'order_hod_sin',
       'order_hod_cos', 'num_products', 'mean_products', 'max_absolute_date',
       'max_order_number'],
      dtype='object')

In [22]:
prior_individual_stats.columns = ['user_id', 'user_dow_sin',
                                  'user_dow_cos', 'user_hod_sin',
                                  'user_hod_cos', 'user_num_products',
                                  'user_mean_products', 'user_num_days',
                                 'user_num_orders']
prior_individual_stats['user_days_per_order'] = prior_individual_stats.user_num_days / prior_individual_stats.user_num_orders

In [23]:
prior_indorder_stats = prior_indorder_stats[['order_id', 'num_products']]
prior_indorder_stats.columns = ['order_id', 'num_products_in_order']

In [24]:
prior_indprod_stats = (prior_df.merge(prior_indorder_stats[['order_id', 'num_products_in_order']], on='order_id')
                       .merge(prior_individual_stats[['user_id', 'user_num_orders', 'user_num_days']], on='user_id', how='left'))
prior_indprod_stats['add_to_cart_proportion'] = prior_indprod_stats['add_to_cart_order'] / prior_indprod_stats['num_products_in_order']
prior_indprod_stats['indprod_inorder_1'] = 1 * (prior_indprod_stats.order_number == prior_indprod_stats.user_num_orders)
prior_indprod_stats['indprod_inorder_2'] = 1 * (prior_indprod_stats.order_number == prior_indprod_stats.user_num_orders - 1)
prior_indprod_stats['indprod_inorder_3'] = 1 * (prior_indprod_stats.order_number == prior_indprod_stats.user_num_orders - 2)
prior_indprod_stats = prior_indprod_stats.groupby(["user_id", "product_id"]).agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': np.sum,
                                                               'num_products_dw_12': np.sum,
                                                                'num_products_dw_36': np.sum,
                                                                'num_products_dw_108': np.sum,
                                                                'num_products_ow_3': np.sum,           
                                                                'num_products_ow_10': np.sum,
                                                                'num_products_ow_30': np.sum,                  
                                                           'add_to_cart_order': np.mean,
                                                           'add_to_cart_proportion': np.mean,
                                                           'indprod_inorder_1': np.sum,
                                                           'indprod_inorder_2': np.sum,
                                                           'indprod_inorder_3': np.sum,
                                                            'user_num_orders': np.mean,
                                                            'user_num_days': np.mean,
                                                            'reverse_date': np.min,
                                                            'reverse_order_number': np.min}).reset_index()

In [25]:
prior_indprod_stats['order_dow_angle'] = np.arctan2(prior_indprod_stats.order_dow_sin, prior_indprod_stats.order_dow_cos)
prior_indprod_stats['order_hod_angle'] = np.arctan2(prior_indprod_stats.order_hod_sin, prior_indprod_stats.order_hod_cos)
prior_indprod_stats['proportion_orders'] = prior_indprod_stats.num_products / (prior_indprod_stats.user_num_orders)
prior_indprod_stats['days_per_order'] = prior_indprod_stats.user_num_days / (prior_indprod_stats.num_products)

In [26]:
prior_indprod_stats.order_dow_sin = np.sin(prior_indprod_stats.order_dow_angle)
prior_indprod_stats.order_dow_cos = np.cos(prior_indprod_stats.order_dow_angle)
prior_indprod_stats.order_hod_sin = np.sin(prior_indprod_stats.order_hod_angle)
prior_indprod_stats.order_hod_cos = np.cos(prior_indprod_stats.order_hod_angle)
prior_indprod_stats.drop(['order_dow_angle', 'order_hod_angle', 'user_num_orders', 'user_num_days'], axis=1, inplace=True)

In [31]:
prior_indprod_stats.head()

Unnamed: 0,user_id,product_id,indprod_dow_sin,indprod_dow_cos,indprod_hod_sin,indprod_hod_cos,indprod_num_products,indprod_num_products_dw_12,indprod_num_products_dw_36,indprod_num_products_dw_108,...,indprod_num_products_ow_30,indprod_add_to_cart_order,indprod_add_to_cart_proportion,indprod_inorder_1,indprod_inorder_2,indprod_inorder_3,indprod_days_since_last,indprod_orders_since_last,indprod_proportion_orders,indprod_days_per_order
0,1,196,0.781831,-0.62349,0.537968,0.842965,10,0.653678,2.527365,5.695093,...,8.826064,1.4,0.245278,1,1,1,14.0,1,1.0,17.6
1,1,10258,0.710747,-0.703448,0.484836,0.874605,9,0.653661,2.501588,5.399693,...,8.032364,3.333333,0.562037,1,1,1,14.0,1,0.9,19.555556
2,1,10326,-0.433884,-0.900969,0.781831,0.62349,1,0.003687,0.154487,0.536575,...,0.870551,5.0,0.625,0,0,0,97.0,6,0.1,176.0
3,1,12427,0.781831,-0.62349,0.537968,0.842965,10,0.653678,2.527365,5.695093,...,8.826064,3.3,0.541667,1,1,1,14.0,1,1.0,17.6
4,1,13032,0.552838,-0.833289,0.781831,0.62349,3,0.480567,1.125472,1.928503,...,2.701135,6.333333,0.962963,1,0,0,14.0,1,0.3,58.666667


In [28]:
prior_indprod_stats.columns = ['user_id', 'product_id', 'indprod_dow_sin', 
                               'indprod_dow_cos', 'indprod_hod_sin',
                               'indprod_hod_cos', 'indprod_num_products',
                               'indprod_num_products_dw_12', 'indprod_num_products_dw_36', 
                               'indprod_num_products_dw_108', 'indprod_num_products_ow_3',
                               'indprod_num_products_ow_10', 'indprod_num_products_ow_30',
                               'indprod_add_to_cart_order', 'indprod_add_to_cart_proportion',
                               'indprod_inorder_1', 'indprod_inorder_2', 'indprod_inorder_3',
                               'indprod_days_since_last', 'indprod_orders_since_last',
                                'indprod_proportion_orders', 'indprod_days_per_order']

In [29]:
# use indprod means to add more product stats
product_order_proportions = (prior_indprod_stats[['user_id', 'product_id', 
                                                  'indprod_proportion_orders', 'indprod_days_per_order']]
                             .groupby("product_id")
                             .agg({'indprod_proportion_orders': np.mean,
                                  'indprod_days_per_order': np.mean})).reset_index()
product_order_proportions.columns = ['product_id', 'product_proportion_orders', 'product_days_per_order']
prior_product_stats = prior_product_stats.merge(product_order_proportions, on='product_id')

In [30]:
prior_product_stats.head()

Unnamed: 0,product_id,product_dow_sin,product_dow_cos,product_hod_sin,product_hod_cos,product_num_purchases,product_proportion_orders,product_days_per_order
0,1,0.854703,-0.519117,0.918258,-0.395984,1852,0.151142,150.134253
1,2,-0.103839,0.994594,0.417505,-0.908675,90,0.060412,225.976496
2,3,0.26632,0.963885,0.970072,-0.242819,277,0.187863,121.497409
3,4,-0.007467,0.999972,0.999374,-0.035387,329,0.183989,135.010912
4,5,0.12457,0.992211,-0.655405,-0.755277,15,0.164969,92.583333


In [32]:
prior_all_stats = prior_indprod_stats.merge(prior_individual_stats, on="user_id", how="left").merge(prior_product_stats, on='product_id', how="left")

In [33]:
orders_df_last = orders_df[orders_df.eval_set != "prior"].copy()
orders_df_last['order_dow_angle'] = (orders_df_last.order_dow / 
                                     (orders_df_last.order_dow.max() + 1) * 2 * np.pi - np.pi)
orders_df_last['order_hod_angle'] = (orders_df_last.order_hour_of_day / 
                                             (orders_df_last.order_dow.max() + 1) * 2 * np.pi - np.pi)
orders_df_last['order_dow_sin'] = np.sin(orders_df_last.order_dow_angle)
orders_df_last['order_dow_cos'] = np.cos(orders_df_last.order_dow_angle)
orders_df_last['order_hod_sin'] = np.sin(orders_df_last.order_hod_angle)
orders_df_last['order_hod_cos'] = np.cos(orders_df_last.order_hod_angle)

In [34]:
orders_df_last.drop(["order_number", "order_dow", "order_hour_of_day", "order_dow_angle", "order_hod_angle"], axis=1, inplace=True)

In [35]:
prior_all_stats = prior_all_stats.merge(orders_df_last, on="user_id", how="inner")

In [36]:
products_df_merged = products_df_merged.join(pd.get_dummies(products_df_merged.aisle))

In [37]:
products_df_merged.drop(['product_name', 'aisle_id', 'department_id', 'department', 'aisle'], axis=1, inplace=True)

In [38]:
prior_all_stats = prior_all_stats.merge(products_df_merged, on="product_id")

In [39]:
prior_all_stats = prior_all_stats.merge(train_df[['user_id', 'product_id', 'reordered']], how="left", on=["user_id", "product_id"])
prior_all_stats.reordered = prior_all_stats.reordered.fillna(0)                  


In [43]:
# split into train, validation, and test sets
prior_all_stats['validation_set'] = 0
prior_all_stats['prediction'] = 0
valid_users = prior_all_stats.loc[prior_all_stats.eval_set == "train", "user_id"].unique()
valid_users = pd.Series(valid_users).sample(frac=.1)


In [44]:
prior_all_stats.loc[prior_all_stats.user_id.isin(valid_users), 'validation_set'] = 1


In [45]:

prior_train = prior_all_stats.loc[(prior_all_stats.eval_set == "train") & (prior_all_stats.validation_set == 0)]
prior_valid = prior_all_stats.loc[prior_all_stats.validation_set == 1]
prior_test = prior_all_stats.loc[prior_all_stats.eval_set == "test"]

X_train = prior_train.drop(["prediction", "eval_set", "validation_set", "order_id", "reordered", "user_id", "product_id"], axis=1).as_matrix()
y_train = prior_train.reordered.as_matrix()
X_valid = prior_valid.drop(["prediction", "eval_set", "validation_set", "order_id", "reordered", "user_id", "product_id"], axis=1).as_matrix()
y_valid = prior_valid.reordered.as_matrix()
X_test = prior_test.drop(["prediction", "eval_set", "validation_set", "order_id", "reordered", "user_id", "product_id"], axis=1).as_matrix()

In [46]:
prior_valid.shape

(837897, 187)

In [47]:
prior_train.shape

(7636764, 187)

In [48]:
X_train.shape

(7636764, 180)

In [49]:
import xgboost as xgb

In [50]:
%%time
# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'auc'
params['eta'] = 0.1
params['max_depth'] = 6
params['nthread'] = 12
# params['scale_pos_weight'] = (1 - y_train.mean())/(y_train.mean())

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 200, watchlist, early_stopping_rounds=50, verbose_eval=10)

[0]	train-auc:0.82355	valid-auc:0.821041
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[10]	train-auc:0.82764	valid-auc:0.824945
[20]	train-auc:0.828971	valid-auc:0.82619
[30]	train-auc:0.83031	valid-auc:0.827481
[40]	train-auc:0.831524	valid-auc:0.828569
[50]	train-auc:0.832438	valid-auc:0.829341
[60]	train-auc:0.83315	valid-auc:0.829954
[70]	train-auc:0.833638	valid-auc:0.830312
[80]	train-auc:0.834085	valid-auc:0.830628
[90]	train-auc:0.834478	valid-auc:0.830874
[100]	train-auc:0.834836	valid-auc:0.831076
[110]	train-auc:0.835088	valid-auc:0.831222
[120]	train-auc:0.835338	valid-auc:0.831377
[130]	train-auc:0.835609	valid-auc:0.831537
[140]	train-auc:0.835863	valid-auc:0.831667
[150]	train-auc:0.836087	valid-auc:0.831785
[160]	train-auc:0.83628	valid-auc:0.831874
[170]	train-auc:0.836527	valid-auc:0.831992
[180]	train-auc:0.836747	valid-auc:0.832071
[190]	train-auc:0.836911	valid-auc:0.8

In [51]:
bst.dump_model('dump_addtime.raw.txt')

In [52]:
y_predicted = bst.predict(d_valid)

In [53]:
prior_valid = prior_valid.copy()

In [54]:
best_cutoff = 0
best_cutoff_f1 = 0
for cutoff in np.arange(.01, 1, .01):
    prior_valid.loc[:,'prediction'] = 1 * (y_predicted > cutoff)
    prior_valid['hit'] = (prior_valid.reordered * prior_valid.prediction)
    prior_valid_agg = prior_valid.groupby("user_id").agg({'reordered': np.sum, 
                                                      'prediction': np.sum, 
                                                      'hit': np.sum})
    prior_valid_agg.loc[(prior_valid_agg.reordered == 0) & (prior_valid_agg.prediction == 0), 'hit'] = 1
    prior_valid_agg.loc[(prior_valid_agg.reordered == 0), 'reordered'] = 1
    prior_valid_agg.loc[(prior_valid_agg.prediction == 0), 'prediction'] = 1
    prior_valid_agg['precision'] = (prior_valid_agg['hit']) / (prior_valid_agg['prediction'])
    prior_valid_agg['recall'] = (prior_valid_agg['hit']) / (prior_valid_agg['reordered'])
    prior_valid_agg['f1'] = 2 * prior_valid_agg['precision'] * prior_valid_agg['recall'] / (prior_valid_agg['precision'] + prior_valid_agg['recall'] + .0001)
    if prior_valid_agg['f1'].mean() > best_cutoff_f1:
        best_cutoff_f1 = prior_valid_agg['f1'].mean()
        best_cutoff = cutoff
print(best_cutoff)
print(best_cutoff_f1)

0.2
0.380658662724


In [None]:
prior_valid.loc[:,'prediction'] = 1 * (y_predicted > .93)

In [None]:
prior_valid['hit'] = (prior_valid.reordered * prior_valid.prediction)

In [None]:
prior_valid_agg = prior_valid.groupby("user_id").agg({'reordered': np.sum, 
                                                      'prediction': np.sum, 
                                                      'hit': np.sum})
prior_valid_agg.loc[(prior_valid_agg.reordered == 0) & (prior_valid_agg.prediction == 0), 'hit'] = 1
prior_valid_agg.loc[(prior_valid_agg.reordered == 0), 'reordered'] = 1
prior_valid_agg.loc[(prior_valid_agg.prediction == 0), 'prediction'] = 1

In [None]:
prior_valid_agg.hit.describe()

In [None]:
prior_valid_agg['precision'] = (prior_valid_agg['hit']) / (prior_valid_agg['prediction'])
prior_valid_agg['recall'] = (prior_valid_agg['hit']) / (prior_valid_agg['reordered'])

In [None]:
prior_valid_agg.recall.describe()

In [None]:
#prior_valid_agg.recall = prior_valid_agg.recall.fillna(0)
#prior_valid_agg.precision = prior_valid_agg.precision.fillna(0)
prior_valid_agg['f1'] = 2 * prior_valid_agg['precision'] * prior_valid_agg['recall'] / (prior_valid_agg['precision'] + prior_valid_agg['recall'] + .00001)

In [60]:
prior_valid_agg['f1'].mean()

0.18762922417629777

In [55]:
d_test = xgb.DMatrix(X_test)
y_test = bst.predict(d_test)

In [56]:
prior_test = prior_test.copy()
prior_test['prediction'] = 1 * (y_test > best_cutoff)

In [57]:
prediction_df = prior_test[prior_test['prediction'] == 1].copy()

In [58]:
prediction_df = prediction_df[['order_id', 'product_id']]

In [59]:
prediction_lists = prediction_df.groupby('order_id').agg(lambda x: " ".join(x.astype(str))).reset_index()

In [60]:
prediction_lists = prediction_lists.merge(orders_df.loc[orders_df.eval_set == "test", ['order_id']], on='order_id', how="right")

In [61]:
prediction_lists['products'] = prediction_lists.product_id.fillna("None")

In [62]:
prediction_lists = prediction_lists[['order_id', 'products']]

In [63]:
prediction_lists.to_csv("submissions/addtime_xgb.csv", index=False)

In [64]:
prediction_lists.shape

(75000, 2)

In [65]:
prediction_lists.head()

Unnamed: 0,order_id,products
0,17,47766 21463 13107
1,34,13176 47766 47792 21137 48523 43504 39180 3947...
2,137,41787 24852 5134 38689 25890 2326 23794
3,182,47209 11520 39275 13629 47672 5479 33000 41149...
4,257,49235 24852 27966 37646 21137 13870 24838 2710...
