In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

% matplotlib inline

In [4]:
aisles_df = pd.read_csv("data/aisles.csv")
departments_df = pd.read_csv("data/departments.csv")
products_df = pd.read_csv("data/products.csv")
orders_df = pd.read_csv("data/orders.csv")
prior_df = pd.read_csv("data/order_products__prior.csv")
train_df = pd.read_csv("data/order_products__train.csv")
products_df_merged = products_df.merge(departments_df, on="department_id").merge(aisles_df, on="aisle_id")

In [13]:
prior_df = prior_df.merge(orders_df[["order_id", "user_id"]], on="order_id")

In [14]:
train_df = train_df.merge(orders_df[["order_id", "user_id"]], on="order_id")

In [16]:
train_users = orders_df[orders_df.eval_set == "train"].user_id

In [23]:
train_users_small = train_users.sample(n=10000)

In [24]:
orders_df_small = orders_df[orders_df.user_id.isin(train_users_small)]
prior_df_small = prior_df[prior_df.user_id.isin(train_users_small)]
train_df_small = train_df[train_df.user_id.isin(train_users_small)]

In [44]:
orders_df_small.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
79,280530,9,prior,1,1,17,
80,2762092,9,prior,2,0,14,6.0
81,1830137,9,prior,3,5,12,30.0
82,1094988,9,train,4,6,10,30.0
314,361493,27,prior,1,3,9,


In [45]:
prior_df_small.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id
202,23,25931,1,1,68572
203,23,35163,2,1,68572
204,23,29662,3,1,68572
205,23,26283,4,1,68572
206,23,47766,5,1,68572


In [46]:
prior_df_small = prior_df_small.merge(orders_df_small, on=["order_id", "user_id"])

In [47]:
prior_df_small.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,23,25931,1,1,68572,prior,21,1,18,8.0
1,23,35163,2,1,68572,prior,21,1,18,8.0
2,23,29662,3,1,68572,prior,21,1,18,8.0
3,23,26283,4,1,68572,prior,21,1,18,8.0
4,23,47766,5,1,68572,prior,21,1,18,8.0


In [157]:
prior_df_small['order_dow_angle'] = (prior_df_small.order_dow / 
                                     (prior_df_small.order_dow.max() + 1) * 2 * np.pi)
prior_df_small['order_hod_angle'] = (prior_df_small.order_hour_of_day / 
                                             (prior_df_small.order_dow.max() + 1) * 2 * np.pi)

In [160]:
prior_df_small.columns

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered', 'user_id',
       'eval_set', 'order_number', 'order_dow', 'order_hour_of_day',
       'days_since_prior_order', 'order_dow_angle', 'order_hour_of_day_angle',
       'order_dow_sin', 'order_dow_cos', 'order_hour_of_day_sin',
       'order_hour_of_day_cos', 'num_orders', 'num_products',
       'order_hod_angle'],
      dtype='object')

In [161]:
%%time
prior_df_small['order_dow_sin'] = np.sin(prior_df_small.order_dow_angle)
prior_df_small['order_dow_cos'] = np.cos(prior_df_small.order_dow_angle)
prior_df_small['order_hod_sin'] = np.sin(prior_df_small.order_hod_angle)
prior_df_small['order_hod_cos'] = np.cos(prior_df_small.order_hod_angle)
prior_df_small['num_products'] = 1


CPU times: user 142 ms, sys: 31.9 ms, total: 173 ms
Wall time: 173 ms


In [167]:
prior_product_stats = prior_df_small.groupby("product_id").agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': np.sum})
prior_product_stats['order_dow_angle'] = np.arctan2(prior_product_stats.order_dow_sin, prior_product_stats.order_dow_cos)
prior_product_stats['order_hod_angle'] = np.arctan2(prior_product_stats.order_hod_sin, prior_product_stats.order_hod_cos)

In [168]:
prior_product_stats.order_dow_sin = np.sin(prior_product_stats.order_dow_angle)
prior_product_stats.order_dow_cos = np.cos(prior_product_stats.order_dow_angle)
prior_product_stats.order_hod_sin = np.sin(prior_product_stats.order_hod_angle)
prior_product_stats.order_hod_cos = np.cos(prior_product_stats.order_hod_angle)
prior_product_stats.drop(['order_dow_angle', 'order_hod_angle'], axis=1, inplace=True)
prior_product_stats.reset_index(inplace=True)
prior_product_stats.columns = ['product_id', 'product_dow_sin', 'product_dow_cos', 'product_hod_sin', 'product_hod_cos', 'product_num_purchases']
prior_product_stats.head()

Unnamed: 0,product_id,product_dow_sin,product_dow_cos,product_hod_sin,product_hod_cos,product_num_purchases
0,1,0.557573,0.830128,0.916981,-5.935116,90
1,2,0.498262,0.867026,0.367207,1.099031,3
2,3,0.136373,0.990657,0.964492,-0.928116,38
3,4,0.411151,0.911567,0.08732,3.969501,14
4,9,-0.889804,0.456343,0.918202,-0.945042,5


In [197]:
prior_product_stats.head()

Unnamed: 0,product_id,product_dow_sin,product_dow_cos,product_hod_sin,product_hod_cos,product_num_purchases
0,1,0.557573,0.830128,0.916981,-5.935116,90
1,2,0.498262,0.867026,0.367207,1.099031,3
2,3,0.136373,0.990657,0.964492,-0.928116,38
3,4,0.411151,0.911567,0.08732,3.969501,14
4,9,-0.889804,0.456343,0.918202,-0.945042,5


In [175]:
prior_individual_stats = prior_df_small.groupby(["user_id", "order_id"]).agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': np.sum})

In [176]:
prior_individual_stats['num_orders'] = 1
prior_individual_stats = prior_individual_stats.reset_index().groupby("user_id").agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': [np.sum, np.mean],
                                                            'num_orders': np.sum})

In [177]:
prior_individual_stats.columns = ['order_dow_sin', 'order_dow_cos', 'order_hod_sin', 'order_hod_cos', 'num_products', 'mean_products', 'num_orders']

In [178]:
prior_individual_stats['order_dow_angle'] = np.arctan2(prior_individual_stats.order_dow_sin, prior_individual_stats.order_dow_cos)
prior_individual_stats['order_hod_angle'] = np.arctan2(prior_individual_stats.order_hod_sin, prior_individual_stats.order_hod_cos)
prior_individual_stats.order_dow_sin = np.sin(prior_individual_stats.order_dow_angle)
prior_individual_stats.order_dow_cos = np.cos(prior_individual_stats.order_dow_angle)
prior_individual_stats.order_hod_sin = np.sin(prior_individual_stats.order_hod_angle)
prior_individual_stats.order_hod_cos = np.cos(prior_individual_stats.order_hod_angle)
prior_individual_stats.drop(['order_dow_angle', 'order_hod_angle'], axis=1, inplace=True)

In [181]:
prior_individual_stats = prior_individual_stats.reset_index()

In [180]:
prior_individual_stats.columns = ['user_id', 'user_dow_sin', 'user_dow_cos', 'user_how_sin', 'user_hod_cos', 'user_num_products', 'user_mean_products', 'user_num_orders']
prior_individual_stats.head()

Unnamed: 0,user_id,user_dow_sin,user_dow_cos,user_how_sin,user_hod_cos,user_num_products,user_mean_products,user_num_orders
0,9,-0.437647,0.899147,-0.663223,-23.817299,76,25.333333,3
1,27,0.877684,-0.479239,0.870565,109.004567,768,9.481481,81
2,38,-0.648625,0.761108,0.184913,-39.280653,195,16.25,12
3,48,-0.988459,0.151491,0.013915,-39.27389,113,11.3,10
4,50,0.99645,0.084186,-0.218512,-135.797696,453,6.761194,67


In [184]:
prior_indprod_stats = prior_df_small.groupby(["user_id", "product_id"]).agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': np.sum})

In [185]:
prior_indprod_stats = prior_indprod_stats.reset_index().merge(prior_individual_stats[['user_id', 'user_num_orders']], on='user_id', how='left')
prior_indprod_stats['order_dow_angle'] = np.arctan2(prior_indprod_stats.order_dow_sin, prior_indprod_stats.order_dow_cos)
prior_indprod_stats['order_hod_angle'] = np.arctan2(prior_indprod_stats.order_hod_sin, prior_indprod_stats.order_hod_cos)
prior_indprod_stats['proportion_orders'] = prior_indprod_stats.num_products / prior_indprod_stats.user_num_orders

In [186]:
prior_indprod_stats.order_dow_sin = np.sin(prior_indprod_stats.order_dow_angle)
prior_indprod_stats.order_dow_cos = np.cos(prior_indprod_stats.order_dow_angle)
prior_indprod_stats.order_hod_sin = np.sin(prior_indprod_stats.order_hod_angle)
prior_indprod_stats.order_hod_cos = np.cos(prior_indprod_stats.order_hod_angle)
prior_indprod_stats.drop(['order_dow_angle', 'order_hod_angle'], axis=1, inplace=True)

In [190]:
prior_indprod_stats.columns = ['user_id', 'product_id', 'indprod_dow_sin', 
                               'indprod_dow_cos', 'indprod_hod_sin',
                               'indprod_hod_cos', 'indprod_num_products', 
                               'indprod_num_orders', 'indprod_proportion_orders']

In [192]:
prior_indprod_stats.shape

(646031, 9)

In [193]:
prior_all_stats = prior_indprod_stats.merge(prior_individual_stats, on="user_id", how="left").merge(prior_product_stats, on='product_id', how="left")

In [194]:
prior_all_stats.shape

(646031, 22)

In [195]:
prior_all_stats.head()

Unnamed: 0,user_id,product_id,indprod_dow_sin,indprod_dow_cos,indprod_hod_sin,indprod_hod_cos,indprod_num_products,indprod_num_orders,indprod_proportion_orders,index,...,user_how_sin,user_hod_cos,user_num_products,user_mean_products,user_num_orders,product_dow_sin,product_dow_cos,product_hod_sin,product_hod_cos,product_num_purchases
0,9,311,0.781831,0.62349,0.433884,-0.900969,1,3,0.333333,0,...,-0.663223,-23.817299,76,25.333333,3,-0.047624,0.998865,0.951203,9.038032,402
1,9,481,-0.433884,0.900969,-0.433884,-0.900969,2,3,0.666667,0,...,-0.663223,-23.817299,76,25.333333,3,0.024629,0.999697,0.32968,-12.212595,142
2,9,1559,-0.974928,-0.222521,-0.974928,-0.222521,1,3,0.333333,0,...,-0.663223,-23.817299,76,25.333333,3,-0.925191,0.379502,0.998729,-2.735135,357
3,9,2732,-0.974928,-0.222521,-0.974928,-0.222521,1,3,0.333333,0,...,-0.663223,-23.817299,76,25.333333,3,-0.257799,0.966198,0.234817,-6.226701,207
4,9,3634,-0.974928,-0.222521,-0.974928,-0.222521,1,3,0.333333,0,...,-0.663223,-23.817299,76,25.333333,3,-0.579493,0.814977,0.852163,3.479189,110


In [200]:
prior_all_stats.drop("index", axis=1, inplace=True)

In [201]:
prior_all_stats.shape

(646031, 21)

In [202]:
prior_all_stats.head()

Unnamed: 0,user_id,product_id,indprod_dow_sin,indprod_dow_cos,indprod_hod_sin,indprod_hod_cos,indprod_num_products,indprod_num_orders,indprod_proportion_orders,user_dow_sin,...,user_how_sin,user_hod_cos,user_num_products,user_mean_products,user_num_orders,product_dow_sin,product_dow_cos,product_hod_sin,product_hod_cos,product_num_purchases
0,9,311,0.781831,0.62349,0.433884,-0.900969,1,3,0.333333,-0.437647,...,-0.663223,-23.817299,76,25.333333,3,-0.047624,0.998865,0.951203,9.038032,402
1,9,481,-0.433884,0.900969,-0.433884,-0.900969,2,3,0.666667,-0.437647,...,-0.663223,-23.817299,76,25.333333,3,0.024629,0.999697,0.32968,-12.212595,142
2,9,1559,-0.974928,-0.222521,-0.974928,-0.222521,1,3,0.333333,-0.437647,...,-0.663223,-23.817299,76,25.333333,3,-0.925191,0.379502,0.998729,-2.735135,357
3,9,2732,-0.974928,-0.222521,-0.974928,-0.222521,1,3,0.333333,-0.437647,...,-0.663223,-23.817299,76,25.333333,3,-0.257799,0.966198,0.234817,-6.226701,207
4,9,3634,-0.974928,-0.222521,-0.974928,-0.222521,1,3,0.333333,-0.437647,...,-0.663223,-23.817299,76,25.333333,3,-0.579493,0.814977,0.852163,3.479189,110
