In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.metrics import accuracy_score, recall_score, precision_score, \
f1_score, roc_curve, auc, roc_auc_score, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from collections import OrderedDict, Counter
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

sns.set(font_scale=1.2)


Preparing for the test set

In [72]:
# Restore the pickled `test_recent_order` frame from disk.
with open('pkl/test_order.pkl', 'rb') as order_file:
    test_recent_order = pickle.load(order_file)

In [3]:
test_recent_order.shape

(12232, 7)

In [2]:
# The pickle stores a 3-tuple; only the third element (`prior_details`) is kept.
with open('processing/0806m.pkl', 'rb') as details_file:
    _, _, prior_details = pickle.load(details_file)

In [12]:
# Keep only the rows whose user appears in `test_recent_order`.
# NOTE(review): `prior_summary` is not created by any cell shown in this
# notebook -- it must come from earlier kernel state; confirm where it is loaded.
is_test_user = prior_summary['user_id'].isin(test_recent_order['user_id'])
df_test = prior_summary[is_test_user]

In [13]:
df_test.shape

(789105, 3)

In [8]:
# The pickle stores a pair; only the second element is used here.
with open('pkl/test_train_recent_order.pkl', 'rb') as recent_file:
    _, recent_order_test = pickle.load(recent_file)

In [14]:
df_test.head()

Unnamed: 0,user_id,product_id,user_product_reordered_past
205,7,274,0
206,7,519,1
207,7,4920,6
208,7,4945,2
209,7,6361,4


In [15]:
# Left-join the `reordered` column onto every (user_id, product_id) row;
# pairs absent from `recent_order_test` end up with NaN.
label_cols = ['user_id', 'product_id', 'reordered']
df_test = df_test.merge(
    recent_order_test[label_cols],
    how='left',
    on=['user_id', 'product_id'],
)

In [17]:
df_test.head()

Unnamed: 0,user_id,product_id,user_product_reordered_past,reordered
0,7,274,0,
1,7,519,1,
2,7,4920,6,
3,7,4945,2,
4,7,6361,4,


In [21]:
# Pairs with no match in the left merge above carry NaN in `reordered`;
# replace those with 0.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the column selection: that form is chained
# assignment, which raises a FutureWarning in pandas 2.x and leaves `df_test`
# untouched once Copy-on-Write is enabled.
df_test['reordered'] = df_test['reordered'].fillna(0)

In [22]:
# Checkpoint the labelled test frame.
with open('processing/0806_5pm_df_test.pkl', 'wb') as out_file:
    pickle.dump(df_test, out_file)

In [26]:
# Restore the pickled `prior_details_feature` frame.
with open('processing/0806_1pm_prior_details_feature.pkl', 'rb') as feature_file:
    prior_details_feature = pickle.load(feature_file)

In [27]:
prior_details_feature.head(2)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,dow_binary,user_product_add_to_cart_order_scale
0,2,33120,1,1,202279,prior,3,5,9,8.0,0,0.111111
1,2,28985,2,1,202279,prior,3,5,9,8.0,0,0.222222


In [28]:
# One row per (user, order): number of product rows in the order, plus the
# order's first `days_since_prior_order` and `dow_binary` values.
per_order_aggs = OrderedDict([
    ('product_id', 'count'),
    ('days_since_prior_order', 'first'),
    ('dow_binary', 'first'),
])
user_feature_temp = (
    prior_details_feature
    .groupby(['user_id', 'order_id'], as_index=False)
    .agg(per_order_aggs)
)

In [29]:
# The per-order product count is really the cart size; name it accordingly.
user_feature_temp = user_feature_temp.rename(columns={'product_id': 'total_product'})

In [30]:
# Collapse the per-order rows to one row per user.
# `total_product` gets two aggregations, so the result has two-level column
# labels until the columns are renamed.
per_user_aggs = OrderedDict([
    ('order_id', 'count'),
    ('total_product', ['mean', 'sum']),
    ('days_since_prior_order', 'mean'),
    ('dow_binary', 'mean'),
])
user_feature = (
    user_feature_temp
    .groupby(['user_id'], as_index=False)
    .agg(per_user_aggs)
)

In [31]:
# Flat column labels, listed in the same order as the aggregation output.
user_feature_colname = [
    'user_id',
    'user_total_order',
    'user_cart_size',
    'user_total_product',
    'user_ave_days_since_last_order',
    'user_likely_dow',
]
user_feature.columns = user_feature_colname

In [32]:
user_feature.head()

Unnamed: 0,user_id,user_total_order,user_cart_size,user_total_product,user_ave_days_since_last_order,user_likely_dow
0,1,10,5.9,59,19.555556,0.3
1,2,14,13.928571,195,15.230769,0.357143
2,3,12,7.333333,88,12.090909,0.666667
3,4,5,3.6,18,13.75,0.0
4,5,4,9.25,37,13.333333,0.5


In [33]:
# Count distinct products per user, then attach the count to the user features.
user_uniq_product = prior_details.groupby('user_id')['product_id'].nunique()
user_feature = (
    user_feature
    .merge(user_uniq_product.to_frame(), how='left',
           left_on='user_id', right_index=True)
    .rename(columns={'product_id': 'user_total_uniq_product'})
)

In [34]:
del user_uniq_product

In [35]:
# Share of the user's purchased items that are distinct products.
user_feature['user_uniq_prod_over_total_prod'] = (
    user_feature['user_total_uniq_product'].div(user_feature['user_total_product'])
)

In [36]:
# Checkpoint the per-user feature table.
with open('processing/0806_5pm_user_feature.pkl', 'wb') as out_file:
    pickle.dump(user_feature, out_file)

In [37]:
user_feature.head(3)

Unnamed: 0,user_id,user_total_order,user_cart_size,user_total_product,user_ave_days_since_last_order,user_likely_dow,user_total_uniq_product,user_uniq_prod_over_total_prod
0,1,10,5.9,59,19.555556,0.3,18,0.305085
1,2,14,13.928571,195,15.230769,0.357143,102,0.523077
2,3,12,7.333333,88,12.090909,0.666667,33,0.375


In [39]:
# Broadcast each user's features onto all of that user's rows.
df_user_test = df_test.merge(user_feature, how='left', on='user_id')

In [40]:
df_user_test.shape

(789105, 11)

In [41]:
# Checkpoint the test frame with user features attached.
with open('processing/0806_5pm_df_user_test.pkl', 'wb') as out_file:
    pickle.dump(df_user_test, out_file)

In [42]:
del df_test

Adding product features

In [3]:
# Per-product features over all of `prior_details`: how many rows contain the
# product, and its mean add-to-cart position.
prod_feature = ['product_total_orders', 'product_avg_add_to_cart_order']
per_product_aggs = OrderedDict([
    ('order_id', 'count'),
    ('add_to_cart_order', 'mean'),
])
prod_features_df = (
    prior_details
    .groupby('product_id', as_index=False)
    .agg(per_product_aggs)
)
prod_features_df.columns = ['product_id'] + prod_feature
prod_features_df.head()

Unnamed: 0,product_id,product_total_orders,product_avg_add_to_cart_order
0,1,1852,5.801836
1,2,90,9.888889
2,3,277,6.415162
3,4,329,9.507599
4,5,15,6.466667


In [11]:
# with open('processing/0806_5pm_prod_feature_df', 'wb') as f:
#     pickle.dump(prod_features_df, f)

In [7]:
# with open('processing/0806_5pm_df_user_test.pkl', 'rb') as f:
#     df_user_test= pickle.load(f)

In [8]:
df_user_test.shape

(789105, 11)

In [9]:
# Attach the product-level features to every row by product_id.
df_prod_test = df_user_test.merge(prod_features_df, how='left', on='product_id')

In [10]:
df_prod_test.shape

(789105, 13)

In [13]:
df_prod_test.head(2)

Unnamed: 0,user_id,product_id,user_product_reordered_past,reordered,user_total_order,user_cart_size,user_total_product,user_ave_days_since_last_order,user_likely_dow,user_total_uniq_product,user_uniq_prod_over_total_prod,product_total_orders,product_avg_add_to_cart_order
0,7,274,0,0.0,20,10.3,206,10.684211,0.55,68,0.330097,2205,10.711111
1,7,519,1,0.0,20,10.3,206,10.684211,0.55,68,0.330097,953,7.521511


In [14]:
# Sum `user_product_reordered_past` per product across the rows of
# `df_prod_test`, then join that total back as `product_past_reordered_count`.
prod_reordered = (
    df_prod_test
    .groupby('product_id', as_index=False)['user_product_reordered_past']
    .sum()
)
df_prod_temp = df_prod_test.merge(
    # Renaming before the merge avoids the _x/_y suffix collision.
    prod_reordered.rename(
        columns={'user_product_reordered_past': 'product_past_reordered_count'}
    ),
    how='left',
    on='product_id',
)

In [15]:
# Reorder count as a percentage of the product's total orders.
# `product_total_orders` is never 0 (checked), so the division is safe.
# NOTE(review): the numerator is summed over the rows of `df_prod_test` only,
# while `product_total_orders` was counted over all of `prior_details` --
# confirm the training set builds this ratio the same way.
df_prod_temp['product_percent_reordered_past'] = (
    df_prod_temp['product_past_reordered_count']
    .div(df_prod_temp['product_total_orders'])
    .mul(100)
)

In [16]:
df_prod_test=df_prod_temp.copy()
del df_prod_temp

In [18]:
# with open('processing/0806_5pm_df_prod_test.pkl', 'wb') as f:
#     pickle.dump(df_prod_test, f)


In [19]:
df_prod_test.shape

(789105, 15)

### Add user-product feature.
**1. ave_add_to_cart_order**  
**2. user_product_ordered_last_n_order - how many orders ago the user last ordered that product**  
**3. user_product_ave_days_since_prior_order**  
**4. ave_days_between_product_order (product_order_interval)**  
**5. user_product_ave_add_to_cart_order_scale (average add-to-cart order, scaled within each user)**  
**6. user_product_ave_add_to_cart_order (not scaled)**  
**7. user_product_percent_order_containing_product**  
**8. user_product_days_since_last_product_order**  
**9. on average, how many days pass between the user's orders of the product - user_product_ave_day_per_product_order**  
**11. user_product_ave_day_between_product_order_versus_current_day_since_prior - ratio of the average number of days between the user's purchases of a product to the days since their last order**  
**12. user_product_ordered_same_day - whether the user has ordered the same product that day**  
**13. ave_add_to_cart_order_scale_x_reorder_count (for logistic regression)**  
 

In [20]:
# Restore the pickled user-product feature table.
with open('processing/0806_1pm_user_product_feature_bk.pkl', 'rb') as upf_file:
    user_product_feature = pickle.load(upf_file)

In [21]:
# Keep a second name for the freshly loaded table. This is an alias, not a
# copy: it only stays a "backup" as long as the original object is never
# mutated in place (later cells rebind `user_product_feature` to new frames).
user_product_feature_bk = user_product_feature

In [22]:
user_product_feature.head()

Unnamed: 0,user_id,product_id,user_product_ave_add_to_cart_order,user_product_ave_add_to_cart_order_scale,user_product_ave_days_since_prior_order,user_product_most_recent_order
0,1,196,1.4,0.245278,19.555556,10
1,1,10258,3.333333,0.562037,19.555556,10
2,1,10326,5.0,0.625,28.0,5
3,1,12427,3.3,0.541667,19.555556,10
4,1,13032,6.333333,0.962963,21.666667,10


In [38]:
# How many orders ago the product was last bought:
# user's total order count minus the order number of the most recent purchase.
# `user_total_order` is only needed for the subtraction and is dropped again.
upf = (
    user_product_feature
    .merge(user_feature[['user_id', 'user_total_order']], how='left', on='user_id')
    .assign(
        user_product_ordered_last_n_order=lambda d: (
            d['user_total_order'] - d['user_product_most_recent_order']
        )
    )
    .drop(columns='user_total_order')
)

In [45]:
upf.shape, user_product_feature.shape

((13307953, 7), (13307953, 6))

In [46]:
upf.head(3)

Unnamed: 0,user_id,product_id,user_product_ave_add_to_cart_order,user_product_ave_add_to_cart_order_scale,user_product_ave_days_since_prior_order,user_product_most_recent_order,user_product_ordered_last_n_order
0,1,196,1.4,0.245278,19.555556,10,0
1,1,10258,3.333333,0.562037,19.555556,10,0
2,1,10326,5.0,0.625,28.0,5,5


In [47]:
# with open('processing/0806_5pm_upf.pkl', 'wb') as f:
#     pickle.dump(upf, f)

Calculate days since last time the user ordered the product.

In [49]:
# One row per (user, order_number) from the prior orders, sorted newest-first
# within each user so the running sum of `days_since_prior_order` accumulates
# backwards from the latest prior order.
per_order_first = OrderedDict([
    ('order_id', 'first'),
    ('days_since_prior_order', 'first'),
])
df_temp = (
    prior_details
    .groupby(['user_id', 'order_number'], as_index=False)
    .agg(per_order_first)
    .sort_values(['user_id', 'order_number'], ascending=[True, False])
)
running_gap = (
    df_temp
    .groupby('user_id', as_index=False)['days_since_prior_order']
    .cumsum()
)
df_temp['cumulative_days_since_nth_order'] = running_gap

In [50]:
# Rows whose running total is NaN (presumably each user's first order, which
# has no `days_since_prior_order` -- confirm) inherit the value of the row
# above them in the sorted frame, plus a flat 7-day allowance.
# `Series.ffill()` replaces the deprecated `fillna(method='ffill')`; it returns
# a new Series, so no explicit copy is needed before modifying it.
missing_cumulative = df_temp['cumulative_days_since_nth_order'].isnull()
a = df_temp['cumulative_days_since_nth_order'].ffill()
a.loc[missing_cumulative] += 7

In [51]:
df_temp['cumulative_days_since_nth_order']=a.copy()

In [52]:
# Look up the cumulative day gap of the order in which each product was most
# recently bought, then discard the two join-key columns.
user_product_feature = (
    upf
    .merge(
        df_temp.drop(columns=['order_id', 'days_since_prior_order']),
        how='left',
        left_on=['user_id', 'user_product_most_recent_order'],
        right_on=['user_id', 'order_number'],
    )
    .drop(columns=['user_product_most_recent_order', 'order_number'])
)

In [53]:
# Give the looked-up cumulative gap its feature name.
user_product_feature = user_product_feature.rename(
    columns={
        'cumulative_days_since_nth_order': 'user_product_days_since_last_product_order',
    }
)

In [54]:
# Where the average gap is missing, fall back to the days since the product
# was last ordered.
# Assigned back explicitly: `fillna(..., inplace=True)` on a column selection
# is chained assignment, which warns in pandas 2.x and does not modify the
# frame under Copy-on-Write.
user_product_feature['user_product_ave_days_since_prior_order'] = (
    user_product_feature['user_product_ave_days_since_prior_order']
    .fillna(user_product_feature['user_product_days_since_last_product_order'])
)

In [55]:
user_product_feature.head(2)

Unnamed: 0,user_id,product_id,user_product_ave_add_to_cart_order,user_product_ave_add_to_cart_order_scale,user_product_ave_days_since_prior_order,user_product_ordered_last_n_order,user_product_days_since_last_product_order
0,1,196,1.4,0.245278,19.555556,0,30.0
1,1,10258,3.333333,0.562037,19.555556,0,30.0


In [56]:
# with open('processing/0806_5pm_user_product_feature_final.pkl', 'wb') as f:
#     pickle.dump(user_product_feature, f)

Add feature 'user_product_percent_order_containing_product' 

In [57]:
# Join the user-product features, then derive the share of the user's orders
# that contained the product (past reorders + 1, over the total order count).
df_up_test = df_prod_test.merge(
    user_product_feature, how='left', on=['user_id', 'product_id']
)
orders_with_product = df_up_test['user_product_reordered_past'] + 1
df_up_test['user_product_percent_order_containing_product'] = (
    orders_with_product / df_up_test['user_total_order']
)

In [58]:
# with open('processing/0806_1pm_df_up_test.pkl', 'wb') as f:
#     pickle.dump(df_up_test, f)

Add the average number of days between the user's orders of the product

In [59]:
# Total of `days_since_prior_order` per user: take one value per order first,
# then sum those per user.
days_since_first_order = (
    prior_details
    .groupby(['user_id', 'order_id'], as_index=False)
    .agg(OrderedDict([('days_since_prior_order', 'first')]))
    .groupby('user_id', as_index=False)['days_since_prior_order']
    .sum()
)

In [60]:
# Name the summed column after what it represents.
days_since_first_order = days_since_first_order.rename(
    columns={'days_since_prior_order': 'days_since_first_order'}
)

In [61]:
days_since_first_order.head(2)

Unnamed: 0,user_id,days_since_first_order
0,1,176.0
1,2,198.0


In [62]:
# Average number of days per purchase of the product: the user's total day
# span divided by (past reorders + 1).
df_up1_test = df_up_test.merge(days_since_first_order, how='left', on='user_id')
times_ordered = df_up1_test['user_product_reordered_past'] + 1
df_up1_test['user_product_ave_day_per_product_order'] = (
    df_up1_test['days_since_first_order'] / times_ordered
)

In [64]:
# The helper column has served its purpose; remove it from the feature set.
df_up1_test = df_up1_test.drop(columns='days_since_first_order')

In [66]:
# Checkpoint the frame with user-product features.
with open('processing/0806_5pm_df_up1_test.pkl', 'wb') as out_file:
    pickle.dump(df_up1_test, out_file)

In [68]:
df_up1_test.shape

(789105, 22)

In [70]:
df_up1_test.head(2)

Unnamed: 0,user_id,product_id,user_product_reordered_past,reordered,user_total_order,user_cart_size,user_total_product,user_ave_days_since_last_order,user_likely_dow,user_total_uniq_product,...,product_avg_add_to_cart_order,product_past_reordered_count,product_percent_reordered_past,user_product_ave_add_to_cart_order,user_product_ave_add_to_cart_order_scale,user_product_ave_days_since_prior_order,user_product_ordered_last_n_order,user_product_days_since_last_product_order,user_product_percent_order_containing_product,user_product_ave_day_per_product_order
0,7,274,0,0.0,20,10.3,206,10.684211,0.55,68,...,10.711111,34,1.54195,16.0,0.761905,30.0,18,203.0,0.05,203.0
1,7,519,1,0.0,20,10.3,206,10.684211,0.55,68,...,7.521511,17,1.783841,10.5,0.583333,30.0,17,173.0,0.1,101.5


In [73]:
# Attach the day-of-week, hour and days-since-prior of the order in
# `test_recent_order` to every row of the matching user.
current_order_cols = ['user_id', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']
df_up2_test = df_up1_test.merge(
    test_recent_order[current_order_cols], how='left', on=['user_id']
)

In [74]:
df_up2_test.shape

(789105, 25)

In [77]:
# Distinguish the gap of the order just merged in from the historical gaps.
df_up2_test = df_up2_test.rename(
    columns={'days_since_prior_order': 'current_days_since_prior_order'}
)

In [80]:
def get_ratio(row):
    """Return average days per product order divided by the current gap.

    Reads `user_product_ave_day_per_product_order` and
    `current_days_since_prior_order` from `row`. A value that equals 0 is
    shifted to 0.5 first, which keeps the denominator non-zero.
    """
    def _shift_zero(value):
        # Only exact zeros are adjusted; every other value passes through.
        return value + 0.5 if value == 0 else value

    avg_days = _shift_zero(row['user_product_ave_day_per_product_order'])
    current_gap = _shift_zero(row['current_days_since_prior_order'])
    return avg_days / current_gap

In [81]:
# Row-wise ratio of the average days per product order to the current gap.
ratio_col = 'user_product_ave_day_between_product_order_versus_current_day_since_prior'
df_up2_test[ratio_col] = df_up2_test.apply(get_ratio, axis=1)

In [82]:
# with open('processing/df_up2_test.pkl', 'wb') as f:
#     pickle.dump(df_up2_test, f)

Add feature - whether the user ordered the item on the same day.

In [84]:
# Bring back `user_product_most_recent_order` from the untouched backup table;
# it was dropped from `user_product_feature` earlier but is needed again here.
most_recent_cols = ['user_id', 'product_id', 'user_product_most_recent_order']
df_up3_test = df_up2_test.merge(
    user_product_feature_bk[most_recent_cols],
    how='left',
    on=['user_id', 'product_id'],
)

In [85]:
def order_same_day(row):
    """Return 1 if the product was ordered earlier the same day, else 0.

    The flag is 1 only when `current_days_since_prior_order` is 0 and the
    product's most recent order number equals the user's total order count.
    """
    # A non-zero gap rules out a same-day order.
    if not row['current_days_since_prior_order'] == 0:
        return 0
    return 1 if row['user_total_order'] == row['user_product_most_recent_order'] else 0

In [88]:
# Evaluate the same-day flag for every row.
same_day_flags = df_up3_test.apply(order_same_day, axis=1)
df_up3_test['user_product_ordered_same_day'] = same_day_flags

In [89]:
df_up3_test.shape

(789105, 28)

In [90]:
# The order-number column was only needed to compute the same-day flag.
df_up3_test = df_up3_test.drop(columns='user_product_most_recent_order')

In [91]:
# Checkpoint the frame before the categorical binning steps.
with open('processing/0806_5pm_df_up3_test.pkl', 'wb') as out_file:
    pickle.dump(df_up3_test, out_file)

Bin the dummy variables the same way they were binned in the training set

In [92]:
# Restore the department and aisle bin mappings stored as a pair.
# NOTE(review): the file name ends in ",pkl" (comma, not dot). It is kept
# verbatim because it must match the name the file was saved under -- confirm
# against the cell that wrote it before renaming.
with open('processing/0806_1pm_maps,pkl', 'rb') as maps_file:
    dept_map, aisle_map = pickle.load(maps_file)

In [94]:
# Join the product catalogue columns onto every row by product_id.
product_df = pd.read_csv('data/products.csv')
df_dept_test = df_up3_test.merge(product_df, how='left', on='product_id')

In [98]:
# Replace the raw department id with its bin from `dept_map`, and drop the
# product name, which is not used as a feature.
df_dept_test = (
    df_dept_test
    .drop(columns='product_name')
    .assign(product_dept_bin=lambda d: d['department_id'].map(dept_map))
    .drop(columns='department_id')
)

In [100]:
df_dept_test.head()

Unnamed: 0,user_id,product_id,user_product_reordered_past,reordered,user_total_order,user_cart_size,user_total_product,user_ave_days_since_last_order,user_likely_dow,user_total_uniq_product,...,user_product_days_since_last_product_order,user_product_percent_order_containing_product,user_product_ave_day_per_product_order,order_dow,order_hour_of_day,current_days_since_prior_order,user_product_ave_day_between_product_order_versus_current_day_since_prior,user_product_ordered_same_day,aisle_id,product_dept_bin
0,7,274,0,0.0,20,10.3,206,10.684211,0.55,68,...,203.0,0.05,203.0,2,11,6.0,33.833333,0,87,-2
1,7,519,1,0.0,20,10.3,206,10.684211,0.55,68,...,173.0,0.1,101.5,2,11,6.0,16.916667,0,31,2
2,7,4920,6,0.0,20,10.3,206,10.684211,0.55,68,...,45.0,0.35,29.0,2,11,6.0,4.833333,0,123,2
3,7,4945,2,0.0,20,10.3,206,10.684211,0.55,68,...,17.0,0.15,67.666667,2,11,6.0,11.277778,0,123,2
4,7,6361,4,0.0,20,10.3,206,10.684211,0.55,68,...,134.0,0.25,40.6,2,11,6.0,6.766667,0,112,1


In [101]:
# Same treatment for aisles: map the raw aisle id to its bin on a new frame,
# leaving `df_dept_test` (which still has `aisle_id`) unchanged.
df_aisle_test = (
    df_dept_test
    .assign(product_aisle_bin=lambda d: d['aisle_id'].map(aisle_map))
    .drop(columns='aisle_id')
)

In [103]:
df_aisle_test.head()

Unnamed: 0,user_id,product_id,user_product_reordered_past,reordered,user_total_order,user_cart_size,user_total_product,user_ave_days_since_last_order,user_likely_dow,user_total_uniq_product,...,user_product_days_since_last_product_order,user_product_percent_order_containing_product,user_product_ave_day_per_product_order,order_dow,order_hour_of_day,current_days_since_prior_order,user_product_ave_day_between_product_order_versus_current_day_since_prior,user_product_ordered_same_day,product_dept_bin,product_aisle_bin
0,7,274,0,0.0,20,10.3,206,10.684211,0.55,68,...,203.0,0.05,203.0,2,11,6.0,33.833333,0,-2,-1
1,7,519,1,0.0,20,10.3,206,10.684211,0.55,68,...,173.0,0.1,101.5,2,11,6.0,16.916667,0,2,2
2,7,4920,6,0.0,20,10.3,206,10.684211,0.55,68,...,45.0,0.35,29.0,2,11,6.0,4.833333,0,2,2
3,7,4945,2,0.0,20,10.3,206,10.684211,0.55,68,...,17.0,0.15,67.666667,2,11,6.0,11.277778,0,2,2
4,7,6361,4,0.0,20,10.3,206,10.684211,0.55,68,...,134.0,0.25,40.6,2,11,6.0,6.766667,0,1,2


In [104]:
# Checkpoint both binned frames together as a pair.
with open('processing/0806_5pm_dept_aisle_test_df.pkl', 'wb') as out_file:
    pickle.dump((df_dept_test, df_aisle_test), out_file)

In [105]:
df_hod_test= df_aisle_test.copy()

In [106]:
# Restore the hour-of-day mapping.
with open('processing/0806_5pm_product_hod_map.pkl', 'rb') as hod_file:
    prod_hod_map = pickle.load(hod_file)

In [108]:
# Translate the raw order hour through `prod_hod_map`.
hour_of_day = df_hod_test['order_hour_of_day']
df_hod_test['current_order_hod'] = hour_of_day.map(prod_hod_map)

In [109]:
# The raw hour is superseded by `current_order_hod`.
df_hod_test = df_hod_test.drop(columns='order_hour_of_day')

In [110]:
# Persist the finished test set.
# NOTE(review): this file is tagged "1pm" while the other outputs written by
# this notebook are tagged "5pm" -- confirm it does not overwrite an older
# artefact that is still needed.
with open('processing/0806_1pm_df_hod_test.pkl', 'wb') as out_file:
    pickle.dump(df_hod_test, out_file)

The final test data set is `df_hod_test`.