In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

% matplotlib inline

In [28]:
import spacy

nlp = spacy.load('en_vectors_glove_md')

In [29]:
import warnings
warnings.filterwarnings('ignore')

In [30]:
aisles_df = pd.read_csv("data/aisles.csv")
departments_df = pd.read_csv("data/departments.csv")
products_df = pd.read_csv("data/products.csv")
orders_df = pd.read_csv("data/orders.csv")
prior_df = pd.read_csv("data/order_products__prior.csv")
train_df = pd.read_csv("data/order_products__train.csv")
products_df_merged = (products_df
                      .merge(departments_df, on="department_id")
                      .merge(aisles_df, on="aisle_id"))

In [31]:
none_train_df = train_df.groupby('order_id').agg({'product_id': lambda x: "None",
                                             'add_to_cart_order': lambda x: 0,
                                             'reordered': np.sum}).reset_index()
none_train_df.reordered = (none_train_df.reordered == 0).astype(int)
none_train_df = none_train_df.query('reordered == 1')
none_prior_df = prior_df.groupby('order_id').agg({'product_id': lambda x: "None",
                                             'add_to_cart_order': lambda x: 0,
                                             'reordered': np.sum}).reset_index()
none_prior_df.reordered = (none_prior_df.reordered == 0).astype(int)
none_prior_df = none_prior_df.query('reordered == 1')
train_df = pd.concat([train_df, none_train_df])
prior_df = pd.concat([prior_df, none_prior_df])


In [32]:
products_df_merged['allwords'] = products_df_merged.product_name.str.cat([products_df_merged.department, products_df_merged.aisle], sep=' ')

In [33]:
products_df_merged.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,department,aisle,allwords
0,1,Chocolate Sandwich Cookies,61,19,snacks,cookies cakes,Chocolate Sandwich Cookies snacks cookies cakes
1,78,Nutter Butter Cookie Bites Go-Pak,61,19,snacks,cookies cakes,Nutter Butter Cookie Bites Go-Pak snacks cooki...
2,102,Danish Butter Cookies,61,19,snacks,cookies cakes,Danish Butter Cookies snacks cookies cakes
3,172,Gluten Free All Natural Chocolate Chip Cookies,61,19,snacks,cookies cakes,Gluten Free All Natural Chocolate Chip Cookies...
4,285,Mini Nilla Wafers Munch Pack,61,19,snacks,cookies cakes,Mini Nilla Wafers Munch Pack snacks cookies cakes


In [34]:
vectors = np.array(products_df_merged.allwords.apply(lambda x: nlp(x).vector).tolist())

In [35]:
from sklearn.decomposition import PCA

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
pca = PCA(n_components=30)


In [38]:
pca.fit(vectors)

PCA(copy=True, iterated_power='auto', n_components=30, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [39]:
pca.explained_variance_

array([ 1.63897806,  0.67271467,  0.49110381,  0.35325241,  0.33290182,
        0.28086152,  0.23732772,  0.20535761,  0.19991668,  0.17141019,
        0.15493202,  0.15028431,  0.13356821,  0.114815  ,  0.10986322,
        0.09835313,  0.093776  ,  0.0880923 ,  0.08300422,  0.07798661,
        0.07238897,  0.06619669,  0.06398507,  0.06151074,  0.05965263,
        0.05747842,  0.05548258,  0.05046424,  0.04740014,  0.0457125 ])

In [40]:
short_vectors = pca.transform(vectors)

In [41]:
cosine_similarity(short_vectors[0,:].reshape(1, -1), short_vectors[1,:].reshape(1, -1))

array([[ 0.78955682]])

In [42]:
short_vectors_df = pd.DataFrame(short_vectors)

In [43]:
short_vectors_df.columns = ["dim"+str(c) for c in short_vectors_df.columns]

In [44]:
short_vectors_df['product_id'] = products_df_merged.product_id

In [45]:
orders_df['absolute_date'] = orders_df.groupby("user_id").days_since_prior_order.cumsum().fillna(0)

In [46]:
orders_df['max_order_number'] = orders_df.groupby("user_id").order_number.transform(max)
orders_df['max_absolute_date'] = orders_df.groupby("user_id").absolute_date.transform(max)

In [47]:
orders_df['reverse_date'] = orders_df.max_absolute_date - orders_df.absolute_date
orders_df['reverse_order_number'] = orders_df.max_order_number - orders_df.order_number

In [48]:
train_df = train_df.merge(orders_df[["order_id", "user_id"]], on="order_id")

In [49]:
prior_df = prior_df.merge(orders_df, on="order_id")

In [50]:
#some_users = orders_df.loc[orders_df.eval_set == 'train', 'user_id'][:5000]
#prior_df = prior_df[prior_df.user_id.isin(some_users)]
#orders_df = orders_df[orders_df.user_id.isin(some_users)]
#train_df = train_df[train_df.user_id.isin(some_users)]

In [51]:
prior_df['order_dow_angle'] = (prior_df.order_dow / 
                                     (prior_df.order_dow.max() + 1) * 2 * np.pi)
prior_df['order_hod_angle'] = (prior_df.order_hour_of_day / 
                                             (prior_df.order_dow.max() + 1) * 2 * np.pi)

In [52]:
prior_df['order_dow_sin'] = np.sin(prior_df.order_dow_angle)
prior_df['order_dow_cos'] = np.cos(prior_df.order_dow_angle)
prior_df['order_hod_sin'] = np.sin(prior_df.order_hod_angle)
prior_df['order_hod_cos'] = np.cos(prior_df.order_hod_angle)
prior_df['num_products'] = 1
prior_df['num_products_dw_8'] = np.exp(-np.log(2)/8 * prior_df.reverse_date)
prior_df['num_products_dw_16'] = np.exp(-np.log(2)/16 * prior_df.reverse_date)
prior_df['num_products_dw_32'] = np.exp(-np.log(2)/32 * prior_df.reverse_date)
prior_df['num_products_dw_64'] = np.exp(-np.log(2)/64 * prior_df.reverse_date)
prior_df['num_products_dw_128'] = np.exp(-np.log(2)/128 * prior_df.reverse_date)
prior_df['num_products_ow_2'] = np.exp(-np.log(2)/2 * prior_df.reverse_order_number)
prior_df['num_products_ow_4'] = np.exp(-np.log(2)/4 * prior_df.reverse_order_number)
prior_df['num_products_ow_8'] = np.exp(-np.log(2)/8 * prior_df.reverse_order_number)
prior_df['num_products_ow_16'] = np.exp(-np.log(2)/16 * prior_df.reverse_order_number)
prior_df['num_products_ow_32'] = np.exp(-np.log(2)/32 * prior_df.reverse_order_number)

In [53]:
prior_df['num_products_dsin_14'] = (1.01 + np.sin(2*np.pi*(prior_df.reverse_date/14)))/2
prior_df['num_products_dcos_14'] = (1.01 + np.cos(2*np.pi*(prior_df.reverse_date/14)))/2
prior_df['num_products_dsin_30'] = (1.01 + np.sin(2*np.pi*(prior_df.reverse_date/30)))/2
prior_df['num_products_dcos_30'] = (1.01 + np.cos(2*np.pi*(prior_df.reverse_date/30)))/2

In [None]:
prior_day_idx = prior_df[['user_id', 'order_id', 'product_id']].join(pd.get_dummies(prior_df.order_dow))

product_day_idx = (prior_day_idx.drop(['user_id', 'order_id'], axis=1)
                   .groupby("product_id").agg(np.mean).reset_index()
                   .melt(id_vars='product_id', var_name="day", value_name="product_day_proportion"))
product_day_idx.day = product_day_idx.day.astype(int)

individual_day_idx = (prior_day_idx.drop(['product_id'], axis=1)
                   .groupby(['user_id', 'order_id']).agg(np.mean).reset_index()
                     .drop(['order_id'], axis=1).groupby('user_id').agg(np.mean).reset_index()
                   .melt(id_vars='user_id', var_name="day", value_name="user_day_proportion"))
individual_day_idx.day = individual_day_idx.day.astype(int)

indprod_day_idx = (prior_day_idx.drop(['order_id'], axis=1)
                   .groupby(['user_id', 'product_id']).agg(np.mean).reset_index()
                   .melt(id_vars=['user_id', 'product_id'], var_name="day", value_name="indprod_day_proportion"))
indprod_day_idx.day = indprod_day_idx.day.astype(int)

indprod_day_idx.rename(columns={'day': 'order_dow'}, inplace=True)
product_day_idx.rename(columns={'day': 'order_dow'}, inplace=True)
individual_day_idx.rename(columns={'day': 'order_dow'}, inplace=True)

prior_hod_idx = prior_df[['user_id', 'order_id', 'product_id']].join(pd.get_dummies(prior_df.order_hour_of_day))

prior_hod_idx_orig = prior_hod_idx
prior_hod_idx = prior_hod_idx.copy()

for i in range(24):
    prior_hod_idx[i] = prior_hod_idx_orig[(i - 1) % 24] + prior_hod_idx_orig[i] + prior_hod_idx_orig[(i + 1) % 24]

del prior_hod_idx_orig

product_hod_idx = (prior_hod_idx.drop(['user_id', 'order_id'], axis=1)
                   .groupby("product_id").agg(np.mean).reset_index()
                   .melt(id_vars='product_id', var_name="hod", value_name="product_hod_proportion"))
product_hod_idx.hod = product_hod_idx.hod.astype(int)

individual_hod_idx = (prior_hod_idx.drop(['product_id'], axis=1)
                   .groupby(['user_id', 'order_id']).agg(np.mean).reset_index()
                     .drop(['order_id'], axis=1).groupby('user_id').agg(np.mean).reset_index()
                   .melt(id_vars='user_id', var_name="hod", value_name="user_hod_proportion"))
individual_hod_idx.hod = individual_hod_idx.hod.astype(int)

indprod_hod_idx = (prior_hod_idx.drop(['order_id'], axis=1)
                   .groupby(['user_id', 'product_id']).agg(np.mean).reset_index()
                   .melt(id_vars=['user_id', 'product_id'], var_name="hod", value_name="indprod_hod_proportion"))
indprod_hod_idx.hod = indprod_hod_idx.hod.astype(int)

indprod_hod_idx.rename(columns={'hod': 'order_hour_of_day'}, inplace=True)
product_hod_idx.rename(columns={'hod': 'order_hour_of_day'}, inplace=True)
individual_hod_idx.rename(columns={'hod': 'order_hour_of_day'}, inplace=True)

In [55]:
#product_hod_idx_shrinkage = (product_hod_idx.groupby('order_hour_of_day')
#                             .agg({'product_hod_proportion': np.mean}).reset_index()
#                            .rename(columns={'product_hod_proportion': 'product_hod_proportion_shrinkage'}))
#individual_hod_idx_shrinkage = (individual_hod_idx.groupby('order_hour_of_day')
#                                .agg({'user_hod_proportion': np.mean}).reset_index()
#                               .rename(columns={'user_hod_proportion': 'user_hod_proportion_shrinkage'}))
#product_day_idx_shrinkage = (product_day_idx.groupby('order_dow')
#                             .agg({'product_day_proportion': np.mean}).reset_index()
#                            .rename(columns={'product_day_proportion': 'product_day_proportion_shrinkage'}))
#individual_day_idx_shrinkage = (individual_day_idx.groupby('order_dow')
#                                .agg({'user_day_proportion': np.mean}).reset_index()
#                               .rename(columns={'user_day_proportion': 'user_day_proportion_shrinkage'}))

In [56]:
prior_product_stats = prior_df.groupby("product_id").agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': np.sum})
prior_product_stats['order_dow_angle'] = np.arctan2(prior_product_stats.order_dow_sin, prior_product_stats.order_dow_cos)
prior_product_stats['order_hod_angle'] = np.arctan2(prior_product_stats.order_hod_sin, prior_product_stats.order_hod_cos)

In [57]:
prior_product_stats.order_dow_sin = np.sin(prior_product_stats.order_dow_angle)
prior_product_stats.order_dow_cos = np.cos(prior_product_stats.order_dow_angle)
prior_product_stats.order_hod_sin = np.sin(prior_product_stats.order_hod_angle)
prior_product_stats.order_hod_cos = np.cos(prior_product_stats.order_hod_angle)
prior_product_stats.drop(['order_dow_angle', 'order_hod_angle'], axis=1, inplace=True)
prior_product_stats.reset_index(inplace=True)
prior_product_stats.columns = ['product_id', 'product_dow_sin', 'product_dow_cos', 'product_hod_sin', 'product_hod_cos', 'product_num_orders']

In [59]:
#prior_products_shrinkage = prior_product_stats.agg({'product_dow_sin': np.sum,
#                                                   'product_dow_cos': np.sum,
#                                                   'product_hod_sin': np.sum,
#                                                   'product_hod_cos': np.sum})
#prior_products_shrinkage['dow_angle'] = np.arctan2(prior_products_shrinkage.product_dow_sin, prior_products_shrinkage.product_dow_cos)
#prior_products_shrinkage['hod_angle'] = np.arctan2(prior_products_shrinkage.product_hod_sin, prior_products_shrinkage.product_hod_cos)
#prior_products_shrinkage.product_dow_sin = np.sin(prior_products_shrinkage.dow_angle)
#prior_products_shrinkage.product_dow_cos = np.cos(prior_products_shrinkage.dow_angle)
#prior_products_shrinkage.product_hod_sin = np.sin(prior_products_shrinkage.hod_angle)
#prior_products_shrinkage.product_hod_cos = np.cos(prior_products_shrinkage.hod_angle)

In [60]:
prior_indorder_stats = prior_df.groupby(["user_id", "order_id"]).agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': np.sum,
                                                               'num_products_dw_8': np.mean,
                                                                'num_products_dw_16': np.mean,
                                                                'num_products_dw_32': np.mean,
                                                                'num_products_dw_64': np.mean,
                                                                'num_products_dw_128': np.mean,
                                                                'num_products_dcos_14': np.mean,
                                                                'num_products_dsin_14': np.mean,
                                                                'num_products_dcos_30': np.mean,
                                                                'num_products_dsin_30': np.mean,
                                                                'num_products_ow_2': np.mean,           
                                                                'num_products_ow_4': np.mean,
                                                                'num_products_ow_8': np.mean,                  
                                                                'num_products_ow_16': np.mean,
                                                                'num_products_ow_32': np.mean,       
                                                                      'absolute_date': np.max,
                                                                     'order_number': np.max}).reset_index()

In [61]:
prior_individual_stats = prior_indorder_stats.groupby("user_id").agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': [np.sum, np.mean],
                                                               'num_products_dw_8': np.sum,
                                                                'num_products_dw_16': np.sum,
                                                                'num_products_dw_32': np.sum,
                                                                'num_products_dw_64': np.sum,
                                                                'num_products_dw_128': np.sum,
                                                                'num_products_dcos_14': np.sum,
                                                                'num_products_dsin_14': np.sum,
                                                                'num_products_dcos_30': np.sum,
                                                                'num_products_dsin_30': np.sum,
                                                                'num_products_ow_2': np.sum,           
                                                                'num_products_ow_4': np.sum,
                                                                'num_products_ow_8': np.sum,                  
                                                                'num_products_ow_16': np.sum,
                                                                'num_products_ow_32': np.sum,  
                                                                'absolute_date': np.max,      
                                                                'order_number': np.max})

In [62]:
prior_individual_stats.columns = ['order_dow_sin', 'order_dow_cos',
                                  'order_hod_sin', 'order_hod_cos', 
                                   'num_products', 'mean_products', 
                                                               'num_products_dw_8',
                                                                'num_products_dw_16',
                                                                'num_products_dw_32',
                                                                'num_products_dw_64',
                                                                'num_products_dw_128',
                                                                'num_products_dcos_14',
                                                                'num_products_dsin_14',
                                                                'num_products_dcos_30',
                                                                'num_products_dsin_30',
                                                                'num_products_ow_2',           
                                                                'num_products_ow_4',
                                                                'num_products_ow_8',                  
                                                                'num_products_ow_16',
                                                                'num_products_ow_32', 
                                  'max_absolute_date', "max_order_number"]

In [63]:
prior_individual_stats['order_dow_angle'] = np.arctan2(prior_individual_stats.order_dow_sin, prior_individual_stats.order_dow_cos)
prior_individual_stats['order_hod_angle'] = np.arctan2(prior_individual_stats.order_hod_sin, prior_individual_stats.order_hod_cos)
prior_individual_stats.order_dow_sin = np.sin(prior_individual_stats.order_dow_angle)
prior_individual_stats.order_dow_cos = np.cos(prior_individual_stats.order_dow_angle)
prior_individual_stats.order_hod_sin = np.sin(prior_individual_stats.order_hod_angle)
prior_individual_stats.order_hod_cos = np.cos(prior_individual_stats.order_hod_angle)
prior_individual_stats.drop(['order_dow_angle', 'order_hod_angle'], axis=1, inplace=True)

In [64]:
prior_individual_stats = prior_individual_stats.reset_index()

In [65]:
prior_individual_stats.columns = ['user_id', 'user_dow_sin',
                                  'user_dow_cos', 'user_hod_sin',
                                  'user_hod_cos', 'user_num_products',
                                  'user_mean_products', 
                                                             'user_num_products_dw_8',
                                                                'user_num_products_dw_16',
                                                                'user_num_products_dw_32',
                                                                'user_num_products_dw_64',
                                                                'user_num_products_dw_128',
                                                                'user_num_products_dcos_14',
                                                                'user_num_products_dsin_14',
                                                                'user_num_products_dcos_30',
                                                                'user_num_products_dsin_30',
                                                                'user_num_products_ow_2',           
                                                                'user_num_products_ow_4',
                                                                'user_num_products_ow_8',                  
                                                                'user_num_products_ow_16',
                                                                'user_num_products_ow_32', 
  
                                  'user_num_days',
                                 'user_num_orders']
prior_individual_stats['user_days_per_order'] = prior_individual_stats.user_num_days / prior_individual_stats.user_num_orders

In [66]:
#prior_individual_shrinkage = prior_individual_stats.agg({'user_dow_sin': np.sum,
#                                                   'user_dow_cos': np.sum,
#                                                   'user_hod_sin': np.sum,
#                                                   'user_hod_cos': np.sum})
#prior_individual_shrinkage['dow_angle'] = np.arctan2(prior_individual_shrinkage.user_dow_sin, prior_individual_shrinkage.user_dow_cos)
#prior_individual_shrinkage['hod_angle'] = np.arctan2(prior_individual_shrinkage.user_hod_sin, prior_individual_shrinkage.user_hod_cos)
#prior_individual_shrinkage.user_dow_sin = np.sin(prior_individual_shrinkage.dow_angle)
#prior_individual_shrinkage.user_dow_cos = np.cos(prior_individual_shrinkage.dow_angle)
#prior_individual_shrinkage.user_hod_sin = np.sin(prior_individual_shrinkage.hod_angle)
#prior_individual_shrinkage.user_hod_cos = np.cos(prior_individual_shrinkage.hod_angle)

In [67]:
prior_indorder_stats = prior_indorder_stats[['order_id', 'num_products']]
prior_indorder_stats.columns = ['order_id', 'num_products_in_order']

In [68]:
prior_indprod_stats = (prior_df.merge(prior_indorder_stats[['order_id', 'num_products_in_order']], on='order_id')
                       .merge(prior_individual_stats[['user_id', 'user_num_orders', 'user_num_days']], on='user_id', how='left'))
prior_indprod_stats['add_to_cart_proportion'] = prior_indprod_stats['add_to_cart_order'] / prior_indprod_stats['num_products_in_order']
prior_indprod_stats['indprod_inorder_1'] = 1 * (prior_indprod_stats.order_number == prior_indprod_stats.user_num_orders)
prior_indprod_stats['indprod_inorder_2'] = 1 * (prior_indprod_stats.order_number == prior_indprod_stats.user_num_orders - 1)
prior_indprod_stats['indprod_inorder_3'] = 1 * (prior_indprod_stats.order_number == prior_indprod_stats.user_num_orders - 2)
prior_indprod_stats = prior_indprod_stats.groupby(["user_id", "product_id"]).agg({'order_dow_sin': np.sum, 
                                                                'order_dow_cos': np.sum, 
                                                                'order_hod_sin': np.sum,
                                                                'order_hod_cos': np.sum,
                                                               'num_products': np.sum,
                                                               'num_products_dw_8': np.sum,
                                                                'num_products_dw_16': np.sum,
                                                                'num_products_dw_32': np.sum,
                                                                'num_products_dw_64': np.sum,
                                                                'num_products_dw_128': np.sum,
                                                                'num_products_dcos_14': np.sum,
                                                                'num_products_dsin_14': np.sum,
                                                                'num_products_dcos_30': np.sum,
                                                                'num_products_dsin_30': np.sum,
                                                                'num_products_ow_2': np.sum,           
                                                                'num_products_ow_4': np.sum,
                                                                'num_products_ow_8': np.sum,                  
                                                                'num_products_ow_16': np.sum,
                                                                'num_products_ow_32': np.sum,                                   
                                                           'add_to_cart_order': np.mean,
                                                           'add_to_cart_proportion': np.mean,
                                                           'indprod_inorder_1': np.sum,
                                                           'indprod_inorder_2': np.sum,
                                                           'indprod_inorder_3': np.sum,
                                                            'user_num_orders': np.mean,
                                                            'user_num_days': np.mean,
                                                            'reverse_date': np.min,
                                                            'reverse_order_number': np.min}).reset_index()

In [69]:
prior_indprod_stats['order_dow_angle'] = np.arctan2(prior_indprod_stats.order_dow_sin, prior_indprod_stats.order_dow_cos)
prior_indprod_stats['order_hod_angle'] = np.arctan2(prior_indprod_stats.order_hod_sin, prior_indprod_stats.order_hod_cos)
prior_indprod_stats['proportion_orders'] = prior_indprod_stats.num_products / (prior_indprod_stats.user_num_orders)
prior_indprod_stats['days_per_order'] = prior_indprod_stats.user_num_days / (prior_indprod_stats.num_products)

In [70]:
prior_indprod_stats.order_dow_sin = np.sin(prior_indprod_stats.order_dow_angle)
prior_indprod_stats.order_dow_cos = np.cos(prior_indprod_stats.order_dow_angle)
prior_indprod_stats.order_hod_sin = np.sin(prior_indprod_stats.order_hod_angle)
prior_indprod_stats.order_hod_cos = np.cos(prior_indprod_stats.order_hod_angle)
prior_indprod_stats.drop(['order_dow_angle', 'order_hod_angle', 'user_num_orders', 'user_num_days'], axis=1, inplace=True)

In [71]:
prior_indprod_stats.columns = ['user_id', 'product_id', 'indprod_dow_sin', 
                               'indprod_dow_cos', 'indprod_hod_sin',
                               'indprod_hod_cos', 'indprod_num_orders',
                               'indprod_num_products_dw_8', 'indprod_num_products_dw_16', 
                               'indprod_num_products_dw_32', 'indprod_num_products_dw_64', 
                               'indprod_num_products_dw_128', 'indprod_num_products_dcos_14',
                               'indprod_num_products_dsin_14', 'indprod_num_products_dcos_30',
                               'indprod_num_products_dsin_30', 'indprod_num_products_ow_2',
                               'indprod_num_products_ow_4', 'indprod_num_products_ow_8',
                               'indprod_num_products_ow_16', 'indprod_num_products_ow_32',
                               'indprod_add_to_cart_order', 'indprod_add_to_cart_proportion',
                               'indprod_inorder_1', 'indprod_inorder_2', 'indprod_inorder_3',
                               'indprod_days_since_last', 'indprod_orders_since_last',
                                'indprod_proportion_orders', 'indprod_days_per_order']

In [72]:
# use indprod means to add more product stats
product_order_proportions = (prior_indprod_stats[['user_id', 'product_id', 
                                                  'indprod_proportion_orders', 'indprod_days_per_order']]
                             .groupby("product_id")
                             .agg({'indprod_proportion_orders': np.mean,
                                  'indprod_days_per_order': np.mean})).reset_index()
product_order_proportions.columns = ['product_id', 'product_proportion_orders', 'product_days_per_order']
prior_product_stats = prior_product_stats.merge(product_order_proportions, on='product_id')

In [73]:
prior_all_stats = prior_indprod_stats.merge(prior_individual_stats, on="user_id", how="left").merge(prior_product_stats, on='product_id', how="left")

In [74]:
for label in ['ow_2', 'ow_4', 'ow_8', 'ow_16', 'ow_32', 'dw_8', 'dw_16', 'dw_32', 'dw_64', 'dw_128', 'dcos_14', 'dsin_14', 'dcos_30', 'dsin_30']:
    prior_all_stats['indprod_num_products_'+ label + '_reg'] = prior_all_stats['indprod_num_products_' + label] / prior_all_stats['user_num_products_' + label]
    prior_all_stats.drop(['user_num_products_' + label], axis=1, inplace=True)


In [75]:
orders_df_last = orders_df[orders_df.eval_set != "prior"].copy()
orders_df_last['order_dow_angle'] = (orders_df_last.order_dow / 
                                     (orders_df_last.order_dow.max() + 1) * 2 * np.pi)
orders_df_last['order_hod_angle'] = (orders_df_last.order_hour_of_day / 
                                             (orders_df_last.order_dow.max() + 1) * 2 * np.pi)
orders_df_last['order_dow_sin'] = np.sin(orders_df_last.order_dow_angle)
orders_df_last['order_dow_cos'] = np.cos(orders_df_last.order_dow_angle)
orders_df_last['order_hod_sin'] = np.sin(orders_df_last.order_hod_angle)
orders_df_last['order_hod_cos'] = np.cos(orders_df_last.order_hod_angle)

In [76]:
orders_df_last.drop(["order_number", "order_dow_angle", "order_hod_angle"], axis=1, inplace=True)
# "order_dow", "order_hour_of_day", 

In [77]:
prior_all_stats = prior_all_stats.merge(orders_df_last, on="user_id", how="inner")

In [78]:
prior_all_stats.shape

(13514162, 74)

In [81]:
prior_all_stats = prior_all_stats.merge(individual_day_idx, on=['user_id', 'order_dow'])
#prior_all_stats = prior_all_stats.merge(individual_day_idx_shrinkage, on=['order_dow'])
prior_all_stats = prior_all_stats.merge(product_day_idx, on=['product_id', 'order_dow'])
#prior_all_stats = prior_all_stats.merge(product_day_idx_shrinkage, on=['order_dow'])
prior_all_stats = prior_all_stats.merge(indprod_day_idx, on=['user_id', 'product_id', 'order_dow'])

In [82]:
prior_all_stats = prior_all_stats.merge(individual_hod_idx, on=['user_id', 'order_hour_of_day'])
prior_all_stats = prior_all_stats.merge(product_hod_idx, on=['product_id', 'order_hour_of_day'])
#prior_all_stats = prior_all_stats.merge(individual_hod_idx_shrinkage, on=['order_hour_of_day'])
#prior_all_stats = prior_all_stats.merge(product_hod_idx_shrinkage, on=['order_hour_of_day'])
prior_all_stats = prior_all_stats.merge(indprod_hod_idx, on=['user_id', 'product_id', 'order_hour_of_day'])

In [None]:
#prior_all_stats_old = prior_all_stats.copy()
#prior_all_stats = prior_all_stats_old.copy()

In [None]:
#product_shrinkage_weight = 200
#user_shrinkage_weight = 13
#indprod_user_shrinkage_weight = 13
#indprod_product_shrinkage_weight = 0

In [None]:
#for hd in ['hod', 'day']:
#    prior_all_stats['user_'+hd+'_proportion'] = ((prior_all_stats['user_num_orders'] * prior_all_stats['user_'+hd+'_proportion'] + 
#                                                user_shrinkage_weight * prior_all_stats['user_'+hd+'_proportion_shrinkage'])
#                                                /(prior_all_stats['user_num_orders'] + user_shrinkage_weight))
#    prior_all_stats.drop('user_'+hd+'_proportion_shrinkage', axis=1, inplace=True)
#    prior_all_stats['product_'+hd+'_proportion'] = ((prior_all_stats['product_num_orders'] * prior_all_stats['product_'+hd+'_proportion'] + 
#                                                product_shrinkage_weight * prior_all_stats['product_'+hd+'_proportion_shrinkage'])
#                                                /(prior_all_stats['product_num_orders'] + product_shrinkage_weight))
#    prior_all_stats.drop('product_'+hd+'_proportion_shrinkage', axis=1, inplace=True)
#
#    prior_all_stats['indprod_'+hd+'_proportion'] = ((prior_all_stats['indprod_num_orders'] * prior_all_stats['indprod_'+hd+'_proportion'] +
#                                                    indprod_user_shrinkage_weight * prior_all_stats['user_'+hd+'_proportion'] +
#                                                    indprod_product_shrinkage_weight * prior_all_stats['product_'+hd+'_proportion'])
#                                                   /(prior_all_stats['indprod_num_orders'] + indprod_user_shrinkage_weight + indprod_product_shrinkage_weight))

In [None]:
#product_shrinkage_weight = 10
#user_shrinkage_weight = 2
#indprod_user_shrinkage_weight = 4
#indprod_product_shrinkage_weight = 0

In [None]:
#labels = ['hod_cos', 'hod_sin', 'dow_cos', 'dow_sin']
#hoddow = ['hod', 'hod', 'dow', 'dow']
#operations = [np.cos, np.sin, np.cos, np.sin]
#for i in range(4):
#    lab = labels[i]
#    op = operations[i]
#    hd = hoddow[i]
#    prior_all_stats['user_'+lab] = op(np.arctan2(prior_all_stats['user_num_orders'] * prior_all_stats['user_'+hd+'_sin'] +
#                                                user_shrinkage_weight * prior_individual_shrinkage['user_'+hd+'_sin'],
#                                                prior_all_stats['user_num_orders'] * prior_all_stats['user_'+hd+'_cos'] +
#                                                user_shrinkage_weight * prior_individual_shrinkage['user_'+hd+'_cos']))
#    prior_all_stats['product_'+lab] = op(np.arctan2(prior_all_stats['product_num_orders'] * prior_all_stats['product_'+hd+'_sin'] +
#                                                product_shrinkage_weight * prior_products_shrinkage['product_'+hd+'_sin'],
#                                                prior_all_stats['user_num_orders'] * prior_all_stats['product_'+hd+'_cos'] +
#                                                product_shrinkage_weight * prior_products_shrinkage['product_'+hd+'_cos']))
#for i in range(4):
#    lab = labels[i]
#    op = operations[i]    
#    hd = hoddow[i]
#    prior_all_stats['indprod_'+lab] = op(np.arctan2(
#        prior_all_stats['indprod_num_orders'] * prior_all_stats['indprod_'+hd+'_sin'] +
 #       indprod_user_shrinkage_weight * prior_all_stats['user_'+hd+'_sin'] +
 #       indprod_product_shrinkage_weight * prior_all_stats['product_'+hd+'_sin'],
 #       prior_all_stats['indprod_num_orders'] * prior_all_stats['indprod_'+hd+'_cos'] +
 #       indprod_user_shrinkage_weight * prior_all_stats['user_'+hd+'_cos'] +
 #       indprod_product_shrinkage_weight * prior_all_stats['product_'+hd+'_cos']))

In [83]:
prior_all_stats.drop(['order_hour_of_day', 'order_dow'], axis=1, inplace=True)

In [84]:
prior_all_stats['indprod_dow_avg_diff'] = np.cos(np.arctan2(prior_all_stats.order_dow_sin, prior_all_stats.order_dow_cos) - 
                                                np.arctan2(prior_all_stats.indprod_dow_sin, prior_all_stats.indprod_dow_cos))
prior_all_stats['indprod_hod_avg_diff'] = np.cos(np.arctan2(prior_all_stats.order_hod_sin, prior_all_stats.order_hod_cos) - 
                                                np.arctan2(prior_all_stats.indprod_hod_sin, prior_all_stats.indprod_hod_cos))
prior_all_stats['user_dow_avg_diff'] = np.cos(np.arctan2(prior_all_stats.order_dow_sin, prior_all_stats.order_dow_cos) - 
                                                np.arctan2(prior_all_stats.user_dow_sin, prior_all_stats.user_dow_cos))
prior_all_stats['user_hod_avg_diff'] = np.cos(np.arctan2(prior_all_stats.order_hod_sin, prior_all_stats.order_hod_cos) - 
                                                np.arctan2(prior_all_stats.user_hod_sin, prior_all_stats.user_hod_cos))
prior_all_stats['product_dow_avg_diff'] = np.cos(np.arctan2(prior_all_stats.order_dow_sin, prior_all_stats.order_dow_cos) - 
                                                np.arctan2(prior_all_stats.product_dow_sin, prior_all_stats.product_dow_cos))
prior_all_stats['product_hod_avg_diff'] = np.cos(np.arctan2(prior_all_stats.order_hod_sin, prior_all_stats.order_hod_cos) - 
                                                np.arctan2(prior_all_stats.product_hod_sin, prior_all_stats.product_hod_cos))


In [85]:
del individual_hod_idx
del product_hod_idx
del indprod_hod_idx
del individual_day_idx
del product_day_idx
del indprod_day_idx

In [86]:
prior_all_stats['user_distinct_products'] = prior_all_stats.groupby('user_id')['product_id'].transform('count')

In [87]:
prior_all_stats = prior_all_stats.merge(train_df[['user_id', 'product_id', 'reordered']], how="left", on=["user_id", "product_id"])
prior_all_stats.reordered = prior_all_stats.reordered.fillna(0)                  


In [None]:
#prior_all_stats.indprod_num_products_dcos_14_reg = prior_all_stats.indprod_num_products_dcos_14_reg.fillna(value=0)
#prior_all_stats.indprod_num_products_dsin_14_reg = prior_all_stats.indprod_num_products_dsin_14_reg.fillna(value=0)
#prior_all_stats.indprod_num_products_dcos_30_reg = prior_all_stats.indprod_num_products_dcos_30_reg.fillna(value=0)
#prior_all_stats.indprod_num_products_dsin_30_reg = prior_all_stats.indprod_num_products_dsin_30_reg.fillna(value=0)

In [88]:
prior_all_stats.drop(['reverse_date', 'reverse_order_number'], axis=1, inplace=True)

In [89]:
prior_all_stats['user_mean_proportion_products'] = prior_all_stats.user_mean_products / prior_all_stats.user_distinct_products

In [90]:
none_df = prior_all_stats.query("product_id == 'None'")

In [96]:
none_df.drop(['product_dow_sin',
       'product_dow_cos', 'product_hod_sin', 'product_hod_cos',
       'product_num_orders', 'product_proportion_orders',
       'product_days_per_order', 'product_day_proportion',
        'product_hod_proportion', 'product_dow_avg_diff',
        'product_hod_avg_diff'], axis=1, inplace=True)

In [99]:
none_df.drop(['indprod_add_to_cart_order', 'indprod_add_to_cart_proportion'], axis=1, inplace=True)

In [102]:
none_df.to_hdf("data/none_stats.h5", "table")

In [93]:
prior_all_stats = prior_all_stats.query("product_id != 'None'")

In [104]:
prior_all_stats = prior_all_stats.merge(short_vectors_df, on="product_id")

In [105]:
prior_all_stats.shape

(13307953, 115)

In [106]:
prior_all_stats.to_hdf("data/prior_all_stats.h5", "table")

In [89]:
1+1

2

In [14]:
prior_all_stats = pd.read_hdf("data/prior_all_stats.h5", "table")

In [None]:
prior_all_stats.replace([np.inf, -np.inf], np.nan, inplace=True)

In [None]:
nulls = prior_all_stats.isnull().mean(axis=0)

In [None]:
nulls[nulls > 0]

In [27]:
# split into train, validation, and test sets
#prior_all_stats = pd.read_hdf("data/prior_all_stats.h5", "table")
prior_all_stats['validation_set'] = 0
prior_all_stats['prediction'] = 0
valid_users = prior_all_stats.loc[prior_all_stats.eval_set == "train", "user_id"].unique()
valid_users = pd.Series(valid_users).sample(frac=.1, random_state=1234)


In [4]:
prior_all_stats['prediction'] = 0
all_users = prior_all_stats.loc[prior_all_stats.eval_set == "train", "user_id"].unique()
np.random.shuffle(all_users)

In [5]:
valid_set = pd.DataFrame({'user_id': all_users, 'validation_set': np.arange(0, all_users.shape[0]) % 10})

In [6]:
prior_all_stats = prior_all_stats.merge(valid_set, on='user_id', how='left')

KeyboardInterrupt: 

In [None]:
prior_all_stats.validation_set = prior_all_stats.validation_set.fillna(-1)

In [28]:
prior_all_stats.loc[prior_all_stats.user_id.isin(valid_users), 'validation_set'] = 1


In [10]:

#prior_train = prior_all_stats.loc[(prior_all_stats.eval_set == "train") & (prior_all_stats.validation_set == 0)]
#prior_valid = prior_all_stats.loc[prior_all_stats.validation_set == 1]
prior_test = prior_all_stats.loc[prior_all_stats.eval_set == "test"]

#X_train = prior_train.drop(["prediction", "eval_set", "validation_set", "order_id", "reordered", "user_id", "product_id"], axis=1).as_matrix()
#y_train = prior_train.reordered.as_matrix()
#X_valid = prior_valid.drop(["prediction", "eval_set", "validation_set", "order_id", "reordered", "user_id", "product_id"], axis=1).as_matrix()
#y_valid = prior_valid.reordered.as_matrix()
X_test = prior_test.drop(["eval_set", "order_id", "reordered", "user_id", "product_id"], axis=1).as_matrix()

In [5]:
import xgboost as xgb

In [31]:
# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'auc'
params['eta'] = 0.1
params['max_depth'] = 6
params['nthread'] = 12

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

#bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, verbose_eval=10)

In [3]:
#bst.save_model('add_distinct.model')
bst = xgb.Booster()
bst.load_model('multi_xgb0.model')

NameError: name 'xgb' is not defined

In [32]:
y_predicted = bst.predict(d_valid)

In [33]:
prior_valid = prior_valid.copy()

In [46]:
guess = [-1, -.1, -1, -.3]
width = np.array([.2, .03, .2, .1])
best_reorder_cutoff = (0, 0)
best_none_cutoff = (0, 0)
best_cutoff_f1 = 0
for i in range(2):
    for reorder_cutoff in [(x,y) for x in np.arange(guess[0]-4*width[0], guess[0]+4*width[0], width[0]) for y in np.arange(guess[1]-4*width[1], guess[1]+4*width[1], width[1])]:
        prior_valid['reorder_cutoff'] = np.exp(reorder_cutoff[0] + reorder_cutoff[1] * np.log(prior_valid.user_distinct_products))
        prior_valid.loc[:,'prediction'] = 1 * (y_predicted > prior_valid.reorder_cutoff)
        prior_valid['p_not'] = 1 - y_predicted
        prior_valid['hit'] = (prior_valid.reordered * prior_valid.prediction)
        prior_valid_agg = prior_valid.groupby("user_id").agg({'reordered': np.sum, 
                                                          'prediction': np.sum, 
                                                          'hit': np.sum,
                                                              'user_distinct_products': np.mean,
                                                             'p_not': np.prod})
        for none_cutoff in [(x,y) for x in np.arange(guess[2]-4*width[2], guess[2]+4*width[2], width[2]) for y in np.arange(guess[3]-4*width[3], guess[3]+4*width[3], width[3])]:
            prior_valid_agg['none_cutoff'] = np.exp(none_cutoff[0] + none_cutoff[1] * np.log(prior_valid_agg.user_distinct_products))
            prior_valid_agg['putnone'] = (prior_valid_agg.p_not > prior_valid_agg.none_cutoff) | (prior_valid_agg.prediction == 0)
            prior_valid_agg['truenone'] = (prior_valid_agg.reordered == 0)
            prior_valid_agg['r'] = prior_valid_agg.reordered
            prior_valid_agg['p'] = prior_valid_agg.prediction
            prior_valid_agg['h'] = prior_valid_agg.hit
            prior_valid_agg.loc[prior_valid_agg.putnone & prior_valid_agg.truenone, "h"] = 1
            prior_valid_agg.loc[prior_valid_agg.putnone, 'p'] = prior_valid_agg.loc[prior_valid_agg.putnone, 'p'] + 1
            prior_valid_agg.loc[prior_valid_agg.truenone, 'r'] = prior_valid_agg.loc[prior_valid_agg.truenone, 'r'] + 1
            prior_valid_agg['precision'] = (prior_valid_agg['h']) / (prior_valid_agg['p'])
            prior_valid_agg['recall'] = (prior_valid_agg['h']) / (prior_valid_agg['r'])
            prior_valid_agg['f1'] = 2 * prior_valid_agg['precision'] * prior_valid_agg['recall'] / (prior_valid_agg['precision'] + prior_valid_agg['recall'] + .000001)
            if prior_valid_agg['f1'].mean() > best_cutoff_f1:
                best_cutoff_f1 = prior_valid_agg['f1'].mean()
                best_reorder_cutoff = reorder_cutoff
                best_none_cutoff = none_cutoff
    guess = [best_reorder_cutoff[0], best_reorder_cutoff[1], best_none_cutoff[0], best_none_cutoff[1]]
    width = width / 4
print("best reorder cutoff:", best_reorder_cutoff)
print("best none cutoff:", best_none_cutoff)
print("best f1:", best_cutoff_f1)

best reorder cutoff: (-1.1499999999999999, -0.13749999999999998)
best none cutoff: (-0.85000000000000009, -0.57499999999999984)
best f1: 0.389216180222


In [62]:
#best_reorder_cutoff =  (-1.4281249999999999, -0.061562499999999951)
#best_none_cutoff = (-0.31250000000000006, -0.79999999999999993)
#best_reorder_cutoff = (-1.4499999999999997, -0.048437499999999967)
#best_none_cutoff = (-0.64374999999999982, -0.62656249999999969)
best_reorder_cutoff = (-1.2781250000000002, -0.099062499999999956)
best_none_cutoff = (-0.71250000000000013, -0.68749999999999989)

In [64]:
#d_test = xgb.DMatrix(X_test)
y_test = bst.predict(d_test)

In [55]:
(prediction_2 == prediction_1).mean()

0.98559470439609276

In [69]:
prediction_0.mean()

0.15333938028159688

In [85]:
putnone_0.mean()

0.19314666666666666

In [61]:
prior_all_stats.reordered.mean()

0.062280352207435656

In [112]:
putnone_all = ( 1*putnone_1 + 1*putnone_2 + 1*putnone_0  > 1)

In [113]:
#prior_test = prior_test.copy()
prior_test['prediction'] = 1 * (prediction_0 + prediction_1 + prediction_2 > 1)#1 * (y_test > np.exp(best_reorder_cutoff[0] + best_reorder_cutoff[1] * np.log(prior_test.user_distinct_products)))
prior_test['p_not'] = 1 - y_test

In [66]:
#prediction_0= 1 * (y_test > np.exp(best_reorder_cutoff[0] + best_reorder_cutoff[1] * np.log(prior_test.user_distinct_products)))


In [114]:
writenone_df = prior_test.groupby('order_id').agg({'p_not': np.prod, 
                                                   'prediction': np.sum, 
                                                   'user_distinct_products': np.mean}).reset_index()

In [115]:
#putnone_0 = 1 * (writenone_df.p_not > np.exp(best_none_cutoff[0] + best_none_cutoff[1] * np.log(writenone_df.user_distinct_products))) | (writenone_df.prediction == 0)

writenone_df['putnone'] = putnone_all#(writenone_df.p_not > np.exp(best_none_cutoff[0] + best_none_cutoff[1] * np.log(writenone_df.user_distinct_products))) | (writenone_df.prediction == 0)
writenone_df['nonestring'] = ''
writenone_df.loc[writenone_df.putnone, 'nonestring'] = 'None'

In [116]:
prediction_df = prior_test[prior_test['prediction'] == 1].copy()

In [117]:
prediction_df = prediction_df[['order_id', 'product_id']]

In [118]:
prediction_lists = prediction_df.groupby('order_id').agg(lambda x: " ".join(x.astype(str))).reset_index()

In [119]:
prediction_lists = prediction_lists.merge(writenone_df[['order_id', 'nonestring']], on='order_id', how='right')

In [120]:
prediction_lists['products'] = prediction_lists.product_id.fillna('')

In [121]:
prediction_lists['products'] = prediction_lists.products + " " + prediction_lists.nonestring

In [122]:
prediction_lists = prediction_lists[['order_id', 'products']]

In [123]:
prediction_lists.to_csv("submissions/test_xgb.csv", index=False)