In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc


% matplotlib inline

In [2]:
import spacy

# Load the spaCy pipeline named 'en_vectors_glove_md'; `nlp` is used further
# down to turn each product's text into a vector.
nlp = spacy.load('en_vectors_glove_md')

In [3]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas' SettingWithCopyWarning. Consider narrowing the filter.
warnings.filterwarnings('ignore')

In [4]:
# Read the raw tables from the local data/ directory.
aisles_df, departments_df, products_df, orders_df, prior_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/aisles.csv",
                     "data/departments.csv",
                     "data/products.csv",
                     "data/orders.csv",
                     "data/order_products__prior.csv")
)
#train_df = pd.read_csv("data/order_products__train.csv")

# Product catalogue enriched with department and aisle columns.
products_df_merged = products_df.merge(departments_df, on="department_id")
products_df_merged = products_df_merged.merge(aisles_df, on="aisle_id")

In [5]:
# Keep only the orders labelled "prior". Take an explicit copy so that the
# column assignments made on orders_df afterwards write to an independent
# frame rather than to a possible view of the unfiltered one.
orders_df = orders_df.query('eval_set == "prior"').copy()

In [6]:
# Highest order_number reached by each user, repeated on every row of that user.
orders_df['max_order'] = orders_df['order_number'].groupby(orders_df['user_id']).transform(max)

In [7]:
# Relabel each user's highest-numbered remaining order as "extratrain".
is_last_order = orders_df['order_number'] == orders_df['max_order']
orders_df.loc[is_last_order, 'eval_set'] = 'extratrain'

In [8]:
# The helper column has served its purpose.
orders_df = orders_df.drop('max_order', axis=1)

In [9]:
# Product rows that belong to the orders flagged "extratrain".
extratrain_order_ids = orders_df.loc[orders_df.eval_set == "extratrain", "order_id"]
train_df = prior_df[prior_df.order_id.isin(extratrain_order_ids)]

In [10]:
# Product rows that belong to the orders still flagged "prior".
prior_order_ids = orders_df.loc[orders_df.eval_set == "prior", "order_id"]
prior_df = prior_df[prior_df.order_id.isin(prior_order_ids)]

In [11]:
def _empty_reorder_rows(order_products):
    """Build one synthetic "None" product row per order without any reorder.

    Rows are grouped by order_id. An order whose `reordered` values sum to
    zero yields a row with product_id "None", add_to_cart_order 0 and
    reordered 1; orders containing at least one reorder yield nothing.
    """
    per_order = order_products.groupby('order_id').agg({'product_id': lambda x: "None",
                                                        'add_to_cart_order': lambda x: 0,
                                                        'reordered': np.sum}).reset_index()
    per_order.reordered = (per_order.reordered == 0).astype(int)
    return per_order.query('reordered == 1')


none_train_df = _empty_reorder_rows(train_df)
none_prior_df = _empty_reorder_rows(prior_df)
# Append the synthetic rows to the real product rows.
train_df = pd.concat([train_df, none_train_df])
prior_df = pd.concat([prior_df, none_prior_df])


In [12]:
# Text used for embedding: product name, department and aisle joined by spaces.
products_df_merged['allwords'] = (products_df_merged.product_name + ' '
                                  + products_df_merged.department + ' '
                                  + products_df_merged.aisle)

In [13]:
# Preview the first rows, including the new allwords column.
products_df_merged.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,department,aisle,allwords
0,1,Chocolate Sandwich Cookies,61,19,snacks,cookies cakes,Chocolate Sandwich Cookies snacks cookies cakes
1,78,Nutter Butter Cookie Bites Go-Pak,61,19,snacks,cookies cakes,Nutter Butter Cookie Bites Go-Pak snacks cooki...
2,102,Danish Butter Cookies,61,19,snacks,cookies cakes,Danish Butter Cookies snacks cookies cakes
3,172,Gluten Free All Natural Chocolate Chip Cookies,61,19,snacks,cookies cakes,Gluten Free All Natural Chocolate Chip Cookies...
4,285,Mini Nilla Wafers Munch Pack,61,19,snacks,cookies cakes,Mini Nilla Wafers Munch Pack snacks cookies cakes


In [14]:
# One document vector per product, stacked row-wise in catalogue order.
vectors = np.array([nlp(text).vector for text in products_df_merged.allwords])

In [15]:
from sklearn.decomposition import PCA

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
# Reduce the product vectors to 30 components. The seed is fixed because the
# default svd_solver='auto' may select the randomized solver, whose output
# otherwise changes from run to run.
pca = PCA(n_components=30, random_state=0)


In [18]:
# Fit the projection on the product vectors.
pca.fit(vectors)

PCA(copy=True, iterated_power='auto', n_components=30, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [19]:
# Variance captured by each of the 30 retained components.
pca.explained_variance_

array([ 1.63897806,  0.67271467,  0.49110381,  0.35325241,  0.33290182,
        0.28086152,  0.23732772,  0.20535763,  0.1999167 ,  0.17141023,
        0.15493189,  0.1502843 ,  0.1335667 ,  0.11481437,  0.10986206,
        0.09835583,  0.09378005,  0.0880809 ,  0.08301061,  0.07795332,
        0.07236853,  0.0661347 ,  0.06390929,  0.06154105,  0.05985243,
        0.05726281,  0.05470563,  0.04985289,  0.04753485,  0.04581898])

In [20]:
# Project every product vector onto the fitted components.
short_vectors = pca.transform(vectors)

In [21]:
# Spot check: similarity between the reduced vectors of the first two products.
cosine_similarity(short_vectors[0:1], short_vectors[1:2])

array([[ 0.78882176]])

In [22]:
# Wrap the reduced vectors in a frame so they can be merged by product later.
short_vectors_df = pd.DataFrame(short_vectors)

In [23]:
# Name the component columns dim0, dim1, ...
short_vectors_df.columns = ["dim{}".format(c) for c in short_vectors_df.columns]

In [24]:
# Attach the product id to each reduced vector. The assignment aligns on the
# index; presumably both frames carry the default 0..n-1 index in the same row
# order as `vectors` - verify if products_df_merged is ever filtered or sorted.
short_vectors_df['product_id'] = products_df_merged.product_id

In [25]:
# Release the catalogue frames and the embedding arrays.
del products_df, products_df_merged, short_vectors, vectors


In [26]:
# Force a garbage-collection pass after the deletions above.
gc.collect()

1220

In [27]:
# Running total of days_since_prior_order within each user; rows where the
# running total is missing are set to 0.
days_elapsed = orders_df.groupby("user_id")["days_since_prior_order"].cumsum()
orders_df['absolute_date'] = days_elapsed.fillna(0)

In [28]:
# Per-user maxima of order_number and absolute_date, repeated on every row.
user_key = orders_df['user_id']
orders_df['max_order_number'] = orders_df['order_number'].groupby(user_key).transform(max)
orders_df['max_absolute_date'] = orders_df['absolute_date'].groupby(user_key).transform(max)

In [29]:
# Distance of each order from the user's latest one, in days and in orders.
orders_df['reverse_date'] = orders_df['max_absolute_date'].sub(orders_df['absolute_date'])
orders_df['reverse_order_number'] = orders_df['max_order_number'].sub(orders_df['order_number'])

In [30]:
# Attach the user_id to every train row through its order_id.
train_df = train_df.merge(orders_df[["order_id", "user_id"]], on="order_id")

In [31]:
# Attach all order-level columns of orders_df to every prior product row.
prior_df = prior_df.merge(orders_df, on="order_id")

In [32]:
# Map weekday and hour of day onto angles so both can be treated as cyclic.
prior_df['order_dow_angle'] = (prior_df.order_dow /
                               (prior_df.order_dow.max() + 1) * 2 * np.pi)
# Hours wrap on a 24-hour cycle, so the period must be 24. Dividing by the
# weekday period (order_dow.max() + 1) made the hour angle wrap several times
# per day. Any frame whose hour angles are compared with these must use the
# same 24-hour period.
prior_df['order_hod_angle'] = prior_df.order_hour_of_day / 24 * 2 * np.pi

In [33]:
# Unit-circle coordinates of the weekday and hour-of-day angles.
for period in ('dow', 'hod'):
    angle = prior_df['order_%s_angle' % period]
    prior_df['order_%s_sin' % period] = np.sin(angle)
    prior_df['order_%s_cos' % period] = np.cos(angle)

# Every product row counts once ...
prior_df['num_products'] = 1
# ... or with a weight that halves every `half_life` days before the user's
# latest order ...
for half_life in (8, 16, 32, 64, 128):
    prior_df['num_products_dw_%d' % half_life] = np.exp(-np.log(2)/half_life * prior_df.reverse_date)
# ... or every `half_life` orders before it.
for half_life in (2, 4, 8, 16, 32):
    prior_df['num_products_ow_%d' % half_life] = np.exp(-np.log(2)/half_life * prior_df.reverse_order_number)

In [34]:
# Periodic weights on reverse_date with 14- and 30-day periods, shifted and
# scaled by (1.01 + x) / 2 so they stay positive.
for cycle_days in (14, 30):
    phase = 2*np.pi*(prior_df.reverse_date/cycle_days)
    prior_df['num_products_dsin_%d' % cycle_days] = (1.01 + np.sin(phase))/2
    prior_df['num_products_dcos_%d' % cycle_days] = (1.01 + np.cos(phase))/2

In [35]:
# One-hot weekday indicator for every prior product row, next to its ids.
prior_day_idx = prior_df[['user_id', 'order_id', 'product_id']].join(pd.get_dummies(prior_df.order_dow))

# Global weekday shares: average inside each order, then across orders.
days_prior = (prior_day_idx.drop(['product_id', 'user_id'], axis=1)
              .groupby('order_id').mean()
              .mean())

# Weekday shares of each product's rows, in long format.
product_day_idx = (prior_day_idx.drop(['user_id', 'order_id'], axis=1)
                   .groupby("product_id").mean().reset_index()
                   .melt(id_vars='product_id', var_name="order_dow", value_name="product_day_proportion"))
product_day_idx['order_dow'] = product_day_idx['order_dow'].astype(int)

# Weekday shares of each user's orders: collapse to one row per order first so
# that large baskets do not weigh more, then average per user.
individual_day_idx = (prior_day_idx.drop(['product_id'], axis=1)
                      .groupby(['user_id', 'order_id']).mean().reset_index()
                      .drop(['order_id'], axis=1)
                      .groupby('user_id').mean().reset_index()
                      .melt(id_vars='user_id', var_name="order_dow", value_name="user_day_proportion"))
individual_day_idx['order_dow'] = individual_day_idx['order_dow'].astype(int)

# Weekday shares of each (user, product) pair.
indprod_day_idx = (prior_day_idx.drop(['order_id'], axis=1)
                   .groupby(['user_id', 'product_id']).mean().reset_index()
                   .melt(id_vars=['user_id', 'product_id'], var_name="order_dow", value_name="indprod_day_proportion"))
indprod_day_idx['order_dow'] = indprod_day_idx['order_dow'].astype(int)

# One-hot hour-of-day indicator for every prior product row, next to its ids.
prior_hod_idx = prior_df[['user_id', 'order_id', 'product_id']].join(pd.get_dummies(prior_df.order_hour_of_day))

# Keep the untouched indicators under a second name and work on a copy, so the
# smoothing loop below always reads the original one-hot values.
prior_hod_idx_orig = prior_hod_idx
prior_hod_idx = prior_hod_idx.copy()

# Smooth each hour with its two neighbours, wrapping around midnight.
for i in range(24):
    prior_hod_idx[i] = prior_hod_idx_orig[(i - 1) % 24] + prior_hod_idx_orig[i] + prior_hod_idx_orig[(i + 1) % 24]
# Global smoothed hour shares: average inside each order, then across orders.
hod_prior = prior_hod_idx.drop(['product_id', 'user_id'], axis=1).groupby('order_id').agg(np.mean).agg(np.mean)

del prior_hod_idx_orig

# Smoothed hour shares of each product's rows, in long format.
product_hod_idx = (prior_hod_idx.drop(['user_id', 'order_id'], axis=1)
                   .groupby("product_id").agg(np.mean).reset_index()
                   .melt(id_vars='product_id', var_name="hod", value_name="product_hod_proportion"))
product_hod_idx.hod = product_hod_idx.hod.astype(int)

# Smoothed hour shares of each user's orders: one row per order first, then
# the per-user average.
individual_hod_idx = (prior_hod_idx.drop(['product_id'], axis=1)
                   .groupby(['user_id', 'order_id']).agg(np.mean).reset_index()
                     .drop(['order_id'], axis=1).groupby('user_id').agg(np.mean).reset_index()
                   .melt(id_vars='user_id', var_name="hod", value_name="user_hod_proportion"))
individual_hod_idx.hod = individual_hod_idx.hod.astype(int)

# Smoothed hour shares of each (user, product) pair.
indprod_hod_idx = (prior_hod_idx.drop(['order_id'], axis=1)
                   .groupby(['user_id', 'product_id']).agg(np.mean).reset_index()
                   .melt(id_vars=['user_id', 'product_id'], var_name="hod", value_name="indprod_hod_proportion"))
indprod_hod_idx.hod = indprod_hod_idx.hod.astype(int)

# Use the same key name as the order tables so the frames can be merged on it.
indprod_hod_idx.rename(columns={'hod': 'order_hour_of_day'}, inplace=True)
product_hod_idx.rename(columns={'hod': 'order_hour_of_day'}, inplace=True)
individual_hod_idx.rename(columns={'hod': 'order_hour_of_day'}, inplace=True)

In [36]:
# Per product: summed sin/cos components of weekday and hour, plus row count.
product_sum_cols = ['order_dow_sin', 'order_dow_cos', 'order_hod_sin', 'order_hod_cos', 'num_products']
prior_product_stats = prior_df.groupby("product_id").agg({col: np.sum for col in product_sum_cols})
# Direction of the summed vectors, i.e. the circular mean weekday / hour.
for period in ('dow', 'hod'):
    prior_product_stats['order_%s_angle' % period] = np.arctan2(
        prior_product_stats['order_%s_sin' % period],
        prior_product_stats['order_%s_cos' % period])

In [37]:
# Replace the summed sin/cos components by the sin/cos of the resultant angle,
# which rescales each mean direction to unit length.
for _period in ('dow', 'hod'):
    _angle = prior_product_stats['order_%s_angle' % _period]
    prior_product_stats['order_%s_sin' % _period] = np.sin(_angle)
    prior_product_stats['order_%s_cos' % _period] = np.cos(_angle)
prior_product_stats.drop(['order_dow_angle', 'order_hod_angle'], axis=1, inplace=True)
prior_product_stats.reset_index(inplace=True)
# Relabel by name instead of by position: a positional list silently
# mislabels the features whenever the aggregation returns its columns in a
# different order than the one assumed here.
prior_product_stats.rename(columns={'order_dow_sin': 'product_dow_sin',
                                    'order_dow_cos': 'product_dow_cos',
                                    'order_hod_sin': 'product_hod_sin',
                                    'order_hod_cos': 'product_hod_cos',
                                    'num_products': 'product_num_orders'},
                           inplace=True)

In [38]:
# Collapse the product rows to one row per (user, order): sin/cos components
# and the row count are summed, the recency weights are averaged, and the
# order's date and number are carried over.
_weight_labels = ['dw_8', 'dw_16', 'dw_32', 'dw_64', 'dw_128',
                  'dcos_14', 'dsin_14', 'dcos_30', 'dsin_30',
                  'ow_2', 'ow_4', 'ow_8', 'ow_16', 'ow_32']
indorder_aggs = {col: np.sum for col in ('order_dow_sin', 'order_dow_cos',
                                         'order_hod_sin', 'order_hod_cos',
                                         'num_products')}
indorder_aggs.update(('num_products_' + lbl, np.mean) for lbl in _weight_labels)
indorder_aggs['absolute_date'] = np.max
indorder_aggs['order_number'] = np.max
prior_indorder_stats = prior_df.groupby(["user_id", "order_id"]).agg(indorder_aggs).reset_index()

In [39]:
# Collapse the per-order rows to one row per user. num_products is both summed
# and averaged, so the result has two-level column labels.
_weight_labels = ['dw_8', 'dw_16', 'dw_32', 'dw_64', 'dw_128',
                  'dcos_14', 'dsin_14', 'dcos_30', 'dsin_30',
                  'ow_2', 'ow_4', 'ow_8', 'ow_16', 'ow_32']
user_aggs = {col: np.sum for col in ('order_dow_sin', 'order_dow_cos',
                                     'order_hod_sin', 'order_hod_cos')}
user_aggs['num_products'] = [np.sum, np.mean]
user_aggs.update(('num_products_' + lbl, np.sum) for lbl in _weight_labels)
user_aggs['absolute_date'] = np.max
user_aggs['order_number'] = np.max
prior_individual_stats = prior_indorder_stats.groupby("user_id").agg(user_aggs)

In [40]:
# Flatten the two-level column labels into plain names, in aggregation order.
_weight_labels = ['dw_8', 'dw_16', 'dw_32', 'dw_64', 'dw_128',
                  'dcos_14', 'dsin_14', 'dcos_30', 'dsin_30',
                  'ow_2', 'ow_4', 'ow_8', 'ow_16', 'ow_32']
prior_individual_stats.columns = (
    ['order_dow_sin', 'order_dow_cos', 'order_hod_sin', 'order_hod_cos',
     'num_products', 'mean_products']
    + ['num_products_' + lbl for lbl in _weight_labels]
    + ['max_absolute_date', "max_order_number"]
)

In [41]:
# Replace the summed sin/cos components by the sin/cos of their resultant
# angle, then discard the temporary angle columns.
for period in ('dow', 'hod'):
    sin_col = 'order_%s_sin' % period
    cos_col = 'order_%s_cos' % period
    prior_individual_stats['order_%s_angle' % period] = np.arctan2(
        prior_individual_stats[sin_col], prior_individual_stats[cos_col])
for period in ('dow', 'hod'):
    resultant = prior_individual_stats['order_%s_angle' % period]
    prior_individual_stats['order_%s_sin' % period] = np.sin(resultant)
    prior_individual_stats['order_%s_cos' % period] = np.cos(resultant)
prior_individual_stats = prior_individual_stats.drop(['order_dow_angle', 'order_hod_angle'], axis=1)

In [42]:
# Turn the user_id group index back into an ordinary column.
prior_individual_stats = prior_individual_stats.reset_index()

In [43]:
# Give every user-level statistic a user_ prefix (positional relabel).
_weight_labels = ['dw_8', 'dw_16', 'dw_32', 'dw_64', 'dw_128',
                  'dcos_14', 'dsin_14', 'dcos_30', 'dsin_30',
                  'ow_2', 'ow_4', 'ow_8', 'ow_16', 'ow_32']
prior_individual_stats.columns = (
    ['user_id',
     'user_dow_sin', 'user_dow_cos', 'user_hod_sin', 'user_hod_cos',
     'user_num_products', 'user_mean_products']
    + ['user_num_products_' + lbl for lbl in _weight_labels]
    + ['user_num_days', 'user_num_orders']
)
# Average number of days per order over the user's history.
prior_individual_stats['user_days_per_order'] = (prior_individual_stats['user_num_days']
                                                 / prior_individual_stats['user_num_orders'])

In [44]:
# For every user: sort the order dates, take consecutive differences and drop
# the first entry (which has no predecessor).
order_date_diffs = prior_indorder_stats[['user_id', 'absolute_date']].groupby('user_id').absolute_date.apply(lambda x: x.sort_values().diff()[1:])
# Mean and sample standard deviation (ddof=1) of those gaps per user.
user_days_per_order = order_date_diffs.reset_index().groupby('user_id').absolute_date.agg([np.mean, lambda x: np.std(x, ddof=1)]).reset_index()
user_days_per_order.columns = ['user_id', 'user_days_per_order_mean', 'user_days_per_order_std']

In [45]:
# Attach the gap statistics. A left join keeps users that have no gap at all
# (a single order); the default inner join would silently drop such users
# together with all their other features.
prior_individual_stats = prior_individual_stats.merge(user_days_per_order, on='user_id', how='left')

In [46]:
# Only the basket size per order is needed from here on.
prior_indorder_stats = (prior_indorder_stats[['order_id', 'num_products']]
                        .rename(columns={'num_products': 'num_products_in_order'}))

In [47]:
# Enrich every prior product row with the size of its order and with the
# user's order and day totals.
prior_indprod_stats = prior_df.merge(prior_indorder_stats[['order_id', 'num_products_in_order']], on='order_id')
prior_indprod_stats = prior_indprod_stats.merge(
    prior_individual_stats[['user_id', 'user_num_orders', 'user_num_days']], on='user_id', how='left')
# Relative position of the product inside its basket.
prior_indprod_stats['add_to_cart_proportion'] = (prior_indprod_stats['add_to_cart_order']
                                                 / prior_indprod_stats['num_products_in_order'])
# Flags for rows from the user's highest-numbered order and the one before it.
prior_indprod_stats['indprod_inorder_1'] = 1 * prior_indprod_stats.order_number.eq(
    prior_indprod_stats.user_num_orders)
prior_indprod_stats['indprod_inorder_2'] = 1 * prior_indprod_stats.order_number.eq(
    prior_indprod_stats.user_num_orders - 1)

# Collapse to one row per (user, product). The key order of this mapping fixes
# the column order of the result.
_summed_cols = (['order_dow_sin', 'order_dow_cos', 'order_hod_sin', 'order_hod_cos', 'num_products']
                + ['num_products_' + lbl for lbl in ('dw_8', 'dw_16', 'dw_32', 'dw_64', 'dw_128',
                                                     'dcos_14', 'dsin_14', 'dcos_30', 'dsin_30',
                                                     'ow_2', 'ow_4', 'ow_8', 'ow_16', 'ow_32')])
indprod_aggs = {col: np.sum for col in _summed_cols}
indprod_aggs.update([('add_to_cart_order', np.mean),
                     ('add_to_cart_proportion', np.mean),
                     ('indprod_inorder_1', np.sum),
                     ('indprod_inorder_2', np.sum),
                     ('user_num_orders', np.mean),
                     ('user_num_days', np.mean),
                     ('reverse_date', np.min),
                     ('reverse_order_number', np.min)])
prior_indprod_stats = prior_indprod_stats.groupby(["user_id", "product_id"]).agg(indprod_aggs).reset_index()

In [48]:
# Direction of the summed weekday / hour vectors for each (user, product).
for period in ('dow', 'hod'):
    prior_indprod_stats['order_%s_angle' % period] = np.arctan2(
        prior_indprod_stats['order_%s_sin' % period],
        prior_indprod_stats['order_%s_cos' % period])
# Share of the user's orders containing the product, and days per purchase.
prior_indprod_stats['proportion_orders'] = prior_indprod_stats['num_products'].div(
    prior_indprod_stats['user_num_orders'])
prior_indprod_stats['days_per_order'] = prior_indprod_stats['user_num_days'].div(
    prior_indprod_stats['num_products'])

In [49]:
%%time
# Gaps between consecutive purchases of the same product by the same user,
# for pairs with more than two purchases.
# NOTE(review): restricted to user_id < 100, which looks like a timing
# experiment; indprod_days_per_order is not used in the cells visible here.
order_date_diffs = prior_df.loc[prior_df.user_id < 100,['user_id', 'product_id', 'absolute_date']].groupby(['user_id', 'product_id']).filter(lambda x: len(x) > 2).groupby(['user_id', 'product_id']).absolute_date.apply(lambda x: x.sort_values().diff()[1:])
# Mean and sample standard deviation (ddof=1) of those gaps per pair.
indprod_days_per_order = order_date_diffs.reset_index().groupby(['user_id', 'product_id']).absolute_date.agg([np.mean, lambda x: np.std(x, ddof=1)]).reset_index()
# NOTE(review): the last column is labelled "_var" but holds a standard deviation.
indprod_days_per_order.columns = ['user_id', 'product_id', 'indprod_days_per_order_mean', 'indprod_days_per_order_var']

CPU times: user 2.02 s, sys: 31.5 ms, total: 2.05 s
Wall time: 2.04 s


In [50]:
# Replace the summed sin/cos components by the sin/cos of their resultant
# angle, then drop the helper columns that are no longer needed.
for period in ('dow', 'hod'):
    resultant = prior_indprod_stats['order_%s_angle' % period]
    prior_indprod_stats['order_%s_sin' % period] = np.sin(resultant)
    prior_indprod_stats['order_%s_cos' % period] = np.cos(resultant)
prior_indprod_stats = prior_indprod_stats.drop(
    ['order_dow_angle', 'order_hod_angle', 'user_num_orders', 'user_num_days'], axis=1)

In [51]:
# Give every (user, product) statistic its indprod_ name. The columns are
# renamed by name rather than by position, so a different column order coming
# out of the aggregation cannot silently mislabel features.
_indprod_renames = {
    'order_dow_sin': 'indprod_dow_sin',
    'order_dow_cos': 'indprod_dow_cos',
    'order_hod_sin': 'indprod_hod_sin',
    'order_hod_cos': 'indprod_hod_cos',
    'num_products': 'indprod_num_orders',
    'add_to_cart_order': 'indprod_add_to_cart_order',
    'add_to_cart_proportion': 'indprod_add_to_cart_proportion',
    'reverse_date': 'indprod_days_since_last',
    'reverse_order_number': 'indprod_orders_since_last',
    'proportion_orders': 'indprod_proportion_orders',
    'days_per_order': 'indprod_days_per_order',
}
for _lbl in ('dw_8', 'dw_16', 'dw_32', 'dw_64', 'dw_128',
             'dcos_14', 'dsin_14', 'dcos_30', 'dsin_30',
             'ow_2', 'ow_4', 'ow_8', 'ow_16', 'ow_32'):
    _indprod_renames['num_products_' + _lbl] = 'indprod_num_products_' + _lbl
# user_id, product_id, indprod_inorder_1 and indprod_inorder_2 keep their names.

# The positional assignment used to fail on a column-count mismatch; keep an
# equivalent guard by refusing to continue when an expected column is absent.
_missing_cols = sorted(set(_indprod_renames) - set(prior_indprod_stats.columns))
if _missing_cols:
    raise ValueError("prior_indprod_stats is missing columns: %s" % _missing_cols)
prior_indprod_stats = prior_indprod_stats.rename(columns=_indprod_renames)

In [52]:
# Average the per-(user, product) ratios over users to obtain two more
# product-level statistics.
product_order_proportions = (
    prior_indprod_stats
    .groupby("product_id")[['indprod_proportion_orders', 'indprod_days_per_order']]
    .mean()
    .reset_index()
    .rename(columns={'indprod_proportion_orders': 'product_proportion_orders',
                     'indprod_days_per_order': 'product_days_per_order'})
)
prior_product_stats = prior_product_stats.merge(product_order_proportions, on='product_id')

In [53]:
# One row per (user, product), with the user-level and product-level
# statistics attached by left joins.
prior_all_stats = prior_indprod_stats.merge(prior_individual_stats, on="user_id", how="left").merge(prior_product_stats, on='product_id', how="left")

In [54]:
# The three source frames now live on inside prior_all_stats.
del prior_indprod_stats, prior_individual_stats, prior_product_stats

In [55]:
# Force a garbage-collection pass after the deletions above.
gc.collect()

832

In [56]:
# Express each weighted product count as a share of the user's weighted
# total, then discard that total.
for label in ['ow_2', 'ow_4', 'ow_8', 'ow_16', 'ow_32', 'dw_8', 'dw_16', 'dw_32', 'dw_64', 'dw_128', 'dcos_14', 'dsin_14', 'dcos_30', 'dsin_30']:
    indprod_col = 'indprod_num_products_' + label
    user_col = 'user_num_products_' + label
    prior_all_stats[indprod_col + '_reg'] = prior_all_stats[indprod_col] / prior_all_stats[user_col]
    prior_all_stats.drop([user_col], axis=1, inplace=True)


In [57]:
# Orders that are not labelled "prior": the ones to build features for.
orders_df_last = orders_df[orders_df.eval_set != "prior"].copy()
orders_df_last['order_dow_angle'] = (orders_df_last.order_dow /
                                     (orders_df_last.order_dow.max() + 1) * 2 * np.pi)
# Hours wrap on a 24-hour cycle, so the period must be 24. Dividing by the
# weekday period (order_dow.max() + 1) made the hour angle wrap several times
# per day. The historical hour angles these are compared with must use the
# same 24-hour period.
orders_df_last['order_hod_angle'] = orders_df_last.order_hour_of_day / 24 * 2 * np.pi
orders_df_last['order_dow_sin'] = np.sin(orders_df_last.order_dow_angle)
orders_df_last['order_dow_cos'] = np.cos(orders_df_last.order_dow_angle)
orders_df_last['order_hod_sin'] = np.sin(orders_df_last.order_hod_angle)
orders_df_last['order_hod_cos'] = np.cos(orders_df_last.order_hod_angle)

In [58]:
# order_dow and order_hour_of_day stay for now: they are join keys further down.
orders_df_last = orders_df_last.drop(["order_number", "order_dow_angle", "order_hod_angle"], axis=1)

In [59]:
# Attach each user's non-"prior" order to all of that user's rows. The inner
# join drops users that have no such order.
prior_all_stats = prior_all_stats.merge(orders_df_last, on="user_id", how="inner")

In [60]:
# Row and column count after attaching the target order.
prior_all_stats.shape

(12628967, 75)

In [61]:
# Attach the weekday proportions that match the target order's weekday, at
# user, product and (user, product) level.
for day_frame, join_keys in ((individual_day_idx, ['user_id', 'order_dow']),
                             (product_day_idx, ['product_id', 'order_dow']),
                             (indprod_day_idx, ['user_id', 'product_id', 'order_dow'])):
    prior_all_stats = prior_all_stats.merge(day_frame, on=join_keys)

In [62]:
# Attach the hour-of-day proportions that match the target order's hour, at
# user, product and (user, product) level.
for hod_frame, join_keys in ((individual_hod_idx, ['user_id', 'order_hour_of_day']),
                             (product_hod_idx, ['product_id', 'order_hour_of_day']),
                             (indprod_hod_idx, ['user_id', 'product_id', 'order_hour_of_day'])):
    prior_all_stats = prior_all_stats.merge(hod_frame, on=join_keys)

In [63]:
# Global weekday / hour baseline matching each row's target order.
target_dow = prior_all_stats['order_dow']
target_hod = prior_all_stats['order_hour_of_day']
prior_all_stats['days_prior'] = days_prior[target_dow].values
prior_all_stats['hod_prior'] = hod_prior[target_hod].values

In [64]:
# The raw weekday and hour were only needed as join keys.
prior_all_stats = prior_all_stats.drop(['order_hour_of_day', 'order_dow'], axis=1)

In [65]:
# Cosine of the angle between the target order's weekday / hour and the
# typical weekday / hour at (user, product), user and product level.
for level in ('indprod', 'user', 'product'):
    for period in ('dow', 'hod'):
        target_angle = np.arctan2(prior_all_stats['order_%s_sin' % period],
                                  prior_all_stats['order_%s_cos' % period])
        typical_angle = np.arctan2(prior_all_stats['%s_%s_sin' % (level, period)],
                                   prior_all_stats['%s_%s_cos' % (level, period)])
        prior_all_stats['%s_%s_avg_diff' % (level, period)] = np.cos(target_angle - typical_angle)


In [66]:
# The proportion lookup tables have been merged in.
del individual_hod_idx, product_hod_idx, indprod_hod_idx
del individual_day_idx, product_day_idx, indprod_day_idx

In [67]:
# Force a garbage-collection pass after the deletions above.
gc.collect()

189

In [68]:
# Number of product_id entries each user has in this frame, repeated on every
# row of that user.
prior_all_stats['user_distinct_products'] = prior_all_stats.groupby('user_id')['product_id'].transform('count')

In [69]:
# Attach the `reordered` flag from train_df; (user, product) pairs that do not
# occur there get 0.
train_labels = train_df[['user_id', 'product_id', 'reordered']]
prior_all_stats = prior_all_stats.merge(train_labels, how="left", on=["user_id", "product_id"])
prior_all_stats['reordered'] = prior_all_stats['reordered'].fillna(0)


In [70]:
# Shrink each observed proportion towards the global baseline:
#   (n * observed + k * baseline) / (n + k)
# with n the number of observations behind it and k the pseudo-count.
# (The user-level proportions are left unshrunk.)
shrinkage_plan = (('product_day_proportion', 'product_num_orders', 'days_prior', 30),
                  ('product_hod_proportion', 'product_num_orders', 'hod_prior', 15),
                  ('indprod_day_proportion', 'indprod_num_orders', 'days_prior', 10),
                  ('indprod_hod_proportion', 'indprod_num_orders', 'hod_prior', 5))
for target, count, baseline, strength in shrinkage_plan:
    prior_all_stats.eval("{t}=({c} * {t} + {k} * {b})/({c}+{k})".format(t=target, c=count, b=baseline, k=strength),
                         inplace=True)


In [71]:
# Remove four helper columns in place; none of them is read by the remaining
# visible cells (days_prior / hod_prior were last used by the smoothing step).
prior_all_stats.drop(
    ['days_prior', 'hod_prior', 'reverse_date', 'reverse_order_number'],
    axis='columns',
    inplace=True,
)

In [72]:
# Row-wise ratio of user_mean_products to user_distinct_products.
prior_all_stats['user_mean_proportion_products'] = (
    prior_all_stats['user_mean_products']
    .div(prior_all_stats['user_distinct_products'])
)

In [73]:
# Split the rows of the 'None' pseudo-product into their own frame.
# .copy() makes none_df an explicitly independent frame: later cells drop
# columns from it in place and add new columns to it, which on a bare
# query() result is a SettingWithCopy hazard (the warning is currently hidden
# by warnings.filterwarnings('ignore')).
none_df = prior_all_stats.query("product_id == 'None'").copy()

In [74]:
# Remove the product_* columns from none_df, in place.
none_df.drop(
    [
        'product_dow_sin',
        'product_dow_cos',
        'product_hod_sin',
        'product_hod_cos',
        'product_num_orders',
        'product_proportion_orders',
        'product_days_per_order',
        'product_day_proportion',
        'product_hod_proportion',
        'product_dow_avg_diff',
        'product_hod_avg_diff',
    ],
    axis='columns',
    inplace=True,
)

In [75]:
# Remove the two add-to-cart columns from none_df, in place.
none_df.drop(
    [
        'indprod_add_to_cart_order',
        'indprod_add_to_cart_proportion',
    ],
    axis='columns',
    inplace=True,
)

In [76]:
#none_df.to_hdf("data/none_stats.h5", "table")

In [77]:
# Keep only real products: the 'None' pseudo-product rows were copied out into
# none_df by an earlier cell.
prior_all_stats = prior_all_stats.loc[prior_all_stats['product_id'] != 'None']

In [78]:
# Expose each user's 'None' indprod_proportion_orders as a per-user column
# named user_proportion_none on the product rows.
none_proportion = (
    none_df[['user_id', 'indprod_proportion_orders']]
    .rename(columns={'indprod_proportion_orders': 'user_proportion_none'})
)
# merge defaults to an inner join, so rows whose user_id is absent from
# none_proportion are dropped -- NOTE(review): confirm every user has a
# 'None' row.
prior_all_stats = pd.merge(prior_all_stats, none_proportion, on='user_id')

In [79]:
# Attach the columns of short_vectors_df by product_id (inner join: rows whose
# product_id has no entry in short_vectors_df are dropped).
prior_all_stats = pd.merge(prior_all_stats, short_vectors_df, on="product_id")

In [80]:
# Sanity check: display (rows, columns) after the merges above.
prior_all_stats.shape

(12422758, 117)

In [81]:
# Build one weighted-sum product vector per user from the dim0..dim29 columns.
vecs = ['dim' + str(i) for i in range(30)]
pos = prior_all_stats[['user_id', 'indprod_num_orders'] + vecs].copy()
pos['indprod_sum_orders'] = (
    pos.groupby('user_id')['indprod_num_orders'].transform('sum')
)
# Each row is weighted by indprod_num_orders / (user total + 15). Because of
# the +15 a user's weights sum to less than 1, so the summed vector is scaled
# down more strongly for users with a small total.
pos.loc[:, vecs] = pos.loc[:, vecs].multiply(
    pos['indprod_num_orders'].div(pos['indprod_sum_orders'] + 15),
    axis='index',
)
user_mean_vecs = (
    pos.groupby('user_id')
    .sum()
    .drop(['indprod_num_orders', 'indprod_sum_orders'], axis='columns')
    .reset_index()
)

In [82]:
# Reduce the 30 per-user vector columns to 15 principal components
# (user_dim0 .. user_dim14).
# random_state is pinned because PCA's default svd_solver='auto' may select
# the randomized solver for an input of this shape, which would make the
# user_dim* features differ from run to run.
pca = PCA(n_components=15, random_state=1234)
pca.fit(user_mean_vecs.loc[:, 'dim0':'dim29'])
short_vectors = pd.DataFrame(
    pca.transform(user_mean_vecs.loc[:, 'dim0':'dim29']),
    # Reuse the source index so the user_id assignment below, which aligns on
    # index labels, always pairs each component row with its own user.
    index=user_mean_vecs.index,
    columns=['user_dim' + str(c) for c in range(15)],
)
short_vectors['user_id'] = user_mean_vecs['user_id']

In [83]:
# Attach the user_dim* components to both frames by user_id (inner joins).
prior_all_stats = pd.merge(prior_all_stats, short_vectors, on='user_id')
none_df = pd.merge(none_df, short_vectors, on='user_id')

In [84]:
#prior_all_stats = prior_all_stats.drop('indprod_inorder_3', axis=1)
#none_df = none_df.drop('indprod_inorder_3', axis=1)

In [85]:
# The per-user validation split is disabled. The commented-out code shuffled
# the extratrain user ids with a fixed seed and assigned fold = position % 11,
# with -1 for users outside that set. Now every row gets validation_set = -1.
#all_users = prior_all_stats.loc[prior_all_stats.eval_set == "extratrain", "user_id"].unique()
#np.random.seed(1234)
#np.random.shuffle(all_users)
#valid_set = pd.DataFrame({'user_id': all_users, 'validation_set': np.arange(0, all_users.shape[0]) % 11})
#prior_all_stats = prior_all_stats.merge(valid_set, on='user_id', how='left')
#prior_all_stats.validation_set = prior_all_stats.validation_set.fillna(-1)
prior_all_stats['validation_set'] = -1

In [86]:
# Same as for prior_all_stats: the merge with valid_set is commented out, so
# every none_df row gets validation_set = -1.
#none_df = none_df.merge(valid_set, on='user_id', how='left')
#none_df.validation_set = none_df.validation_set.fillna(-1)
none_df['validation_set'] = -1

In [87]:
# Constant True flag column on both frames.
# NOTE(review): presumably marks rows produced by this "extratrain" build so
# they can be told apart downstream -- confirm with the consumer.
prior_all_stats['extratrain'] = True
none_df['extratrain'] = True

In [88]:
# List the names still bound in the interactive namespace; the next cell
# deletes most of them before the HDF files are written.
%who

PCA	 aisles_df	 cosine_similarity	 days_prior	 departments_df	 gc	 hod_prior	 i	 indprod_days_per_order	 
label	 nlp	 none_df	 none_prior_df	 none_proportion	 none_train_df	 np	 order_date_diffs	 orders_df	 
orders_df_last	 pca	 pd	 plt	 pos	 prior_all_stats	 prior_day_idx	 prior_df	 prior_hod_idx	 
prior_indorder_stats	 product_order_proportions	 short_vectors	 short_vectors_df	 spacy	 train_df	 user_days_per_order	 user_mean_vecs	 vecs	 


In [89]:
# Unbind every intermediate that the remaining cells do not read, so the
# memory can be reclaimed before the HDF files are written.
# `pos` is included: it is a copy of 32 columns of prior_all_stats, its last
# use is the construction of user_mean_vecs, and it was previously left alive.
del (
    aisles_df,
    departments_df,
    indprod_days_per_order,
    none_prior_df,
    none_proportion,
    none_train_df,
    order_date_diffs,
    orders_df,
    orders_df_last,
    pos,
    prior_day_idx,
    prior_df,
    prior_hod_idx,
    prior_indorder_stats,
    short_vectors,
    short_vectors_df,
    train_df,
    user_days_per_order,
    user_mean_vecs,
)

In [90]:
# Collect after the bulk del above; the cell output is the value returned by
# gc.collect().
gc.collect()

1437

In [91]:
# Persist the 'None' pseudo-product rows under the HDF key "table".
none_df.to_hdf("data/none_stats_extratrain.h5", key="table")

In [92]:
# none_df has been written to disk and is not used again in the visible cells;
# release it before the large frame is written.
del none_df
gc.collect()

7

In [93]:
# The 'None' rows are gone, so product_id can be cast to a plain int column.
prior_all_stats['product_id'] = prior_all_stats['product_id'].astype(int)
# Assign every row a random bucket in 0..251 (upper bound exclusive).
# The draw comes from a dedicated, seeded RandomState so that subset_key is
# identical on every run and the global NumPy random state is left untouched.
prior_all_stats['subset_key'] = np.random.RandomState(1234).randint(
    0, 252, prior_all_stats.shape[0])

In [94]:
# Persist the feature frame under the HDF key "table" in 'table' format.
# eval_set and validation_set are passed as data_columns, which lets readers
# filter on them when loading.
prior_all_stats.to_hdf(
    "data/prior_all_stats_extratrain.h5",
    key="table",
    format='table',
    data_columns=['eval_set', 'validation_set'],
)


In [95]:
# Final (rows, columns) of the frame that was written to disk.
prior_all_stats.shape

(12422758, 135)