In [0]:
import os
from google_drive_downloader import GoogleDriveDownloader as gdd

import pandas as pd
import seaborn as sns
import numpy as np
import  matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

from umap import UMAP
from sklearn.manifold import TSNE
import gc
from tqdm import tqdm

# Load data

In [2]:
if not os.path.exists('./data'):
  os.mkdir('./data')

gdd.download_file_from_google_drive(file_id='1EMS377_Ew2h0esEMdEj54X894J_xU7Fl', dest_path='./data/instacart_online_grocery_shopping_2017_05_01.tar.gz')

Downloading 1EMS377_Ew2h0esEMdEj54X894J_xU7Fl into ./data/instacart_online_grocery_shopping_2017_05_01.tar.gz... Done.


In [0]:
! mkdir ./data/unziped

In [0]:
! tar -xzf ./data/instacart_online_grocery_shopping_2017_05_01.tar.gz -C ./data/unziped

In [0]:
orders_df = pd.read_csv('./data/unziped/instacart_2017_05_01/orders.csv')

In [0]:
orders_df = orders_df.drop(orders_df[orders_df['eval_set'] == 'test'].index)

In [0]:
orders_df.fillna(999, inplace=True)

In [0]:
order_products_prior_df = pd.read_csv('./data/unziped/instacart_2017_05_01/order_products__prior.csv')
order_products_train_df = pd.read_csv('./data/unziped/instacart_2017_05_01/order_products__train.csv')
products_df = pd.read_csv('./data/unziped/instacart_2017_05_01/products.csv')

# Metrics

In [0]:
def precision_at_k(predicted, actual, k):
  assert len(predicted) >= k
  intersection = set(predicted[:k]) & set(actual)
  return len(intersection) / k

In [0]:
def average_precision_at_k(predicted, actual, k):
  tmp = 0
  actual_length = min(k,len(actual))
  if len(predicted) < actual_length:
    warnings.warn("Length of predict is less than k")
  for i in range(actual_length):
    if predicted[i] in actual:
      tmp += precision_at_k(predicted[:i+1], actual, i+1)
  return tmp / actual_length

In [0]:
def mean_average_precision_at_k(predicted_list, actual_list, k):
  tmp = 0
  cnt = 0
  for predicted, actual in zip(predicted_list, actual_list):
    if len(actual) != 0:
      tmp += average_precision_at_k(predicted, actual, k)
      cnt += 1
  assert cnt != 0
  return tmp / cnt

# Basic Recommenders

In [0]:
class MostPopularRecommender():
  def __init__(self):
    pass
  
  def fit(self, X_train):
    assert type(X_train) == pd.DataFrame
    self.most_popular = list(X_train['product_id'].value_counts().index[:])
  
  def predict(self, X_test, top_n):
    predicted = pd.DataFrame()
    predicted['user_id'] = list(X_test['user_id'].unique())
    predicted['product_id'] = [self.most_popular[:top_n] for i in range(X_test['user_id'].nunique())]
    return predicted

In [0]:
class SVDRecommender():
  def __init__(self, k, batch_size):
    self.k = k
    self.batch_size = batch_size

  def fit(self, X_train):
    # count total number of bought products
    svd_input = X_train.groupby(['user_id','product_id']).agg(buy_num=pd.NamedAgg(column='order_id', aggfunc='count')).reset_index()
    # collect unique ids for user and products from dataset
    self.item_id = svd_input['product_id'].unique()
    self.user_id = svd_input['user_id'].unique()
    # encode ids using indices
    self.user_dict = {user:i for i,user in enumerate(self.user_id)}
    self.item_dict = {item:i for i,item in enumerate(self.item_id)}
    self.inv_item_dict = {v:k for k,v in self.item_dict.items()}

    svd_input["x_rows"] = svd_input['user_id'].apply(lambda s : self.user_dict[s])
    svd_input["x_cols"] = svd_input['product_id'].apply(lambda s : self.item_dict[s])
    
    # form sparse matrix
    X_user_item = csr_matrix((svd_input["buy_num"],\
                        (svd_input["x_rows"], svd_input["x_cols"])),\
                        shape = (len(self.user_id), len(self.item_id)))
    
    clf_svd = TruncatedSVD(n_components=self.k)
    # obtain dense representations
    self.user_proj = clf_svd.fit_transform(X_user_item)
    self.item_proj = clf_svd.components_

  def predict(self, X_test, top_n):
    # df to store predictions for each user
    predicted = pd.DataFrame()
    user_id_test = list(set(X_test['user_id'].unique()) & set(self.user_id))
    # evaluate number of batches
    batch_number = len(user_id_test) // self.batch_size
    #print(len(user_id_test))
    def decode_prod(x):
        return [self.inv_item_dict[prod_id] for prod_id in x]

    for i in range(batch_number):
      if i % 5 == 0:
        print(i)
      batch_predict = pd.DataFrame()
      batch_predict['user_id'] = self.user_id[self.batch_size*i:self.batch_size*(i+1)]
      user_indices = [self.user_dict[id] for id in self.user_id[self.batch_size*i:self.batch_size*(i+1)]]
      predicted_ratings = np.matmul(self.user_proj[user_indices,:],self.item_proj)
      item_indices = np.argsort(predicted_ratings, axis=1)[:,-1:-(top_n+1):-1]
      batch_predict['product_id'] = [list(prod) for prod in item_indices]

      batch_predict = batch_predict['product_id'].apply(decode_prod)
      predicted = pd.concat([predicted, batch_predict])

    # make prediction for all remains users
    batch_predict = pd.DataFrame()
    batch_predict['user_id'] = user_id_test[self.batch_size*batch_number:]
    user_indices = [self.user_dict[id] for id in user_id_test[self.batch_size*batch_number:]]
    predicted_ratings = np.matmul(self.user_proj[user_indices,:],self.item_proj)
    item_indices = np.argsort(predicted_ratings, axis=1)[:,-1:-(top_n+1):-1]
    batch_predict['product_id'] = [list(prod) for prod in item_indices]

    batch_predict = batch_predict['product_id'].apply(decode_prod)
    predicted = pd.concat([predicted, batch_predict])
    predicted.rename(columns={0 : 'product_id'},inplace=True)
    predicted['user_id'] = user_id_test
    predicted.set_index('user_id',inplace=True)
    return predicted

In [0]:
def eval_results(actual, predicted,k):

  def collect(x):
    return list(x)
  
  ground_truth = actual.groupby('user_id').agg({'product_id':collect})
  return mean_average_precision_at_k(list(predicted['product_id']), list(ground_truth['product_id']), k)

# Evaluate basic models

In [0]:
train_orders_merged_df = pd.merge(orders_df[orders_df['eval_set'] == 'train'], order_products_train_df, on='order_id')
prior_orders_merged_df = pd.merge(orders_df[orders_df['eval_set'] == 'prior'], order_products_prior_df, on='order_id')

Add order_num column to separate prior into train_df and val_df

In [0]:
prior_orders_merged_df = pd.merge(prior_orders_merged_df.groupby('user_id').agg(last_order_number=pd.NamedAgg(column='order_number', aggfunc='max')).reset_index(), prior_orders_merged_df, on='user_id')

In [17]:
prior_orders_merged_df

Unnamed: 0,user_id,last_order_number,order_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,1,10,2539329,prior,1,2,8,999.0,196,1,0
1,1,10,2539329,prior,1,2,8,999.0,14084,2,0
2,1,10,2539329,prior,1,2,8,999.0,12427,3,0
3,1,10,2539329,prior,1,2,8,999.0,26088,4,0
4,1,10,2539329,prior,1,2,8,999.0,26405,5,0
...,...,...,...,...,...,...,...,...,...,...,...
32434484,206209,13,2977660,prior,13,1,12,7.0,14197,5,1
32434485,206209,13,2977660,prior,13,1,12,7.0,38730,6,0
32434486,206209,13,2977660,prior,13,1,12,7.0,31477,7,0
32434487,206209,13,2977660,prior,13,1,12,7.0,6567,8,0


Split into train_df and val_df: last order comes into val, all others into train

In [0]:
train_df, val_df = prior_orders_merged_df[prior_orders_merged_df['order_number'] < prior_orders_merged_df['last_order_number']], prior_orders_merged_df[prior_orders_merged_df['order_number'] == prior_orders_merged_df['last_order_number']]

In [19]:
train_df.shape

(30294701, 11)

In [20]:
val_df.shape

(2139788, 11)

Train most popular on train_df and predict on val

In [0]:
mp = MostPopularRecommender()
mp.fit(train_df)
predicted_mp = mp.predict(val_df,top_n=5)

In [22]:
predicted_mp

Unnamed: 0,user_id,product_id
0,1,"[24852, 13176, 21137, 21903, 47209]"
1,2,"[24852, 13176, 21137, 21903, 47209]"
2,3,"[24852, 13176, 21137, 21903, 47209]"
3,4,"[24852, 13176, 21137, 21903, 47209]"
4,5,"[24852, 13176, 21137, 21903, 47209]"
...,...,...
206204,206205,"[24852, 13176, 21137, 21903, 47209]"
206205,206206,"[24852, 13176, 21137, 21903, 47209]"
206206,206207,"[24852, 13176, 21137, 21903, 47209]"
206207,206208,"[24852, 13176, 21137, 21903, 47209]"


Train most popular on train_df and predict on val

In [23]:
svd_rec = SVDRecommender(20, 4000)
svd_rec.fit(train_df)
predicted_svd = svd_rec.predict(val_df,top_n=5)

0
5
10
15
20
25
30
35
40
45
50


Form ground truth

In [0]:
ground_truth = val_df.groupby('user_id').agg({'product_id':lambda x: list(x)}).reset_index()

In [25]:
ground_truth

Unnamed: 0,user_id,product_id
0,1,"[196, 46149, 39657, 38928, 25133, 10258, 35951..."
1,2,"[24852, 16589, 1559, 19156, 18523, 22825, 2741..."
2,3,"[39190, 18599, 23650, 21903, 47766, 24810]"
3,4,"[26576, 25623, 21573]"
4,5,"[27344, 24535, 43693, 40706, 16168, 21413, 139..."
...,...,...
206204,206205,"[27845, 21137, 43352, 3896, 19173, 22035, 38739]"
206205,206206,"[11520, 23029, 42623]"
206206,206207,"[27845, 36011, 39180, 13176, 33787, 44632, 337..."
206207,206208,"[13176, 34213, 23579, 33000, 31404, 35688, 236..."


Merge all together

In [0]:
all_predict = pd.merge(predicted_svd, predicted_mp, on='user_id')
all_predict = pd.merge(all_predict, ground_truth, on='user_id')

In [0]:
#all_predict.to_csv('all_predict_val.csv')

In [27]:
all_predict

Unnamed: 0,user_id,product_id_x,product_id_y,product_id
0,1,"[49235, 196, 6184, 13176, 12341]","[24852, 13176, 21137, 21903, 47209]","[196, 46149, 39657, 38928, 25133, 10258, 35951..."
1,2,"[47209, 24852, 47766, 33754, 4957]","[24852, 13176, 21137, 21903, 47209]","[24852, 16589, 1559, 19156, 18523, 22825, 2741..."
2,3,"[47766, 21903, 16797, 43352, 21137]","[24852, 13176, 21137, 21903, 47209]","[39190, 18599, 23650, 21903, 47766, 24810]"
3,4,"[19057, 28204, 30391, 45007, 39877]","[24852, 13176, 21137, 21903, 47209]","[26576, 25623, 21573]"
4,5,"[27966, 16797, 26209, 47626, 26604]","[24852, 13176, 21137, 21903, 47209]","[27344, 24535, 43693, 40706, 16168, 21413, 139..."
...,...,...,...,...
206204,206205,"[27845, 21137, 49235, 22035, 5077]","[24852, 13176, 21137, 21903, 47209]","[27845, 21137, 43352, 3896, 19173, 22035, 38739]"
206205,206206,"[27086, 5785, 47626, 8277, 49235]","[24852, 13176, 21137, 21903, 47209]","[11520, 23029, 42623]"
206206,206207,"[13176, 44632, 27966, 35221, 21709]","[24852, 13176, 21137, 21903, 47209]","[27845, 36011, 39180, 13176, 33787, 44632, 337..."
206207,206208,"[13176, 27845, 21137, 27966, 26209]","[24852, 13176, 21137, 21903, 47209]","[13176, 34213, 23579, 33000, 31404, 35688, 236..."


In [0]:
def eval_precision(predicted, actual,k=5):
  return average_precision_at_k(predicted, actual,k)

In [0]:
all_predict['svd_score'] = all_predict.apply(lambda x: eval_precision(x['product_id_x'], x['product_id']), axis=1)
all_predict['mp_score'] = all_predict.apply(lambda x: eval_precision(x['product_id_y'], x['product_id']), axis=1)

Choose best recommender

1 for svd, 0 for most pop

In [0]:
all_predict['best_recommender'] = all_predict.apply(lambda x: 0 if x['mp_score'] >= x['svd_score'] else 1, axis=1)

In [34]:
all_predict

Unnamed: 0,user_id,product_id_x,product_id_y,product_id,svd_score,mp_score,best_recommender
0,1,"[49235, 196, 6184, 13176, 12341]","[24852, 13176, 21137, 21903, 47209]","[196, 46149, 39657, 38928, 25133, 10258, 35951...",0.10,0.000000,1
1,2,"[47209, 24852, 47766, 33754, 4957]","[24852, 13176, 21137, 21903, 47209]","[24852, 16589, 1559, 19156, 18523, 22825, 2741...",0.55,0.280000,1
2,3,"[47766, 21903, 16797, 43352, 21137]","[24852, 13176, 21137, 21903, 47209]","[39190, 18599, 23650, 21903, 47766, 24810]",0.40,0.050000,1
3,4,"[19057, 28204, 30391, 45007, 39877]","[24852, 13176, 21137, 21903, 47209]","[26576, 25623, 21573]",0.00,0.000000,0
4,5,"[27966, 16797, 26209, 47626, 26604]","[24852, 13176, 21137, 21903, 47209]","[27344, 24535, 43693, 40706, 16168, 21413, 139...",0.04,0.000000,1
...,...,...,...,...,...,...,...
206204,206205,"[27845, 21137, 49235, 22035, 5077]","[24852, 13176, 21137, 21903, 47209]","[27845, 21137, 43352, 3896, 19173, 22035, 38739]",0.55,0.066667,1
206205,206206,"[27086, 5785, 47626, 8277, 49235]","[24852, 13176, 21137, 21903, 47209]","[11520, 23029, 42623]",0.00,0.000000,0
206206,206207,"[13176, 44632, 27966, 35221, 21709]","[24852, 13176, 21137, 21903, 47209]","[27845, 36011, 39180, 13176, 33787, 44632, 337...",0.60,0.100000,1
206207,206208,"[13176, 27845, 21137, 27966, 26209]","[24852, 13176, 21137, 21903, 47209]","[13176, 34213, 23579, 33000, 31404, 35688, 236...",0.20,0.100000,1


# generate features

In [0]:
def generate_features(data_df):
  def mode(x):
    return x.value_counts().index[0]

  user_features1_df = data_df.drop_duplicates(subset=['order_id']).groupby('user_id').agg({'days_since_prior_order':'median', 'order_dow':mode,\
                                                                                       'order_hour_of_day':'median', 'order_number':'nunique'})
  user_features1_df.rename(columns={'days_since_prior_order':'median_day_since_prior_order', 
                                  'order_dow':'mode_day',
                                  'order_hour_of_day':'median_hour_of_day',
                                  'order_number' : 'number_of_orders'}, inplace=True)
  
  def is_reordered(x):
    if 1 in list(x.values):
      return 1
    else:
      return 0


  aggregated_by_orders = pd.merge(data_df.groupby('order_id').agg({'product_id':'count', 'reordered':is_reordered}).reset_index().rename(columns={'product_id':'products_num'})\
         , data_df.drop_duplicates(subset=['order_id']).loc[:,['order_id', 'user_id']], on='order_id')

  user_features2_df = aggregated_by_orders.groupby('user_id').agg({'products_num' : 'median', 'reordered':is_reordered})

  user_features2_df.rename(columns={'products_num':'median_number_of_products_in_order'}, inplace=True)
  
  user_features3_df = data_df.groupby('user_id').agg(unique_prod_total=pd.NamedAgg(column = 'product_id', aggfunc='nunique'))

  all_user_features_df = pd.concat([user_features1_df, user_features2_df, user_features3_df], axis=1)

  return all_user_features_df


In [0]:
train_only_features_df = generate_features(train_df)
train_val_features_df = generate_features(prior_orders_merged_df)

In [35]:
train_only_features_df

Unnamed: 0_level_0,median_day_since_prior_order,mode_day,median_hour_of_day,number_of_orders,median_number_of_products_in_order,reordered,unique_prod_total
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,20.0,1,9.0,9,5.0,1,15
2,13.0,2,10.0,13,13.0,1,96
3,11.0,0,16.0,11,8.0,1,33
4,20.0,4,12.0,4,3.0,1,14
5,11.0,3,16.0,3,9.0,1,19
...,...,...,...,...,...,...,...
206205,514.5,2,13.5,2,12.5,1,23
206206,3.0,0,17.0,66,4.0,1,150
206207,8.0,1,12.0,15,13.0,1,92
206208,7.0,2,14.5,48,12.5,1,195


In [36]:
train_val_features_df

Unnamed: 0_level_0,median_day_since_prior_order,mode_day,median_hour_of_day,number_of_orders,median_number_of_products_in_order,reordered,unique_prod_total
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,20.5,4,8.5,10,5.5,1,18
2,13.0,2,10.0,14,13.5,1,102
3,11.5,0,16.0,12,7.0,1,33
4,19.0,5,13.0,5,3.0,1,17
5,15.0,3,17.0,4,10.0,1,23
...,...,...,...,...,...,...,...
206205,30.0,2,15.0,3,8.0,1,24
206206,3.0,0,17.0,67,4.0,1,150
206207,12.0,2,12.0,16,13.0,1,92
206208,7.0,2,15.0,49,13.0,1,198


In [0]:
full_train_df = pd.merge(train_only_features_df, all_predict.loc[:,['user_id','best_recommender']], on='user_id')

In [0]:
full_test_df = train_val_features_df.loc[:,['median_day_since_prior_order','mode_day',	'median_hour_of_day','number_of_orders','median_number_of_products_in_order','reordered','unique_prod_total']]

In [40]:
full_test_df

Unnamed: 0_level_0,median_day_since_prior_order,mode_day,median_hour_of_day,number_of_orders,median_number_of_products_in_order,reordered,unique_prod_total
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,20.5,4,8.5,10,5.5,1,18
2,13.0,2,10.0,14,13.5,1,102
3,11.5,0,16.0,12,7.0,1,33
4,19.0,5,13.0,5,3.0,1,17
5,15.0,3,17.0,4,10.0,1,23
...,...,...,...,...,...,...,...
206205,30.0,2,15.0,3,8.0,1,24
206206,3.0,0,17.0,67,4.0,1,150
206207,12.0,2,12.0,16,13.0,1,92
206208,7.0,2,15.0,49,13.0,1,198


In [41]:
full_train_df

Unnamed: 0,user_id,median_day_since_prior_order,mode_day,median_hour_of_day,number_of_orders,median_number_of_products_in_order,reordered,unique_prod_total,best_recommender
0,1,20.0,1,9.0,9,5.0,1,15,1
1,2,13.0,2,10.0,13,13.0,1,96,1
2,3,11.0,0,16.0,11,8.0,1,33,1
3,4,20.0,4,12.0,4,3.0,1,14,0
4,5,11.0,3,16.0,3,9.0,1,19,1
...,...,...,...,...,...,...,...,...,...
206204,206205,514.5,2,13.5,2,12.5,1,23,1
206205,206206,3.0,0,17.0,66,4.0,1,150,0
206206,206207,8.0,1,12.0,15,13.0,1,92,1
206207,206208,7.0,2,14.5,48,12.5,1,195,1


# metaclassifier

In [0]:
from sklearn.linear_model import LogisticRegression

In [0]:
clf = LogisticRegression()

In [44]:
full_train_df.loc[:,['median_day_since_prior_order','mode_day',	'median_hour_of_day','number_of_orders','median_number_of_products_in_order','reordered','unique_prod_total']]

Unnamed: 0,median_day_since_prior_order,mode_day,median_hour_of_day,number_of_orders,median_number_of_products_in_order,reordered,unique_prod_total
0,20.0,1,9.0,9,5.0,1,15
1,13.0,2,10.0,13,13.0,1,96
2,11.0,0,16.0,11,8.0,1,33
3,20.0,4,12.0,4,3.0,1,14
4,11.0,3,16.0,3,9.0,1,19
...,...,...,...,...,...,...,...
206204,514.5,2,13.5,2,12.5,1,23
206205,3.0,0,17.0,66,4.0,1,150
206206,8.0,1,12.0,15,13.0,1,92
206207,7.0,2,14.5,48,12.5,1,195


In [45]:
clf.fit(full_train_df.loc[:,['median_day_since_prior_order','mode_day',	'median_hour_of_day','number_of_orders','median_number_of_products_in_order','reordered','unique_prod_total']],
        full_train_df['best_recommender'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

User for which we need prediction

In [0]:
users = list(set(train_orders_merged_df['user_id'].unique()) & set(prior_orders_merged_df['user_id'].unique()))

In [0]:
pred = clf.predict(train_val_features_df.loc[users,:])

In [48]:
pred.shape

(131209,)

In [0]:
X_test = pd.DataFrame({'user_id' : train_val_features_df.loc[users,:].index, 'best_recommender':pred})

In [0]:
X_test_mp = X_test.query('best_recommender == 0')
X_test_svd = X_test.query('best_recommender == 1')

Predict using corresponding recommender. We retrain both recommenders on train + val = prior



In [0]:
mp = MostPopularRecommender()
mp.fit(prior_orders_merged_df)
predicted_mp = mp.predict(X_test_mp,top_n=5)

In [53]:
svd_rec = SVDRecommender(20, 4000)
svd_rec.fit(prior_orders_merged_df)
predicted_svd = svd_rec.predict(X_test_svd,top_n=5)

0
5


Concat predictions

In [0]:
full_predict = pd.concat([predicted_mp, predicted_svd.reset_index()])
full_predict.sort_values(by='user_id', inplace=True)

Finally, eval performance

In [55]:
eval_results(train_orders_merged_df, full_predict,k=2)

0.09008528378388678