<a href="https://colab.research.google.com/github/Yanina-Kutovaya/RecSys-amazon/blob/main/notebooks/02_Baseline_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recommender system Amazon - Grocery and Gourmet Food

## Baseline model

[Data source](https://nijianmo.github.io/amazon/index.html)

[Build dataset, EDA](https://colab.research.google.com/drive/1IuLjTF8I0wt9zMiYbehRxLNiHJ6sg_1r?usp=sharing)

In [1]:
# Mount Google Drive at /content/drive; a later setup cell copies a file from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Fetch the project repository and install the requirements file it ships for Colab.
!git clone -q https://github.com/Yanina-Kutovaya/RecSys-amazon.git
!pip install -r RecSys-amazon/requirements_Colab.txt -q  
# Download the two public dataset files into the repository's data/01_raw folder.
# NOTE(review): --no-check-certificate turns off TLS certificate verification for these downloads — confirm it is still required.
!wget -q -P /content/RecSys-amazon/data/01_raw https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Grocery_and_Gourmet_Food.csv --no-check-certificate
!wget -q -P /content/RecSys-amazon/data/01_raw https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Grocery_and_Gourmet_Food_5.json.gz --no-check-certificate
# The metadata archive is copied from the mounted Google Drive instead of being downloaded.
!cp /content/drive/MyDrive/OTUS_ML_advanced/06_RecSys/amazon/meta_Grocery_and_Gourmet_Food.json.gz /content/RecSys-amazon/data/01_raw/

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 KB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 KB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone


In [3]:
# Work from the repository root: later cells add the current directory to sys.path and read relative data/ paths.
%cd RecSys-amazon

/content/RecSys-amazon


In [4]:
import sys
import os

# Make the repository root and the package directory importable: the import
# cell uses both `src.recsys_amazon...` and bare `data...` / `features...` modules.
# Only missing entries are appended, so re-running this cell does not pile up
# duplicate sys.path entries.
for module_dir in (os.getcwd(), os.path.join(os.getcwd(), "src", "recsys_amazon")):
  if module_dir not in sys.path:
    sys.path.append(module_dir)

In [5]:
import random
import pandas as pd
import numpy as np

from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import cross_validate

from data.make_dataset import build_dataset
from features.user_features import fit_transform_user_features, get_text_embeddings
from features.item_features import fit_transform_item_features
from features.new_item_user_features import get_user_item_features
from features.candidates_lvl_2 import get_candidates
from features.targets import get_targets_lvl_2
from models.serialize import store, load
from scripts.train_save_model import train_store
from src.recsys_amazon.metrics import (
   get_results, adjust_results_for_metrics, precision_at_k
)
from models.save_artifacts import (
    save_dataset, 
    load_dataset,
    save_user_features,
    save_item_featutes, 
    save_user_item_features,
    save_candidates,
    save_train_dataset_lvl_2  
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [7]:
# Google Drive folder of this project (the setup cell copies the metadata archive from it).
PATH = '/content/drive/MyDrive/OTUS_ML_advanced/06_RecSys/amazon/'
# Seed used by set_all_seeds() and as random_state of the SVD models.
DEFAULT_RANDOM_SEED = 25
# Passed to get_candidates() as n_items.
N_ITEMS = 100

In [8]:
def set_all_seeds(seed=DEFAULT_RANDOM_SEED):
  """Seed the global RNGs used by this notebook.

  Seeds Python's `random` module and NumPy's legacy global generator, and
  stores the seed in the PYTHONHASHSEED environment variable.

  Note: Python reads PYTHONHASHSEED at interpreter start-up, so the assignment
  affects processes started afterwards, not hashing in the running kernel.

  Args:
    seed: integer seed; defaults to DEFAULT_RANDOM_SEED.
  """
  os.environ['PYTHONHASHSEED'] = str(seed)
  for seed_generator in (random.seed, np.random.seed):
    seed_generator(seed)

set_all_seeds()

In [9]:
# True: rebuild datasets/candidates/features and save them; False: load previously saved copies instead.
regenerate = True

## 1. Data ingestion

In [10]:
# Reuse the saved frames unless a rebuild is requested; a rebuild also saves its result.
if not regenerate:
  data_train_lvl_1, data_val_lvl_1, data_val_lvl_2, item_features, user_reviews = load_dataset()
else:
  (
      data_train_lvl_1,
      data_val_lvl_1,
      data_val_lvl_2,
      item_features,
      user_reviews,
  ) = build_dataset()
  save_dataset(data_train_lvl_1, data_val_lvl_1, data_val_lvl_2, item_features, user_reviews)

In [11]:
# Shape check and two-row preview of the level-1 training ratings.
print(f'data_train_lvl_1.shape = {data_train_lvl_1.shape}\n')
data_train_lvl_1.head(2)

data_train_lvl_1.shape = (8433, 4)



Unnamed: 0,item_id,user_id,rating,timestamp
0,B00005BPQ9,A2XCLJRGFANRC,5.0,1425168000
1,B00005BPQ9,AHKXYQ7TP6REK,4.0,1419206400


In [12]:
# Shape check and two-row preview of the level-1 validation ratings.
print(f'data_val_lvl_1.shape = {data_val_lvl_1.shape}\n')
data_val_lvl_1.head(2)

data_val_lvl_1.shape = (7925, 4)



Unnamed: 0,item_id,user_id,rating,timestamp
0,B00005BPQ9,A3TH42PC3EZV1B,4.0,1445558400
1,B00006FMLY,A8CGWGJ9OM58J,5.0,1455753600


In [13]:
# Shape check and two-row preview of the level-2 validation ratings.
print(f'data_val_lvl_2.shape = {data_val_lvl_2.shape}\n')
data_val_lvl_2.head(2)

data_val_lvl_2.shape = (1646, 4)



Unnamed: 0,item_id,user_id,rating,timestamp
0,B0000D916Y,A1STWKVUMXNZAB,3.0,1524096000
1,B0000D916Y,A2RI05C8BVQIWB,4.0,1513468800


In [14]:
# Shape check and two-row preview of the raw item metadata.
print(f'item_features.shape = {item_features.shape}\n')
item_features.head(2)

item_features.shape = (2529, 15)



Unnamed: 0,item_id,price,main_cat,category_1,category_2,brand,rank,rank_group,title,description,len_also_view,also_view,len_also_buy,also_buy,category_3
0,B00005BPQ9,0.0,Grocery,Candy & Chocolate,Candy & Chocolate Assortments,Whoppers,413552.0,Grocery & Gourmet Food,WHOPPERS Candy (Chocolate Covered Malted Milk ...,WHOPPERS Malted Milk Balls are the classic con...,2.0,"['B0016G1H8G', 'B0029JVUAE']",14.0,"['B0029JVUAE', 'B00374XTQI', 'B000IXUK2W', 'B0...",
1,B00006FMLY,10.41,Amazon Home,Cooking & Baking,"Frosting, Icing & Decorations",Wilton,13984.0,Kitchen & Dining (See Top 100 in Kitchen & Din...,"Wilton Jimmies Rainbow Sprinkle Assortment, 3....",Sprinkle your treats with sweet color using th...,0.0,[],0.0,[],Sprinkles & Glitters


In [15]:
# Shape check and two-row preview of the review texts.
print(f'user_reviews.shape = {user_reviews.shape}\n')
user_reviews.head(2)

user_reviews.shape = (10927, 5)



Unnamed: 0,user_id,item_id,timestamp,text,len_text
0,A3TH42PC3EZV1B,B00005BPQ9,1445558400,"Delicious, and a good value... What more is to...",55
1,A8CGWGJ9OM58J,B00006FMLY,1455753600,great for decorating.,21


## 2. Build the 1st level recommender system

### 2.1. Recommender evaluation on cross-validation

In [16]:
# 5-fold cross-validation of a default SVD on the level-1 training ratings.
# `reader` and `train_data` are reused by later cells.
reader = Reader(line_format='user item rating', rating_scale=(1, 5))
ratings_lvl_1 = data_train_lvl_1[['user_id', 'item_id', 'rating']]
train_data = Dataset.load_from_df(ratings_lvl_1, reader)

algo = SVD(random_state=DEFAULT_RANDOM_SEED)
res = cross_validate(algo, train_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Summarise the per-fold scores as mean and standard deviation, rounded to 3 decimals.
rmse_scores, mae_scores = res['test_rmse'], res['test_mae']
rmse_mean, rmse_std = round(rmse_scores.mean(), 3), round(rmse_scores.std(), 3)
mae_mean, mae_std = round(mae_scores.mean(), 3), round(mae_scores.std(), 3)

print(f'\nRMSE = {rmse_mean} +/- {rmse_std}')
print(f'MAE = {mae_mean} +/- {mae_std}')

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9136  0.8927  0.9144  0.9255  0.8943  0.9081  0.0126  
MAE (testset)     0.6210  0.6264  0.6364  0.6436  0.6183  0.6291  0.0095  
Fit time          0.07    0.08    0.07    0.07    0.07    0.07    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    

RMSE = 0.908 +/- 0.013
MAE = 0.629 +/- 0.01


### 2.2. Recommender evaluation on hold-out dataset

In [17]:
# Refit SVD on every level-1 training rating (no folds held out); the hold-out cells below score this model.
trainset = train_data.build_full_trainset()
recommender = SVD(random_state=DEFAULT_RANDOM_SEED)
# fit() is the last expression, so the estimator it returns is shown as the cell output.
recommender.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd232c551c0>

In [18]:
def recommender_evaluation(data_val_lvl_1):
  """Print RMSE and MAE of the global `recommender` on a ratings frame.

  Args:
    data_val_lvl_1: DataFrame with `user_id`, `item_id` and `rating` columns.
      Rows are read positionally, so any index is accepted (a filtered frame
      does not need `reset_index()` first).

  Each metric appears twice in the output: `accuracy.rmse` / `accuracy.mae`
  print a rounded value themselves (verbose=True), and the full-precision
  value they return is printed as well.
  """
  ratings = data_val_lvl_1[['user_id', 'item_id', 'rating']]
  # `test` takes an iterable of (raw user id, raw item id, true rating) triples.
  # Building them with itertuples avoids one label-based `.loc[i]` lookup per row,
  # which was slow and raised KeyError on frames without a 0..n-1 index.
  validset = [
      (user_id, item_id, float(rating))
      for user_id, item_id, rating in ratings.itertuples(index=False, name=None)
  ]
  predictions = recommender.test(validset)
  print(accuracy.rmse(predictions, verbose=True))
  print(accuracy.mae(predictions, verbose=True))

#### 2.2.1. Full validation dataset

In [19]:
recommender_evaluation(data_val_lvl_1)

RMSE: 0.9811
0.9811105984535157
MAE:  0.6944
0.6943896179704978


#### 2.2.2. Validation dataset without new users and items

In [20]:
# Split the validation users/items into those already present in the level-1
# training data ("current") and those that are not ("new").
val_users, train_users = set(data_val_lvl_1["user_id"]), set(data_train_lvl_1["user_id"])
val_items, train_items = set(data_val_lvl_1["item_id"]), set(data_train_lvl_1["item_id"])

current_users = list(val_users & train_users)
current_items = list(val_items & train_items)

new_users = list(val_users - train_users)
new_items = list(val_items - train_items)
print(f'New users: {len(new_users)} out of {data_val_lvl_1["user_id"].nunique()}')
# This line reports items; it was previously mislabeled "New users".
print(f'New items: {len(new_items)} out of {data_val_lvl_1["item_id"].nunique()}')

New users: 802 out of 2047
New users: 852 out of 2320


In [21]:
# Keep only validation rows whose user AND item both appear in the level-1 training data.
cond_1 = data_val_lvl_1['user_id'].isin(current_users)
cond_2 = data_val_lvl_1['item_id'].isin(current_items)
# reset_index() renumbers the surviving rows 0..n-1; the old labels are kept in an extra `index` column.
df = data_val_lvl_1[cond_1 & cond_2].reset_index()
recommender_evaluation(df)

RMSE: 0.9559
0.9559379575017717
MAE:  0.6583
0.6582566458585274


### 2.3. Final model

In [22]:
def get_recommender(
    data_train_lvl_1,
    seed=DEFAULT_RANDOM_SEED
  ):
  """Fit an SVD recommender on all ratings of the given frame.

  Args:
    data_train_lvl_1: DataFrame with `user_id`, `item_id` and `rating` columns.
    seed: value passed to SVD as `random_state`.

  Returns:
    The fitted `surprise.SVD` instance.
  """
  ratings = data_train_lvl_1[['user_id', 'item_id', 'rating']]
  rating_reader = Reader(line_format='user item rating', rating_scale=(1, 5))
  full_trainset = Dataset.load_from_df(ratings, rating_reader).build_full_trainset()

  # fit() returns the estimator itself, so the fitted model can be returned directly.
  return SVD(random_state=seed).fit(full_trainset)

In [23]:
# Rebind `recommender` to a model fitted by get_recommender(); this is the object handed to get_candidates() below.
recommender = get_recommender(data_train_lvl_1)

#### Select candidates for the 2nd level dataset

In [24]:
# Level-2 candidate (user_id, item_id) pairs: reload the saved parquet file, or
# rebuild them with the fitted level-1 recommender and save the result.
candidates_file = "data/03_primary/candidates_lvl_2.parquet.gzip"
if not regenerate:
  candidates_lvl_2 = pd.read_parquet(candidates_file)
else:
  candidates_lvl_2 = get_candidates(
      recommender, data_train_lvl_1, data_val_lvl_1, data_val_lvl_2, n_items=N_ITEMS
  )
  save_candidates(candidates_lvl_2)

print(f'candidates_lvl_2.shape = {candidates_lvl_2.shape}\n')
candidates_lvl_2.head(2)

candidates_lvl_2.shape = (224100, 2)



Unnamed: 0,user_id,item_id
837121,AZX0JC5BPPV25,B000EDDS6Q
837201,AZX0JC5BPPV25,B00CIW2KF6


## 3. Build dataset for the 2nd level model

### 3.1. Generate features

In [25]:
# User features derived from the review texts: reload the saved parquet file,
# or recompute them from `user_reviews` and save the result.
user_features_file = "data/04_feature/user_features_transformed.parquet.gzip"
if not regenerate:
  user_features_transformed = pd.read_parquet(user_features_file)
else:
  user_features_transformed = fit_transform_user_features(user_reviews)
  save_user_features(user_features_transformed)

print(f'user_features_transformed.shape = {user_features_transformed.shape}\n')
user_features_transformed.head(2)

  0%|          | 0/30 [00:00<?, ?it/s]

user_features_transformed.shape = (10927, 58)



Unnamed: 0,user_id,item_id,len_text,n_words,n_sentences,mean_sent_len,len_1st_sentence,n_words_1st_sentence,r_0,r_1,...,r_40,r_41,r_42,r_43,r_44,r_45,r_46,r_47,r_48,r_49
0,A3TH42PC3EZV1B,B00005BPQ9,55,11,1,55.0,55,11,-0.041628,6.9e-05,...,0.041438,-0.005693,0.017075,0.014315,0.053603,0.02476,-0.033107,0.046196,0.018372,-0.00651
1,A8CGWGJ9OM58J,B00006FMLY,21,3,1,21.0,21,3,-0.0001,0.025156,...,0.029046,-0.037217,0.009144,0.033788,0.033577,-0.006208,-0.008406,0.03542,0.031174,-0.018744


In [26]:
# Item features: reload the saved parquet file, or recompute them from the raw
# item metadata and save the result.
item_features_file = "data/04_feature/item_features_transformed.parquet.gzip"
if not regenerate:
  item_features_transformed = pd.read_parquet(item_features_file)
else:
  item_features_transformed = fit_transform_item_features(item_features)
  # `save_item_featutes` is spelled exactly as the helper imported from models.save_artifacts.
  save_item_featutes(item_features_transformed)

print(f'item_features_transformed.shape = {item_features_transformed.shape}\n')
item_features_transformed.head(2)

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

item_features_transformed.shape = (2529, 276)



Unnamed: 0,item_id,price,rank,len_also_view,len_also_buy,main_cat_count,category_1_count,category_2_count,brand_count,rank_group_count,...,s1_2_40,s1_2_41,s1_2_42,s1_2_43,s1_2_44,s1_2_45,s1_2_46,s1_2_47,s1_2_48,s1_2_49
0,B00005BPQ9,0.0,413552.0,2.0,14.0,0.948992,0.132068,0.007908,0.668248,0.902728,...,,,,,,,,,,
1,B00006FMLY,10.41,13984.0,0.0,0.0,0.008699,0.154607,0.004745,0.002768,0.005536,...,1.019566e-12,1.907818e-12,-1.408269e-12,-2.901878e-12,8.644447e-13,1.320685e-12,-7.519267e-13,9.130066e-16,9.245289e-13,-1.446427e-12


In [27]:
# User-item aggregate features: reload the saved parquet file, or recompute
# them from the level-1 validation ratings plus item metadata and save the result.
user_item_features_file = "data/04_feature/user_item_features.parquet.gzip"
if not regenerate:
  user_item_features = pd.read_parquet(user_item_features_file)
else:
  user_item_features = get_user_item_features(data_val_lvl_1, item_features)
  save_user_item_features(user_item_features)

print(f'user_item_features.shape = {user_item_features.shape}\n')
user_item_features.head(2)

user_item_features.shape = (7925, 30)



Unnamed: 0,user_id,item_id,mean_rating_user,n_rated_user,mean_price_user,total_spent_user,n_rated_item,total_spent_item,n_users_brand,n_ratings_brand,...,mean_rating_category_2,mean_price_category_2,total_spent_category_2,n_brands_category_2,n_users_category_3,n_ratings_category_3,mean_rating_category_3,mean_price_category_3,total_spent_category_3,n_brands_category_3
0,A3TH42PC3EZV1B,B00005BPQ9,4.666667,12,17.054167,204.65,1,0.0,1.0,1.0,...,4.676471,7.733824,262.95,13.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A8CGWGJ9OM58J,B00006FMLY,5.0,25,16.2604,406.51,1,10.41,10.0,12.0,...,4.875,12.176875,194.83,7.0,1.0,1.0,5.0,10.41,10.41,1.0


### 3.2. Generate train dataset

In [28]:
# Level-2 training table: built from the level-1 validation ratings, the
# candidate pairs and the three feature frames, or reloaded from the saved parquet file.
if regenerate:
  train_dataset_lvl_2 = get_targets_lvl_2(
      data_val_lvl_1,
      candidates_lvl_2,
      item_features_transformed,
      user_features_transformed,
      user_item_features
  )
  save_train_dataset_lvl_2(train_dataset_lvl_2)
else:
  train_dataset_lvl_2 = pd.read_parquet(
      "data/05_model_input/train_dataset_lvl_2.parquet.gzip"
  )
# The label names the variable actually inspected (it previously read `targets_lvl_2`).
print(f'train_dataset_lvl_2.shape = {train_dataset_lvl_2.shape}\n')
train_dataset_lvl_2.tail(2)

targets_lvl_2.shape = (231024, 362)



Unnamed: 0,user_id,item_id,target,price,rank,len_also_view,len_also_buy,main_cat_count,category_1_count,category_2_count,...,mean_rating_category_2,mean_price_category_2,total_spent_category_2,n_brands_category_2,n_users_category_3,n_ratings_category_3,mean_rating_category_3,mean_price_category_3,total_spent_category_3,n_brands_category_3
233881,AJ2F2NP4MXH1G,B01DD3NOMQ,1.0,0.0,684494.0,0.0,0.0,0.948992,0.289047,0.074733,...,4.30687,15.756901,10320.77,116.0,40.0,57.0,4.666667,19.904211,1134.54,20.0
233883,A1K92V58V3MSQ1,B01GK5IDFC,1.0,14.26,179413.0,2.0,2.0,0.948992,0.020957,0.011467,...,4.53271,14.55,1556.85,20.0,31.0,46.0,4.5,17.84413,820.83,5.0


## 4. Train the model

In [29]:
# Train the level-2 model on the assembled table; presumably it is stored under 'baseline_v1',
# the same name the inference section passes to load() — verify in scripts.train_save_model.
train_store(train_dataset_lvl_2, 'baseline_v1')

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[6]	training's auc: 0.999995	valid_1's auc: 0.999571


## 5. Inference

In [30]:
# Reload the model by the name used in the training cell above.
model_lgb = load('baseline_v1')

In [31]:
# Score every row of the level-2 table. The feature matrix is built inline so
# that no extra copy of the frame stays bound to a notebook variable.
predictions_train = model_lgb.predict(
    train_dataset_lvl_2
    .iloc[:, 2:]              # skip the two leading id columns (user_id, item_id)
    .drop(columns='target')   # the label is not a feature
    .fillna(0)
)
print(f'predictions_train.shape = {predictions_train.shape}\n')
predictions_train[:7]

predictions_train.shape = (231024,)



array([0.03269133, 0.03269133, 0.03269133, 0.03269133, 0.03269133,
       0.03269133, 0.03269133])

In [32]:
# Combine the level-2 validation ratings with the scored candidate table; judging by the
# preview, each row holds one user's `actual`, `actual_adj` and `recommendations` lists.
# NOTE(review): how `actual_adj` differs from `actual` is defined in src.recsys_amazon.metrics — not visible here.
result_lvl_2 = get_results(data_val_lvl_2, train_dataset_lvl_2, predictions_train)

print(f'result_lvl_2.shape = {result_lvl_2.shape}\n')
result_lvl_2.head(2)    

result_lvl_2.shape = (657, 4)



Unnamed: 0,user_id,actual,actual_adj,recommendations
0,A104FF3C0UPVA5,[B0028PQ9BE],[B0028PQ9BE],"[B001FA1S4Q, B00KRFLDBS, B00ADYXY7E, B00BBJLIB..."
1,A10CRW7XRJBJ2G,"[B001M09BQQ, B002F1PSZY]","[B001M09BQQ, B002F1PSZY]","[B00BJ8U1MO, B000FA7Q2A, B0001CXUHW, B0005ZVGL..."


## 6. Metrics

In [33]:
# Prepare the per-user results for metric computation. In the saved run this keeps fewer rows
# than `result_lvl_2` and adds a `len_actual_adj` column; the selection rule lives in
# src.recsys_amazon.metrics and is not visible here.
adjusted_results_lvl_2 = adjust_results_for_metrics(result_lvl_2)

print(f'adjusted_results_lvl_2.shape = {adjusted_results_lvl_2.shape}\n')
adjusted_results_lvl_2.sample(5)

adjusted_results_lvl_2.shape = (87, 5)



Unnamed: 0,user_id,actual,actual_adj,recommendations,len_actual_adj
56,A1D42WHLILI5AK,"[B0014EOU1G, B0014EOUWU, B0014ET2MI, B0014EQI4...","[B0014EOU1G, B0014EOUWU, B0014ET2MI, B0014EQI4...","[B000MIFS4S, B003ZVG4WY, B00ZK6FXPG, B00M31BOR...",8
15,A12TX53NK2DH4B,"[B00BUKL666, B00D3M2QP4, B008QMX2SG, B00KSN9TM...","[B00BUKL666, B00D3M2QP4, B008QMX2SG, B00KSN9TM...","[B00XOORKRK, B00ZK6FXPG, B010ULFOWC, B012NC1KB...",6
373,A356O4DG52SUQR,"[B001M1V5P0, B002Z08ROO, B005FC73VW, B00NMJ05W...","[B001M1V5P0, B002Z08ROO, B005FC73VW, B00NMJ05W...","[B00AWJ2ZFM, B00BUKL666, B00D3M2QP4, B008QMX2S...",5
107,A1NS1F7GLLSEZI,"[B00099XNG0, B0010VSBPO, B001N2GRX8, B00HK4AN7...","[B00099XNG0, B0010VSBPO, B001N2GRX8, B00HK4AN7...","[B0014DUG7E, B001AY4T98, B0019FEOYS, B000NMI5K...",7
558,AFBPB7ZIXS9BP,"[B000CQ01NS, B000CQ01GU, B000EFPVLY, B001EO5Y2...","[B000CQ01NS, B000CQ01GU, B000EFPVLY, B001EO5Y2...","[B005C1GRRM, B00856TSCC, B000LKZ78E, B002L9TQ3...",6


In [34]:
# Mean precision@5 over users: precision_at_k is evaluated per row and the results are averaged.
# NOTE(review): the metric is computed against `actual`, while the inspection cells below read
# `actual_adj` — confirm which column is intended.
metrics = adjusted_results_lvl_2.apply(
    lambda row: precision_at_k(row['recommendations'], row['actual'], 5), axis=1
).mean()

print(f'precision@5 = {metrics}')

precision@5 = 0.05287356321839081


In [35]:
# Spot-check one user: `i` is a row label of adjusted_results_lvl_2.
i = 107
actual = adjusted_results_lvl_2.loc[i, "actual_adj"]
# NOTE(review): `.loc[actual, 'title']` looks rows up by index label, so item_features must be
# indexed by item id here. The preview in section 1 shows a default integer index — presumably
# an earlier feature step re-indexes the frame in place; verify, especially for regenerate = False.
item_features.loc[actual, 'title'].unique().tolist()

['Chex Snack Mix Bold Party Blend, 8.75-Ounce Bags (Pack of 12)',
 'Chex Mix Trail Mix, Sweet &amp; Salty, 8.75-Ounce Bags (Pack of 12)',
 'CLIF BAR - Energy Bar - Apricot - (2.4 Ounce Protein Bar, 12 Count)',
 "Hall's Chocolate Walnut Fudge, 1 Pound",
 "Hall's Penuche Walnut Fudge, 1 Pound",
 'Planters Mixed Nuts, Lightly Salted Deluxe Mixed Nuts, 15.25 Ounce',
 'Basilur | Gift Tea Set | Tea Book -Vol 2 | Collectable Metal Tin Caddy | Pure Ceylon Black Tea with fruits| 100g /3.52 oz.']

In [36]:
# Titles of the items recommended to the same user (row label `i` from the previous cell).
recommendations = adjusted_results_lvl_2.loc[i, "recommendations"]
# NOTE(review): label-based lookup — requires item_features to be indexed by item id at this point; verify.
item_features.loc[recommendations, 'title'].unique().tolist()

['Snickers Almond 24 Bars',
 'Wilton Jordan Almonds, Ideal for Individually Packaged Edible Wedding and Baby Shower Favors, Use for Easter or Spring Celebrations, Mega Pack, Assorted Pastel Colors (44 oz.)',
 'LU Cookies Shortbread, Scottish Recipe, 4.41-Ounce Boxes (Pack of 6)',
 'PAYDAY Peanut Caramel Candy Bar(Pack of 24)',
 'KIND Bars, Dark Chocolate Nuts &amp; Sea Salt, Gluten Free, 1.4 Ounce Bars, 12 Count']