<a href="https://colab.research.google.com/github/Yanina-Kutovaya/RecSys-retail/blob/main/notebooks/Baseline_model_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# One-time environment setup, kept commented out: download, install and
# configure AWS CLI v2. Uncomment the lines to run them as shell commands.
#!curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o /tmp/awscliv2.zip
#!unzip -q /tmp/awscliv2.zip -d /tmp
#!rm /tmp/awscliv2.zip
#!sudo /tmp/aws/install --update
#!rm -rf /tmp/aws/ 
#!aws configure
# Presumably the region to enter at the `aws configure` prompt:
#region eu-west-1

In [None]:
# One-time environment setup, kept commented out: clone the project repository
# and install the requirements file it ships for Colab.
#!git clone https://github.com/Yanina-Kutovaya/RecSys-retail.git
#!pip install -r RecSys-retail/requirements_Colab.txt

In [None]:
# Work from the repository root: the import paths set up in the next cell and
# the relative `data/...` paths used later are resolved against the current
# working directory. Run this cell only once per session.
%cd RecSys-retail

/content/RecSys-retail


In [None]:
import sys
import os

# Make both the repository root and the package directory importable.
_repo_root = os.getcwd()
sys.path.extend([_repo_root, os.path.join(_repo_root, "src", "recsys_retail")])

In [None]:
import logging
import pandas as pd
import joblib

from data.make_dataset import load_data
from features.data_time_split import time_split, time_split_2
from features.prefilter import prefilter_items
from features.user_features import fit_transform_user_features
from features.item_features import fit_transform_item_features
from features.recommenders import MainRecommender
from features.candidates_lvl_2 import get_candidates
from features.new_item_user_features import get_user_item_features
from features.targets import get_targets_lvl_2
from data.validation import train_test_split
from models.serialize import store, load
from scripts.train_save_model import train_store
from src.recsys_retail.metrics import (
   get_results, adjust_results_for_metrics, precision_at_k
)

In [None]:
import warnings

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# The notebook reports its progress through `logging.info(...)`. The root
# logger defaults to WARNING, which silently drops those messages, so lower
# the threshold to INFO to make them visible.
logging.getLogger().setLevel(logging.INFO)

# 1. Data ingestion

In [None]:
# Kept commented out: pull the DVC-tracked files into the working copy.
#!dvc pull

In [None]:
# Load the three input tables: transactions, item attributes and user attributes.
data, item_features, user_features = load_data()

In [None]:
# Size of the raw transactions table, followed by a two-row preview.
print(f'data.shape = {data.shape}\n')
data.head(2)

data.shape = (2396804, 12)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [None]:
# Size of the item attributes table, followed by a two-row preview.
print(f'item_features.shape = {item_features.shape}\n')
item_features.head(2)

item_features.shape = (92353, 7)



Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [None]:
# Size of the user attributes table, followed by a two-row preview.
print(f'user_features.shape = {user_features.shape}\n')
user_features.head(2)

user_features.shape = (801, 8)



Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


# 2. Prefilter transactions data

In [None]:
logging.info('Preprocessing level 1 train dataset...')

# Snapshot the dataset size (unique items, unique users, rows) before filtering.
i0, u0, t0 = data['item_id'].nunique(), data['user_id'].nunique(), len(data)

# `data` is rebound to the filtered frame, so this cell is not idempotent:
# re-running it filters the already-filtered transactions.
data = prefilter_items(data, item_features)

# The same three counts after filtering.
i1, u1, t1 = data['item_id'].nunique(), data['user_id'].nunique(), len(data)

print(f'The number of items decreased from {i0} to {i1}')
print(f'The number of users decreased from {u0} to {u1}')
print(f'The number of transactions decreased from {t0} to {t1}')

The number of items decreased from 89051 to 2501
The number of users decreased from 2499 to 2472
The number of transactions decreased from 2396804 to 414502


In [None]:
# Size of the transactions table after prefiltering, followed by a two-row preview.
print(f'data.shape = {data.shape}\n')
data.head(2)

data.shape = (414502, 13)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0,2.99
14,1364,26984896261,1,937406,1,2.5,31742,-0.99,1520,1,0.0,0.0,2.5


# 3. Train-validation-test time split for two-stage recommender system

Train - validation - test schema:

-- old purchases -- | -- 6 weeks-- | -- 3 weeks--

In [None]:
logging.info('Splitting dataset for level 1, level 2 preprocessing...')

# Split the transactions into three parts following the schema above:
# level 1 train, level 2 train and level 2 validation.
data_train_lvl_1, data_train_lvl_2, data_val_lvl_2 = time_split(data)

In [None]:
# Row counts of the three splits, in the order they were returned.
tuple(len(part) for part in (data_train_lvl_1, data_train_lvl_2, data_val_lvl_2))

(363397, 30113, 20992)

# 4. Level 1 

In [None]:
# Logger named after the current module (`__main__` inside a notebook).
logger = logging.getLogger(__name__)

# Destination of the preprocessed level 1 training transactions.
PATH_1 = 'data/02_intermediate/'
DATA_TRAIN_LVL_1_PATH = PATH_1 + 'data_train_lvl_1.csv.zip'

# Value passed as `n_items` to `get_candidates` and `get_targets_lvl_2`.
N_ITEMS = 200
# Destination of the level 2 training dataset.
PATH_2 = 'data/05_model_input/'
TRAIN_DATASET_LVL_2_PATH = PATH_2 + 'train_dataset_lvl_2.csv.zip'

## 4.1. Preprocess user features and merge with transactions data 

In [None]:
# Transform the raw user attributes into the feature table used for the
# merges and the level 2 dataset below.
user_features_transformed = fit_transform_user_features(user_features)

print(f'user_features_transformed.shape = {user_features_transformed.shape}\n')
user_features_transformed.head(2)

user_features_transformed.shape = (801, 14)



Unnamed: 0,user_id,age_desc,income_desc,homeowner_desc,household_size_desc,kid_category_desc,marital_status_code A,marital_status_code B,marital_status_code U,hh_comp_desc Single Male,hh_comp_desc Single Female,hh_comp_desc 2 Adults No Kids,hh_comp_desc 1 Adult Kids,hh_comp_desc 2 Adults Kids
0,1,5.0,3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,7,3.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# Attach the transformed user features to every level 1 transaction. The left
# join keeps transactions whose user has no feature row. The frame is rebound
# to the same name, so run this cell only once per session.
data_train_lvl_1 = data_train_lvl_1.merge(
    user_features_transformed, how='left', on='user_id'
)
print(f'data_train_lvl_1.shape = {data_train_lvl_1.shape}\n')
data_train_lvl_1.head(2)

data_train_lvl_1.shape = (363397, 26)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,household_size_desc,kid_category_desc,marital_status_code A,marital_status_code B,marital_status_code U,hh_comp_desc Single Male,hh_comp_desc Single Female,hh_comp_desc 2 Adults No Kids,hh_comp_desc 1 Adult Kids,hh_comp_desc 2 Adults Kids
0,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,...,,,,,,,,,,
1,1364,26984896261,1,937406,1,2.5,31742,-0.99,1520,1,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


## 4.2. Preprocess item features and merge with transactions data

In [None]:
# Transform the raw item attributes into the feature table used for the
# merges and the level 2 dataset below.
item_features_transformed = fit_transform_item_features(item_features)

print(f'item_features_transformed.shape = {item_features_transformed.shape}\n')
item_features_transformed.head(2)

item_features_transformed.shape = (92353, 7)



Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,0.111243,1.0,1.0,0.010057,0.027888,0.000359
1,26081,0.111243,0.012532,1.0,0.175647,0.426295,1.0


In [None]:
# Attach the transformed item features to every level 1 transaction. The left
# join keeps transactions whose item has no feature row. The frame is rebound
# to the same name, so run this cell only once per session.
data_train_lvl_1 = data_train_lvl_1.merge(
    item_features_transformed, how='left', on='item_id'
)
print(f'data_train_lvl_1.shape = {data_train_lvl_1.shape}\n')
data_train_lvl_1.head(2)

data_train_lvl_1.shape = (363397, 32)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,hh_comp_desc Single Female,hh_comp_desc 2 Adults No Kids,hh_comp_desc 1 Adult Kids,hh_comp_desc 2 Adults Kids,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,...,,,,,0.0157,1.0,1.0,0.341236,0.130478,0.025158
1,1364,26984896261,1,937406,1,2.5,31742,-0.99,1520,1,...,1.0,0.0,0.0,0.0,0.004418,0.062173,1.0,0.09375,0.047809,0.00183


In [None]:
logging.info('Saving preprocessed level 1 train dataset...')

# Persist the merged level 1 frame as a zip-compressed CSV without the index.
train_data_lvl_1_path = DATA_TRAIN_LVL_1_PATH
data_train_lvl_1.to_csv(train_data_lvl_1_path, index=False, compression='zip')

# 5. Level 2

## 5.1. Build a recommender

In [None]:
logging.info('Selecting users for level 2 dataset...')

# ALS settings forwarded to the recommender as keyword arguments.
_als_params = dict(
    n_factors_ALS=50,
    regularization_ALS=0.001,
    iterations_ALS=15,
    num_threads_ALS=4,
)
# Build the level 1 recommender from the merged level 1 transactions. Later
# cells reuse this same object for candidate generation and feature building.
recommender = MainRecommender(data_train_lvl_1, **_als_params)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/2501 [00:00<?, ?it/s]

## 5.2. Generate users for level 2 with their preferences

In [None]:
# Build one candidate list per level 2 user (see the `candidates` column in
# the preview). Presumably `n_items` caps the length of each list - confirm
# against `get_candidates`.
n_items = N_ITEMS
users_lvl_2 = get_candidates(
    recommender, data_train_lvl_1, data_train_lvl_2, data_val_lvl_2, n_items
)
print(f'\nusers_lvl_2.shape = {users_lvl_2.shape}\n')
users_lvl_2.head(2)


users_lvl_2.shape = (2101, 2)



Unnamed: 0,user_id,candidates
0,2021,"[1088959, 987628, 1048962, 1067425, 13115279, ..."
1,1753,"[1088959, 987628, 1048962, 1067425, 13115279, ..."


## 5.3. Generate new user-item features, including user and item embeddings

In [None]:
logging.info('Generating new user-item features...')
# Derive user-item features from the level 1 transactions and the fitted
# recommender; the result is passed to `get_targets_lvl_2` below.
user_item_features = get_user_item_features(recommender, data_train_lvl_1)

print(f'user_item_features.shape = {user_item_features.shape}\n')
user_item_features.head(2)

user_item_features.shape = (363397, 240)



Unnamed: 0,user_id,item_id,sales_value,retail_disc,coupon_disc,coupon_match_disc,price,age_desc,income_desc,homeowner_desc,...,user_factor_41,user_factor_42,user_factor_43,user_factor_44,user_factor_45,user_factor_46,user_factor_47,user_factor_48,user_factor_49,user_factor_50
0,2375,1085983,2.99,-0.4,0.0,0.0,2.99,,,,...,-2.690238,1.054461,0.742704,-1.799052,2.279338,1.229397,3.93216,3.648286,-0.463106,-4.679028
1,2375,1085983,3.49,0.0,0.0,0.0,3.49,,,,...,-2.690238,1.054461,0.742704,-1.799052,2.279338,1.229397,3.93216,3.648286,-0.463106,-4.679028


## 5.4. Generate train dataset for level 2 model

In [None]:
logging.info('Generating train dataset for level 2 model...')

# Assemble the level 2 training table from the candidates, the level 2
# transactions and the three feature tables; it carries a `target` column.
train_dataset_lvl_2 = get_targets_lvl_2(
    users_lvl_2, data_train_lvl_2,
    item_features_transformed, user_features_transformed,
    user_item_features, n_items,
)
print(f'train_dataset_lvl_2.shape = {train_dataset_lvl_2.shape}\n')
train_dataset_lvl_2.head(2)

train_dataset_lvl_2.shape = (453441, 260)



Unnamed: 0,user_id,item_id,target,manufacturer_x,department_x,brand_x,commodity_desc_x,sub_commodity_desc_x,curr_size_of_product_x,age_desc_x,...,user_factor_41,user_factor_42,user_factor_43,user_factor_44,user_factor_45,user_factor_46,user_factor_47,user_factor_48,user_factor_49,user_factor_50
0,2021,1088959,0.0,0.006075,1.0,1.0,0.294181,0.178287,0.049369,,...,,,,,,,,,,
1,2021,987628,0.0,1.0,1.0,0.0,0.051724,0.023904,0.030648,,...,,,,,,,,,,


In [None]:
logging.info('Saving train dataset level 2...')

# Persist the level 2 training table as a zip-compressed CSV without the index.
train_dataset_lvl_2_path = TRAIN_DATASET_LVL_2_PATH
train_dataset_lvl_2.to_csv(
    train_dataset_lvl_2_path, index=False, compression='zip'
)

# 6. Training the model

In [None]:
# Train on the level 2 dataset. The name 'baseline_v1' is the same one passed
# to `load` in the inference section, so presumably the fitted model is stored
# under it - confirm against `train_store`.
train_store(train_dataset_lvl_2, 'baseline_v1')

[LightGBM] [Info] Number of positive: 11332, number of negative: 351420
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41297
[LightGBM] [Info] Number of data points in the train set: 362752, number of used features: 259
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031239 -> initscore=-3.434352
[LightGBM] [Info] Start training from score -3.434352
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[288]	training's auc: 0.933658	valid_1's auc: 0.92552


# 7. Inference

In [None]:
# Load the model by the name used in the training cell above.
model_lgb = load('baseline_v1')

In [None]:
# Score every row of the level 2 table: the label column is removed and
# missing feature values are replaced with 0 before prediction.
predictions_train = model_lgb.predict(
    train_dataset_lvl_2.drop(columns='target').fillna(0)
)
print(f'predictions_train.shape = {predictions_train.shape}\n')
predictions_train[:7]

predictions_train.shape = (453441,)



array([0.0109462 , 0.01103865, 0.0109462 , 0.0109462 , 0.0109462 ,
       0.0109462 , 0.0109462 ])

In [None]:
# Combine the validation transactions with the scored level 2 rows into one
# row per user (see the `actual` / `recommendations` columns in the preview).
result_lvl_2 = get_results(data_val_lvl_2, train_dataset_lvl_2, predictions_train)

print(f'result_lvl_2.shape = {result_lvl_2.shape}\n')
result_lvl_2.head(2)

result_lvl_2.shape = (1792, 4)



Unnamed: 0,user_id,actual,actual_adj,recommendations
0,1,"[856942, 865456, 951954, 971585, 990656, 11311...","[865456, 990656, 1041796, 1115576]","[865456, 872137, 10149640, 1005274, 7025114]"
1,3,"[835476, 999999]",,"[1101173, 907631, 972931, 869577, 883932]"


# 8. Metrics

In [None]:
# Prepare the per-user results for metric computation. The recorded output
# shows fewer rows than `result_lvl_2` and an added `len_actual_adj` column.
adjusted_results_lvl_2 = adjust_results_for_metrics(result_lvl_2)

print(f'adjusted_results_lvl_2.shape = {adjusted_results_lvl_2.shape}\n')
adjusted_results_lvl_2.head(2)

adjusted_results_lvl_2.shape = (275, 5)



Unnamed: 0,user_id,actual,actual_adj,recommendations,len_actual_adj
12,19,"[944466, 999104, 1031083, 1048483, 6533236, 94...","[944466, 999104, 1048483, 940700, 6533608, 883...","[1101173, 999104, 837751, 1041796, 12301839]",9
14,22,"[916758, 935968, 1119830, 13158992, 999999, 10...","[825749, 880150, 944836, 995645, 5569374]","[944836, 835098, 880150, 990656, 999104]",5


In [None]:
def _precision_at_5_lvl_2(row):
    """Score one user's `recommendations` against the `actual` column with k=5."""
    return precision_at_k(row['recommendations'], row['actual'], 5)


# Average the per-user scores over all rows kept by the adjustment step.
# NOTE(review): rows are scored against `actual`, not `actual_adj` - confirm
# this is intended.
metrics = adjusted_results_lvl_2.apply(_precision_at_5_lvl_2, axis=1).mean()

print(f'precision@5 = {metrics}')

precision@5 = 0.2894545454545454


# 9. Training a model on a full dataset

## 9.1. Train-validation time split for two-stage recommender system

Train - validation schema:

-- old purchases -- | -- 6 weeks-- 

In [None]:
# Two-part split of the prefiltered transactions, following the schema above.
data_train, data_valid = time_split_2(data)

## 9.2. Generate users for level 2 with their preferences

In [None]:
# Candidate lists for the final model.
# NOTE(review): `recommender` is the object built in section 5.1 from
# `data_train_lvl_1`; it is reused here without being refit on `data_train`.
# Confirm that this is intended.
users_final = get_candidates(
    recommender, data_train, data_valid, n_items=n_items)
print(f'users_final.shape = {users_final.shape}\n')
users_final.head(2)

users_final.shape = (1979, 2)



Unnamed: 0,user_id,candidates
0,84,"[1088959, 987628, 1048962, 1067425, 13115279, ..."
1,2200,"[1088959, 987628, 1048962, 1067425, 13115279, ..."


## 9.3. Generate new user-item features, including user and item embeddings

In [None]:
logging.info('Generating new user-item features...')

# Same feature builder as in section 5.3, applied to `data_train`. Unlike
# `data_train_lvl_1`, `data_train` has had no user or item features merged into
# it in this notebook.
user_item_features_final = get_user_item_features(
    recommender, data_train)

print(f'user_item_features_final.shape = {user_item_features_final.shape}\n')
user_item_features_final.head(2)

user_item_features_final.shape = (378567, 221)



Unnamed: 0,user_id,item_id,sales_value,retail_disc,coupon_disc,coupon_match_disc,price,n_baskets_user,n_items_user,sales_value_user,...,user_factor_41,user_factor_42,user_factor_43,user_factor_44,user_factor_45,user_factor_46,user_factor_47,user_factor_48,user_factor_49,user_factor_50
0,2375,1085983,2.99,-0.4,0.0,0.0,2.99,131,131,506.55,...,-2.690238,1.054461,0.742704,-1.799052,2.279338,1.229397,3.93216,3.648286,-0.463106,-4.679028
1,2375,1085983,3.49,0.0,0.0,0.0,3.49,131,131,506.55,...,-2.690238,1.054461,0.742704,-1.799052,2.279338,1.229397,3.93216,3.648286,-0.463106,-4.679028


## 9.4. Generate train dataset for level 2 model

In [None]:
logging.info('Generating train dataset for level 2 model...')

# Assemble the final training table from the final candidates, the validation
# transactions and the three feature tables.
train_dataset_final = get_targets_lvl_2(
    users_final, data_valid,
    item_features_transformed, user_features_transformed,
    user_item_features_final, n_items,
)
print(f'train_dataset_final.shape = {train_dataset_final.shape}')
train_dataset_final.head(2)

In [None]:
# Number of rows per user in the final training table.
train_dataset_final['user_id'].value_counts()

In [None]:
logging.info('Saving train dataset final...')

# Same directory as `PATH_2` defined in section 4.
PATH = 'data/05_model_input/'
TRAIN_DATASET_FINAL_PATH = PATH + 'train_dataset_final.csv.zip'

# Persist the final training table as a zip-compressed CSV without the index.
train_dataset_final_path = TRAIN_DATASET_FINAL_PATH
train_dataset_final.to_csv(
    train_dataset_final_path, index=False, compression='zip'
)

## 9.5. Training the model 

In [39]:
# Train on the final dataset. The name 'LightGBM_v1_final' is the same one
# passed to `load` in section 9.6.3.
train_store(train_dataset_final, 'LightGBM_v1_final')

[LightGBM] [Info] Number of positive: 13163, number of negative: 331017
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41008
[LightGBM] [Info] Number of data points in the train set: 344180, number of used features: 240
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.038245 -> initscore=-3.224760
[LightGBM] [Info] Start training from score -3.224760
Training until validation scores don't improve for 30 rounds
[1000]	training's auc: 0.950147	valid_1's auc: 0.938952
[2000]	training's auc: 0.953955	valid_1's auc: 0.941892
[3000]	training's auc: 0.960313	valid_1's auc: 0.943971
Early stopping, best iteration is:
[2981]	training's auc: 0.960182	valid_1's auc: 0.94399


## 9.6. Inference

### 9.6.1. Test data ingestion

In [40]:
# Hold-out transactions are read straight from the public bucket.
TEST_URL = 'https://storage.yandexcloud.net/recsys-retail-input/test.csv'
test = pd.read_csv(TEST_URL)

# Distinct users and items present in the test period.
u, i = (test[column].nunique() for column in ('user_id', 'item_id'))
print(f'test.shape = {test.shape}\n')
print(f'The number of users = {u}')
print(f'The number of items = {i}\n')

test.head(2)

test.shape = (88734, 12)

The number of users = 1885
The number of items = 20497



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0


### 9.6.2. Test data preprocessing

#### Users for inference with their preferences

In [41]:
# Candidate lists for the test users, built with the `recommender` from
# section 5.1 and the `data_train` split from section 9.1.
n_items = N_ITEMS
users_inference = get_candidates(
    recommender, data_train, test, n_items=n_items
)
print(f'\nusers_inference.shape = {users_inference.shape}\n')
users_inference.head(2)


users_inference.shape = (1885, 2)



Unnamed: 0,user_id,candidates
0,1340,"[1088959, 987628, 1048962, 1067425, 13115279, ..."
1,588,"[1088959, 987628, 1048962, 1067425, 13115279, ..."


#### Test dataset for inference

In [42]:
# Build the feature rows for the test users' candidates, using the same
# builder and feature tables as for the final training table.
# NOTE(review): `data_valid` is passed in the position where the training cells
# pass the transactions of the target period, not `test`. The `target` column
# is dropped before prediction below, but confirm the builder does not use this
# argument for anything that should come from `test`.
test_dataset_inference = get_targets_lvl_2(
    users_inference,
    data_valid,
    item_features_transformed,
    user_features_transformed,
    user_item_features_final,
    n_items,
)

# The printed label names the variable it reports, as in every other cell.
print(f'test_dataset_inference.shape = {test_dataset_inference.shape}\n')
test_dataset_inference.head(2)

targets_inference.shape = (409871, 241)



Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,user_factor_41,user_factor_42,user_factor_43,user_factor_44,user_factor_45,user_factor_46,user_factor_47,user_factor_48,user_factor_49,user_factor_50
0,1340,1088959,0.0,0.006075,1.0,1.0,0.294181,0.178287,0.049369,,...,,,,,,,,,,
1,1340,987628,0.0,1.0,1.0,0.0,0.051724,0.023904,0.030648,,...,,,,,,,,,,


### 9.6.3. Generating recommendations

In [None]:
# Load the final model by the name used in section 9.5.
model_lgb = load('LightGBM_v1_final')

# Score every inference row: the label column is removed and missing feature
# values are replaced with 0 before prediction.
predictions_final = model_lgb.predict(
    test_dataset_inference.drop(columns='target').fillna(0)
)
print(f'predictions_final.shape = {predictions_final.shape}\n')
predictions_final[:7]

In [44]:
# Combine the test transactions with the scored inference rows into one row
# per user (see the `actual` / `recommendations` columns in the preview).
result_final = get_results(test, test_dataset_inference, predictions_final)

print(f'result_final.shape = {result_final.shape}\n')
result_final.head(2)

result_final.shape = (1885, 4)



Unnamed: 0,user_id,actual,actual_adj,recommendations
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...",[990656],"[10149640, 865456, 1041796, 990656, 5569374]"
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[885023, 947798, 990656, 12302069]","[885023, 835098, 1108094, 947798, 971949]"


## 9.7. Metrics

In [45]:
# Prepare the per-user test results for metric computation. The recorded
# output shows fewer rows than `result_final` and an added `len_actual_adj`
# column.
adjusted_results = adjust_results_for_metrics(result_final)

print(f'adjusted_results.shape = {adjusted_results.shape}\n')
adjusted_results.head(2)

adjusted_results.shape = (183, 5)



Unnamed: 0,user_id,actual,actual_adj,recommendations,len_actual_adj
9,14,"[833715, 835098, 835347, 846823, 853643, 85531...","[835098, 869577, 878445, 898068, 1082310, 1091...","[1130858, 1082310, 1044805, 961269, 878445]",6
13,19,"[849843, 883404, 944486, 1021133, 1033615, 113...","[999104, 1063577, 1101173, 837751, 1118012, 82...","[999104, 918335, 1131310, 944466, 883932]",8


In [46]:
def _precision_at_5_final(row):
    """Score one user's `recommendations` against the `actual` column with k=5."""
    return precision_at_k(row['recommendations'], row['actual'], 5)


# Average the per-user scores over all rows kept by the adjustment step.
# NOTE(review): rows are scored against `actual`, not `actual_adj` - confirm
# this is intended.
metrics = adjusted_results.apply(_precision_at_5_final, axis=1).mean()

print(f'precision@5 = {metrics}')

precision@5 = 0.28524590163934427


## 9.8. Saving final results


In [48]:
# Destination of the final recommendations. The directory prefix ends with a
# slash and the file name with `.csv.zip`, matching the other output paths in
# this notebook, so the file lands inside `data/06_model_output/`.
PATH = 'data/06_model_output/'
FINAL_RESULTS_PATH = PATH + 'recommendations_v1.csv.zip'
final_results_path = FINAL_RESULTS_PATH

# Create the output directory if it is missing; `to_csv` does not create it.
os.makedirs(PATH, exist_ok=True)

# Keep only the user id and the recommended items, then persist them as a
# zip-compressed CSV without the index.
df = result_final[['user_id', 'recommendations']].copy()
df.to_csv(final_results_path, index=False, compression='zip')
df.head(2)

Unnamed: 0,user_id,recommendations
0,1,"[10149640, 865456, 1041796, 990656, 5569374]"
1,2,"[885023, 835098, 1108094, 947798, 971949]"
