<a href="https://colab.research.google.com/github/Yanina-Kutovaya/RecSys-retail/blob/main/notebooks/Baseline_model_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/Yanina-Kutovaya/RecSys-retail.git
!pip install -r RecSys-retail/requirements_Colab.txt

Cloning into 'RecSys-retail'...
remote: Enumerating objects: 938, done.[K
remote: Counting objects: 100% (86/86), done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 938 (delta 38), reused 64 (delta 25), pack-reused 852[K
Receiving objects: 100% (938/938), 259.35 KiB | 11.28 MiB/s, done.
Resolving deltas: 100% (463/463), done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit==0.6.1
  Downloading implicit-0.6.1-cp37-cp37m-manylinux2014_x86_64.whl (18.6 MB)
[K     |████████████████████████████████| 18.6 MB 1.2 MB/s 
[?25hCollecting category-encoders==2.5.1.post0
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 442 kB/s 
Installing collected packages: implicit, category-encoders
Successfully installed category-encoders-2.5.1.post0 implicit-0.6.1


In [2]:
# Make the cloned repository the working directory so that the `src.` /
# `scripts.` imports and the relative `data/...` paths used below resolve.
# NOTE: not idempotent - running it a second time looks for a nested
# RecSys-retail/ directory inside the repository.
%cd RecSys-retail

/content/RecSys-retail


In [3]:
import logging
import os

import joblib
import pandas as pd

from src.recsys_retail.data.make_dataset import load_data
from src.recsys_retail.features.data_time_split import time_split, time_split_2
from src.recsys_retail.features.prefilter import prefilter_items
from src.recsys_retail.features.user_features import fit_transform_user_features
from src.recsys_retail.features.item_features import fit_transform_item_features
from src.recsys_retail.features.recommenders import MainRecommender
from src.recsys_retail.features.candidates_lvl_2 import get_candidates
from src.recsys_retail.features.new_item_user_features import get_user_item_features
from src.recsys_retail.features.targets import get_targets_lvl_2
from src.recsys_retail.data.validation import train_test_split
from src.recsys_retail.models.serialize import store, load
from scripts.train_save_model import train_store
from src.recsys_retail.metrics import (
   get_results, adjust_results_for_metrics, precision_at_k
)

In [4]:
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation / convergence warnings from the
# libraries used below; consider narrowing the filter while debugging.
import warnings
warnings.filterwarnings("ignore")

# 1. Data ingestion

In [5]:
# load_data() returns three frames, unpacked here as: purchase transactions,
# item attributes and user (household) attributes - see the previews below.
data, item_features, user_features = load_data()

In [6]:
# Sanity check: print the transactions frame's dimensions, then display its
# first two rows.
print(f'data.shape = {data.shape}\n')
data.head(2)

data.shape = (2396804, 12)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [7]:
# Sanity check: dimensions and first two rows of the raw item attributes.
print(f'item_features.shape = {item_features.shape}\n')
item_features.head(2)

item_features.shape = (92353, 7)



Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [8]:
# Sanity check: dimensions and first two rows of the raw user attributes.
print(f'user_features.shape = {user_features.shape}\n')
user_features.head(2)

user_features.shape = (801, 8)



Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


# 2. Prefilter transactions data

In [9]:
logging.info('Preprocessing level 1 train dataset...')

# Snapshot of the data before filtering: distinct items, distinct users and
# number of transaction rows.
i0, u0, t0 = data['item_id'].nunique(), data['user_id'].nunique(), len(data)

# NOTE: `data` is rebound to the filtered frame, so this cell is not
# idempotent - re-running it filters the already-filtered transactions and
# the "before" numbers printed below no longer describe the raw data.
data = prefilter_items(data, item_features)

# The same three counts after filtering.
i1, u1, t1 = data['item_id'].nunique(), data['user_id'].nunique(), len(data)

print(f'The number of items decreased from {i0} to {i1}')
print(f'The number of users decreased from {u0} to {u1}')
print(f'The number of transactions decreased from {t0} to {t1}')

The number of items decreased from 89051 to 2501
The number of users decreased from 2499 to 2472
The number of transactions decreased from 2396804 to 414502


In [10]:
# Inspect the filtered transactions; in the recorded run the frame has one
# more column than before filtering (`price`).
print(f'data.shape = {data.shape}\n')
data.head(2)

data.shape = (414502, 13)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0,2.99
14,1364,26984896261,1,937406,1,2.5,31742,-0.99,1520,1,0.0,0.0,2.5


# 3. Train-validation-test time split for two-stage recommender system

Train - validation - test schema:

-- old purchases -- | -- 6 weeks -- | -- 3 weeks --

In [11]:
logging.info('Splitting dataset for level 1, level 2 preprocessing...')

# Three-way split of the filtered transactions: level-1 training data,
# level-2 training data and level-2 validation data.
# presumably the boundaries follow the schema described above
# (old purchases | 6 weeks | 3 weeks) - confirm in time_split.
data_train_lvl_1, data_train_lvl_2, data_val_lvl_2 = time_split(data)

In [12]:
# Row counts of the three splits, in order: level-1 train, level-2 train,
# level-2 validation.
data_train_lvl_1.shape[0], data_train_lvl_2.shape[0], data_val_lvl_2.shape[0]

(363397, 30113, 20992)

# 4. Level 1 

In [13]:
# NOTE(review): the cells in this notebook call `logging.info(...)` on the
# root logger, so `logger` appears to be unused here. No log lines show up in
# the recorded outputs either - INFO is presumably below the active log level.
logger = logging.getLogger(__name__)

# NOTE(review): `preprocess_data` is not defined anywhere in the visible
# notebook; this looks like a leftover from a script version - confirm.
__all__ = ['preprocess_data']

# Destination of the preprocessed level-1 training transactions.
PATH_1 = 'data/02_intermediate/'
DATA_TRAIN_LVL_1_PATH = PATH_1 + 'data_train_lvl_1.csv.zip'

# Value passed as `n_items` when generating level-2 candidates and targets.
N_ITEMS = 100
# Destination of the level-2 training dataset.
PATH_2 = 'data/05_model_input/'
TRAIN_DATASET_LVL_2_PATH = PATH_2 + 'train_dataset_lvl_2.csv.zip'

## 4.1. Preprocess user features and merge with transactions data 

In [14]:
# Turn the raw user attributes into numeric features; in the recorded output
# the descriptive columns become numeric codes and `marital_status_code` /
# `hh_comp_desc` are expanded into indicator columns.
user_features_transformed = fit_transform_user_features(user_features)

print(f'user_features_transformed.shape = {user_features_transformed.shape}\n')
user_features_transformed.head(2)

user_features_transformed.shape = (801, 14)



Unnamed: 0,user_id,age_desc,income_desc,homeowner_desc,household_size_desc,kid_category_desc,marital_status_code A,marital_status_code B,marital_status_code U,hh_comp_desc Single Male,hh_comp_desc Single Female,hh_comp_desc 2 Adults No Kids,hh_comp_desc 1 Adult Kids,hh_comp_desc 2 Adults Kids
0,1,5.0,3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,7,3.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [15]:
# Attach the transformed user features to every level-1 transaction.
# A left join keeps all transactions; users that have no row in
# `user_features_transformed` get NaN in the feature columns (see the first
# row of the preview).
# NOTE: the cell rebinds `data_train_lvl_1`, so it must be run exactly once.
data_train_lvl_1 = data_train_lvl_1.merge(
    user_features_transformed, how='left', on='user_id'
)
print(f'data_train_lvl_1.shape = {data_train_lvl_1.shape}\n')
data_train_lvl_1.head(2)

data_train_lvl_1.shape = (363397, 26)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,household_size_desc,kid_category_desc,marital_status_code A,marital_status_code B,marital_status_code U,hh_comp_desc Single Male,hh_comp_desc Single Female,hh_comp_desc 2 Adults No Kids,hh_comp_desc 1 Adult Kids,hh_comp_desc 2 Adults Kids
0,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,...,,,,,,,,,,
1,1364,26984896261,1,937406,1,2.5,31742,-0.99,1520,1,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


## 4.2. Preprocess item features and merge with transactions data

In [16]:
# Turn the raw item attributes into numeric features; the column set is
# unchanged and, in the recorded output, every attribute becomes a float.
item_features_transformed = fit_transform_item_features(item_features)

print(f'item_features_transformed.shape = {item_features_transformed.shape}\n')
item_features_transformed.head(2)

item_features_transformed.shape = (92353, 7)



Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,0.111243,1.0,1.0,0.010057,0.027888,0.000359
1,26081,0.111243,0.012532,1.0,0.175647,0.426295,1.0


In [17]:
# Attach the transformed item features to every level-1 transaction
# (left join on `item_id`, so no transaction row is dropped).
# NOTE: the cell rebinds `data_train_lvl_1`, so it must be run exactly once.
data_train_lvl_1 = data_train_lvl_1.merge(
    item_features_transformed, how='left', on='item_id'
)
print(f'data_train_lvl_1.shape = {data_train_lvl_1.shape}\n')
data_train_lvl_1.head(2)

data_train_lvl_1.shape = (363397, 32)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,hh_comp_desc Single Female,hh_comp_desc 2 Adults No Kids,hh_comp_desc 1 Adult Kids,hh_comp_desc 2 Adults Kids,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,...,,,,,0.0157,1.0,1.0,0.341236,0.130478,0.025158
1,1364,26984896261,1,937406,1,2.5,31742,-0.99,1520,1,...,1.0,0.0,0.0,0.0,0.004418,0.062173,1.0,0.09375,0.047809,0.00183


In [18]:
logging.info('Saving preprocessed level 1 train dataset...')

# Persist the enriched level-1 transactions as a zip-compressed CSV without
# the index column.
train_data_lvl_1_path = DATA_TRAIN_LVL_1_PATH
data_train_lvl_1.to_csv(train_data_lvl_1_path, index=False, compression='zip')

# 5. Level 2

## 5.1. Build a recommender

In [19]:
# NOTE(review): the log message mentions selecting users, but this cell only
# constructs the level-1 recommender.
logging.info('Selecting users for level 2 dataset...')

# Build the level-1 recommender from the enriched level-1 transactions.
# The *_ALS keyword arguments are presumably forwarded to an ALS model
# (n_factors_ALS=50 matches the 50 `user_factor_*` columns seen later) -
# confirm in MainRecommender.
recommender = MainRecommender(
    data_train_lvl_1, 
    n_factors_ALS=50, 
    regularization_ALS=0.001,
    iterations_ALS=15,
    num_threads_ALS=4
)


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/2501 [00:00<?, ?it/s]

## 5.2. Generate users for level 2 with their preferences

In [20]:
# Number of candidate items requested per user.
n_items = N_ITEMS
# Produce one row per level-2 user with a list-valued `candidates` column
# (see the preview below).
users_lvl_2 = get_candidates(
    recommender, data_train_lvl_1, data_train_lvl_2, data_val_lvl_2, n_items
)
print(f'\nusers_lvl_2.shape = {users_lvl_2.shape}\n')
users_lvl_2.head(2)


users_lvl_2.shape = (2101, 2)



Unnamed: 0,user_id,candidates
0,2021,"[1088959, 987628, 1048962, 1067425, 13115279, ..."
1,1753,"[1088959, 987628, 1048962, 1067425, 13115279, ..."


## 5.3. Generate new features and add user and item embeddings to them

In [21]:
logging.info('Generating new user-item features...')
# Derive aggregate user/item features and attach the recommender's factor
# columns (`user_factor_*` in the preview).
# NOTE(review): in the recorded run the result has as many rows as
# `data_train_lvl_1` and the two previewed rows are identical, so the frame
# appears to hold one row per transaction rather than per (user, item) pair.
user_item_features = get_user_item_features(recommender, data_train_lvl_1)

print(f'user_item_features.shape = {user_item_features.shape}\n')
user_item_features.head(2)

user_item_features.shape = (363397, 112)



Unnamed: 0,user_id,item_id,median_sales_hour,median_weekday,mean_visits_interval,mean_check,n_stores,n_items,n_transactions,mean_n_items_basket,...,user_factor_41,user_factor_42,user_factor_43,user_factor_44,user_factor_45,user_factor_46,user_factor_47,user_factor_48,user_factor_49,user_factor_50
0,2375,1085983,15.0,4.0,14.548387,12.332105,70,87,124,3.157895,...,-3.591278,-0.742257,-6.291358,-1.436868,-0.473858,-6.695119,2.686646,-4.328279,1.544849,1.040506
1,2375,1085983,15.0,4.0,14.548387,12.332105,70,87,124,3.157895,...,-3.591278,-0.742257,-6.291358,-1.436868,-0.473858,-6.695119,2.686646,-4.328279,1.544849,1.040506


## 5.4. Generate train dataset for level 2 model

In [22]:
logging.info('Generating train dataset for level 2 model...')

# Assemble the level-2 training table: candidates from `users_lvl_2`, a
# `target` column derived with `data_train_lvl_2`, plus item, user and
# user-item features.
# In the recorded run this yields 2101 users x 100 candidates = 210100 rows.
train_dataset_lvl_2 = get_targets_lvl_2(
    users_lvl_2, 
    data_train_lvl_2,
    item_features_transformed, 
    user_features_transformed,     
    user_item_features, 
    n_items
)
print(f'train_dataset_lvl_2.shape = {train_dataset_lvl_2.shape}')
train_dataset_lvl_2.head(2)

train_dataset_lvl_2.shape = (210100, 132)


Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,user_factor_41,user_factor_42,user_factor_43,user_factor_44,user_factor_45,user_factor_46,user_factor_47,user_factor_48,user_factor_49,user_factor_50
0,2021,1088959,0.0,0.006075,1.0,1.0,0.294181,0.178287,0.049369,,...,,,,,,,,,,
1,2021,987628,0.0,1.0,1.0,0.0,0.051724,0.023904,0.030648,,...,,,,,,,,,,


In [23]:
# Sanity check: number of candidate rows per user (expected to equal n_items).
train_dataset_lvl_2['user_id'].value_counts()

2021    100
405     100
1163    100
1780    100
2301    100
       ... 
1158    100
1767    100
740     100
2062    100
2042    100
Name: user_id, Length: 2101, dtype: int64

In [24]:
logging.info('Saving train dataset level 2...')

# Persist the level-2 training table as a zip-compressed CSV without the
# index column.
train_dataset_lvl_2_path = TRAIN_DATASET_LVL_2_PATH
train_dataset_lvl_2.to_csv(
    train_dataset_lvl_2_path, index=False, compression='zip'
)

# 6. Training the model

In [25]:
# Train the level-2 model on the assembled table; presumably it is persisted
# under the name 'LightGBM_v1', since the next section loads it by that name.
train_store(train_dataset_lvl_2, 'LightGBM_v1')

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[27]	training's auc: 0.760785	valid_1's auc: 0.758542


# 7. Inference

In [26]:
# Load the model that was trained in the previous section.
model_lgb = load('LightGBM_v1')

In [27]:
# Score every (user, candidate) row of the level-2 table: the label column is
# removed and missing feature values are replaced by 0 before prediction.
# NOTE(review): these are predictions on the same rows the model was trained
# on, and the first seven scores in the recorded output are identical.
predictions_lgb_train = model_lgb.predict(
    train_dataset_lvl_2.drop(columns='target').fillna(0)
)
print(f'predictions_lgb_train.shape = {predictions_lgb_train.shape}\n')
predictions_lgb_train[:7]

predictions_lgb_train.shape = (210100,)



array([0.00967873, 0.00967873, 0.00967873, 0.00967873, 0.00967873,
       0.00967873, 0.00967873])

In [28]:
# Combine the validation purchases with the scored candidates into one row
# per user holding `actual`, `actual_adj` and `recommendations` lists (the
# previewed recommendation lists contain 5 items each).
result_lvl_2 = get_results(data_val_lvl_2, train_dataset_lvl_2, predictions_lgb_train)

print(f'result_lvl_2.shape = {result_lvl_2.shape}\n')
result_lvl_2.head(2)

result_lvl_2.shape = (1792, 4)



Unnamed: 0,user_id,actual,actual_adj,recommendations
0,1,"[856942, 865456, 951954, 971585, 990656, 11311...","[865456, 990656, 1041796, 1115576]","[872137, 865456, 1041796, 1101173, 5569374]"
1,3,"[835476, 999999]",,"[1101173, 907631, 883932, 972931, 13115279]"


# 8. Metrics

In [29]:
# Reduce the results to the users used for metric computation; the output
# gains a `len_actual_adj` column and, in the recorded run, shrinks from 1792
# to 120 rows.
# presumably only users with enough items in `actual_adj` are kept - confirm
# in adjust_results_for_metrics.
adjusted_results_lvl_2 = adjust_results_for_metrics(result_lvl_2)

print(f'adjusted_results_lvl_2.shape = {adjusted_results_lvl_2.shape}\n')
adjusted_results_lvl_2.head(2)

adjusted_results_lvl_2.shape = (120, 5)



Unnamed: 0,user_id,actual,actual_adj,recommendations,len_actual_adj
12,19,"[944466, 999104, 1031083, 1048483, 6533236, 94...","[944466, 999104, 940700, 883932, 918335]","[999104, 835098, 865456, 1041796, 1101173]",5
29,40,"[821565, 907308, 995852, 1104343, 1117824, 556...","[1104343, 1117824, 880150, 947441, 1048962, 92...","[880150, 944836, 926065, 821083, 947441]",6


In [30]:
def _row_precision_lvl_2(row):
    """Score one user's recommendations against that user's `actual` items at k=5."""
    return precision_at_k(row['recommendations'], row['actual'], 5)


# Average of the per-user scores over the adjusted validation results.
# NOTE(review): the score is computed against `actual`, not `actual_adj`,
# although the rows were selected via `actual_adj` - confirm this is intended.
metrics = adjusted_results_lvl_2.apply(_row_precision_lvl_2, axis=1).mean()

print(f'precision@5 = {metrics}')

precision@5 = 0.2700000000000001


# 9. Training a model on a full dataset

## 9.1. Train-validation time split for two-stage recommender system

Train - validation schema:

-- old purchases -- | -- 6 weeks --

In [31]:
# Two-way split of the filtered transactions for the final model: older
# purchases for training and the most recent period for validation (see the
# schema above).
data_train, data_valid = time_split_2(data)

## 9.2 Generate users for level 2 with their preferences

In [32]:
# Candidate items for the users of the final training set.
# NOTE(review): `recommender` is the object fitted on `data_train_lvl_1`
# earlier; it is not refitted on `data_train` here - confirm this is intended
# for the "full dataset" model.
users_final = get_candidates(
    recommender, data_train, data_valid, n_items=n_items)
print(f'users_final.shape = {users_final.shape}\n')
users_final.head(2)

users_final.shape = (1979, 2)



Unnamed: 0,user_id,candidates
0,84,"[1088959, 987628, 1048962, 1067425, 13115279, ..."
1,2200,"[1088959, 987628, 1048962, 1067425, 13115279, ..."


## 9.3. Generate new features and add user and item embeddings to them

In [33]:
logging.info('Generating new user-item features...')

# Same feature generation as for level 2, now computed from `data_train`
# (still using the previously fitted `recommender`).
user_item_features_final = get_user_item_features(
    recommender, data_train)

print(f'user_item_features_final.shape = {user_item_features_final.shape}\n')
user_item_features_final.head(2)

user_item_features_final.shape = (378567, 112)



Unnamed: 0,user_id,item_id,median_sales_hour,median_weekday,mean_visits_interval,mean_check,n_stores,n_items,n_transactions,mean_n_items_basket,...,user_factor_41,user_factor_42,user_factor_43,user_factor_44,user_factor_45,user_factor_46,user_factor_47,user_factor_48,user_factor_49,user_factor_50
0,2375,1085983,15.0,4.0,16.529412,12.354878,71,90,131,3.097561,...,-3.591278,-0.742257,-6.291358,-1.436868,-0.473858,-6.695119,2.686646,-4.328279,1.544849,1.040506
1,2375,1085983,15.0,4.0,16.529412,12.354878,71,90,131,3.097561,...,-3.591278,-0.742257,-6.291358,-1.436868,-0.473858,-6.695119,2.686646,-4.328279,1.544849,1.040506


## 9.4. Generate train dataset for level 2 model

In [34]:
logging.info('Generating train dataset for level 2 model...')

# Assemble the final training table: candidates from `users_final`, a
# `target` column derived with `data_valid`, plus item, user and user-item
# features.
# In the recorded run this yields 1979 users x 100 candidates = 197900 rows.
train_dataset_final = get_targets_lvl_2(
    users_final, 
    data_valid,
    item_features_transformed, 
    user_features_transformed,      
    user_item_features_final, 
    n_items
)
print(f'train_dataset_final.shape = {train_dataset_final.shape}')
train_dataset_final.head(2)

train_dataset_final.shape = (197900, 132)


Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,user_factor_41,user_factor_42,user_factor_43,user_factor_44,user_factor_45,user_factor_46,user_factor_47,user_factor_48,user_factor_49,user_factor_50
0,84,1088959,0.0,0.006075,1.0,1.0,0.294181,0.178287,0.049369,,...,,,,,,,,,,
1,84,987628,0.0,1.0,1.0,0.0,0.051724,0.023904,0.030648,,...,,,,,,,,,,


In [35]:
# Sanity check: number of candidate rows per user (expected to equal n_items).
train_dataset_final['user_id'].value_counts()

84      100
2005    100
1686    100
1564    100
491     100
       ... 
92      100
1775    100
1699    100
2130    100
1635    100
Name: user_id, Length: 1979, dtype: int64

In [36]:
logging.info('Saving train dataset final...')

# NOTE(review): `PATH` holds the same directory as the earlier `PATH_2`
# constant, and the name `PATH` is rebound again when the final results are
# saved.
PATH = 'data/05_model_input/'
TRAIN_DATASET_FINAL_PATH = PATH + 'train_dataset_final.csv.zip'

# Persist the final training table as a zip-compressed CSV without the index
# column.
train_dataset_final_path = TRAIN_DATASET_FINAL_PATH
train_dataset_final.to_csv(
    train_dataset_final_path, index=False, compression='zip'
)

## 9.5. Training the model 

In [37]:
# Train the final model; presumably it is persisted under the name
# 'LightGBM_v1_final', since the inference section loads it by that name.
train_store(train_dataset_final, 'LightGBM_v1_final')

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[373]	training's auc: 0.874908	valid_1's auc: 0.850057


## 9.6. Inference

### 9.6.1 Test data ingestion

In [38]:
# Hold-out transactions are read straight from object storage, so this cell
# needs network access.
TEST_URL = 'https://storage.yandexcloud.net/recsys-retail-input/test.csv'
test = pd.read_csv(TEST_URL)

print(f'test.shape = {test.shape}\n')

# Distinct users and items present in the hold-out data.
u, i = test['user_id'].nunique(), test['item_id'].nunique()
print(f'The number of users = {u}')
print(f'The number of items = {i}\n')

test.head(2)

test.shape = (88734, 12)

The number of users = 1885
The number of items = 20497



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0


### 9.6.2. Test data preprocessing

#### Users for inference with their preferences

In [39]:
# Number of candidate items requested per user.
n_items = N_ITEMS
# Candidate items for the users found in the hold-out data (one row per test
# user in the recorded run).
users_inference = get_candidates(
    recommender, data_train, test, n_items=n_items
)
print(f'\nusers_inference.shape = {users_inference.shape}\n')
users_inference.head(2)


users_inference.shape = (1885, 2)



Unnamed: 0,user_id,candidates
0,1340,"[1088959, 987628, 1048962, 1067425, 13115279, ..."
1,588,"[1088959, 987628, 1048962, 1067425, 13115279, ..."


#### Test dataset for inference

In [40]:
# Assemble the inference table for the test users with the same helper that
# builds the training tables.
# NOTE(review): the `target` column is derived with `data_valid`, not `test`.
# It is dropped before prediction below, but confirm get_results does not
# rely on it.
test_dataset_inference = get_targets_lvl_2(
    users_inference, 
    data_valid,
    item_features_transformed, 
    user_features_transformed,    
    user_item_features_final,     
    n_items
    )

# NOTE(review): the printed label says `targets_inference`, while the
# variable is named `test_dataset_inference`.
print(f'targets_inference.shape = {test_dataset_inference.shape}\n')
test_dataset_inference.head(2)

targets_inference.shape = (188500, 132)



Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,user_factor_41,user_factor_42,user_factor_43,user_factor_44,user_factor_45,user_factor_46,user_factor_47,user_factor_48,user_factor_49,user_factor_50
0,1340,1088959,0.0,0.006075,1.0,1.0,0.294181,0.178287,0.049369,,...,,,,,,,,,,
1,1340,987628,0.0,1.0,1.0,0.0,0.051724,0.023904,0.030648,,...,,,,,,,,,,


### 9.6.3 Generating recommendations

In [41]:
# Switch to the model trained on the final dataset (rebinds `model_lgb`).
model_lgb = load('LightGBM_v1_final')

# Score every (user, candidate) row of the inference table: the label column
# is removed and missing feature values are replaced by 0 before prediction.
predictions_final = model_lgb.predict(
    test_dataset_inference.drop(columns='target').fillna(0)
)
print(f'predictions_final.shape = {predictions_final.shape}\n')
predictions_final[:7]

predictions_final.shape = (188500,)



array([0.00457608, 0.00635565, 0.003995  , 0.00397029, 0.00608295,
       0.00397164, 0.0059075 ])

In [42]:
# Combine the hold-out purchases with the scored candidates into one row per
# test user holding `actual`, `actual_adj` and `recommendations` lists.
result_final = get_results(test, test_dataset_inference, predictions_final)

print(f'result_final.shape = {result_final.shape}\n')
result_final.head(2)

result_final.shape = (1885, 4)



Unnamed: 0,user_id,actual,actual_adj,recommendations
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...",[990656],"[865456, 872137, 5569374, 1041796, 1101173]"
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[947798, 990656, 12302069]","[835098, 883932, 947798, 12302069, 972931]"


## 9.7. Metrics

In [43]:
# Reduce the results to the users used for metric computation; the output
# gains a `len_actual_adj` column and, in the recorded run, shrinks from 1885
# to 66 rows.
adjusted_results = adjust_results_for_metrics(result_final)

print(f'adjusted_results.shape = {adjusted_results.shape}\n')
adjusted_results.head(2)

adjusted_results.shape = (66, 5)



Unnamed: 0,user_id,actual,actual_adj,recommendations,len_actual_adj
13,19,"[849843, 883404, 944486, 1021133, 1033615, 113...","[999104, 1063577, 1101173, 823704, 1044805, 89...","[999104, 883932, 936594, 1096343, 918335]",6
53,67,"[1025650, 5978648, 16769918, 823758, 832976, 8...","[1115576, 869577, 971949, 845193, 903350, 835098]","[999104, 1041796, 1096343, 823704, 883932]",6


In [44]:
def _row_precision_final(row):
    """Score one user's recommendations against that user's `actual` items at k=5."""
    return precision_at_k(row['recommendations'], row['actual'], 5)


# Average of the per-user scores over the adjusted hold-out results.
# NOTE(review): the score is computed against `actual`, not `actual_adj`,
# although the rows were selected via `actual_adj` - confirm this is intended.
metrics = adjusted_results.apply(_row_precision_final, axis=1).mean()

print(f'precision@5 = {metrics}')

precision@5 = 0.23030303030303026


## 9.8. Saving final results


In [45]:
# Destination of the final per-user recommendations.
# The directory constant must end with '/' because the file name is appended
# by plain string concatenation; without it the file was written to
# 'data/06_model_outputrecommendations_v1_csv.zip' instead of into the output
# directory. The file name also uses the '.csv.zip' suffix, like the other
# artefacts saved by this notebook.
PATH = 'data/06_model_output/'
FINAL_RESULTS_PATH = PATH + 'recommendations_v1.csv.zip'
final_results_path = FINAL_RESULTS_PATH

# The output directory is not guaranteed to exist in a fresh clone.
os.makedirs(PATH, exist_ok=True)

# Keep only the user id and the recommended items, then store them as a
# zip-compressed CSV without the index column.
df = result_final[['user_id', 'recommendations']].copy()
df.to_csv(final_results_path, index=False, compression='zip')
df.head(2)

Unnamed: 0,user_id,recommendations
0,1,"[865456, 872137, 5569374, 1041796, 1101173]"
1,2,"[835098, 883932, 947798, 12302069, 972931]"
