<a href="https://colab.research.google.com/github/Yanina-Kutovaya/RecSys-retail/blob/main/notebooks/Baseline_model_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab bootstrap: clone the project repository, switch into it, and install
# the dependencies listed in requirements_Colab.txt.
# NOTE(review): presumably the `src.` / `scripts.` imports in the next cell rely
# on the repository root being the working directory — confirm.
# NOTE(review): not safe to re-run within one session: after `%cd` a second run
# would clone the repository again inside the already-cloned one.
!git clone https://github.com/Yanina-Kutovaya/RecSys-retail.git
%cd RecSys-retail
!pip install -r requirements_Colab.txt

Cloning into 'RecSys-retail'...
remote: Enumerating objects: 609, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 609 (delta 12), reused 24 (delta 8), pack-reused 574[K
Receiving objects: 100% (609/609), 192.35 KiB | 1.07 MiB/s, done.
Resolving deltas: 100% (279/279), done.
/content/RecSys-retail
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit==0.6.1
  Downloading implicit-0.6.1-cp37-cp37m-manylinux2014_x86_64.whl (18.6 MB)
[K     |████████████████████████████████| 18.6 MB 1.3 MB/s 
[?25hCollecting category-encoders==2.5.1.post0
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 586 kB/s 
Installing collected packages: implicit, category-encoders
Successfully installed category-encoders-2.5.1.post0 implicit-0.6.1


In [2]:
import logging
import pandas as pd

from src.recsys_retail.data.make_dataset import load_data
from src.recsys_retail.features.data_time_split import time_split
from src.recsys_retail.features.prefilter import prefilter_items
from src.recsys_retail.features.user_features import fit_transform_user_features
from src.recsys_retail.features.item_features import fit_transform_item_features
from src.recsys_retail.features.candidates_lvl_2 import get_candidates
from src.recsys_retail.features.new_item_user_features import get_user_item_features
from src.recsys_retail.features.targets import get_targets_lvl_2

from src.recsys_retail.data.validation import train_test_split
from src.recsys_retail.models.serialize import store, load
from scripts.train_save_model import train_store
from src.recsys_retail.metrics import precision_at_k, get_results

  f"CUDA extension is built, but disabling GPU support because of '{e}'",


In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Notebook-wide configuration: logging, dataset locations and candidate count.

# The root logger defaults to WARNING, so the `logging.info(...)` progress
# messages emitted throughout this notebook were silently dropped.
# `basicConfig` installs a handler when the root logger has none (it is a
# no-op otherwise); the explicit `setLevel` covers environments that already
# pre-installed a handler.
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)

# `__all__ = ['preprocess_data']` was dropped: nothing named `preprocess_data`
# is defined in this notebook, so the export list could only break `import *`.

# Where the preprocessed level 1 train dataset is written.
PATH_1 = 'data/02_intermediate/'
DATA_TRAIN_LVL_1_PATH = PATH_1 + 'data_train_lvl_1.csv.zip'

# Passed as `n_items` to candidate generation and to the level 2 target builder.
N_ITEMS = 100
# Where the level 2 train dataset is written.
PATH_2 = 'data/05_model_input/'
TRAIN_DATASET_LVL_2_PATH = PATH_2 + 'train_dataset_lvl_2.csv.zip'

# 1. Data ingestion

In [5]:
# Load the three raw inputs used by the rest of the notebook: the transactions
# table plus the item and user feature tables.
data, item_features, user_features = load_data()

In [6]:
# Size of the transactions table and a two-row preview of its columns.
print(f'data.shape = {data.shape}\n')
data.head(2)

data.shape = (2396804, 12)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [7]:
# Size of the raw item feature table and a two-row preview.
print(f'item_features.shape = {item_features.shape}\n')
item_features.head(2)

item_features.shape = (92353, 7)



Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [8]:
# Size of the raw user feature table and a two-row preview.
# NOTE(review): it has far fewer rows than there are distinct users in the
# transactions, so most users will have no feature record after a join.
print(f'user_features.shape = {user_features.shape}\n')
user_features.head(2)

user_features.shape = (801, 8)



Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


# 2. Train-validation-test time split for two-stage recommender system

Train - validation - test schema:

-- old purchases -- | -- 6 weeks -- | -- 3 weeks --

In [9]:
logging.info('Splitting dataset for level 1, level 2 preprocessing...')

# Split the raw transactions into three parts, in this order:
# level 1 train, level 2 train, level 2 validation.
data_train_lvl_1, data_train_lvl_2, data_val_lvl_2 = time_split(data)

# The three parts must partition the transactions: no row lost or duplicated.
_n_split_rows = len(data_train_lvl_1) + len(data_train_lvl_2) + len(data_val_lvl_2)
assert _n_split_rows == len(data), (
    f'time_split returned {_n_split_rows} rows in total, expected {len(data)}'
)

In [10]:
# Row counts of the three splits: level 1 train, level 2 train, level 2 validation.
data_train_lvl_1.shape[0], data_train_lvl_2.shape[0], data_val_lvl_2.shape[0]

(2108779, 169711, 118314)

# 3. Data preprocessing

## Level 1

### 1. Prefilter level 1 transactions data

In [11]:
def _dataset_counts(transactions):
    """Return (distinct items, distinct users, number of rows) of a transactions frame."""
    return (
        transactions['item_id'].nunique(),
        transactions['user_id'].nunique(),
        transactions.shape[0],
    )


logging.info('Preprocessing level 1 train dataset...')

# Take the counts before and after prefiltering to report the filter's effect.
i0, u0, t0 = _dataset_counts(data_train_lvl_1)
data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features)
i1, u1, t1 = _dataset_counts(data_train_lvl_1)

print(f'The number of items decreased from {i0} to {i1}')
print(f'The number of users decreased from {u0} to {u1}')
print(f'The number of transactions decreased from {t0} to {t1}')

The number of items decreased from 83685 to 2421
The number of users decreased from 2498 to 2459
The number of transactions decreased from 2108779 to 359837


In [12]:
# Size and two-row preview of the prefiltered level 1 transactions
# (the preview shows a `price` column that the raw transactions did not have).
print(f'data_train_lvl_1.shape = {data_train_lvl_1.shape}\n')
data_train_lvl_1.head(2)

data_train_lvl_1.shape = (359837, 13)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0,1.5
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0,2.99


### 2. Preprocess user features and merge with transactions data 

In [13]:
# Convert the raw user features to a numeric table (the preview shows numeric
# codes plus indicator columns for marital status and household composition).
user_features_transformed = fit_transform_user_features(user_features)

print(f'user_features_transformed.shape = {user_features_transformed.shape}\n')
user_features_transformed.head(2)

user_features_transformed.shape = (801, 14)



Unnamed: 0,user_id,age_desc,income_desc,homeowner_desc,household_size_desc,kid_category_desc,marital_status_code A,marital_status_code B,marital_status_code U,hh_comp_desc Single Male,hh_comp_desc Single Female,hh_comp_desc 2 Adults No Kids,hh_comp_desc 1 Adult Kids,hh_comp_desc 2 Adults Kids
0,1,5.0,3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,7,3.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [14]:
# Attach the transformed user features to every level 1 transaction.
# The left join keeps transactions of users that have no feature record;
# their feature columns are NaN.
_n_rows_before = len(data_train_lvl_1)
data_train_lvl_1 = pd.merge(
    data_train_lvl_1, user_features_transformed, on='user_id', how='left'
)
# A left join silently multiplies rows when `user_id` is duplicated on the
# right-hand side, so fail loudly if the number of transactions changed.
assert len(data_train_lvl_1) == _n_rows_before, (
    'user features merge changed the number of transactions: '
    f'{_n_rows_before} -> {len(data_train_lvl_1)}'
)
print(f'data_train_lvl_1.shape = {data_train_lvl_1.shape}\n')
data_train_lvl_1.head(2)

data_train_lvl_1.shape = (359837, 26)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,household_size_desc,kid_category_desc,marital_status_code A,marital_status_code B,marital_status_code U,hh_comp_desc Single Male,hh_comp_desc Single Female,hh_comp_desc 2 Adults No Kids,hh_comp_desc 1 Adult Kids,hh_comp_desc 2 Adults Kids
0,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,...,,,,,,,,,,
1,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,...,,,,,,,,,,


### 3. Preprocess item features and merge with transactions data

In [15]:
# Convert the raw item features to a numeric table. In the preview the original
# descriptive columns hold numeric values and 500 numbered columns (0-499)
# have been added.
item_features_transformed = fit_transform_item_features(item_features)

print(f'item_features_transformed.shape = {item_features_transformed.shape}\n')
item_features_transformed.head(2)

item_features_transformed.shape = (92353, 507)



Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,0,1,2,...,490,491,492,493,494,495,496,497,498,499
0,25671,0.111243,1.0,1.0,0.010057,0.027888,0.000359,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,26081,0.111243,0.012532,1.0,0.175647,0.426295,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Attach the transformed item features to every level 1 transaction
# (left join: every transaction is kept).
_n_rows_before = len(data_train_lvl_1)
data_train_lvl_1 = pd.merge(
    data_train_lvl_1, item_features_transformed, on='item_id', how='left'
)
# A left join silently multiplies rows when `item_id` is duplicated on the
# right-hand side, so fail loudly if the number of transactions changed.
assert len(data_train_lvl_1) == _n_rows_before, (
    'item features merge changed the number of transactions: '
    f'{_n_rows_before} -> {len(data_train_lvl_1)}'
)
print(f'data_train_lvl_1.shape = {data_train_lvl_1.shape}\n')
data_train_lvl_1.head(2)

data_train_lvl_1.shape = (359837, 532)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,490,491,492,493,494,495,496,497,498,499
0,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
logging.info('Saving preprocessed level 1 train dataset...')

# Write the merged level 1 frame as a zip-compressed CSV without the index column.
train_data_lvl_1_path = DATA_TRAIN_LVL_1_PATH
data_train_lvl_1.to_csv(train_data_lvl_1_path, index=False, compression='zip')

## Level 2

### 1. Build the level 1 recommender and generate candidate items for level 2 users

In [18]:
logging.info('Generating level 2 dataset...')  

# `n_items` is reused later when the level 2 train dataset is built.
n_items = N_ITEMS
# Returns one row per level 2 user with a list of candidate items, together
# with the recommender object that is used for feature generation afterwards.
users_lvl_2, recommender = get_candidates(
    data_train_lvl_1, data_train_lvl_2, data_val_lvl_2, n_items
)
print(f'\nusers_lvl_2.shape = {users_lvl_2.shape}\n')
users_lvl_2.head(2)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/2421 [00:00<?, ?it/s]


users_lvl_2.shape = (2280, 2)



Unnamed: 0,user_id,candidates
0,2070,"[818981, 968732, 923723, 992292, 1074516, 1136..."
1,2021,"[818981, 968732, 923723, 992292, 1074516, 1136..."


### 2. Generate new user-item features, including user and item embeddings

In [19]:
logging.info('Generating new user-item features...')
# Derive per (user, item) features from the level 1 transactions and the
# recommender; the preview shows purchase-behaviour aggregates and
# `user_factor_*` columns.
user_item_features = get_user_item_features(recommender, data_train_lvl_1)

print(f'user_item_features.shape = {user_item_features.shape}\n')
user_item_features.head(2)

user_item_features.shape = (359837, 52)



Unnamed: 0,user_id,item_id,median_sales_hour,median_weekday,mean_visits_interval,mean_check,n_stores,n_items,n_transactions,mean_n_items_basket,...,user_factor_11,user_factor_12,user_factor_13,user_factor_14,user_factor_15,user_factor_16,user_factor_17,user_factor_18,user_factor_19,user_factor_20
0,2375,8160430,16.0,1.0,19.233333,12.261351,60,89,118,3.189189,...,0.481279,9.698819,-12.379603,-0.121839,3.340241,11.778405,-2.126867,3.032073,12.240597,2.302167
1,2375,1085983,15.0,4.0,19.233333,12.261351,70,89,118,3.189189,...,0.481279,9.698819,-12.379603,-0.121839,3.340241,11.778405,-2.126867,3.032073,12.240597,2.302167


### 3. Generate train dataset for level 2 model

In [20]:
# Assemble the level 2 train dataset: one row per (user, candidate item) with a
# `target` column and the item, user and user-item features joined in.
# NOTE(review): the preview shows NaN in the `user_factor_*` columns for the
# first rows — presumably pairs absent from `user_item_features`; confirm.
train_dataset_lvl_2 = get_targets_lvl_2(
    users_lvl_2, data_train_lvl_2, item_features_transformed,
    user_features_transformed, user_item_features, n_items
)
print(f'train_dataset_lvl_2.shape = {train_dataset_lvl_2.shape}')
train_dataset_lvl_2.head(2)

train_dataset_lvl_2.shape = (225746, 572)


Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,0,...,user_factor_11,user_factor_12,user_factor_13,user_factor_14,user_factor_15,user_factor_16,user_factor_17,user_factor_18,user_factor_19,user_factor_20
0,2070,818981,0.0,0.050493,1.0,1.0,0.228807,0.181275,0.001895,0.0,...,,,,,,,,,,
1,2070,968732,0.0,0.011203,1.0,1.0,0.143678,0.010956,0.00477,0.0,...,,,,,,,,,,


In [21]:
# Sanity check: number of dataset rows per user (at most N_ITEMS in the output shown).
train_dataset_lvl_2['user_id'].value_counts()

1813    100
1480    100
1984    100
120     100
1675    100
       ... 
1839     99
198      99
2019     99
1273     99
2044     99
Name: user_id, Length: 2280, dtype: int64

In [22]:
logging.info('Saving train dataset level 2...')

# Write the level 2 train dataset as a zip-compressed CSV without the index column.
train_dataset_lvl_2_path = TRAIN_DATASET_LVL_2_PATH
train_dataset_lvl_2.to_csv(
    train_dataset_lvl_2_path, index=False, compression='zip'
)

# 4. Training the model

In [23]:
# Train the level 2 model on the dataset built above.
# NOTE(review): presumably the fitted model is stored under the name
# 'LightGBM_v1'; the inference section loads a model by that same name.
train_store(train_dataset_lvl_2, 'LightGBM_v1')

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[391]	training's auc: 0.893101	valid_1's auc: 0.851923


# 5. Inference

In [24]:
# Load the model by the same name that was passed to `train_store`.
model_lgb = load('LightGBM_v1')

In [25]:
# Score every row of the level 2 dataset: remove the label column and replace
# missing feature values with 0 before handing the frame to the model.
# NOTE(review): these are the same rows the model was trained on; the scores
# are compared with the validation purchases in the next cell.
features_lvl_2 = train_dataset_lvl_2.drop(columns='target').fillna(0)
predictions_lgb_train = model_lgb.predict(features_lvl_2)

print(f'predictions_lgb_train.shape = {predictions_lgb_train.shape}\n')
predictions_lgb_train[:7]

predictions_lgb_train.shape = (225746,)



array([0.00432354, 0.00333443, 0.00541022, 0.00476471, 0.00396564,
       0.00453774, 0.00426045])

In [26]:
# Build a per-user table with an `actual` item list and a `recommendations`
# list (five items per user in the preview) from the validation transactions,
# the level 2 dataset and the model scores.
result_lvl_2 = get_results(data_val_lvl_2, train_dataset_lvl_2, predictions_lgb_train)

print(f'result_lvl_2.shape = {result_lvl_2.shape}\n')
result_lvl_2.head(2)

result_lvl_2.shape = (2042, 3)



Unnamed: 0,user_id,actual,recommendations
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[872137, 940947, 865456, 1041796, 1101173]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1101173, 907631, 972931, 952317, 883932]"


# 6. Metrics

In [27]:
def _precision_at_5(row):
    """Precision@5 of one row's `recommendations` against its `actual` items."""
    return precision_at_k(row['recommendations'], row['actual'], 5)


# Average the per-user scores over all users in `result_lvl_2`.
per_user_precision = result_lvl_2.apply(_precision_at_5, axis=1)
metrics = per_user_precision.mean()

print(f'precision@5 = {metrics}')

precision@5 = 0.06483839373163564
