<a href="https://colab.research.google.com/github/Yanina-Kutovaya/RecSys-retail/blob/main/notebooks/Baseline_model_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/Yanina-Kutovaya/RecSys-retail.git
!pip install -r RecSys-retail/requirements_Colab.txt

Cloning into 'RecSys-retail'...
remote: Enumerating objects: 734, done.[K
remote: Counting objects: 100% (160/160), done.[K
remote: Compressing objects: 100% (86/86), done.[K
remote: Total 734 (delta 84), reused 108 (delta 51), pack-reused 574[K
Receiving objects: 100% (734/734), 221.41 KiB | 4.43 MiB/s, done.
Resolving deltas: 100% (351/351), done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit==0.6.1
  Downloading implicit-0.6.1-cp37-cp37m-manylinux2014_x86_64.whl (18.6 MB)
[K     |████████████████████████████████| 18.6 MB 315 kB/s 
[?25hCollecting category-encoders==2.5.1.post0
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 678 kB/s 
Installing collected packages: implicit, category-encoders
Successfully installed category-encoders-2.5.1.post0 implicit-0.6.1


In [2]:
# Work from the repository root so that the `src.` / `scripts.` imports and the
# relative `data/...` paths used below resolve.
%cd RecSys-retail

/content/RecSys-retail


In [3]:
import logging
import os

import joblib
import pandas as pd

from src.recsys_retail.data.make_dataset import load_data
from src.recsys_retail.features.data_time_split import time_split
from src.recsys_retail.features.prefilter import prefilter_items
from src.recsys_retail.features.user_features import fit_transform_user_features
from src.recsys_retail.features.item_features import fit_transform_item_features
from src.recsys_retail.models.train_recommender import train_save_recommender
from src.recsys_retail.features.candidates_lvl_2 import get_candidates
from src.recsys_retail.features.new_item_user_features import get_user_item_features
from src.recsys_retail.features.targets import get_targets_lvl_2
from src.recsys_retail.data.validation import train_test_split
from src.recsys_retail.models.serialize import store, load
from scripts.train_save_model import train_store
from src.recsys_retail.metrics import precision_at_k, get_results

  f"CUDA extension is built, but disabling GPU support because of '{e}'",


In [4]:
import warnings

# Silence library warnings so that the cell outputs stay readable.
warnings.filterwarnings("ignore")

# The cells below report progress with logging.info(). The root logger defaults
# to WARNING, which silently drops those messages, so INFO is enabled here.
# basicConfig() does nothing when the host (e.g. Colab) has already installed
# handlers on the root logger, hence the explicit setLevel() afterwards.
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

# 1. Data ingestion

In [5]:
# Load the three input tables with the project's load_data helper:
# transactions, item features and user features (previewed in the next cells).
data, item_features, user_features = load_data()

In [6]:
# Size and first two rows of the raw transactions table.
print(f'data.shape = {data.shape}\n')
data.head(2)

data.shape = (2396804, 12)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [7]:
# Size and first two rows of the raw item features table.
print(f'item_features.shape = {item_features.shape}\n')
item_features.head(2)

item_features.shape = (92353, 7)



Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [8]:
# Size and first two rows of the raw user features table.
print(f'user_features.shape = {user_features.shape}\n')
user_features.head(2)

user_features.shape = (801, 8)



Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


# 2. Prefilter transactions data

In [9]:
logging.info('Preprocessing level 1 train dataset...')

# Distinct items, distinct users and row count before filtering.
i0, u0, t0 = data['item_id'].nunique(), data['user_id'].nunique(), len(data)

# NOTE: `data` is rebound to the filtered frame, so re-running this cell
# filters the already-filtered data rather than the raw transactions.
data = prefilter_items(data, item_features)

# The same three counts after filtering.
i1, u1, t1 = data['item_id'].nunique(), data['user_id'].nunique(), len(data)

print(f'The number of items decreased from {i0} to {i1}')
print(f'The number of users decreased from {u0} to {u1}')
print(f'The number of transactions decreased from {t0} to {t1}')

The number of items decreased from 89051 to 2501
The number of users decreased from 2499 to 2472
The number of transactions decreased from 2396804 to 414502


In [10]:
# Size and first two rows of the transactions table after prefiltering.
print(f'data.shape = {data.shape}\n')
data.head(2)

data.shape = (414502, 13)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0,2.99
14,1364,26984896261,1,937406,1,2.5,31742,-0.99,1520,1,0.0,0.0,2.5


# 3. Train-validation-test time split for two-stage recommender system

Train - validation - test schema:

-- old purchases -- | -- 6 weeks-- | -- 3 weeks--

In [11]:
logging.info('Splitting dataset for level 1, level 2 preprocessing...')

# Split the prefiltered transactions into three frames with the project's
# time_split helper.
# NOTE(review): presumably the three parts follow the schema above
# (old purchases | 6 weeks | 3 weeks) — confirm in time_split.
data_train_lvl_1, data_train_lvl_2, data_val_lvl_2 = time_split(data)

In [12]:
# Number of rows in each of the three parts returned by time_split.
data_train_lvl_1.shape[0], data_train_lvl_2.shape[0], data_val_lvl_2.shape[0]

(363397, 30113, 20992)

# 4. Level 1 

In [13]:
# Named logger for this notebook. The stray `__all__ = ['preprocess_data']`
# that used to sit here was removed: it was a leftover from a script module
# and named a symbol that is not defined anywhere in this notebook.
logger = logging.getLogger(__name__)

# Level 1: destination of the preprocessed level 1 training transactions.
PATH_1 = 'data/02_intermediate/'
DATA_TRAIN_LVL_1_PATH = PATH_1 + 'data_train_lvl_1.csv.zip'

# Level 2: N_ITEMS is the value passed as `n_items` to get_candidates and
# get_targets_lvl_2 below; the path is where the level 2 train dataset is saved.
N_ITEMS = 100
PATH_2 = 'data/05_model_input/'
TRAIN_DATASET_LVL_2_PATH = PATH_2 + 'train_dataset_lvl_2.csv.zip'

## 1. Preprocess user features and merge with transactions data 

In [14]:
# Encode the raw user features with the project's fit_transform_user_features
# helper, then show the size and first two rows of the result.
user_features_transformed = fit_transform_user_features(user_features)

print(f'user_features_transformed.shape = {user_features_transformed.shape}\n')
user_features_transformed.head(2)

user_features_transformed.shape = (801, 14)



Unnamed: 0,user_id,age_desc,income_desc,homeowner_desc,household_size_desc,kid_category_desc,marital_status_code A,marital_status_code B,marital_status_code U,hh_comp_desc Single Male,hh_comp_desc Single Female,hh_comp_desc 2 Adults No Kids,hh_comp_desc 1 Adult Kids,hh_comp_desc 2 Adults Kids
0,1,5.0,3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,7,3.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [15]:
# Attach the encoded user features to every level 1 transaction. The left join
# keeps all transactions; users without a row in user_features_transformed
# get NaN in the added columns.
# NOTE: the result is bound back to `data_train_lvl_1`, so running this cell
# twice merges the user features a second time.
data_train_lvl_1 = data_train_lvl_1.merge(
    user_features_transformed, how='left', on='user_id'
)
print(f'data_train_lvl_1.shape = {data_train_lvl_1.shape}\n')
data_train_lvl_1.head(2)

data_train_lvl_1.shape = (363397, 26)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,household_size_desc,kid_category_desc,marital_status_code A,marital_status_code B,marital_status_code U,hh_comp_desc Single Male,hh_comp_desc Single Female,hh_comp_desc 2 Adults No Kids,hh_comp_desc 1 Adult Kids,hh_comp_desc 2 Adults Kids
0,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,...,,,,,,,,,,
1,1364,26984896261,1,937406,1,2.5,31742,-0.99,1520,1,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


## 2. Preprocess item features and merge with transactions data

In [16]:
# Encode the raw item features with the project's fit_transform_item_features
# helper, then show the size and first two rows of the result.
item_features_transformed = fit_transform_item_features(item_features)

print(f'item_features_transformed.shape = {item_features_transformed.shape}\n')
item_features_transformed.head(2)

item_features_transformed.shape = (92353, 507)



Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,0,1,2,...,490,491,492,493,494,495,496,497,498,499
0,25671,0.111243,1.0,1.0,0.010057,0.027888,0.000359,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,26081,0.111243,0.012532,1.0,0.175647,0.426295,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Attach the encoded item features to every level 1 transaction (left join on
# item_id, so no transaction is dropped).
# NOTE: the result is bound back to `data_train_lvl_1`, so running this cell
# twice merges the item features a second time.
data_train_lvl_1 = data_train_lvl_1.merge(
    item_features_transformed, how='left', on='item_id'
)
print(f'data_train_lvl_1.shape = {data_train_lvl_1.shape}\n')
data_train_lvl_1.head(2)

data_train_lvl_1.shape = (363397, 532)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,490,491,492,493,494,495,496,497,498,499
0,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1364,26984896261,1,937406,1,2.5,31742,-0.99,1520,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
logging.info('Saving preprocessed level 1 train dataset...')

# Write the merged level 1 frame as a zip-compressed CSV without the index column.
train_data_lvl_1_path = DATA_TRAIN_LVL_1_PATH
data_train_lvl_1.to_csv(train_data_lvl_1_path, index=False, compression='zip')

# 5. Level 2

## 1. Build a recommender

In [19]:
logging.info('Selecting users for level 2 dataset...')

# Fit the level 1 recommender on the preprocessed level 1 transactions.
# NOTE(review): the helper's name suggests it also persists the fitted
# recommender — confirm in train_save_recommender.
recommender = train_save_recommender(data_train_lvl_1)

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/2501 [00:00<?, ?it/s]

## 2. Generate users for level 2 with their preferences

In [20]:
# Build one row per user with a list of candidate items from the recommender;
# `n_items` (= N_ITEMS) is passed through to get_candidates.
n_items = N_ITEMS
users_lvl_2 = get_candidates(
    recommender, data_train_lvl_1, data_train_lvl_2, data_val_lvl_2, n_items
)
print(f'\nusers_lvl_2.shape = {users_lvl_2.shape}\n')
users_lvl_2.head(2)


users_lvl_2.shape = (2101, 2)



Unnamed: 0,user_id,candidates
0,2021,"[1088959, 987628, 1048962, 1067425, 13115279, ..."
1,1753,"[1088959, 987628, 1048962, 1067425, 13115279, ..."


## 3. Generate new features and add user and item embeddings

In [21]:
logging.info('Generating new user-item features...')
# Derive user-item features from the level 1 transactions and the fitted
# recommender (the result includes `user_factor_*` columns, see the preview).
user_item_features = get_user_item_features(recommender, data_train_lvl_1)

print(f'user_item_features.shape = {user_item_features.shape}\n')
user_item_features.head(2)

user_item_features.shape = (363397, 212)



Unnamed: 0,user_id,item_id,median_sales_hour,median_weekday,mean_visits_interval,mean_check,n_stores,n_items,n_transactions,mean_n_items_basket,...,user_factor_91,user_factor_92,user_factor_93,user_factor_94,user_factor_95,user_factor_96,user_factor_97,user_factor_98,user_factor_99,user_factor_100
0,2375,1085983,15.0,4.0,14.548387,12.332105,70,87,124,3.157895,...,-0.34775,-0.28972,0.717852,11.066954,-5.631204,-2.390208,8.382318,4.497089,-3.823046,-0.340527
1,2375,1085983,15.0,4.0,14.548387,12.332105,70,87,124,3.157895,...,-0.34775,-0.28972,0.717852,11.066954,-5.631204,-2.390208,8.382318,4.497089,-3.823046,-0.340527


## 4. Generate the train dataset for the level 2 model

In [22]:
logging.info('Generating train dataset for level 2 model...')

# Combine the per-user candidates, the level 2 training transactions and the
# user-item features into the level 2 training table (it gains a `target` column).
train_dataset_lvl_2 = get_targets_lvl_2(
    users_lvl_2, data_train_lvl_2, user_item_features, n_items
)
print(f'train_dataset_lvl_2.shape = {train_dataset_lvl_2.shape}')
train_dataset_lvl_2.head(2)

train_dataset_lvl_2.shape = (210100, 213)


Unnamed: 0,user_id,item_id,target,median_sales_hour,median_weekday,mean_visits_interval,mean_check,n_stores,n_items,n_transactions,...,user_factor_91,user_factor_92,user_factor_93,user_factor_94,user_factor_95,user_factor_96,user_factor_97,user_factor_98,user_factor_99,user_factor_100
0,2021,1088959,0.0,,,,,,,,...,,,,,,,,,,
1,2021,987628,0.0,,,,,,,,...,,,,,,,,,,


In [23]:
# Sanity check: number of rows (candidate items) per user in the level 2 dataset.
train_dataset_lvl_2['user_id'].value_counts()

2021    100
405     100
1163    100
1780    100
2301    100
       ... 
1158    100
1767    100
740     100
2062    100
2042    100
Name: user_id, Length: 2101, dtype: int64

In [24]:
logging.info('Saving train dataset level 2...')

# Write the level 2 training table as a zip-compressed CSV without the index column.
train_dataset_lvl_2_path = TRAIN_DATASET_LVL_2_PATH
train_dataset_lvl_2.to_csv(
    train_dataset_lvl_2_path, index=False, compression='zip'
)

# 5. Training the model

In [25]:
# Train the level 2 model on the level 2 dataset via the project's train_store
# script. NOTE(review): presumably it stores the model under the name
# 'LightGBM_v1', which the Inference section loads — confirm in train_store.
train_store(train_dataset_lvl_2, 'LightGBM_v1')

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[8]	training's auc: 0.759105	valid_1's auc: 0.759733


# 6. Inference

In [26]:
# Load the model by the name that was passed to train_store above.
model_lgb = load('LightGBM_v1')

In [27]:
# Score every (user, candidate) row: the `target` column is removed and
# missing feature values are replaced with 0 before prediction.
predictions_lgb_train = model_lgb.predict(
    train_dataset_lvl_2.drop('target', axis=1).fillna(0)
)
print(f'predictions_lgb_train.shape = {predictions_lgb_train.shape}\n')
predictions_lgb_train[:7]

predictions_lgb_train.shape = (210100,)



array([0.01015275, 0.01015275, 0.01015275, 0.01015275, 0.01015275,
       0.01015275, 0.01015275])

In [28]:
# Build the per-user results table (`actual` and `recommendations` columns) from
# the validation transactions, the scored candidate rows and their predictions.
result_lvl_2 = get_results(data_val_lvl_2, train_dataset_lvl_2, predictions_lgb_train)

print(f'result_lvl_2.shape = {result_lvl_2.shape}\n')
result_lvl_2.head(2)

result_lvl_2.shape = (1792, 3)



Unnamed: 0,user_id,actual,recommendations
0,1,"[856942, 865456, 951954, 971585, 990656, 11311...","[872137, 865456, 1041796, 1101173, 5569374]"
1,3,"[835476, 999999]","[907631, 883932, 1101173, 9676866, 972931]"


# 7. Metrics

In [29]:
def _precision_at_5(row):
    """Return precision@5 for one row of the results table."""
    return precision_at_k(row['recommendations'], row['actual'], 5)


# Average the per-user precision@5 over all users in the results table.
metrics = result_lvl_2.apply(_precision_at_5, axis=1).mean()

print(f'precision@5 = {metrics}')

precision@5 = 0.06383928571428572


# 8. Training a model on a full dataset

## 8.1. Train-validation time split for the two-stage recommender system

Train - validation schema:

-- old purchases -- | -- 6 weeks-- 

In [30]:
validation_weeks = 6
# First week number that goes to the validation part.
# NOTE(review): with integer week numbers, `>= max - 6` covers 7 distinct
# weeks (max-6 .. max), one more than the "6 weeks" in the schema above — confirm intent.
valid_start_week = data['week_no'].max() - validation_weeks
data_train = data[data['week_no'] < valid_start_week]
data_valid = data[data['week_no'] >= valid_start_week]

## 8.2 Generate users for level 2 with their preferences

In [31]:
# Build the per-user candidate lists for the final training set and pass the
# path of the candidates file on to get_candidates.
# NOTE(review): `recommender` is still the one fitted on data_train_lvl_1 in
# section 5; it is not refitted on `data_train` here — confirm this is intended.
PATH = 'data/04_feature/'
candidates_path = PATH + 'candidates_final.csv.zip'
users_final = get_candidates(
    recommender, data_train, data_valid, n_items=n_items, candidates_path=candidates_path
)
print(f'users_final.shape = {users_final.shape}\n')
users_final.head(2)

users_final.shape = (1979, 2)



Unnamed: 0,user_id,candidates
0,84,"[1088959, 987628, 1048962, 1067425, 13115279, ..."
1,2200,"[1088959, 987628, 1048962, 1067425, 13115279, ..."


## 8.3. Generate new features and add user and item embeddings

In [32]:
logging.info('Generating new user-item features...')

PATH = 'data/04_feature/'
user_item_features_path = PATH + 'user_item_features_final.csv.zip'

# Derive user-item features from the final training transactions; the file
# path is handed to the helper through its `user_item_features_path` argument.
user_item_features_final = get_user_item_features(
    recommender,
    data_train,
    user_item_features_path=user_item_features_path,
)

print(f'user_item_features_final.shape = {user_item_features_final.shape}\n')
user_item_features_final.head(2)

user_item_features_final.shape = (378567, 212)



Unnamed: 0,user_id,item_id,median_sales_hour,median_weekday,mean_visits_interval,mean_check,n_stores,n_items,n_transactions,mean_n_items_basket,...,user_factor_91,user_factor_92,user_factor_93,user_factor_94,user_factor_95,user_factor_96,user_factor_97,user_factor_98,user_factor_99,user_factor_100
0,2375,1085983,15.0,4.0,16.529412,12.354878,71,90,131,3.097561,...,-0.34775,-0.28972,0.717852,11.066954,-5.631204,-2.390208,8.382318,4.497089,-3.823046,-0.340527
1,2375,1085983,15.0,4.0,16.529412,12.354878,71,90,131,3.097561,...,-0.34775,-0.28972,0.717852,11.066954,-5.631204,-2.390208,8.382318,4.497089,-3.823046,-0.340527


## 8.4. Generate train dataset for level 2 model

In [33]:
logging.info('Generating train dataset for level 2 model...')

# Combine the final candidates, the validation-period transactions and the
# final user-item features into the final training table (with a `target` column).
train_dataset_final = get_targets_lvl_2(
    users_final, data_valid, user_item_features_final, n_items
)
print(f'train_dataset_final.shape = {train_dataset_final.shape}')
train_dataset_final.head(2)

train_dataset_final.shape = (197900, 213)


Unnamed: 0,user_id,item_id,target,median_sales_hour,median_weekday,mean_visits_interval,mean_check,n_stores,n_items,n_transactions,...,user_factor_91,user_factor_92,user_factor_93,user_factor_94,user_factor_95,user_factor_96,user_factor_97,user_factor_98,user_factor_99,user_factor_100
0,84,1088959,0.0,,,,,,,,...,,,,,,,,,,
1,84,987628,0.0,,,,,,,,...,,,,,,,,,,


In [34]:
# Sanity check: number of rows (candidate items) per user in the final dataset.
train_dataset_final['user_id'].value_counts()

84      100
2005    100
1686    100
1564    100
491     100
       ... 
92      100
1775    100
1699    100
2130    100
1635    100
Name: user_id, Length: 1979, dtype: int64

In [35]:
logging.info('Saving train dataset final...')

# Destination of the final training table (zip-compressed CSV, no index column).
PATH = 'data/05_model_input/'
TRAIN_DATASET_FINAL_PATH = PATH + 'train_dataset_final.csv.zip'

train_dataset_final_path = TRAIN_DATASET_FINAL_PATH
train_dataset_final.to_csv(
    train_dataset_final_path, index=False, compression='zip'
)

## 8.5. Training the model 

In [36]:
# Train the final level 2 model on the final dataset via train_store; the name
# 'LightGBM_v1_final' is the one passed to load() in section 8.6.3.
train_store(train_dataset_final, 'LightGBM_v1_final')

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[356]	training's auc: 0.850713	valid_1's auc: 0.83445


## 8.6. Inference

### 8.6.1 Test data ingestion

In [37]:
# Hold-out transactions are read straight from remote storage (network access required).
TEST_URL = 'https://storage.yandexcloud.net/recsys-retail-input/test.csv'
test = pd.read_csv(TEST_URL)

# Distinct users and items present in the test transactions.
n_test_users = test['user_id'].nunique()
n_test_items = test['item_id'].nunique()
print(f'test.shape = {test.shape}\n')
print(f'The number of users = {n_test_users}')
print(f'The number of items = {n_test_items}\n')

test.head(2)

test.shape = (88734, 12)

The number of users = 1885
The number of items = 20497



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0


### 8.6.2. Test data preprocessing

#### Users for inference with their preferences

In [38]:
# Build the per-user candidate lists for inference from the test transactions;
# `n_items` is reset to N_ITEMS before the call.
n_items = N_ITEMS
users_inference = get_candidates(
    recommender, data_train, test, n_items=n_items
)
print(f'\nusers_inference.shape = {users_inference.shape}\n')
users_inference.head(2)


users_inference.shape = (1885, 2)



Unnamed: 0,user_id,candidates
0,1340,"[1088959, 987628, 1048962, 1067425, 13115279, ..."
1,588,"[1088959, 987628, 1048962, 1067425, 13115279, ..."


#### Test dataset for inference

In [39]:
# Build the feature table for the test users' candidates.
# NOTE(review): the `target` column here is derived from `data_valid`, not from
# `test`. It is dropped before prediction in section 8.6.3, so it does not
# reach the model, but confirm that passing `data_valid` is intended.
test_dataset_inference = get_targets_lvl_2(
    users_inference, data_valid, user_item_features_final, n_items
)

# The label now names the variable actually printed (it previously read
# `targets_inference`, a name that does not exist in this notebook).
print(f'test_dataset_inference.shape = {test_dataset_inference.shape}\n')
test_dataset_inference.head(2)

targets_inference.shape = (188500, 213)



Unnamed: 0,user_id,item_id,target,median_sales_hour,median_weekday,mean_visits_interval,mean_check,n_stores,n_items,n_transactions,...,user_factor_91,user_factor_92,user_factor_93,user_factor_94,user_factor_95,user_factor_96,user_factor_97,user_factor_98,user_factor_99,user_factor_100
0,1340,1088959,0.0,,,,,,,,...,,,,,,,,,,
1,1340,987628,0.0,,,,,,,,...,,,,,,,,,,


### 8.6.3 Generating recommendations

In [40]:
# Load the final model by name and score every (user, candidate) row of the
# inference table; `target` is removed and missing features are filled with 0.
model_lgb = load('LightGBM_v1_final')

predictions_final = model_lgb.predict(
    test_dataset_inference.drop('target', axis=1).fillna(0)
)
print(f'predictions_final.shape = {predictions_final.shape}\n')
predictions_final[:7]

predictions_final.shape = (188500,)



array([0.0046502 , 0.00792365, 0.0046502 , 0.0046502 , 0.0062072 ,
       0.00565147, 0.0062072 ])

In [41]:
# Build the per-user results table (`actual` and `recommendations` columns) from
# the test transactions, the scored candidate rows and their predictions.
result_final = get_results(test, test_dataset_inference, predictions_final)

print(f'result_final.shape = {result_final.shape}\n')
result_final.head(2)

result_final.shape = (1885, 3)



Unnamed: 0,user_id,actual,recommendations
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[865456, 872137, 1041796, 1101173, 5569374]"
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[947798, 883932, 835098, 1001277, 972931]"


## 8.7. Metrics

In [42]:
# Per-user precision@5 on the test results, then its mean over all users.
precision_per_user = result_final.apply(
    lambda row: precision_at_k(row['recommendations'], row['actual'], 5),
    axis=1,
)
metrics = precision_per_user.mean()

print(f'precision@5 = {metrics}')

precision@5 = 0.052413793103448285


## 8.8. Saving final results


In [43]:
# The directory prefix must end with '/' because the file name is appended by
# plain string concatenation; without it the file was written to
# 'data/06_model_outputrecommendations_v1_csv.zip'. The file name also follows
# the '.csv.zip' convention used for every other artefact in this notebook.
PATH = 'data/06_model_output/'
FINAL_RESULTS_PATH = PATH + 'recommendations_v1.csv.zip'
final_results_path = FINAL_RESULTS_PATH

# Make sure the output directory exists before writing into it.
os.makedirs(PATH, exist_ok=True)

# Keep only the user id and its recommendation list, and store the table as a
# zip-compressed CSV without the index column.
df = result_final[['user_id', 'recommendations']].copy()
df.to_csv(final_results_path, index=False, compression='zip')
df.head(2)

Unnamed: 0,user_id,recommendations
0,1,"[865456, 872137, 1041796, 1101173, 5569374]"
1,2,"[947798, 883932, 835098, 1001277, 972931]"
