<a href="https://colab.research.google.com/github/Yanina-Kutovaya/RecSys-retail/blob/main/notebooks/Baseline_model_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Optional setup, left disabled: download, unpack and install the AWS CLI v2
# (Linux x86_64 build), clean up the temporary files, then configure credentials.
# Uncomment the lines below to run it.
#!curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o /tmp/awscliv2.zip
#!unzip -q /tmp/awscliv2.zip -d /tmp
#!rm /tmp/awscliv2.zip
#!sudo /tmp/aws/install --update
#!rm -rf /tmp/aws/ 
#!aws configure
# eu-west-1  (presumably the region to enter at the `aws configure` prompt -- confirm)

In [2]:
# Optional setup, left disabled: clone the project repository and install the
# requirements file named `requirements_Colab.txt`. Uncomment to run.
#!git clone https://github.com/Yanina-Kutovaya/RecSys-retail.git
#!pip install -r RecSys-retail/requirements_Colab.txt

In [3]:
# Switch the working directory to the cloned repository; the later cells use
# paths relative to it (e.g. 'data/01_raw/test.csv').
# NOTE(review): the path is relative, so this cell only works from the parent
# directory of the clone.
%cd RecSys-retail

/content/RecSys-retail


In [4]:
import os
import sys

# Make the project importable from the notebook: the repository root (for
# `src.recsys_retail...` / `scripts...` imports) and the package directory
# itself (for the short `data...`, `features...`, `models...` imports).
# Each entry is added only if missing, so re-running this cell does not keep
# growing `sys.path` with duplicates.
_repo_root = os.getcwd()
for _extra_path in (_repo_root, os.path.join(_repo_root, "src", "recsys_retail")):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

In [5]:
import logging
import pandas as pd
import joblib

from data.make_dataset import load_data
from features.data_time_split import time_split, time_split_2
from features.prefilter import prefilter_items
from features.user_features import fit_transform_user_features
from features.item_features import fit_transform_item_features
from features.recommenders import MainRecommender
from features.candidates_lvl_2 import get_candidates
from features.new_item_user_features import get_user_item_features
from features.targets import get_targets_lvl_2
from data.validation import train_test_split
from models.serialize import store, load
from scripts.train_save_model import train_store
from src.recsys_retail.metrics import (
   get_results, adjust_results_for_metrics, precision_at_k
)

In [6]:
import warnings
warnings.filterwarnings("ignore")

# 1. Data ingestion

In [7]:
# Optional, left disabled: run `dvc pull` in the shell before loading data.
# Uncomment to run.
#!dvc pull

In [8]:
# Load the three input frames through the project's `load_data` helper:
# transactions (`data`), item attributes and user attributes.
# Their shapes and sample rows are shown in the next cells.
data, item_features, user_features = load_data()

In [9]:
# Print the transaction table's dimensions, then render its first two rows as
# the cell output.
print(f'data.shape = {data.shape}\n')
data.head(2)

data.shape = (2396804, 12)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [10]:
# Print the item-attribute table's dimensions, then render its first two rows.
print(f'item_features.shape = {item_features.shape}\n')
item_features.head(2)

item_features.shape = (92353, 7)



Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [11]:
# Print the user-attribute table's dimensions, then render its first two rows.
print(f'user_features.shape = {user_features.shape}\n')
user_features.head(2)

user_features.shape = (801, 8)



Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


# 2. Prefilter transactions data

In [12]:
logging.info('Preprocessing level 1 train dataset...')


def _dataset_size(transactions):
    """Return (distinct items, distinct users, row count) of a transactions frame."""
    return (
        transactions['item_id'].nunique(),
        transactions['user_id'].nunique(),
        transactions.shape[0],
    )


# Measure the data before and after prefiltering so the reduction can be reported.
# Note that `data` is rebound to the filtered frame here.
i0, u0, t0 = _dataset_size(data)
data = prefilter_items(data, item_features)
i1, u1, t1 = _dataset_size(data)

print(f'The number of items decreased from {i0} to {i1}')
print(f'The number of users decreased from {u0} to {u1}')
print(f'The number of transactions decreased from {t0} to {t1}')

The number of items decreased from 89051 to 2501
The number of users decreased from 2499 to 2472
The number of transactions decreased from 2396804 to 414502


In [13]:
# Print the dimensions of the prefiltered transactions and render the first
# two rows (the recorded output shows an extra `price` column compared with
# the raw table).
print(f'data.shape = {data.shape}\n')
data.head(2)

data.shape = (414502, 13)



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0,2.99
14,1364,26984896261,1,937406,1,2.5,31742,-0.99,1520,1,0.0,0.0,2.5


# 3. Train-validation-test time split for two-stage recommender system

Train - validation - test schema:

-- old purchases -- | -- 6 weeks -- | -- 3 weeks --

In [14]:
logging.info('Splitting dataset for level 1, level 2 preprocessing...')

# Split the prefiltered transactions into three frames with the project's
# `time_split` helper: level-1 training data, level-2 training data and
# level-2 validation data (presumably the "old purchases | 6 weeks | 3 weeks"
# schema described above -- confirm against `time_split`).
data_train_lvl_1, data_train_lvl_2, data_val_lvl_2 = time_split(data)

In [15]:
# Row counts of the three splits, in order:
# level-1 train, level-2 train, level-2 validation.
tuple(
    split.shape[0]
    for split in (data_train_lvl_1, data_train_lvl_2, data_val_lvl_2)
)

(363397, 30113, 20992)

# 4. Level 1 

In [16]:
# Logger named after the current module.
# NOTE(review): the visible cells log through `logging.info(...)` and never use
# this object -- confirm whether it is still needed.
logger = logging.getLogger(__name__)

# NOTE(review): `preprocess_data` is not defined in any visible cell; this looks
# like a leftover from a script version of the pipeline -- confirm it can go.
__all__ = ['preprocess_data']

# Passed as `n_items` to `get_candidates` / `get_targets_lvl_2` below
# (presumably the number of candidate items per user -- confirm).
N_ITEMS = 200
# Location of the level-2 training set that section 5.4 writes to disk.
PATH_2 = 'data/05_model_input/'
TRAIN_DATASET_LVL_2_PATH = PATH_2 + 'train_dataset_lvl_2.csv.zip'

## 4.1 Build a recommender

In [17]:
logging.info('Selecting users for level 2 dataset...')

# ALS settings, forwarded by keyword to the project's MainRecommender.
als_settings = dict(
    n_factors_ALS=50,
    regularization_ALS=0.001,
    iterations_ALS=15,
    num_threads_ALS=4,
)

# Build the level-1 recommender from the level-1 training transactions
# (the progress bars in the recorded output suggest fitting happens during
# construction).
recommender = MainRecommender(data_train_lvl_1, **als_settings)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/2501 [00:00<?, ?it/s]

## 4.2 Generate users for level 2

In [18]:
# Reuse the notebook-wide constant as the `n_items` argument below.
n_items = N_ITEMS
# Build the level-2 user table with the project's `get_candidates` helper;
# the recorded output shows one `candidates` list per `user_id`.
users_lvl_2 = get_candidates(
    recommender, data_train_lvl_1, data_train_lvl_2, data_val_lvl_2, n_items
)
# Print the dimensions, then render the first two rows.
print(f'\nusers_lvl_2.shape = {users_lvl_2.shape}\n')
users_lvl_2.head(2)


users_lvl_2.shape = (2101, 2)



Unnamed: 0,user_id,candidates
0,2021,"[1088959, 987628, 1048962, 1067425, 13115279, ..."
1,1753,"[1088959, 987628, 1048962, 1067425, 13115279, ..."


# 5. Level 2

## 5.1 Generate new user-item features, including user and item embeddings

In [19]:
logging.info('Generating new user-item features...')
# Derive user-item features from the level-1 training transactions using the
# fitted recommender; the recorded output includes `user_factor_*` columns.
user_item_features = get_user_item_features(recommender, data_train_lvl_1)

# Print the dimensions, then render the first two rows.
print(f'user_item_features.shape = {user_item_features.shape}\n')
user_item_features.head(2)

user_item_features.shape = (363397, 221)



Unnamed: 0,user_id,item_id,sales_value,retail_disc,coupon_disc,coupon_match_disc,price,n_baskets_user,n_items_user,sales_value_user,...,user_factor_41,user_factor_42,user_factor_43,user_factor_44,user_factor_45,user_factor_46,user_factor_47,user_factor_48,user_factor_49,user_factor_50
0,2375,1085983,2.99,-0.4,0.0,0.0,2.99,124,124,468.62,...,0.825242,1.843381,1.283919,-1.265464,-6.855611,-0.158912,1.788134,-3.631162,-2.304692,-3.69075
1,2375,1085983,3.49,0.0,0.0,0.0,3.49,124,124,468.62,...,0.825242,1.843381,1.283919,-1.265464,-6.855611,-0.158912,1.788134,-3.631162,-2.304692,-3.69075


## 5.2 Preprocess user features

In [20]:
# Transform the raw user attributes with the project's
# `fit_transform_user_features` helper (the recorded output shows encoded
# indicator-style columns keyed by `user_id`).
user_features_transformed = fit_transform_user_features(user_features)

# Print the dimensions, then render the first two rows.
print(f'user_features_transformed.shape = {user_features_transformed.shape}\n')
user_features_transformed.head(2)

user_features_transformed.shape = (801, 42)



Unnamed: 0,user_id,marital_status_code_A,marital_status_code_U,marital_status_code_B,homeowner_desc_Homeowner,homeowner_desc_Unknown,homeowner_desc_Renter,homeowner_desc_Probable Renter,homeowner_desc_Probable Owner,hh_comp_desc_2 Adults No Kids,...,income_desc_8,income_desc_9,income_desc_10,household_size_desc_0,household_size_desc_1,household_size_desc_2,household_size_desc_3,kid_category_desc_0,kid_category_desc_1,kid_category_desc_2
0,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,7,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


## 5.3 Preprocess item features

In [21]:
# Transform the raw item attributes with the project's
# `fit_transform_item_features` helper (the recorded output is indexed by
# `item_id`).
item_features_transformed = fit_transform_item_features(item_features)

# Print the dimensions, then render the first two rows.
print(f'item_features_transformed.shape = {item_features_transformed.shape}\n')
item_features_transformed.head(2)

item_features_transformed.shape = (92353, 46)



Unnamed: 0_level_0,brand,manufacturer_count,department_count,commodity_desc_count,sub_commodity_desc_count,curr_size_of_product_count,manufacturer_0,manufacturer_1,manufacturer_2,manufacturer_3,...,item_desc_22,item_desc_23,item_desc_24,item_desc_25,item_desc_26,item_desc_27,item_desc_28,item_desc_29,item_desc_30,item_desc_31
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25671,1,0.015278,0.42252,0.000314,0.000314,0.00013,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
26081,1,0.015278,0.005306,0.005306,0.004645,0.331413,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 5.4 Generate train dataset for level 2 model

In [22]:
logging.info('Generating train dataset for level 2 model...')

# Assemble the level-2 training frame with the project's `get_targets_lvl_2`
# helper from: the user/candidate table, the level-2 training transactions,
# the transformed item and user features, the user-item features and `n_items`.
# The recorded output shows a `target` column next to the feature columns.
train_dataset_lvl_2 = get_targets_lvl_2(
    users_lvl_2, 
    data_train_lvl_2,
    item_features_transformed, 
    user_features_transformed,     
    user_item_features, 
    n_items
)
# Print the dimensions, then render the first two rows.
print(f'train_dataset_lvl_2.shape = {train_dataset_lvl_2.shape}\n')
train_dataset_lvl_2.head(2)

train_dataset_lvl_2.shape = (453441, 309)



Unnamed: 0,user_id,item_id,target,brand,manufacturer_count,department_count,commodity_desc_count,sub_commodity_desc_count,curr_size_of_product_count,manufacturer_0,...,user_factor_41,user_factor_42,user_factor_43,user_factor_44,user_factor_45,user_factor_46,user_factor_47,user_factor_48,user_factor_49,user_factor_50
0,2021,1088959,0.0,1,0.000845,0.42252,0.008879,0.001949,0.016372,0,...,,,,,,,,,,
1,2021,987628,0.0,0,0.137256,0.42252,0.00157,0.000271,0.010168,0,...,,,,,,,,,,


In [23]:
logging.info('Saving train dataset level 2...')

# Write the level-2 training frame as a zip-compressed CSV (without the index)
# to the path defined in the constants cell.
train_dataset_lvl_2_path = TRAIN_DATASET_LVL_2_PATH
train_dataset_lvl_2.to_csv(
    train_dataset_lvl_2_path, index=False, compression='zip'
)

# 6. Training the model

In [24]:
# Run the project's `train_store` helper on the level-2 training frame under
# the name 'baseline_v1' (the recorded output is a LightGBM training log; the
# same name is passed to `load` in the next section).
train_store(train_dataset_lvl_2, 'baseline_v1')

[LightGBM] [Info] Number of positive: 11332, number of negative: 351420
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 41097
[LightGBM] [Info] Number of data points in the train set: 362752, number of used features: 304
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031239 -> initscore=-3.434352
[LightGBM] [Info] Start training from score -3.434352
Training until validation scores don't improve for 30 rounds
[1000]	training's auc: 0.954571	valid_1's auc: 0.938769
[2000]	training's auc: 0.960669	valid_1's auc: 0.940868
Early stopping, best iteration is:
[2132]	training's auc: 0.961491	valid_1's auc: 0.941151


# 7. Inference

In [25]:
# Load the model stored under the name 'baseline_v1' via the project's `load` helper.
model_lgb = load('baseline_v1')

In [26]:
# Score every row of the level-2 training frame: drop the `target` column and
# replace missing feature values with 0 before predicting.
# NOTE(review): this is the same frame that was passed to `train_store`, so
# these scores are at least partly in-sample -- confirm this is intended.
predictions_train = model_lgb.predict(
    train_dataset_lvl_2.drop('target', axis=1).fillna(0)
)
# Print the number of predictions, then show the first seven scores.
print(f'predictions_train.shape = {predictions_train.shape}\n')
predictions_train[:7]

predictions_train.shape = (453441,)



array([0.00189324, 0.00420652, 0.00245615, 0.00282851, 0.00230294,
       0.00209848, 0.00336822])

In [27]:
# Combine the level-2 validation transactions, the scored frame and its
# predictions with the project's `get_results` helper; the recorded output has
# per-user `actual`, `actual_adj` and `recommendations` columns.
result_lvl_2 = get_results(data_val_lvl_2, train_dataset_lvl_2, predictions_train)

# Print the dimensions, then render the first two rows.
print(f'result_lvl_2.shape = {result_lvl_2.shape}\n')
result_lvl_2.head(2)

result_lvl_2.shape = (1792, 4)



Unnamed: 0,user_id,actual,actual_adj,recommendations
0,1,"[856942, 865456, 951954, 971585, 990656, 11311...","[865456, 990656, 1041796, 1115576]","[10149640, 865456, 872137, 7025114, 1005274]"
1,3,"[835476, 999999]",,"[907631, 972931, 1101173, 964968, 883932]"


# 8. Metrics

In [28]:
# Post-process the results with the project's `adjust_results_for_metrics`
# helper; the recorded output has fewer rows than the input (275 vs 1792) and
# an added `len_actual_adj` column.
adjusted_results_lvl_2 = adjust_results_for_metrics(result_lvl_2)

# Print the dimensions, then render the first two rows.
print(f'adjusted_results_lvl_2.shape = {adjusted_results_lvl_2.shape}\n')
adjusted_results_lvl_2.head(2)

adjusted_results_lvl_2.shape = (275, 5)



Unnamed: 0,user_id,actual,actual_adj,recommendations,len_actual_adj
12,19,"[944466, 999104, 1031083, 1048483, 6533236, 94...","[944466, 999104, 1048483, 940700, 6533608, 883...","[999104, 1101173, 837751, 1131310, 944466]",9
14,22,"[916758, 935968, 1119830, 13158992, 999999, 10...","[825749, 880150, 944836, 995645, 5569374]","[944836, 835098, 990656, 880150, 999104]",5


In [29]:
def _precision_at_5_lvl_2(row):
    """Precision@5 of one result row: its `recommendations` against its `actual` items."""
    return precision_at_k(row['recommendations'], row['actual'], 5)


# Mean precision@5 over all rows of the adjusted level-2 results.
metrics = adjusted_results_lvl_2.apply(_precision_at_5_lvl_2, axis=1).mean()

print(f'precision@5 = {metrics}')

precision@5 = 0.2923636363636364


# 9. Training a model on a full dataset

## 9.1. Train-validation time split for two-stage recommender system

Train - validation schema:

-- old purchases -- | -- 6 weeks --

In [30]:
# Two-way split of the prefiltered transactions via the project's
# `time_split_2` helper (presumably "old purchases | 6 weeks", per the schema
# above -- confirm against `time_split_2`).
data_train, data_valid = time_split_2(data)

## 9.2 Generate users for level 2 with their preferences

In [31]:
# Build the user/candidate table for the full-data run.
# NOTE(review): `recommender` is the object built earlier from
# `data_train_lvl_1`; it is not rebuilt on `data_train` here -- confirm this
# is intended.
users_final = get_candidates(
    recommender, data_train, data_valid, n_items=n_items)
# Print the dimensions, then render the first two rows.
print(f'users_final.shape = {users_final.shape}\n')
users_final.head(2)

users_final.shape = (1979, 2)



Unnamed: 0,user_id,candidates
0,84,"[1088959, 987628, 1048962, 1067425, 13115279, ..."
1,2200,"[1088959, 987628, 1048962, 1067425, 13115279, ..."


## 9.3. Generate new user-item features, including user and item embeddings

In [32]:
logging.info('Generating new user-item features...')

# Derive user-item features from the full training transactions, using the
# same `recommender` object as in section 5.1.
user_item_features_final = get_user_item_features(
    recommender, data_train)

# Print the dimensions, then render the first two rows.
print(f'user_item_features_final.shape = {user_item_features_final.shape}\n')
user_item_features_final.head(2)

user_item_features_final.shape = (378567, 221)



Unnamed: 0,user_id,item_id,sales_value,retail_disc,coupon_disc,coupon_match_disc,price,n_baskets_user,n_items_user,sales_value_user,...,user_factor_41,user_factor_42,user_factor_43,user_factor_44,user_factor_45,user_factor_46,user_factor_47,user_factor_48,user_factor_49,user_factor_50
0,2375,1085983,2.99,-0.4,0.0,0.0,2.99,131,131,506.55,...,0.825242,1.843381,1.283919,-1.265464,-6.855611,-0.158912,1.788134,-3.631162,-2.304692,-3.69075
1,2375,1085983,3.49,0.0,0.0,0.0,3.49,131,131,506.55,...,0.825242,1.843381,1.283919,-1.265464,-6.855611,-0.158912,1.788134,-3.631162,-2.304692,-3.69075


## 9.4. Generate train dataset for level 2 model

In [33]:
logging.info('Generating train dataset for level 2 model...')

# Assemble the final level-2 training frame with `get_targets_lvl_2`, this
# time from the full-data user table, the `data_valid` transactions and the
# final user-item features. The transformed item and user features are reused
# from sections 5.2 and 5.3.
train_dataset_final = get_targets_lvl_2(
    users_final, 
    data_valid,
    item_features_transformed, 
    user_features_transformed,      
    user_item_features_final, 
    n_items
)
# Print the dimensions, then render the first two rows.
print(f'train_dataset_final.shape = {train_dataset_final.shape}')
train_dataset_final.head(2)

train_dataset_final.shape = (430226, 309)


Unnamed: 0,user_id,item_id,target,brand,manufacturer_count,department_count,commodity_desc_count,sub_commodity_desc_count,curr_size_of_product_count,manufacturer_0,...,user_factor_41,user_factor_42,user_factor_43,user_factor_44,user_factor_45,user_factor_46,user_factor_47,user_factor_48,user_factor_49,user_factor_50
0,84,1088959,0.0,1,0.000845,0.42252,0.008879,0.001949,0.016372,0,...,,,,,,,,,,
1,84,987628,0.0,0,0.137256,0.42252,0.00157,0.000271,0.010168,0,...,,,,,,,,,,


In [34]:
# Number of rows per user in the final training frame (the recorded output
# ranges from 200 to 400 rows per user).
train_dataset_final['user_id'].value_counts()

982     400
1609    396
2004    390
371     373
1475    367
       ... 
659     200
2404    200
2287    200
87      200
1635    200
Name: user_id, Length: 1979, dtype: int64

In [35]:
logging.info('Saving train dataset final...')

# Same directory string as `PATH_2` in the constants cell.
# NOTE(review): `PATH` is reassigned to a different directory in section 9.8.
PATH = 'data/05_model_input/'
TRAIN_DATASET_FINAL_PATH = PATH + 'train_dataset_final.csv.zip'

# Write the final training frame as a zip-compressed CSV, without the index.
train_dataset_final_path = TRAIN_DATASET_FINAL_PATH
train_dataset_final.to_csv(
    train_dataset_final_path, index=False, compression='zip'
)

## 9.5. Training the model 

In [36]:
# Run the project's `train_store` helper on the final training frame under the
# name 'LightGBM_v1_final' (the same name is passed to `load` in section 9.6.3).
train_store(train_dataset_final, 'LightGBM_v1_final')

[LightGBM] [Info] Number of positive: 13163, number of negative: 331017
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 41179
[LightGBM] [Info] Number of data points in the train set: 344180, number of used features: 304
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.038245 -> initscore=-3.224760
[LightGBM] [Info] Start training from score -3.224760
Training until validation scores don't improve for 30 rounds
[1000]	training's auc: 0.951168	valid_1's auc: 0.939032
[2000]	training's auc: 0.956754	valid_1's auc: 0.942373
Early stopping, best iteration is:
[2895]	training's auc: 0.962691	valid_1's auc: 0.943621


## 9.6. Inference

### 9.6.1 Test data ingestion

In [37]:
# Read the raw test transactions from the repository's data folder.
test = pd.read_csv('data/01_raw/test.csv')

# Distinct users and items present in the test transactions.
u, i = (test[column].nunique() for column in ('user_id', 'item_id'))

print(f'test.shape = {test.shape}\n')
print(f'The number of users = {u}')
print(f'The number of items = {i}\n')

test.head(2)

test.shape = (88734, 12)

The number of users = 1885
The number of items = 20497



Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0


### 9.6.2. Test data preprocessing

#### Users for inference with their preferences

In [38]:
# Re-assign `n_items` from the notebook-wide constant (same value as before).
n_items = N_ITEMS
# Build the user/candidate table for the test transactions, using the full
# training transactions and the existing `recommender`.
users_inference = get_candidates(
    recommender, data_train, test, n_items=n_items
)
# Print the dimensions, then render the first two rows.
print(f'\nusers_inference.shape = {users_inference.shape}\n')
users_inference.head(2)


users_inference.shape = (1885, 2)



Unnamed: 0,user_id,candidates
0,1340,"[1088959, 987628, 1048962, 1067425, 13115279, ..."
1,588,"[1088959, 987628, 1048962, 1067425, 13115279, ..."


#### Test dataset for inference

In [39]:
# Assemble the inference frame for the test users with `get_targets_lvl_2`.
# NOTE(review): the second argument is `data_valid`, not `test`, so the
# `target` column produced here presumably reflects validation-period
# purchases. It is dropped before prediction in section 9.6.3 -- confirm that
# `get_targets_lvl_2` does not use this argument to filter rows.
test_dataset_inference = get_targets_lvl_2(
    users_inference, 
    data_valid,
    item_features_transformed, 
    user_features_transformed,    
    user_item_features_final,     
    n_items
    )

# NOTE(review): the printed label says `targets_inference` although the
# variable is named `test_dataset_inference`.
print(f'targets_inference.shape = {test_dataset_inference.shape}\n')
test_dataset_inference.head(2)

targets_inference.shape = (409871, 309)



Unnamed: 0,user_id,item_id,target,brand,manufacturer_count,department_count,commodity_desc_count,sub_commodity_desc_count,curr_size_of_product_count,manufacturer_0,...,user_factor_41,user_factor_42,user_factor_43,user_factor_44,user_factor_45,user_factor_46,user_factor_47,user_factor_48,user_factor_49,user_factor_50
0,1340,1088959,0.0,1,0.000845,0.42252,0.008879,0.001949,0.016372,0,...,,,,,,,,,,
1,1340,987628,0.0,0,0.137256,0.42252,0.00157,0.000271,0.010168,0,...,,,,,,,,,,


### 9.6.3 Generating recommendations

In [40]:
# Load the model stored under 'LightGBM_v1_final'; this rebinds `model_lgb`,
# which previously held the 'baseline_v1' model.
model_lgb = load('LightGBM_v1_final')

# Score the inference frame: drop the `target` column and replace missing
# feature values with 0 before predicting.
predictions_final = model_lgb.predict(
    test_dataset_inference.drop('target', axis=1).fillna(0)
)
# Print the number of predictions, then show the first seven scores.
print(f'predictions_final.shape = {predictions_final.shape}\n')
predictions_final[:7]

predictions_final.shape = (409871,)



array([0.00291487, 0.00473868, 0.00296673, 0.00238004, 0.00208429,
       0.00360961, 0.00462089])

In [41]:
# Combine the test transactions, the inference frame and its predictions with
# the project's `get_results` helper; the recorded output has per-user
# `actual`, `actual_adj` and `recommendations` columns.
result_final = get_results(test, test_dataset_inference, predictions_final)

# Print the dimensions, then render the first two rows.
print(f'result_final.shape = {result_final.shape}\n')
result_final.head(2)

result_final.shape = (1885, 4)



Unnamed: 0,user_id,actual,actual_adj,recommendations
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...",[990656],"[10149640, 865456, 1041796, 990656, 5569374]"
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[885023, 947798, 990656, 12302069]","[885023, 835098, 1108094, 947798, 883932]"


## 9.7. Metrics

In [42]:
# Post-process the final results with `adjust_results_for_metrics`; the
# recorded output has fewer rows than the input (183 vs 1885) and an added
# `len_actual_adj` column.
adjusted_results = adjust_results_for_metrics(result_final)

# Print the dimensions, then render the first two rows.
print(f'adjusted_results.shape = {adjusted_results.shape}\n')
adjusted_results.head(2)

adjusted_results.shape = (183, 5)



Unnamed: 0,user_id,actual,actual_adj,recommendations,len_actual_adj
9,14,"[833715, 835098, 835347, 846823, 853643, 85531...","[835098, 869577, 878445, 898068, 1082310, 1091...","[1130858, 1101173, 961269, 1131310, 5568721]",6
13,19,"[849843, 883404, 944486, 1021133, 1033615, 113...","[999104, 1063577, 1101173, 837751, 1118012, 82...","[999104, 918335, 1131310, 883932, 944466]",8


In [43]:
# Precision@5 of each row's `recommendations` against its `actual` items ...
precision_per_row = adjusted_results.apply(
    lambda row: precision_at_k(row['recommendations'], row['actual'], 5),
    axis=1,
)
# ... averaged over all rows of the adjusted final results.
metrics = precision_per_row.mean()

print(f'precision@5 = {metrics}')

precision@5 = 0.29180327868852457


## 9.8. Saving final results


In [44]:
# Output directory for model results. The trailing slash is required because
# the file path below is built by plain string concatenation; without it the
# file was written to 'data/06_model_outputrecommendations_v1_csv.zip'.
PATH = 'data/06_model_output/'
# File name uses the same '<name>.csv.zip' pattern as the other artifacts
# written by this notebook (it was 'recommendations_v1_csv.zip').
FINAL_RESULTS_PATH = PATH + 'recommendations_v1.csv.zip'
final_results_path = FINAL_RESULTS_PATH

# Keep only the user id and its recommendation list, write them as a
# zip-compressed CSV without the index, then render the first two rows.
df = result_final[['user_id', 'recommendations']].copy()
df.to_csv(final_results_path, index=False, compression='zip')
df.head(2)

Unnamed: 0,user_id,recommendations
0,1,"[10149640, 865456, 1041796, 990656, 5569374]"
1,2,"[885023, 835098, 1108094, 947798, 883932]"
