In [1]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#%matplotlib inline

# Для работы с матрицами

from scipy.sparse import csr_matrix, coo_matrix
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier
from lightfm import LightFM

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# evaluation metrics
from lightfm.evaluation import precision_at_k, recall_at_k
from src.metrics import prec_at_k, rec_at_k

# Написанные нами функции

from src.utils import prefilter_items, get_user_matrix, get_item_matrix, get_useritem_matrix, cold_user_recommend
from src.recommenders import MainRecommender

  from .autonotebook import tqdm as notebook_tqdm
  "LightFM was compiled without OpenMP support. "


In [2]:
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

In [3]:
# param grid to search for better arguments' values for the model
param_grid = {
    'no_components': [10, 20, 30],
    'learning_rate': np.linspace(0.005, 0.03, num=5), 
    'loss': ['bpr', 'warp'], #=, # 
    'item_alpha': np.linspace(0.001, 0.05, num=5), 
    'user_alpha': np.linspace(0.001, 0.05, num=5)
}


In [4]:
# downloading data

data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')
test_data = pd.read_csv('../data/retail_test1.csv')

In [5]:
# same column names through users', items' and transactions' tables
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [6]:
# Важна схема обучения и валидации!
# -- давние покупки --train | -- 6 недель -- val_lvl_1| -- 3 недель --val_lvl_2 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

#data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [7]:
# dataframe with purchase results for each user in validation 1 data - data_val_lvl_1
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [8]:
# dataframe with purchase results for each user in validation 2 data -data_val_lvl_2
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [9]:
# dataframe with purchase results for each user in validation 2 data -data_val_lvl_2
result_test = test_data.groupby('user_id')['item_id'].unique().reset_index()
result_test.columns=['user_id', 'actual']
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


**1. Prefiltering data (train, valid1, valid2, test) checking number of items left for recommendation**

In [10]:
# prefiltering train data
n_items_before = data_train_lvl_1['item_id'].nunique()

prefiltered_train_lvl_1 = prefilter_items(data_train_lvl_1)

n_items_after = prefiltered_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5000


In [11]:
# prefiltering level 1 validation data
n_items_before = data_val_lvl_1['item_id'].nunique()

prefiltered_lvl_1 = prefilter_items(data_val_lvl_1)

n_items_after = prefiltered_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 27649 to 5000


In [12]:
# prefiltering level 2 validation data
n_items_before = data_val_lvl_2['item_id'].nunique()

prefiltered_lvl_2 = prefilter_items(data_val_lvl_2)

n_items_after = prefiltered_lvl_2['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 24329 to 5000


In [13]:
# prefiltering test data
n_items_before = test_data['item_id'].nunique()

prefiltered_test = prefilter_items(test_data)

n_items_after = prefiltered_test['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 20497 to 5000


In [14]:
# checking number of unique users in all data
data['user_id'].nunique()

2499

In [15]:
#checking number of users in test data
test_data['user_id'].nunique()

1885

In [16]:
# number of users in test data new to train data
len([us for us in test_data['user_id'].unique() if us in data['user_id']])

1885

In [17]:
#number of users in prefiltered train data
prefiltered_train_lvl_1['user_id'].values

array([2375, 2375, 2375, ...,  856,  856,  856], dtype=int64)

* *Warm users* - valid data users, that are also in prefiltered train data   
* *Cold users* - valid data users, that are NOT in prefiltered train data

In [18]:
warm_users_1 = [x for x in result_lvl_1['user_id'].values if x in prefiltered_train_lvl_1['user_id'].values]
cold_users_1 = [x for x in result_lvl_1['user_id'].values if x not in prefiltered_train_lvl_1['user_id'].values]

In [19]:
print(f'in validation 1 data number of warm users: {len(warm_users_1)}, of cold users {len(cold_users_1)}')

in validation 1 data number of warm users: 2145, of cold users 9


In [20]:
warm_users_2 = [x for x in result_lvl_2['user_id'].values if x in prefiltered_train_lvl_1['user_id'].values]
cold_users_2 = [x for x in result_lvl_2['user_id'].values if x not in prefiltered_train_lvl_1['user_id'].values]

In [21]:
print(f'in validation 2 data number of warm users: {len(warm_users_2)}, of cold users {len(cold_users_2)}')

in validation 2 data number of warm users: 2036, of cold users 6


In [22]:
warm_users_test = [x for x in result_test['user_id'].values if x in prefiltered_train_lvl_1['user_id'].values]
cold_users_test = [x for x in result_test['user_id'].values if x not in prefiltered_train_lvl_1['user_id'].values]

In [23]:
print(f'in test data number of warm users: {len(warm_users_test)}, of cold users {len(cold_users_test)}')

in test data number of warm users: 1876, of cold users 9


In [24]:
# preparing user_item matrix with number of each purchased item by each user
"""
user_item_matrix = pd.pivot_table(prefiltered_train_lvl_1, index='user_id', columns='item_id', values='quantity',
                                      aggfunc='count', fill_value=0)
user_item_matrix = user_item_matrix.astype(float)
sparse_user_item = csr_matrix(user_item_matrix).tocsr()
"""

"\nuser_item_matrix = pd.pivot_table(prefiltered_train_lvl_1, index='user_id', columns='item_id', values='quantity',\n                                      aggfunc='count', fill_value=0)\nuser_item_matrix = user_item_matrix.astype(float)\nsparse_user_item = csr_matrix(user_item_matrix).tocsr()\n"

In [25]:
#user_item_matrix.head(2)

**checking warm items through dataframes**

In [26]:
# unique items in different dataframes
unique_items_all = data['item_id'].unique()
unique_items_train = data_train_lvl_1['item_id'].unique()
unique_items_prefiltered = prefiltered_train_lvl_1['item_id'].unique()
unique_items_lvl1 = data_val_lvl_1['item_id'].unique()
unique_items_prefil_lvl1 = prefiltered_lvl_1['item_id'].unique()
unique_items_lvl2 = data_val_lvl_2['item_id'].unique()
unique_items_prefil_lvl2 = prefiltered_lvl_2['item_id'].unique()
unique_items_test = test_data['item_id'].unique()
unique_items_prefil_test = prefiltered_test['item_id'].unique()
unique_items_infeatures = item_features['item_id'].unique()

In [27]:
print(f'number of items in: \n \
- all {len(unique_items_all)},\n \
- prefiltered train {len(unique_items_prefiltered)},\n \
- lvl1 {len(unique_items_lvl1)},\n \
- prefiltered lvl1 {len(unique_items_prefil_lvl1)},\n \
- lvl2 {len(unique_items_lvl2)},\n \
- prefiltered lvl2 {len(unique_items_prefil_lvl2)},\n \
- test {len(unique_items_test)},\n \
- prefiltered test {len(unique_items_prefil_test)},\n \
- features data {len(unique_items_infeatures)}')

number of items in: 
 - all 89051,
 - prefiltered train 5000,
 - lvl1 27649,
 - prefiltered lvl1 5000,
 - lvl2 24329,
 - prefiltered lvl2 5000,
 - test 20497,
 - prefiltered test 5000,
 - features data 92353


In [28]:
# number of items in prefiltered validation lvl 1 transactions absent in prefiltered train (cold items for valid lvl 1)
len([item_n for item_n in unique_items_prefil_lvl1 if item_n not in unique_items_prefiltered])

1525

In [29]:
# number of items in prefiltered validation lvl 2 transactions absent in prefiltered train (cold items for valid lvl 2)
len([item_n for item_n in unique_items_prefil_lvl2 if item_n not in unique_items_prefiltered])

1606

In [30]:
# number of items in prefiltered test transactions absent in prefiltered train (cold items for test)
len([item_n for item_n in unique_items_prefil_test if item_n not in unique_items_prefiltered])

1769

**2. Building ALS model to get embeddings**

In [31]:
# als_model on prefiltered train
als_model = MainRecommender(prefiltered_train_lvl_1)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:04<00:00,  3.63it/s]
100%|███████████████████████████████████████████████████████████████████████████| 5000/5000 [00:00<00:00, 14197.30it/s]


In [32]:
user_item_matrix = als_model.user_item
item_factors = als_model.item_factors
user_factors = als_model.user_factors
user_item_matrix

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819255,...,15511891,15596279,15596488,15596515,15778533,15926844,15926886,15927403,15927661,15927850
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2498,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# als_model on prefiltered lvl1
als_model_lvl1 = MainRecommender(prefiltered_lvl_1)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:02<00:00,  5.83it/s]
100%|██████████████████████████████████████████████████████████████████████████| 5000/5000 [00:00<00:00, 147064.33it/s]


In [34]:
user_item_matrix_lvl1 = als_model_lvl1.user_item
item_factors_lvl1 = als_model_lvl1.item_factors
user_factors_lvl1 = als_model_lvl1.user_factors
user_item_matrix_lvl1

item_id,397896,480014,707683,818980,819063,819112,819227,819255,819304,819308,...,17105257,17105540,17106064,17168855,17169131,17178955,17179662,17179814,17208470,17215077
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**3. Prepare user and item features for lightfm**

* on prefiltered train

In [35]:
# prepare user_item matrix on train prefilterted
user_item_lightfm = get_useritem_matrix(user_item_matrix)
user_item_lightfm.shape

(2484, 5000)

In [36]:
# prepare user_features matrix (adding new features, adding embeddings)
user_features_lightfm = get_user_matrix(prefiltered_train_lvl_1, user_features)
user_features_lightfm.shape

(2484, 51)

In [37]:
user_features_lightfm.head(2)

Unnamed: 0_level_0,mean_time,average_basket,baskets_per_week,age_desc_0,age_desc_19-24,age_desc_25-34,age_desc_35-44,age_desc_45-54,age_desc_55-64,age_desc_65+,...,household_size_desc_1,household_size_desc_2,household_size_desc_3,household_size_desc_4,household_size_desc_5+,kid_category_desc_0,kid_category_desc_1,kid_category_desc_2,kid_category_desc_3+,kid_category_desc_None/Unknown
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1400.204956,2.374918,20.452235,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2.0,1772.341064,2.639509,10.744353,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [38]:
# prepare item_features matrix (adding new features, adding embeddings, changing id numbers)
item_features_lightfm = get_item_matrix(prefiltered_train_lvl_1, item_features)
item_features_lightfm.shape

(5000, 29)

In [39]:
item_features_lightfm.head(2)

Unnamed: 0_level_0,manufacturer,retail_disc,quantity_of_sales,quantity_of_sales_per_week,qnt_of_sales_per_dep,qnt_of_sales_per_item_per_dep_per_week,price,department_CHEF SHOPPE,department_COUP/STR & MFG,department_DELI,...,department_NUTRITION,department_PASTRY,department_PRODUCE,department_SALAD BAR,department_SEAFOOD,department_SEAFOOD-PCKGD,department_SPIRITS,department_TRAVEL & LEISUR,brand_National,brand_Private
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
202291,69,-0.5925,4,0.047059,4681,3.059477,0.002279,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
397896,69,-0.355979,97,1.141176,17973,17.620588,0.00238,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


* on prefiltered validation level 1

In [40]:
# prepare user_item matrix on train prefilterted
lvl1_user_item_lightfm = get_useritem_matrix(user_item_matrix_lvl1)
lvl1_user_item_lightfm.shape

(2110, 5000)

In [41]:
# prepare validation user and item features for lightfm

In [42]:
val1_item_features_lightfm = get_item_matrix(prefiltered_lvl_1, item_features)
val1_item_features_lightfm.shape

(5000, 30)

In [43]:
val1_item_features_lightfm.tail(3)

Unnamed: 0_level_0,manufacturer,retail_disc,quantity_of_sales,quantity_of_sales_per_week,qnt_of_sales_per_dep,qnt_of_sales_per_item_per_dep_per_week,price,department_CHEF SHOPPE,department_COUP/STR & MFG,department_DELI,...,department_NUTRITION,department_PASTRY,department_PRODUCE,department_SALAD BAR,department_SEAFOOD,department_SEAFOOD-PCKGD,department_SPIRITS,department_TRAVEL & LEISUR,brand_National,brand_Private
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17179814,103,-0.18,17,2.833333,78224,3.488716,1.11,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17208470,1415,-0.150909,22,3.666667,455,1.763566,3.131515,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17215077,1273,0.0,7,1.166667,78224,3.488716,2.69,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [44]:
val1_user_features_lightfm = get_user_matrix(prefiltered_lvl_1, user_features)
val1_user_features_lightfm.shape

(2110, 51)

In [45]:
val1_user_features_lightfm.head(2)

Unnamed: 0_level_0,mean_time,average_basket,baskets_per_week,age_desc_0,age_desc_19-24,age_desc_25-34,age_desc_35-44,age_desc_45-54,age_desc_55-64,age_desc_65+,...,household_size_desc_1,household_size_desc_2,household_size_desc_3,household_size_desc_4,household_size_desc_5+,kid_category_desc_0,kid_category_desc_1,kid_category_desc_2,kid_category_desc_3+,kid_category_desc_None/Unknown
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1245.216187,2.392703,29.51,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2.0,1930.307739,2.036923,17.653333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


**4. Normalizing data**

* normalizing train

In [46]:
scaler = preprocessing.MinMaxScaler()

In [47]:
def scaling_data(scaler, data_matrix):
    matrix_copy = data_matrix.copy()
    if 'user_id' in matrix_copy.columns.values:
        matrix_copy.set_index('user_id', inplace=True)
    elif 'item_id' in matrix_copy.columns.values:
        matrix_copy.set_index('item_id', inplace=True)
    else:
        pass
    cols = matrix_copy.columns
    ind = matrix_copy.index
    d = scaler.fit_transform(matrix_copy)
    scaled_df = pd.DataFrame(d, index=ind, columns=cols)
    
    return scaled_df

In [48]:
scaled_item_features = scaling_data(scaler, item_features_lightfm)
scaled_item_features

Unnamed: 0_level_0,manufacturer,retail_disc,quantity_of_sales,quantity_of_sales_per_week,qnt_of_sales_per_dep,qnt_of_sales_per_item_per_dep_per_week,price,department_CHEF SHOPPE,department_COUP/STR & MFG,department_DELI,...,department_NUTRITION,department_PASTRY,department_PRODUCE,department_SALAD BAR,department_SEAFOOD,department_SEAFOOD-PCKGD,department_SPIRITS,department_TRAVEL & LEISUR,brand_National,brand_Private
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
202291,0.010586,0.970631,0.000123,0.000123,0.005234,0.139145,0.000114,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
397896,0.010586,0.982355,0.003948,0.003948,0.020291,1.000000,0.000119,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
420647,0.010586,0.974158,0.000576,0.000576,0.005234,0.139145,0.000137,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
480014,0.010586,0.973417,0.001069,0.001069,0.020291,1.000000,0.000116,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
545926,0.010586,0.970259,0.000000,0.000000,0.005234,0.139145,0.000145,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15926844,0.085638,0.986251,0.004647,0.004647,1.000000,0.126692,0.111184,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15926886,0.085638,0.986546,0.005428,0.005428,1.000000,0.126692,0.111752,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15927403,0.197346,0.992038,0.003948,0.003948,1.000000,0.126692,0.029521,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15927661,0.807237,0.999804,0.002138,0.002138,1.000000,0.126692,0.012443,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [49]:
scaled_user_features = scaling_data(scaler, user_features_lightfm)

* normalizing valid level 1

In [50]:
scaled_item_features_lvl1 = scaling_data(scaler, val1_item_features_lightfm)

In [51]:
scaled_user_features_lvl1 = scaling_data(scaler, val1_user_features_lightfm)

### 4. Building lightfm model on prefiltered train

In [52]:
#model without normalizing
model = LightFM(no_components=10,
                loss='bpr',
                learning_rate=0.005, 
                item_alpha=0.001,
                user_alpha=0.001, 
                random_state=42)

model.fit(coo_matrix(user_item_lightfm),
          user_features=csr_matrix(user_features_lightfm.values).tocsr(),
          item_features=csr_matrix(item_features_lightfm.values).tocsr(),

          epochs=10, 
          num_threads=1)


<lightfm.lightfm.LightFM at 0x157122f4dc8>

In [53]:
train_precision = precision_at_k(model, coo_matrix(user_item_lightfm),
                                 user_features=csr_matrix(user_features_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_features_lightfm.values).tocsr(),
                                 k=5).mean()

train_precision


0.38405797

In [54]:
model_scaled = LightFM(no_components=10,
                loss='warp',
                learning_rate=0.005, 
                item_alpha=0.001,
                user_alpha=0.001, 
                random_state=42)

model_scaled.fit(coo_matrix(user_item_lightfm),
          user_features=csr_matrix(scaled_user_features.values).tocsr(),
          item_features=csr_matrix(scaled_item_features.values).tocsr(),

          epochs=10, 
          num_threads=1) 

<lightfm.lightfm.LightFM at 0x157122f9988>

In [55]:
train_precision = precision_at_k(model_scaled, coo_matrix(user_item_lightfm),
                                 user_features=csr_matrix(scaled_user_features.values).tocsr(),
                                 item_features=csr_matrix(scaled_item_features.values).tocsr(),
                                 k=5).mean()

train_precision

0.5502416

In [56]:
"""%%time
gbm = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='top_k_accuracy')
gbm.fit(coo_matrix(user_item_matrix),user_features=csr_matrix(scaled_user_features.values).tocsr(),
                                 item_features=csr_matrix(scaled_item_features.values).tocsr())

print('Best parameters found by grid search are:', gbm.best_params_)


Best parameters found by grid search are: {'item_alpha': 0.001, 'learning_rate': 0.005, 'loss': 'bpr', 'no_components': 10, 'user_alpha': 0.001}
Wall time: 9h 35min 7s
"""

"%%time\ngbm = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='top_k_accuracy')\ngbm.fit(coo_matrix(user_item_matrix),user_features=csr_matrix(scaled_user_features.values).tocsr(),\n                                 item_features=csr_matrix(scaled_item_features.values).tocsr())\n\nprint('Best parameters found by grid search are:', gbm.best_params_)\n\n\nBest parameters found by grid search are: {'item_alpha': 0.001, 'learning_rate': 0.005, 'loss': 'bpr', 'no_components': 10, 'user_alpha': 0.001}\nWall time: 9h 35min 7s\n"

* Lightfm predictions on train

In [57]:
train_lightfm_preds = []
for train_user in user_features_lightfm.index.values.astype(int):
    predictions = model_scaled.predict(user_ids=int(als_model.userid_to_id[train_user]), item_ids=np.fromiter(als_model.itemid_to_id.values(), dtype=float),
                            user_features=csr_matrix(scaled_user_features.values).tocsr(),
                            item_features=csr_matrix(scaled_item_features.values).tocsr(),
                            num_threads=1)
    train_lightfm_preds.append(predictions)

In [58]:
# forming dataframe of predictions

train_preds = pd.DataFrame(train_lightfm_preds, index=als_model.user_item.index, columns=als_model.user_item.columns)

In [59]:
train_preds

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819255,...,15511891,15596279,15596488,15596515,15778533,15926844,15926886,15927403,15927661,15927850
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-77.677849,-78.148010,-77.658058,-78.276772,-77.683365,-77.684517,-78.319786,-77.415451,-77.155762,-76.737144,...,-77.178032,-77.210640,-77.313232,-77.320045,-77.875481,-77.402351,-77.366928,-77.441284,-77.527283,-77.464050
2,-127.155533,-127.751015,-127.134331,-127.889984,-127.161491,-127.163101,-127.936668,-126.806999,-126.524948,-125.987206,...,-126.541603,-126.578224,-126.680016,-126.680153,-127.304367,-126.794495,-126.756226,-126.832458,-126.994995,-126.857460
3,-126.965981,-127.562668,-126.944702,-127.702141,-126.971970,-126.973595,-127.749008,-126.615967,-126.332817,-125.793732,...,-126.349266,-126.386307,-126.488472,-126.488609,-127.115913,-126.603394,-126.564987,-126.641518,-126.804626,-126.666595
4,-124.520317,-125.101860,-124.499672,-125.237083,-124.526115,-124.527679,-125.282532,-124.179008,-123.904480,-123.380280,...,-123.921448,-123.956322,-124.055351,-124.055473,-124.665115,-124.166801,-124.129555,-124.203766,-124.362450,-124.228096
5,-127.432053,-128.028961,-127.410820,-128.168152,-127.438026,-127.439651,-128.214905,-127.082260,-126.799767,-126.260376,...,-126.817673,-126.853149,-126.955139,-126.955292,-127.579147,-127.069733,-127.031418,-127.107758,-127.270393,-127.132797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,-68.508858,-68.990349,-68.489830,-69.114334,-68.514175,-68.515350,-69.155815,-68.220505,-67.970497,-67.542305,...,-68.020103,-68.021713,-68.119194,-68.124359,-68.652893,-68.208946,-68.174843,-68.244736,-68.343582,-68.266747
2497,-77.424652,-77.999313,-77.402672,-78.143135,-77.430809,-77.432388,-78.191391,-77.190079,-76.898369,-76.424950,...,-76.797333,-76.955086,-77.062523,-77.064568,-77.765091,-77.176353,-77.136749,-77.217056,-77.368599,-77.242813
2498,-83.149414,-83.660889,-83.129112,-83.793221,-83.155075,-83.156357,-83.837517,-82.885139,-82.617828,-82.185150,...,-82.622986,-82.672752,-82.775864,-82.780914,-83.365173,-82.872192,-82.835762,-82.911079,-83.015953,-82.934601
2499,-65.873451,-66.468498,-65.852211,-66.607590,-65.879417,-65.881012,-66.654327,-65.547615,-65.265923,-64.772469,...,-65.239792,-65.319321,-65.422188,-65.422523,-66.089378,-65.535950,-65.497635,-65.573097,-65.730118,-65.598129


In [60]:
# sorting out the highest ranking 50 items on each user
df_best50 = []
for x in train_preds.index:
    df_best50.append([(train_preds.loc[x].sort_values(ascending=False).index.values[:50])])
    

In [61]:
# forming dataframe with predicions
df_best50 = pd.DataFrame(df_best50, index=als_model.user_item.index, columns=['rec_50'])

* validation of train model on valid level 1 data

In [62]:
# dataframe with two columns:
# - actual results on valid level 1 users and
# - predicted results for warm users with NaN for cold users

r_1 = result_lvl_1.join(df_best50, on=['user_id'], how='left')

In [63]:
r_1.loc[r_1['user_id'].isin(cold_users_1),
      ['rec_50']]

Unnamed: 0,rec_50
49,
525,
553,
581,
625,
819,
1715,
1718,
2036,


In [64]:
r_1.loc[~r_1['user_id'].isin(cold_users_1),
      ['rec_50']]

Unnamed: 0,rec_50
0,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
1,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
3,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
4,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
...,...
2149,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2150,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2151,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2152,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."


In [65]:
# recommendations for cold users added
r_1.loc[r_1['user_id'].isin(cold_users_1),
      ['rec_50']] = r_1['user_id'].apply(lambda x:cold_user_recommend(prefiltered_train_lvl_1, n=50))

In [66]:
r_1

Unnamed: 0,user_id,actual,rec_50
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1082185, 6534178, 1029743, 995242, 1106523, 9..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[1082185, 6534178, 1029743, 995242, 1106523, 9..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[1082185, 6534178, 1029743, 995242, 1106523, 9..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[1082185, 6534178, 1029743, 995242, 1106523, 9..."
...,...,...,...
2149,2496,"[831509, 867188, 1013623, 1048851, 5592734, 16...","[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2150,2497,"[820291, 824759, 838797, 859010, 859075, 86077...","[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2151,2498,"[865511, 962991, 1076374, 1102358, 5564901, 15...","[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2152,2499,"[861282, 921744, 1050968, 13842089, 828837, 86...","[1082185, 6534178, 1029743, 995242, 1106523, 9..."


In [67]:
# precision at 5 for predicions for valid users level 1

pres_users = []
for x_user in r_1['user_id']:
    user_pres = prec_at_k(r_1.loc[r_1['user_id']== x_user, 'rec_50'].values[0],
                               r_1.loc[r_1['user_id']== x_user,'actual'].values[0],
                               k=5)
    pres_users.append(user_pres)
    
np.mean(pres_users)

0.24094707520891365

* validation of train model on valid level 2 data

In [68]:
# dataframe with two columns:
# - actual results on valid level 2 users (the latest transactions) and
# - predicted results for warm users with NaN for cold users

r_2 = result_lvl_2.join(df_best50, on=['user_id'], how='left')

In [69]:
r_2.loc[r_2['user_id'].isin(cold_users_2),
      ['rec_50']]

Unnamed: 0,rec_50
511,
581,
770,
1622,
1625,
1934,


In [70]:
r_2.loc[~r_2['user_id'].isin(cold_users_2),
      ['rec_50']]

Unnamed: 0,rec_50
0,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
1,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
3,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
4,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
...,...
2037,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2038,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2039,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2040,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."


In [71]:
# adding recommendations for cold users (most popular 50 items from prefiltered data)
r_2.loc[r_2['user_id'].isin(cold_users_2),
      ['rec_50']
     ] = r_2['user_id'].apply(lambda x:cold_user_recommend(prefiltered_train_lvl_1, n=50))

In [72]:
r_2.loc[r_2['user_id'].isin(cold_users_2),
      ['rec_50']]

Unnamed: 0,rec_50
511,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
581,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
770,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
1622,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
1625,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
1934,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."


In [73]:
r_2.loc[~r_2['user_id'].isin(cold_users_2),
      ['rec_50']]

Unnamed: 0,rec_50
0,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
1,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
3,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
4,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
...,...
2037,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2038,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2039,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2040,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."


In [74]:
# precision at 5 for predicions for valid users level 2

pres_users = []
for x_user in r_2['user_id']:
    user_pres = prec_at_k(r_2.loc[r_2['user_id']== x_user, 'rec_50'].values[0],
                               r_2.loc[r_2['user_id']== x_user,'actual'].values[0],
                               k=5)
    pres_users.append(user_pres)
    
np.mean(pres_users)

0.20558276199804118

### 4. Building lightfm model on prefiltered valid level 1

In [75]:
model_scaled_lvl1 = LightFM(no_components=10,
                loss='warp',
                learning_rate=0.005, 
                item_alpha=0.001,
                user_alpha=0.001, 
                random_state=42)

model_scaled_lvl1.fit(coo_matrix(lvl1_user_item_lightfm),
          user_features=csr_matrix(scaled_user_features_lvl1.values).tocsr(),
          item_features=csr_matrix(scaled_item_features_lvl1.values).tocsr(),

          epochs=10, 
          num_threads=1) 

<lightfm.lightfm.LightFM at 0x1578af0c408>

In [76]:
train_precision_lvl1 = precision_at_k(model_scaled_lvl1, coo_matrix(lvl1_user_item_lightfm),
                                 user_features=csr_matrix(scaled_user_features_lvl1.values).tocsr(),
                                 item_features=csr_matrix(scaled_item_features_lvl1.values).tocsr(),
                                 k=5).mean()

train_precision_lvl1

0.24597156

* Lightfm predictions on valid lvl 1

In [77]:
lvl1_lightfm_preds = []
for lvl1_user in val1_user_features_lightfm.index.values.astype(int):
    predictions_lvl1 = model_scaled_lvl1.predict(user_ids=int(als_model_lvl1.userid_to_id[lvl1_user]), item_ids=np.fromiter(als_model_lvl1.itemid_to_id.values(), dtype=float),
                            user_features=csr_matrix(scaled_user_features_lvl1.values).tocsr(),
                            item_features=csr_matrix(scaled_item_features_lvl1.values).tocsr(),
                            num_threads=1)
    lvl1_lightfm_preds.append(predictions_lvl1)

In [78]:
# forming dataframe of predictions

lvl1_preds = pd.DataFrame(lvl1_lightfm_preds, index=als_model_lvl1.user_item.index, columns=als_model_lvl1.user_item.columns)

In [79]:
lvl1_preds

item_id,397896,480014,707683,818980,819063,819112,819227,819255,819304,819308,...,17105257,17105540,17106064,17168855,17169131,17178955,17179662,17179814,17208470,17215077
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-32.790184,-32.898685,-32.066875,-32.125462,-32.123386,-32.125957,-32.115536,-31.552748,-31.908970,-31.579220,...,-31.472574,-32.654217,-32.463856,-31.616217,-32.067333,-32.129295,-32.148426,-31.929722,-32.534904,-32.186310
2,-49.173267,-49.294800,-48.305122,-48.408161,-48.402927,-48.409306,-48.396587,-47.756660,-48.162991,-47.884701,...,-47.618038,-48.969959,-48.728878,-47.830494,-48.339367,-48.412304,-48.433681,-48.181545,-48.891792,-48.477772
4,-48.209171,-48.327763,-47.363396,-47.463352,-47.458168,-47.464455,-47.452049,-46.829475,-47.224064,-46.956390,...,-46.695480,-48.015106,-47.779385,-46.899567,-47.396156,-47.467373,-47.488216,-47.242100,-47.938221,-47.531242
6,-48.023350,-48.141529,-47.180901,-47.279263,-47.274113,-47.280369,-47.268005,-46.648125,-47.040813,-46.774956,...,-46.515099,-47.830067,-47.595207,-46.717464,-47.212322,-47.283279,-47.304054,-47.058796,-47.753452,-47.346928
7,-35.908215,-36.024929,-35.107498,-35.184002,-35.181694,-35.184628,-35.173283,-34.562939,-34.951099,-34.607994,...,-34.466000,-35.737961,-35.532169,-34.636002,-35.121452,-35.188229,-35.208797,-34.973202,-35.615650,-35.249641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,-29.460079,-29.566860,-28.705446,-28.796751,-28.794455,-28.797243,-28.786976,-28.234371,-28.583614,-28.290318,...,-28.147738,-29.311354,-29.122005,-28.295650,-28.739882,-28.800667,-28.819380,-28.603827,-29.216408,-28.856594
2497,-33.248478,-33.368408,-32.373577,-32.557621,-32.552208,-32.558220,-32.546455,-31.918575,-32.315281,-32.012196,...,-31.811880,-33.145641,-32.908417,-31.987673,-32.489113,-32.560890,-32.581985,-32.334084,-33.040810,-32.625137
2498,-35.986542,-36.104343,-35.178513,-35.239799,-35.237499,-35.240444,-35.228977,-34.629742,-35.004810,-34.677006,...,-34.526031,-35.809223,-35.601658,-34.686947,-35.176964,-35.244186,-35.264904,-35.027157,-35.694366,-35.306084
2499,-26.985180,-27.092291,-26.183945,-26.330833,-26.326881,-26.331348,-26.320938,-25.777697,-26.115482,-25.879839,...,-25.691616,-26.872940,-26.668909,-25.824669,-26.271769,-26.334318,-26.353027,-26.133625,-26.793978,-26.390963


In [80]:
# sorting out the highest ranking 50 items on each user
lvl1_best50 = []
for x in lvl1_preds.index:
    lvl1_best50.append([(lvl1_preds.loc[x].sort_values(ascending=False).index.values[:50])])
    

In [81]:
np.shape(lvl1_best50)

(2110, 1, 50)

In [82]:
# forming dataframe with predicions
df_lvl1_best50 = pd.DataFrame(lvl1_best50, index=als_model_lvl1.user_item.index, columns=['rec_50'])

* validation of lvl1 model on valid level 2 data

In [83]:
warm_users_3 = [x for x in result_lvl_2['user_id'].values if x in prefiltered_lvl_1['user_id'].values]
cold_users_3 = [x for x in result_lvl_2['user_id'].values if x not in prefiltered_lvl_1['user_id'].values]

In [84]:
print(f'in validation 2 data number of warm users: {len(warm_users_3)}, of cold users {len(cold_users_3)}')

in validation 2 data number of warm users: 1883, of cold users 159


In [85]:
# dataframe with two columns:
# - actual results on valid level 1 users and
# - predicted results for warm users with NaN for cold users

r_3 = result_lvl_2.join(df_lvl1_best50, on=['user_id'], how='left')

In [86]:
r_3.loc[r_3['user_id']==3, :]

Unnamed: 0,user_id,actual,rec_50
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...",


In [87]:
r_3.loc[r_3['user_id'].isin(cold_users_3),
      ['rec_50']]

Unnamed: 0,rec_50
1,
37,
46,
47,
56,
...,...
1962,
1966,
2011,
2030,


In [88]:
r_3.loc[~r_3['user_id'].isin(cold_users_3),
      ['rec_50']]

Unnamed: 0,rec_50
0,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
3,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
4,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
5,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
...,...
2037,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2038,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2039,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."
2040,"[1082185, 6534178, 1029743, 995242, 1106523, 9..."


In [89]:
# recommendations for cold users added (with popular items from valid lvl1)
r_3.loc[r_3['user_id'].isin(cold_users_3),
      ['rec_50']] = r_3['user_id'].apply(lambda x:cold_user_recommend(prefiltered_lvl_1, n=50))

In [90]:
r_3.loc[r_3['user_id']==62, :]

Unnamed: 0,user_id,actual,rec_50


In [91]:
# precision at 5 for predicions for valid users level 1

pres_users_3 = []
for x_user in r_3['user_id']:
    user_pres_3 = prec_at_k(r_3.loc[r_3['user_id']== x_user, 'rec_50'].values[0],
                               r_3.loc[r_3['user_id']== x_user,'actual'].values[0],
                               k=5)
    pres_users_3.append(user_pres_3)
    
np.mean(pres_users_3)

0.20558276199804118

* **Exploring data for prediction similarity reasons**

In [93]:
prefiltered_train_lvl_1.tail(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2109566,856,40630539494,593,961747,1,1.5,372,-0.99,1831,85,0.0,0.0
2109569,856,40630539494,593,1120213,1,1.67,372,0.0,1831,85,0.0,0.0


In [94]:
prefiltered_train_copy = prefiltered_train_lvl_1.copy()
item_features_copy = item_features.copy()

In [95]:
items_departments = pd.DataFrame(item_features_copy['department'].values, index=item_features_copy['item_id'], columns=['department'])
data_department = prefiltered_train_copy.merge(items_departments, on='item_id', how="left")

In [96]:
count_depts = data_department.groupby('department')['item_id'].count().reset_index().sort_values('item_id', ascending=False)
count_depts

Unnamed: 0,department,item_id
6,GROCERY,882886
14,PRODUCE,187968
9,MEAT-PCKGD,68286
3,DRUG GM,63849
8,MEAT,55043
2,DELI,31737
7,KIOSK-GAS,17973
13,PASTRY,14818
15,SALAD BAR,7233
10,MISC SALES TRAN,4681


In [97]:
popular_train = prefiltered_train_copy.groupby('item_id')['quantity'].count().reset_index()

In [98]:
popular_train.merge(items_departments, on='item_id', how="left").sort_values('quantity', ascending=False)[:50]

Unnamed: 0,item_id,quantity,department
3414,1082185,24318,PRODUCE
4344,6534178,16233,KIOSK-GAS
2763,1029743,11661,GROCERY
2313,995242,10226,GROCERY
3694,1106523,8011,GROCERY
2153,981760,7505,GROCERY
4019,1133018,6029,GROCERY
889,883404,5513,GROCERY
1797,951590,5117,GROCERY
3954,1127831,5069,PRODUCE


In [99]:
item_features_copy.loc[item_features_copy['department'] == 'GROCERY', ['department', 'sub_commodity_desc', 'commodity_desc']]

Unnamed: 0,department,sub_commodity_desc,commodity_desc
0,GROCERY,ICE - CRUSHED/CUBED,FRZN ICE
3,GROCERY,APPLE SAUCE,FRUIT - SHELF STABLE
4,GROCERY,SPECIALTY COOKIES,COOKIES/CONES
5,GROCERY,SPICES & SEASONINGS,SPICES & EXTRACTS
6,GROCERY,TRAY PACK/CHOC CHIP COOKIES,COOKIES/CONES
...,...,...,...
92342,GROCERY,BAGGED CHEESE SNACKS,BAG SNACKS
92343,GROCERY,BAGGED CHEESE SNACKS,BAG SNACKS
92344,GROCERY,DAIRY CASE CITRUS PNCH/OJ SUBS,REFRGRATD JUICES/DRNKS
92347,GROCERY,PAPER TOWELS & HOLDERS,PAPER TOWELS


In [100]:
item_features_copy.loc[item_features_copy['department'] == 'PRODUCE', ['department', 'sub_commodity_desc', 'commodity_desc']]

Unnamed: 0,department,sub_commodity_desc,commodity_desc
70,PRODUCE,POTATOES RUSSET (BULK&BAG),POTATOES
76,PRODUCE,POTATOES RUSSET (BULK&BAG),POTATOES
104,PRODUCE,POPCORN - MICROWAVE,POPCORN
204,PRODUCE,ORGANIC CITRUS,ORGANICS FRUIT & VEGETABLES
277,PRODUCE,PACKAGED MIX,PROCESSED
...,...,...,...
92311,PRODUCE,ORGANIC HERBS,ORGANICS FRUIT & VEGETABLES
92312,PRODUCE,ORGANIC HERBS,ORGANICS FRUIT & VEGETABLES
92317,PRODUCE,ORGANIC HERBS,ORGANICS FRUIT & VEGETABLES
92325,PRODUCE,APPLES OTHER (BULK&BAG),APPLES


### 5. Working on test data

In [101]:
# prefiltering test data
n_items_before = test_data['item_id'].nunique()

prefiltered_test = prefilter_items(test_data)

n_items_after = prefiltered_test['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 20497 to 5000


In [102]:
prefiltered_test_copy = prefiltered_test.copy()

In [103]:
test_data_department = prefiltered_test_copy.merge(items_departments, on='item_id', how="left")
count_test_depts = test_data_department.groupby('department')['item_id'].count().reset_index().sort_values('item_id', ascending=False)
count_test_depts

Unnamed: 0,department,item_id
6,GROCERY,41605
14,PRODUCE,7501
3,DRUG GM,3362
9,MEAT-PCKGD,2900
8,MEAT,2393
2,DELI,1358
13,PASTRY,769
7,KIOSK-GAS,762
12,NUTRITION,476
16,SALAD BAR,278


In [104]:
# als_model on prefiltered test
als_model_test = MainRecommender(prefiltered_test)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:02<00:00,  5.87it/s]
100%|██████████████████████████████████████████████████████████████████████████| 5000/5000 [00:00<00:00, 138894.36it/s]


In [105]:
user_item_matrix_test = als_model_test.user_item
np.shape(user_item_matrix_test)

(1844, 5000)

In [106]:
# prepare user_item matrix on test prefilterted
test_user_item_lightfm = get_useritem_matrix(user_item_matrix_test)
test_user_item_lightfm.shape

(1844, 5000)

In [107]:
# prepare test item and user features for lightfm

In [108]:
test_item_features_lightfm = get_item_matrix(prefiltered_test, item_features)
test_item_features_lightfm.shape

(5000, 29)

In [109]:
test_item_features_lightfm.tail(3)

Unnamed: 0_level_0,manufacturer,retail_disc,quantity_of_sales,quantity_of_sales_per_week,qnt_of_sales_per_dep,qnt_of_sales_per_item_per_dep_per_week,price,department_CHEF SHOPPE,department_COUP/STR & MFG,department_DELI,...,department_NUTRITION,department_PASTRY,department_PRODUCE,department_RESTAURANT,department_SALAD BAR,department_SEAFOOD,department_SEAFOOD-PCKGD,department_TRAVEL & LEISUR,brand_National,brand_Private
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18106286,69,0.0,5,1.666667,41605,3.805799,1.138,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
18119004,69,-0.31,2,0.666667,41605,3.805799,0.6725,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
18119016,69,-0.016154,13,4.333333,41605,3.805799,2.081538,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [110]:
test_user_features_lightfm = get_user_matrix(prefiltered_test, user_features)
test_user_features_lightfm.shape

(1844, 51)

In [111]:
test_user_features_lightfm.head(2)

Unnamed: 0_level_0,mean_time,average_basket,baskets_per_week,age_desc_0,age_desc_19-24,age_desc_25-34,age_desc_35-44,age_desc_45-54,age_desc_55-64,age_desc_65+,...,household_size_desc_1,household_size_desc_2,household_size_desc_3,household_size_desc_4,household_size_desc_5+,kid_category_desc_0,kid_category_desc_1,kid_category_desc_2,kid_category_desc_3+,kid_category_desc_None/Unknown
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1640.0,2.2172,18.476667,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2.0,1734.0,1.81325,24.176667,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [112]:
# normalizing test data
scaled_item_features_test = scaling_data(scaler, test_item_features_lightfm)
scaled_user_features_test = scaling_data(scaler, test_user_features_lightfm)

In [113]:
# building the model
model_scaled_test = LightFM(no_components=10,
                loss='warp',
                learning_rate=0.005, 
                item_alpha=0.001,
                user_alpha=0.001, 
                random_state=42)

model_scaled_test.fit(coo_matrix(test_user_item_lightfm),
          user_features=csr_matrix(scaled_user_features_test.values).tocsr(),
          item_features=csr_matrix(scaled_item_features_test.values).tocsr(),

          epochs=10, 
          num_threads=1) 

<lightfm.lightfm.LightFM at 0x15712c91b08>

In [114]:
test_precision = precision_at_k(model_scaled_test, coo_matrix(test_user_item_lightfm),
                                 user_features=csr_matrix(scaled_user_features_test.values).tocsr(),
                                 item_features=csr_matrix(scaled_item_features_test.values).tocsr(),
                                 k=5).mean()

test_precision

0.20227766

* Lightfm predictions on test

In [115]:
test_lightfm_preds = []
for test_user in test_user_features_lightfm.index.values.astype(int):
    predictions_test = model_scaled_test.predict(user_ids=int(als_model_test.userid_to_id[test_user]), item_ids=np.fromiter(als_model_test.itemid_to_id.values(), dtype=float),
                            user_features=csr_matrix(scaled_user_features_test.values).tocsr(),
                            item_features=csr_matrix(scaled_item_features_test.values).tocsr(),
                            num_threads=1)
    test_lightfm_preds.append(predictions_test)

In [116]:
# forming dataframe of predictions

test_preds = pd.DataFrame(test_lightfm_preds, index=als_model_test.user_item.index, columns=als_model_test.user_item.columns)

In [117]:
test_preds

item_id,397896,480014,566870,819063,819112,819304,819308,819330,819423,819765,...,18006036,18006037,18022247,18022252,18055329,18056451,18105264,18106286,18119004,18119016
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-26.023607,-26.054928,-25.682451,-25.089350,-25.532166,-25.463364,-25.180943,-25.429300,-26.019756,-25.010973,...,-25.567366,-25.539425,-25.511429,-25.470671,-25.514147,-26.120928,-25.412657,-25.489180,-25.566029,-25.275402
2,-37.578758,-37.611961,-37.114826,-36.473660,-36.956539,-36.876801,-36.650558,-36.814083,-37.488319,-36.364059,...,-36.999344,-36.969875,-36.901997,-36.858593,-36.904968,-37.675571,-37.023941,-36.878250,-36.959930,-36.649105
3,-37.180122,-37.212894,-36.720631,-36.088081,-36.564487,-36.485813,-36.263439,-36.423584,-37.090599,-35.979576,...,-36.606777,-36.577705,-36.510292,-36.467487,-36.513229,-37.274071,-36.631771,-36.486870,-36.567471,-36.260803
6,-36.751339,-36.783665,-36.295868,-35.671787,-36.141846,-36.064186,-35.846024,-36.002487,-36.663044,-35.564453,...,-36.183628,-36.154949,-36.088039,-36.045799,-36.090935,-36.843758,-36.210201,-36.064934,-36.144428,-35.841911
7,-28.311577,-28.344955,-27.976286,-27.343603,-27.817444,-27.742502,-27.449100,-27.679626,-28.305454,-27.233349,...,-27.857086,-27.827356,-27.766651,-27.723476,-27.769590,-28.453297,-27.718485,-27.743050,-27.824741,-27.515558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,-23.534189,-23.563864,-23.190981,-22.623062,-23.049410,-22.980753,-22.750711,-22.903431,-23.510935,-22.503740,...,-23.085419,-23.059048,-22.981432,-22.942844,-22.984070,-23.681639,-23.032118,-22.960323,-23.033140,-22.756720
2497,-26.594830,-26.629084,-26.323402,-25.667645,-26.160046,-26.080082,-25.781332,-25.945044,-26.612896,-25.484230,...,-26.203609,-26.173115,-26.035423,-25.990685,-26.038435,-26.877182,-26.105238,-26.010986,-26.095190,-25.775791
2498,-26.714685,-26.746626,-26.405857,-25.798162,-26.253529,-26.181099,-25.896816,-26.110584,-26.730064,-25.682388,...,-26.291710,-26.263275,-26.194237,-26.152777,-26.197054,-26.875645,-26.164738,-26.171572,-26.249876,-25.953238
2499,-22.326986,-22.357538,-22.000742,-21.414114,-21.854822,-21.783091,-21.545395,-21.682375,-22.314816,-21.270271,...,-21.893354,-21.866186,-21.763151,-21.723204,-21.765854,-22.513472,-21.851519,-21.741318,-21.816475,-21.531101


In [118]:
# sorting out the highest ranking 50 items on each user
test_best5 = []
for x in test_preds.index:
    test_best5.append([(test_preds.loc[x].sort_values(ascending=False).index.values[:5])])
    

In [119]:
# forming dataframe with predicions
test_best5 = pd.DataFrame(test_best5, index=als_model_test.user_item.index, columns=['rec_5'])

In [120]:
test_best5

Unnamed: 0_level_0,rec_5
user_id,Unnamed: 1_level_1
1,"[1082185, 6534178, 995242, 1029743, 1133018]"
2,"[1082185, 6534178, 995242, 1029743, 1133018]"
3,"[1082185, 6534178, 995242, 1029743, 1133018]"
6,"[1082185, 6534178, 995242, 1029743, 1133018]"
7,"[1082185, 6534178, 995242, 1029743, 1133018]"
...,...
2496,"[1082185, 6534178, 995242, 1029743, 1133018]"
2497,"[1082185, 6534178, 995242, 1029743, 1133018]"
2498,"[1082185, 6534178, 995242, 1029743, 1133018]"
2499,"[1082185, 6534178, 995242, 1029743, 1133018]"


In [121]:
test_best5.to_csv('recommendations.csv')