In [1]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#%matplotlib inline

# Для работы с матрицами

from scipy.sparse import csr_matrix, coo_matrix
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Матричная факторизация
from implicit import als
#from implicit.nearest_neighbours import ItemItemRecommender

# Модель второго уровня
from lightgbm import LGBMClassifier
from lightfm import LightFM

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# evaluation metrics
from lightfm.evaluation import precision_at_k, recall_at_k
from src.metrics import prec_at_k, rec_at_k

# Написанные нами функции

from src.utils import prefilter_items, get_user_matrix, get_item_matrix, get_useritem_matrix, cold_user_recommend
from src.recommenders import MainRecommender
#from implicit.als import AlternatingLeastSquares

  from .autonotebook import tqdm as notebook_tqdm
  "LightFM was compiled without OpenMP support. "


In [2]:
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

In [3]:
# param grid to search for better arguments' values for the model
param_grid = {
    'no_components': [10, 20, 30],
    'learning_rate': np.linspace(0.005, 0.03, num=5), 
    'loss': ['bpr', 'warp'], #=, # 
    'item_alpha': np.linspace(0.001, 0.05, num=5), 
    'user_alpha': np.linspace(0.001, 0.05, num=5)
}


In [4]:
# downloading data

data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')
test_data = pd.read_csv('../data/retail_test1.csv')

In [5]:
# same column names through users', items' and transactions' tables
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [6]:
# Важна схема обучения и валидации!
# -- давние покупки --train | -- 6 недель -- val_lvl_1| -- 3 недель --val_lvl_2 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

#data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [7]:
# dataframe with purchase results for each user in validation 1 data - data_val_lvl_1
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [8]:
# dataframe with purchase results for each user in validation 2 data -data_val_lvl_2
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [9]:
# dataframe with purchase results for each user in validation 2 data -data_val_lvl_2
result_test = test_data.groupby('user_id')['item_id'].unique().reset_index()
result_test.columns=['user_id', 'actual']
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


**1. Prefiltering data (train, valid1, valid2, test) checking number of items left for recommendation**

In [10]:
# prefiltering train data
n_items_before = data_train_lvl_1['item_id'].nunique()

prefiltered_train_lvl_1 = prefilter_items(data_train_lvl_1)

n_items_after = prefiltered_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5000


In [11]:
# prefiltering level 1 validation data
n_items_before = data_val_lvl_1['item_id'].nunique()

prefiltered_lvl_1 = prefilter_items(data_val_lvl_1)

n_items_after = prefiltered_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 27649 to 5000


In [12]:
# prefiltering level 2 validation data
n_items_before = data_val_lvl_2['item_id'].nunique()

prefiltered_lvl_2 = prefilter_items(data_val_lvl_2)

n_items_after = prefiltered_lvl_2['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 24329 to 5000


In [13]:
# prefiltering test data
n_items_before = test_data['item_id'].nunique()

prefiltered_test = prefilter_items(test_data)

n_items_after = prefiltered_test['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 20497 to 5000


In [14]:
# checking number of unique users in all data
data['user_id'].nunique()

2499

In [15]:
#checking number of users in test data
test_data['user_id'].nunique()

1885

In [16]:
# number of users in test data new to train data
len([us for us in test_data['user_id'].unique() if us in data['user_id']])

1885

In [17]:
#number of users in prefiltered train data
prefiltered_train_lvl_1['user_id'].values

array([2375, 2375, 2375, ...,  856,  856,  856], dtype=int64)

* *Warm users* - valid data users, that are also in prefiltered train data   
* *Cold users* - valid data users, that are NOT in prefiltered train data

In [18]:
warm_users_1 = [x for x in result_lvl_1['user_id'].values if x in prefiltered_train_lvl_1['user_id'].values]
cold_users_1 = [x for x in result_lvl_1['user_id'].values if x not in prefiltered_train_lvl_1['user_id'].values]

In [19]:
print(f'in validation 1 data number of warm users: {len(warm_users_1)}, of cold users {len(cold_users_1)}')

in validation 1 data number of warm users: 2145, of cold users 9


In [20]:
warm_users_2 = [x for x in result_lvl_2['user_id'].values if x in prefiltered_train_lvl_1['user_id'].values]
cold_users_2 = [x for x in result_lvl_2['user_id'].values if x not in prefiltered_train_lvl_1['user_id'].values]

In [21]:
print(f'in validation 2 data number of warm users: {len(warm_users_2)}, of cold users {len(cold_users_2)}')

in validation 2 data number of warm users: 2036, of cold users 6


In [22]:
warm_users_test = [x for x in result_test['user_id'].values if x in prefiltered_train_lvl_1['user_id'].values]
cold_users_test = [x for x in result_test['user_id'].values if x not in prefiltered_train_lvl_1['user_id'].values]

In [23]:
print(f'in test data number of warm users: {len(warm_users_test)}, of cold users {len(cold_users_test)}')

in test data number of warm users: 1876, of cold users 9


In [24]:
# preparing user_item matrix with number of each purchased item by each user
"""
user_item_matrix = pd.pivot_table(prefiltered_train_lvl_1, index='user_id', columns='item_id', values='quantity',
                                      aggfunc='count', fill_value=0)
user_item_matrix = user_item_matrix.astype(float)
sparse_user_item = csr_matrix(user_item_matrix).tocsr()
"""

"\nuser_item_matrix = pd.pivot_table(prefiltered_train_lvl_1, index='user_id', columns='item_id', values='quantity',\n                                      aggfunc='count', fill_value=0)\nuser_item_matrix = user_item_matrix.astype(float)\nsparse_user_item = csr_matrix(user_item_matrix).tocsr()\n"

In [25]:
#user_item_matrix.head(2)

**checking warm items through dataframes**

In [26]:
# unique items in different dataframes
unique_items_all = data['item_id'].unique()
unique_items_train = data_train_lvl_1['item_id'].unique()
unique_items_prefiltered = prefiltered_train_lvl_1['item_id'].unique()
unique_items_lvl1 = data_val_lvl_1['item_id'].unique()
unique_items_prefil_lvl1 = prefiltered_lvl_1['item_id'].unique()
unique_items_lvl2 = data_val_lvl_2['item_id'].unique()
unique_items_prefil_lvl2 = prefiltered_lvl_2['item_id'].unique()
unique_items_test = test_data['item_id'].unique()
unique_items_prefil_test = prefiltered_test['item_id'].unique()
unique_items_infeatures = item_features['item_id'].unique()

In [27]:
print(f'number of items in: \n \
- all {len(unique_items_all)},\n \
- prefiltered train {len(unique_items_prefiltered)},\n \
- lvl1 {len(unique_items_lvl1)},\n \
- prefiltered lvl1 {len(unique_items_prefil_lvl1)},\n \
- lvl2 {len(unique_items_lvl2)},\n \
- prefiltered lvl2 {len(unique_items_prefil_lvl2)},\n \
- test {len(unique_items_test)},\n \
- prefiltered test {len(unique_items_prefil_test)},\n \
- features data {len(unique_items_infeatures)}')

number of items in: 
 - all 89051,
 - prefiltered train 5000,
 - lvl1 27649,
 - prefiltered lvl1 5000,
 - lvl2 24329,
 - prefiltered lvl2 5000,
 - test 20497,
 - prefiltered test 5000,
 - features data 92353


In [28]:
# number of items in prefiltered validation lvl 1 transactions absent in prefiltered train (cold items for valid lvl 1)
len([item_n for item_n in unique_items_prefil_lvl1 if item_n not in unique_items_prefiltered])

1525

In [29]:
# number of items in prefiltered validation lvl 2 transactions absent in prefiltered train (cold items for valid lvl 2)
len([item_n for item_n in unique_items_prefil_lvl2 if item_n not in unique_items_prefiltered])

1606

In [30]:
# number of items in prefiltered test transactions absent in prefiltered train (cold items for test)
len([item_n for item_n in unique_items_prefil_test if item_n not in unique_items_prefiltered])

1769

**2. Building ALS model to get embeddings**

In [31]:
# als_model on prefiltered train
als_model = MainRecommender(prefiltered_train_lvl_1)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:03<00:00,  4.05it/s]
100%|███████████████████████████████████████████████████████████████████████████| 5000/5000 [00:00<00:00, 14399.64it/s]


In [32]:
user_item_matrix = als_model.user_item
item_factors = als_model.item_factors
user_factors = als_model.user_factors
user_item_matrix

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819255,...,15511891,15596279,15596488,15596515,15778533,15926844,15926886,15927403,15927661,15927850
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2498,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# als_model on prefiltered lvl1
als_model_lvl1 = MainRecommender(prefiltered_lvl_1)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:02<00:00,  6.05it/s]
100%|██████████████████████████████████████████████████████████████████████████| 5000/5000 [00:00<00:00, 138893.44it/s]


In [34]:
user_item_matrix_lvl1 = als_model_lvl1.user_item
item_factors_lvl1 = als_model_lvl1.item_factors
user_factors_lvl1 = als_model_lvl1.user_factors
user_item_matrix_lvl1

item_id,397896,480014,707683,818980,819063,819112,819227,819255,819304,819308,...,17105257,17105540,17106064,17168855,17169131,17178955,17179662,17179814,17208470,17215077
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**3. Prepare user and item features for lightfm**

* on prefiltered train

In [35]:
# prepare user_item matrix on train prefilterted
user_item_lightfm = get_useritem_matrix(user_item_matrix)
user_item_lightfm.shape

(2484, 5000)

In [36]:
# prepare user_features matrix (adding new features, adding embeddings)
user_features_lightfm = get_user_matrix(prefiltered_train_lvl_1, user_features)
user_features_lightfm.shape

(2484, 51)

In [37]:
user_features_lightfm.head(2)

Unnamed: 0_level_0,mean_time,average_basket,baskets_per_week,age_desc_0,age_desc_19-24,age_desc_25-34,age_desc_35-44,age_desc_45-54,age_desc_55-64,age_desc_65+,...,household_size_desc_1,household_size_desc_2,household_size_desc_3,household_size_desc_4,household_size_desc_5+,kid_category_desc_0,kid_category_desc_1,kid_category_desc_2,kid_category_desc_3+,kid_category_desc_None/Unknown
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1400.204956,2.374918,20.452235,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2.0,1772.341064,2.639509,10.744353,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [38]:
# prepare item_features matrix (adding new features, adding embeddings, changing id numbers)
item_features_lightfm = get_item_matrix(prefiltered_train_lvl_1, item_features)
item_features_lightfm.shape

(5000, 29)

In [39]:
item_features_lightfm.head(2)

Unnamed: 0_level_0,manufacturer,retail_disc,quantity_of_sales,quantity_of_sales_per_week,qnt_of_sales_per_dep,qnt_of_sales_per_item_per_dep_per_week,price,department_CHEF SHOPPE,department_COUP/STR & MFG,department_DELI,...,department_NUTRITION,department_PASTRY,department_PRODUCE,department_SALAD BAR,department_SEAFOOD,department_SEAFOOD-PCKGD,department_SPIRITS,department_TRAVEL & LEISUR,brand_National,brand_Private
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
202291,69,-0.5925,4,0.047059,4681,3.059477,0.002279,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
397896,69,-0.355979,97,1.141176,17973,17.620588,0.00238,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


* on prefiltered validation level 1

In [40]:
# prepare user_item matrix on train prefilterted
lvl1_user_item_lightfm = get_useritem_matrix(user_item_matrix)
lvl1_user_item_lightfm.shape

(2484, 5000)

In [41]:
# prepare validation user and item features for lightfm

In [42]:
val1_item_features_lightfm = get_item_matrix(prefiltered_lvl_1, item_features)
val1_item_features_lightfm.shape

(5000, 30)

In [43]:
val1_item_features_lightfm.tail(3)

Unnamed: 0_level_0,manufacturer,retail_disc,quantity_of_sales,quantity_of_sales_per_week,qnt_of_sales_per_dep,qnt_of_sales_per_item_per_dep_per_week,price,department_CHEF SHOPPE,department_COUP/STR & MFG,department_DELI,...,department_NUTRITION,department_PASTRY,department_PRODUCE,department_SALAD BAR,department_SEAFOOD,department_SEAFOOD-PCKGD,department_SPIRITS,department_TRAVEL & LEISUR,brand_National,brand_Private
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17179814,103,-0.18,17,2.833333,78224,3.488716,1.11,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17208470,1415,-0.150909,22,3.666667,455,1.763566,3.131515,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17215077,1273,0.0,7,1.166667,78224,3.488716,2.69,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [44]:
val1_user_features_lightfm = get_user_matrix(prefiltered_lvl_1, user_features)
val1_user_features_lightfm.shape

(2110, 51)

In [45]:
val1_user_features_lightfm.head(2)

Unnamed: 0_level_0,mean_time,average_basket,baskets_per_week,age_desc_0,age_desc_19-24,age_desc_25-34,age_desc_35-44,age_desc_45-54,age_desc_55-64,age_desc_65+,...,household_size_desc_1,household_size_desc_2,household_size_desc_3,household_size_desc_4,household_size_desc_5+,kid_category_desc_0,kid_category_desc_1,kid_category_desc_2,kid_category_desc_3+,kid_category_desc_None/Unknown
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1245.216187,2.392703,29.51,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2.0,1930.307739,2.036923,17.653333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


**4. Normalizing data**

* normalizing train

In [46]:
scaler = preprocessing.MinMaxScaler()

In [47]:
def scaling_data(scaler, data_matrix):
    matrix_copy = data_matrix.copy()
    if 'user_id' in matrix_copy.columns.values:
        matrix_copy.set_index('user_id', inplace=True)
    elif 'item_id' in matrix_copy.columns.values:
        matrix_copy.set_index('item_id', inplace=True)
    else:
        pass
    cols = matrix_copy.columns
    ind = matrix_copy.index
    d = scaler.fit_transform(matrix_copy)
    scaled_df = pd.DataFrame(d, index=ind, columns=cols)
    
    return scaled_df

In [48]:
scaled_item_features = scaling_data(scaler, item_features_lightfm)
scaled_item_features

Unnamed: 0_level_0,manufacturer,retail_disc,quantity_of_sales,quantity_of_sales_per_week,qnt_of_sales_per_dep,qnt_of_sales_per_item_per_dep_per_week,price,department_CHEF SHOPPE,department_COUP/STR & MFG,department_DELI,...,department_NUTRITION,department_PASTRY,department_PRODUCE,department_SALAD BAR,department_SEAFOOD,department_SEAFOOD-PCKGD,department_SPIRITS,department_TRAVEL & LEISUR,brand_National,brand_Private
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
202291,0.010586,0.970631,0.000123,0.000123,0.005234,0.139145,0.000114,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
397896,0.010586,0.982355,0.003948,0.003948,0.020291,1.000000,0.000119,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
420647,0.010586,0.974158,0.000576,0.000576,0.005234,0.139145,0.000137,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
480014,0.010586,0.973417,0.001069,0.001069,0.020291,1.000000,0.000116,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
545926,0.010586,0.970259,0.000000,0.000000,0.005234,0.139145,0.000145,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15926844,0.085638,0.986251,0.004647,0.004647,1.000000,0.126692,0.111184,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15926886,0.085638,0.986546,0.005428,0.005428,1.000000,0.126692,0.111752,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15927403,0.197346,0.992038,0.003948,0.003948,1.000000,0.126692,0.029521,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15927661,0.807237,0.999804,0.002138,0.002138,1.000000,0.126692,0.012443,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [49]:
scaled_user_features = scaling_data(scaler, user_features_lightfm)

* normalizing valid level 1

In [50]:
scaled_item_features_lvl1 = scaling_data(scaler, val1_item_features_lightfm)

In [51]:
scaled_user_features_lvl1 = scaling_data(scaler, val1_user_features_lightfm)

### building lightfm model

In [52]:
#model without normalizing
model = LightFM(no_components=10,
                loss='bpr',
                learning_rate=0.005, 
                item_alpha=0.001,
                user_alpha=0.001, 
                random_state=42)

model.fit(coo_matrix(user_item_lightfm),
          user_features=csr_matrix(user_features_lightfm.values).tocsr(),
          item_features=csr_matrix(item_features_lightfm.values).tocsr(),

          epochs=10, 
          num_threads=1)


<lightfm.lightfm.LightFM at 0x28b05803808>

In [53]:
train_precision = precision_at_k(model, coo_matrix(user_item_lightfm),
                                 user_features=csr_matrix(user_features_lightfm.values).tocsr(),
                                 item_features=csr_matrix(item_features_lightfm.values).tocsr(),
                                 k=5).mean()

train_precision


0.38405797

In [54]:
model_scaled = LightFM(no_components=10,
                loss='bpr',
                learning_rate=0.005, 
                item_alpha=0.001,
                user_alpha=0.001, 
                random_state=42)

model_scaled.fit(coo_matrix(user_item_lightfm),
          user_features=csr_matrix(scaled_user_features.values).tocsr(),
          item_features=csr_matrix(scaled_item_features.values).tocsr(),

          epochs=10, 
          num_threads=1) 

<lightfm.lightfm.LightFM at 0x28b05aee288>

In [55]:
train_precision = precision_at_k(model_scaled, coo_matrix(user_item_lightfm),
                                 user_features=csr_matrix(scaled_user_features.values).tocsr(),
                                 item_features=csr_matrix(scaled_item_features.values).tocsr(),
                                 k=5).mean()

train_precision

0.50209343

In [56]:
"""%%time
gbm = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='top_k_accuracy')
gbm.fit(coo_matrix(user_item_matrix),user_features=csr_matrix(scaled_user_features.values).tocsr(),
                                 item_features=csr_matrix(scaled_item_features.values).tocsr())

print('Best parameters found by grid search are:', gbm.best_params_)


Best parameters found by grid search are: {'item_alpha': 0.001, 'learning_rate': 0.005, 'loss': 'bpr', 'no_components': 10, 'user_alpha': 0.001}
Wall time: 9h 35min 7s
"""

"%%time\ngbm = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='top_k_accuracy')\ngbm.fit(coo_matrix(user_item_matrix),user_features=csr_matrix(scaled_user_features.values).tocsr(),\n                                 item_features=csr_matrix(scaled_item_features.values).tocsr())\n\nprint('Best parameters found by grid search are:', gbm.best_params_)\n\n\nBest parameters found by grid search are: {'item_alpha': 0.001, 'learning_rate': 0.005, 'loss': 'bpr', 'no_components': 10, 'user_alpha': 0.001}\nWall time: 9h 35min 7s\n"

* Lightfm predictions on train

In [57]:
train_lightfm_preds = []
for train_user in user_features_lightfm.index.values.astype(int):
    predictions = model_scaled.predict(user_ids=int(als_model.userid_to_id[train_user]), item_ids=np.fromiter(als_model.itemid_to_id.values(), dtype=float),
                            user_features=csr_matrix(scaled_user_features.values).tocsr(),
                            item_features=csr_matrix(scaled_item_features.values).tocsr(),
                            num_threads=1)
    train_lightfm_preds.append(predictions)

In [58]:
# forming dataframe of predictions

train_preds = pd.DataFrame(train_lightfm_preds, index=als_model.user_item.index, columns=als_model.user_item.columns)

In [59]:
train_preds

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819255,...,15511891,15596279,15596488,15596515,15778533,15926844,15926886,15927403,15927661,15927850
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-120.787079,-122.063858,-120.783585,-122.083496,-120.787949,-120.786880,-122.089249,-121.612930,-121.543190,-122.555351,...,-121.807800,-121.543274,-121.536430,-121.551437,-120.163452,-121.567909,-121.563202,-121.613930,-122.005264,-121.616875
2,-164.566406,-166.173676,-164.562134,-166.198227,-164.567520,-164.566254,-166.205475,-165.388214,-165.312561,-166.443344,...,-165.634689,-165.304962,-165.305603,-165.320251,-163.783279,-165.352371,-165.346603,-165.386597,-165.868042,-165.390594
3,-172.066864,-173.789566,-172.062332,-173.815201,-172.067993,-172.066681,-173.822845,-172.965927,-172.884720,-174.115158,...,-173.244965,-172.876953,-172.875916,-172.891968,-171.208771,-172.925232,-172.919189,-172.964294,-173.484024,-172.968430
4,-180.825439,-182.717804,-180.820557,-182.745239,-180.826630,-180.825165,-182.753418,-181.819626,-181.730484,-183.094101,...,-182.148666,-181.722748,-181.720123,-181.738449,-179.858627,-181.772385,-181.765961,-181.818115,-182.387756,-181.822479
5,-181.224091,-183.099380,-181.219208,-183.126892,-181.225311,-181.223877,-183.135040,-182.202728,-182.114166,-183.458572,...,-182.523956,-182.106384,-182.104523,-182.122559,-180.272064,-182.156708,-182.150253,-182.201202,-182.764694,-182.205612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,-137.620850,-139.287445,-137.616776,-139.310059,-137.621841,-137.620453,-139.316650,-138.615448,-138.532761,-139.825699,...,-138.924774,-138.530014,-138.520355,-138.538818,-136.747330,-138.561050,-138.555725,-138.615570,-139.123062,-138.618942
2497,-157.303024,-159.271179,-157.297699,-159.301239,-157.304321,-157.302750,-159.310028,-158.399261,-158.301666,-159.750458,...,-158.720230,-158.295181,-158.292313,-158.312531,-156.314346,-158.346497,-158.339432,-158.398331,-158.998810,-158.403061
2498,-130.734314,-132.115768,-130.731018,-132.134094,-130.735123,-130.733978,-132.139252,-131.598389,-131.530090,-132.618271,...,-131.832153,-131.527588,-131.518463,-131.533951,-130.029663,-131.552216,-131.547958,-131.598389,-132.025528,-131.601105
2499,-132.569901,-134.348160,-132.564697,-134.378250,-132.571213,-132.569885,-134.387314,-133.420349,-133.333923,-134.537079,...,-133.674530,-133.325760,-133.332153,-133.348267,-131.730621,-133.385254,-133.378082,-133.418777,-133.939362,-133.423737


In [60]:
# sorting out the highest ranking 50 items on each user
df_best500 = []
for x in train_preds.index:
    df_best500.append([(train_preds.loc[x].sort_values(ascending=False).index.values[:50])])
    

In [61]:
# forming dataframe with predicions
df_best50 = pd.DataFrame(df_best500, index=als_model.user_item.index, columns=['rec_50'])

In [62]:
#
r = result_lvl_1.join(df_best50, on=['user_id'], how='left')

In [63]:
# recommendations for cold users added
r.loc[r['user_id'].isin(cold_users_1),
      ['rec_50']
     ] = r['user_id'].apply(lambda x:cold_user_recommend(prefiltered_train_lvl_1, n=50))

In [64]:
# after adding popular items for cold users on matrix multiplication method for warm users

pres_users = []
for x_user in r['user_id']:
    user_pres = prec_at_k(r.loc[r['user_id']== x_user, 'rec_50'].values[0],
                               r.loc[r['user_id']== x_user,'actual'].values[0],
                               k=5)
    pres_users.append(user_pres)
    
np.mean(pres_users)

0.21541318477251625