# Вебинар 6. Двухуровневые модели рекомендаций


Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) github репозитория

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
PATH = 'C:/Users/ASER/Desktop/GeekBrains/RecSys/data/'

In [4]:
data = pd.read_csv(PATH + 'retail_train.csv')
item_features = pd.read_csv(PATH + 'product.csv')
user_features = pd.read_csv(PATH + 'hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [5]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [6]:
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [7]:
recommender.get_als_recommendations(2375, N=5)

[899624, 845208, 1004906, 860776, 1037863]

In [8]:
recommender.get_own_recommendations(2375, N=5)

[1036501, 1079023, 1085983, 907099, 1027642]

In [9]:
recommender.get_similar_items_recommendation(2375, N=5)

[868764, 889731, 1055646, 1046545, 9527160]

In [10]:
recommender.get_similar_users_recommendation(2375, N=1) # 294, 542, 742, ##1021, ##1070, 1531, ##1541, 1867

[1082212]

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [11]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [12]:
N = 50
users_train = data_train_lvl_1['user_id'].tolist()
users_valid = result_lvl_1['user_id'].tolist()
new_users = list(set(users_valid) - set(users_train))
users_list = list(set(users_valid) & set(users_train))

cols = ['top_popular', 'als_recommendations', 
        'own_recommendations', 'similar_items_recommendation',
        #'similar_users_recommendation'
       ]
df = pd.DataFrame(index=users_valid, columns=cols)

overall_top_purchases = recommender.overall_top_purchases[:N]
for user in new_users:
    for col in cols:
        df.loc[user, col] = overall_top_purchases
        
for user in users_list:
    df.loc[user, 'top_popular'] = overall_top_purchases
    df.loc[user, 'als_recommendations'] = recommender.get_als_recommendations(user, N)
    df.loc[user, 'own_recommendations'] = recommender.get_own_recommendations(user, N)
    df.loc[user, 'similar_items_recommendation'] = recommender.get_similar_items_recommendation(user, N)    
    # df.loc[user, 'similar_users_recommendation'] = recommender.get_similar_users_recommendation(user, N)    
    
        
df = df.reset_index()
df.columns = ['user_id'] + cols
result_lvl_1 = result_lvl_1.merge(df, on='user_id', how='left')

In [13]:
for col in cols:    
    recall = result_lvl_1.apply(lambda row: recall_at_k(row[col], row['actual'], N), axis=1).mean()
    print(f'recall_at_{N} {col}: {round(recall, 4)}')

recall_at_50 top_popular: 0.0848
recall_at_50 als_recommendations: 0.075
recall_at_50 own_recommendations: 0.1003
recall_at_50 similar_items_recommendation: 0.0562


__Вывод__: own recommendtions + top-popular дают лучший recall

In [14]:
k = [20, 50, 100, 200, 500]
cols = ['own_recommendations_' + str(i) for i in k]
df = pd.DataFrame(index=users_valid, columns=cols)

for i, N in enumerate(k):
    overall_top_purchases = recommender.overall_top_purchases[:N]
    for user in new_users:        
        df.loc[user, cols[i]] = overall_top_purchases
        
    for user in users_list:
        df.loc[user, cols[i]] = recommender.get_own_recommendations(user, N)
        
df = df.reset_index()
df.columns = ['user_id'] + cols
result_lvl_1 = result_lvl_1.merge(df, on='user_id', how='left')

In [15]:
for i, N in enumerate(k):   
    recall = result_lvl_1.apply(lambda row: recall_at_k(row[cols[i]], row['actual'], N), axis=1).mean()
    print(f'recall_at_{N} {cols[i]}: {round(recall, 4)}')

recall_at_20 own_recommendations_20: 0.0619
recall_at_50 own_recommendations_50: 0.1003
recall_at_100 own_recommendations_100: 0.148
recall_at_200 own_recommendations_200: 0.2212
recall_at_500 own_recommendations_500: 0.3296


__Вывод:__ recall_at_k увеличивается с увеличением k. 

Слишком большое число рекомендаций равносильно их полному отсутствию. Оптимальное число рекомендаций зависит от того, для кого мы их делаем: если мы что-то рекомендуем оптовым покупателям, то 100-200 позиций выглядят разумно, а если домохозяке, то лучше остановиться на 5-10 позициях.

### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [16]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


#### 1. Одноуровневая модель

In [17]:
data_train = data[data['week_no'] < data['week_no'].max() - val_lvl_2_size_weeks]

n_items_before = data_train['item_id'].nunique()
data_train = prefilter_items(data_train, item_features=item_features, take_n_popular=5000)

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 5001


In [18]:
recommender = MainRecommender(data_train)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [19]:
N = 5
users_train = data_train_lvl_1['user_id'].tolist() + data_train_lvl_2['user_id'].tolist()
users_valid = result_lvl_2['user_id'].tolist()
new_users = list(set(users_valid) - set(users_train))
users_list = list(set(users_valid) & set(users_train))

cols = ['lvl_1_recommendations']
df = pd.DataFrame(index=users_valid, columns=cols)
        
for user in users_list:
    df.loc[user, 'lvl_1_recommendations'] = recommender.get_own_recommendations(user, N)
    
df = df.reset_index()
df.columns = ['user_id'] + cols
result_lvl_2 = result_lvl_2.merge(df, on='user_id', how='left')

In [20]:
result_lvl_2.apply(lambda row: precision_at_k(row['lvl_1_recommendations'], row['actual'], 5), axis=1).mean()

0.22801175318315378

#### 2. Двухуровневая модель 

In [21]:
user_features['age_desc'].replace(
    {'19-24': 22, '25-34': 30, '35-44': 40, '45-54': 50, '55-64': 60, '65+': 70},
    inplace=True)

user_features['marital_status_code'].replace(
    {'U': 0, 'A': 1, 'B': 2}, inplace=True)

user_features['income_desc'].replace(
    {'Under 15K': 10, '15-24K': 20, '25-34K':30, '35-49K': 40,
     '50-74K': 62, '75-99K': 87, '100-124K': 112, '125-149K': 137, 
     '150-174K': 162, '175-199K': 187, '200-249K': 225, '250K+':275}, inplace=True)

user_features['homeowner_desc'].replace(
    {'Unknown': 0, 'Probable Renter': 1, 'Renter': 2,
     'Probable Owner': 3, 'Homeowner': 4}, inplace=True)

user_features['hh_comp_desc'].replace(
    {'Unknown': 0, 'Single Male': 1, 'Single Female': 2,
     '1 Adult Kids': 3, '2 Adults No Kids': 4, '2 Adults Kids':5},inplace=True)

user_features['household_size_desc'].replace({'5+': 5}, inplace=True) 

user_features['kid_category_desc'].replace(
    {'None/Unknown': 0, '3+': 3}, inplace=True)

user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,70,1,40,4,4,2,0,1
1,50,1,62,4,4,2,0,7


In [22]:
names = ['manufacturer', 'department', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product']
for name in names:
    new_name = name + '_freq'
    a = item_features[name].value_counts()
    ind = a.index.tolist()
    for i in ind:
        item_features.loc[item_features[name] == i, new_name] = a[i]

item_features['brand'] = np.where(item_features['brand']=='Private', 0, 1)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,manufacturer_freq,department_freq,commodity_desc_freq,sub_commodity_desc_freq,curr_size_of_product_freq
0,25671,2,GROCERY,1,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,1411.0,39021.0,29.0,29.0,12.0
1,26081,2,MISC. TRANS.,1,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,1411.0,490.0,490.0,429.0,30607.0


In [23]:
df = data_train_lvl_1.groupby(['user_id', 'basket_id'])['sales_value'].sum().reset_index()
df = df.groupby('user_id')['sales_value'].mean().reset_index()
df.columns = ['user_id', 'mean_check']
user_features = user_features.merge(df, on='user_id')
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,mean_check
0,70,1,40,4,4,2,0,1,54.534677
1,50,1,62,4,4,2,0,7,59.761026


In [24]:
data_train_lvl_1['hour'] = data_train_lvl_1['trans_time'] // 100
user_item_features = data_train_lvl_1.groupby(['user_id', 'item_id'])['hour'].median().reset_index()
user_item_features.columns = ['user_id', 'item_id', 'median_sales_hour']

data_train_lvl_1['weekday'] = data_train_lvl_1['day'] % 7
df = data_train_lvl_1.groupby(['user_id', 'item_id'])['weekday'].median().reset_index()
df.columns = ['user_id', 'item_id', 'median_weekday']
user_item_features = user_item_features.merge(df, on=['user_id', 'item_id'])

user_item_features.head(2)

Unnamed: 0,user_id,item_id,median_sales_hour,median_weekday
0,1,820165,13.0,2.0
1,1,823721,13.0,4.0


In [25]:
data_train_lvl_1

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price,hour,weekday
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0,1.39,16,1
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,0.0,0.0,0.82,16,1
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,0.0,0.0,0.99,16,1
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,0.0,0.0,1.21,16,1
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0,1.50,16,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2109568,856,40630539494,593,999999,1,1.99,372,-1.00,1831,85,0.0,0.0,1.99,18,5
2109569,856,40630539494,593,1120213,1,1.67,372,0.00,1831,85,0.0,0.0,1.67,18,5
2109570,856,40630539494,593,999999,1,5.69,372,-0.30,1831,85,0.0,0.0,5.69,18,5
2109571,856,40630539494,593,999999,1,10.99,372,-3.30,1831,85,0.0,0.0,10.99,18,5


In [26]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Пока только warm start
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

recommender = MainRecommender(data_train_lvl_1)
users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))
users_lvl_2.head(2)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




Unnamed: 0,user_id,candidates
0,2070,"[834103, 878302, 1119399, 1085604, 13511722, 9..."
1,2021,"[1119454, 1019142, 871279, 835578, 863762, 101..."


In [27]:
df=pd.DataFrame({'user_id':users_lvl_2.user_id.values.repeat(len(users_lvl_2.candidates[0])),
                 'item_id':np.concatenate(users_lvl_2.candidates.values)})

targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = df.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')
targets_lvl_2['target'].fillna(0, inplace= True)

targets_lvl_2['target'].mean()

0.1763372502019228

In [28]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_item_features, on=['user_id', 'item_id'], how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,manufacturer_freq,...,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,mean_check,median_sales_hour,median_weekday
0,2070,834103,1.0,2224,GROCERY,1,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,355.0,...,50.0,0.0,62.0,0.0,0.0,1,0,11.157471,4.0,4.0
1,2070,834103,1.0,2224,GROCERY,1,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,355.0,...,50.0,0.0,62.0,0.0,0.0,1,0,11.157471,4.0,4.0


In [29]:
SELECTED_FEATURES_NAMES = ['brand',
                           'manufacturer_freq', 'department_freq', 'commodity_desc_freq',
                           'sub_commodity_desc_freq', 'curr_size_of_product_freq', 
                           'age_desc', 'marital_status_code', 'income_desc', 
                           'homeowner_desc', 'hh_comp_desc',
                           'mean_check', 'median_sales_hour', 'median_weekday',
    
    #'household_size_desc', 'kid_category_desc',
]

In [30]:
X_train = targets_lvl_2[SELECTED_FEATURES_NAMES]
y_train = targets_lvl_2[['target']]

In [31]:
model_lgb = LGBMClassifier(objective='binary', max_depth=7)
model_lgb.fit(X_train.fillna(0), y_train)

LGBMClassifier(max_depth=7, objective='binary')

In [32]:
model_lgb = LGBMClassifier(objective='binary', max_depth=7)
model_lgb.fit(X_train[SELECTED_FEATURES_NAMES], y_train)

train_preds = model_lgb.predict_proba(X_train)[:, 1]

In [33]:
result = targets_lvl_2[['user_id', 'item_id']]
result['predictions'] = train_preds
result.sort_values(['predictions'], ascending=False, inplace=True)
result = result.groupby('user_id').head(5)

In [34]:
df = result.groupby('user_id')['item_id'].unique().reset_index()
df.columns = ['user_id', 'lgb_recommendations']
result_lvl_2 = result_lvl_1[['user_id', 'actual']].merge(df, on='user_id')
result_lvl_2

Unnamed: 0,user_id,actual,lgb_recommendations
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[934369, 932949, 8090541, 928342, 931136]"
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1075368, 904236, 983078, 1040807, 1035843]"
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[962229, 1075368, 891423, 1115098, 904973]"
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[1027808, 1119051]"
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[1017061, 1122358, 1001525, 6602729, 1010424]"
...,...,...,...
2147,2496,"[831509, 867188, 1013623, 1048851, 5592734, 16...","[1056509, 972931, 986760, 10285187]"
2148,2497,"[820291, 824759, 838797, 859010, 859075, 86077...","[1066893, 1031316, 864279, 1135834, 850102]"
2149,2498,"[865511, 962991, 1076374, 1102358, 5564901, 15...","[7104690, 9526100, 933354, 1028473, 9420336]"
2150,2499,"[861282, 921744, 1050968, 13842089, 828837, 86...","[1070820, 947798, 873964, 5570048]"


In [35]:
result_lvl_2.apply(lambda row: precision_at_k(row['lgb_recommendations'], row['actual'], 5), axis=1).mean()
# 0.28105638166047087

0.28105638166047087

__Вывод:__ При использовании двухуровневой модели precision@5 вырос.