# Вебинар 6. Двухуровневые модели рекомендаций


Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) github репозитория

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
#from recommenders import MainRecommender
from implicit.als import AlternatingLeastSquares

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the transaction log and the item / household feature tables.
data = pd.read_csv('../retail_train.csv')
item_features = pd.read_csv('../product.csv')
user_features = pd.read_csv('../hh_demographic.csv')

# Column processing: lower-case the feature column names and rename the key
# columns to the item_id / user_id names used by the transaction log.
item_features.columns = list(map(str.lower, item_features.columns))
user_features.columns = list(map(str.lower, user_features.columns))

item_features = item_features.rename(columns={'product_id': 'item_id'})
user_features = user_features.rename(columns={'household_key': 'user_id'})


# The train/validation scheme matters!
# -- older purchases -- | -- 6 weeks -- | -- 3 weeks --
# Tune the size of the 2nd dataset (6 weeks) with a learning curve
# (recall@k as a function of dataset size).
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

# Week boundaries are computed once instead of re-evaluating max() in every mask.
last_week = data['week_no'].max()
lvl_1_val_start = last_week - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)
lvl_2_val_start = last_week - val_lvl_2_size_weeks

in_lvl_1_train = data['week_no'] < lvl_1_val_start
in_lvl_1_val = (data['week_no'] >= lvl_1_val_start) & (data['week_no'] < lvl_2_val_start)
# NOTE(review): `>= last_week - 3` keeps weeks last_week-3 .. last_week inclusive,
# i.e. four distinct values if week_no holds consecutive integers - confirm that
# this matches the "3 weeks" in the scheme above.
in_lvl_2_val = data['week_no'] >= lvl_2_val_start

data_train_lvl_1 = data[in_lvl_1_train]
data_val_lvl_1 = data[in_lvl_1_val]

# Copied for clarity; later changes will make the two frames differ.
data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[in_lvl_2_val]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
# Run the project's item prefilter on the level-1 training window and report
# the number of distinct items before and after it.
n_items_before = data_train_lvl_1.item_id.nunique()
prefiltered_train_lvl_1 = prefilter_items(data_train_lvl_1)
n_items_after = prefiltered_train_lvl_1.item_id.nunique()

prefilter_summary = 'Decreased # items from {} to {}'.format(n_items_before, n_items_after)
print(prefilter_summary)

Decreased # items from 83685 to 5000


In [4]:
# Preview the first two rows of the prefiltered level-1 training data.
prefiltered_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [5]:
# User x item table: each cell counts the 'quantity' entries recorded for that
# (user_id, item_id) pair in the prefiltered training data, 0 where there are none.
user_item_matrix = prefiltered_train_lvl_1.pivot_table(
    index='user_id',
    columns='item_id',
    values='quantity',
    aggfunc='count',
    fill_value=0,
)
user_item_matrix

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819255,...,15511891,15596279,15596488,15596515,15778533,15926844,15926886,15927403,15927661,15927850
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,2,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2498,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2499,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


### Задание 1



Дают ли own recommendations + top-popular лучший recall?  

 
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [6]:
# Ground truth for level 1: one row per user holding the array of distinct
# items that user has in the level-1 validation window.
result_lvl_1 = (
    data_val_lvl_1
    .groupby('user_id')['item_id']
    .unique()
    .reset_index(name='actual')
)
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна
B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500} 

In [7]:
def prepare_matrx(data):
    """Build a user-by-item interaction-count table.

    Rows are the ``user_id`` values and columns the ``item_id`` values found in
    ``data``. Each cell holds the count of ``quantity`` entries for that pair,
    with 0 for pairs that never occur.

    Args:
        data: DataFrame with ``user_id``, ``item_id`` and ``quantity`` columns.

    Returns:
        The pivoted DataFrame cast to float.
    """
    interaction_counts = data.pivot_table(
        index='user_id',
        columns='item_id',
        values='quantity',
        aggfunc='count',
        fill_value=0,
    )

    # implicit requires a floating-point matrix.
    return interaction_counts.astype(float)

In [8]:
def prepare_dict(user_item_matrix):
    """Create lookups between external ids and matrix positions.

    Positions are 0-based and follow the order of the matrix index (users) and
    columns (items).

    Args:
        user_item_matrix: DataFrame whose index holds user ids and whose
            columns hold item ids.

    Returns:
        Tuple ``(id_to_itemid, id_to_userid, itemid_to_id, userid_to_id)``:
        position -> item id, position -> user id, item id -> position and
        user id -> position.
    """
    user_ids = user_item_matrix.index.values
    item_ids = user_item_matrix.columns.values

    user_positions = np.arange(len(user_ids))
    item_positions = np.arange(len(item_ids))

    # Forward lookups: matrix position -> external id.
    id_to_userid = {pos: uid for pos, uid in zip(user_positions, user_ids)}
    id_to_itemid = {pos: iid for pos, iid in zip(item_positions, item_ids)}

    # Reverse lookups: external id -> matrix position.
    userid_to_id = {uid: pos for uid, pos in zip(user_ids, user_positions)}
    itemid_to_id = {iid: pos for iid, pos in zip(item_ids, item_positions)}

    return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

In [9]:
def upd_dict(self, user_id):
    """Register ``user_id`` in the user lookup tables if it is not there yet.

    A new user receives the next free matrix position: one past the largest
    position stored in ``userid_to_id``, or 0 when that mapping is empty.
    Both ``userid_to_id`` and ``id_to_userid`` are updated in place.

    NOTE(review): the four lookup dicts are read as module-level names, and none
    of the notebook cells visible here binds them at module level, so they must
    be defined before this function is called. ``self`` is never used -
    presumably a leftover from a class method; confirm.

    Args:
        self: Unused.
        user_id: External user identifier to register.

    Returns:
        Tuple ``(id_to_itemid, id_to_userid, itemid_to_id, userid_to_id)``.
    """
    if user_id not in userid_to_id:
        # default=-1 puts the first user at position 0 instead of raising
        # ValueError from max() on an empty mapping.
        next_id = max(userid_to_id.values(), default=-1) + 1

        userid_to_id[user_id] = next_id
        id_to_userid[next_id] = user_id
    return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

In [10]:
def fit(data, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
    """Train an ALS model on BM25-weighted interaction counts.

    The interaction table from ``prepare_matrx`` is BM25-weighted and the model
    is fitted on its transpose (items as rows, users as columns).

    NOTE(review): fitting on the item-by-user orientation looks like the
    convention of implicit releases before 0.5; newer releases are believed to
    expect user-by-item - confirm against the installed version.

    Args:
        data: Transactions accepted by ``prepare_matrx``.
        n_factors: Passed to ``AlternatingLeastSquares`` as ``factors``.
        regularization: Passed to ``AlternatingLeastSquares``.
        iterations: Passed to ``AlternatingLeastSquares``.
        num_threads: Passed to ``AlternatingLeastSquares``.

    Returns:
        The fitted ``AlternatingLeastSquares`` model.
    """
    interaction_counts = prepare_matrx(data)
    # bm25_weight is applied to the transposed table and the result is flipped
    # back so that users are rows again.
    weighted_user_items = bm25_weight(interaction_counts.T).T

    recommender = AlternatingLeastSquares(
        factors=n_factors,
        regularization=regularization,
        iterations=iterations,
        num_threads=num_threads,
    )

    item_users = csr_matrix(weighted_user_items).T.tocsr()
    recommender.fit(item_users)

    return recommender

In [11]:
def get_als_recommends(prefiltered_data, user, unfiltered_data, model, N=50):
    """Return the top-``N`` ALS item ids for one user.

    Users present in ``prefiltered_data`` are scored from that data alone. For
    any other user, the user's rows from ``unfiltered_data`` are appended so the
    model can recalculate user factors from that history
    (``recalculate_user=True``).

    Fixes relative to the previous version of the unknown-user branch:
      * it used the global ``als_model`` instead of the ``model`` argument;
      * it passed the raw transaction rows (``csr_matrix(new_df)``) as
        ``user_items`` instead of the weighted user-item matrix;
      * items seen only in the appended history added extra columns, shifting
        item positions away from those of ``prefiltered_data``. Columns are now
        restricted to the items of ``prefiltered_data``.

    NOTE(review): column alignment assumes ``model`` was fitted on
    ``prepare_matrx(prefiltered_data)`` - confirm at the call site.
    NOTE: the interaction matrix is rebuilt and re-weighted on every call, so
    calling this once per user is expensive.

    Args:
        prefiltered_data: Transactions used to build the interaction matrix.
        user: External user id to recommend for.
        unfiltered_data: Transactions searched for the history of a user that
            is absent from ``prefiltered_data``.
        model: Fitted recommender exposing ``recommend``.
        N: Number of items to return.

    Returns:
        ``np.ndarray`` of recommended item ids.

    Raises:
        KeyError: If ``user`` has no rows in either data frame.
    """
    is_known_user = user in prefiltered_data['user_id'].values

    if is_known_user:
        interactions = prefiltered_data
    else:
        user_history = unfiltered_data.loc[unfiltered_data['user_id'] == user, :]
        interactions = pd.concat([prefiltered_data, user_history])

    user_item_matrix = prepare_matrx(interactions)
    if not is_known_user:
        # Keep exactly the (sorted) item columns of the prefiltered data so that
        # column positions are not shifted by items only this user has.
        known_items = np.sort(prefiltered_data['item_id'].unique())
        user_item_matrix = user_item_matrix.reindex(columns=known_items, fill_value=0.0)

    id_to_itemid, _, _, userid_to_id = prepare_dict(user_item_matrix)
    weighted_user_items = bm25_weight(user_item_matrix.T).T

    rec = model.recommend(userid=userid_to_id[user],
                          user_items=csr_matrix(weighted_user_items).tocsr(),
                          N=N, filter_already_liked_items=False, recalculate_user=True)

    # Each entry of rec starts with the item's matrix position; map it back to the item id.
    return np.array([id_to_itemid[entry[0]] for entry in rec])

In [12]:
# Train the level-1 ALS model on the prefiltered training data (default hyper-parameters of fit).
als_model = fit(prefiltered_train_lvl_1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:05<00:00,  2.98it/s]


In [13]:
# Candidate-list sizes (k) for which recommendations and metrics are computed.
num_candidates = [20, 50, 100, 200, 500]

In [15]:
%%time

for k in num_candidates:
    column_name = f'als{k}'
    result_lvl_1[column_name] = result_lvl_1['user_id'].apply(lambda x: get_als_recommends(prefiltered_train_lvl_1,
                                                                                     x, data_val_lvl_1, als_model, N=k))

# code in this cell successfully implemented
# took CPU times: total: 7h 11min 6s
# Wall time: 7h 12min 21s
# the results were dumped to csv file, which we are now downloading in the next cell



CPU times: total: 7h 5min 15s
Wall time: 7h 6min 2s


In [16]:
# Persist the level-1 candidates.
# NOTE(review): the list columns hold numpy arrays; judging by the preview of the
# re-read file they come back as plain strings - confirm before reusing the CSV for metrics.
result_lvl_1.to_csv('../als_test_recommendations.csv')

In [17]:
# Re-load the saved candidates; the first CSV column is used as the index.
read_result_lvl_1 = pd.read_csv('../als_test_recommendations.csv', index_col=0)

In [18]:
# Preview the in-memory candidates (first two users).
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,als20,als50,als100,als200,als500
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1062572, 1082185, 1082212, 885290, 856942, 55...","[1062572, 1082185, 1082212, 885290, 856942, 55...","[1062572, 1082185, 1082212, 885290, 856942, 55...","[1062572, 1082185, 1082212, 885290, 856942, 55...","[1062572, 1082185, 1082212, 885290, 856942, 55..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[834484, 5569230, 1082185, 855672, 854852, 113...","[834484, 5569230, 1082185, 855672, 854852, 113...","[834484, 5569230, 1082185, 855672, 854852, 113...","[834484, 5569230, 1082185, 855672, 854852, 113...","[834484, 5569230, 1082185, 855672, 854852, 113..."


In [19]:
# Preview the candidates as re-read from the CSV file (first seven users).
read_result_lvl_1.head(7)

Unnamed: 0,user_id,actual,als20,als50,als100,als200,als500
0,1,[ 853529 865456 867607 872137 874905 ...,[1062572 1082185 1082212 885290 856942 55770...,[1062572 1082185 1082212 885290 856942 55770...,[ 1062572 1082185 1082212 885290 856942 ...,[ 1062572 1082185 1082212 885290 856942 ...,[ 1062572 1082185 1082212 885290 856942 ...
1,2,[15830248 838136 839656 861272 866211 ...,[ 834484 5569230 1082185 855672 854852 11376...,[ 834484 5569230 1082185 855672 854852 11376...,[ 834484 5569230 1082185 855672 854852 ...,[ 834484 5569230 1082185 855672 854852 ...,[ 834484 5569230 1082185 855672 854852 ...
2,4,[ 883932 970760 1035676 1055863 1097610 ...,[ 902172 846550 951590 891423 1119454 8839...,[ 902172 846550 951590 891423 1119454 ...,[ 902172 846550 951590 891423 1119454 ...,[ 902172 846550 951590 891423 1119454 ...,[ 902172 846550 951590 891423 1119454 ...
3,6,[ 1024306 1102949 6548453 835394 940804 ...,[1082185 878996 999250 863632 1024306 8570...,[1082185 878996 999250 863632 1024306 8570...,[ 1082185 878996 999250 863632 1024306 ...,[ 1082185 878996 999250 863632 1024306 ...,[ 1082185 878996 999250 863632 1024306 ...
4,7,[ 836281 843306 845294 914190 920456 ...,[ 853643 1003188 849843 1082185 8119303 ...,[ 853643 1003188 849843 1082185 8119303 ...,[ 853643 1003188 849843 1082185 8119303 ...,[ 853643 1003188 849843 1082185 8119303 ...,[ 853643 1003188 849843 1082185 8119303 ...
5,8,[ 868075 886787 945611 1005186 1008787 ...,[1082185 840361 916122 981760 1029743 8262...,[1082185 840361 916122 981760 1029743 8262...,[ 1082185 840361 916122 981760 1029743 ...,[ 1082185 840361 916122 981760 1029743 ...,[ 1082185 840361 916122 981760 1029743 ...
6,9,[ 883616 1029743 1039126 1051323 1082772 ...,[1082212 1029743 856772 893018 982790 94167...,[ 1082212 1029743 856772 893018 982790 ...,[ 1082212 1029743 856772 893018 982790 ...,[ 1082212 1029743 856772 893018 982790 ...,[ 1082212 1029743 856772 893018 982790 ...


In [20]:
%%time

for k in num_candidates:
    column_name = f'als{k}'
    print(f"precision at k = {k} from {k} candidates is {result_lvl_1.apply(lambda row: precision_at_k(row[column_name], row['actual'], k=k), axis=1).mean()}")
    print(f"precision at k = 5 from {k} candidates is {result_lvl_1.apply(lambda row: precision_at_k(row[column_name], row['actual'], k=5), axis=1).mean()}")

precision at k = 20 from 20 candidates is 0.12546425255338906
precision at k = 5 from 20 candidates is 0.17734447539461468
precision at k = 50 from 50 candidates is 0.0921819870009285
precision at k = 5 from 50 candidates is 0.17734447539461468
precision at k = 100 from 100 candidates is 0.07110956360259982
precision at k = 5 from 100 candidates is 0.17734447539461468
precision at k = 200 from 200 candidates is 0.053307799442896944
precision at k = 5 from 200 candidates is 0.17734447539461468
precision at k = 500 from 500 candidates is 0.03428876508820799
precision at k = 5 from 500 candidates is 0.17734447539461468
CPU times: total: 2.17 s
Wall time: 2.19 s


In [22]:
%%time

for k in num_candidates:
    column_name = f'als{k}'
    print(f"recall_at_k = {k} from {k} candidates is {result_lvl_1.apply(lambda row: recall_at_k(row[column_name], row['actual'], k=k), axis=1).mean()}")
    print(f"recall_at_k = 5 from {k} candidates is {result_lvl_1.apply(lambda row: recall_at_k(row[column_name], row['actual'], k=5), axis=1).mean()}")

recall_at_k = 20 from 20 candidates is 0.04482995288071624
recall_at_k = 5 from 20 candidates is 0.01789508306220854
recall_at_k = 50 from 50 candidates is 0.07923768819848652
recall_at_k = 5 from 50 candidates is 0.01789508306220854
recall_at_k = 100 from 100 candidates is 0.11762020916456381
recall_at_k = 5 from 100 candidates is 0.01789508306220854
recall_at_k = 200 from 200 candidates is 0.1725552877619684
recall_at_k = 5 from 200 candidates is 0.01789508306220854
recall_at_k = 500 from 500 candidates is 0.26958839708716525
recall_at_k = 5 from 500 candidates is 0.01789508306220854
CPU times: total: 2.16 s
Wall time: 2.17 s


In [None]:
# your_code

In [None]:
### Финальный проект

Мы уже прошли всю необходимую теорию для финального проекта. Проект осуществляется на данных из вебинара (данные считаны в начале ДЗ).
Рекомендуем вам **начать делать проект сразу после этого домашнего задания**
- Целевая метрика - precision@5. Порог для успешной сдачи проекта precision@5 > 25%
- Будет public тестовый датасет, на котором вы сможете измерять метрику
- Также будет private тестовый датасет для измерения финального качества
- НЕ обязательно, но крайне желательно использовать двухуровневые рекомендательные системы в проекте
- Вы сдаете код проекта в виде github репозитория и csv файл с рекомендациями 