### Домашнее задание к вебинару 6

#### Импорт библиотек

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

#import os, sys
#module_path = os.path.abspath(os.path.join(os.pardir))
#if module_path not in sys.path:
#    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
#from src.utils1 import prefilter_items
from src.utils import prefilter_items
from src.recommenders1 import MainRecommender

### Загрузка данных

In [12]:
data = pd.read_csv('../retail_train.csv')
item_features = pd.read_csv('../product.csv')
user_features = pd.read_csv('../hh_demographic.csv')

### Process features dataset

In [13]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

### Разделим датасет на 3 части - тренировочный, тестовый и валидационный

In [14]:
# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)

# модель 1 уровня - обучается на давних покупках, валидируется на промежуточных 6 неделях
# модель 2 уровня - обучается на промежуточных 6 неделях, валидируется на последних 3 неделях


VAL_LVL1_WEEKS = 6
VAL_LVL2_WEEKS = 3

In [15]:
# берем данные для тренировки модели 1 уровня
data_train_lvl1 = data[data['week_no'] < data['week_no'].max() - (VAL_LVL1_WEEKS + VAL_LVL2_WEEKS)]

# берем данные для валидации matching модели
data_val_lvl1 = data[(data['week_no'] >= data['week_no'].max() - (VAL_LVL1_WEEKS + VAL_LVL2_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_LVL2_WEEKS))]


# берем данные для тренировки модели 2 уровня
data_train_lvl2 = data_val_lvl1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_lvl2 = data[data['week_no'] >= data['week_no'].max() - VAL_LVL2_WEEKS]

In [16]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data['user_id'].nunique()} Items: {df_data['item_id'].nunique()}")

In [17]:
print_stats_data(data_train_lvl1,'train_matcher - data lvl1')
print_stats_data(data_val_lvl1,'val_matcher - data lvl1')
print_stats_data(data_train_lvl2,'train_ranker - data lvl2')
print_stats_data(data_val_lvl2,'val_ranker - data lvl2')

train_matcher - data lvl1
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher - data lvl1
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker - data lvl2
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker - data lvl2
Shape: (118314, 12) Users: 2042 Items: 24329


In [18]:
# выше видим разброс по пользователям и товарам

In [19]:
data_train_lvl1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


### Префильтрация items

In [20]:
n_items_before = data_train_lvl1['item_id'].nunique()

data_train_lvl1 = prefilter_items(data_train_lvl1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


### Избавляемся от "холодного старта", осталвяем только "теплый старт"

In [21]:
data_train_lvl1.user_id.values

array([1285, 1285, 1285, ...,  856,  856,  856])

In [22]:
# ищем общих пользователей
common_users = list(set(data_train_lvl1.user_id.values)&(set(data_val_lvl1.user_id.values))\
                    &set(data_val_lvl2.user_id.values))

data_train_lvl1 = data_train_lvl1[data_train_lvl1.user_id.isin(common_users)]
data_val_lvl1 = data_val_lvl1[data_val_lvl1.user_id.isin(common_users)]
data_train_lvl2 = data_train_lvl2[data_train_lvl2.user_id.isin(common_users)]
data_val_lvl2 = data_val_lvl2[data_val_lvl2.user_id.isin(common_users)]

print_stats_data(data_train_lvl1,'train_matcher - data lvl1')
print_stats_data(data_val_lvl1,'val_matcher - data lvl1')
print_stats_data(data_train_lvl2,'train_ranker - data lvl2')
print_stats_data(data_val_lvl2,'val_ranker - data lvl2')

train_matcher - data lvl1
Shape: (550491, 13) Users: 1909 Items: 5000
val_matcher - data lvl1
Shape: (163158, 12) Users: 1909 Items: 27115
train_ranker - data lvl2
Shape: (163158, 12) Users: 1909 Items: 27115
val_ranker - data lvl2
Shape: (115790, 12) Users: 1909 Items: 24032


In [23]:
# Теперь warm-start по пользователям

### Инициализируем класс Main Recommender

В нем предусмотрено сразу обучение на обучающей выборке модели 1 уровня (ALS модель)

In [24]:
recommender = MainRecommender(data_train_lvl1)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

### Задание 1 - Измеряем recall@k

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?



In [25]:
# Опишем функции, в которые будут оборачиваться будущие вычисления

ACTUAL_COL = 'actual'

# Функция для получение рекомендаций
def make_recommendations(df_result, recommend_model, N=50, user_col='user_id'):
    return df_result[user_col].apply(lambda x: recommend_model(x, N=N))

# Функция для расчета recall@k
def calc_recall_at_k(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, round((df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()), 4)

# Фнукция для расчета precision@k
def calc_precision_at_k(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [26]:
# Создаем результиующий датафрейм для расчета результата метрики на тестовых данных
# Для 1 уровня тестовый набор данных - промежуточные 6 недель - data_val_lvl1

result_eval_lvl1 = data_val_lvl1.groupby('user_id')['item_id'].unique().reset_index()
result_eval_lvl1.columns=['user_id', 'actual']
result_eval_lvl1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."


In [27]:
%%time

models = {'als_rec': recommender.get_als_recommendations,
          'own_rec': recommender.get_own_recommendations, 
          'similar_item_rec': recommender.get_similar_items_recommendation, 
          'similar_user_rec': recommender.get_similar_users_recommendation}

for col_name, model in models.items():
    result_eval_lvl1[col_name] = make_recommendations(result_eval_lvl1, model)

CPU times: user 2min 10s, sys: 4min 56s, total: 7min 7s
Wall time: 1min 16s


In [28]:
result_eval_lvl1.head(8)

Unnamed: 0,user_id,actual,als_rec,own_rec,similar_item_rec,similar_user_rec
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[856942, 1037332, 5577022, 1002850, 1077133, 9...","[856942, 896666, 877391, 5577022, 1088462, 888...","[13417587, 1087955, 974177, 929341, 1088462, 9...","[6552318, 925350, 1061394, 949257, 6633167, 89..."
1,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[1026118, 930118, 857006, 1084036, 878996, 854...","[13003092, 972416, 995598, 9831557, 876626, 10...","[5569845, 819308, 5569230, 878996, 948650, 141...","[970160, 949257, 1005456, 957080, 823031, 9701..."
2,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[1126899, 1098694, 9338009, 9803591, 6396558, ...","[894360, 998519, 896666, 5570590, 1136033, 942...","[893018, 9836106, 6602697, 903800, 1029743, 12...","[1057168, 8118500, 924004, 877310, 9677052, 83..."
3,8,"[868075, 886787, 945611, 1005186, 1008787, 101...","[1075368, 916122, 1034176, 12810393, 12811532,...","[12808385, 981660, 5577022, 6463874, 886103, 8...","[5569845, 978332, 909714, 1044078, 1109136, 98...","[908314, 958023, 5577022, 1108055, 1133850, 95..."
4,9,"[883616, 1029743, 1039126, 1051323, 1082772, 1...","[1029743, 960732, 899624, 893018, 1042942, 948...","[918046, 9655676, 936830, 6632283, 1098910, 80...","[5569230, 1008032, 1015785, 7138817, 1111786, ...","[962967, 9831557, 42346, 1110743, 1129610, 103..."
5,13,"[6544236, 822407, 908317, 1056775, 1066289, 11...","[8090541, 879699, 5569376, 862070, 872177, 135...","[9488065, 965772, 1038985, 862070, 6554400, 10...","[5569230, 6979191, 1103705, 9673266, 824072, 9...","[8090612, 1131382, 1108055, 6534078, 1102003, ..."
6,14,"[917277, 981760, 878234, 925514, 986394, 10220...","[1106523, 910673, 1127758, 972143, 1025611, 87...","[822161, 1026439, 894360, 8090610, 902377, 874...","[5569230, 990335, 1025611, 863762, 910673, 901...","[962967, 875089, 841365, 5570147, 911311, 9019..."
7,15,"[996016, 1014509, 1044404, 1087353, 976199, 10...","[12523928, 863632, 866211, 1001827, 1042616, 1...","[823576, 1053530, 1052975, 9526159, 901656, 10...","[901062, 5569230, 1135476, 999999, 5564906, 86...","[12523928, 953180, 823031, 1032703, 5578856, 7..."


Переходим к расчету recall@k

### Recall@50

In [29]:
top_k_recall = 50

In [30]:
sorted(calc_recall_at_k(result_eval_lvl1, top_k_recall), key=lambda x: x[1],reverse=True)

[('own_rec', 0.0638),
 ('als_rec', 0.0492),
 ('similar_item_rec', 0.0321),
 ('similar_user_rec', 0.0061)]

In [31]:
sorted(calc_recall_at_k(result_eval_lvl1, top_k_recall), key=lambda x: x[1],reverse=True)

[('own_rec', 0.0638),
 ('als_rec', 0.0492),
 ('similar_item_rec', 0.0321),
 ('similar_user_rec', 0.0061)]

**Вывод:** Наибольший recall при k=50 кандидатов в рекомендацию дает модель own_recommendations

**Изучим, как будет меняться метрика в зависимости от количества кандидатов**

In [32]:
top_k_list = [20, 50, 100, 200, 500]

In [33]:
for k in top_k_list:
    result_eval_lvl1 = data_val_lvl1.groupby('user_id')['item_id'].unique().reset_index()
    result_eval_lvl1.columns=['user_id', ACTUAL_COL]
    
    for column_name, model in models.items():
        result_eval_lvl1[column_name] = make_recommendations(result_eval_lvl1, model, N=k)
        
    print(f'{k} кандидатов: \n{sorted(calc_recall_at_k(result_eval_lvl1, k), key=lambda x: x[1],reverse=True)}')

20 кандидатов: 
[('own_rec', 0.037), ('als_rec', 0.0309), ('similar_item_rec', 0.0168), ('similar_user_rec', 0.0038)]
50 кандидатов: 
[('own_rec', 0.0638), ('als_rec', 0.0492), ('similar_item_rec', 0.0321), ('similar_user_rec', 0.0061)]
100 кандидатов: 
[('own_rec', 0.0948), ('als_rec', 0.0688), ('similar_item_rec', 0.0523), ('similar_user_rec', 0.0101)]
200 кандидатов: 
[('own_rec', 0.1337), ('als_rec', 0.0966), ('similar_item_rec', 0.0846), ('similar_user_rec', 0.0139)]
500 кандидатов: 
[('own_rec', 0.1784), ('als_rec', 0.1457), ('similar_item_rec', 0.1343), ('similar_user_rec', 0.0209)]


**Вывод:** очевидно, что с увеличением количества кандидатов метрика пропорционально растет. Градация по убываю метрики среди моделей сохраняется (лучший результат показывает модель own_recommenders (однако не будем забывать, что она не совсем "правильная" на практике, т.к. часто рекомендует товары, которые пользователь и так купит), самый слабый результат - модель similar_user_rec).  
  
  
Однако, чем больше кандидатов, тем сложнее модель и тем дольше она обучается, а также это отражается на модели следующего уровня. Остановимся пока на количестве кандидатов - 100

### Ranking part

### Обучаем модель 2-ого уровня на выбранных кандидатах

**Задание 2.**

Обучите модель 2-ого уровня, при этом:

- Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар

- Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_ranker

- Вырос ли precision@5 при использовании двухуровневой модели?

In [63]:
# Наши периоды:
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 

### Подготовка данных для трейна

In [64]:
#Оставляем количество кандидатов = 100 и own_recommendations
N_PREDICT = 100

#  Пробовала также 200  -разницы в метрике не было

In [65]:
# взяли пользователей из трейна для ранжирования

USER_COL = 'user_id'
ITEM_COL = 'item_id'

#начинам собирать таблицу с юзерами и их кандидатами в рекомендацию (рекомендуемыми товарами)
df_lvl2_candidates = pd.DataFrame(data_train_lvl2[USER_COL].unique())
df_lvl2_candidates.columns = [USER_COL]

In [66]:
# собираем кандитатов с первого этапа (matcher)
df_lvl2_candidates['candidates'] = df_lvl2_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [67]:
df_items = df_lvl2_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [68]:
df_lvl2_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1105426, 1097350, 879194, 948640, 928263, 944..."
1,2021,"[950935, 1119454, 835578, 863762, 1097398, 101..."


In [69]:
df_lvl2_candidates = df_lvl2_candidates.drop('candidates', axis=1).join(df_items)

In [70]:
df_lvl2_candidates.tail(5)

Unnamed: 0,user_id,item_id
1914,1745,844179
1914,1745,1126899
1914,1745,1044078
1914,1745,1127831
1914,1745,866211


Датасет df_lvl2_candidates содержит user_id и топ-200 рекомендаций для каждого userа - результат модели ALS

#### Создаем трейн датасет для ранжирования с учетом результаа (кандидатов) с этапа 1 

In [71]:
# В изначальном датасете даны только фактические покупки юзеров, значит их можно заменить на target = 1
# в df_train_lvl2 будет содержаться тренировочный набор данных для обучения модели 2 уровня

df_train_lvl2 = data_train_lvl2[[USER_COL, ITEM_COL]].copy()
df_train_lvl2['target'] = 1  # тут только покупки 

In [72]:
df_train_lvl2.head(3)



Unnamed: 0,user_id,item_id,target
2104867,2070,1019940,1
2107468,2021,840361,1
2107469,2021,856060,1


In [73]:
#Присоединим к df_lvl2_candidates пары user-item из тренировочного датасета (из исходных данных)

df_train_lvl2 = df_lvl2_candidates.merge(df_train_lvl2, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_train_lvl2 = df_train_lvl2.drop_duplicates(subset=[USER_COL, ITEM_COL])

# Заполняем пропущенные значения таргета 0 - значит пользователь не совершал такие покупки раньше
df_train_lvl2['target'].fillna(0, inplace= True)

In [74]:
df_train_lvl2.target.value_counts()

0.0    177822
1.0     11150
Name: target, dtype: int64

0 класса существенно больше, чем 1, вероятно, это связано с большим количеством выбранных рекомендаций после 1 уровня

In [75]:
df_train_lvl2.tail(5)

Unnamed: 0,user_id,item_id,target
196359,1745,1070820,0.0
196360,1745,844179,0.0
196362,1745,1044078,0.0
196363,1745,1127831,0.0
196364,1745,866211,0.0


In [76]:
df_train_lvl2['target'].mean()

0.05900345024659738

#### Подготавливаем фичи для обучения модели

In [77]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [78]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [79]:
df_train_lvl2 = df_train_lvl2.merge(item_features, on='item_id', how='left')
df_train_lvl2 = df_train_lvl2.merge(user_features, on='user_id', how='left')

df_train_lvl2.head(4)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
2,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,2070,948640,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


**Фичи user_id:**
    - Средний чек
    - Средняя сумма покупки 1 товара в каждой категории
    - Кол-во покупок в каждой категории
    - Частотность покупок раз/месяц
    - Долю покупок в выходные
    - Долю покупок утром/днем/вечером

**Фичи item_id**:
    - Кол-во покупок в неделю
    - Среднее ол-во покупок 1 товара в категории в неделю
    - (Кол-во покупок в неделю) / (Среднее ол-во покупок 1 товара в категории в неделю)
    - Цена (Можно посчитать из retil_train.csv)
    - Цена / Средняя цена товара в категории
    
**Фичи пары user_id - item_id**
    - (Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)
    - (Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)
    - (Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)

### Создадим новые фичи

#### 1) Средний чек пользователя

In [80]:
# Средний чек - соотношение суммы покупок к количеству

users_sales = data_train_lvl2.groupby(USER_COL)[['sales_value', 'quantity']].sum().reset_index()

users_sales['avg_transaction'] = users_sales['sales_value'] / users_sales['quantity']

df_train_lvl2 = df_train_lvl2.merge(users_sales[['user_id', 'avg_transaction']], on='user_id', how='left')
df_train_lvl2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_transaction
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0.035173
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0.035173


#### Добавим  категории к исходному обучающему датасету для создания новых фичей

In [81]:
data_department = data_train_lvl2.merge(item_features[['item_id', 'department']], on='item_id', how='inner')
data_department.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,department
0,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0,GROCERY
1,2070,40630625006,594,1019940,1,1.0,311,-0.29,201,86,0.0,0.0,GROCERY


#### 2) Количество покупок в каждой категории
#### 3) средняя сумма покупки user-a в каждой категории

In [82]:
# Количество покупок в каждой категории

users_sales_by_department = data_department.groupby([USER_COL, 'department'])\
                        [['sales_value', 'quantity']].sum().reset_index()
users_sales_by_department.rename(columns={'quantity': 'n_sold_category'}, inplace=True)

# Средняя сумма покупки в каждой категории
users_sales_by_department['avg_users_transaction_by_category'] = users_sales_by_department['sales_value']\
                                                    /users_sales_by_department['n_sold_category']

In [83]:
users_sales_by_department.drop(columns=['sales_value'], inplace=True)

In [84]:
df_train_lvl2 = df_train_lvl2.merge(users_sales_by_department, on=[USER_COL, 'department'], how='left')
df_train_lvl2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_transaction,n_sold_category,avg_users_transaction_by_category
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0.035173,3.0,3.596667
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0.035173,213.0,1.461549


#### 4) Средняя сумма покупки в категории

In [85]:
# Средняя сумма покупки в категории
department_sales = data_department.groupby('department')['sales_value'].mean().reset_index()
department_sales.rename(columns={'sales_value': 'common_mean_sales_value_by_category'}, inplace=True)
department_sales.tail(2)

df_train_lvl2 = df_train_lvl2.merge(department_sales, on='department', how='left')


#### 5) Количество покупок юзером конкретной категории в неделю

In [86]:
data_department['week_no'].max() - data_department['week_no'].min() + 1

6

In [87]:
# Рассчитываем количество недель в датасете
n_weeks = data_department['week_no'].max() - data_department['week_no'].min() + 1

users_department = data_department.groupby([USER_COL, 'department'])['quantity'].sum().reset_index()
users_department['quantity'] /= n_weeks
users_department.rename(columns={'quantity': 'n_sold_category_user_week'}, inplace=True)

df_train_lvl2 = df_train_lvl2.merge(users_department, on=[USER_COL, 'department'], how='left')
df_train_lvl2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_transaction,n_sold_category,avg_users_transaction_by_category,common_mean_sales_value_by_category,n_sold_category_user_week
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,50-74K,Unknown,Unknown,1,None/Unknown,0.035173,3.0,3.596667,4.542495,0.5
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,...,50-74K,Unknown,Unknown,1,None/Unknown,0.035173,213.0,1.461549,2.535227,35.5


#### 6) Фичи по item_id
*Цена  
*Количество покупок в неделю

In [88]:
# Цена рассчитывается как 
data_department.loc[data_department['sales_value'] == 0, 'sales_value'].count()

1440

In [89]:
# 1) Цена

items_sales = data_department.groupby(ITEM_COL)[['sales_value', 'quantity']].sum().reset_index()
items_sales['price'] = items_sales['sales_value'] / items_sales['quantity']
items_sales['price'].fillna(0, inplace=True)


# 2) Количество покупок в неделю
items_sales['quantity_per_week'] = items_sales['quantity'] / n_weeks

In [90]:
items_sales.head(2)

Unnamed: 0,item_id,sales_value,quantity,price,quantity_per_week
0,28116,0.33,1,0.33,0.166667
1,28117,0.34,1,0.34,0.166667


In [91]:
df_train_lvl2 = df_train_lvl2.merge(items_sales[[ITEM_COL,'price', 'quantity_per_week']],
                                        on=ITEM_COL, how='left')

In [92]:
# если нет sales_value, т е нет цены
df_train_lvl2['Missing price'] = 0
df_train_lvl2.loc[df_train_lvl2['price'].isna(), 'Missing price'] = 1
df_train_lvl2['price'].fillna(0, inplace=True)

df_train_lvl2['Missing quantity per week'] = 0
df_train_lvl2.loc[df_train_lvl2['quantity_per_week'].isna(), 'Missing quantity per week'] = 1
df_train_lvl2['quantity_per_week'].fillna(0, inplace=True)

df_train_lvl2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,kid_category_desc,avg_transaction,n_sold_category,avg_users_transaction_by_category,common_mean_sales_value_by_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,None/Unknown,0.035173,3.0,3.596667,4.542495,0.5,3.99,0.833333,0,0
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,...,None/Unknown,0.035173,213.0,1.461549,2.535227,35.5,10.99,0.166667,0,0


### Разбираем на X_train и y_train и обучаем модель 2 уровня

In [93]:
X_train = df_train_lvl2.drop('target', axis=1)
y_train = df_train_lvl2[['target']]

In [94]:
X_train.head(2)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,kid_category_desc,avg_transaction,n_sold_category,avg_users_transaction_by_category,common_mean_sales_value_by_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week
0,2070,1105426,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,...,None/Unknown,0.035173,3.0,3.596667,4.542495,0.5,3.99,0.833333,0,0
1,2070,1097350,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,...,None/Unknown,0.035173,213.0,1.461549,2.535227,35.5,10.99,0.166667,0,0


In [95]:
X_train.dtypes

user_id                                  int64
item_id                                  int64
manufacturer                             int64
department                              object
brand                                   object
commodity_desc                          object
sub_commodity_desc                      object
curr_size_of_product                    object
age_desc                                object
marital_status_code                     object
income_desc                             object
homeowner_desc                          object
hh_comp_desc                            object
household_size_desc                     object
kid_category_desc                       object
avg_transaction                        float64
n_sold_category                        float64
avg_users_transaction_by_category      float64
common_mean_sales_value_by_category    float64
n_sold_category_user_week              float64
price                                  float64
quantity_per_

In [96]:
cat_feats = X_train.columns[2:15].tolist()

for col in cat_feats:
    X_train[col].fillna(0, inplace = True)
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [97]:
X_train.dtypes

user_id                                   int64
item_id                                   int64
manufacturer                           category
department                             category
brand                                  category
commodity_desc                         category
sub_commodity_desc                     category
curr_size_of_product                   category
age_desc                               category
marital_status_code                    category
income_desc                            category
homeowner_desc                         category
hh_comp_desc                           category
household_size_desc                    category
kid_category_desc                      category
avg_transaction                         float64
n_sold_category                         float64
avg_users_transaction_by_category       float64
common_mean_sales_value_by_category     float64
n_sold_category_user_week               float64
price                                   

### Обучение модели ранжирования

In [98]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=13,
                     n_estimators=900,
                     learning_rate=0.1,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  return f(*args, **kwargs)


In [99]:
df_lvl2_predict = df_train_lvl2.copy()
df_lvl2_predict['proba_item_purchase'] = train_preds[:,1]

In [100]:
df_lvl2_predict.loc[df_lvl2_predict['user_id']==2070].sort_values('proba_item_purchase', ascending=False)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,avg_transaction,n_sold_category,avg_users_transaction_by_category,common_mean_sales_value_by_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,proba_item_purchase
8,2070,1092937,1.0,1089,MEAT-PCKGD,National,LUNCHMEAT,BOLOGNA,16OZ,45-54,...,0.035173,11.0,3.426364,3.830433,1.833333,2.552857,5.833333,0,0,7.405992e-01
32,2070,1099905,1.0,830,GROCERY,National,HISPANIC,MEXICAN SAUCESSALSAPICANTEE,16 OZ,45-54,...,0.035173,213.0,1.461549,2.535227,35.500000,2.219412,2.833333,0,0,5.973791e-01
54,2070,1072917,1.0,5798,GROCERY,National,FROZEN BREAD/DOUGH,FRZN DINNER ROLLS,11 OZ,45-54,...,0.035173,213.0,1.461549,2.535227,35.500000,2.642917,4.000000,0,0,5.621694e-01
69,2070,1085357,1.0,69,MEAT-PCKGD,Private,FROZEN MEAT,FRZN BURGERS/BBQ/MEATBALL,38 OZ,45-54,...,0.035173,11.0,3.426364,3.830433,1.833333,6.748333,2.000000,0,0,5.027693e-01
68,2070,1067779,0.0,69,MEAT-PCKGD,Private,LUNCHMEAT,HAM,1 LB,45-54,...,0.035173,11.0,3.426364,3.830433,1.833333,3.060000,3.833333,0,0,2.722273e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,2070,5564067,0.0,2,DRUG GM,National,TICKETS,TICKETS,,45-54,...,0.035173,31.0,2.443871,3.984805,5.166667,0.000000,0.000000,1,1,1.640242e-05
61,2070,1052752,0.0,1156,DRUG GM,National,SHAVING CARE PRODUCTS,RAZORS AND BLADES,4 CT,45-54,...,0.035173,31.0,2.443871,3.984805,5.166667,0.000000,0.000000,1,1,1.433042e-05
3,2070,948640,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,...,0.035173,31.0,2.443871,3.984805,5.166667,0.000000,0.000000,1,1,1.253619e-05
60,2070,846502,0.0,693,DRUG GM,National,CANDY - PACKAGED,CANDY BAGS-CHOCOCLATE,12 OZ,45-54,...,0.035173,31.0,2.443871,3.984805,5.166667,0.000000,0.000000,1,1,8.644392e-06


### Evaluation on test dataset - Оценка модели на тестовом датасете

In [101]:
result_eval_lvl2 = data_val_lvl2.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_lvl2.columns=[USER_COL, ACTUAL_COL]
result_eval_lvl2.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."


### Eval matching on test dataset - Сопоставление предсказаний с тестовым набором данных

In [102]:
%%time
result_eval_lvl2['own_rec'] = result_eval_lvl2[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

Wall time: 15.7 s


Вспомним df_lvl2_candidates сет, который был получен own_recommendations на юзерах, набор пользователей мы фиксировали и он одинаков, значи и прогноз одинаков, поэтому мы можем использовать этот датафрейм для переранжирования.

In [103]:
# Функция для переранжирования

def rerank(user_id, df, USER_COL='user_id', proba_col_name='proba_item_purchase', N=5):
    return df[df[USER_COL]==user_id].sort_values(proba_col_name, ascending=False).head(N).item_id.tolist()

In [104]:
result_eval_lvl2['reranked_own_rec_lightgbm'] = result_eval_lvl2[USER_COL].apply(lambda user_id: rerank(user_id, df_lvl2_predict))

In [105]:
TOPK_PRECISION = 5
# рассчитаем precision только модели матчинга (1 уровня -  ALS, get_own_recommendations), 
# чтобы понимать влияение ранжирования (2 уровня) на метрики

print(*sorted(calc_precision_at_k(result_eval_lvl2, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec_lightgbm', 0.19258485639686482)
('own_rec', 0.1462140992167092)


In [106]:
#аналошично с recall
print(*sorted(calc_recall_at_k(result_eval_lvl2, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec_lightgbm', 0.0244)
('own_rec', 0.0174)


#### Обучим модель CatBoost, сравним результаты с  LightGBM

In [108]:
ctb = CatBoostClassifier(
                     max_depth=13,
                     n_estimators=500,
                     learning_rate=0.1,
                     cat_features=cat_feats)

ctb.fit(X_train, y_train)

train_preds = ctb.predict_proba(X_train)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.5766216	total: 436ms	remaining: 3m 37s
1:	learn: 0.4875662	total: 923ms	remaining: 3m 49s
2:	learn: 0.4245546	total: 1.31s	remaining: 3m 37s
3:	learn: 0.3653145	total: 2.56s	remaining: 5m 17s
4:	learn: 0.3240832	total: 3.29s	remaining: 5m 25s
5:	learn: 0.2977544	total: 4s	remaining: 5m 29s
6:	learn: 0.2765998	total: 4.69s	remaining: 5m 30s
7:	learn: 0.2536556	total: 10.7s	remaining: 10m 58s
8:	learn: 0.2435505	total: 11.3s	remaining: 10m 19s
9:	learn: 0.2354375	total: 12.5s	remaining: 10m 12s
10:	learn: 0.2225766	total: 15.9s	remaining: 11m 47s
11:	learn: 0.2135484	total: 20.2s	remaining: 13m 39s
12:	learn: 0.2068999	total: 24.1s	remaining: 15m 1s
13:	learn: 0.2014736	total: 28.3s	remaining: 16m 24s
14:	learn: 0.1980192	total: 33.1s	remaining: 17m 51s
15:	learn: 0.1961502	total: 33.8s	remaining: 17m 3s
16:	learn: 0.1932639	total: 38.2s	remaining: 18m 4s
17:	learn: 0.1909256	total: 42.6s	remaining: 19m 1s
18:	learn: 0.1886758	total: 46.3s	remaining: 19m 31s
19:	learn: 0.1870

153:	learn: 0.1297187	total: 10m 8s	remaining: 22m 46s
154:	learn: 0.1292026	total: 10m 13s	remaining: 22m 45s
155:	learn: 0.1284151	total: 10m 18s	remaining: 22m 43s
156:	learn: 0.1277590	total: 10m 23s	remaining: 22m 43s
157:	learn: 0.1271227	total: 10m 28s	remaining: 22m 39s
158:	learn: 0.1267590	total: 10m 31s	remaining: 22m 35s
159:	learn: 0.1258954	total: 10m 38s	remaining: 22m 35s
160:	learn: 0.1255691	total: 10m 43s	remaining: 22m 34s
161:	learn: 0.1252660	total: 10m 48s	remaining: 22m 32s
162:	learn: 0.1249151	total: 10m 52s	remaining: 22m 28s
163:	learn: 0.1245802	total: 10m 56s	remaining: 22m 25s
164:	learn: 0.1242953	total: 11m 1s	remaining: 22m 22s
165:	learn: 0.1239979	total: 11m 5s	remaining: 22m 19s
166:	learn: 0.1237444	total: 11m 9s	remaining: 22m 15s
167:	learn: 0.1233968	total: 11m 14s	remaining: 22m 12s
168:	learn: 0.1232270	total: 11m 18s	remaining: 22m 8s
169:	learn: 0.1229294	total: 11m 22s	remaining: 22m 4s
170:	learn: 0.1225046	total: 11m 27s	remaining: 22m 2s

301:	learn: 0.0883338	total: 21m 26s	remaining: 14m 3s
302:	learn: 0.0881861	total: 21m 30s	remaining: 13m 59s
303:	learn: 0.0877937	total: 21m 35s	remaining: 13m 55s
304:	learn: 0.0877350	total: 21m 39s	remaining: 13m 51s
305:	learn: 0.0875628	total: 21m 43s	remaining: 13m 46s
306:	learn: 0.0873650	total: 21m 48s	remaining: 13m 42s
307:	learn: 0.0873119	total: 21m 52s	remaining: 13m 38s
308:	learn: 0.0868547	total: 21m 57s	remaining: 13m 34s
309:	learn: 0.0865989	total: 22m 1s	remaining: 13m 30s
310:	learn: 0.0863310	total: 22m 6s	remaining: 13m 26s
311:	learn: 0.0862295	total: 22m 11s	remaining: 13m 22s
312:	learn: 0.0860688	total: 22m 16s	remaining: 13m 18s
313:	learn: 0.0859218	total: 22m 21s	remaining: 13m 14s
314:	learn: 0.0856598	total: 22m 26s	remaining: 13m 10s
315:	learn: 0.0854515	total: 22m 30s	remaining: 13m 6s
316:	learn: 0.0852055	total: 22m 34s	remaining: 13m 2s
317:	learn: 0.0845685	total: 22m 40s	remaining: 12m 58s
318:	learn: 0.0845222	total: 22m 45s	remaining: 12m 5

451:	learn: 0.0603375	total: 33m 8s	remaining: 3m 31s
452:	learn: 0.0602909	total: 33m 12s	remaining: 3m 26s
453:	learn: 0.0601825	total: 33m 17s	remaining: 3m 22s
454:	learn: 0.0600520	total: 33m 22s	remaining: 3m 18s
455:	learn: 0.0600310	total: 33m 27s	remaining: 3m 13s
456:	learn: 0.0599920	total: 33m 31s	remaining: 3m 9s
457:	learn: 0.0598544	total: 33m 36s	remaining: 3m 4s
458:	learn: 0.0597741	total: 33m 41s	remaining: 3m
459:	learn: 0.0596717	total: 33m 46s	remaining: 2m 56s
460:	learn: 0.0595124	total: 33m 50s	remaining: 2m 51s
461:	learn: 0.0591645	total: 33m 55s	remaining: 2m 47s
462:	learn: 0.0590376	total: 34m	remaining: 2m 43s
463:	learn: 0.0589707	total: 34m 4s	remaining: 2m 38s
464:	learn: 0.0588434	total: 34m 9s	remaining: 2m 34s
465:	learn: 0.0585950	total: 34m 13s	remaining: 2m 29s
466:	learn: 0.0584670	total: 34m 17s	remaining: 2m 25s
467:	learn: 0.0581018	total: 34m 22s	remaining: 2m 21s
468:	learn: 0.0578565	total: 34m 26s	remaining: 2m 16s
469:	learn: 0.0577378	t

In [111]:
df_lvl2_predict = df_train_lvl2.copy()
df_lvl2_predict['proba_item_purchase'] = train_preds[:,1]

In [110]:
result_eval_lvl2['reranked_own_rec_catb'] = result_eval_lvl2[USER_COL].apply(lambda user_id: rerank(user_id, df_lvl2_predict))

In [112]:
print(*sorted(calc_precision_at_k(result_eval_lvl2, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec_catb', 0.19665796344647327)
('reranked_own_rec_lightgbm', 0.19258485639686482)
('own_rec', 0.1462140992167092)


Метрика показывает, что модель второго уровня оказалось значительно лучше, чем отбор товаров на основе модели первого уровня без дополнительного ранжирования.  
Важно принять во внимание, что я не делала преобработку категориальных признаков. Возможно, это дало бы результат выше.