# Course project


- Целевая метрика precision@5


**!! Мы не рассматриваем холодный старт для пользователя, все наши пользователя одинаковы во всех сетах, поэтому нужно позаботиться об их исключении из теста.**

# Import libs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender


pd.pandas.set_option('display.max_columns', None)
import warnings
warnings.simplefilter('ignore')

## Read data

In [2]:
PATH_DATA = "C:/Users/yana/RS_course_project"

In [100]:
data = pd.read_csv(os.path.join(PATH_DATA,'retail_train.csv'))
item_features = pd.read_csv(os.path.join(PATH_DATA,'product.csv'))
user_features = pd.read_csv(os.path.join(PATH_DATA,'hh_demographic.csv'))

In [101]:
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [102]:
item_features.head(2)

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [103]:
user_features.head(2)

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


# Set global const

In [104]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'


N_PREDICT = 30

In [105]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train & eval

In [106]:
# последний месяц оставляем для валидации
val_size_weeks = 4

# для обучения модели 1го уровня
data_train_matcher = data[data['week_no'] < data['week_no'].max() - val_size_weeks]
# для валидации модели 1го уровня
data_val_matcher = data[data['week_no'] >= data['week_no'].max() - val_size_weeks]
# для обученя модели 2го уровня
data_train_ranker = data_val_matcher.copy()

In [107]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [108]:
print_stats_data(data,'full dataset')
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')

full dataset
Shape: (2396804, 12) Users: 2499 Items: 89051
train_matcher
Shape: (2251517, 12) Users: 2499 Items: 86343
val_matcher
Shape: (145287, 12) Users: 2101 Items: 26561
train_ranker
Shape: (145287, 12) Users: 2101 Items: 26561


# Prefilter items

Критерии фильтрации:

- 83% продаж составляет 1 товар, 98% продаж - всего 10 товаров из 89051

In [12]:
popular_items = data.groupby('item_id')['quantity'].sum().rename('n_sold').reset_index()
popular_items['n_sold'] = popular_items['n_sold']/popular_items['n_sold'].sum()*100
popular_items = popular_items.sort_values('n_sold')
popular_items.tail(5)

Unnamed: 0,item_id,n_sold
44111,1404121,0.683818
56341,6544236,1.071972
56228,6534166,5.38132
56193,6533889,7.029342
56233,6534178,83.000361


- топ-10 самых продаваемых товаров покупаются наибольшим числом разных пользователей (корреляция 93.9%). Но с расширением ассортимента корреляция падает.

In [13]:
unique_users = data.groupby('item_id')['user_id'].nunique().rename('unique_users').reset_index()
unique_users['unique_users'] = unique_users['unique_users'] /unique_users['unique_users'].sum()*100
unique_users =unique_users.sort_values('unique_users')
unique_users.tail(5)

Unnamed: 0,item_id,unique_users
8056,840361,0.100352
29195,1029743,0.10233
25333,995242,0.108112
23846,981760,0.120514
35054,1082185,0.155131


In [14]:
print(np.corrcoef(unique_users['unique_users'], popular_items['n_sold'])[0][1])
print(np.corrcoef(unique_users['unique_users'].tail(1000), popular_items['n_sold'].tail(1000))[0][1])
print(np.corrcoef(unique_users['unique_users'].tail(100), popular_items['n_sold'].tail(100))[0][1])
print(np.corrcoef(unique_users['unique_users'].tail(10), popular_items['n_sold'].tail(10))[0][1])

0.167099848329318
0.2901786425984415
0.48510541043281313
0.9389651031592855


 - средний пользователь покупает широкий ассортимент товаров (427), часто похожих друг на друга, но пользователей много, у них различные предпочтения, поэтому если выбрать небольшое кол-во топовых товаров для рекомендаций, отразить предпочтения большинства пользователей не получится

In [15]:
data.groupby('user_id')['item_id'].nunique().median()

427.0

Возьмем 15% самых популярных товаров за последний год - 10339 наименований 

In [109]:
n_items_before = data_train_matcher['item_id'].nunique()
data_train_matcher = prefilter_items(data_train_matcher, 0.85)
n_items_after = data_train_matcher['item_id'].nunique()

print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86343 to 10339


# Make cold-start to warm-start

In [110]:
common_users = data_train_matcher.user_id.values
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_matcher,'val_matcher')

train_matcher
Shape: (2251517, 12) Users: 2499 Items: 10339
train_ranker
Shape: (145287, 12) Users: 2101 Items: 26561
val_matcher
Shape: (145287, 12) Users: 2101 Items: 26561


# Init/train recommender

In [18]:
recommender = MainRecommender(data_train_matcher)



  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

In [111]:
# покупки пользователей в валидационный период
valid_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
valid_eval_matcher.columns=[USER_COL, ACTUAL_COL]
valid_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[829563, 830156, 832990, 840361, 856942, 87157..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [20]:
%%time
# если при обучении модели подать взвешенную матрицу, а при выдаче рекомендаций невзвешенную, то выдача рекомендаций происходит 
# очень долго, но результат значительно лучше. Поэтому в целях экономии времени выдача рекомендаций по дополнительным соседям
# сделана только для лучшей модели - tfidf
valid_eval_matcher['cosine_rec'] = valid_eval_matcher[USER_COL].apply(lambda x: recommender.get_cosine_recommendations(x, N=N_PREDICT))
valid_eval_matcher['tfidf_rec'] = valid_eval_matcher[USER_COL].apply(lambda x: recommender.get_tfidf_recommendations(x, N=N_PREDICT))
valid_eval_matcher['bm25_rec'] = valid_eval_matcher[USER_COL].apply(lambda x: recommender.get_bm25_recommendations(x, N=N_PREDICT))
valid_eval_matcher['own_rec'] = valid_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
valid_eval_matcher['als_rec'] = valid_eval_matcher[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

Wall time: 2h 20min 32s


## Intermediate evaluation (1st-level model):

In [21]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()
        
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

### Recall@50 of matching

In [22]:
TOPK_RECALL = N_PREDICT

In [23]:
sorted(calc_recall(valid_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

[('tfidf_rec', 0.144055556676992),
 ('cosine_rec', 0.14301375356820242),
 ('bm25_rec', 0.13822675015069052),
 ('own_rec', 0.09651106619871015),
 ('als_rec', 0.09478073674355386)]

### Precision@5 of matching

In [24]:
TOPK_PRECISION = 5

In [25]:
sorted(calc_precision(valid_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('tfidf_rec', 0.4303664921465969),
 ('cosine_rec', 0.4303664921465966),
 ('bm25_rec', 0.4241789623988579),
 ('als_rec', 0.26796763445978),
 ('own_rec', 0.21018562589242965)]

Лучший результат у модели TFIDF, он будет использован для обучения модели 2го уровня. Использование результатов нескольких моделей негативно сказалось на целевой метрике.

In [26]:
valid_eval_matcher['final'] = valid_eval_matcher['tfidf_rec']

# Ranking part

### Обучаем модель 2-ого уровня на выбранных кандидатах


## Подготовка данных для трейна

In [27]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [28]:
df_match_candidates = df_match_candidates.merge(valid_eval_matcher[['user_id','final']], on = 'user_id', how = 'left')

In [29]:
# разворачиваем товары
df_items = df_match_candidates.apply(lambda x: pd.Series(x['final']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [30]:
df_match_candidates = df_match_candidates.drop('final', axis=1).join(df_items)

In [99]:
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2445,995242
0,2445,879755
0,2445,1107553
0,2445,908531


In [32]:
df_match_candidates.to_csv(os.path.join(PATH_DATA,'df_match_candidates.csv'), index=False, encoding='utf-8', sep=',')

In [19]:
#df_match_candidates = pd.read_csv(os.path.join(PATH_DATA,'df_match_candidates.csv'))

### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [112]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

df_ranker_train['target'].fillna(0, inplace= True)

In [113]:
df_ranker_train.target.value_counts()

0.0    47752
1.0    15278
Name: target, dtype: int64

In [114]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target
0,2445,995242,1.0
1,2445,879755,0.0


## Подготавливаем фичи для обучения модели

In [115]:
def tranform(item_features, user_features, data, df_ranker_train): 
    item_features =item_features.drop('manufacturer', axis=1)
    
    user_features['age_desc'] = user_features['age_desc'].replace(
        {'65+': 5, '45-54': 3, '25-34': 1, '35-44': 2, '19-24': 0, '55-64': 4})
    user_features['age_desc'] = user_features['age_desc'].astype('int')
    user_features['income_desc'] = user_features['income_desc'].replace(
        {'35-49K': (35+49)/2/52, # вероятно, прибыль дана за год, посчитаем прибыль за неделю 
         '50-74K': (50+74)/2/52,
         '25-34K': (25+34)/2/52,
         '75-99K': (75+99)/2/52,
         'Under 15K': 15/52,
         '100-124K': (100+124)/2/52,
         '15-24K': (15+24)/2/52,
         '125-149K': (125+149)/2/52,
         '150-174K': (150+174)/2/52,
         '250K+': 250/52,
         '175-199K': (175+199)/2/52,
         '200-249K': (200+249)/2/52})
    user_features.loc[user_features['kid_category_desc']=='3+', 'kid_category_desc'] = 3
    user_features.loc[user_features['kid_category_desc']=='None/Unknown', 'kid_category_desc'] = 0
    user_features['kid_category_desc'] = user_features['kid_category_desc'].astype('int')
    
    df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
    df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')
    data['price'] = data['sales_value'] / data['quantity']
    df_features = data.merge(item_features, on='item_id', how='left')
    
    df_ranker_train = df_ranker_train.merge(data.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)
    df_ranker_train = df_ranker_train.merge(data.groupby(by=[USER_COL, ITEM_COL]).agg('quantity').sum().rename('user_quantity_value'), how='left',on=[USER_COL, ITEM_COL])
    df_ranker_train['user_quantity_value'].fillna(0, inplace= True)
    df_ranker_train['q-ty_ratio'] = df_ranker_train['user_quantity_value'] / df_ranker_train['total_quantity_value']
    df_ranker_train = df_ranker_train.merge(data.groupby(by=[ITEM_COL, USER_COL]).agg('sales_value').sum().rename('user_sales_value')/data.week_no.nunique(), how='left',on=[ITEM_COL, USER_COL])
    df_ranker_train = df_ranker_train.merge(data.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_sales_value')/data.week_no.nunique(), how='left',on=ITEM_COL)
    df_ranker_train['user_sales_value'].fillna(0, inplace= True)
    df_ranker_train['sales_value_ratio'] = df_ranker_train['user_sales_value'] / df_ranker_train['total_sales_value']
    df_ranker_train = df_ranker_train.merge(data.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)
    
    df_ranker_train = df_ranker_train.merge(data.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/data.week_no.nunique(), how='left',on=USER_COL)
    df_ranker_train = df_ranker_train.merge(data.groupby('item_id')['price'].mean(), on='item_id', how='left')
    df_ranker_train = df_ranker_train.rename(columns={'price':'item_price'})
    df_ranker_train['val_per_week'] = df_ranker_train['item_price'] * df_ranker_train['user_quantity_per_week']
    df_ranker_train = df_ranker_train.merge(df_ranker_train.groupby('department')['item_price'].mean().rename('dept_avg_price'), on='department', how='left')
    df_ranker_train['price_ratio'] = df_ranker_train['item_price'] / df_ranker_train['dept_avg_price']
    df_ranker_train['avg_price_minus_item_price'] = df_ranker_train['dept_avg_price'] - df_ranker_train['item_price']

    df_ranker_train = df_ranker_train.merge(df_features.groupby(['user_id', 'department'])['quantity'].sum().rename('q-ty_per_cat').reset_index(), on=['user_id', 'department'], how='left')
    df_ranker_train = df_ranker_train.merge(df_features.groupby('department')['quantity'].sum().rename('total_q-ty_cat'), on='department', how='left')

    df_ranker_train = df_ranker_train.merge(data.groupby(['user_id']).sales_value.sum().rename('user_sales_value')/data.week_no.nunique(), on='user_id', how='left')
    df_ranker_train= df_ranker_train.merge(data.groupby(['user_id']).basket_id.count().rename('user_baskets')/data.week_no.nunique(), on='user_id', how='left')
    df_ranker_train['avg_basket_sum'] = df_ranker_train['user_sales_value_y']/df_ranker_train['user_baskets']

    df_ranker_train= df_ranker_train.merge(data.groupby('item_id')['retail_disc'].mean(),on=ITEM_COL, how='left')
    df_ranker_train= df_ranker_train.merge(data.groupby('user_id')['item_id'].nunique().rename('user_items'),on=USER_COL, how='left')
    df_ranker_train= df_ranker_train.merge(data.groupby('user_id')['day'].nunique().rename('user_days'),on=USER_COL, how='left')
    
    commodities = df_features.groupby(['user_id', 'commodity_desc', 'week_no'])['quantity'].mean().rename('user_commodities_per_week').reset_index()
    commodities = commodities.merge(df_features.groupby(['commodity_desc', 'week_no'])['quantity'].sum().rename('total_commodities_per_week')\
                                    .reset_index(), on=['commodity_desc', 'week_no'], how='left')
    commodities['commodities_per_week'] = commodities['total_commodities_per_week'] / commodities['user_commodities_per_week']
    df_ranker_train = df_ranker_train.merge(commodities.groupby(['user_id', 'commodity_desc'])['commodities_per_week'].mean() \
        .reset_index(), on=['user_id', 'commodity_desc'], how='left')
    
    commodities_price = df_features.groupby(['user_id', 'commodity_desc', 'week_no'])['price'].sum().rename('weekly_price').reset_index()
    commodities_price = commodities_price.merge(df_features.groupby(['commodity_desc', 'week_no'])['price'].sum(). \
                                                rename('weekly_price_total').reset_index(), on=['commodity_desc', 'week_no'], how='left')
    commodities_price['mean_commodity_weekly_price'] =   commodities_price['weekly_price']/ commodities_price['weekly_price_total']                                        
    df_ranker_train = df_ranker_train.merge(commodities_price.groupby(['user_id', 'commodity_desc'])['mean_commodity_weekly_price'].mean()\
                                            .reset_index(), on=['user_id', 'commodity_desc'], how='left')
    
    df_ranker_train['q-ty_percent'] = df_ranker_train['total_quantity_value']/df_ranker_train['total_quantity_value'].sum()*100
    df_ranker_train['user_quantity_percent'] = df_ranker_train['user_quantity_value']/df_ranker_train['user_quantity_value'].sum()*100
    unique_users = data.groupby('item_id')['user_id'].nunique().rename('unique_users').reset_index()
    unique_users['unique_users'] = unique_users['unique_users']/unique_users['unique_users'].sum()*100
    df_ranker_train= df_ranker_train.merge(unique_users, on=ITEM_COL, how='left')
    user_items = data.groupby('user_id')['quantity'].sum().rename('n_sold').reset_index()
    user_items['n_sold'] = user_items['n_sold']/user_items['n_sold'].sum()*100
    df_ranker_train= df_ranker_train.merge(user_items, on=USER_COL, how='left')
    
    return df_ranker_train  
    


In [116]:
df_ranker_train = tranform(item_features, user_features, data, df_ranker_train)
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,total_quantity_value,user_quantity_value,q-ty_ratio,user_sales_value_x,total_sales_value,sales_value_ratio,item_freq,user_quantity_per_week,item_price,val_per_week,dept_avg_price,price_ratio,avg_price_minus_item_price,q-ty_per_cat,total_q-ty_cat,user_sales_value_y,user_baskets,avg_basket_sum,retail_disc,user_items,user_days,commodities_per_week,mean_commodity_weekly_price,q-ty_percent,user_quantity_percent,unique_users,n_sold
0,2445,995242,1.0,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,,3.0,U,0.807692,Unknown,Unknown,1,0.0,19706,86.0,0.004364,0.931579,251.523789,0.003704,11397,2113.431579,1.301258,2750.11986,inf,0.0,inf,2544.0,2025562,65.145579,25.326316,2.572249,-1.153365,1559,154,760.205284,0.001945,1.2e-05,4e-05,0.108112,0.083454
1,2445,879755,0.0,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,3.0,U,0.807692,Unknown,Unknown,1,0.0,2167,25.0,0.011537,0.287684,24.201158,0.011887,1771,2113.431579,1.063321,2247.256316,inf,0.0,inf,2544.0,2025562,65.145579,25.326316,2.572249,-0.280045,1559,154,1319.10663,0.002077,1e-06,1.2e-05,0.033628,0.083454


Также в качестве дополнительных факторов добавим item_factors и user_factors из ALS. Поскольку в предыдущий MainRecommender попали не все товары, а только отфильтрованные, обучим еще один рекоммендер на валидационном датасете 

In [117]:
recommender_valid = MainRecommender(data_val_matcher)

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/26561 [00:00<?, ?it/s]

  0%|          | 0/26561 [00:00<?, ?it/s]

  0%|          | 0/26561 [00:00<?, ?it/s]

  0%|          | 0/26561 [00:00<?, ?it/s]

In [118]:
item_factors_valid = pd.DataFrame(recommender_valid.model.item_factors[:, :30])
item_factors_valid.reset_index(inplace=True)
item_factors_valid['item_id'] = item_factors_valid['index'].apply(lambda x: recommender_valid.id_to_itemid[x])
item_factors_valid = item_factors_valid.drop('index', axis=1)
#факторы из предыдущего рекоммендера
item_factors_train = pd.DataFrame(recommender.model.item_factors[:, :30])
item_factors_train.reset_index(inplace=True)
item_factors_train['item_id'] = item_factors_train['index'].apply(lambda x: recommender.id_to_itemid[x])
item_factors_train = item_factors_train.drop('index', axis=1)
# добавим ембеддинги тех товаров, которых нет в валиде, из предыдущего рекоммендера
item_factors = item_factors_valid.append(item_factors_train[~item_factors_train['item_id'].isin(np.unique(item_factors_valid['item_id']))])

In [119]:
user_factors_valid = pd.DataFrame(recommender_valid.model.user_factors[:, :30])
user_factors_valid.reset_index(inplace=True)
user_factors_valid['user_id'] = user_factors_valid['index'].apply(lambda x: recommender_valid.id_to_userid[x])
user_factors_valid = user_factors_valid.drop('index', axis=1)

In [120]:
df_ranker_train = df_ranker_train.merge(item_factors, on=ITEM_COL, how='left')
df_ranker_train = df_ranker_train.merge(user_factors_valid, on=USER_COL, how='left')

In [66]:
df_ranker_train.isnull().sum()[1:35]

item_id                           0
target                            0
department                        0
brand                             0
commodity_desc                    0
sub_commodity_desc                0
curr_size_of_product              0
age_desc                      39870
marital_status_code           39870
income_desc                   39870
homeowner_desc                39870
hh_comp_desc                  39870
household_size_desc           39870
kid_category_desc             39870
total_quantity_value              0
user_quantity_value               0
q-ty_ratio                        0
user_sales_value_x                0
total_sales_value                 0
sales_value_ratio                13
item_freq                         0
user_quantity_per_week            0
item_price                        0
val_per_week                      0
dept_avg_price                    0
price_ratio                    3455
avg_price_minus_item_price     3455
q-ty_per_cat                

Много пропусков, в основном, из-за нехватки информации о пользователях user_features. Пыталась заполнять недотающую информацию данными ближайшего соседа (соседей) из als similar users, но улучшения результата это не дало

In [54]:
# НЕ СРАБОТАЛО
known_users = np.unique(user_features['user_id'])
all_users= np.unique(df_ranker_train['user_id'])
unknown_users = np.setdiff1d(all_users, known_users)

# ищем хотя бы 1 самого похожего пользователя. Если о нем известна личная информация, добавляем ту же информацию
# для нашего пользователя. Можно искать нескольких "соседей".
similar_users = [recommender.model.similar_users(recommender.userid_to_id[el], N=2) for el in unknown_users]
for i in range(len(similar_users)):
    users_i = [recommender.id_to_userid[rec[0]] for rec in similar_users[i]]
    for user in users_i:
        if user in known_users:
            user_features = user_features.append(user_features.loc[user_features['user_id']==user])
            user_features.tail(1)['user_id'] = unknown_users[i]
            break

In [121]:
X_train = df_ranker_train.drop(['target'], axis=1)
y_train = df_ranker_train['target']

In [122]:
cat_feats = ['user_id', 'item_id',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
  'marital_status_code',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc']
X_train[cat_feats] = X_train[cat_feats].astype('category')

## Обучение модели ранжирования

In [123]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=10,
                     n_estimators=89,
                     num_leaves = 100,
                     learning_rate=0.1,
                     scale_pos_weight = 2.98,
                     reg_lambda = 0.1,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

In [124]:
df_ranker_predict = df_ranker_train.copy()

In [125]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

# Evaluation on test dataset

In [126]:
test = pd.read_csv(os.path.join(PATH_DATA,'retail_test1.csv'))

#warm_start
common_users = df_match_candidates.user_id.values
test = test[test.user_id.isin(common_users)]

print_stats_data(df_match_candidates, 'match_candidates')
print_stats_data(test, 'test')

match_candidates
Shape: (63030, 2) Users: 2101 Items: 8502
test
Shape: (86369, 12) Users: 1761 Items: 20272


In [127]:
result_eval_ranker = test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(3)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,3,"[827683, 908531, 989069, 1071377, 1080155, 109..."
2,6,"[956902, 960791, 1037863, 1119051, 1137688, 84..."


## Eval matching on test dataset

In [135]:
%%time
#precision модели 1го уровня
result_eval_ranker['tfidf_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_tfidf_recommendations(x, N=N_PREDICT))

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

  0%|          | 0/10339 [00:00<?, ?it/s]

Wall time: 24min 2s


## Eval re-ranked matched result on test dataset

    

In [50]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).item_id.tolist()

In [128]:
result_eval_ranker['reranked_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [136]:
print('Precision@k on test dataset:')
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

Precision@k on test dataset:
('reranked_rec', 0.4136286201022153)
('tfidf_rec', 0.38693923906871086)


In [137]:
result_eval_ranker[['user_id', 'reranked_rec']].to_csv(os.path.join(PATH_DATA,'recommendations.csv'), index=False, encoding='utf-8', sep=',')