# Import libs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.style.use('ggplot')
plt.rcParams['font.family'] = 'Times New Roman'

import warnings
warnings.filterwarnings('ignore')

from scipy.sparse import csr_matrix
from scipy.stats import mode

from implicit import als

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from metrics import precision, recall
from utils import prefilter_items
from recommenders import MainRecommender

## Read data

In [2]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

In [3]:
display(data.head(2), item_features.head(2), user_features.head(2))

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


# Process features dataset

In [4]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'

In [5]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

In [6]:
display(data.head(2), item_features.head(2), user_features.head(2))

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


# Split dataset for train, eval, test

In [7]:
# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)

VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [8]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [9]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [10]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


In [11]:
# выше видим разброс по пользователям и товарам

In [12]:
data_train_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


# Prefilter items

In [13]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(
    data_train_matcher, group_col='item_id', popular_col='quantity', top_popular_filter_choose=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Отфильтровано 763420 записей
Decreased # items from 83685 to 5001


# Make cold-start to warm-start

In [14]:
# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 5001
val_matcher
Shape: (169707, 12) Users: 2153 Items: 27649
train_ranker
Shape: (169707, 12) Users: 2153 Items: 27649
val_ranker
Shape: (118303, 12) Users: 2041 Items: 24326


# Init/train recommender

In [15]:
recommender = MainRecommender(verbose=False)
recommender.fit(data_train_matcher)



<implicit.als.AlternatingLeastSquares at 0x1c99f58b3d0>

### Варианты, как получить кандидатов

Можно потом все эти варианты соединить в один

(!) Если модель рекомендует < N товаров, то рекомендации дополняются топ-популярными товарами до N

In [16]:
# Берем тестового юзера 2375

In [17]:
recommender.get_als_recommendations(2375, N=5)

[899624, 845208, 1004906, 1079023, 904360]

In [18]:
recommender.get_own_recommendations(2375, N=5)

[1036501, 1079023, 1085983, 907099, 910439]

In [19]:
recommender.get_similar_items_recommendation(2375, N=5)

[1008012, 889731, 1082185, 1046545, 981760]

In [20]:
recommender.get_similar_users_recommendation(2375, N=5)

[1076580, 1021133, 1135044, 982989, 861445]

# Eval recall of matching

In [21]:
ACTUAL_COL = 'actual'

In [22]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [23]:
# N = Neighbors
N_PREDICT = 50 

In [24]:
%%time
# для понятности расписано все в строчку, без функций, ваша задача уметь оборачивать все это в функции
result_eval_matcher['own_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_eval_matcher['sim_item_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_items_recommendation(x, N=50))

Wall time: 10min 20s


*******************
## TEST зависимости recall@k от k

In [25]:
n_test_list = [20, 50, 100, 150, 200, 300, 500]
for N in n_test_list:
    result_eval_matcher[f'als_rec_{N}'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=N))
    

In [26]:
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual,own_rec,sim_item_rec,als_rec_20,als_rec_50,als_rec_100,als_rec_150,als_rec_200,als_rec_300,als_rec_500
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[856942, 9297615, 5577022, 1074612, 9655212, 9...","[1027569, 999999, 1029743, 9526410, 5582712, 9...","[1100972, 940631, 9445549, 1127328, 963835, 82...","[1100972, 940631, 9445549, 1127328, 963835, 82...","[1100972, 940631, 9445549, 1127328, 963835, 82...","[1100972, 940631, 9445549, 1127328, 963835, 82...","[1100972, 940631, 9445549, 1127328, 963835, 82...","[1100972, 940631, 9445549, 1127328, 963835, 82...","[1100972, 940631, 9445549, 1127328, 963835, 82..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1076580, 911974, 826784, 1083296, 838136, 820...","[8090537, 1133018, 5569845, 1106523, 985999, 8...","[1025650, 6039687, 1048563, 5569230, 12256522,...","[1025650, 6039687, 1048563, 5569230, 12256522,...","[1025650, 6039687, 1048563, 5569230, 12256522,...","[1025650, 6039687, 1048563, 5569230, 12256522,...","[1025650, 6039687, 1048563, 5569230, 12256522,...","[1025650, 6039687, 1048563, 5569230, 12256522,...","[1025650, 6039687, 1048563, 5569230, 12256522,..."


In [27]:
def calc_recall(df_data, top_k, start=2, stop=None):
    score_dict = {}
    for col_name in df_data.columns[start:stop]:
        score = df_data.apply(lambda row: recall(
            row[col_name], row[ACTUAL_COL], top_k=top_k), axis=1).mean()
        score_dict[col_name] = score
    return score_dict

In [28]:
def calc_precision(df_data, top_k, start=2, stop=None):
    score_dict = {}
    for col_name in df_data.columns[start:stop]:
        score = df_data.apply(lambda row: precision(
            row[col_name], row[ACTUAL_COL], top_k=top_k), axis=1).mean()
        score_dict[col_name] = score
    return score_dict

### Recall@50 of matching

In [29]:
TOPK_RECALL = 50

In [30]:
recall_scores = calc_recall(result_eval_matcher, TOPK_RECALL, stop=4)

In [31]:
for k in n_test_list:
    col_name = f'als_rec_{k}'
    score = result_eval_matcher.apply(lambda row: recall(
            row[col_name], row[ACTUAL_COL], top_k=k), axis=1).mean()
    recall_scores[col_name] = score
  

In [32]:
recall_scores

{'own_rec': 0.10398678912002314,
 'sim_item_rec': 0.08003317656469824,
 'als_rec_20': 0.050936988154033906,
 'als_rec_50': 0.09585770530493445,
 'als_rec_100': 0.14779651399719554,
 'als_rec_150': 0.18221980991620468,
 'als_rec_200': 0.21152662254576557,
 'als_rec_300': 0.2538703708626344,
 'als_rec_500': 0.30826745159923913}

In [33]:
# чем больше N -> тем выше recall(можно было и не тестировать)
# у N на первом этапе нет оптимального значения - выбор зависит от ресурсов и от 2 этапа

### Precision@5 of matching

In [34]:
TOPK_PRECISION = 5

In [35]:
precision_scores = calc_precision(result_eval_matcher, TOPK_PRECISION, stop=5)
precision_scores

{'own_rec': 0.2581514166279593,
 'sim_item_rec': 0.14612169066418787,
 'als_rec_20': 0.14602879702740174}

# Ranking part

## Подготовка данных для трейна

In [36]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [37]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [38]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[834103, 878302, 1119399, 1085604, 13511722, 9..."
1,2021,"[1119454, 1019142, 871279, 835578, 863762, 101..."


In [39]:
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [40]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [41]:
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2070,834103
0,2070,878302
0,2070,1119399
0,2070,1085604


### Check warm start

In [42]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (107650, 2) Users: 2153 Items: 4872


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [43]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')
df_ranker_train['target'].fillna(0, inplace= True)

In [44]:
df_ranker_train.target.value_counts()

0.0    94722
1.0    20773
Name: target, dtype: int64

In [45]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,834103,1.0
1,2070,834103,1.0


## Подготавливаем фичи для обучения модели

In [46]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [47]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [48]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [49]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
2,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
4,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [50]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]
X_train.fillna('unknown', inplace=True)

In [51]:
X_train

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,834103,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,834103,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
2,2070,834103,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,2070,834103,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
4,2070,834103,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115490,1745,948832,1719,MEAT-PCKGD,National,HOT DOGS,BETTER FOR YOU,1 LB,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown
115491,1745,1028891,1225,GROCERY,National,FLUID MILK PRODUCTS,REFRIGERATED COFFEE CREAMERS,32 OZ,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown
115492,1745,1054030,869,GROCERY,National,SOUP,DRY SOUP,2 CT,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown
115493,1745,984669,1251,GROCERY,National,FROZEN BREAD/DOUGH,FRZN GARLIC BREAD/TOAST/STICKS,9.5 OZ,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown


In [52]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')
cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

## Обучение модели ранжирования

In [53]:
ctb = CatBoostClassifier(max_depth=6,
                         n_estimators=500,
                         learning_rate=0.1,
                         cat_features = cat_feats,
                         silent=True,
                         random_state=13)

ctb.fit(X_train, y_train)

train_preds = ctb.predict_proba(X_train)

In [54]:
df_ranker_predict = df_ranker_train.copy()

In [55]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

# Evaluation on test dataset

In [56]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


## Eval matching on test dataset

In [67]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

Wall time: 9min 30s


## Eval re-ranked matched result on test dataset
    Вспомним df_match_candidates сет, который был получен own_recommendations на юзерах, набор пользователей мы фиксировали и он одинаков, значи и прогноз одинаков, поэтому мы можем использовать этот датафрейм для переранжирования.
    

In [68]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [59]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [70]:
class FeaturesEng():

    def __init__(self):
        self.full_data = None
        
        self.user_quantity_sum = None
        self.user_sales_value_sum = None
        self.user_total_sales_value_sum = None
        self.item_quantity_sum = None
        self.user_item_quantity_sum = None
        self.user_department_quantity_sum = None
        

    def fit(self,
            data: pd.DataFrame,
            item_features: pd.DataFrame,
            user_features: pd.DataFrame):

        self.full_data = data.merge(item_features, how='left', on='item_id')
        self.full_data = self.full_data.merge(
            user_features, how='left', on='user_id')
        self.full_data['total_sales_value'] = self.full_data['quantity'] * \
            self.full_data['sales_value']
        
        # насколько часто юзер покупал, как много потратил
        self.user_quantity_sum = self.n_value_control(
            self.full_data, 'quantity', group_list=['user_id'], func=4).rename(
            columns={'cor_col': 'user_quantity_sum'})
            
        self.user_sales_value_sum = self.n_value_control(
            self.full_data, 'sales_value', group_list=['user_id'], func=4).rename(
            columns={'cor_col': 'user_sales_value_sum'})
      
        self.user_total_sales_value_sum = self.n_value_control(
            self.full_data, 'total_sales_value', group_list=['user_id'], func=4).rename(
            columns={'cor_col': 'user_total_sales_value_sum'})
        
        self.item_quantity_sum = self.n_value_control(
            self.full_data, 'quantity', group_list=['item_id'], func=4).rename(
            columns={'cor_col': 'item_quantity_sum'})
        
        self.user_item_quantity_sum = self.n_value_control(
            self.full_data, 'quantity', group_list=['user_id','item_id'], func=4).rename(
            columns={'cor_col': 'user_item_quantity_sum'})
        
        self.user_department_quantity_sum = self.n_value_control(
            self.full_data, 'quantity', group_list=['user_id','department'], func=4).rename(
            columns={'cor_col': 'user_department_quantity_sum'})
             
        

    @staticmethod
    def n_value_control(data_f, col_name: str, group_list: list = [], func=1):
        """
        Группировка по условию
        :param df: DataFrame
        :param col_name: название столбца
        :param cor_method: способ обработки (1 - медиана, 2 - мода, 3 - среднее, 4 - сумма)
        :return: DataFrame group by condition 
        """
        if func == 1:
            ddf = data_f.groupby(group_list)[col_name].median()
        if func == 2:
            ddf = data_f.groupby(group_list)[col_name].agg(
                lambda x: mode(x)[0])
        if func == 3:
            ddf = data_f.groupby(group_list)[col_name].mean()
        if func == 4:
            ddf = data_f.groupby(group_list)[col_name].sum()
        ddf = pd.DataFrame(ddf)
        ddf.rename(columns={col_name: 'cor_col'}, inplace=True)
        return ddf
    
    def transform(self, train_df):
        new_train_df = train_df.copy()
        
        user_features = [self.user_quantity_sum, self.user_sales_value_sum, self.user_total_sales_value_sum]
        for df in user_features:
            new_train_df = new_train_df.merge(df, how='left', on='user_id')
        new_train_df[['user_sales_value_sum', 'user_quantity_sum', 'user_total_sales_value_sum']].fillna(0, inplace=True)
        
        item_features = [self.item_quantity_sum,
                         self.full_data[['item_id', 'sales_value']].drop_duplicates(subset=['item_id'])]
        for df in item_features:
            new_train_df = new_train_df.merge(df, how='left', on='item_id')
#         display(new_train_df.head(2))

        new_train_df = new_train_df.merge(self.user_item_quantity_sum, how='left', on=['user_id','item_id'])
        new_train_df['user_item_quantity_sum'].fillna(0, inplace=True)
        
        new_train_df = new_train_df.merge(self.user_department_quantity_sum, how='left', on=['user_id','department'])
        new_train_df['user_department_quantity_sum'].fillna(0, inplace=True)
    
        return new_train_df

In [71]:
new_features = FeaturesEng()
new_features.fit(data, item_features, user_features)
X_train_feat = new_features.transform(X_train)

In [72]:
ctb = CatBoostClassifier(
                         max_depth=6,
                         n_estimators=500,
                         learning_rate=0.2,
                         cat_features = cat_feats,
                         silent=True,
                         random_state=13)

ctb.fit(X_train_feat, y_train)

train_preds = ctb.predict_proba(X_train_feat)

In [73]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [74]:
result_eval_ranker['reranked_own_rec_feat'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [77]:
precision_scores = calc_precision(result_eval_ranker, TOPK_PRECISION, stop=5)
precision_scores

{'reranked_own_rec': 0.16334203655352283,
 'reranked_own_rec_feat': 0.24208877284594954,
 'own_rec': 0.21215090641842038}