# Импорт библеотек

In [1]:
# проведение пути до собственных модулей
import sys
sys.path.append('../')

In [2]:
# основные модули
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# собственные модулей
from src.utils import prefilter_items
from src.metrics import precision_at_k, recall_at_k, evaluete_rec
from src.myf import reduction_memory
# модель для 1го уровня
from src.recommenders import MainRecommender

# модели для 2го уровня
from lightgbm import LGBMClassifier

# отключение предупреждений
import warnings
warnings.filterwarnings('ignore')

# Загрузка данных

In [3]:
# создание датафреймов
train = pd.read_csv('../data/retail_train.csv')
test = pd.read_csv('../data/retail_test.csv')

# зегрузка фичей
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')

# снижение веса датафлеймов
train, test, item_features, user_features = map(reduction_memory, (train, test, item_features, user_features))

before:		230.09 MB
after:		141.41 MB
reduсed:	88.68 MB
before:		8.52 MB
after:		5.24 MB
reduсed:	3.28 MB
before:		5.17 MB
after:		4.25 MB
reduсed:	0.92 MB
before:		0.05 MB
after:		0.05 MB
reduсed:	0.0 MB


# Редактирование датафреймов

In [4]:
# Снижение регистров столбцов
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

# Переименование столбцов
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

## добавление фичей

In [5]:
# объединение датафреймов
train_ = train.merge(item_features, on='item_id', how='left')
train_ = train.merge(user_features, on='user_id', how='left')

test_ = test.merge(item_features, on='item_id', how='left')
test_ = test.merge(user_features, on='user_id', how='left')

In [6]:
# префильтрация
train_, test_ = map(prefilter_items, (train_, test_))

# Обучение одноуровневой модели

In [7]:
# обучение одноуровневой модели
recommender_ = MainRecommender(train_)



HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5001), HTML(value='')))




In [8]:
# создание фрейма результатов одноуровневой модели
result_ = test_.groupby('user_id')['item_id'].unique().reset_index()
result_.columns = ['user_id', 'actual']

# удаление тех пользователей, на которых модель не обучалась
result_ = result_[result_['user_id'].isin(train_['user_id'])]
result_.head()

Unnamed: 0,user_id,actual
0,1,"[999999, 883616, 931136, 940947, 958046, 96155..."
1,2,"[820165, 820291, 826784, 999999, 857849, 86621..."
2,3,"[827683, 908531, 989069, 1071377, 999999, 1096..."
3,6,"[956902, 960791, 1037863, 1119051, 1137688, 84..."
4,7,"[847270, 855557, 999999, 938700, 954673, 95701..."


## подбор числа кондидатов

In [9]:
# подбор числа кандидатов
N_tuple = 50, 100, 200, 400
metric = 'recall@k'

# фрейм результатов прогноза
recall_frame = pd.DataFrame(
    columns=[metric, 'similar_items', 'als', 'own']
).set_index(metric)

# прогнозирование результатов и добавление во фрейм
for N in N_tuple:
    result_['similar_items'] = result_['user_id'].apply(
        lambda x: recommender_.get_similar_items_recommendation(user=x, N=N)
    )# apply

    result_['als'] = result_['user_id'].apply(
        lambda x: recommender_.get_als_recommendations(user=x, N=N)
    )# apply

    result_['own'] = result_['user_id'].apply(
        lambda x: recommender_.get_own_recommendations(user=x, N=N)
    )# apply
    
    recall_frame.loc[N] = evaluete_rec(data=result_, true='actual', metric=metric, k=N)

recall_frame

Unnamed: 0_level_0,similar_items,als,own
recall@k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
50,0.151166,0.114634,0.149338
100,0.205111,0.171245,0.216577
200,0.278041,0.253827,0.318158
400,0.384909,0.354231,0.449197


- 100 кондидатов на модели __own recommendation__ (основанной на Item Item Recommender) будет достаточно

## прогнозирование результатов на трейне (precision@5)

In [10]:
# подбор числа кандидатов
N = 100
k = 5
metric = 'precision@k'

precision_frame = pd.DataFrame(
    columns=[metric, 'similar_items', 'als', 'own']
).set_index(metric)

# прогнозирование результатов и добавление ление во фрейм
result_['similar_items'] = result_['user_id'].apply(
    lambda x: recommender_.get_similar_items_recommendation(user=x, N=N)
)# apply

result_['als'] = result_['user_id'].apply(
    lambda x: recommender_.get_als_recommendations(user=x, N=N)
)# apply

result_['own'] = result_['user_id'].apply(
    lambda x: recommender_.get_own_recommendations(user=x, N=N)
)# apply
    
precision_frame.loc[k] = evaluete_rec(data=result_, true='actual', metric=metric, k=k)

precision_frame

Unnamed: 0_level_0,similar_items,als,own
precision@k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,0.179936,0.116136,0.185881


## Вывод:

- Видно что лучший точность у __own recommender__ (Item Item Recommender) однако она все же < 0.27. 
- Попробуем двухуровневую модель с фильтрацией товаров на 1м ровне.

In [11]:
# результат прогноза 100 кондидатов
result_.head()

Unnamed: 0,user_id,actual,similar_items,als,own
0,1,"[999999, 883616, 931136, 940947, 958046, 96155...","[9487534, 999999, 1082185, 5582712, 15926844, ...","[1004390, 885290, 960732, 999391, 819518, 1094...","[856942, 9297615, 5577022, 1074612, 8293439, 9..."
1,2,"[820165, 820291, 826784, 999999, 857849, 86621...","[1133018, 1137346, 1106523, 5569845, 985999, 8...","[1004906, 1106523, 5569230, 1033142, 1041259, ...","[1076580, 838136, 911974, 826784, 1007414, 108..."
2,3,"[827683, 908531, 989069, 1071377, 999999, 1096...","[1076875, 1092026, 960318, 1075979, 823704, 11...","[8090521, 5569230, 1110244, 951590, 941741, 96...","[998206, 921345, 1092937, 964594, 13842214, 86..."
3,6,"[956902, 960791, 1037863, 1119051, 1137688, 84...","[999999, 1041259, 904360, 845208, 948650, 1105...","[1082185, 857006, 863632, 999250, 965267, 8789...","[13003092, 1119051, 9911484, 5580166, 8203834,..."
4,7,"[847270, 855557, 999999, 938700, 954673, 95701...","[999999, 1038985, 1015247, 1094955, 967395, 11...","[1130111, 853643, 5981267, 1003188, 1031833, 9...","[845814, 949836, 9338009, 1075524, 840386, 694..."


# Обучение двухуровневой модели

## Обучение 1го уровня модели

In [12]:
# разбиение на тестовые и трейновые фреймы 1го и 2го уровня
weeks = 9

# 1й уровень
train_lvl1 = train_[train_['week_no'] < train_['week_no'].max() - weeks]
test_lvl1 = train_[train_['week_no'] >= train_['week_no'].max() - weeks]

# 2й уровень
train_lvl2 = test_lvl1.copy()

In [13]:
# обучение 1го уровня модели
recommender_lvl1 = MainRecommender(train_lvl1)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




In [14]:
# создание фрейма результатов 1го уровня
result_lvl1 = test_lvl1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl1.columns = ['user_id', 'actual']

# удаление тех пользователей, на которых модель не обучалась
result_lvl1 = result_lvl1[result_lvl1['user_id'].isin(train_lvl1['user_id'])]
result_lvl1.head()

Unnamed: 0,user_id,actual
0,1,"[999999, 865456, 867607, 872137, 878285, 89088..."
1,2,"[999999, 838136, 839656, 861272, 866211, 87391..."
2,3,"[999999, 851057, 872021, 878302, 879948, 90963..."
3,4,"[883932, 970760, 1035676, 1055863, 999999, 677..."
4,6,"[1024306, 1102949, 6548453, 999999, 962568, 97..."


In [15]:
# подбор числа кандидатов
N_tuple = 50, 100, 200, 400
metric = 'recall@k'

# фрейм результатов прогноза
recall_lvl1 = pd.DataFrame(columns=[metric, 'similar_items', 'als', 'own']).set_index(metric)

for N in N_tuple:
    result_lvl1['similar_items'] = result_lvl1['user_id'].apply(
        lambda x: recommender_lvl1.get_similar_items_recommendation(user=x, N=N)
    )# apply

    result_lvl1['als'] = result_lvl1['user_id'].apply(
        lambda x: recommender_lvl1.get_als_recommendations(user=x, N=N)
    )# apply

    result_lvl1['own'] = result_lvl1['user_id'].apply(
        lambda x: recommender_lvl1.get_own_recommendations(user=x, N=N)
    )# apply

    recall_lvl1.loc[N] = evaluete_rec(data=result_lvl1, true='actual', metric=metric, k=N)
    
recall_lvl1

Unnamed: 0_level_0,similar_items,als,own
recall@k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
50,0.118218,0.110141,0.151644
100,0.172603,0.170682,0.228656
200,0.253976,0.25781,0.344366
400,0.36848,0.372433,0.48006


- 100 кондидатов на модели __own recommendation__ (основанной на Item Item Recommender) будет достаточно

In [16]:
# подбор числа кандидатов
N = 100
k = 5
metric = 'precision@k'

precision_lvl1 = pd.DataFrame(
    columns=[metric, 'similar_items', 'als', 'own']
).set_index(metric)

# прогнозирование результатов и добавление во фрейм
result_lvl1['similar_items'] = result_lvl1['user_id'].apply(
    lambda x: recommender_lvl1.get_similar_items_recommendation(user=x, N=N)
)# apply

result_lvl1['als'] = result_lvl1['user_id'].apply(
    lambda x: recommender_lvl1.get_als_recommendations(user=x, N=N)
)# apply

result_lvl1['own'] = result_lvl1['user_id'].apply(
    lambda x: recommender_lvl1.get_own_recommendations(user=x, N=N)
)# apply
    
precision_lvl1.loc[k] = evaluete_rec(data=result_lvl1, true='actual', metric=metric, k=k)

precision_lvl1

Unnamed: 0_level_0,similar_items,als,own
precision@k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,0.200263,0.196402,0.314173


- Видно, что самое точное прогнозирование у __own recommender__ (Item Item Recommender) - __precision@5 = 0.31__ - его будем использовать для 2го уровня модели.

In [17]:
result_lvl1.head()

Unnamed: 0,user_id,actual,similar_items,als,own
0,1,"[999999, 865456, 867607, 872137, 878285, 89088...","[949620, 999999, 1082185, 1098066, 7135183, 87...","[1094924, 1098692, 1130375, 1062572, 8090513, ...","[856942, 9297615, 5577022, 1074612, 9655212, 9..."
1,2,"[999999, 838136, 839656, 861272, 866211, 87391...","[1137346, 1133018, 1106523, 5569845, 1132771, ...","[5569230, 1106523, 1041259, 834484, 1004906, 9...","[1076580, 911974, 826784, 1007414, 1083296, 83..."
2,3,"[999999, 851057, 872021, 878302, 879948, 90963...","[1076875, 1092026, 960318, 1075979, 1133018, 8...","[1110244, 951590, 5569230, 826249, 8090521, 88...","[921345, 998206, 1092937, 964594, 864615, 8856..."
3,4,"[883932, 970760, 1035676, 1055863, 999999, 677...","[951590, 13945244, 10457233, 1084310, 1074754,...","[902172, 891423, 904105, 927291, 1119454, 9997...","[891423, 936470, 910109, 887003, 1121367, 9518..."
4,6,"[1024306, 1102949, 6548453, 999999, 962568, 97...","[999999, 904360, 825541, 845208, 948650, 55698...","[1082185, 965267, 878996, 930118, 1024306, 871...","[13003092, 1119051, 9911484, 1108094, 8203834,..."


## Обучение 2го уровня модели

### Формирование фремов для обучения на train

In [18]:
# формирование фрейма с результатами прогнозирования 1го уровня для 2го уровня
result_lvl2 = pd.DataFrame(train_lvl2['user_id'].unique())
result_lvl2.columns = ['user_id']

# отбор пользователей для горячего старта
train_users = train_lvl1['user_id'].unique()
result_lvl2 = result_lvl2[result_lvl2['user_id'].isin(train_users)]

# Добавление по 100 items которые отбирает 1 уровень модели
result_lvl2['candidates'] = result_lvl2['user_id'].apply(lambda x: recommender_lvl1.get_own_recommendations(x, N=100))

result_lvl2.head()

Unnamed: 0,user_id,candidates
0,2070,"[834103, 878302, 1119399, 1085604, 13511722, 9..."
1,2021,"[1119454, 1019142, 871279, 835578, 863762, 101..."
2,1753,"[13842224, 1094371, 1089066, 862981, 901543, 1..."
3,2120,"[480014, 1082185, 6534178, 1029743, 995242, 11..."
4,1346,"[5574377, 480014, 903738, 5568758, 8090560, 91..."


In [19]:
# формирование фрейма для 2го уровня модели

# вытаскиваем всех пользователей
users_array = result_lvl2['user_id'].values

# вытаскиваем все items
candidates_lists = result_lvl2['candidates']
len_candidates= len(candidates_lists[0])
candidates_array = candidates_lists.values

# формируем фрейм с спрогнозированными результатами
df = pd.DataFrame({'user_id':users_array.repeat(len_candidates),
                   'item_id':np.concatenate(candidates_array)})
df.head()

Unnamed: 0,user_id,item_id
0,2070,834103
1,2070,878302
2,2070,1119399
3,2070,1085604
4,2070,13511722


In [20]:
# Формируем фрейм рекоммендаций user_item

# фиксируем фактическое взаимодействие
targets_train_lvl2 = train_lvl2[['user_id', 'item_id']].copy()
# отмечаем их как 1
targets_train_lvl2['target'] = 1

# объединяем фрейм фактического взаимодействия с предсказанным по совпадению пользователь-товар
targets_train_lvl2 = df.merge(targets_train_lvl2, on=['user_id', 'item_id'], how='left')
# если появятся не зафиксированные взаимодействия, отметим их как 0 
targets_train_lvl2['target'].fillna(0, inplace=True)

targets_train_lvl2

Unnamed: 0,user_id,item_id,target
0,2070,834103,1.0
1,2070,834103,1.0
2,2070,834103,1.0
3,2070,834103,1.0
4,2070,834103,1.0
...,...,...,...
249253,832,1085604,0.0
249254,832,986912,0.0
249255,832,1053690,0.0
249256,832,834484,0.0


#### Добавление фичей

In [21]:
# Добавляем фичи для user, items и user_items
targets_train_lvl2 = targets_train_lvl2.merge(item_features, on='item_id', how='left')
targets_train_lvl2 = targets_train_lvl2.merge(user_features, on='user_id', how='left')

targets_train_lvl2.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
2,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
4,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


#### Разбиваем таргет на X, y

In [22]:
X_train = targets_train_lvl2.drop('target', axis=1)
y_train = targets_train_lvl2[['target']]

# Отмечаем категориальные признаки
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

### Обучение LightGBM

In [23]:
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

# прогнозирование train выборки
train_preds = lgb.predict(X_train)
train_preds

array([1., 1., 1., ..., 0., 0., 0.])

### Оценка обучения 2го уровня модели (train)

In [24]:
def eval_lgbm(targets, preds, k=5):
    targets['recommend'] = preds
    targets = targets[['user_id', 'item_id', 'target', 'recommend']]
    
    target = targets[targets['target'] == 1]
    target = target.groupby('user_id')['item_id'].unique().reset_index()
    target.columns = ['user_id', 'target']
    
    recommend = targets[targets['recommend'] == 1]
    recommend = recommend.groupby('user_id')['item_id'].unique().reset_index()
    recommend.columns = ['user_id', 'recommend']
    
    target_recommend = target.merge(recommend, on='user_id')
    
    result = evaluete_rec(data=target_recommend, true='target', metric='precision@k', k=k)
    
    return result

In [26]:
print(f'precision@5 (train) = {eval_lgbm(targets=targets_train_lvl2, preds=train_preds)[0]}')

precision@5 (train) = 0.5162764771460436


## Формирование фремов для прогнозирования на test

In [29]:
targets_test_lvl2 = test[['user_id', 'item_id']].copy()
targets_test_lvl2['target'] = 1

targets_test_lvl2 = df.merge(targets_test_lvl2, on=['user_id', 'item_id'], how='left')

targets_test_lvl2['target'].fillna(0, inplace= True)

targets_test_lvl2

Unnamed: 0,user_id,item_id,target
0,2070,834103,1.0
1,2070,834103,1.0
2,2070,834103,1.0
3,2070,834103,1.0
4,2070,834103,1.0
...,...,...,...
231081,832,1085604,0.0
231082,832,986912,0.0
231083,832,1053690,0.0
231084,832,834484,0.0


In [31]:
targets_test_lvl2 = targets_test_lvl2.merge(item_features, on='item_id', how='left')
targets_test_lvl2 = targets_test_lvl2.merge(user_features, on='user_id', how='left')

targets_test_lvl2.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
2,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
4,2070,834103,1.0,2224,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [32]:
X_test = targets_test_lvl2.drop('target', axis=1)
y_test = targets_test_lvl2[['target']]

X_test[cat_feats] = X_test[cat_feats].astype('category')

## Прогноз и оценка результатов на test

In [33]:
test_preds = lgb.predict(X_test)
test_preds

array([1., 1., 1., ..., 0., 0., 0.])

In [34]:
print(f'precision@5 (train) = {eval_lgbm(targets=targets_test_lvl2, preds=test_preds)[0]}')

precision@5 (train) = 0.27946277495769895


## Вывод

- Двухуровневая модель выдала результат > 0.27 - цель достигнута