# Baseline. Одноуровневая модель.


Так как geekbrains не выложил датасет для валидации модели, то для этой цели использовались данные из retail_train

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier


# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k, ap_k, map_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

In [3]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [4]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [5]:
item_features = pd.read_csv('product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [6]:
n_items_before = data_train['item_id'].nunique()

data_train = prefilter_items(data_train, item_features=item_features, take_n_popular=5000)

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 5000


In [7]:
shared_users_baseline = data_train.user_id.values
data_test = data_test[data_test.user_id.isin(shared_users_baseline)]

In [8]:
result_baseline = data_test.groupby('user_id')['item_id'].unique().reset_index()
result_baseline.columns=['user_id', 'actual']
result_baseline.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [9]:
recom_baseline = MainRecommender(data_train, fake_id = None)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

In [10]:
precision_at_k_baseline = {}
map_k_baseline = {}

In [11]:
result_baseline['als'] = result_baseline['user_id'].apply(lambda x: recom_baseline.get_als_recommendations(x, N=5))
result_baseline.head(2)

Unnamed: 0,user_id,actual,als
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[901062, 1033142, 1041796, 979707, 1082212]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1133018, 1106523, 910032, 951590, 1053690]"


In [12]:
precision_at_k_baseline['als'] =result_baseline.apply(lambda row: precision_at_k(row['als'], row['actual']), axis=1).mean()

In [13]:
map_k_baseline['als'] = map_k(result_baseline['als'].tolist(), result_baseline['actual'].tolist())

In [14]:
result_baseline['similar_user'] = result_baseline['user_id'].apply(lambda x: recom_baseline.get_similar_users_recommendation(x, N=5))
result_baseline.head(2) 

Unnamed: 0,user_id,actual,als,similar_user
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[901062, 1033142, 1041796, 979707, 1082212]","[1029743, 1127831, 981760, 5569230, 916122]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1133018, 1106523, 910032, 951590, 1053690]","[951590, 1029743, 1127831, 995242, 981760]"


In [15]:
precision_at_k_baseline['similar_user']=result_baseline.apply(lambda row: precision_at_k(row['similar_user'], row['actual']), axis=1).mean()

In [16]:
map_k_baseline['similar_user'] = map_k(result_baseline['similar_user'].tolist(), result_baseline['actual'].tolist())

In [17]:
result_baseline['own'] = result_baseline['user_id'].apply(lambda x: recom_baseline.get_own_recommendations(x, N=5))
result_baseline.head(2)

Unnamed: 0,user_id,actual,als,similar_user,own
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[901062, 1033142, 1041796, 979707, 1082212]","[1029743, 1127831, 981760, 5569230, 916122]","[986912, 856942, 1082185, 995242, 1029743]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1133018, 1106523, 910032, 951590, 1053690]","[951590, 1029743, 1127831, 995242, 981760]","[1053690, 1082185, 951590, 1106523, 1029743]"


In [18]:
precision_at_k_baseline['own']=result_baseline.apply(lambda row: precision_at_k(row['own'], row['actual']), axis=1).mean()

In [19]:
map_k_baseline['own'] = map_k(result_baseline['own'].tolist(), result_baseline['actual'].tolist())

In [20]:
result_baseline['similar_items'] = result_baseline['user_id'].apply(lambda x: recom_baseline.get_similar_items_recommendation(x, N=5))
result_baseline.head(2) 

Unnamed: 0,user_id,actual,als,similar_user,own,similar_items
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[901062, 1033142, 1041796, 979707, 1082212]","[1029743, 1127831, 981760, 5569230, 916122]","[986912, 856942, 1082185, 995242, 1029743]","[1011046, 5582712, 15926844, 1029743, 9297615]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1133018, 1106523, 910032, 951590, 1053690]","[951590, 1029743, 1127831, 995242, 981760]","[1053690, 1082185, 951590, 1106523, 1029743]","[1076875, 1048462, 916381, 1075979, 1133018]"


In [21]:
precision_at_k_baseline['similar_items']=result_baseline.apply(lambda row: precision_at_k(row['similar_items'], row['actual']), axis=1).mean()

In [22]:
map_k_baseline['similar_items'] = map_k(result_baseline['similar_items'].tolist(), result_baseline['actual'].tolist())

In [23]:
pres_at_k = pd.DataFrame.from_dict(precision_at_k_baseline, orient='index', columns=['precision_at_k'])
pres_at_k.sort_values(by = ['precision_at_k'], ascending=False)

Unnamed: 0,precision_at_k
own,0.272244
als,0.180315
similar_user,0.103248
similar_items,0.095768


In [24]:
map_baseline = pd.DataFrame.from_dict(map_k_baseline, orient='index', columns=['map_k'])
map_baseline.sort_values(by = ['map_k'], ascending=False)

Unnamed: 0,map_k
own,0.160476
als,0.114437
similar_user,0.067385
similar_items,0.060453


В одноуровневой модели baseline лучшие значения метрик получаются при использовании ItemItemRecommender

# Двухуровневая модель

In [25]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + 1*val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + 1*val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [26]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [27]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [28]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5000


In [29]:
shared_users = data_train_lvl_1.user_id.values
data_val_lvl_1 = data_val_lvl_1[data_val_lvl_1.user_id.isin(shared_users)]
data_val_lvl_2 = data_val_lvl_2[data_val_lvl_2.user_id.isin(shared_users)]
data_train_lvl_2 = data_train_lvl_2[data_train_lvl_2.user_id.isin(shared_users)]

In [30]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [31]:
recommender = MainRecommender(data_train_lvl_1)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

In [32]:
result_lvl_1['als'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=200))
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,als
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[979707, 940947, 995242, 1033142, 9527290, 965..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1133018, 5569230, 1106523, 8090521, 1068719, ..."


In [33]:
result_lvl_1.apply(lambda row: recall_at_k(row['als'], row['actual'], k=200), axis=1).mean()

0.2210119503653569

In [34]:
result_lvl_1['own'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=200))
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,als,own
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[979707, 940947, 995242, 1033142, 9527290, 965...","[9859182, 1133018, 995965, 868764, 939860, 829..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1133018, 5569230, 1106523, 8090521, 1068719, ...","[1126899, 1046584, 986912, 6632283, 995965, 80..."


In [35]:
result_lvl_1.apply(lambda row: recall_at_k(row['own'], row['actual'], k=200), axis=1).mean()

0.15014457571786705

In [71]:
result_lvl_1['similar_items'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_similar_items_recommendation(x, N=200))
result_lvl_1.head(2) 

Unnamed: 0,user_id,actual,als,own,similar_items
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[979707, 940947, 995242, 1033142, 9527290, 965...","[9859182, 1133018, 995965, 868764, 939860, 829...","[912676, 1029743, 9526411, 5582712, 9297615, 8..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1133018, 5569230, 1106523, 8090521, 1068719, ...","[1126899, 1046584, 986912, 6632283, 995965, 80...","[8090537, 5569845, 1106523, 1133018, 985999, 8..."


In [72]:
result_lvl_1.apply(lambda row: recall_at_k(row['similar_items'], row['actual'], k=200), axis=1).mean()

0.12139428989113946

Лучший recall дает als model.

Обучим модель второго уровня

In [36]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

In [37]:
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=200))

In [38]:
users_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1096036, 866211, 904129, 833025, 1107553, 870..."
1,2021,"[951590, 981760, 1044078, 844179, 5592931, 952..."


In [39]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

In [40]:
users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)

In [41]:
users_lvl_2['flag'] = 1
users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,2070,1096036,1
0,2070,866211,1
0,2070,904129,1
0,2070,833025,1


In [42]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,1096036,0.0
1,2070,866211,0.0


In [43]:
targets_lvl_2.target.value_counts()

0.0    396463
1.0     44067
Name: target, dtype: int64

In [44]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1096036,0.0,69,GROCERY,Private,CHEESE,IWS SINGLE CHEESE,12 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,866211,0.0,2,PRODUCE,National,GRAPES,GRAPES WHITE,18 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [45]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [46]:
df_lvl_1 = pd.concat([data_train_lvl_1, data_val_lvl_1])
df_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1.39
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0,1.21


In [47]:
df_lvl_1['price']=df_lvl_1['sales_value'] / (np.maximum(df_lvl_1['quantity'], 1))

In [48]:
X_train = X_train.merge(df_lvl_1.groupby(by='user_id')['sales_value'].agg('mean').rename('mean_check_user'), how='left', on='user_id')

In [49]:
X_train = X_train.merge(df_lvl_1.groupby(by='user_id')['quantity'].agg('sum').rename('user_buys_month')/4, how='left', on='user_id')

In [50]:
X_train = X_train.merge(df_lvl_1.groupby(by='item_id')['quantity'].agg('sum').rename('item_buys_weeks')/df_lvl_1['week_no'].nunique(), how='left', on='item_id')

In [51]:
X_train = X_train.merge(X_train.groupby(by='department')['item_id'].agg('sum').rename('item_department_weeks')/df_lvl_1['week_no'].nunique(), how='left', on = 'department')

In [52]:
X_train = X_train.merge(df_lvl_1.groupby(by=['user_id', 'item_id'])['quantity'].agg('sum').rename('user_item_quantity')/df_lvl_1['week_no'].nunique(), how='left', on=['user_id', 'item_id'])

In [53]:
X_train = X_train.merge(df_lvl_1.groupby(by=['user_id', 'item_id'])['price'].agg('mean').rename('user_item_price'), how='left', on=['user_id', 'item_id'])

In [54]:
X_train.fillna(0)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,mean_check_user,user_buys_month,item_buys_weeks,item_department_weeks,user_item_quantity,user_item_price
0,2070,1096036,69,GROCERY,Private,CHEESE,IWS SINGLE CHEESE,12 OZ,45-54,U,...,Unknown,Unknown,1,None/Unknown,3.109432,4702.0,38.098901,5.830122e+09,0.032967,1.640
1,2070,866211,2,PRODUCE,National,GRAPES,GRAPES WHITE,18 LB,45-54,U,...,Unknown,Unknown,1,None/Unknown,3.109432,4702.0,34.384615,1.097182e+09,0.021978,3.375
2,2070,904129,69,MEAT-PCKGD,Private,MEAT - MISC,GRND/PATTY - FROZEN,3 LB,45-54,U,...,Unknown,Unknown,1,None/Unknown,3.109432,4702.0,5.527473,5.012095e+08,0.065934,8.190
3,2070,833025,69,GROCERY,Private,MILK BY-PRODUCTS,SOUR CREAMS,16 OZ,45-54,U,...,Unknown,Unknown,1,None/Unknown,3.109432,4702.0,29.186813,5.830122e+09,0.065934,1.088
4,2070,1107553,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,...,Unknown,Unknown,1,None/Unknown,3.109432,4702.0,9.439560,5.830122e+09,0.010989,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440525,1745,1075175,584,GROCERY,National,COLD CEREAL,KIDS CEREAL,22 OZ,45-54,A,...,Unknown,Single Male,2,None/Unknown,3.271673,167.0,2.725275,5.830122e+09,0.000000,0.000
440526,1745,859191,1646,PRODUCE,National,SALAD MIX,BLENDS,12 OZ,45-54,A,...,Unknown,Single Male,2,None/Unknown,3.271673,167.0,3.769231,1.097182e+09,0.000000,0.000
440527,1745,903230,69,GROCERY,Private,BAKED BREAD/BUNS/ROLLS,MAINSTREAM WHITE BREAD,20 OZ,45-54,A,...,Unknown,Single Male,2,None/Unknown,3.271673,167.0,2.054945,5.830122e+09,0.000000,0.000
440528,1745,825994,3126,PRODUCE,National,VALUE ADDED FRUIT,INSTORE CUT FRUIT,,45-54,A,...,Unknown,Single Male,2,None/Unknown,3.271673,167.0,4.824176,1.097182e+09,0.000000,0.000


In [55]:
cat_feats = X_train.columns[2:15].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [56]:
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)

In [57]:
lgb.fit(X_train, y_train['target'])



LGBMClassifier(categorical_column=['manufacturer', 'department', 'brand',
                                   'commodity_desc', 'sub_commodity_desc',
                                   'curr_size_of_product', 'age_desc',
                                   'marital_status_code', 'income_desc',
                                   'homeowner_desc', 'hh_comp_desc',
                                   'household_size_desc', 'kid_category_desc'],
               max_depth=7, objective='binary')

In [58]:
train_preds = lgb.predict(X_train)

In [59]:
train_preds

array([0., 0., 0., ..., 0., 0., 0.])

In [60]:
train_preds_proba = lgb.predict_proba(X_train)

In [61]:
probas = train_preds_proba[:,1]
probas

array([1.71291113e-01, 9.78193824e-02, 1.12543367e-01, ...,
       4.51996143e-05, 4.46437548e-05, 1.97358479e-02])

In [62]:
df_probas = targets_lvl_2.copy()

In [63]:
df_probas['probas'] = probas

In [64]:
df_probas

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,probas
0,2070,1096036,0.0,69,GROCERY,Private,CHEESE,IWS SINGLE CHEESE,12 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0.171291
1,2070,866211,0.0,2,PRODUCE,National,GRAPES,GRAPES WHITE,18 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0.097819
2,2070,904129,0.0,69,MEAT-PCKGD,Private,MEAT - MISC,GRND/PATTY - FROZEN,3 LB,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0.112543
3,2070,833025,1.0,69,GROCERY,Private,MILK BY-PRODUCTS,SOUR CREAMS,16 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0.435349
4,2070,1107553,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,0.010778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440525,1745,1075175,0.0,584,GROCERY,National,COLD CEREAL,KIDS CEREAL,22 OZ,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,0.000040
440526,1745,859191,0.0,1646,PRODUCE,National,SALAD MIX,BLENDS,12 OZ,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,0.000036
440527,1745,903230,0.0,69,GROCERY,Private,BAKED BREAD/BUNS/ROLLS,MAINSTREAM WHITE BREAD,20 OZ,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,0.000045
440528,1745,825994,0.0,3126,PRODUCE,National,VALUE ADDED FRUIT,INSTORE CUT FRUIT,,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,0.000045


In [65]:
shared_users_for_level2 = df_probas.user_id.values
data_val_lvl_2 = data_val_lvl_2[data_val_lvl_2.user_id.isin(shared_users_for_level2)]

In [66]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."


In [67]:
result_lvl_2['2_level'] = result_lvl_2['user_id'].apply(lambda x: df_probas[df_probas['user_id']==x].\
                sort_values('probas', ascending=False).head(5).item_id.tolist())

In [68]:
result_lvl_2.head(2)

Unnamed: 0,user_id,actual,2_level
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1082185, 1082185, 1082185, 856942, 856942]"
1,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1082185, 1082185, 1098844, 1098844, 840361]"


В задании, которое озвучивал преподаватель на лекции, говорится о целевой метрике map@5>=0.2, но на всякий случай посчитаем еще precision@5

In [69]:
map_k(result_lvl_2['2_level'].tolist(), result_lvl_2['actual'].tolist())

0.26205707282913193

In [70]:
result_lvl_2.apply(lambda row: precision_at_k(row['2_level'], row['actual']), axis=1).mean()

0.25315126050419734

Сравнение метрик Baseline-одноуровневой модели и двухуровневой модели показывает, что метрика precision@5 выше для одноуровневой модели, но целевая метрика map@5 выше для двухуровневой.