### Итоговый проект (Целевая метрика map@5 (необходимо получить значение >=0.2))

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight, ItemItemRecommender, CosineRecommender

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from best_rec_lib.metrics import precision_at_k, ap_k 
from best_rec_lib.utils import prefilter_items
from best_rec_lib.recommenders import MainRecommender

In [105]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# train test split
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


Создадим датафрейм с покупками юзеров на тестовом датасете (последние 3 недели):

In [3]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [4]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))

В тестовом дата сете 2042 юзеров
В тестовом дата сете 0 новых юзеров


### Popularity-based recommendation

In [5]:
def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular.sort_values('sales_value', ascending=False, inplace=True)
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [6]:
%%time

# Можно так делать, так как рекомендация не зависит от юзера
popular_recs = popularity_recommendation(data_train, n=5)

result['popular_recommendation'] = result['user_id'].map(lambda x: popular_recs)
result.head(2)

CPU times: total: 203 ms
Wall time: 202 ms


Unnamed: 0,user_id,actual,popular_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[6534178, 6533889, 1029743, 6534166, 1082185]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6534178, 6533889, 1029743, 6534166, 1082185]"


### MainRecommender

In [7]:
n_items_before = data_train['item_id'].nunique()

data_train = prefilter_items(data_train, item_features=item_features, take_n_popular=5000)

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 5001


In [8]:
recommender = MainRecommender(data_train)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [9]:
user_array = data_train['user_id'].unique().tolist()
all_top_purchases = recommender.overall_top_purchases[:5] #будем добавлять, если новый

In [10]:
%%time

result['als'] = result['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=5) 
                                                    if x in user_array else all_top_purchases)

CPU times: total: 6.16 s
Wall time: 943 ms


In [11]:
%%time

result['own'] = result['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=5) 
                                                    if x in user_array else all_top_purchases)

CPU times: total: 297 ms
Wall time: 293 ms


In [12]:
%%time

result['similar_item'] = result['user_id'].apply(lambda x: recommender.get_similar_items_recommendation(x, N=5) 
                                                    if x in user_array else all_top_purchases)

CPU times: total: 12.5 s
Wall time: 3.17 s


In [13]:
result.head(2)

Unnamed: 0,user_id,actual,popular_recommendation,als,own,similar_item
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 995242, 885290, 832678]","[999999, 1082185, 6534178, 1029743, 995242]","[856942, 1082185, 995242, 940947, 9527290]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6534178, 6533889, 1029743, 6534166, 1082185]","[844165, 951590, 883404, 962229, 1106523]","[999999, 1082185, 6534178, 1029743, 995242]","[1092026, 1053690, 951590, 998206, 1106523]"


Проверим map@5:

In [14]:
columns = ['popular_recommendation', 'als', 'own', 'similar_item']

for col in columns:
    map_mean = result.apply(lambda row: ap_k(row[col], row['actual'], k=5), axis=1).mean()
    print(f'{col}: {map_mean}')

popular_recommendation: 0.08280770486451183
als: 0.08462291870714946
own: 0.0857394711067574
similar_item: 0.3381309174012413


### MainRecommender с BM25

In [15]:
recommender = MainRecommender(data_train,  weighting="bm25")

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [16]:
%%time

result['als_bm25'] = result['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=5) 
                                                    if x in user_array else all_top_purchases)

CPU times: total: 6.64 s
Wall time: 988 ms


In [17]:
%%time

result['own_bm25'] = result['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=5) 
                                                    if x in user_array else all_top_purchases)

CPU times: total: 1.19 s
Wall time: 295 ms


In [18]:
%%time

result['similar_item_bm25'] = result['user_id'].apply(lambda x: recommender.get_similar_items_recommendation(x, N=5) 
                                                    if x in user_array else all_top_purchases)

CPU times: total: 11.5 s
Wall time: 2.91 s


In [19]:
columns = ['als_bm25', 'own_bm25', 'similar_item_bm25']

for col in columns:
    map_mean = result.apply(lambda row: ap_k(row[col], row['actual'], k=5), axis=1).mean()
    print(f'{col}: {map_mean}')

als_bm25: 0.07982206986614389
own_bm25: 0.1481358145608878
similar_item_bm25: 0.3381309174012413


### MainRecommender с TF-IDF

In [24]:
recommender = MainRecommender(data_train,  weighting="tfidf")

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [25]:
%%time

result['als_tf-idf'] = result['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=5) 
                                                    if x in user_array else all_top_purchases)

CPU times: total: 5.83 s
Wall time: 850 ms


In [26]:
%%time

result['own_tf-idf'] = result['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=5) 
                                                    if x in user_array else all_top_purchases)

CPU times: total: 1.02 s
Wall time: 284 ms


In [27]:
%%time

result['similar_item_tf-idf'] = result['user_id'].apply(lambda x: recommender.get_similar_items_recommendation(x, N=5) 
                                                    if x in user_array else all_top_purchases)

CPU times: total: 12.8 s
Wall time: 3.21 s


In [28]:
columns = ['als_tf-idf', 'own_tf-idf', 'similar_item_tf-idf']

for col in columns:
    map_mean = result.apply(lambda row: ap_k(row[col], row['actual'], k=5), axis=1).mean()
    print(f'{col}: {map_mean}')

als_tf-idf: 0.0892164544564148
own_tf-idf: 0.0857394711067574
similar_item_tf-idf: 0.3381309174012413


Similar_item не меняется и остается лучшим

### ItemItemRecommender

In [29]:
recommender = MainRecommender(data_train)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [30]:
user_item_matrix = recommender._prepare_matrix(data_train) # необходимый тип матрицы для implicit

In [31]:
id_to_itemid, id_to_userid, itemid_to_id, userid_to_id = recommender._prepare_dicts(user_item_matrix)

In [32]:
%%time

model = ItemItemRecommender(K=3, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).tocsr(),  # На вход item-user matrix
          show_progress=True)

result['itemitem_k3'] = result['user_id'].\
        map(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]])

  0%|          | 0/5001 [00:00<?, ?it/s]

CPU times: total: 7min 13s
Wall time: 7min 13s


In [33]:
result.apply(lambda row: ap_k(row['itemitem_k3'], row['actual'], k=5), axis=1).mean()

0.08774567417564433

### CosineRecommender

In [34]:
%%time

model = CosineRecommender(K=5, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).tocsr(), 
          show_progress=True)

result['cosine_rec'] = result['user_id'].\
        map(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]])



  0%|          | 0/5001 [00:00<?, ?it/s]

CPU times: total: 7min 26s
Wall time: 7min 27s


In [35]:
result.apply(lambda row: ap_k(row['cosine_rec'], row['actual'], k=5), axis=1).mean()

0.04616225922298422

In [36]:
result.head(2)

Unnamed: 0,user_id,actual,popular_recommendation,als,own,similar_item,als_bm25,own_bm25,similar_item_bm25,als_tf-idf,own_tf-idf,similar_item_tf-idf,itemitem_k3,cosine_rec
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[6534178, 6533889, 1029743, 6534166, 1082185]","[885290, 999999, 1082185, 862349, 995242]","[999999, 1082185, 6534178, 1029743, 995242]","[856942, 1082185, 995242, 940947, 9527290]","[999999, 1082185, 1104349, 960732, 1004390]","[856942, 6533889, 5577022, 9297615, 1074612]","[856942, 1082185, 995242, 940947, 9527290]","[999999, 1082185, 995242, 862349, 1005186]","[999999, 1082185, 6534178, 1029743, 995242]","[856942, 1082185, 995242, 940947, 9527290]","[1005186, 1029743, 995242, 6534178, 1082185]","[1074612, 981760, 961554, 1127831, 1082185]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6534178, 6533889, 1029743, 6534166, 1082185]","[883404, 962229, 951590, 999999, 1029743]","[999999, 1082185, 6534178, 1029743, 995242]","[1092026, 1053690, 951590, 998206, 1106523]","[999999, 951590, 8090521, 883404, 8090537]","[6533889, 1070803, 1053690, 998206, 1092937]","[1092026, 1053690, 951590, 998206, 1106523]","[999999, 951590, 1029743, 5569230, 1053690]","[999999, 1082185, 6534178, 1029743, 995242]","[1092026, 1053690, 951590, 998206, 1106523]","[1005186, 1029743, 995242, 6534178, 1082185]","[1074612, 981760, 961554, 1127831, 1082185]"


Добавим фичи и проверим метрику на similar_users

In [20]:
data_train_copy = data_train.copy()

In [21]:
#добавим среднюю цену товара
data_train_copy['sales_value_mean'] = data_train_copy.groupby('item_id')['sales_value'].transform('mean')

#добавим средний чек
data_train_copy['check'] = data_train_copy.groupby(['user_id'])['sales_value'].transform('sum')
data_train_copy['av_check'] = data_train_copy['check'] / data_train_copy.groupby(['user_id'])['quantity'].transform('sum')

In [22]:
#Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)
data_train_copy['sales_value_item_user'] = data_train_copy.groupby(['user_id', 'item_id'])['sales_value'].transform('sum') 
data_train_copy['av_sales_value_item_user'] = data_train_copy['sales_value_item_user']/data_train_copy.groupby(['user_id', 'item_id'])['quantity'].transform('sum')

In [23]:
data_train_copy.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,sales_value_mean,check,av_check,sales_value_item_user,av_sales_value_item_user
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,2.628563,2440.17,2.500174,4.38,2.19
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,0.952974,2440.17,2.500174,0.82,0.82


In [24]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [25]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [26]:
data_train_copy = data_train_copy.merge(user_features, on='user_id', how='left')
data_train_copy = data_train_copy.merge(item_features, on='item_id', how='left')

In [27]:
data_train_copy.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,...,,,,,69.0,PRODUCE,Private,POTATOES,POTATOES RUSSET (BULK&BAG),5 LB
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,...,,,,,2.0,PRODUCE,National,ONIONS,ONIONS SWEET (BULK&BAG),40 LB


In [28]:
data_train_copy = data_train_copy.drop(['homeowner_desc', 'hh_comp_desc', 'household_size_desc', 'kid_category_desc'], axis=1)
data_train_copy.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,av_sales_value_item_user,age_desc,marital_status_code,income_desc,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,...,2.19,,,,69.0,PRODUCE,Private,POTATOES,POTATOES RUSSET (BULK&BAG),5 LB
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,...,0.82,,,,2.0,PRODUCE,National,ONIONS,ONIONS SWEET (BULK&BAG),40 LB


In [29]:
data_train_copy = data_train_copy.drop(['age_desc', 'marital_status_code', 'income_desc'], axis=1)
data_train_copy.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,check,av_check,sales_value_item_user,av_sales_value_item_user,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,...,2440.17,2.500174,4.38,2.19,69.0,PRODUCE,Private,POTATOES,POTATOES RUSSET (BULK&BAG),5 LB
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,...,2440.17,2.500174,0.82,0.82,2.0,PRODUCE,National,ONIONS,ONIONS SWEET (BULK&BAG),40 LB


In [30]:
recommender = MainRecommender(data_train_copy)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [31]:
%%time

result['similar_item_with_feat'] = result['user_id'].apply(lambda x: recommender.get_similar_items_recommendation(x, N=5) 
                                                    if x in user_array else all_top_purchases)

CPU times: total: 12.7 s
Wall time: 3.15 s


In [32]:
result.apply(lambda row: ap_k(row['similar_item_with_feat'], row['actual'], k=5), axis=1).mean()

0.3381309174012413

Метрика не меняется

### Двухуровневая модель рекомендаций (LGBM)

In [33]:
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [34]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Пока только warm start
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))

In [35]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,2070,999999,1
0,2070,1082185,1
0,2070,6534178,1
0,2070,1029743,1


In [36]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

In [37]:
targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,999999,0.0
1,2070,1082185,1.0


In [38]:
targets_lvl_2['target'].mean()

0.1478716506317579

Добавим фичи

In [57]:
data_copy = data.copy()

In [58]:
#добавим среднюю цену товара
data_copy['sales_value_mean'] = data.groupby('item_id')['sales_value'].transform('mean')

#добавим средний чек
data_copy['check'] = data_copy.groupby(['user_id'])['sales_value'].transform('sum')
data_copy['av_check'] = data_copy['check'] / data.groupby(['user_id'])['quantity'].transform('sum')

#Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)
data_copy['sales_value_item_user'] = data_copy.groupby(['user_id', 'item_id'])['sales_value'].transform('sum') 
data_copy['av_sales_value_item_user'] = data_copy['sales_value_item_user']/data_copy.groupby(['user_id', 'item_id'])['quantity'].transform('sum')

In [59]:
data_copy.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,sales_value_mean,check,av_check,sales_value_item_user,av_sales_value_item_user
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,2.625196,2486.42,2.481457,4.38,2.19
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,0.951107,2486.42,2.481457,0.82,0.82


In [54]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [55]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [60]:
data_copy = data_copy.merge(user_features, on=['user_id'], how='left')
data_copy.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,av_check,sales_value_item_user,av_sales_value_item_user,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,...,2.481457,4.38,2.19,,,,,,,
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,...,2.481457,0.82,0.82,,,,,,,


In [61]:
data_copy = data_copy.merge(item_features, on=['item_id'], how='left')
data_copy.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,...,,,,,69,PRODUCE,Private,POTATOES,POTATOES RUSSET (BULK&BAG),5 LB
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,...,,,,,2,PRODUCE,National,ONIONS,ONIONS SWEET (BULK&BAG),40 LB


In [63]:
targets_lvl_2 = targets_lvl_2.merge(data_copy, on=['user_id', 'item_id'], how='left')
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,2070,999999,0.0,,,,,,,,...,,,,,,,,,,
1,2070,1082185,1.0,31268900000.0,289.0,1.0,0.86,311.0,0.0,214.0,...,Unknown,Unknown,1.0,None/Unknown,2.0,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB


In [64]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [65]:
feats = X_train.columns[2:].tolist()
feats

['basket_id',
 'day',
 'quantity',
 'sales_value',
 'store_id',
 'retail_disc',
 'trans_time',
 'week_no',
 'coupon_disc',
 'coupon_match_disc',
 'sales_value_mean',
 'check',
 'av_check',
 'sales_value_item_user',
 'av_sales_value_item_user',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product']

In [66]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 528211 entries, 0 to 528210
Data columns (total 30 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   user_id                   528211 non-null  int64  
 1   item_id                   528211 non-null  int64  
 2   basket_id                 465562 non-null  float64
 3   day                       465562 non-null  float64
 4   quantity                  465562 non-null  float64
 5   sales_value               465562 non-null  float64
 6   store_id                  465562 non-null  float64
 7   retail_disc               465562 non-null  float64
 8   trans_time                465562 non-null  float64
 9   week_no                   465562 non-null  float64
 10  coupon_disc               465562 non-null  float64
 11  coupon_match_disc         465562 non-null  float64
 12  sales_value_mean          465562 non-null  float64
 13  check                     465562 non-null  f

In [67]:
cat_feats = ['manufacturer', 'department', 'brand', 'commodity_desc', 'sub_commodity_desc',
             'curr_size_of_product',  'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc',
             'hh_comp_desc', 'household_size_desc', 'kid_category_desc']

In [68]:
X_train[cat_feats] = X_train[cat_feats].astype('category')

In [69]:
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [70]:
X_train['predict'] = train_preds[:, 1]

In [71]:
X_train_sort = X_train.sort_values(['user_id', 'predict'], ascending= False)

Посчитаем метрику:

In [72]:
result_x_train = X_train_sort.groupby('user_id')['item_id'].unique().reset_index()
result_x_train.columns=['user_id', 'lgbm_lvl_2']
result_x_train.head(2)

Unnamed: 0,user_id,lgbm_lvl_2
0,1,"[995242, 1082185, 840361, 1004906, 1005186, 96..."
1,2,"[1106523, 1133018, 1082185, 1053690, 916122, 9..."


In [73]:
#оставим лучшие 5
result = result_x_train.apply(lambda row: row['lgbm_lvl_2'][:5], axis=1)

In [74]:
result_x_train['lgbm_lvl_2_at_5'] = result
result_x_train.head(2)

Unnamed: 0,user_id,lgbm_lvl_2,lgbm_lvl_2_at_5
0,1,"[995242, 1082185, 840361, 1004906, 1005186, 96...","[995242, 1082185, 840361, 1004906, 1005186]"
1,2,"[1106523, 1133018, 1082185, 1053690, 916122, 9...","[1106523, 1133018, 1082185, 1053690, 916122]"


In [75]:
result_lvl_2 = data_train_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2 = result_lvl_2[result_lvl_2['user_id'].isin(train_users)]
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [76]:
result_lvl_2 = result_lvl_2.merge(result_x_train[['user_id', 'lgbm_lvl_2_at_5']], on='user_id', how='left')
map_mean = result_lvl_2.apply(lambda row: ap_k(row['lgbm_lvl_2_at_5'], row['actual'], k=5), axis=1).mean()
map_mean 

0.6537141972441561

Попробуем модель второго уровня на data_val_lvl_2

In [107]:
user_features = user_features.merge(data_copy[['user_id', 'check', 'av_check']], on='user_id')
user_features = user_features.drop_duplicates(keep='first')

In [108]:
item_features = item_features.merge(data_copy[['item_id', 'sales_value_mean']], on='item_id')
item_features = item_features.drop_duplicates(keep='first')

In [109]:
user_item_features = pd.DataFrame(data_copy, columns = ['user_id', 'item_id', 'sales_value_item_user', 'av_sales_value_item_user'])

In [110]:
data_val_lvl_2.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2277416,338,41260573635,636,840173,1,1.99,369,0.0,112,92,0.0,0.0
2277417,338,41260573635,636,1037348,1,0.89,369,-0.3,112,92,0.0,0.0


In [111]:
targets_val_lvl_2 = data_val_lvl_2.merge(item_features, on='item_id', how='left')
targets_val_lvl_2 = targets_val_lvl_2.merge(user_features, on='user_id', how='left')
targets_val_lvl_2 = targets_val_lvl_2.merge(user_item_features, on=['user_id', 'item_id'], how='left')
targets_val_lvl_2.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,check,av_check,sales_value_item_user,av_sales_value_item_user
0,338,41260573635,636,840173,1,1.99,369,0.0,112,92,...,,,,,,,,,1.99,1.99
1,338,41260573635,636,1037348,1,0.89,369,-0.3,112,92,...,,,,,,,,,0.89,0.89


In [112]:
targets_val_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 727482 entries, 0 to 727481
Data columns (total 30 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   user_id                   727482 non-null  int64  
 1   basket_id                 727482 non-null  int64  
 2   day                       727482 non-null  int64  
 3   item_id                   727482 non-null  int64  
 4   quantity                  727482 non-null  int64  
 5   sales_value               727482 non-null  float64
 6   store_id                  727482 non-null  int64  
 7   retail_disc               727482 non-null  float64
 8   trans_time                727482 non-null  int64  
 9   week_no                   727482 non-null  int64  
 10  coupon_disc               727482 non-null  float64
 11  coupon_match_disc         727482 non-null  float64
 12  manufacturer              727482 non-null  int64  
 13  department                727482 non-null  o

In [113]:
X_test = targets_val_lvl_2

In [114]:
X_test[cat_feats] = X_test[cat_feats].astype('category')

In [115]:
test_preds = lgb.predict_proba(X_test)

In [116]:
X_test['predict'] = test_preds[:, 1]
X_test.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,check,av_check,sales_value_item_user,av_sales_value_item_user,predict
0,338,41260573635,636,840173,1,1.99,369,0.0,112,92,...,,,,,,,,1.99,1.99,0.007431
1,338,41260573635,636,1037348,1,0.89,369,-0.3,112,92,...,,,,,,,,0.89,0.89,0.008787


In [117]:
X_test_sort = X_test.sort_values(['user_id', 'predict'], ascending= False)

In [124]:
result_x_test = X_test_sort.groupby('user_id')['item_id'].unique().reset_index()
result_x_test.columns=['user_id', 'lgbm_lvl_2']
result_x_test.head(2)

Unnamed: 0,user_id,lgbm_lvl_2
0,1,"[821867, 834484, 986947, 1005186, 908213, 9544..."
1,3,"[7167249, 878302, 9526886, 879948, 851057, 994..."


In [125]:
#оставим лучшие 5
result = result_x_test.apply(lambda row: row['lgbm_lvl_2'][:5] if len(row['lgbm_lvl_2']) >= 5 
                             else all_top_purchases[:5] , axis=1)

In [126]:
result_x_test['lgbm_lvl_2_at_5'] = result

In [130]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [131]:
result_lvl_2 = result_lvl_2.merge(result_x_test[['user_id', 'lgbm_lvl_2_at_5' ]], on = 'user_id', how = 'left')

In [132]:
result_lvl_2.apply(lambda row: ap_k(row['lgbm_lvl_2_at_5'], row['actual'], k=5), axis=1).mean()

0.9165164871041458

Эффективнее двухуровненвая моедель

In [142]:
data_test = pd.read_csv('retail_test.csv')
data_test.head(2)

Unnamed: 0,user_id,item_id
0,1340,912987
1,588,1024426


In [143]:
target_test =  data_test.merge(data_copy, on=['user_id', 'item_id'], how='left')

target_test.head(2)

Unnamed: 0,user_id,item_id,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,1340,912987,,,,,,,,,...,,,,,,,,,,
1,588,1024426,,,,,,,,,...,,,,,,,,,,


In [144]:
target_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318609 entries, 0 to 318608
Data columns (total 30 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   user_id                   318609 non-null  int64  
 1   item_id                   318609 non-null  int64  
 2   basket_id                 279882 non-null  float64
 3   day                       279882 non-null  float64
 4   quantity                  279882 non-null  float64
 5   sales_value               279882 non-null  float64
 6   store_id                  279882 non-null  float64
 7   retail_disc               279882 non-null  float64
 8   trans_time                279882 non-null  float64
 9   week_no                   279882 non-null  float64
 10  coupon_disc               279882 non-null  float64
 11  coupon_match_disc         279882 non-null  float64
 12  sales_value_mean          279882 non-null  float64
 13  check                     279882 non-null  f

In [145]:
target_test[cat_feats] = target_test[cat_feats].astype('category')

In [146]:
test_preds = lgb.predict_proba(target_test)

In [147]:
target_test['predict'] = test_preds[:, 1]
target_test.head(2)

Unnamed: 0,user_id,item_id,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,hh_comp_desc,household_size_desc,kid_category_desc,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,predict
0,1340,912987,,,,,,,,,...,,,,,,,,,,0.000119
1,588,1024426,,,,,,,,,...,,,,,,,,,,8.9e-05


In [148]:
target_test_sort = target_test.sort_values(['user_id', 'predict'], ascending= False)

In [157]:
result_target_test = target_test_sort.groupby('user_id')['item_id'].unique().reset_index()
result_target_test.columns=['user_id', 'lgbm_lvl_2']
result_target_test.head(2)

Unnamed: 0,user_id,lgbm_lvl_2
0,1,"[958046, 938004, 1004906, 931136, 1049998, 991..."
1,2,"[1133018, 1053690, 899624, 1108168, 857849, 88..."


In [158]:
#оставим лучшие 5
result = result_target_test.apply(lambda row: row['lgbm_lvl_2'][:5] if len(row['lgbm_lvl_2']) >= 5 
                             else all_top_purchases[:5] , axis=1)
result_target_test['lgbm_lvl_2_at_5'] = result

In [159]:
#создадим датасет с актуальными покупками на тестовом датасете
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [160]:
result = result.merge(result_target_test[['user_id', 'lgbm_lvl_2_at_5' ]], on = 'user_id', how = 'left')

In [161]:
result.head(2)

Unnamed: 0,user_id,actual,lgbm_lvl_2_at_5
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[958046, 938004, 1004906, 931136, 1049998]"
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[1133018, 1053690, 899624, 1108168, 857849]"


In [162]:
result.apply(lambda row: ap_k(row['lgbm_lvl_2_at_5'], row['actual'], k=5), axis=1).mean()

0.9077347480106098