# Вебинар 6. Двухуровневые модели рекомендаций


Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) github репозитория

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Модель второго уровня
# from lightgbm import LGBMClassifier

# import os, sys
# module_path = os.path.abspath(os.path.join(os.pardir))
# if module_path not in sys.path:
#     sys.path.append(module_path)

# Написанные нами функции
# from src.metrics import precision_at_k, recall_at_k
# from src.utils import prefilter_items
# from src.recommenders import MainRecommender

In [2]:
def prefilter_items(data_in, item_features, take_n_popular):
    data = data_in.copy()
    

    # Уберем самые популярные товары (их и так купят)
    data_grp_by_item = data.groupby('item_id')
    popularity = data_grp_by_item['user_id'].nunique().reset_index()
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)
    popularity['share_unique_users'] = popularity['share_unique_users'] / data['user_id'].nunique()
    
    top_popular = popularity[popularity['share_unique_users'] > 0.5].item_id.tolist()
    data = data[~data['item_id'].isin(top_popular)]
    
    # Уберем самые НЕ популярные товары (их и так НЕ купят)
    top_notpopular = popularity[popularity['share_unique_users'] < 0.01].item_id.tolist()
    data = data[~data['item_id'].isin(top_notpopular)]
    
    # Уберем товары, которые не продавались за последние 12 месяцев
    data_grp_by_item = data.groupby('item_id')

    last_week_in_data = data['week_no'].max()
    item_last_sell_week = data_grp_by_item['week_no'].max().reset_index()
    item_last_sell_week.rename(columns={'week_no': 'last_week_no'}, inplace=True)
    not_trending_items = \
        item_last_sell_week[item_last_sell_week['last_week_no'] < \
                       (last_week_in_data - 52)].item_id.to_list()
    
    data = data[~data['item_id'].isin(not_trending_items)]

    # Уберем не интересные для рекоммендаций категории (department)
    not_iteresing_department = ['MISC. TRANS.', 
                                'VIDEO RENTAL', 
                                'KIOSK-GAS', 
                                'MISC SALES TRAN', 
                                'POSTAL CENTER', 
                                'RX', 
                                'HBC']

    data_item = data.merge(item_features,
           on='item_id',
           how='inner')
    items_in_not_iteresing_department = \
        data_item[data_item['department'].isin(not_iteresing_department)].item_id.to_list()

    data = data[~data['item_id'].isin(items_in_not_iteresing_department)]

    # Уберем слишком дешевые товары (на них не заработаем). 1 покупка из рассылок стоит 60 руб. 
    
    # Уберем слишком дорогие товарыs
    data_grp_by_item = data.groupby('item_id')
    item_sum_quantity = data_grp_by_item['quantity'].sum().reset_index()
    item_sum_quantity.rename(columns={'quantity': 'sum_quantity'}, inplace=True)

    item_sum_sales_value = data_grp_by_item['sales_value'].sum().reset_index()
    item_sum_sales_value.rename(columns={'sales_value': 'sum_sales_value'}, inplace=True)

    item_sum_quantity = data_grp_by_item['quantity'].sum().reset_index()
    item_sum_quantity.rename(columns={'quantity': 'sum_quantity'}, inplace=True)
    item_with_no_quantity = item_sum_quantity[item_sum_quantity['sum_quantity'] == 0].item_id.to_list()

    item_sum_quantity = \
        item_sum_quantity[~item_sum_quantity['item_id'].isin(item_with_no_quantity)]
    item_sum_sales_value = \
        item_sum_sales_value[~item_sum_sales_value['item_id'].isin(item_with_no_quantity)]

    item_mean_sales_value = item_sum_sales_value['item_id'].to_frame()
    item_mean_sales_value['mean_sum_sales_value'] = item_sum_sales_value['sum_sales_value'] / item_sum_quantity['sum_quantity']

    q1_mean_sum_sales_value = item_mean_sales_value['mean_sum_sales_value'].quantile(q=0.05)
    q2_mean_sum_sales_value = item_mean_sales_value['mean_sum_sales_value'].quantile(q=0.95)

    item_with_acceptible_price = \
        item_mean_sales_value[(item_mean_sales_value['mean_sum_sales_value'] < q2_mean_sum_sales_value) & \
                            (item_mean_sales_value['mean_sum_sales_value'] > q1_mean_sum_sales_value)]\
                            .item_id.to_list()

    data = data[data['item_id'].isin(item_with_acceptible_price)]
    

    # Топ N товаров среди оставшихся
    data_grp_by_item = data.groupby('item_id')
    popularity = data_grp_by_item['user_id'].nunique().reset_index()
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)
    popularity['share_unique_users'] = popularity['share_unique_users'] / data['user_id'].nunique()
    top_popular_n = \
        popularity.sort_values(by='share_unique_users', ascending=False)[:take_n_popular].item_id.to_list()
    
    # data = data[data['item_id'].isin(top_popular_n)]

    return data, top_popular_n

In [3]:
class MainRecommender:
    """Рекоммендации, которые можно получить из ALS
    
    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """
    
    def __init__(self, data, top_popular_n, weighting=True):
        
        # your_code. Это не обязательная часть. Но если вам удобно что-либо посчитать тут - можно это сделать
        self.data_in = data
        self.top_popular = top_popular_n

        self.user_item_matrix = self.prepare_matrix(data, top_popular_n)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self.prepare_dicts(self.user_item_matrix)
        
        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T 

        self.user_item_sparse_matrix = csr_matrix(self.user_item_matrix).tocsr()
        
        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
     
    @staticmethod
    def prepare_matrix(data_in, top_popular_n):
        
        # your_code

        data_in.loc[~data_in['item_id'].isin(top_popular_n), 'item_id'] = 999999

        user_item_matrix = pd.pivot_table(data_in, 
                                          index='user_id', columns='item_id', 
                                          values='quantity', # Можно пробоват ьдругие варианты
                                          aggfunc='count', 
                                          fill_value=0
                                         )

        user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit
        
        return user_item_matrix
    
    @staticmethod
    def prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""
        
        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))
        
        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id
     
    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""
        user_item_sparse_matrix = csr_matrix(user_item_matrix).tocsr()

        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(user_item_sparse_matrix)
        
        return own_recommender
    
    @staticmethod
    def fit(user_item_matrix, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
        """Обучает ALS"""
        user_item_sparse_matrix = csr_matrix(user_item_matrix).tocsr()

        model = AlternatingLeastSquares(factors=n_factors, 
                                        regularization=regularization,
                                        iterations=iterations,  
                                        num_threads=num_threads)
        model.fit(user_item_sparse_matrix)
        
        return model

    def extend_from_top_popular(self, recommendations, N=5):
        """Если количество рекомендаций меньше N, то дополняем их топ-популярными"""
        
        max_top_popular_len = len(self.top_popular)
        recommendations = list(recommendations)
        if len(recommendations) < N:
            if N <= max_top_popular_len:
                top_popular = [rec for rec in self.top_popular[:N] if rec not in recommendations]
                recommendations.extend(top_popular)
                recommendations = recommendations[:N]
            else:
                recommendations = recommendations[:max_top_popular_len]
        return recommendations            



    def get_recommendation_for_user(self, user, N):
        
        res = [self.id_to_itemid[rec] for rec in 
                self.model.recommend(userid=self.userid_to_id[user], 
                                     user_items=self.user_item_sparse_matrix[self.userid_to_id[user]],   # на вход user-item matrix
                                     N=N, 
                                     filter_already_liked_items=False, 
                                     filter_items=[self.itemid_to_id[999999]],
                                     # filter_items=None, 
                                     recalculate_user=True)[0]]
        return res 
    
    def get_model_recommendation(self, N=5):
        res_model_recommendation = self.data_in['user_id'].to_frame().drop_duplicates(ignore_index=True)
        
        res_model_recommendation['model_rec'] = res_model_recommendation['user_id']\
                                                .apply(lambda x: self.get_recommendation_for_user(x, N=N))
        
        return res_model_recommendation

    def get_similar_items(self, x):
        similar_item = self.model.similar_items(self.itemid_to_id[x], N=2)[0][1]
        res = self.id_to_itemid[similar_item]
        return res

    def get_similar_items_recommendation(self, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""

        popularity = self.data_in.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        popularity.sort_values('quantity', ascending=False, inplace=True)
        popularity = popularity[popularity['item_id'] != 999999]
        popularity = popularity.groupby('user_id').head(N)
        popularity.sort_values(by=['user_id', 'quantity'], ascending=False, inplace=True)

        popularity['similar_recommendation'] = popularity['item_id'].apply(lambda x: self.get_similar_items(x))

        recommendation_similar_items = popularity.groupby('user_id')['similar_recommendation'].unique().reset_index()
        recommendation_similar_items.columns=['user_id', 'similar_recommendation']

        recommendation_similar_items['similar_recommendation'] = \
            recommendation_similar_items['similar_recommendation'].apply(lambda x: self.extend_from_top_popular(x, N=N))

        return recommendation_similar_items

    def get_similar_users(self, user, N):
        similar_users = self.model.similar_users(self.userid_to_id[user], N=(N+1))[0]
        similar_users_id = [self.id_to_userid[user] for user in similar_users]
        return similar_users_id[1:]

    def get_similar_users_recommendation(self, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""

        popularity = self.data_in.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        popularity.sort_values('quantity', ascending=False, inplace=True)
        popularity = popularity[popularity['item_id'] != 999999]
        popularity = popularity.groupby('user_id').head(1)
        popularity.sort_values(by=['user_id', 'quantity'], ascending=False, inplace=True)

        popularity['similar_users_items'] = \
            popularity['user_id']\
                .apply(lambda x: popularity[popularity['user_id'].isin(self.get_similar_users(x, N=N))].item_id.to_list())

        recommendation_similar_user_items = popularity[['user_id', 'similar_users_items']]

        return recommendation_similar_user_items

In [4]:
def recall_at_k(recommended_list, bought_list, k=5):
    
    # your_code
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list[:k])
    
    flags = np.isin(bought_list, recommended_list)
    
    recall = flags.sum() / len(bought_list)    

    return recall

In [5]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision

In [6]:
def ap_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)[:k]
    
    relevant_indexes = np.nonzero(np.isin(recommended_list, bought_list))[0]
    if len(relevant_indexes) == 0:
        return 0
    
    amount_relevant = len(relevant_indexes)
            
    sum_ = sum([precision_at_k(recommended_list, bought_list, k=index_relevant+1) for index_relevant in relevant_indexes])
    return sum_/amount_relevant

In [7]:
def map_k(recommended_list, bought_list, k=5):

    amount_user = len(bought_list)
    list_ap_k = [ap_k(recommended_list[i], bought_list[i], k) for i in np.arange(amount_user)]
    
    sum_ap_k = sum(list_ap_k)  
    return sum_ap_k/amount_user

In [8]:
FOLDER_PATH = 'E:\\Programming\\RecSys_second\\project_recsys_products\\'

In [9]:
data = pd.read_csv(FOLDER_PATH + 'raw_data\\retail_train.csv')
item_features = pd.read_csv(FOLDER_PATH + 'raw_data\\product.csv')
user_features = pd.read_csv(FOLDER_PATH + 'raw_data\\hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [10]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1, top_n_popular = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 10086


In [11]:
recommender = MainRecommender(data_train_lvl_1, top_n_popular)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [12]:
model_recs = recommender.get_model_recommendation()

In [13]:
similar_items_recs = recommender.get_similar_items_recommendation()

In [30]:
recommender.get_similar_users_recommendation()

Unnamed: 0,user_id,similar_users_items
598802,2500,"[1070820, 827570, 1070820, 1058997, 5569471]"
598658,2499,"[1092026, 912137, 907631, 982790, 883404]"
598298,2498,"[847982, 5569230, 10344585, 1010190, 5568378]"
597687,2497,"[1058997, 1031864, 951590, 899624, 1070820]"
597327,2496,"[883404, 908531, 1106523, 962229, 883404]"
...,...,...
936,5,"[1075368, 883404, 944139, 1048462, 826249]"
812,4,"[883404, 999639, 1106523, 1033857, 8090532]"
697,3,"[1092026, 1037840, 1065593, 1075368, 8090521]"
435,2,"[5569230, 1057260, 878996, 1070820, 883404]"


### Модели первого уровня


In [31]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [32]:
recall_at_k_dict = {}

k_list = [5, 20, 50, 100, 200]

for k in k_list:
    comp_name = 'model_rec_' + str(k)

    model_recs = \
        recommender.get_model_recommendation(N=k)

    result_lvl_1 = result_lvl_1.merge(model_recs,
                                      on='user_id',
                                      how='inner').rename(columns={'model_rec': comp_name})

    recall_at_k_dict[comp_name] = \
        result_lvl_1.apply(lambda row: recall_at_k(row[comp_name],
                                                   row['actual'], k=k), axis=1).mean()

In [33]:
recall_at_k_dict

{'model_rec_5': 0.012273435274937133,
 'model_rec_20': 0.034058261181882765,
 'model_rec_50': 0.06359694528595412,
 'model_rec_100': 0.09870802653872945,
 'model_rec_200': 0.14585327970432937}

In [34]:
for k in k_list:
    comp_name = 'similar_recommendation_' + str(k)

    similar_items_recs = \
        recommender.get_similar_items_recommendation(N=k)

    result_lvl_1 = result_lvl_1.merge(similar_items_recs,
                                      on='user_id',
                                      how='inner').rename(columns={'similar_recommendation': comp_name})

    recall_at_k_dict[comp_name] = \
        result_lvl_1.apply(lambda row: recall_at_k(row[comp_name],
                                                   row['actual'], k=k), axis=1).mean()

In [35]:
recall_at_k_dict

{'model_rec_5': 0.012273435274937133,
 'model_rec_20': 0.034058261181882765,
 'model_rec_50': 0.06359694528595412,
 'model_rec_100': 0.09870802653872945,
 'model_rec_200': 0.14585327970432937,
 'similar_recommendation_5': 0.01014334912290424,
 'similar_recommendation_20': 0.027128347378593442,
 'similar_recommendation_50': 0.05141363729503511,
 'similar_recommendation_100': 0.08307965579912217,
 'similar_recommendation_200': 0.13211622906451329}

In [36]:
for k in k_list:
    comp_name = 'similar_users_recs_' + str(k)

    similar_users_recs = \
        recommender.get_similar_users_recommendation(N=k)

    result_lvl_1 = result_lvl_1.merge(similar_users_recs,
                                      on='user_id',
                                      how='inner').rename(columns={'similar_users_items': comp_name})


    recall_at_k_dict[comp_name] = \
        result_lvl_1.apply(lambda row: recall_at_k(row[comp_name],\
                                                   row['actual'], k=k), axis=1).mean()

In [37]:
recall_at_k_dict

{'model_rec_5': 0.012273435274937133,
 'model_rec_20': 0.034058261181882765,
 'model_rec_50': 0.06359694528595412,
 'model_rec_100': 0.09870802653872945,
 'model_rec_200': 0.14585327970432937,
 'similar_recommendation_5': 0.01014334912290424,
 'similar_recommendation_20': 0.027128347378593442,
 'similar_recommendation_50': 0.05141363729503511,
 'similar_recommendation_100': 0.08307965579912217,
 'similar_recommendation_200': 0.13211622906451329,
 'similar_users_recs_5': 0.009096231720764592,
 'similar_users_recs_20': 0.02581360454858864,
 'similar_users_recs_50': 0.04433642393081021,
 'similar_users_recs_100': 0.06331066938017078,
 'similar_users_recs_200': 0.08542724541211826}

In [None]:
# your_code

### Модель 2-ого уровня

In [32]:
# your_code
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Пока только warm start
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

# users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))
# users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_model_recommendation(x, N=20))

In [33]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

In [34]:
model_rec = recommender.get_model_recommendation(N=200)

In [35]:
users_lvl_2 = users_lvl_2.merge(model_rec,
                                on='user_id',
                                how='inner')

users_lvl_2.columns = ['user_id', 'candidates']

In [36]:
users_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[879755, 1107553, 1085604, 1110572, 883404, 90..."
1,2021,"[871756, 981521, 951590, 12731544, 895930, 896..."


In [37]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1)\
    .stack().reset_index(level=1, drop=True)

s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,2070,879755,1
0,2070,1107553,1
0,2070,1085604,1
0,2070,1110572,1


In [38]:
data_train_lvl_2.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0


In [39]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id', 'quantity', 'sales_value', 'store_id', 'week_no']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

In [40]:
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,store_id,week_no,target
0,2070,879755,,,,,0.0
1,2070,1107553,,,,,0.0


In [41]:
targets_lvl_2['target'].mean()

0.06299471084093956

In [42]:
item_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92353 entries, 0 to 92352
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   item_id               92353 non-null  int64 
 1   manufacturer          92353 non-null  int64 
 2   department            92353 non-null  object
 3   brand                 92353 non-null  object
 4   commodity_desc        92353 non-null  object
 5   sub_commodity_desc    92353 non-null  object
 6   curr_size_of_product  92353 non-null  object
dtypes: int64(2), object(5)
memory usage: 4.9+ MB


In [43]:
user_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   age_desc             801 non-null    object
 1   marital_status_code  801 non-null    object
 2   income_desc          801 non-null    object
 3   homeowner_desc       801 non-null    object
 4   hh_comp_desc         801 non-null    object
 5   household_size_desc  801 non-null    object
 6   kid_category_desc    801 non-null    object
 7   user_id              801 non-null    int64 
dtypes: int64(1), object(7)
memory usage: 50.2+ KB


In [44]:
targets_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 435608 entries, 0 to 435607
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      435608 non-null  int64  
 1   item_id      435608 non-null  int64  
 2   quantity     27441 non-null   float64
 3   sales_value  27441 non-null   float64
 4   store_id     27441 non-null   float64
 5   week_no      27441 non-null   float64
 6   target       435608 non-null  float64
dtypes: float64(5), int64(2)
memory usage: 26.6 MB


In [45]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,store_id,week_no,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,879755,,,,,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1107553,,,,,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [46]:
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,store_id,week_no,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,879755,,,,,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1107553,,,,,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [47]:
targets_lvl_2['quantity'].fillna(targets_lvl_2['quantity'].median(),
                                 inplace=True)
targets_lvl_2['sales_value'].fillna(targets_lvl_2['sales_value'].mean(),
                                    inplace=True)

In [48]:
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,store_id,week_no,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,879755,1.0,2.440308,,,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1107553,1.0,2.440308,,,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [49]:
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,store_id,week_no,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,879755,1.0,2.440308,,,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1107553,1.0,2.440308,,,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [50]:
def calc_mode_func_series(x, mode_store):
    x = pd.Series.mode(x)
    if type(x) != float:
        if len(x) >= 1:
            x = x[0]
        else:
            x = mode_store
    return x

mode_store = pd.Series.mode(targets_lvl_2['store_id']).values[0]

df = \
    targets_lvl_2.groupby(by='user_id')['store_id']\
        .agg(lambda x: calc_mode_func_series(x, mode_store)).reset_index()

df.rename(columns={'store_id': 'mode_store_user'},
          inplace=True)

targets_lvl_2 = targets_lvl_2.merge(df, 
                                    on='user_id',
                                    how='inner')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,store_id,week_no,target,manufacturer,department,brand,...,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,mode_store_user
0,2070,879755,1.0,2.440308,,,0.0,103,GROCERY,National,...,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,311.0
1,2070,1107553,1.0,2.440308,,,0.0,103,GROCERY,National,...,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,311.0


In [51]:
df = pd.pivot_table(targets_lvl_2,
                    index='item_id', columns='week_no',
                    values='quantity',
                    aggfunc='count',
                    fill_value=0
                    )

df = df.agg('median', axis='columns').reset_index()
df.columns = ['item_id', 'quantatity_of_item_per_week']

targets_lvl_2 = targets_lvl_2.merge(df,
                                    on='item_id',
                                    how='inner')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,store_id,week_no,target,manufacturer,department,brand,...,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,mode_store_user,quantatity_of_item_per_week
0,2070,879755,1.0,2.440308,,,0.0,103,GROCERY,National,...,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,311.0,11.0
1,1753,879755,1.0,1.11,345.0,87.0,1.0,103,GROCERY,National,...,20 OZ,45-54,U,35-49K,Homeowner,Unknown,1,None/Unknown,345.0,11.0


In [52]:
df = pd.pivot_table(targets_lvl_2,
                    index='department', columns='week_no',
                    values='quantity',
                    aggfunc='count',
                    fill_value=0
                    )

df = df.agg('median', axis='columns').reset_index()
df.columns = ['department', 'quantatity_of_item_in_category_per_week']

targets_lvl_2 = targets_lvl_2.merge(df,
                                    on='department',
                                    how='inner')
targets_lvl_2.head(2)

# targets_lvl_2 = targets_lvl_2.merge(df,
#                                     on='item_id',
#                                     how='inner')

Unnamed: 0,user_id,item_id,quantity,sales_value,store_id,week_no,target,manufacturer,department,brand,...,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,mode_store_user,quantatity_of_item_per_week,quantatity_of_item_in_category_per_week
0,2070,879755,1.0,2.440308,,,0.0,103,GROCERY,National,...,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,311.0,11.0,2991.0
1,1753,879755,1.0,1.11,345.0,87.0,1.0,103,GROCERY,National,...,45-54,U,35-49K,Homeowner,Unknown,1,None/Unknown,345.0,11.0,2991.0


In [53]:
df = pd.pivot_table(targets_lvl_2,
                    index='user_id', columns='department',
                    values='quantity',
                    aggfunc='count',
                    fill_value=0
                    )

df = df.idxmax(axis=1).reset_index()
df.columns = ['user_id', 'top_department']

# targets_lvl_2 = targets_lvl_2.merge(df,
#                                     on='user_id',
#                                     how='inner'
#                                     )
targets_lvl_2 = targets_lvl_2.merge(df,
                                    on='user_id',
                                    how='inner'
                                    )

In [54]:
df = pd.pivot_table(targets_lvl_2,
                    index='user_id', columns='brand',
                    values='quantity',
                    aggfunc='count',
                    fill_value=0
                    )

df = df.idxmax(axis=1).reset_index()
df.columns = ['user_id', 'top_brand']

targets_lvl_2 = targets_lvl_2.merge(df,
                                    on='user_id',
                                    how='inner'
                                    )

In [55]:
df = pd.pivot_table(targets_lvl_2,
                    index='user_id', columns='department',
                    values='sales_value',
                    aggfunc='mean',
                    fill_value=0
                    )

In [56]:
df = pd.pivot_table(targets_lvl_2,
                    index='user_id', columns='department',
                    values='sales_value',
                    aggfunc='mean',
                    fill_value=0
                    )

df = df.stack().reset_index()
df.columns = ['user_id', 'department', 'mean_sales_value_of_user_in_department']

targets_lvl_2 = targets_lvl_2.merge(df,
                                    on=['user_id', 'department'],
                                    how='inner')

In [57]:
targets_lvl_2['age_desc'].unique()

array(['45-54', nan, '25-34', '65+', '35-44', '19-24', '55-64'],
      dtype=object)

In [58]:
df = \
    targets_lvl_2.groupby(by=['user_id'])['age_desc']\
    .apply(lambda x: pd.Series.mode(x))
df = df.reset_index()
df.drop(columns='level_1',
        inplace=True)

df.columns=['user_id', 'age_desc_corrected']

targets_lvl_2 = targets_lvl_2.merge(df,
                                    on='user_id',
                                    how='inner')

In [59]:
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,store_id,week_no,target,manufacturer,department,brand,...,hh_comp_desc,household_size_desc,kid_category_desc,mode_store_user,quantatity_of_item_per_week,quantatity_of_item_in_category_per_week,top_department,top_brand,mean_sales_value_of_user_in_department,age_desc_corrected
0,2070,879755,1.0,2.440308,,,0.0,103,GROCERY,National,...,Unknown,1,None/Unknown,311.0,11.0,2991.0,GROCERY,National,2.23072,45-54
1,2070,1107553,1.0,2.440308,,,0.0,103,GROCERY,National,...,Unknown,1,None/Unknown,311.0,4.0,2991.0,GROCERY,National,2.23072,45-54


In [60]:
targets_lvl_2.columns

Index(['user_id', 'item_id', 'quantity', 'sales_value', 'store_id', 'week_no',
       'target', 'manufacturer', 'department', 'brand', 'commodity_desc',
       'sub_commodity_desc', 'curr_size_of_product', 'age_desc',
       'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc',
       'household_size_desc', 'kid_category_desc', 'mode_store_user',
       'quantatity_of_item_per_week',
       'quantatity_of_item_in_category_per_week', 'top_department',
       'top_brand', 'mean_sales_value_of_user_in_department',
       'age_desc_corrected'],
      dtype='object')

In [61]:
feature_columns = \
    ['user_id', 
     'item_id', 
     'quantity', 
     'sales_value', 
     'store_id',
     'department',
     'manufacturer',
     'age_desc_corrected', 
     'brand',
     'mode_store_user',
     'quantatity_of_item_per_week',
     'quantatity_of_item_in_category_per_week', 
     'top_department',
     'top_brand', 
     'mean_sales_value_of_user_in_department'
    ]

In [62]:
targets_lvl_2['store_id'].fillna(mode_store, inplace=True)
targets_lvl_2[feature_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138256 entries, 0 to 138255
Data columns (total 15 columns):
 #   Column                                   Non-Null Count   Dtype  
---  ------                                   --------------   -----  
 0   user_id                                  138256 non-null  int64  
 1   item_id                                  138256 non-null  int64  
 2   quantity                                 138256 non-null  float64
 3   sales_value                              138256 non-null  float64
 4   store_id                                 138256 non-null  float64
 5   department                               138256 non-null  object 
 6   manufacturer                             138256 non-null  int64  
 7   age_desc_corrected                       138256 non-null  object 
 8   brand                                    138256 non-null  object 
 9   mode_store_user                          138256 non-null  float64
 10  quantatity_of_item_per_week     

In [63]:
X_train = targets_lvl_2[feature_columns]
y_train = targets_lvl_2['target']

In [64]:
X_train.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,store_id,department,manufacturer,age_desc_corrected,brand,mode_store_user,quantatity_of_item_per_week,quantatity_of_item_in_category_per_week,top_department,top_brand,mean_sales_value_of_user_in_department
0,2070,879755,1.0,2.440308,367.0,GROCERY,103,45-54,National,311.0,11.0,2991.0,GROCERY,National,2.23072
1,2070,1107553,1.0,2.440308,367.0,GROCERY,103,45-54,National,311.0,4.0,2991.0,GROCERY,National,2.23072


In [65]:
X_train[['store_id', 'mode_store_user']] = \
    X_train[['store_id', 'mode_store_user']].astype(np.int32)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[['store_id', 'mode_store_user']] = \


In [66]:
cat_feats = ['user_id', 'item_id', 
             'store_id', 'manufacturer', 'age_desc_corrected', 'department', 
             'brand', 'mode_store_user',
             'top_department', 'top_brand']

In [244]:
# cat_feats = ['store_id', 'department', 
#              'brand',
#              'top_department', 'top_brand']

In [67]:
# cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[cat_feats] = X_train[cat_feats].astype('category')


In [68]:
from catboost import CatBoostClassifier

In [69]:
model = CatBoostClassifier(
    random_seed=55,
    iterations=100,
    learning_rate=0.1)

In [70]:
model.fit(
    X_train, y_train,
    cat_features=cat_feats,
    verbose=50
)

0:	learn: 0.5991029	total: 209ms	remaining: 20.7s
50:	learn: 0.0031613	total: 3.13s	remaining: 3.01s
99:	learn: 0.0001068	total: 6.31s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x26b91c66560>

In [71]:
train_preds = model.predict(X_train)
train_preds = train_preds.astype(bool)

In [72]:
# rec_items = X_train[train_preds].groupby(by=['user_id'])['item_id'].unique().reset_index()
# rec_items.columns = ['user_id', 'model_preds']

In [76]:
df = X_train[['user_id', 'item_id']].astype('int')
df = df[train_preds]
rec_items = df.groupby(by=['user_id'])['item_id'].unique().reset_index()
rec_items.columns = ['user_id', 'model_preds']

In [77]:
rec_items['model_preds'] = \
    rec_items['model_preds'].apply(lambda x: x[:10] if len(x) >= 10 else x)

In [78]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns = ['user_id', 'actual']
result_lvl_2.head(5)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."


In [79]:
result_lvl_2 = result_lvl_2.merge(rec_items,
                                  on='user_id',
                                  how='inner')

In [80]:
result_lvl_2.apply(lambda row: precision_at_k(row['model_preds'], row['actual']), axis=1).mean()

0.3516891891891887

In [81]:
result_lvl_2.head(2)

Unnamed: 0,user_id,actual,model_preds
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1050310, 856942, 1046816, 1069575, 1075074, 1..."
1,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[1126899, 1136257, 5592610, 5572738, 5591154, ..."


In [82]:
map_k(result_lvl_2['model_preds'].tolist(), result_lvl_2['actual'].tolist())

0.5346921921921912

In [85]:
result_lvl_2['model_preds']

0      [1050310, 856942, 1046816, 1069575, 1075074, 1...
1      [1126899, 1136257, 5592610, 5572738, 5591154, ...
2      [923746, 1068719, 868075, 840361, 916050, 1098...
3      [859075, 951590, 9859111, 914190, 5568729, 943...
4      [1022827, 835243, 896085, 1099164, 6533362, 83...
                             ...                        
735                    [840361, 862349, 1010424, 995965]
736    [883404, 826249, 865456, 1056509, 913785, 8671...
737    [1068719, 859075, 840361, 1038217, 896938, 914...
738    [914190, 954673, 1030981, 1100379, 940766, 951...
739    [883404, 826249, 5568378, 999971, 5569327, 985...
Name: model_preds, Length: 740, dtype: object

In [84]:
result_lvl_2[['user_id', 'model_preds']].to_csv('model_predictions.csv', index=False, sep=',')