# Домашнее задание

1. Перенесите метрики в модуль src.metrics.py
2. Перенесите функцию prefilter_items в модуль src.utils.py
3. Создайте модуль src.recommenders.py. Напищите код для класса ниже 
(задание обсуждали на вебинаре, для первой функции практически сделали) и положите его в src.recommenders.py
4. Проверьте, что все модули корректно импортируются

In [None]:
!pip install implicit

In [4]:
%load_ext autoreload

In [5]:
import os
import sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from content.src.metrics import precision_at_k, recall_at_k
from content.src.utils import prefilter_items

In [6]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix

from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender
from implicit.nearest_neighbours import bm25_weight, tfidf_weight



In [7]:
data = pd.read_csv('retail_train.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)


test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [8]:
item_features = pd.read_csv('product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [9]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [30]:
class MainRecommender:

    def __init__(self, data, weighting=None, fake_id=999999):
        
        # топ покупок каждого пользователя
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)

        if fake_id is not None:
          self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != fake_id]
        
        # топ покупок по всему датасету
        self.overall_top_parcheses = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_parcheses.sort_values('quantity', ascending=False, inplace=True)

        if fake_id is not None:
          self.overall_top_parcheses = self.overall_top_parcheses[self.overall_top_parcheses['item_id'] != fake_id]
        self.overall_top_parcheses = self.overall_top_parcheses['item_id'].tolist()

        self.fake_id = fake_id
        self.user_item_matrix = self.prepare_matrix(data)
        self.id_to_itemid, self.id_to_userid, \
        self.itemid_to_id, self.userid_to_id = self.prepare_dicts(self.user_item_matrix)

        self.user_item_matrix = csr_matrix(self.user_item_matrix).tocsr()
        self.user_item_matrix_for_pred = self.user_item_matrix

        if weighting == 'bm25':
          self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T.tocsr()

        if weighting == 'tfidf':
          self.user_item_matrix = tfidf_weight(self.user_item_matrix.T).T.tocsr()

        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)


    @staticmethod
    def prepare_matrix(data_train):
        
        user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id',
                                  columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0)

        user_item_matrix = user_item_matrix.astype(float)
        
        return user_item_matrix
    

    @staticmethod
    def prepare_dicts(user_item_matrix):

        """Подготавливает вспомогательные словари"""
        
        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))
        
        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id
        

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """
        Обучает модель, которая рекомендует товары 
        среди товаров, купленных юзером
        """
    
        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        # own_recommender.fit(user_item_matrix)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())
        
        return own_recommender


    @staticmethod
    def fit(user_item_matrix,
            n_factors=20,
            regularization=0.1,
            iterations=1,
            num_threads=4):
        
        model = AlternatingLeastSquares(factors=n_factors, 
                                        regularization=regularization,
                                        iterations=iterations,  
                                        num_threads=num_threads)
        # model.fit(user_item_matrix)
        model.fit(csr_matrix(user_item_matrix).T.tocsr())
        
        return model


    def update_dict(self, user_id):

        """Если появился новый пользователь, то обновляем словари"""
  
        # if user_id not in self.userid_to_id.get():
        # if np.isin(user_id, self.userid_to_id) is False:
        max_id = max(list(self.userid_to_id.values()))
        max_id += 1

        self.userid_to_id.update({user_id: max_id})
        self.id_to_userid.update({max_id: user_id})


    def get_similar_item(self, item_id):

        """Находим товар похожий на item_id"""

        recs = self.model.similar_items(self.itemid_to_id[item_id], N=2)  # N=2 товар похожий на себя
        top_rec = recs[1][0]  # берём второй товар

        return self.id_to_itemid[top_rec]


    def extend_with_top_popular(self, recommendations, N=5):

        """Если количество рекомендаций меньше топ N, то дополняем топ-популярными"""

        # if len(recommendations) < N:
        #     top_popular = [rec for rec in self.overall_top_parcheses[:N] if rec not in recommendations]
        #     recommendations.extend(top_popular)
        #     recommendations = recommendations[:N]

        if len(recommendations) < N:
            recommendations.extend(self.overall_top_parcheses[:N])
            recommendations = recommendations[:N]

        return recommendations


    def get_recommendations(self, user, model, N=5):

        """Рекомендуем топ-N товаров"""

        if np.isin(user, self.userid_to_id) is False:
            self.update_dict(user_id=user)

        filter_items = [] if self.fake_id is not None else [self.itemid_to_id[self.fake_id]]
        res = model.recommend(userid=self.userid_to_id[user], 
                              user_items=self.user_item_matrix[self.userid_to_id[user]],
                              N=N, 
                              filter_already_liked_items=False, 
                              filter_items=filter_items, 
                              recalculate_user=True)
        
        mask = res[1].argsort()[::-1]
        res = [self.id_to_itemid[rec] for rec in res[0][mask]]
        res = self.extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)

        return res


    def get_als_recommendations(self, user, N=5):

        if np.isin(user, self.userid_to_id) is False:
            self.update_dict(user_id=user)

        return self.get_recommendations(user, model=self.model, N=N)


    def get_own_recommendations(self, user, N=5):

        """Рекомендуем товары, которые пользователь уже купил"""

        if np.isin(user, self.userid_to_id) is False:
            self.update_dict(user_id=user)

        try:
            return self.get_recommendations(user, model=self.own_recommender, N=N)

        except Exception:
            res = []
            self.extend_with_top_popular(res, N=N)
            return res


    def get_similar_items_recommendation(self, user, N=5):

        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""

        if np.isin(user, self.userid_to_id) is False:
            self.update_dict(user_id=user)

        top_items = self.user_item_matrix.loc[user].sort_values(ascending=False).head(N).index.tolist()
        top_items = [self.itemid_to_id[item] for item in top_items]
        ids = self.model.similar_items(top_items , N=N+1)[0][:,1:].T.flatten()[:N]
        res = [self.id_to_itemid[x] for x in ids]

        # top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user].head(N)

        # res = top_users_purchases['item_id'].apply(lambda x: self.get_similar_item(x)).tolist()
        # res = self.extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)

        return res
    
   
    def get_similar_users_recommendation(self, user, N=5):

        """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""

        if np.isin(user, self.userid_to_id) is False:
            self.update_dict(user_id=user)
    
        res = []

        # Находим топ-N похожих пользователей
        similar_users = self.model.similar_users(self.userid_to_id[user], N=N+1)
        similar_users = [rec[0] for rec in similar_users]
        similar_users = similar_users[1:]   # удалим юзера из запроса

        for user in similar_users:
            res.extend(self.get_own_recommendations(user, N=1))

        res = self.extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)

        return res

Проверка кода на работоспособность

In [31]:
recommender = MainRecommender(data_train)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2499 [00:00<?, ?it/s]

In [24]:
recommender.get_als_recommendations(2375, N=5)

[25671, 26081, 26093, 26190, 26355]

In [None]:
# recommender.get_own_recommendations(2375, N=5)

In [1]:
# recommender.get_similar_items_recommendation(23, N=5)

In [25]:
recommender.get_similar_users_recommendation(13, N=5)

[1082185, 1082185, 6534178, 1029743, 995242]