# Вебинар 4. Домашнее задание

In [10]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# %matplotlib inline

# Функции из 1-ого вебинара
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
        
from metrics import precision_at_k, recall_at_k

# Домашнее задание

1. Перенесите метрики в модуль metrics.py (убедится что они там)
2. Перенесите функцию prefilter_items в модуль utils.py
3. Создайте модуль recommenders.py. Напищите код для класса ниже 
(задание обсуждали на вебинаре, для первой функции практически сделали) и положите его в recommenders.py
4. Проверьте, что все модули корректно импортируются

In [11]:
import pandas as pd
import numpy as np

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight


class MainRecommender:
    """Рекоммендации, которые можно получить из ALS
    
    Input
    -----
    user_item_matrix: pd.DataFrame
        Матрица взаимодействий user-item
    """
    
    def __init__(self, data, weighting=True):
        
        self.user_top_items = data.groupby('user_id').apply(lambda x: 
                                                            [k for k, v in sorted([(row, qt) for row, qt in zip(x['item_id'], x['quantity']) if row != 999999], 
                                                                                  key=lambda y: y[1], 
                                                                                  reverse=True)])
        
        self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self.prepare_dicts(self.user_item_matrix)
        
        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T 
        
        self.model = self.fit(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
     
    @staticmethod
    def prepare_matrix(data, values='quantity', aggfunc='count'):
        user_item_matrix = pd.pivot_table(data, 
                                  index='user_id', columns='item_id', 
                                  values=values, 
                                  aggfunc=aggfunc, 
                                  fill_value=0
                                 )
        user_item_matrix = user_item_matrix.astype(float) #
        
        return user_item_matrix
    
    @staticmethod
    def prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""
        
        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))
        
        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id
     
    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""
    
        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())
        
        return own_recommender
    
    @staticmethod
    def fit(user_item_matrix, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
        """Обучает ALS"""
        
        model = AlternatingLeastSquares(factors=n_factors, 
                                        regularization=regularization,
                                        iterations=iterations,  
                                        num_threads=num_threads)
        model.fit(csr_matrix(user_item_matrix).T.tocsr())
        
        return model

    def get_similar_items_recommendation(self, user, N=5):
        """Рекомендуем товары, похожие на топ-N купленных юзером товаров"""
        top_items = self.user_top_items[user]
        recs_for_items = []
        for item in top_items[:N]:
            recs_for_items.append([self.id_to_itemid[self.model.similar_items(self.itemid_to_id[item], N=N + 1)[i][0]] for i in range(1, N + 1)])
        
        # Стараемся добиться максимально релевантных, неповторяющихся рекомендаций
        rec_list = []
        for i in range(1, N+1):
            for recs in recs_for_items:
                for rec in recs[:i]:
                    if rec not in rec_list:
                        rec_list.append(rec)
            if len(rec_list) >= N:
                rec_list = rec_list[:N]
                break

        assert len(rec_list) == N, 'Количество рекомендаций != {}'.format(N)
        return rec_list
    
    def get_similar_users_recommendation(self, user, N=5):
        """Рекомендуем топ-N товаров, среди купленных похожими юзерами"""
    
        res = self.own_recommender.recommend(userid=self.userid_to_id[user], 
                        user_items=csr_matrix(self.user_item_matrix).tocsr(), 
                        N=N, 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=False)

        assert len(res) == N, 'Количество рекомендаций != {}'.format(N)
        return res

Проверка, что все работает

In [12]:
data = pd.read_csv('../data/retail_train.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)

test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [13]:
item_features = pd.read_csv('../data/product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [14]:
from utils import FilterLower, FilterMax, FilterNonDepartment, TopBayer, PrefilterSeq, LowestPopularity, OverPopular, LongTimeNoSold
n_items_before = data_train['item_id'].nunique()

process_list=[FilterLower(low=1), FilterMax(max=60), FilterNonDepartment(item_features), OverPopular(popularity=0.5), LowestPopularity(popularity=0.01), LongTimeNoSold(weeks=62),
              TopBayer(take_n_popular=5000)]

filter = PrefilterSeq(data_train, process_list)
data_train=filter.processing()

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['price'] = self.data['sales_value'] / (np.maximum(self.data['quantity'], 1))


Decreased # items from 86865 to 5001


In [15]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [16]:
recomender = MainRecommender(data_train)

popularity = data_train.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
popularity.sort_values('quantity', ascending=False, inplace=True)

popularity = popularity[popularity['item_id'] != 999999]
popularity = popularity.groupby('user_id').head(5)
popularity.sort_values(by=['user_id','quantity'], ascending=False, inplace=True)

popularity['similar_recommendation_bpr'] = popularity['user_id'].apply(lambda x: recomender.get_similar_items_recommendation(x))

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/2479 [00:00<?, ?it/s]

IndexError: index 4194 is out of bounds for axis 0 with size 2479

In [8]:
from metrics import precision_at_k
recommendation_similar_items = popularity.groupby('user_id')['similar_recommendation_bpr'].first().reset_index()
recommendation_similar_items.columns=['user_id', 'similar_recommendation_bpr']

result_predict = result.merge(recommendation_similar_items, on='user_id', how='inner')
result_predict.apply(lambda row: precision_at_k(row['similar_recommendation_bpr'], row['actual']), axis=1).mean()

KeyError: 'Column not found: similar_recommendation_bpr'