## Курсовая работа "Рекомендательные системы"

Финальный проект:
Проект осуществляется на данных из вебинара (данные считаны в начале ДЗ).

Целевая метрика - money precision @ 5. Порог для успешной сдачи проекта: money precision @ 5 > 20%.

Бизнес ограничения в топ-5 товарах:
- Для каждого юзера 5 рекомендаций (иногда модели могут возвращать < 5);
- 2 новых товара (юзер никогда не покупал);
- 1 дорогой товар > 7 долларов;
- Все товары из разных категорий (категория - sub_commodity_desc).

Стоимость каждого рекомендованного товара > 1 доллара. Будет public тестовый датасет, на котором вы сможете измерять метрику. Также будет private тестовый датасет для измерения финального качества. НЕ обязательно использовать двухуровневые рекомендательные системы в проекте.

Вы сдаете код проекта в виде github репозитория и .csv файл с рекомендациями. 
В .csv файле 2 столбца: user_id - (item_id1, item_id2, ..., item_id5)

### Загрузка модулей и библиотек

In [1]:
# metrics.py

import numpy as np


def recall_at_k(recommended_list, bought_list, k=5):
    """Return recall@k: the share of bought items present in the top-k recommendations.

    Args:
        recommended_list: Ranked sequence of recommended item ids, best first.
        bought_list: Sequence of item ids the user actually bought.
        k: Number of leading recommendations to take into account.

    Returns:
        Number of bought items found in the first ``k`` recommendations divided
        by the number of bought items; ``0.0`` when ``bought_list`` is empty.
    """
    bought = np.asarray(bought_list)
    # Nothing was bought, so there is nothing to recall (avoids a 0/0 -> NaN).
    if bought.size == 0:
        return 0.0

    top_k = np.asarray(recommended_list)[:k]
    hits = np.isin(bought, top_k)

    return hits.sum() / bought.size


def precision_at_k(recommended_list, bought_list, k=5):
    """Return precision@k: the share of the top-k recommendations that were bought.

    Args:
        recommended_list: Ranked sequence of recommended item ids, best first.
        bought_list: Sequence of item ids the user actually bought
            (deliberately NOT truncated to ``k``).
        k: Number of leading recommendations to take into account.

    Returns:
        Number of top-k recommendations present in ``bought_list`` divided by
        the number of recommendations considered; ``0.0`` when there are no
        recommendations.
    """
    top_k = np.asarray(recommended_list)[:k]
    # No recommendations -> precision is defined as 0 (avoids a 0/0 -> NaN).
    if top_k.size == 0:
        return 0.0

    # Flag the *recommended* items (same orientation as money_precision_at_k):
    # flagging the bought items instead lets duplicate purchases push the
    # value above 1.
    hits = np.isin(top_k, np.asarray(bought_list))

    return hits.sum() / top_k.size


def money_precision_at_k(recommended_list, bought_list, prices_recommended, k=5):
    """Return money precision@k: the price-weighted share of relevant top-k recommendations.

    Args:
        recommended_list: Ranked sequence of recommended item ids, best first.
        bought_list: Sequence of item ids the user actually bought.
        prices_recommended: Prices aligned position-by-position with
            ``recommended_list``.
        k: Number of leading recommendations to take into account.

    Returns:
        Sum of prices of the top-k recommendations that were bought divided by
        the sum of prices of all top-k recommendations; ``0.0`` when that
        total is zero (including the empty case).

    Raises:
        ValueError: If the truncated recommendations and prices differ in length.
    """
    top_k = np.asarray(recommended_list)[:k]
    prices = np.asarray(prices_recommended, dtype=float)[:k]

    if top_k.shape != prices.shape:
        raise ValueError(
            'recommended_list and prices_recommended must have the same length'
        )

    total_price = prices.sum()
    # Empty or zero-priced recommendations: report 0 instead of dividing by zero.
    if total_price == 0:
        return 0.0

    hits = np.isin(top_k, np.asarray(bought_list))

    return np.dot(hits, prices) / total_price

In [2]:
!pip install implicit



In [3]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight#, tfidf_weight, ItemItemRecommender, CosineRecommender, TFIDFRecommender
from implicit.bpr import BayesianPersonalizedRanking

# Функции из 1-ого вебинара
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
#from src.metrics import precision_at_k, recall_at_k, money_precision_at_k
from src.utils import prefilter_items, postfilter_items
from src.recommenders import MainRecommender

### Загрузка датасета и данных

In [4]:
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [5]:
data['week_no'].nunique()

95

In [6]:
users, items, interactions = data.user_id.nunique(), data.item_id.nunique(), data.shape[0]

print('# users: ', users)
print('# items: ', items)
print('# interactions: ', interactions)

# users:  2499
# items:  89051
# interactions:  2396804


In [7]:
popularity = data.groupby('item_id')['sales_value'].sum().reset_index()
popularity.describe()

Unnamed: 0,item_id,sales_value
count,89051.0,89051.0
mean,5115772.0,83.458481
std,5178973.0,1628.715079
min,25671.0,0.0
25%,966583.0,3.5
50%,1448516.0,10.78
75%,9553042.0,46.105
max,18024560.0,467993.62


In [8]:
popularity = data.groupby('item_id')['user_id'].nunique().reset_index()
popularity.describe()

Unnamed: 0,item_id,user_id
count,89051.0,89051.0
mean,5115772.0,14.759767
std,5178973.0,45.904111
min,25671.0,1.0
25%,966583.0,1.0
50%,1448516.0,2.0
75%,9553042.0,10.0
max,18024560.0,2039.0


In [9]:
item_features = pd.read_csv('product.csv')
item_features.head(2)

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [10]:
user_features = pd.read_csv('hh_demographic.csv')
user_features.head(2)

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


### Создание обучающей и тестовой выборки

In [11]:
# Lower-case every column name, then map the raw key names (if present)
# onto the user_id / item_id names used throughout the notebook.
data.columns = [name.lower() for name in data.columns]
data.rename(columns={'household_key': 'user_id', 'product_id': 'item_id'}, inplace=True)

# Time-based hold-out: rows with week_no >= (last week - test_size_weeks)
# form the test set (the boundary week itself is included); earlier rows train.
test_size_weeks = 6
split_week = data['week_no'].max() - test_size_weeks

data_train = data[data['week_no'] < split_week]
data_test = data[data['week_no'] >= split_week]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [12]:
print(data_train['item_id'].nunique())
print(data_test['item_id'].nunique())

85334
30040


In [13]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[829323, 835108, 836423, 851515, 875240, 87737..."
1,2,"[895388, 8357614, 12301772, 821083, 828106, 83..."


In [14]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))

В тестовом дата сете 2197 юзеров
В тестовом дата сете 0 новых юзеров


In [15]:
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [16]:
item_features['sub_commodity_desc'] = pd.Categorical(item_features['sub_commodity_desc'])
item_features['sub_commodity_desc_code'] = item_features['sub_commodity_desc'].cat.codes
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,sub_commodity_desc_code
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,1079
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,1446


In [17]:
# Per-item totals of revenue and units over the training period.
# The columns are selected with a list (double brackets): tuple-style
# selection ['a', 'b'] on a GroupBy is rejected by pandas >= 2.0.
item_mean_cost = data_train.groupby('item_id')[['sales_value', 'quantity']].sum().reset_index()
# Mean unit price = total revenue / total units.
# NOTE(review): items whose total quantity is 0 end up with inf/NaN here — confirm downstream handling.
item_mean_cost['mean_price'] = item_mean_cost['sales_value'] / item_mean_cost['quantity']

In [18]:
# Training rows whose per-unit price (sales_value / quantity) is at least 7.
# NOTE(review): the project brief above asks for a price "> 7", while this filter uses ">= 7" — confirm which is intended.
unit_price = data_train['sales_value'] / data_train['quantity']
expensive_rows = data_train.loc[unit_price >= 7]

# Total units sold per expensive item, best seller first.
expensive_qty = expensive_rows.groupby('item_id')['quantity'].sum().reset_index()
expensive_qty = expensive_qty.sort_values('quantity', ascending=False)

# Keep only the id of the top-selling expensive item.
popular_exp_item = expensive_qty[:1].item_id.values[0]
popular_exp_item

6533765

In [19]:
n_items_before = data_train['item_id'].nunique()

data_train,top_popular = prefilter_items(data_train, take_n_popular=5000, item_mean_cost=item_mean_cost)

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

85294
26231
25589
25581
15744
Decreased # items from 85334 to 15744


In [20]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[829323, 835108, 836423, 851515, 875240, 87737..."
1,2,"[895388, 8357614, 12301772, 821083, 828106, 83..."


### Бейзлайны

In [21]:
def random_recommendation(items, n=5):
    """Return up to ``n`` distinct items drawn uniformly at random.

    Args:
        items: Sequence of candidate item ids.
        n: Desired number of recommendations.

    Returns:
        A list of distinct items sampled without replacement. It holds ``n``
        items, or every candidate (in random order) when fewer than ``n`` are
        available; an empty list when there are no candidates.
    """
    pool = np.asarray(items)
    # Sampling without replacement cannot return more items than the pool
    # holds, so cap the request instead of letting numpy raise.
    size = min(n, pool.size)
    if size == 0:
        return []

    picked = np.random.choice(pool, size=size, replace=False)

    return picked.tolist()

In [22]:
%%time

items = data_train.item_id.unique()
result['random_recommendation'] = result['user_id'].apply(lambda x: random_recommendation(items, n=5))

CPU times: user 669 ms, sys: 1.76 ms, total: 670 ms
Wall time: 672 ms


In [23]:
def popularity_recommendation(data, n=5):
    """Return the ids of the ``n`` items with the largest total ``sales_value``.

    Args:
        data: DataFrame with ``item_id`` and ``sales_value`` columns.
        n: Number of items to return.

    Returns:
        List of item ids ordered by descending summed ``sales_value``.
    """
    revenue = data.groupby('item_id')['sales_value'].sum().reset_index()
    ranked = revenue.sort_values('sales_value', ascending=False)

    return ranked['item_id'].head(n).tolist()

In [24]:
%%time

# Можно так делать, так как рекомендация не зависит от юзера
popular_recs = popularity_recommendation(data_train, n=5)
result['popular_recommendation'] = result['user_id'].apply(lambda x: popular_recs)

CPU times: user 38.4 ms, sys: 938 µs, total: 39.3 ms
Wall time: 39.9 ms


In [25]:
#бейзлайны случайных рекомендаций
rak_rr = result.apply(lambda row: recall_at_k(row['random_recommendation'], row['actual']), axis=1).mean()
pak_rr = result.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual']), axis=1).mean()

print('бейзлайн recall_at_k: {} '.format(rak_rr))
print('бейзлайн precision_at_k: {} '.format(pak_rr))

бейзлайн recall_at_k: 0.00019436742666888577 
бейзлайн precision_at_k: 0.003368229403732364 


In [26]:
#бейзлайны популярных рекомендаций
ratk_pr = result.apply(lambda row: recall_at_k(row['popular_recommendation'], row['actual']), axis=1).mean()
patk_pr = result.apply(lambda row: precision_at_k(row['popular_recommendation'], row['actual']), axis=1).mean()

print('бейзлайн recall_at_k: {} '.format(ratk_pr))
print('бейзлайн precision_at_k: {} '.format(patk_pr))

бейзлайн recall_at_k: 0.006683495608789089 
бейзлайн precision_at_k: 0.09440145653163352 


In [27]:
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation
0,1,"[829323, 835108, 836423, 851515, 875240, 87737...","[1121367, 895817, 14077530, 930545, 1005609]","[916122, 6533765, 5569230, 844179, 1044078]"
1,2,"[895388, 8357614, 12301772, 821083, 828106, 83...","[1036106, 885917, 1123022, 1037387, 909811]","[916122, 6533765, 5569230, 844179, 1044078]"


### Обучение модели

In [28]:
rec = MainRecommender(data = data_train, 
                      top_popular = top_popular, 
                      item_features = item_features, 
                      item_mean_cost = item_mean_cost, 
                      popular_exp_item = popular_exp_item, 
                      weighting = True)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=15744.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2494.0), HTML(value='')))




In [29]:
result['item_bm25'] = result['user_id'].apply(lambda x: rec.get_recommendations(x, model=rec.own_recommender, N=3000))    

In [30]:
result['als_bm25'] = result['user_id'].apply(lambda x: rec.get_recommendations(x, model=rec.model, N=3000))

In [31]:
%%time

result['result'] = result['user_id'].apply(lambda x: postfilter_items(user = x, 
                                                                      data = result, 
                                                                      data_t = data_train, 
                                                                      item_features = rec.item_features, 
                                                                      col = 'item_bm25', 
                                                                      N = 5, 
                                                                      item_mean_cost = rec.item_mean_cost, 
                                                                      all_rec = rec.all_recommendations, 
                                                                      top = rec.top_popular, 
                                                                      userid = rec.userid_to_id, 
                                                                      id_to_itemid = rec.id_to_itemid, 
                                                                      popular_exp_item = rec.popular_exp_item))

CPU times: user 6min 12s, sys: 22.5 s, total: 6min 34s
Wall time: 6min 37s


In [32]:
result['price_recommendation'] = result['result'].apply(lambda x: rec.get_price_recommendations(x))

### Получение целевых метрик

In [33]:
ratk = result.apply(lambda row: recall_at_k(row['result'], row['actual']), axis=1).mean()
patk = result.apply(lambda row: precision_at_k(row['result'], row['actual']), axis=1).mean()
mp5 = result.apply(lambda row: money_precision_at_k(row['result'], row['actual'], row['price_recommendation']), axis=1).mean()

print('recall_at_k: {} '.format(ratk))
print('precision_at_k: {} '.format(patk))
print('money_precision@5: {} '.format(mp5))

recall_at_k: 0.022858849538524906 
precision_at_k: 0.23008204193253887 
money_precision@5: 0.2011778288141283 


### Сохранение результатов

In [34]:
result['n_item'] = result['result'].apply(lambda x: len(x))
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,item_bm25,als_bm25,result,price_recommendation,n_item
0,1,"[829323, 835108, 836423, 851515, 875240, 87737...","[1121367, 895817, 14077530, 930545, 1005609]","[916122, 6533765, 5569230, 844179, 1044078]","[856942, 940947, 5577022, 9297615, 934369, 965...","[885290, 1082212, 1075074, 896085, 934369, 110...","[856942, 5577022, 934369, 1075074, 1021164]","[2.7754485049833875, 2.7398888888888853, 1.480...",5
1,2,"[895388, 8357614, 12301772, 821083, 828106, 83...","[1036106, 885917, 1123022, 1037387, 909811]","[916122, 6533765, 5569230, 844179, 1044078]","[1075368, 1040807, 8090521, 1076580, 911974, 9...","[5569230, 1021324, 8090521, 1004906, 1075368, ...","[1040807, 911974, 1075368, 8090521, 1021324]","[3.8436645962732956, 2.278136363636363, 3.5588...",5


In [35]:
# Count the incomplete recommendation lists BEFORE dropping them: counting
# after the filter always reports 0 regardless of the data.
n_incomplete = int((result['n_item'] != 5).sum())
print('Количество рекомендаций менее 5: ', n_incomplete)

# Keep only the users that received exactly 5 recommendations.
result = result.loc[result['n_item'] == 5]

Количество рекомендаций менее 5:  0


In [36]:
result_csv = result[['user_id', 'result']]

In [37]:
result_csv.to_csv('RS_coursework_GermanDU.csv', index=False)