In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [29]:
data = pd.read_csv('./data/retail_train.csv')
data.sample(10)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
429922,2459,29273233916,171,1061220,1,1.06,384,-0.12,1900,25,0.0,0.0
880135,99,31254447316,288,824073,1,1.89,31782,-0.1,2044,42,0.0,0.0
1781065,1861,35080126658,511,1068719,1,2.29,297,0.0,2126,74,0.0,0.0
1125987,2388,32113737482,348,898212,1,2.0,448,-0.59,1853,50,0.0,0.0
593119,2341,30054587514,214,9337581,1,1.0,439,-0.39,1607,31,0.0,0.0
1876885,455,35840825723,535,12525270,1,7.49,450,0.0,1217,77,0.0,0.0
70958,110,27773974688,67,1082185,1,1.41,372,0.0,1840,10,0.0,0.0
1832455,2479,35573782753,524,909714,1,2.25,406,0.0,1618,76,0.0,0.0
1016958,2208,31769832357,321,870513,1,3.99,292,-1.0,1902,47,0.0,0.0
2194238,882,40888830773,615,1106523,2,3.98,31782,-1.6,1318,89,0.0,0.0


In [30]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [31]:
def weights(data):
    
    items_weights = data.groupby('item_id')['sales_value'].sum().reset_index()
    items_weights['weight'] = items_weights.sales_value / items_weights.sales_value.sum()
    items_weights.drop(columns='sales_value', inplace=True)
    
    return items_weights

In [32]:
def weighted_random_recommendation(items_weights, n=5):
    
    recs = np.random.choice(items_weights.item_id, size=n, replace=False,
                            p=np.array(items_weights.weight)
                           )
    
    
    return recs.tolist()

In [33]:
result_weights = data_test.groupby('user_id')['item_id'].unique().reset_index()
result_weights.columns = ['user_id', 'actual']

In [34]:
# нахождение весов предметов

items_weights = weights(data_train)

In [35]:
%%time

# создание рекомендаций пользователям из test data


result_weights['weighted_random_recommendation'] = result_weights['user_id'].apply(lambda x: weighted_random_recommendation(items_weights, n=5))
result_weights.head()

CPU times: total: 2.8 s
Wall time: 2.8 s


Unnamed: 0,user_id,actual,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[835278, 6602329, 969932, 936753, 830157]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[833351, 1029743, 10284878, 1133018, 981179]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1113415, 875804, 916122, 12352054, 5576391]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[5569230, 10344971, 961554, 997415, 1127831]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[6534178, 867668, 1098025, 1094993, 901253]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [36]:
result = pd.read_csv('../predictions_basic.csv')

In [37]:
result = pd.concat([result, result_weights.weighted_random_recommendation], axis=1)
result.head(1)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,weighted_random_recommendation
0,1,[ 821867 834484 856942 865456 889248 ...,"[1075637, 1008383, 2054496, 1244489, 5574878]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1098066, 995242, 981760, 1127831, 1082185]","[961554, 1098066, 1127831, 981760, 1082185]","[961554, 1098066, 1127831, 981760, 1082185]","[1081177, 995785, 1004906, 1082185, 1029743]","[835278, 6602329, 969932, 936753, 830157]"


In [38]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(recommended_list, bought_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision


In [41]:
(result.random_recommendation[1])

ValueError: invalid literal for int() with base 10: '[945951, 7025094, 1018958, 888996, 1074158]'

In [39]:
result.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual']), axis=1).mean()

IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed

### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [None]:
# your_code