In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [7]:
data = pd.read_csv('retail_train.csv')
data.describe()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
count,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0
mean,1271.904,33366430000.0,363.977,2827247.0,100.3763,3.100821,3048.227,-0.5400708,1561.714,52.68156,-0.01638696,-0.002897905
std,726.5644,4284798000.0,175.9385,3732798.0,1152.379,4.210229,8785.542,1.245824,401.5691,25.1331,0.2168615,0.03974618
min,1.0,26984850000.0,1.0,25671.0,0.0,0.0,1.0,-130.02,0.0,1.0,-55.93,-7.7
25%,655.0,30087140000.0,216.0,916993.0,1.0,1.29,330.0,-0.69,1307.0,32.0,0.0,0.0
50%,1271.0,32419980000.0,366.0,1027569.0,1.0,2.0,370.0,-0.02,1614.0,53.0,0.0,0.0
75%,1914.0,35145800000.0,515.0,1132178.0,1.0,3.49,422.0,0.0,1844.0,74.0,0.0,0.0
max,2500.0,41656790000.0,663.0,18024560.0,89638.0,840.0,34280.0,3.99,2359.0,95.0,0.0,0.0


In [8]:
popularity = data.groupby('item_id')['user_id'].nunique().reset_index()
popularity = popularity.query("user_id < 11")

In [9]:
popularity = data.groupby('item_id')['sales_value'].sum().reset_index()
popularity = popularity.query("sales_value < 50")

In [10]:
popularity.describe()

Unnamed: 0,item_id,sales_value
count,67876.0,67876.0
mean,5527155.0,11.170782
std,5311787.0,11.757266
min,25671.0,0.0
25%,988454.0,2.79
50%,2033826.0,6.0
75%,9860228.0,15.66
max,18024560.0,49.99


In [72]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [73]:
data_train

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
1085442,1609,32006114302,339,12484608,3,840.00,412,0.0,2038,49,0.0,0.0
2030766,346,40387571385,574,948670,5,631.80,415,0.0,1312,83,0.0,0.0
655985,125,30515165970,230,1089093,2,505.00,323,0.0,1231,34,0.0,0.0
1152895,374,32187143334,355,13212959,1,499.99,361,-50.0,1740,51,0.0,0.0
547657,125,30031850855,201,1089093,1,455.00,323,0.0,1059,29,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
121812,1048,28023838277,87,1093578,0,0.00,318,0.0,1845,13,0.0,0.0
176943,1193,28243385012,105,1102959,0,0.00,334,0.0,200,16,0.0,0.0
281098,1245,28674965013,133,870515,0,0.00,296,0.0,306,20,0.0,0.0
1904947,1098,36029791120,543,5978648,0,0.00,343,0.0,5,78,0.0,0.0


# Оценивание
За выполнени каждого задания 1 балл

4 балла -> отл

3 балла -> хор

И тд

### Задание 0. Товар 999999
На вебинаре мы использовали товар 999999 - это товар, который купили пользователи, если они купиши товар из top5000. Используя этот товар мы смещяем качество рекомендаций. В какую сторону? Уберите этот товар и сравните с качеством на семинаре.

In [None]:
Мне кажеться мы смещаем качество в сторону увиличения, так как сужаем дианазон варинатов. 

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [74]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(4)

Unnamed: 0,user_id,actual
0,1,"[8090541, 15971874, 1132814, 1132771, 9296986,..."
1,3,"[958154, 1053690, 1083328, 920626, 1096727, 13..."
2,6,"[909479, 6553035, 1329768, 1006718, 1075214, 1..."
3,7,"[1004122, 1038184, 1018769, 14111539, 15716073..."


In [12]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    # your_code
    popularity = data.groupby('item_id')['sales_value'].sum().reset_index()
    popularity = popularity.query("sales_value < 50")
    popularity['weight'] =np.log(popularity['sales_value'])/np.log(popularity['sales_value'].sum())
    popularity['weight'] = np.clip(popularity['weight'], 0, 89051)
    popularity['weight'] =popularity['weight']/popularity['weight'].sum()
    recs = np.random.choice(popularity['item_id'], size=n, replace=False, p= popularity['weight'])

 
    
    return recs.tolist()

Сделайте предсказания

In [39]:
%%time


result['weighted_random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(data_train, n=5))
result.head(2)
# your_code

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  popularity['weight'] =np.log(popularity['sales_value'])/np.log(popularity['sales_value'].sum())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  popularity['weight'] = np.clip(popularity['weight'], 0, 89051)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  popularity['weight'] =popularity['weight']/pop

Wall time: 3min 25s


Unnamed: 0,user_id,actual,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[5656623, 5586068, 6443175, 2005115, 5565456]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1060526, 121715, 8160336, 920749, 851614]"


In [42]:
result.apply(lambda row: precision_at_k(row['weighted_random_recommendation'], row['actual']), axis=1).mean()

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False]
[False False False False False False False False False False Fals

[False False False False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False Fal

 False False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False]
[False False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 Fals

0.00019588638589618023

### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [76]:
result_test = pd.read_csv('predictions_basic.csv')
result_test.head()

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[1088431, 16734400, 1048955, 2988597, 894298]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 1098066]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[1945975, 863283, 972295, 1409814, 825220]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1098066, 995242]","[1082185, 1098066, 981760, 999999, 826249]","[1082185, 981760, 1098066, 826249, 999999]","[999999, 1082185, 1098066, 6534178, 1127831]"
2,6,[ 920308 926804 946489 1006718 1017061 ...,"[8165506, 886064, 1058046, 12812329, 843376]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 999999, 981760, 1127831, 1098066]","[1082185, 981760, 1127831, 999999, 878996]","[999999, 1082185, 1029743, 6534178, 1127831]"
3,7,[ 840386 889774 898068 909714 929067 ...,"[885517, 9446741, 1105769, 1126420, 359526]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 995242]","[1082185, 981760, 999999, 1127831, 961554]","[1082185, 981760, 1127831, 999999, 961554]","[999999, 1082185, 1029743, 1127831, 995785]"
4,8,[ 835098 872137 910439 924610 992977 ...,"[12757007, 1123321, 858743, 12171210, 1418249]","[6534178, 6533889, 1029743, 6534166, 1082185]","[999999, 1082185, 981760, 1127831, 1098066]","[1082185, 981760, 999999, 1098066, 826249]","[1082185, 981760, 999999, 1098066, 826249]","[999999, 1082185, 1029743, 1098066, 6534178]"


In [16]:
# your_code

def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    print(flags)
    
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision


In [131]:
result_test.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual']), axis=1).mean()

IndexError: too many indices for array

### Задание 3. Улучшение бейзлайнов и ItemItem¶
- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей  𝐾 .
- *Попробуйте стратегии ансамблирования изученных алгоритмов
- Обязательно нужно сделать первые 2 пункта!

In [77]:
result_top = data_test.groupby('user_id')['item_id'].unique().reset_index()
result_top.columns=['user_id', 'actual']
result_top.head(10)

Unnamed: 0,user_id,actual
0,1,"[8090541, 15971874, 1132814, 1132771, 9296986,..."
1,3,"[958154, 1053690, 1083328, 920626, 1096727, 13..."
2,6,"[909479, 6553035, 1329768, 1006718, 1075214, 1..."
3,7,"[1004122, 1038184, 1018769, 14111539, 15716073..."
4,8,"[12172071, 854405, 825749, 1077410, 13115915, ..."
5,9,"[864335, 10457112, 1029743, 9297474, 990865, 8..."
6,13,"[6534178, 12695858, 14025593, 7409999, 6773223..."
7,14,"[1025611, 965693, 952408, 14025398, 925205, 69..."
8,15,"[828935, 1092303, 1082269, 1082310, 13073175, ..."
9,16,"[13007710, 1062973, 1082185]"


In [19]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [48]:
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()
top_5000

[6534178,
 6533889,
 6534166,
 6544236,
 1404121,
 397896,
 1426702,
 5703832,
 480014,
 5668996,
 6410462,
 420647,
 5747420,
 731106,
 5716076,
 707683,
 202291,
 1153346,
 1082185,
 5747233,
 6410464,
 545926,
 995242,
 2848087,
 1388206,
 1029743,
 5712216,
 5850988,
 1133018,
 1106523,
 1007195,
 981760,
 5845857,
 883404,
 1127831,
 2690723,
 866227,
 995785,
 860776,
 951590,
 5569230,
 908531,
 916122,
 826249,
 1098066,
 862349,
 1058997,
 1044078,
 904360,
 840361,
 923746,
 1126899,
 849843,
 961554,
 1005186,
 820165,
 1053690,
 844179,
 844165,
 1070820,
 1065593,
 834484,
 994928,
 833715,
 859075,
 1022003,
 1013321,
 938700,
 854852,
 986947,
 5569471,
 927191,
 1071939,
 1096036,
 1004906,
 986912,
 1080414,
 914190,
 908846,
 962229,
 8090521,
 1085604,
 911878,
 1092026,
 866211,
 1068719,
 878996,
 8090537,
 1081177,
 847270,
 929668,
 1024306,
 909894,
 907014,
 903325,
 910032,
 1095275,
 833025,
 862139,
 962568,
 953476,
 847982,
 999971,
 968215,
 1028816,
 976

In [78]:
def random_recommendation(items, n=5):
    """Случайные рекоммендации"""
    
    
    items = np.array(items)
    recs = np.random.choice(items, size=n, replace=False)
    
    return recs.tolist()

In [107]:
result_top['random_recommendation'] = result_top['user_id'].apply(lambda x: random_recommendation(top_5000, n=5))


In [80]:
result_top.apply(lambda row: precision_at_k(row['random_recommendation'], row['actual']), axis=1).mean()

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False]
[False False False False False False False False False False Fals

 False False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False]
[False False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False F

[False False False False False False False False False False]
[False False]
[False False False False False False False False False False False False
 False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False Fa

 False False False False False]
[False False False False False False False False False False False False
 False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False]
[False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False]
[False False]
[False False False False False False False False False False False False
 False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 Fa

 False False False False False False False False False False]
[False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 Fal

0.006953966699314389

In [102]:
def popularity_recommendation_TOP(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['sales_value'].sum().reset_index()
    popular= popular.sort_values('sales_value', ascending=False).head(5000)
   
    
    recs = popular.head(n).item_id
    
    return recs.tolist()

In [105]:
popular_recs = popularity_recommendation_TOP(data_train, n=5)

result_top['popularity_recommendation_TOP'] = result_top['user_id'].apply(lambda x: popular_recs)


Unnamed: 0,user_id,actual,random_recommendation,popularity_recommendation_TOP
0,1,"[8090541, 15971874, 1132814, 1132771, 9296986,...","[852572, 912949, 6391497, 821025, 1003836]","[6534178, 6533889, 1029743, 6534166, 1082185]"
1,3,"[958154, 1053690, 1083328, 920626, 1096727, 13...","[889511, 1040935, 843464, 1001521, 1112825]","[6534178, 6533889, 1029743, 6534166, 1082185]"


In [106]:
result_top.apply(lambda row:  precision_at_k(row['popularity_recommendation_TOP'], row['actual']), axis=1).mean()

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
  True False False False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False]
[False False False False False False False False False False Fals

 False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False  True False False False False False False False False False False
 False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False

 False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
  True False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False

 False False False False False False False False False]
[ True False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False  True False False False False False False False False False False
 False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False  True False False False False]
[False  True False False False False False False False False False False
 False False False False False False False False  True False False False
 False False False False False False False False False False False False
 False False False False False False Fals

 False False False False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False]
[False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False  True False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False Fals

 False False False False False False False False False]
[ True False  True False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False]
[ True False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False  True
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False]
[False False False False Fal

0.15523996082272082

Какие смысл имееют $ \mu $ и $ \bar{r_u}$ ?

Реализуйте алгоритм, прогнозирующий рейтинги на основе данной формулы, на numpy (векторизованно!)

В качестве схожести возьмите CosineSimilarity.

Примените к user_item_matrix. В качестве рейтингов возьмите количество или стоимость купленного товара. 
Данный алгоритм предсказывает рейтинги. Как на основании предсказанных рейтингов предсказать факт покупки?

Предложите вариант.
Посчитайте accuracy@5 и сравните с алгоритмами, разобранными на вебинаре.

In [108]:
# Заведем фиктивный item_id (если юзер покупал товары из топ-5000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
user_item_matrix.sum().sum() / (user_item_matrix.shape[0] * user_item_matrix.shape[1]) * 100

5.33770796861036

In [111]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [144]:
### K=12 оптимальное число, дальше качество падает .

%%time

model = ItemItemRecommender(K=12, num_threads=4) # K - кол-во билжайших соседей

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

recs = model.recommend(userid=userid_to_id[2],  # userid - id от 0 до N
                        user_items=csr_matrix(user_item_matrix).tocsr(),   # на вход user-item matrix
                        N=5, # кол-во рекомендаций 
                        filter_already_liked_items=False, 
                        filter_items=None, 
                        recalculate_user=True)

HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))


Wall time: 987 ms


In [145]:
%%time

result['itemitem'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)])

Wall time: 89.8 ms


In [146]:
result.head(10)

Unnamed: 0,user_id,actual,itemitem
0,1,"[8090541, 15971874, 1132814, 1132771, 9296986,...","[999999, 1082185, 981760, 995242, 840361]"
1,3,"[958154, 1053690, 1083328, 920626, 1096727, 13...","[999999, 1082185, 981760, 1098066, 826249]"
2,6,"[909479, 6553035, 1329768, 1006718, 1075214, 1...","[999999, 1082185, 981760, 840361, 995242]"
3,7,"[1004122, 1038184, 1018769, 14111539, 15716073...","[999999, 1082185, 981760, 995242, 840361]"
4,8,"[12172071, 854405, 825749, 1077410, 13115915, ...","[999999, 1082185, 981760, 995242, 840361]"
5,9,"[864335, 10457112, 1029743, 9297474, 990865, 8...","[999999, 1082185, 981760, 1098066, 995242]"
6,13,"[6534178, 12695858, 14025593, 7409999, 6773223...","[999999, 1082185, 981760, 840361, 1098066]"
7,14,"[1025611, 965693, 952408, 14025398, 925205, 69...","[999999, 1082185, 981760, 1098066, 826249]"
8,15,"[828935, 1092303, 1082269, 1082310, 13073175, ...","[999999, 1082185, 981760, 995242, 840361]"
9,16,"[13007710, 1062973, 1082185]","[999999, 1082185, 981760, 840361, 995242]"


In [147]:
result.apply(lambda row: precision_at_k(row['itemitem'], row['actual']), axis=1).mean()

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False  True False False False
 False False False False False False False False False False False False
  True False False False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False  True False False False False False False
 False False False False False False False False False False False False
 False False False  True False False False False False False False False
 False False]
[False False False False False False False False False False Fals

 False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False  True False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False

 False False False False False False False False False False False False]
[False False False False False False False False False False False]
[False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False]
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False Fal

0.1533790401567072

### Задание 4. Улучшение детерминированных алгоритмов
На семинаре мы рассматривали 



Далее $U \equiv N_i(u) $

$$r_{u,i} =  \frac{1}{S}\sum\limits_{v \in U}\operatorname{sim}(u,v)r_{v, i}$$
$$ S = \sum\limits_{v \in U} \operatorname{sim}(u,v)$$

Предлагается улучшить эту формулу и учесть средние предпочтения всех пользователей

$$r_{u,i} = \mu + \bar{r_u} + \frac{1}{S}\sum\limits_{v \in U}\operatorname{sim}(u,v)(r_{v, i}-\bar{r_{v}} - \mu)$$