In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

In [5]:
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [6]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [7]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    # your_code
      
    items = np.array(items_weights['item_id'])
    weights = np.array(items_weights['weight'])
    recs = np.random.choice(items, size=n, replace=False, p=weights)
        
    return recs.tolist()

In [210]:
%%time

# your_code
# пробовал логарифм применять к сумме продаж, но видимо из-за нулевых продаж есть бесконечности и сделать потом веса нормальные не получается

items_weights = data_train.groupby('item_id')['sales_value'].sum().reset_index()
items_weights['weight'] = items_weights['sales_value'].apply(lambda x: x/items_weights['sales_value'].sum())
items_weights.sort_values('sales_value', ascending=False).head(5)

CPU times: user 29.3 s, sys: 71.8 ms, total: 29.3 s
Wall time: 29.3 s


Unnamed: 0,item_id,sales_value,weight
55470,6534178,447799.94,0.063498
55430,6533889,40483.34,0.005741
28895,1029743,35764.66,0.005071
55465,6534166,30170.77,0.004278
34707,1082185,26029.96,0.003691


In [211]:
items_weights['weight'].sum()

0.9999999999999999

In [212]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

result['weighted_random_recommendation'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights, n=5))
result.head(2)

Unnamed: 0,user_id,actual,weighted_random_recommendation
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[896085, 1124732, 7168890, 9527160, 891961]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6534178, 952163, 5569471, 6979717, 948254]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [176]:
result = pd.read_csv('predictions_basic.csv')
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [177]:
# your_code

result['actual'] = result['actual'].apply(lambda x: x.strip('[]').split())
result['random_recommendation'] = result['random_recommendation'].apply(lambda x: x.strip('[]').split(sep=', '))
result['popular_recommendation'] = result['popular_recommendation'].apply(lambda x: x.strip('[]').split(sep=', '))
result['itemitem'] = result['itemitem'].apply(lambda x: x.strip('[]').split(sep=', '))
result['cosine'] = result['cosine'].apply(lambda x: x.strip('[]').split(sep=', '))
result['tfidf'] = result['tfidf'].apply(lambda x: x.strip('[]').split(sep=', '))
result['own_purchases'] = result['own_purchases'].apply(lambda x: x.strip('[]').split(sep=', '))
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [73]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    return precision

In [187]:
result['precision_at_k_random_recommendation'] = [precision_at_k(result['random_recommendation'][x], result['actual'][x], k=5) for x in result.index]
result.sort_values('precision_at_k_random_recommendation', ascending=False).head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,precision_at_k_random_recommendation
406,511,"[844165, 883404, 905539, 999779, 1095700, 8421...","[1071939, 1031545, 13217915, 1015607, 6463472]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1127831, 904360, 995785, 12949590, 13190294]","[1127831, 995785, 1092026, 1108092, 1058554]","[995785, 1127831, 904360, 991435, 1068745]","[999999, 1082185, 1029743, 1098066, 6534178]",0.2
380,477,"[823758, 873654, 881883, 926597, 941307, 97564...","[8159332, 17329749, 994430, 88223, 1075282]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 826249, 1106523, 995785]","[981760, 1127831, 1004906, 826249, 848561]","[981760, 1127831, 826249, 923746, 848561]","[999999, 1082185, 1029743, 1098066, 6534178]",0.2


In [188]:
result['precision_at_k_popular_recommendation'] = [precision_at_k(result['popular_recommendation'][x], result['actual'][x], k=5) for x in result.index]
result.sort_values('precision_at_k_popular_recommendation', ascending=False).head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,precision_at_k_random_recommendation,precision_at_k_popular_recommendation
1761,2149,"[1033207, 1106523, 5978656, 942565, 15717219, ...","[915539, 1095926, 476254, 2062838, 97991]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1127831, 995242, 1098066, 826249, 840361]","[1127831, 1098066, 961554, 840361, 995785]","[1127831, 961554, 860776, 995242, 995785]","[999999, 1082185, 6534178, 1081177, 1068719]",0.0,0.8
559,707,"[847344, 995816, 1077048, 1096036, 9926165, 13...","[901302, 1080448, 953333, 2844067, 265230]","[6534178, 6533889, 1029743, 6534166, 1082185]","[883404, 1133018, 908531, 1004906, 1033142]","[883404, 1004906, 1137688, 1071939, 1033142]","[883404, 1004906, 1071939, 1133018, 1033142]","[999999, 1082185, 1029743, 1098066, 6534178]",0.0,0.8


In [189]:
result['precision_at_k_itemitem'] = [precision_at_k(result['itemitem'][x], result['actual'][x], k=5) for x in result.index]
result.sort_values('precision_at_k_itemitem', ascending=False).head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,precision_at_k_random_recommendation,precision_at_k_popular_recommendation,precision_at_k_itemitem
1903,2331,"[1001680, 1085559, 1130858, 10204393, 12172351...","[2662416, 1131246, 9297248, 5567715, 13213115]","[6534178, 6533889, 1029743, 6534166, 1082185]","[883404, 1106523, 916122, 1068719, 15926886]","[883404, 1068719, 916122, 1096036, 1080414]","[883404, 1068719, 916122, 1096036, 1080414]","[999999, 1082185, 1029743, 1098066, 6534178]",0.0,0.4,0.6
1653,2019,"[844165, 12781763, 13382073, 823704, 965444, 9...","[6855151, 830920, 15863928, 1840556, 933569]","[6534178, 6533889, 1029743, 6534166, 1082185]","[961554, 916122, 866211, 13987135, 16809471]","[961554, 1132770, 866211, 5590695, 856772]","[961554, 1132770, 916122, 838602, 1017041]","[999999, 1082185, 1029743, 1098066, 6534178]",0.0,0.4,0.6


In [190]:
result['precision_at_k_cosine'] = [precision_at_k(result['cosine'][x], result['actual'][x], k=5) for x in result.index]
result.sort_values('precision_at_k_cosine', ascending=False).head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,precision_at_k_random_recommendation,precision_at_k_popular_recommendation,precision_at_k_itemitem,precision_at_k_cosine
916,1131,"[851287, 913278, 920308, 945914, 993638, 10438...","[6391277, 1067204, 33931, 120578, 1158433]","[6534178, 6533889, 1029743, 6534166, 1082185]","[840361, 995785, 961554, 1106523, 1005186]","[961554, 1004906, 995785, 840361, 860776]","[840361, 961554, 1004906, 995785, 860776]","[999999, 1082185, 1029743, 1098066, 1127831]",0.0,0.4,0.4,0.6
1401,1724,"[824005, 837413, 839656, 852864, 860776, 87944...","[13095625, 1099157, 1045686, 952532, 1371262]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1098066, 1029743, 826249]","[981760, 1098066, 995785, 962568, 860776]","[981760, 1098066, 962568, 995785, 995242]","[999999, 1082185, 6534178, 1127831, 1133018]",0.0,0.2,0.2,0.6


In [191]:
result['precision_at_k_tfidf'] = [precision_at_k(result['tfidf'][x], result['actual'][x], k=5) for x in result.index]
result.sort_values('precision_at_k_tfidf', ascending=False).head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,precision_at_k_random_recommendation,precision_at_k_popular_recommendation,precision_at_k_itemitem,precision_at_k_cosine,precision_at_k_tfidf
1505,1848,"[831063, 831587, 834491, 834993, 835098, 84036...","[986006, 903895, 5571859, 1028243, 1601122]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1127831, 840361, 1029743, 961554, 13842088]","[1127831, 961554, 840361, 860776, 844179]","[1127831, 961554, 840361, 860776, 844179]","[999999, 1082185, 1098066, 6534178, 995785]",0.0,0.4,0.6,0.6,0.6
916,1131,"[851287, 913278, 920308, 945914, 993638, 10438...","[6391277, 1067204, 33931, 120578, 1158433]","[6534178, 6533889, 1029743, 6534166, 1082185]","[840361, 995785, 961554, 1106523, 1005186]","[961554, 1004906, 995785, 840361, 860776]","[840361, 961554, 1004906, 995785, 860776]","[999999, 1082185, 1029743, 1098066, 1127831]",0.0,0.4,0.4,0.6,0.6


In [195]:
result['precision_at_k_own_purchases'] = [precision_at_k(result['own_purchases'][x], result['actual'][x], k=5) for x in result.index]
result.sort_values('precision_at_k_own_purchases', ascending=False).head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,precision_at_k_random_recommendation,precision_at_k_popular_recommendation,precision_at_k_itemitem,precision_at_k_cosine,precision_at_k_tfidf,precision_at_k_own_purchases
1731,2115,"[6534178, 885086, 1070820, 819927, 844179, 850...","[12301870, 7144136, 8144459, 1039855, 5564922]","[6534178, 6533889, 1029743, 6534166, 1082185]","[923746, 1000753, 854852, 15926844, 15926886]","[854852, 5584027, 824555, 5568378, 1041259]","[854852, 824555, 923746, 5584027, 979707]","[999999, 1082185, 1029743, 1098066, 6534178]",0.0,0.6,0.0,0.0,0.0,0.8
1202,1489,"[869220, 929433, 1018588, 1025581, 1098066, 65...","[10356021, 9420212, 1174463, 9837131, 6545978]","[6534178, 6533889, 1029743, 6534166, 1082185]","[15596488, 15596515, 15926712, 15926844, 15831...","[860776, 821344, 833025, 1079292, 1124729]","[860776, 821344, 1079292, 1043590, 1136257]","[999999, 1082185, 1029743, 1098066, 6534178]",0.0,0.6,0.2,0.2,0.2,0.8


In [196]:
#  Последний алгоритм own_purchases показывает лучшее качество!

### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [None]:
# your_code