In [None]:
import ast

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k



In [None]:
# Load the retail transactions used for both training and evaluation,
# then preview the first two rows.
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [None]:
test_size_weeks = 3

# First week_no that belongs to the hold-out period.
# NOTE(review): `>=` puts week `max - test_size_weeks` itself into the test set,
# so the hold-out can span test_size_weeks + 1 distinct week_no values. Left
# unchanged because the `actual` lists in predictions_basic.csv appear to come
# from this same split -- confirm before changing the boundary.
test_start_week = data['week_no'].max() - test_size_weeks

# .copy() makes both frames independent of `data`; without it, later in-place
# assignments into data_train raise pandas' SettingWithCopyWarning.
data_train = data[data['week_no'] < test_start_week].copy()
data_test = data[data['week_no'] >= test_start_week].copy()

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекомендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямо пропорционально популярности. Вес = log(sales_sum товара)

In [None]:
def weighted_random_recommendation(items_weights, n=5):
    """Sample ``n`` distinct items at random, proportionally to their weights.

    Parameters
    ----------
    items_weights : pd.DataFrame
        Frame with the columns ``item_id`` and ``weight``. Weights must be
        non-negative; they are normalised here, so they do not have to sum
        to 1 exactly.
    n : int, default 5
        Number of items to draw (without replacement).

    Returns
    -------
    list
        The sampled item ids.

    Raises
    ------
    ValueError
        If a weight is negative, if the weights do not add up to a positive
        finite number, or (raised by ``np.random.choice``) if fewer than ``n``
        items have a non-zero weight.
    """
    item_ids = items_weights['item_id'].to_numpy()
    weights = items_weights['weight'].to_numpy(dtype=float)

    total = weights.sum()
    if (weights < 0).any() or not np.isfinite(total) or total <= 0:
        raise ValueError("weights must be non-negative and sum to a positive finite value")

    # np.random.choice rejects probabilities that do not sum to 1, so rescale
    # instead of relying on every caller to have normalised the column.
    recs = np.random.choice(item_ids, size=n, replace=False, p=weights / total)

    return recs.tolist()

In [None]:
# weighted random recommendation by quantity
# Weight of an item = its share of the total quantity sold in the training data.
qp = data_train.groupby('item_id')['quantity'].sum().reset_index()
# Vectorised division: the total is computed once, not once per row as with
# .apply(lambda x: x / qp['quantity'].sum()).
qp['weight'] = qp['quantity'] / qp['quantity'].sum()
qp = qp[['item_id', 'weight']]

In [None]:
%%time
# Draw five distinct items; an item's chance follows its share of total quantity sold.
weighted_random_recommendation(qp, n=5)

CPU times: user 2.71 ms, sys: 42 µs, total: 2.75 ms
Wall time: 2.76 ms


[6534178, 6534166, 823356, 480014, 6533889]

In [None]:
# weighted random recommendation by sales_sum
# Weight of an item is proportional to log(1 + total sales_value); the +1
# keeps the weight non-negative for items with total sales below 1.
sp = data_train.groupby('item_id')['sales_value'].sum().reset_index()
log_sales = np.log(1 + sp['sales_value'])
# Normalise with a single vectorised division instead of recomputing the
# column sum inside a per-row .apply.
sp['weight'] = log_sales / log_sales.sum()
sp = sp[['item_id', 'weight']]

In [None]:
%%time
# Draw five distinct items; an item's chance follows its log-sales weight.
weighted_random_recommendation(sp, n=5)

CPU times: user 2.41 ms, sys: 15 µs, total: 2.42 ms
Wall time: 2.69 ms


[8249092, 827100, 926646, 9655033, 13382051]

### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [None]:
# Load precomputed per-user predictions: the purchased items ('actual') plus
# one column of recommendations per algorithm. All list columns arrive as strings.
result = pd.read_csv('predictions_basic.csv')
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [None]:
# Изменение типа 'actual' с 'str' на 'list'
def _parse_id_array(raw):
    """Turn a bracketed, whitespace-separated id string into a list of ints."""
    tokens = raw.replace('\n', '').replace('[', '').replace(']', '').split()
    return [int(token) for token in tokens]


# Convert the 'actual' column from str to list.
result['actual'] = result['actual'].map(_parse_id_array)

In [None]:
# Изменение типов колонок с рекомендациями с 'str' на 'list'
recommend = ['random_recommendation', 'popular_recommendation', 
             'itemitem', 'cosine', 'tfidf', 'own_purchases']
# Convert the recommendation columns from str to list.
# ast.literal_eval only accepts Python literals, so a malformed or hostile
# CSV cell cannot execute arbitrary code the way eval() would.
for col in recommend:
    result[col] = result[col].apply(ast.literal_eval)
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"


In [None]:
def precision_at_k(recommended_list, bought_list, k=5):
    """Share of the top-``k`` recommendations that the user actually bought.

    NOTE: this definition rebinds the name ``precision_at_k`` that the import
    cell also brings in from ``implicit.evaluation``.

    Parameters
    ----------
    recommended_list : sequence
        Recommended item ids, best first.
    bought_list : sequence
        Item ids the user bought; duplicates are allowed.
    k : int, default 5
        Number of leading recommendations to evaluate.

    Returns
    -------
    float
        Number of hits among the top-``k`` recommendations divided by ``k``.
    """
    top_k = recommended_list[:k]

    # Flag recommended positions, not purchases: counting over bought_list
    # lets repeated purchases of one item push the score above 1.
    hits = np.isin(top_k, bought_list)

    return hits.sum() / k

def recall_at_k(recommended_list, bought_list, k=5):
    """Share of the bought items that appear among the top-``k`` recommendations.

    Parameters
    ----------
    recommended_list : sequence
        Recommended item ids, best first.
    bought_list : sequence
        Item ids the user bought.
    k : int, default 5
        Number of leading recommendations to evaluate.

    Returns
    -------
    float
        Hits divided by ``len(bought_list)``; ``0.0`` when ``bought_list`` is empty.
    """
    # Nothing was bought, so nothing can be recalled; this avoids a 0/0 -> nan.
    if len(bought_list) == 0:
        return 0.0

    hits = np.isin(bought_list, recommended_list[:k])

    return hits.sum() / len(bought_list)

In [None]:
metrics = pd.DataFrame({'recommender': recommend})


def _mean_over_users(metric_fn, col):
    """Average ``metric_fn`` at k=5 over all users for one recommendation column."""
    per_user = result.apply(lambda row: metric_fn(row[col], row['actual'], k=5), axis=1)
    return per_user.mean()


# One averaged score per recommender, in the order of metrics['recommender'].
metrics['precision@5'] = [_mean_over_users(precision_at_k, col) for col in metrics['recommender']]
metrics['recall@5'] = [_mean_over_users(recall_at_k, col) for col in metrics['recommender']]
metrics

Unnamed: 0,recommender,precision@5,recall@5
0,random_recommendation,0.000588,4.2e-05
1,popular_recommendation,0.15524,0.024996
2,itemitem,0.033595,0.005383
3,cosine,0.03526,0.00518
4,tfidf,0.036141,0.005173
5,own_purchases,0.179922,0.026675


Лучшие результаты дают own_purchases (ItemItemRecommender с K=1; precision@5 ≈ 0.180) и popular_recommendation (precision@5 ≈ 0.155).

### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

### Топ-1000 товаров

In [None]:
# Ground truth: for every user in the test period, the distinct items bought.
result = (
    data_test.groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
)
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [None]:
# Total quantity sold per item in the training period.
popularity = (
    data_train.groupby('item_id')['quantity']
    .sum()
    .rename('n_sold')
    .reset_index()
)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [None]:
# Ids of the 1000 items with the largest quantity sold, most popular first.
top_1000 = (
    popularity.sort_values('n_sold', ascending=False)['item_id']
    .head(1000)
    .tolist()
)

In [None]:
def random_recommendation(items, n=5):
    """Pick ``n`` distinct items uniformly at random.

    ``items`` is any sequence of item ids; the result is a plain list.
    """
    pool = np.asarray(items)
    return np.random.choice(pool, size=n, replace=False).tolist()


def popularity_recommendation(data, n=5):
    """Return the ids of the ``n`` items with the largest total ``sales_value``.

    ``data`` must provide the columns ``item_id`` and ``sales_value``.
    """
    sales_by_item = data.groupby('item_id')['sales_value'].sum()
    ranked = sales_by_item.sort_values(ascending=False)

    # After the groupby the item ids live in the index.
    return ranked.index[:n].tolist()

In [None]:
# Training transactions restricted to the top-1000 items, with only the
# columns the baselines need.
in_top_1000 = data_train['item_id'].isin(top_1000)
pop = data_train.loc[in_top_1000, ['item_id', 'sales_value', 'quantity']]

In [None]:
# random recommendation, popular recommendation
# Random baseline: a fresh sample of five top-1000 items for every user.
result['random_recommendation'] = [random_recommendation(top_1000, n=5) for _ in result['user_id']]

# Popularity baseline: the ranking is user-independent, so compute it once
# and give every user the same list.
popular_recs = popularity_recommendation(pop, n=5)
result['popular_recommendation'] = result['user_id'].map(lambda _user: popular_recs)

In [None]:
# Collapse the transactions to one row per item (summed sales_value and quantity).
pop = pop.groupby('item_id', as_index=False).sum()

In [None]:
# weighted random recommendation by quantity
# Weight = share of total quantity. One vectorised division replaces the
# per-row .apply that recomputed pop['quantity'].sum() for every item.
pop['weight'] = pop['quantity'] / pop['quantity'].sum()
result['weighted_random_qp'] = result['user_id'].apply(lambda x: weighted_random_recommendation(pop, n=5))

In [None]:
# weighted random recommendation by sales_sum
# Weight is proportional to log(1 + total sales_value), normalised to sum to 1.
# Vectorised so the normalising sum is computed once, not once per row.
log_sales = np.log(1 + pop['sales_value'])
pop['weight'] = log_sales / log_sales.sum()

result['weighted_random_sp'] = result['user_id'].apply(lambda x: weighted_random_recommendation(pop, n=5))

;

In [None]:
# Map every item outside the top-1000 to one fictitious id (999999) so the
# matrix has 1000 real columns plus a single "everything else" column.
# A derived frame is built instead of assigning into data_train: data_train is
# a slice of `data` (hence the SettingWithCopyWarning), and leaving it intact
# keeps the cells above safe to re-run.
in_top_1000 = data_train['item_id'].isin(top_1000)
train_top_1000 = data_train.assign(item_id=data_train['item_id'].where(in_top_1000, 999999))

user_item_matrix = pd.pivot_table(train_top_1000, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0)

# Binary interactions (bought / not bought) as floats, the dtype used for implicit.
user_item_matrix = (user_item_matrix > 0).astype(float)

# csr_matrix() already returns CSR, so no extra .tocsr() is needed.
sparse_user_item = csr_matrix(user_item_matrix)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [None]:
# Original ids in matrix order: rows are users, columns are items.
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

# Positional (0-based) indices used by the sparse matrix.
matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

# Matrix index -> original id.
id_to_itemid = {idx: item for idx, item in zip(matrix_itemids, itemids)}
id_to_userid = {idx: user for idx, user in zip(matrix_userids, userids)}

# Original id -> matrix index.
itemid_to_id = {item: idx for item, idx in zip(itemids, matrix_itemids)}
userid_to_id = {user: idx for user, idx in zip(userids, matrix_userids)}

In [None]:
model = ItemItemRecommender(K=5, num_threads=4)
model.fit(sparse_user_item, show_progress=True)

# recommend() is used with the tuple-returning API (hence the trailing [0]),
# where user_items must hold one row per requested userid. Passing the whole
# matrix for a scalar userid scores every user from the same row; the saved
# outputs show identical lists for different users. Pass only this user's row.
# NOTE: metrics printed before this fix are no longer representative.
result['itemitem'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                        user_items=sparse_user_item[userid_to_id[x]], 
                        N=5, 
                        filter_already_liked_items=False, 
                        filter_items=[itemid_to_id[999999]],  # never recommend the fictitious item
                        recalculate_user=True)[0]])

  0%|          | 0/1001 [00:00<?, ?it/s]

In [None]:
model = CosineRecommender(K=5, num_threads=4)
# Reuse the sparse matrix built earlier instead of converting the DataFrame again.
model.fit(sparse_user_item, show_progress=True)

# recommend() is used with the tuple-returning API (hence the trailing [0]),
# where user_items must hold one row per requested userid. Passing the whole
# matrix for a scalar userid scores every user from the same row; the saved
# outputs show identical lists for different users. Pass only this user's row.
# NOTE: metrics printed before this fix are no longer representative.
result['cosine'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item[userid_to_id[x]], 
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]],  # never recommend the fictitious item
                                    recalculate_user=True)[0]])



  0%|          | 0/1001 [00:00<?, ?it/s]

In [None]:
model = TFIDFRecommender(K=5, num_threads=4)
# Reuse the sparse matrix built earlier instead of converting the DataFrame again.
model.fit(sparse_user_item, show_progress=True)

# recommend() is used with the tuple-returning API (hence the trailing [0]),
# where user_items must hold one row per requested userid. Passing the whole
# matrix for a scalar userid scores every user from the same row; the saved
# outputs show identical lists for different users. Pass only this user's row.
# NOTE: metrics printed before this fix are no longer representative.
result['tfidf'] = result['user_id'].\
    apply(lambda x: [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[x], 
                                    user_items=sparse_user_item[userid_to_id[x]], 
                                    N=5, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]],  # never recommend the fictitious item
                                    recalculate_user=False)[0]])



  0%|          | 0/1001 [00:00<?, ?it/s]

In [None]:
# Sanity check: each recommender column should hold a list of five item ids per user.
result.head(2)

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,weighted_random_qp,weighted_random_sp,itemitem,cosine,tfidf
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1000753, 964327, 989101, 6533889, 1002558]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6534178, 6533889, 6544236, 6534166, 1404121]","[934427, 933835, 10121965, 861279, 1128665]","[1082185, 981760, 995242, 1127831, 1098066]","[1082185, 981760, 1127831, 961554, 1098066]","[1082185, 981760, 1127831, 961554, 995242]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1045220, 1029504, 1055853, 838186, 952317]","[6534178, 6533889, 1029743, 6534166, 1082185]","[6534178, 6533889, 6534166, 6544236, 861445]","[1092026, 1077490, 933835, 9526886, 1049788]","[1082185, 981760, 995242, 1127831, 1098066]","[1082185, 981760, 1127831, 961554, 1098066]","[1082185, 981760, 1127831, 961554, 995242]"


In [None]:
metrics = pd.DataFrame({'recommender': ['random_recommendation', 'popular_recommendation', 
             'itemitem', 'cosine', 'tfidf', 'weighted_random_qp', 'weighted_random_sp']})

# Mean precision@5 over all test users, one value per recommender column.
precision_by_model = []
for col in metrics['recommender']:
    per_user = result.apply(lambda row: precision_at_k(row[col], row['actual'], k=5), axis=1)
    precision_by_model.append(per_user.mean())
metrics['precision@5'] = precision_by_model

metrics

Unnamed: 0,recommender,precision@5
0,random_recommendation,0.015279
1,popular_recommendation,0.15524
2,itemitem,0.145739
3,cosine,0.135455
4,tfidf,0.155044
5,weighted_random_qp,0.047796
6,weighted_random_sp,0.017238


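При использовании топ-1000 товаров лучшие результаты дают popular_recommendation (precision@5 ≈ 0.155) и tfidf (≈ 0.155). Из-за меньшего количества items (только самые популярные) precision вырос у random_recommendation и у всех item-item моделей (itemitem, cosine, tfidf); у popular_recommendation он не изменился.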

### ItemItemRecommender

In [None]:
# Start again from the raw file so this section does not depend on frames
# modified in the previous section.
data = pd.read_csv('retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [None]:
test_size_weeks = 3

# First week_no that belongs to the hold-out period.
# NOTE(review): `>=` puts week `max - test_size_weeks` itself into the test set,
# so the hold-out can span test_size_weeks + 1 distinct week_no values --
# confirm this is intended before changing the boundary.
test_start_week = data['week_no'].max() - test_size_weeks

# .copy() makes both frames independent of `data`; without it, later in-place
# assignments into data_train raise pandas' SettingWithCopyWarning.
data_train = data[data['week_no'] < test_start_week].copy()
data_test = data[data['week_no'] >= test_start_week].copy()

In [None]:
# Ground truth: for every user in the test period, the distinct items bought.
result = (
    data_test.groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
)
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [None]:
# Total quantity sold per item in the training period.
popularity = (
    data_train.groupby('item_id')['quantity']
    .sum()
    .rename('n_sold')
    .reset_index()
)

popularity.head()

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [None]:
# Ids of the 5000 items with the largest quantity sold, most popular first.
top_5000 = (
    popularity.sort_values('n_sold', ascending=False)['item_id']
    .head(5000)
    .tolist()
)
top_5000[:5]

[6534178, 6533889, 6534166, 6544236, 1404121]

In [None]:
# Introduce a fictitious item_id: any purchase of an item that is NOT in the
# top-5000 is recorded as a purchase of item 999999.
# A derived frame is built instead of assigning into data_train: data_train is
# a slice of `data` (hence the SettingWithCopyWarning), and leaving it intact
# keeps the cells above safe to re-run.
in_top_5000 = data_train['item_id'].isin(top_5000)
train_top_5000 = data_train.assign(item_id=data_train['item_id'].where(in_top_5000, 999999))

user_item_matrix = pd.pivot_table(train_top_5000, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

# Binary interactions (bought / not bought) as floats, the dtype used for implicit.
user_item_matrix = (user_item_matrix > 0).astype(float)

# Convert to a sparse matrix; csr_matrix() already returns CSR, so no extra .tocsr().
sparse_user_item = csr_matrix(user_item_matrix)

user_item_matrix.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Original ids in matrix order: rows are users, columns are items.
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

# Positional (0-based) indices used by the sparse matrix.
matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

# Matrix index -> original id.
id_to_itemid = {idx: item for idx, item in zip(matrix_itemids, itemids)}
id_to_userid = {idx: user for idx, user in zip(matrix_userids, userids)}

# Original id -> matrix index.
itemid_to_id = {item: idx for item, idx in zip(itemids, matrix_itemids)}
userid_to_id = {user: idx for user, idx in zip(userids, matrix_userids)}

In [None]:
def K_ItemItemRecommendation(x:int, model):
    """Return the model's top-5 recommendations for one user as matrix item indices.

    Parameters
    ----------
    x : int
        Original ``user_id``; translated to a matrix row through the
        notebook-level ``userid_to_id`` mapping.
    model
        A fitted recommender exposing ``recommend``.

    Returns
    -------
    The first element of the ``model.recommend(...)`` result, i.e. the
    recommended item indices (matrix columns, not original item ids).
    """
    user_row = userid_to_id[x]

    # recommend() is used with the tuple-returning API (hence the trailing [0]),
    # where user_items must hold one row per requested userid. Passing the whole
    # matrix for a scalar userid scores every user from the same row; the saved
    # outputs show identical lists for different users. Pass only this user's row.
    return model.recommend(userid=user_row,
                           user_items=sparse_user_item[user_row],
                           N=5,  # number of recommendations
                           filter_already_liked_items=False,
                           filter_items=[itemid_to_id[999999]],  # never recommend the fictitious item
                           recalculate_user=False)[0]

In [None]:
# Fit one ItemItemRecommender per neighbourhood size K = 1..10 and store each
# model's recommendations in its own column.
for k in range(1, 11):
    model = ItemItemRecommender(K=k, num_threads=4)
    model.fit(sparse_user_item, show_progress=True)

    result[f'itemitem_K{k}'] = [
        [id_to_itemid[rec] for rec in K_ItemItemRecommendation(user, model)]
        for user in result['user_id']
    ]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [None]:
# Preview the per-K recommendation columns for the first two users.
result.head(2)

Unnamed: 0,user_id,actual,itemitem_K1,itemitem_K2,itemitem_K3,itemitem_K4,itemitem_K5,itemitem_K6,itemitem_K7,itemitem_K8,itemitem_K9,itemitem_K10
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1082185, 1029743, 995785, 1004906, 1081177]","[1082185, 995242, 1029743, 840361, 904360]","[1082185, 981760, 995242, 1029743, 840361]","[1082185, 981760, 995242, 1127831, 840361]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 995242, 840361]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1082185, 1029743, 995785, 1004906, 1081177]","[1082185, 995242, 1029743, 840361, 904360]","[1082185, 981760, 995242, 1029743, 840361]","[1082185, 981760, 995242, 1127831, 840361]","[1082185, 981760, 1127831, 995242, 1098066]","[1082185, 981760, 1127831, 995242, 840361]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]","[1082185, 981760, 995242, 840361, 1127831]"


In [None]:
itemitems = [f'itemitem_K{i}' for i in range(1, 11)]

metrics = pd.DataFrame({'recommender': itemitems})

# Mean precision@5 over all test users, one value per K.
precision_by_k = []
for col in metrics['recommender']:
    per_user = result.apply(lambda row: precision_at_k(row[col], row['actual'], k=5), axis=1)
    precision_by_k.append(per_user.mean())
metrics['precision@5'] = precision_by_k

metrics

Unnamed: 0,recommender,precision@5
0,itemitem_K1,0.162292
1,itemitem_K2,0.162977
2,itemitem_K3,0.19285
3,itemitem_K4,0.157003
4,itemitem_K5,0.145739
5,itemitem_K6,0.157003
6,itemitem_K7,0.157003
7,itemitem_K8,0.157003
8,itemitem_K9,0.157003
9,itemitem_K10,0.157003


Лучший precision показал ItemItemRecommender с K=3.