In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import surprise
from surprise import Dataset, Reader, accuracy, KNNWithMeans
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from surprise.model_selection import KFold
from tqdm.notebook import tqdm
from sklearn.metrics import ndcg_score
from surprise import dump
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List
from surprise import SVD
import random
import pickle
from copy import deepcopy

import typing as tp
from sklearn.preprocessing import normalize
from lightfm import LightFM
from scipy.sparse import coo_matrix
from lightfm.data import Dataset as LFMDataset
np.random.seed(42)



In [2]:
n_recommendations = 10

In [3]:
df = pd.read_csv('../data/recsys-in-practice/train_joke_df.csv')

df['UID'] = df['UID'] - 1
df['JID'] = df['JID'] - 1
# сделаем сортировку и перепишем index
df = df.sort_values(by=['UID', 'JID'])
df = df.reset_index(drop=True)

df

Unnamed: 0,UID,JID,Rating
0,0,0,-7.82
1,0,1,8.79
2,0,2,-9.66
3,0,3,-8.16
4,0,4,-7.52
...,...,...,...
1448359,24982,66,6.21
1448360,24982,67,7.48
1448361,24982,68,5.15
1448362,24982,70,6.26


In [4]:
df_train, df_test = train_test_split(df, test_size=0.5, random_state=42)

In [5]:
# создадим на основе набора данных
# поднабор, который требуется для библиотеки Surprise

# указываем минимальный и максимальный рейтинги
reader = Reader(rating_scale=(-10, 10))

# передаём набор, указывая последовательность колонок: user (raw) ids, item (raw) ids, ratings
# для Surprise - это обязательно
#trainset = Dataset.load_from_df(df_train[['UID', 'JID', 'Rating']], reader).build_full_trainset()
#testset = Dataset.load_from_df(df_test[['UID', 'JID', 'Rating']], reader).build_full_trainset().build_testset()

data = Dataset.load_from_df(df[['UID', 'JID', 'Rating']], reader)
trainset, testset = surprise.model_selection.train_test_split(data, test_size=0.5, random_state=42)

In [8]:
np.unique(df_train['UID']).size, np.unique(df_test['UID']).size, np.unique(df_train['JID']).size, np.unique(df_test['JID']).size, 

(24983, 24983, 100, 100)

In [9]:
user_watched_items = {i:set() for i in range(np.unique(df_train['UID']).size)}

for row in tqdm(df_train.values):
    user = int(row[0])
    item = int(row[1])
    user_watched_items[user].add(item)

  0%|          | 0/724182 [00:00<?, ?it/s]

In [10]:
userid= 50
print(len(df.query('UID == @userid and JID in @user_watched_items[@userid]')) ,
      len(df.query('UID == @userid and JID not in @user_watched_items[@userid]')))
len(df_train[df_train['UID'] == userid]), len(df_test[df_test['UID'] == userid])

38 41


(38, 41)

In [11]:
#file_name = '05_baseline_knn_train'
#dump.dump(file_name, algo_knn=algo)
#_, algo_knn = dump.load(file_name)

In [6]:
n_users = np.unique(df['UID']).size
n_items = np.unique(df['JID']).size

In [7]:
def get_n_recommendations_for_user(df, user_id, n, sort_by):
    recommended_items = df[df['UID'] == user_id]
    recommended_items = recommended_items.sort_values(sort_by, ascending=False)  
    return recommended_items.iloc[:n]

In [8]:
test_set_all = []
for u in tqdm(range(n_users)):
    for j in range(n_items):
        test_set_all.append((u, j, 0))
        

  0%|          | 0/24983 [00:00<?, ?it/s]

In [9]:

frames = []
for user in tqdm(range(n_users)):
    frames.append(get_n_recommendations_for_user(df_test, user, n_recommendations, 'Rating'))
    
df_true = pd.concat(frames).reset_index()
df_true = df_true.drop(columns=['index'])
display(df_true)




  0%|          | 0/24983 [00:00<?, ?it/s]

Unnamed: 0,UID,JID,Rating
0,0,1,8.79
1,0,13,8.45
2,0,26,7.82
3,0,7,4.17
4,0,61,3.59
...,...,...,...
249691,24982,20,7.28
249692,24982,24,6.94
249693,24982,48,6.84
249694,24982,54,6.36


In [None]:
params = {'reg_pu': 0.0010398462453200708, 'reg_qi': 0.975481537099458, 'reg_bu': 6.405279534313364e-05,
          'reg_bi': 0.008446937909973943, 'lr_pu': 0.005037130407846973, 'lr_qi': 0.0001532653071418146,
          'lr_bu': 0.0016783726294656905, 'lr_bi': 0.00014842221255242954,
          'n_factors': 1235, 'n_epochs': 44}

algo_svd_kaggle = SVD(reg_pu=params['reg_pu'], reg_qi=params['reg_qi'], reg_bu=params['reg_bu'],
           reg_bi=params['reg_bi'], lr_pu=params['lr_pu'], lr_qi=params['lr_qi'], 
           lr_bu=params['lr_bu'], lr_bi=params['lr_bi'],
           n_factors=params['n_factors'], n_epochs=params['n_epochs'])




# получим предикт и посмотрим метрику
predictions = algo_svd_kaggle.test(testset)
print(accuracy.rmse(predictions))

In [17]:



file_name = '05_baseline_svd_kaggle_train'
dump.dump(file_name, algo=algo_svd_kaggle)
#_, algo_svd_kaggle = dump.load(file_name)


predictions_all = algo_svd_kaggle.test(test_set_all)


df_pred_all = pd.DataFrame([(x[0], x[1], x[3]) for x in predictions_all], columns = ['UID', 'JID', 'Ratings_pred'])
display(df_pred_all)
display(df_train)
mrg = df_pred_all.merge(df_train, how="left", indicator=True)
display(mrg)


mrg = mrg[mrg['_merge'] == 'left_only']
display(mrg)




frames = []
for user in tqdm(range(n_users)):
    frames.append(get_n_recommendations_for_user(mrg, user, n_recommendations, 'Ratings_pred'))
    
df_rec_svd_kaggle = pd.concat(frames).reset_index()
df_rec_svd_kaggle = df_rec_svd_kaggle.drop(columns=['index'])
display(df_rec_svd_kaggle)

display(df_rec_svd_kaggle.merge(df_train, on=['UID','JID']))
display(df_rec_svd_kaggle.merge(df_test, on=['UID','JID']))





predictions_df_surprise_svd_kaggle = { 'svd_kaggle':df_rec_svd_kaggle}
with open('predictions_df_surprise_svd_kaggle.pkl', 'wb') as f:
    pickle.dump(predictions_df_surprise_svd_kaggle, f)

AttributeError: 'SVD' object has no attribute 'trainset'

In [None]:
assert False

In [18]:
# обучим с лучшими параметрами
algo_svd = SVD(random_state=0, n_epochs=100, n_factors= 512)
algo_svd.fit(trainset)

# получим предикт и посмотрим метрику
predictions = algo_svd.test(testset)
print(accuracy.rmse(predictions))


file_name = '05_baseline_svd_train'
dump.dump(file_name, algo=algo_svd)
#_, algo_svd = dump.load(file_name)


predictions_all = algo_svd.test(test_set_all)


df_pred_all = pd.DataFrame([(x[0], x[1], x[3]) for x in predictions_all], columns = ['UID', 'JID', 'Ratings_pred'])
display(df_pred_all)
display(df_train)
mrg = df_pred_all.merge(df_train, how="left", indicator=True)
display(mrg)


mrg = mrg[mrg['_merge'] == 'left_only']
display(mrg)




frames = []
for user in tqdm(range(n_users)):
    frames.append(get_n_recommendations_for_user(mrg, user, n_recommendations, 'Ratings_pred'))
    
df_rec_svd = pd.concat(frames).reset_index()
df_rec_svd = df_rec_svd.drop(columns=['index'])
display(df_rec_svd)

display(df_rec_svd.merge(df_train, on=['UID','JID']))
display(df_rec_svd.merge(df_test, on=['UID','JID']))






RMSE: 4.1779
4.177896596084303


Unnamed: 0,UID,JID,Ratings_pred
0,0,0,-7.689100
1,0,1,8.454516
2,0,2,-9.454005
3,0,3,-8.234773
4,0,4,-2.047017
...,...,...,...
2498295,24982,95,3.731198
2498296,24982,96,4.302664
2498297,24982,97,2.968226
2498298,24982,98,1.967600


Unnamed: 0,UID,JID,Rating
189032,3325,30,0.97
450098,7797,78,-2.62
3173,53,61,-9.27
1201490,20722,17,1.36
1422712,24549,95,-0.15
...,...,...,...
259178,4520,25,6.41
1414414,24400,1,0.97
131932,2333,21,3.01
671155,11628,34,7.72


Unnamed: 0,UID,JID,Ratings_pred,Rating,_merge
0,0,0,-7.689100,,left_only
1,0,1,8.454516,,left_only
2,0,2,-9.454005,,left_only
3,0,3,-8.234773,,left_only
4,0,4,-2.047017,-7.52,both
...,...,...,...,...,...
2498295,24982,95,3.731198,,left_only
2498296,24982,96,4.302664,,left_only
2498297,24982,97,2.968226,,left_only
2498298,24982,98,1.967600,,left_only


Unnamed: 0,UID,JID,Ratings_pred,Rating,_merge
0,0,0,-7.689100,,left_only
1,0,1,8.454516,,left_only
2,0,2,-9.454005,,left_only
3,0,3,-8.234773,,left_only
5,0,5,-8.333634,,left_only
...,...,...,...,...,...
2498295,24982,95,3.731198,,left_only
2498296,24982,96,4.302664,,left_only
2498297,24982,97,2.968226,,left_only
2498298,24982,98,1.967600,,left_only


  0%|          | 0/24983 [00:00<?, ?it/s]

Unnamed: 0,UID,JID,Ratings_pred,Rating,_merge
0,0,1,8.454516,,left_only
1,0,13,7.838627,,left_only
2,0,26,7.638984,,left_only
3,0,7,3.537598,,left_only
4,0,61,3.326049,,left_only
...,...,...,...,...,...
249825,24982,20,6.993478,,left_only
249826,24982,24,6.835510,,left_only
249827,24982,48,6.637349,,left_only
249828,24982,54,6.219132,,left_only


Unnamed: 0,UID,JID,Ratings_pred,Rating_x,_merge,Rating_y


Unnamed: 0,UID,JID,Ratings_pred,Rating_x,_merge,Rating_y
0,0,1,8.454516,,left_only,8.79
1,0,13,7.838627,,left_only,8.45
2,0,26,7.638984,,left_only,7.82
3,0,7,3.537598,,left_only,4.17
4,0,61,3.326049,,left_only,3.59
...,...,...,...,...,...,...
207338,24982,20,6.993478,,left_only,7.28
207339,24982,24,6.835510,,left_only,6.94
207340,24982,48,6.637349,,left_only,6.84
207341,24982,54,6.219132,,left_only,6.36


In [10]:
_, algo_svd = dump.load('05_baseline_svd_train')


predictions_all = algo_svd.test(test_set_all)


df_pred_all = pd.DataFrame([(x[0], x[1], x[3]) for x in predictions_all], columns = ['UID', 'JID', 'Ratings_pred'])
display(df_pred_all)
display(df_train)
mrg = df_pred_all.merge(df_train, how="left", indicator=True)
display(mrg)


mrg = mrg[mrg['_merge'] == 'left_only']
display(mrg)




frames = []
for user in tqdm(range(n_users)):
    frames.append(get_n_recommendations_for_user(mrg, user, n_recommendations, 'Ratings_pred'))
    
df_rec_svd = pd.concat(frames).reset_index()
df_rec_svd = df_rec_svd.drop(columns=['index'])
display(df_rec_svd)

display(df_rec_svd.merge(df_train, on=['UID','JID']))
display(df_rec_svd.merge(df_test, on=['UID','JID']))

Unnamed: 0,UID,JID,Ratings_pred
0,0,0,-7.689100
1,0,1,8.454516
2,0,2,-9.454005
3,0,3,-8.234773
4,0,4,-2.047017
...,...,...,...
2498295,24982,95,3.731198
2498296,24982,96,4.302664
2498297,24982,97,2.968226
2498298,24982,98,1.967600


Unnamed: 0,UID,JID,Rating
189032,3325,30,0.97
450098,7797,78,-2.62
3173,53,61,-9.27
1201490,20722,17,1.36
1422712,24549,95,-0.15
...,...,...,...
259178,4520,25,6.41
1414414,24400,1,0.97
131932,2333,21,3.01
671155,11628,34,7.72


Unnamed: 0,UID,JID,Ratings_pred,Rating,_merge
0,0,0,-7.689100,,left_only
1,0,1,8.454516,,left_only
2,0,2,-9.454005,,left_only
3,0,3,-8.234773,,left_only
4,0,4,-2.047017,-7.52,both
...,...,...,...,...,...
2498295,24982,95,3.731198,,left_only
2498296,24982,96,4.302664,,left_only
2498297,24982,97,2.968226,,left_only
2498298,24982,98,1.967600,,left_only


Unnamed: 0,UID,JID,Ratings_pred,Rating,_merge
0,0,0,-7.689100,,left_only
1,0,1,8.454516,,left_only
2,0,2,-9.454005,,left_only
3,0,3,-8.234773,,left_only
5,0,5,-8.333634,,left_only
...,...,...,...,...,...
2498295,24982,95,3.731198,,left_only
2498296,24982,96,4.302664,,left_only
2498297,24982,97,2.968226,,left_only
2498298,24982,98,1.967600,,left_only


  0%|          | 0/24983 [00:00<?, ?it/s]

Unnamed: 0,UID,JID,Ratings_pred,Rating,_merge
0,0,1,8.454516,,left_only
1,0,13,7.838627,,left_only
2,0,26,7.638984,,left_only
3,0,7,3.537598,,left_only
4,0,61,3.326049,,left_only
...,...,...,...,...,...
249825,24982,20,6.993478,,left_only
249826,24982,24,6.835510,,left_only
249827,24982,48,6.637349,,left_only
249828,24982,54,6.219132,,left_only


Unnamed: 0,UID,JID,Ratings_pred,Rating_x,_merge,Rating_y


Unnamed: 0,UID,JID,Ratings_pred,Rating_x,_merge,Rating_y
0,0,1,8.454516,,left_only,8.79
1,0,13,7.838627,,left_only,8.45
2,0,26,7.638984,,left_only,7.82
3,0,7,3.537598,,left_only,4.17
4,0,61,3.326049,,left_only,3.59
...,...,...,...,...,...,...
207338,24982,20,6.993478,,left_only,7.28
207339,24982,24,6.835510,,left_only,6.94
207340,24982,48,6.637349,,left_only,6.84
207341,24982,54,6.219132,,left_only,6.36


In [11]:
df_rec_svd

Unnamed: 0,UID,JID,Ratings_pred,Rating,_merge
0,0,1,8.454516,,left_only
1,0,13,7.838627,,left_only
2,0,26,7.638984,,left_only
3,0,7,3.537598,,left_only
4,0,61,3.326049,,left_only
...,...,...,...,...,...
249825,24982,20,6.993478,,left_only
249826,24982,24,6.835510,,left_only
249827,24982,48,6.637349,,left_only
249828,24982,54,6.219132,,left_only


In [12]:
df_rec_svd[df_rec_svd['UID'] == 0]

Unnamed: 0,UID,JID,Ratings_pred,Rating,_merge
0,0,1,8.454516,,left_only
1,0,13,7.838627,,left_only
2,0,26,7.638984,,left_only
3,0,7,3.537598,,left_only
4,0,61,3.326049,,left_only
5,0,52,2.9815,,left_only
6,0,27,2.587917,,left_only
7,0,30,0.936276,,left_only
8,0,48,0.09848,,left_only
9,0,88,0.037282,,left_only


In [None]:

algo_knn = KNNWithMeans(sim_options={'name': 'cosine', 'min_support': 3, 'user_based': False})
algo_knn.fit(trainset)

# получим предикт и посмотрим метрику
predictions = algo_knn.test(testset)
accuracy.rmse(predictions)

file_name = '05_baseline_knn_train'
dump.dump(file_name, algo=algo_knn)
#_, algo_knn = dump.load(file_name)


predictions_all = algo_knn.test(test_set_all)


df_pred_all = pd.DataFrame([(x[0], x[1], x[3]) for x in predictions_all], columns = ['UID', 'JID', 'Ratings_pred'])
display(df_pred_all)
display(df_train)
mrg = df_pred_all.merge(df_train, how="left", indicator=True)
display(mrg)


mrg = mrg[mrg['_merge'] == 'left_only']
display(mrg)




frames = []
for user in tqdm(range(n_users)):
    frames.append(get_n_recommendations_for_user(mrg, user, n_recommendations, 'Ratings_pred'))
    
df_rec_knn = pd.concat(frames).reset_index()
df_rec_knn = df_rec_knn.drop(columns=['index'])
display(df_rec_knn)

display(df_rec_knn.merge(df_train, on=['UID','JID']))
display(df_rec_knn.merge(df_test, on=['UID','JID']))






In [None]:

predictions_df_surprise = { 'actual': df_true, 'svd':df_rec_svd, 'knn':df_rec_knn, 'df_rec_svd_kaggle':df_rec_svd_kaggle}
with open('predictions_df_surprise.pkl', 'wb') as f:
    pickle.dump(predictions_df_surprise, f)

In [None]:
assert False

In [None]:
actual = list(df_true.groupby('UID').agg({'JID':list})['JID'].values)

In [None]:
svd_predictions = list(df_rec.groupby('UID').agg({'JID':list})['JID'].values)

In [None]:

recommendations_dict = {}

In [None]:
df_rec

In [None]:
df_pred = pd.DataFrame([(x[0], x[1], x[2], x[3]) for x in predictions], columns = ['UID', 'JID', 'Ratings', 'Ratings_pred'])
df_pred

In [None]:
users = np.unique(df_pred['UID'])
users

In [None]:
ndsg = {}

for user in tqdm(users):
    df_user = df_pred[df_pred['UID'] == user]
    
    
    if len(df_user) < 2:
        continue
    #display(df_user)
    #display(np.asarray([df_user['Ratings']]))
    ndsg[user] = ndcg_score(np.asarray([df_user['Ratings']]), np.asarray([df_user['Ratings_pred']]))
  

In [None]:
np.mean(list(ndsg.values()))

In [None]:
df_pred = df_pred.merge(pd.DataFrame({'UID':ndsg.keys(), 'NDSG': list(ndsg.values())}), how='left')#
df_pred

In [None]:
df_pred[df_pred['NDSG'].isna()]

In [None]:
df_pred = df_pred.fillna(0)

In [None]:
df_pred[df_pred['NDSG'].isna()]

In [None]:
df_pred['NDSG'].mean()

In [None]:
pop = dict(df['UID'].value_counts())
pop

In [None]:
cf_recs = []

for user in tqdm(users):
    df_user = df_pred[df_pred['UID'] == user]
    

In [None]:
df_pred.index

In [None]:
test = df_true.copy().groupby('UID', as_index=False)['JID'].agg({'Ratings': (lambda x: list(set(x)))}).sort_values('Ratings')
test = test.set_index("UID")
test

In [None]:
def get_users_predictions(user_id, n, model):
    recommended_items = pd.DataFrame(model.loc[user_id])
    recommended_items.columns = ["Ratings_pred"]
    recommended_items = recommended_items.sort_values('Ratings_pred', ascending=False)    
    recommended_items = recommended_items.head(n)
    return recommended_items.index.tolist()

In [None]:
cf_model = df_pred.pivot_table(index='UID', columns='JID', values='Ratings_pred').fillna(0)
cf_model

In [None]:

cf_recs = []
for user in tqdm(test.index):
    cf_predictions = get_users_predictions(user, 10, cf_model)
    cf_recs.append(cf_predictions)
        
test['cf_predictions'] = cf_recs
test.head()

In [None]:
# рекомендации по популярным 
popularity_recs = df['JID'].value_counts().head(10).index.tolist()

pop_recs = []
for user in tqdm(test.index):
    pop_predictions = popularity_recs
    pop_recs.append(pop_predictions)
        
test['pop_predictions'] = pop_recs
test.head()

In [None]:
items = np.unique(df['JID'])
items

In [None]:
# рекомендации случайных
ran_recs = []
for user in tqdm(test.index):
    random_predictions = np.random.choice(items, 10)
    ran_recs.append(random_predictions)
        
test['random_predictions'] = ran_recs
test.head()

In [None]:
actual = test.Ratings.values.tolist()
cf_predictions = test.cf_predictions.values.tolist()
pop_predictions = test.pop_predictions.values.tolist()
random_predictions = test.random_predictions.values.tolist()

In [None]:
df_train

In [None]:
lfm_dataset = LFMDataset()
lfm_dataset.fit(
    users=df_train["UID"].values,
    items=df_train["JID"].values,
)

train_matrix, _ = lfm_dataset.build_interactions(zip(*df_train[["UID", "JID"]].values.T))

In [None]:
user2idx = lfm_dataset._user_id_mapping

In [None]:
item2idx = lfm_dataset._item_id_mapping

In [None]:
user2idx[3325], item2idx[30]

In [None]:
idx2item = {v:k for k,v in item2idx.items()}

In [None]:
idx2user = {v:k for k,v in user2idx.items()}

In [None]:
idx2user[0], idx2item[0]

In [None]:
lfm_model = LightFM(
    learning_rate=0.01, 
    loss='warp', 
    no_components=64,
    random_state=42
)
lfm_model.fit(
    interactions=train_matrix, 
    epochs=15,
    num_threads=20
);

In [None]:
with open('05_jokes_lfm_model.pkl', 'wb') as f:
    pickle.dump(lfm_model, f)

In [None]:
with open('05_jokes_lfm_dataset.pkl', 'wb') as f:
    pickle.dump(lfm_dataset, f)

In [None]:
lfm_model_cos = deepcopy(lfm_model)

lfm_model_cos.item_biases = np.zeros_like(lfm_model_cos.item_biases)
lfm_model_cos.user_biases = np.zeros_like(lfm_model_cos.user_biases)

lfm_model_cos.item_embeddings = normalize(lfm_model_cos.item_embeddings)
lfm_model_cos.user_embeddings = normalize(lfm_model_cos.user_embeddings)

In [None]:
def lfm_get_n_recommendations_for_user(
    user_id: str,
    model: LightFM,
    train_matrix: coo_matrix,
    user_to_id: tp.Dict[str, int],
    id_to_item: tp.Dict[int, str],
    n_recommendations: int
) -> pd.DataFrame:
    

    user_inner_id = user_to_id[user_id]
    scores = model.predict(
        user_ids=user_inner_id,
        item_ids=np.arange(train_matrix.shape[1]),
        num_threads=20
    )
    user_watched_items = train_matrix.col[train_matrix.row == user_inner_id]
    scores[user_watched_items] = -np.inf

    recommended_item_inner_ids = np.argpartition(scores, -np.arange(n_recommendations))[
        -n_recommendations:
    ][::-1]
    recommended_item_ids = [id_to_item[x] for x in recommended_item_inner_ids]
    return recommended_item_ids


In [None]:
models_dict = {"lfm": lfm_model, "lfm_cos": lfm_model_cos}

In [None]:
recommendations_dict = {}
for model_name, model in tqdm(models_dict.items()):
    recommendations = pd.DataFrame({"UID": df_test["UID"].unique()})
    recommendations["JID"] = recommendations["UID"].apply(
        lfm_get_n_recommendations_for_user,
        args=(
            model,
            train_matrix,
            user2idx,
            idx2item,
            n_recommendations
        ),
    )
    recommendations = recommendations.explode("JID")
    recommendations["rank"] = recommendations.groupby(["UID"]).cumcount() + 1
    recommendations_dict[model_name] = recommendations
    

In [None]:
recommendations_dict['lfm']

In [None]:
predictions_df_lfm = { 'lfm':recommendations_dict['lfm'],
                  'lfm_cos':recommendations_dict['lfm_cos']
                 }

with open('predictions_df_lfm.pkl', 'wb') as f:
    pickle.dump(predictions_df_lfm, f)

In [None]:
test

In [None]:
actual = test.Ratings.values.tolist()
cf_predictions = test.cf_predictions.values.tolist()
pop_predictions = test.pop_predictions.values.tolist()
random_predictions = test.random_predictions.values.tolist()

In [None]:
actual = df_test.groupby('UID', as_index=False)['JID'].agg({'JID': (lambda x: list(set(x)))}).sort_values('UID').JID.values.tolist()
actual

In [None]:
lfm = recommendations_dict['lfm'].groupby('UID', as_index=False)['JID'].agg({'JID': (lambda x: list(set(x)))}).sort_values('UID').JID.values.tolist()
lfm

In [None]:
lfm_cos = recommendations_dict['lfm_cos'].groupby('UID', as_index=False)['JID'].agg({'JID': (lambda x: list(set(x)))}).sort_values('UID').JID.values.tolist()
lfm_cos

In [None]:

def _precision(predicted, actual):
    prec = [value for value in predicted if value in actual]
    prec = float(len(prec)) / float(len(predicted))
    return prec

def _apk(actual: list, predicted: list, k=10) -> float:
    """
    average precision @ k.
    
    """
    #if not predicted or not actual:
    #    return 0.0
    
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    true_positives = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            max_ix = min(i + 1, len(predicted))
            score += _precision(predicted[:max_ix], actual)
            true_positives += 1
    
    if score == 0.0:
        return 0.0
    
    return score / true_positives


def mapk(actual: List[list], predicted: List[list], k: int=10) -> float:
    """
    mean average precision @ k.
    
    """
    if len(actual) != len(predicted):
        print(len(actual), len(predicted))
        raise AssertionError("Length mismatched")
    
    return np.mean([_apk(a,p,k) for a,p in zip(actual, predicted)])


def mapk_plot(mapk_scores, model_names, k_range):
    """
    Plots the mean average precision at k for a set of models to compare.
    ----------
    mapk_scores: list of lists
        list of list of map@k scores over k. This lis is in same order as model_names
        example: [[0.17, 0.25, 0.76],[0.2, 0.5, 0.74]]
    model_names: list
        list of model names in same order as coverage_scores
        example: ['Model A', 'Model B']
    k_range: list
        list or array indeitifying all k values in order
        example: [1,2,3,4,5,6,7,8,9,10]
    Returns:
    -------
        A map@k plot
    """
    #create palette
    #recommender_palette = ["#ED2BFF", "#14E2C0", "#FF9F1C", "#5E2BFF","#FC5FA3"]
    #sns.set_palette(recommender_palette)

    #lineplot
    mapk_df = pd.DataFrame(np.column_stack(mapk_scores), k_range, columns=model_names)
    ax = sns.lineplot(data=mapk_df)
    plt.xticks(k_range)
    plt.setp(ax.lines,linewidth=5)

    #set labels
    ax.set_title('Mean Average Precision at K (MAP@K) Comparison')
    ax.set_ylabel('MAP@K')
    ax.set_xlabel('K')
    plt.show()

In [None]:

pop_mapk = []
for K in np.arange(1, 20):
    pop_mapk.extend([mapk(actual, pop_predictions, k=K)])


random_mapk = []
for K in np.arange(1, 20):
    random_mapk.extend([mapk(actual, random_predictions, k=K)])


cf_mapk = []
for K in np.arange(1, 20):
    cf_mapk.extend([mapk(actual, cf_predictions, k=K)])

In [None]:

lfm_mapk = []
for K in np.arange(1, 20):
    lfm_mapk.extend([mapk(actual, lfm, k=K)])
    

lfm_cos_mapk = []
for K in np.arange(1, 20):
    lfm_cos_mapk.extend([mapk(actual, lfm_cos, k=K)])

In [None]:

mapk_scores = [random_mapk, pop_mapk, cf_mapk, lfm_mapk, lfm_cos_mapk]
index = range(1,19+1)
names = ['Random Recommender', 'Popularity Recommender', 'Collaborative Filter', 'LightFM', 'LightFM_cos']

fig = plt.figure(figsize=(15, 7))
mapk_plot(mapk_scores, model_names=names, k_range=index)

In [None]:
def prediction_coverage(predicted: List[list], catalog: list, unseen_warning: bool=False) -> float:
    """
    Покрытие рекомендаций (по пользователю)
    
    ----------    
    Базируется на статье:
    Ge, M., Delgado-Battenfeld, C., & Jannach, D. (2010, September).
    Beyond accuracy: evaluating recommender systems by coverage and serendipity.
    In Proceedings of the fourth ACM conference on Recommender systems (pp. 257-260). ACM.
    """
    
    unique_items_catalog = set(catalog)
    if len(catalog)!=len(unique_items_catalog):
        raise AssertionError("Дубликаты в каталоге")

    predicted_flattened = [p for sublist in predicted for p in sublist]
    unique_items_pred = set(predicted_flattened)
    
    if not unique_items_pred.issubset(unique_items_catalog):
        if unseen_warning:
            warnings.warn("В рекомендациях есть элементы не из каталога. "
                "Игнорируем")
            unique_items_pred = unique_items_pred.intersection(unique_items_catalog)
        else:
            print(unique_items_pred - unique_items_catalog)
            raise AssertionError("В рекомендациях есть элементы не из каталога.")
    
    num_unique_predictions = len(unique_items_pred)
    prediction_coverage = round(num_unique_predictions/(len(catalog)* 1.0)* 100, 2)
    return prediction_coverage

In [None]:
random_coverage = prediction_coverage(ran_recs, items)
pop_coverage = prediction_coverage(pop_recs, items)
cf_coverage = prediction_coverage(cf_recs, items)
lfm_coverage = prediction_coverage(lfm, items)
lfm_cos_coverage = prediction_coverage(lfm_cos, items)

In [None]:
cf_recs

In [None]:
random_coverage, pop_coverage, cf_coverage, lfm_coverage, lfm_cos_coverage

In [None]:

def catalog_coverage(predicted: List[list], catalog: list, k: int) -> float:
    """
    Покрытие рекомендаций (по каталогу)
    
    ----------
    Ge, M., Delgado-Battenfeld, C., & Jannach, D. (2010, September).
    Beyond accuracy: evaluating recommender systems by coverage and serendipity.
    In Proceedings of the fourth ACM conference on Recommender systems (pp. 257-260). ACM.
    """
    sampling = random.choices(predicted, k=k)
    predicted_flattened = [p for sublist in sampling for p in sublist]
    L_predictions = len(set(predicted_flattened))
    catalog_coverage = round(L_predictions/(len(catalog)*1.0)*100,2)
    return catalog_coverage



def coverage_plot(coverage_scores: list, model_names: list) -> None:
    """
    Plots the coverage for a set of models to compare.
    ----------
    coverage_scores: list
        list of coverage scores in same order as model_names
        example: [0.17, 0.25, 0.76]
    model_names: list
        list of model names in same order as coverage_scores
        example: ['Model A', 'Model B', 'Model C']
    Returns:
    -------
        A coverage plot
    """
    #create palette
    #recommender_palette = ["#ED2BFF", "#14E2C0", "#FF9F1C", "#5E2BFF","#FC5FA3"]
    #sns.set_palette(recommender_palette)

    #make barplot
    ax = sns.barplot(x=model_names, y=coverage_scores)

    #set labels
    ax.set_title('Catalog Coverage in %')
    ax.set_ylabel('coverage')

    plt.show()

In [None]:
# N=100 для определенного 
random_cat_coverage = catalog_coverage(ran_recs, items, 100)
pop_cat_coverage = catalog_coverage(pop_recs, items, 100)
cf_cat_coverage = catalog_coverage(cf_recs, items, 100)
lfm_cat_coverage = catalog_coverage(lfm, items, 100)
lfm_cat_cos_coverage = catalog_coverage(lfm_cos, items, 100)

In [None]:
# визуализация
coverage_scores = [random_coverage, pop_coverage, cf_coverage, lfm_cat_coverage, lfm_cat_cos_coverage]
model_names = ['Random Recommender', 'Popularity Recommender', 'Collaborative Filter', 'lfm', 'lfm_cos']

fig = plt.figure(figsize=(7, 5))
coverage_plot(coverage_scores, model_names)

In [None]:
def novelty(predicted: List[list], pop: dict, u: int, n: int) -> (float, list):
    """
    Новизна рекомендаций 
    ----------    
    Базируется на статье:
    Zhou, T., Kuscsik, Z., Liu, J. G., Medo, M., Wakeling, J. R., & Zhang, Y. C. (2010).
    Solving the apparent diversity-accuracy dilemma of recommender systems.
    Proceedings of the National Academy of Sciences, 107(10), 4511-4515.
    """
    mean_self_information = []
    k = 0
    for sublist in predicted:
        self_information = 0
        k += 1
        for i in sublist:
            self_information += np.sum(-np.log2(pop[i]/u))
        mean_self_information.append(self_information/n)
    novelty = sum(mean_self_information)/k
    return novelty, mean_self_information


In [None]:
nov = df.JID.value_counts()
pop = dict(nov)

In [None]:
random_novelty,random_mselfinfo_list = novelty(ran_recs, pop, len(users), 10)
pop_novelty,pop_mselfinfo_list = novelty(pop_recs, pop, len(users), 10)
cf_novelty,cf_mselfinfo_list = novelty(cf_recs, pop, len(users), 10)
lfm_novelty,lfm_mselfinfo_list = novelty(lfm, pop, len(users), 10)
lfm_cos_novelty,lfm_cos_mselfinfo_list = novelty(lfm_cos, pop, len(users), 10)

In [None]:
print(random_novelty, pop_novelty, cf_novelty, lfm_novelty, lfm_cos_novelty)

In [None]:
test_set_all

In [None]:
predictions_all

In [None]:
get_n_recommendations_for_user(df_pred_all, 4, 10)

In [None]:
df_rec

In [None]:
def get_value_popularity_ranks(values: pd.Series) -> pd.Series:
    value_counts = values.value_counts()
    counts_unique = value_counts.unique()
    count_rank_mapping = pd.Series(index=counts_unique, data=np.arange(len(counts_unique)) + 1)
    return value_counts.map(count_rank_mapping)


def calculate_serendipity_per_user(
    recommendations: pd.DataFrame,
    train: pd.DataFrame,
    test: pd.DataFrame,
) -> pd.Series:
    recommendations_ = pd.merge(recommendations, test[["UID", "JID"]], how="left", indicator=True)
    recommendations_["is_rel"] = np.where(recommendations_["_merge"] == "both", 1, 0)

    n_items = train["JID"].nunique()
    item_popularity_ranks = get_value_popularity_ranks(train["JID"])
    recommendations_["rank_pop"] = recommendations_["JID"].map(item_popularity_ranks)

    recommendations_["proba_user"] = (n_items + 1 - recommendations_["Ratings_pred"]) / n_items
    recommendations_["proba_any_user"] = (n_items + 1 - recommendations_["rank_pop"]) / n_items

    recommendations_["proba_diff"] = np.maximum(
        recommendations_["proba_user"] - recommendations_["proba_any_user"],
        0.0
    )
    recommendations_["item_serendipity"] = recommendations_["proba_diff"] * recommendations_["is_rel"]
    return recommendations_[["UID", "item_serendipity"]].groupby("UID").agg("mean")


In [None]:
recommendations_ = pd.merge(df_rec, df_test[["UID", "JID"]], how="left", indicator=True)
recommendations_["is_rel"] = np.where(recommendations_["_merge"] == "both", 1, 0)

n_items = df_train["JID"].nunique()
item_popularity_ranks = get_value_popularity_ranks(df_train["JID"])
recommendations_["rank_pop"] = recommendations_["JID"].map(item_popularity_ranks)

recommendations_["proba_user"] = (n_items + 1 - recommendations_["Ratings_pred"]) / n_items
recommendations_["proba_any_user"] = (n_items + 1 - recommendations_["rank_pop"]) / n_items

recommendations_["proba_diff"] = np.maximum(
    recommendations_["proba_user"] - recommendations_["proba_any_user"],
    0.0
)
recommendations_["item_serendipity"] = recommendations_["proba_diff"] * recommendations_["is_rel"]
recommendations_[["UID", "item_serendipity"]].groupby("UID").agg("mean")

In [None]:

recommendations_[recommendations_['is_rel'] > 0]

In [None]:
calculate_serendipity_per_user(df_rec, df_train, df_test).mean()

In [None]:
calculate_serendipity_per_user(df_pred, df_train, df_test).mean()