In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import random

from sklearn.metrics import pairwise_distances, mean_squared_error
from sklearn.model_selection import train_test_split
from math import sqrt

import warnings

warnings.simplefilter('ignore')

In [4]:
df_ratings = pd.read_csv('./ml-latest-small/ratings.csv')
df_movies = pd.read_csv('./ml-latest-small/movies.csv')

print('df_ratings {} {}'.format(*df_ratings.shape))
print('df_movies {} {}'.format(*df_movies.shape))

df_ratings 100836 4
df_movies 9742 3


In [5]:
df_ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
df_movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
df_merge = df_ratings.merge(df_movies[['movieId','title']], how='left', on='movieId')
df_merge.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1,4.0,964982703,Toy Story (1995)
1,1,3,4.0,964981247,Grumpier Old Men (1995)
2,1,6,4.0,964982224,Heat (1995)
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995)
4,1,50,5.0,964982931,"Usual Suspects, The (1995)"


In [8]:
train_data, test_data = train_test_split(df_merge, test_size=.25, random_state=57)

In [14]:
test_res = test_data.copy().groupby(by='userId', as_index=False)['title'].agg({'actual': (lambda x: list(set(x)))})
test_res = test_res.set_index('userId')
test_res.head(5)

Unnamed: 0_level_0,actual
userId,Unnamed: 1_level_1
1,"[Longest Day, The (1962), I Know What You Did ..."
2,"[Inglourious Basterds (2009), Warrior (2011), ..."
3,"[Death Race 2000 (1975), Dangerous Minds (1995..."
4,"[Almost Famous (2000), Swingers (1996), Night ..."
5,"[Four Weddings and a Funeral (1994), In the Na..."


In [64]:
test_res.shape

(609, 1)

In [16]:
train_data_ui = train_data.pivot_table(index=['userId'], columns=['title'], values='rating')
test_data_ui = test_data.pivot_table(index=['userId'], columns=['title'], values='rating')

In [17]:
train_data_ui.head(5)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...All the Marbles (1981),...,Zootopia (2016),Zulu (1964),Zulu (2013),[REC] (2007),[REC]³ 3 Génesis (2012),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [20]:
print('Num of movies in Test: {}'.format(len(set(test_data.movieId))))
print('Intersection of movies in Train and Test: {}'.format(len(set.intersection(set(test_data.movieId), set(train_data.movieId)))))

Num of movies in Test: 5644
Intersection of movies in Train and Test: 4693


In [22]:
mean_training = pd.DataFrame(train_data_ui.mean(axis=0), columns=['pred_rating']).reset_index()
mean_training.head(5)

Unnamed: 0,title,pred_rating
0,'71 (2014),4.0
1,'Hellboy': The Seeds of Creation (2004),4.0
2,'Round Midnight (1986),3.5
3,'Salem's Lot (2004),5.0
4,'Til There Was You (1997),4.0


In [23]:
test_data = test_data.merge(mean_training, how='left', on='title')
test_data.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,pred_rating
0,414,2433,2.0,961436616,"Civil Action, A (1998)",3.0625
1,380,91500,3.0,1494708626,The Hunger Games (2012),3.5
2,462,1261,4.0,1138343577,Evil Dead II (Dead by Dawn) (1987),3.903846
3,597,2505,2.0,940420065,8MM (1999),2.966667
4,232,53127,0.5,1209068728,Bug (2007),3.666667


In [24]:
def mse(actual, pred) -> float:
    mse = np.square(np.subtract(actual, pred)).mean()
    
    return mse

In [25]:
print('MSE {}'.format(mse(test_data.rating, test_data.pred_rating)))

MSE 0.9522521320877634


In [33]:
def rmse(actual,pred) -> float:
    
    rmse = sqrt(np.square(np.subtract(actual, pred)).mean())
    return rmse

In [44]:
rmse_var = rmse(test_data.rating, test_data.pred_rating)
print('RMSE {}'.format(round(rmse_var, 2)))

RMSE 0.98


In [65]:
cf_recs = []

In [66]:
%time
corr_matrix = train_data_ui.corr(method='pearson', min_periods=100)

CPU times: user 4 µs, sys: 13 µs, total: 17 µs
Wall time: 27.9 µs


In [48]:
corr_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...All the Marbles (1981),...,Zootopia (2016),Zulu (1964),Zulu (2013),[REC] (2007),[REC]³ 3 Génesis (2012),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),,,,,,,,,,,...,,,,,,,,,,
xXx (2002),,,,,,,,,,,...,,,,,,,,,,
xXx: State of the Union (2005),,,,,,,,,,,...,,,,,,,,,,
¡Three Amigos! (1986),,,,,,,,,,,...,,,,,,,,,,


In [67]:
%%time
for i in test_res.index:
    user_ratings = train_data_ui.loc[i].dropna()
    simCandidates = pd.Series()

    for j in range(0, len(user_ratings.index)): # Идем по списку всех фильмов оцененных пользвателем
        sims = corr_matrix[user_ratings.index[j]].dropna() # Извлекаем фильмы, похожие на оцененные данным юзером (1)
        sims = sims.map(lambda x: x * user_ratings[j]) # Умножаем корреляцию на оценку пользователя по фильму
        simCandidates = simCandidates.append(sims) # Добавляем индекс в список сравниваемых кандидатов

    simCandidates = simCandidates.groupby(simCandidates.index).sum()
    simCandidates.sort_values(inplace = True, ascending = False)

    # выбираем фильмы, которые пользователь еще не смотрел
    intersection_set = set.intersection(set(simCandidates.index), set(user_ratings.index))
    not_watched = list(set(simCandidates.index) - set(user_ratings.index))
    colab_predictions = simCandidates[not_watched].sort_values(ascending = False).head(10).index.to_list()
    cf_recs.append(colab_predictions)

CPU times: user 20.9 s, sys: 145 ms, total: 21 s
Wall time: 21.1 s


In [68]:
test_res['colab_predictions'] = cf_recs
test_res.head(5)

Unnamed: 0_level_0,actual,colab_predictions
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[Longest Day, The (1962), I Know What You Did ...","[Shawshank Redemption, The (1994), Forrest Gum..."
2,"[Inglourious Basterds (2009), Warrior (2011), ...",[]
3,"[Death Race 2000 (1975), Dangerous Minds (1995...",[]
4,"[Almost Famous (2000), Swingers (1996), Night ...",[Star Wars: Episode VI - Return of the Jedi (1...
5,"[Four Weddings and a Funeral (1994), In the Na...","[Silence of the Lambs, The (1991), Forrest Gum..."


In [72]:
popularity_recs = train_data.title.value_counts().head(10).index.tolist()

pop_recs = []

for user in test_res.index:
    pop_predictios = popularity_recs
    pop_recs.append(pop_predictios)
    
test_res['pop_predictions'] = pop_recs
test_res.head()

Unnamed: 0_level_0,actual,colab_predictions,pop_predictions
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"[Longest Day, The (1962), I Know What You Did ...","[Shawshank Redemption, The (1994), Forrest Gum...","[Shawshank Redemption, The (1994), Forrest Gum..."
2,"[Inglourious Basterds (2009), Warrior (2011), ...",[],"[Shawshank Redemption, The (1994), Forrest Gum..."
3,"[Death Race 2000 (1975), Dangerous Minds (1995...",[],"[Shawshank Redemption, The (1994), Forrest Gum..."
4,"[Almost Famous (2000), Swingers (1996), Night ...",[Star Wars: Episode VI - Return of the Jedi (1...,"[Shawshank Redemption, The (1994), Forrest Gum..."
5,"[Four Weddings and a Funeral (1994), In the Na...","[Silence of the Lambs, The (1991), Forrest Gum...","[Shawshank Redemption, The (1994), Forrest Gum..."


In [74]:
ran_recs = []

for user in test_res.index:
    random_predictions = df_merge.title.sample(10).values.tolist()
    ran_recs.append(random_predictions)
    
test_res['rand_predicitons'] = ran_recs
test_res.head(5)

Unnamed: 0_level_0,actual,colab_predictions,pop_predictions,rand_predicitons
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"[Longest Day, The (1962), I Know What You Did ...","[Shawshank Redemption, The (1994), Forrest Gum...","[Shawshank Redemption, The (1994), Forrest Gum...","[Larger Than Life (1996), Kill Bill: Vol. 2 (2..."
2,"[Inglourious Basterds (2009), Warrior (2011), ...",[],"[Shawshank Redemption, The (1994), Forrest Gum...",[Pirates of the Caribbean: At World's End (200...
3,"[Death Race 2000 (1975), Dangerous Minds (1995...",[],"[Shawshank Redemption, The (1994), Forrest Gum...","[Sister Act (1992), Me, Myself & Irene (2000),..."
4,"[Almost Famous (2000), Swingers (1996), Night ...",[Star Wars: Episode VI - Return of the Jedi (1...,"[Shawshank Redemption, The (1994), Forrest Gum...","[Time to Kill, A (1996), Tora! Tora! Tora! (19..."
5,"[Four Weddings and a Funeral (1994), In the Na...","[Silence of the Lambs, The (1991), Forrest Gum...","[Shawshank Redemption, The (1994), Forrest Gum...","[Wizard of Oz, The (1939), Raiders of the Lost..."


In [100]:
def catalog_coverage(predicted, catalog, k: int) -> float:
    
    sampling = random.choices(predicted, k=k)
    # print(sampling)
    predicted_flattened = [p for sublist in sampling for p in sublist]
    L_predicted = len(set(predicted_flattened))
    catalog_coverage = round(L_predicted/(len(catalog)*1.0),3)
    
    return catalog_coverage

In [101]:
catalog = df_merge.title.unique().tolist()
cf_cat_coverage = catalog_coverage(cf_recs, catalog, 100)
pop_cat_coverage = catalog_coverage(pop_recs, catalog, 100)
random_cat_coverage = catalog_coverage(ran_recs, catalog, 100)

In [91]:
print("Catalog coverage (cf): {}".format(cf_cat_coverage))
print("Catalog coverage (popular): {}".format(pop_cat_coverage))
print("Catalog coverage (random): {}".format(random_cat_coverage))

Catalog coverage (cf): 0.001
Catalog coverage (popular): 0.001
Catalog coverage (random): 0.078


In [104]:
# cf_recs

In [94]:
def user_coverage(predicted) -> float:
    """
    Computes the share of test users to whom we were able to provide recommendation.
    Parameters
    ----------
    predicted : a list of lists
        Ordered predictions
        example: [['X', 'Y', 'Z'], ['X', 'Y', 'Z']]
    Returns
    ----------
    user_coverage:
        Share of test users in predicted list to whom we provided recommendation
        rounded to 2 decimal places
    """
    
    L_predictions = sum([1 if len(i) > 0 else 0 for i in predicted])
    user_coverage = round(L_predictions/(len(predicted)*1.0),2)

    return user_coverage

In [106]:
print('user_coverage cf_recs {}'.format(user_coverage(cf_recs)))
print('user_coverage pop_recs {}'.format(user_coverage(pop_recs)))
print('user_coverage ran_recs {}'.format(user_coverage(ran_recs)))

user_coverage cf_recs 0.82
user_coverage pop_recs 1.0
user_coverage ran_recs 1.0


In [110]:
def novelty(predicted, pop, u: int, n: int) -> (float, list):
    """
    Computes the novelty for a list of recommendations
    Parameters
    ----------
    predicted : a list of lists
    pop: dictionary
        A dictionary of all items alongside of its occurrences counter in the training data
        example: {1198: 893, 1270: 876, 593: 876, 2762: 867}
    u: integer
        The number of users in the training data
    n: integer
        The length of recommended lists per user
    Returns
    ----------
    novelty:
        The novelty of the recommendations in system level
    mean_self_information:
        The novelty of the recommendations in recommended top-N list level
    ----------
    Metric definition: https://arxiv.org/pdf/0808.2670.pdf
    """
    mean_self_information = []
    k = 0
    for sublist in predicted:
        self_information = 0
        k += 1
        for i in sublist:
            self_information += np.sum(-np.log2(pop[i]/u))
        mean_self_information.append(self_information/n)
    novelty = sum(mean_self_information)/k
    return novelty, mean_self_information

In [108]:
nov = df_merge.title.value_counts()
pop = dict(nov)
users = df_merge["userId"].value_counts()

In [111]:
random_novelty,random_mselfinfo_list = novelty(ran_recs, pop, len(users), 10)
pop_novelty,pop_mselfinfo_list = novelty(pop_recs, pop, len(users), 10)
cf_novelty,cf_mselfinfo_list = novelty(cf_recs, pop, len(users), 10)

In [112]:
print("Novelty (cf): {}".format(cf_novelty))
print("Novelty (popular): {}".format(pop_novelty))
print("Novelty (random): {}".format(random_novelty))

Novelty (cf): 0.31385875843373223
Novelty (popular): 1.2007395088965802
Novelty (random): 4.356476749849581
