In [46]:
from collections import defaultdict
import pandas as pd
from surprise import Reader, Dataset
from surprise import KNNWithMeans
from surprise import accuracy
from surprise.model_selection import train_test_split, cross_validate

In [47]:
class MyAlgo02():
    '''Item-based KNNWithMeans recommender wrapped around a Surprise dataset.

    Ratings are loaded either from a comma-separated file or from a DataFrame
    that has ``userId``, ``movieId`` and ``rating`` columns. The model is
    trained on the full dataset with cosine item-item similarity.
    '''

    # Neighbourhood sizes tried by find_best_k when the caller gives none.
    DEFAULT_K_VALUES = (2, 3, 5, 10, 20, 30, 40)

    def __init__(self, rating_data='', data_frame=''):
        '''Load the ratings and build the full training set.

        Args:
            rating_data(str): Path to a ``user,item,rating,timestamp`` file.
                Takes precedence over ``data_frame`` when both are given.
            data_frame(DataFrame): Ratings with ``userId``, ``movieId`` and
                ``rating`` columns. Only used when ``rating_data`` is empty.

        Raises:
            ValueError: If neither a file path nor a non-empty DataFrame is
                supplied (previously this failed with an AttributeError or
                left the object without a training set).
        '''
        if rating_data:
            # NOTE(review): no rating_scale is passed here, so the library
            # default applies, while the DataFrame branch uses (0, 5) —
            # confirm the difference is intended.
            reader = Reader(line_format='user item rating timestamp', sep=',')
            self.ratings = Dataset.load_from_file(rating_data, reader)
        elif not getattr(data_frame, 'empty', True):
            # getattr keeps the duck-typed `.empty` check but tolerates the
            # '' default, which has no such attribute.
            reader = Reader(rating_scale=(0, 5))
            self.ratings = Dataset.load_from_df(data_frame[['userId', 'movieId', 'rating']], reader)
        else:
            raise ValueError('Provide either rating_data (a file path) or a non-empty data_frame.')

        self.trainset = self.ratings.build_full_trainset()
        self.sim_options = {'name': 'cosine', 'user_based': False}


    def set_k(self, k_value):
        '''Create a KNNWithMeans model with ``k_value`` neighbours and fit it
        on the full training set. The fitted model is stored in ``self.algo``.
        '''
        self.algo = KNNWithMeans(k=k_value, sim_options=self.sim_options)
        self.algo.fit(self.trainset)


    def find_best_k(self, k_values=None):
        '''Print 3-fold cross-validation RMSE/MAE for several values of k.

        Args:
            k_values(iterable of int): Neighbourhood sizes to evaluate.
                Defaults to ``DEFAULT_K_VALUES``.
        '''
        if k_values is None:
            k_values = self.DEFAULT_K_VALUES

        for k_value in k_values:
            print('K = {}'.format(k_value))
            algo = KNNWithMeans(k=k_value, sim_options=self.sim_options)
            cross_validate(algo, self.ratings, measures=['RMSE', 'MAE'], cv=3, verbose=True)
            print('\n\n')


    def get_top_n(self, predictions, n=10):
        '''Return the top-N recommendation for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendation to output for each user. Default
                is 10.

        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        '''

        # Group the (item, estimate) pairs by user.
        per_user = defaultdict(list)
        for uid, iid, _true_r, est, _details in predictions:
            per_user[uid].append((iid, est))

        # Keep the n best estimates per user. The sort is stable, so items
        # with equal estimates keep their original relative order.
        top_n = defaultdict(list)
        for uid, user_ratings in per_user.items():
            top_n[uid] = sorted(user_ratings, key=lambda pair: pair[1], reverse=True)[:n]

        return top_n


    def predict_ratings(self):
        '''Predict ratings for all pairs (u, i) that are NOT in the training set.

        Stores the anti-testset in ``self.testset`` and the resulting
        predictions in ``self.predictions``. Requires ``set_k`` to have been
        called so that ``self.algo`` exists.
        '''
        self.testset = self.trainset.build_anti_testset()
        self.predictions = self.algo.test(self.testset)


    def recs_for_user(self, uid, n=10):
        '''Return the top-``n`` predictions for one user.

        Raw user ids are strings when the data was loaded from a file and
        keep the DataFrame's type otherwise, so both sides are compared as
        strings; ``uid=2`` and ``uid='2'`` therefore select the same user.

        Args:
            uid: Raw id of the user.
            n(int): Number of recommendations to return. Default is 10.

        Returns:
            The ``get_top_n`` mapping restricted to this user (empty when the
            user has no predictions).
        '''
        wanted = str(uid)
        user_filtered = [pred for pred in self.predictions if str(pred.uid) == wanted]
        print(len(user_filtered))
        top_n = self.get_top_n(predictions=user_filtered, n=n)

        return top_n

In [48]:
# Build the recommender from the ratings CSV and show the similarity options it stored.
bla = MyAlgo02(rating_data='ml-latest-small/ratings.csv')
bla.sim_options

{'name': 'cosine', 'user_based': False}

In [49]:
# Fit KNNWithMeans with k=10 on the full training set.
bla.set_k(k_value=10)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [50]:
# Predict every user/item pair that is absent from the training set, then count the predictions.
bla.predict_ratings()
len(bla.predictions)

5830804

In [6]:
# my_recs = bla.get_top_n(predictions=user_filtered, n=10)
# Top-10 predicted items for user 2; recs_for_user also prints how many predictions that user has.
my_recs = bla.recs_for_user(uid=2)
my_recs

9695


defaultdict(list,
            {'2': [('5746', 5),
              ('6835', 5),
              ('3851', 5),
              ('1631', 5),
              ('2075', 5),
              ('176601', 5),
              ('67618', 5),
              ('53', 5),
              ('1140', 5),
              ('3795', 5)]})

In [7]:
# Read item (movie) info, then look up the title of movieId 4 as a sanity check.
movies = pd.read_csv('ml-latest-small/movies.csv', low_memory=False)
is_movie_4 = movies['movieId'] == 4
my_movie = movies[is_movie_4]
# Column position 1 holds the title.
my_movie.iloc[0, 1]

'Waiting to Exhale (1995)'

In [8]:
# Print each user's recommendations as movie titles with their predicted score.
for uid, user_ratings in my_recs.items():
    print(uid)
    for iid, est in user_ratings:
        # Raw item ids may be strings, while movies['movieId'] is numeric.
        title = movies.loc[movies['movieId'] == int(iid)].values[0][1]
        print("Title: {}, PREDICTED score: {}".format(title, est))

2
Title: Galaxy of Terror (Quest) (1981), PREDICTED score: 5
Title: Alien Contamination (1980), PREDICTED score: 5
Title: I'm the One That I Want (2000), PREDICTED score: 5
Title: Assignment, The (1997), PREDICTED score: 5
Title: Mephisto (1981), PREDICTED score: 5
Title: Black Mirror, PREDICTED score: 5
Title: Strictly Sexual (2008), PREDICTED score: 5
Title: Lamerica (1994), PREDICTED score: 5
Title: Entertaining Angels: The Dorothy Day Story (1996), PREDICTED score: 5
Title: Five Senses, The (1999), PREDICTED score: 5


In [51]:
# Load the ratings with explicit column names and keep only user, movie and rating.
metadata = pd.read_csv(
    'ml-latest-small/ratings.csv',
    low_memory=False,
    names=['userId', 'movieId', 'rating', 'timestamp'],
).drop(columns="timestamp")
metadata.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [10]:
# Distinct user ids and movie ids that appear in the ratings.
user_ids = metadata['userId'].unique()
movie_ids = metadata['movieId'].unique()

In [11]:
# User x movie rating matrix; pairs without a rating are filled with 0.
dataframe = metadata.pivot_table(
    values='rating',
    index=['userId'],
    columns=['movieId'],
    fill_value=0,
)
dataframe.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0


In [12]:
# Slice the rating matrix down to five users (rows) and four movies (columns).
group_sparse_mtx = dataframe.loc[[77,596,452,243,420],[1, 110, 480,2762]]
group_sparse_mtx

movieId,1,110,480,2762
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
77,0.0,0.0,0.0,0.0
596,4.0,0.0,0.0,4.0
452,0.0,5.0,4.0,5.0
243,0.0,0.0,0.0,0.0
420,4.0,0.0,3.5,4.0


In [13]:
# Fetch the prediction for one specific user/movie pair (raw ids are strings for `bla`).
pred_77_110 = [
    pred for pred in bla.predictions
    if pred.uid == '77' and pred.iid == '110'
]
pred_77_110

[Prediction(uid='77', iid='110', r_ui=3.501556983616962, est=3.9139110223715323, details={'actual_k': 10, 'was_impossible': False})]

In [14]:
# Copy the slice so that later edits to group_perf_mtx leave group_sparse_mtx unchanged.
group_perf_mtx = group_sparse_mtx.copy()
group_perf_mtx

movieId,1,110,480,2762
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
77,0.0,0.0,0.0,0.0
596,4.0,0.0,0.0,4.0
452,0.0,5.0,4.0,5.0
243,0.0,0.0,0.0,0.0
420,4.0,0.0,3.5,4.0


In [15]:
# Replace every unrated cell (stored as 0.0) with the model's estimate for that
# user/movie pair. Querying the fitted model directly avoids scanning the whole
# prediction list (5,830,804 entries) once per empty cell.
for index in group_perf_mtx.index:
    for col in group_perf_mtx.columns:
        if group_perf_mtx.loc[index, col] == 0.0:
            # `bla` was loaded from a file, so its raw ids are strings.
            group_perf_mtx.loc[index, col] = bla.algo.predict(str(index), str(col)).est

group_perf_mtx
        

movieId,1,110,480,2762
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
77,4.113485,3.913911,4.066868,2.981952
596,4.0,4.130257,3.941667,4.0
452,4.853914,5.0,4.0,5.0
243,3.915623,4.161877,4.224056,4.223186
420,4.0,4.102919,3.5,4.0


In [16]:
# Empty one-row frame indexed by 900, with one column per movie in the group matrix.
group_perf = pd.DataFrame(index=[900], columns=[1, 110, 480, 2762])
group_perf

Unnamed: 0,1,110,480,2762
900,,,,


In [17]:
# my_col = group_perf_mtx.iloc[ : ,2]
# my_col = list(my_col)
# print(float(min(my_col)))

In [18]:
# For each movie, store the lowest rating found in its column of group_perf_mtx.
for position, movie_id in enumerate(group_perf_mtx.columns):
    column_values = group_perf_mtx[movie_id].tolist()
    group_perf.iloc[0, position] = float(min(column_values))

In [19]:
# Show the single-row group profile.
group_perf

Unnamed: 0,1,110,480,2762
900,3.91562,3.91391,3.5,2.98195


In [20]:
# Turn the one-row group profile into [userId, movieId, rating] records.
movie_list = list(group_perf)
new_data = [
    [group_perf.index[0], movie_id, group_perf.loc[group_perf.index[0], movie_id]]
    for movie_id in movie_list
]

new_data

[[900, 1, 3.9156225633874833],
 [900, 110, 3.9139110223715323],
 [900, 480, 3.5],
 [900, 2762, 2.981952144166049]]

In [21]:
# Peek at the first rows of the ratings frame.
metadata.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [22]:
# Wrap the group records in a DataFrame with userId, movieId and rating columns.
new_df = pd.DataFrame(new_data, columns=['userId', 'movieId', 'rating'])
new_df

Unnamed: 0,userId,movieId,rating
0,900,1,3.915623
1,900,110,3.913911
2,900,480,3.5
3,900,2762,2.981952


In [23]:
# Add the group rows to the ratings. DataFrame.append was removed in
# pandas 2.0; pd.concat with ignore_index=True gives the same result.
new_metadata = pd.concat([metadata, new_df], ignore_index=True)
new_metadata.tail()

Unnamed: 0,userId,movieId,rating
100835,610,170875,3.0
100836,900,1,3.915623
100837,900,110,3.913911
100838,900,480,3.5
100839,900,2762,2.981952


In [24]:
# Build a second recommender from the DataFrame that includes the rows for user 900.
ble = MyAlgo02(data_frame=new_metadata)

In [25]:
# Fit KNNWithMeans with k=10 on the extended training set.
ble.set_k(k_value=10)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [26]:
# Predict every user/item pair absent from the extended training set, then count the predictions.
ble.predict_ratings()
len(ble.predictions)

5840524

In [27]:
# Inspect the last prediction in the list.
ble.predictions[-1]


Prediction(uid=900, iid=163981, r_ui=3.501560010766858, est=3.180421646335555, details={'actual_k': 4, 'was_impossible': False})

In [28]:
# Keep only the predictions whose raw user id is 900 (an int for `ble`, not a string).
user_filtered = [pred for pred in ble.predictions if pred.uid == 900]
len(user_filtered)

9720

In [29]:
# Top-10 predicted items among the filtered predictions for user 900.
my_recs = ble.get_top_n(predictions=user_filtered, n=10)
my_recs

defaultdict(list,
            {900: [(131724, 5),
              (5746, 5),
              (6835, 5),
              (8804, 5),
              (26350, 5),
              (31522, 5),
              (1140, 5),
              (99636, 5),
              (2969, 5),
              (141718, 5)]})

In [30]:
# # my_recs = bla.get_top_n(predictions=user_filtered, n=10)
# my_recs = ble.recs_for_user(uid=900)
# my_recs

In [31]:
# Print each user's recommendations as movie titles with their predicted score.
for uid, user_ratings in my_recs.items():
    print(uid)
    for iid, est in user_ratings:
        matching_rows = movies.loc[movies['movieId'] == int(iid)]
        title = matching_rows.values[0][1]
        print("Title: {}, PREDICTED score: {}".format(title, est))

900
Title: The Jinx: The Life and Deaths of Robert Durst (2015), PREDICTED score: 5
Title: Galaxy of Terror (Quest) (1981), PREDICTED score: 5
Title: Alien Contamination (1980), PREDICTED score: 5
Title: Story of Women (Affaire de femmes, Une) (1988), PREDICTED score: 5
Title: Passenger, The (Professione: reporter) (1975), PREDICTED score: 5
Title: Marriage of Maria Braun, The (Ehe der Maria Braun, Die) (1979), PREDICTED score: 5
Title: Entertaining Angels: The Dorothy Day Story (1996), PREDICTED score: 5
Title: English Vinglish (2012), PREDICTED score: 5
Title: Man and a Woman, A (Un homme et une femme) (1966), PREDICTED score: 5
Title: Deathgasm (2015), PREDICTED score: 5


In [100]:
# # Converting new_metadata to surprise's Dataset to make recs for the group
# new_reader = Reader(rating_scale=(0, 5))
# new_data = Dataset.load_from_df(new_metadata[['userId', 'movieId', 'rating']], new_reader)
# new_data

In [34]:
# # Reference links on recommendation diversity

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html
    
# https://github.com/Lab41/hermes/blob/master/src/algorithms/performance_metrics.py

In [32]:
# Peek at the last rows of the original ratings frame.
metadata.tail()

Unnamed: 0,userId,movieId,rating
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0
100835,610,170875,3.0


In [52]:
# Keep only the ratings made by the five listed users.
my_users = [77, 596, 452, 243, 420]
is_listed_user = metadata['userId'].isin(my_users)
metadata_filtered = metadata.loc[is_listed_user]
len(metadata_filtered)

819

In [53]:
# my_user_ids = metadata_filtered['userId'].unique()
# my_movie_ids = metadata_filtered['movieId'].unique()

In [54]:
# User x movie matrix for the filtered ratings; pairs without a rating are filled with 0.
my_group_sparse = metadata_filtered.pivot_table(
    values='rating',
    index=['userId'],
    columns=['movieId'],
    fill_value=0,
)
my_group_sparse.head()

movieId,1,10,32,34,36,39,44,47,48,50,...,176101,177763,178615,179401,179819,181719,182793,183635,184997,188301
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
77,0,0,0.0,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0
243,0,5,0.0,0,4,0,4,0,4,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0
420,4,0,3.5,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0
452,0,4,0.0,0,0,0,4,5,0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0
596,4,0,3.5,4,0,4,0,0,0,3.5,...,2.5,3.5,3.5,4,3.5,3.5,3.5,3.5,4,4


In [None]:
# Some pivoted columns are integer-typed, so cast to float before storing
# fractional estimates in them.
my_group_sparse = my_group_sparse.astype(float)

# Replace every unrated cell (stored as 0) with the model's estimate for that
# user/movie pair. Querying the fitted model directly avoids scanning the whole
# prediction list (5,830,804 entries) once per empty cell.
for index in my_group_sparse.index:
    for col in my_group_sparse.columns:
        if my_group_sparse.loc[index, col] == 0.0:
            # `bla` was loaded from a file, so its raw ids are strings.
            my_group_sparse.loc[index, col] = bla.algo.predict(str(index), str(col)).est

my_group_sparse.head()