In [32]:
from collections import defaultdict
import pandas as pd
from surprise import Reader, Dataset
from surprise import KNNWithMeans, KNNBasic, SVD
from surprise import accuracy
from surprise.model_selection import train_test_split, cross_validate

import matplotlib.pyplot as plt
import numpy as np
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [33]:
class RefinedMyAlgo():
    """Group-recommendation helper built on Surprise rating models and TF-IDF similarity.

    The collaborative part loads ratings into a Surprise dataset, fits either
    ``KNNWithMeans`` or ``SVD`` and predicts ratings for (user, movie) pairs.
    The content part computes cosine similarities between movie titles and
    between movie genres, and ranks movies similar to a list of references.

    Attributes are created lazily by the methods that compute them
    (``algo``, ``testset``, ``predictions``, ``group_sparse_mtx``,
    ``perfil_movies``, ``candidate_movies``, ``cosine_sim_movies_title``,
    ``cosine_sim_movies_genres``). "perfil" is Portuguese for "profile".
    """

    # Ratings file read by set_perfil_movies() when the instance holds no other source.
    _DEFAULT_RATINGS_PATH = 'ml-latest-small/ratings.csv'

    def __init__(self, rating_data='', data_frame='', movie_data=''):
        """Load the ratings and/or the movie catalogue.

        Args:
            rating_data: Path to a comma-separated ratings file with lines of
                the form ``user,item,rating,timestamp``. Takes precedence over
                ``data_frame`` when both are given.
            data_frame: Object exposing ``.empty`` and the columns ``userId``,
                ``movieId`` and ``rating`` (e.g. a pandas DataFrame). Ignored
                when it has no ``.empty`` attribute (such as the default ``''``).
            movie_data: Path to a CSV file with ``title`` and ``genres``
                columns. A ``year`` column is derived from the title.

        When no rating source is supplied, ``ratings``, ``trainset`` and
        ``sim_options`` are simply not set, so the content-based methods can
        still be used with a catalogue alone.
        """
        # Remember where the ratings came from so set_perfil_movies() can reuse them.
        self._rating_path = rating_data or None
        self._ratings_frame = None

        ratings = None
        if rating_data:
            reader = Reader(line_format='user item rating timestamp', sep=',')
            ratings = Dataset.load_from_file(rating_data, reader)
        elif not getattr(data_frame, 'empty', True):
            # getattr() keeps the default '' (which has no .empty) from raising.
            reader = Reader(rating_scale=(0, 5))
            self._ratings_frame = data_frame[['userId', 'movieId', 'rating']]
            ratings = Dataset.load_from_df(self._ratings_frame, reader)

        if ratings is not None:
            self.ratings = ratings
            self.trainset = ratings.build_full_trainset()
            self.sim_options = {'name': 'cosine', 'user_based': False}

        if movie_data:
            movies = pd.read_csv(movie_data, low_memory=False)
            raw_titles = movies['title']
            # Only split off a trailing "(YYYY)"; titles without one stay intact
            # and get a missing year instead of being truncated.
            movies['year'] = raw_titles.str.extract(r'\((\d{4})\)\s*$', expand=False)
            movies['title'] = raw_titles.str.replace(r'\s*\(\d{4}\)\s*$', '', regex=True)
            movies['genres'] = movies['genres'].str.replace('|', ', ', regex=False)
            self.movies = movies

    def set_k(self, k_value=''):
        """Fit the rating model on the full trainset and store it in ``self.algo``.

        Args:
            k_value: Value passed as ``k`` to ``KNNWithMeans``. When falsy, an
                ``SVD`` model with default parameters is fitted instead.
        """
        if k_value:
            self.algo = KNNWithMeans(k=k_value, sim_options=self.sim_options)
        else:
            self.algo = SVD()
        self.algo.fit(self.trainset)

    def find_best_k(self, k_value=''):
        """Cross-validate ``KNNWithMeans`` (10 folds, RMSE and MAE).

        Args:
            k_value: When truthy, only this ``k`` is evaluated (verbosely).

        Returns:
            The ``cross_validate`` result for ``k_value`` when it is given;
            otherwise a list with one result per candidate ``k``, each
            extended with a ``'k_value'`` entry.
        """
        if k_value:
            print('K = {}'.format(k_value))
            algo = KNNWithMeans(k=k_value, sim_options=self.sim_options)
            return cross_validate(algo, self.ratings, measures=['RMSE', 'MAE'], cv=10, verbose=True)

        results = []
        for candidate_k in [3, 5, 7, 10, 15, 20, 30, 40]:
            print('K = {}'.format(candidate_k))
            algo = KNNWithMeans(k=candidate_k, sim_options=self.sim_options)
            scores = cross_validate(algo, self.ratings, measures=['RMSE', 'MAE'], cv=10, verbose=False)
            scores['k_value'] = candidate_k
            results.append(scores)
        return results

    def set_testset(self, users):
        """Build the (user, movie, rating) triples whose ratings must be predicted.

        Args:
            users: Raw user ids. When truthy, the testset pairs each of these
                users with every catalogue movie (``self.movies['movieId']``)
                that the user has not rated in the trainset. When falsy, the
                trainset's ``build_anti_testset()`` is used instead.

        Returns:
            The testset, which is also stored in ``self.testset``. Each triple
            carries the trainset global mean as its rating value.
        """
        if users:
            user_ratings = self.trainset.ur
            movies_ids = list(self.movies['movieId'])
            global_mean = self.trainset.global_mean
            my_testset = []

            for user in users:
                iuid = self.trainset.to_inner_uid(str(user))
                # Resolve the user's rated movies once instead of rescanning
                # the rating list for every catalogue movie.
                rated_ids = {
                    int(self.trainset.to_raw_iid(int(rating[0])))
                    for rating in user_ratings[iuid]
                }
                for movie in movies_ids:
                    if int(movie) not in rated_ids:
                        my_testset.append((str(user), str(movie), global_mean))

            self.testset = my_testset
        else:
            self.testset = self.trainset.build_anti_testset()
        return self.testset

    def predict_ratings(self, users=''):
        """Predict ratings for the testset of ``users`` and store them in ``self.predictions``.

        Args:
            users: Forwarded to ``set_testset``; a falsy value predicts for the
                whole anti-testset.
        """
        testset = self.set_testset(users)
        self.predictions = self.algo.test(testset)

    def set_perfil_movies(self, users):
        """Build the group rating matrix and the list of movies the group rated.

        The ratings come from the data frame given to the constructor, else
        from the ratings file given to the constructor, else from
        ``ml-latest-small/ratings.csv``. A ratings file is read without a
        header row (its columns are named explicitly).

        Args:
            users: User ids that form the group.

        Sets:
            ``self.group_sparse_mtx``: users x movies pivot table of ratings,
            with 0 where a user has no rating.
            ``self.perfil_movies``: the column labels (movie ids) of that table.
        """
        metadata = getattr(self, '_ratings_frame', None)
        if metadata is None:
            ratings_path = getattr(self, '_rating_path', None) or self._DEFAULT_RATINGS_PATH
            metadata = pd.read_csv(ratings_path, low_memory=False,
                                   names=['userId', 'movieId', 'rating', 'timestamp'])
            metadata = metadata.drop(columns="timestamp")

        metadata_filtered = metadata[metadata.userId.isin(users)]

        self.group_sparse_mtx = pd.pivot_table(metadata_filtered, values='rating', index=['userId'],
                                               columns=['movieId'], fill_value=0)
        self.perfil_movies = list(self.group_sparse_mtx)

    def set_candidate_movies(self):
        """Store in ``self.candidate_movies`` every catalogue movie id outside the group profile.

        ``set_perfil_movies()`` must have been called first, because this
        method reads ``self.perfil_movies``.
        """
        # Use this instance's catalogue (not a module-level object) and a set
        # so that each membership test is constant time.
        profile_ids = set(self.perfil_movies)
        self.candidate_movies = [
            movie_id for movie_id in self.movies['movieId'].tolist()
            if movie_id not in profile_ids
        ]

    def calc_similarity_matrix(self):
        """Compute the title and genre cosine-similarity matrices of the catalogue.

        Missing titles and genres are replaced by empty strings in
        ``self.movies``. Both texts are vectorised with TF-IDF (English stop
        words removed) and the results are stored in
        ``self.cosine_sim_movies_title`` and ``self.cosine_sim_movies_genres``.
        """
        self.movies['title'] = self.movies['title'].fillna('')
        self.movies['genres'] = self.movies['genres'].fillna('')

        # fit_transform() refits the vectoriser, so one instance serves both columns.
        tfidf = TfidfVectorizer(stop_words='english')
        tfidf_matrix_title = tfidf.fit_transform(self.movies['title'])
        tfidf_matrix_genres = tfidf.fit_transform(self.movies['genres'])

        self.cosine_sim_movies_title = cosine_similarity(tfidf_matrix_title, tfidf_matrix_title)
        self.cosine_sim_movies_genres = cosine_similarity(tfidf_matrix_genres, tfidf_matrix_genres)

    def get_similar_movies(self, references, title_weight=0.5, top_n=10):
        """Rank the movies most similar to each reference movie.

        ``calc_similarity_matrix()`` must have been called first.

        Args:
            references: Movie ids (values of ``self.movies['movieId']``).
            title_weight: Weight of the title similarity; the genre similarity
                is weighted by ``1 - title_weight``.
            top_n: Number of similar movies returned per reference.

        Returns:
            One list per reference, each holding up to ``top_n`` tuples
            ``(row position in self.movies, combined similarity)`` sorted by
            decreasing similarity. The reference itself is never included.

        Raises:
            IndexError: If a reference id is not in the catalogue.
        """
        catalogue_ids = self.movies['movieId'].values
        recs = []
        for movie in references:
            positions = np.flatnonzero(catalogue_ids == movie)
            if positions.size == 0:
                raise IndexError('movieId {!r} is not present in the movies catalogue'.format(movie))
            movie_idx = int(positions[0])

            # Weighted blend of the two similarity rows of the reference movie.
            total_sim_score = (self.cosine_sim_movies_title[movie_idx] * title_weight
                               + self.cosine_sim_movies_genres[movie_idx] * (1 - title_weight))

            # Exclude the reference by position: dropping "the first sorted
            # entry" removes the wrong movie when another one ties with it.
            ranked = sorted(
                ((idx, score) for idx, score in enumerate(total_sim_score) if idx != movie_idx),
                key=lambda pair: pair[1],
                reverse=True,
            )
            recs.append(ranked[:top_n])

        return recs

In [34]:
# set_k() without a k value fits an SVD model, whose factors are initialised
# randomly. Seeding NumPy's global RNG first makes the predictions repeatable
# across "Restart & Run All" (see the Surprise FAQ on reproducible experiments).
SEED = 42
np.random.seed(SEED)

refinedMyAlgo = RefinedMyAlgo(rating_data='ml-latest-small/ratings.csv', movie_data='ml-latest-small/movies.csv')
refinedMyAlgo.set_k()

In [35]:
# Raw ids of the users that form the group.
my_users = [77, 596, 452, 243, 420]

# Predict, for each group member, the rating of every catalogue movie
# that member has not rated yet, then show how many predictions were made.
refinedMyAlgo.predict_ratings(my_users)
len(refinedMyAlgo.predictions)

47891

In [37]:
# Build the group rating matrix (users x movies, 0 = not rated) and split the
# catalogue into movies the group already rated ("perfil") and the rest.
refinedMyAlgo.set_perfil_movies(users=my_users)
refinedMyAlgo.set_candidate_movies()

# Summarise the candidates instead of dumping the whole id list into the output.
print('TOTAL: {}, CANDIDATE MOVIES (first 20): {}'.format(
    len(refinedMyAlgo.candidate_movies), refinedMyAlgo.candidate_movies[:20]))
refinedMyAlgo.group_sparse_mtx.head()

movieId,1,10,32,34,36,39,44,47,48,50,...,176101,177763,178615,179401,179819,181719,182793,183635,184997,188301
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
77,0,0,0.0,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0
243,0,5,0.0,0,4,0,4,0,4,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0
420,4,0,3.5,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0
452,0,4,0.0,0,0,0,4,5,0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0
596,4,0,3.5,4,0,4,0,0,0,3.5,...,2.5,3.5,3.5,4,3.5,3.5,3.5,3.5,4,4


In [26]:
# Movies already rated by the group: the columns of the group rating matrix
# built by refinedMyAlgo.set_perfil_movies(). Reading it from the object keeps
# this cell runnable on a fresh kernel.
perfil_movies = list(refinedMyAlgo.group_sparse_mtx)

# Every catalogue movie that no group member has rated yet. A set makes each
# membership test constant time.
rated_by_group = set(perfil_movies)
candidate_movies = [
    movie_id for movie_id in refinedMyAlgo.movies['movieId'].tolist()
    if movie_id not in rated_by_group
]

print('TOTAL: {}, PERFIL MOVIES: {}'.format(len(perfil_movies), perfil_movies))

print('TOTAL: {}, CANDIDATE MOVIES: {}'.format(len(candidate_movies), candidate_movies))

print(len(refinedMyAlgo.movies))

TOTAL: 668, PERFIL MOVIES: [1, 10, 32, 34, 36, 39, 44, 47, 48, 50, 62, 69, 70, 73, 110, 112, 141, 145, 150, 153, 161, 163, 165, 170, 172, 173, 181, 193, 227, 248, 260, 296, 316, 318, 329, 337, 345, 353, 356, 364, 367, 376, 377, 380, 410, 420, 434, 442, 457, 466, 480, 485, 508, 520, 527, 533, 541, 552, 581, 586, 588, 589, 592, 593, 595, 608, 616, 648, 736, 750, 780, 802, 858, 880, 899, 904, 910, 919, 923, 924, 1028, 1035, 1036, 1046, 1079, 1080, 1089, 1092, 1097, 1101, 1127, 1129, 1131, 1132, 1136, 1161, 1175, 1192, 1193, 1196, 1197, 1198, 1200, 1203, 1206, 1208, 1210, 1214, 1219, 1220, 1221, 1222, 1240, 1258, 1259, 1265, 1270, 1275, 1281, 1291, 1302, 1307, 1343, 1356, 1371, 1372, 1374, 1375, 1376, 1377, 1387, 1391, 1394, 1407, 1416, 1438, 1466, 1479, 1499, 1513, 1517, 1527, 1562, 1569, 1573, 1580, 1584, 1608, 1610, 1614, 1616, 1617, 1620, 1641, 1644, 1645, 1653, 1663, 1676, 1687, 1688, 1690, 1704, 1717, 1721, 1722, 1732, 1748, 1792, 1801, 1805, 1831, 1833, 1835, 1876, 1882, 1907, 1909,

In [6]:
# Start from a float copy of the group rating matrix so that:
#   * the cell runs on a fresh kernel (my_group_sparse is defined here),
#   * re-running it always starts from the observed ratings,
#   * predicted floats are not written into integer columns.
my_group_sparse = refinedMyAlgo.group_sparse_mtx.astype(float)

# Index the estimates once by (user id, movie id) instead of scanning every
# prediction for each empty cell. setdefault keeps the first estimate of a pair.
estimate_by_pair = {}
for prediction in refinedMyAlgo.predictions:
    estimate_by_pair.setdefault((prediction.uid, prediction.iid), prediction.est)

# Replace each missing rating (stored as 0) with the model's estimate.
# A pair without a prediction raises a KeyError naming that pair.
for user_id in my_group_sparse.index:
    for movie_id in my_group_sparse.columns:
        if my_group_sparse.at[user_id, movie_id] == 0.0:
            my_group_sparse.at[user_id, movie_id] = estimate_by_pair[(str(user_id), str(movie_id))]

my_group_sparse.head()

movieId,1,10,32,34,36,39,44,47,48,50,...,176101,177763,178615,179401,179819,181719,182793,183635,184997,188301
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
77,4.113485,3.304979,2.76677,3.583659,2.882343,2.680963,2.712837,3.540116,3.386169,4.071391,...,3.19044,3.00446,3.495336,3.617657,3.432329,3.495336,3.495336,3.495336,3.995336,3.626423
243,3.915623,5.0,4.194984,4.080091,4.0,3.726564,4.0,4.34132,4.0,4.330774,...,4.289422,3.586268,3.254118,4.865789,4.043921,3.254118,3.254118,3.254118,3.754118,4.698571
420,4.0,3.614387,3.5,3.934594,3.870416,3.494822,2.742958,3.867857,3.119853,4.191797,...,3.383267,2.965422,3.482426,3.805099,3.435428,3.482426,3.482426,3.482426,3.982426,3.658131
452,4.853914,4.0,5.0,4.71202,4.982113,4.701036,4.0,5.0,4.177572,5.0,...,4.060444,3.938935,4.191775,4.613918,3.978463,4.191775,4.191775,4.191775,4.691775,4.653946
596,4.0,3.75399,3.5,4.0,4.047011,4.0,2.827683,4.075369,3.504559,3.5,...,2.5,3.5,3.5,4.0,3.5,3.5,3.5,3.5,4.0,4.0


In [7]:
########################################################################
# Least-misery aggregation: the group score of a movie is the lowest
# rating any group member gives it. The result is a one-row dataframe.
########################################################################
labels = list(my_group_sparse)
values = [
    float(min(list(my_group_sparse.iloc[:, position])))
    for position in range(len(labels))
]

# Single row labelled 900 (presumably the id of the pseudo-user standing for
# the group - TODO confirm), one column per movie.
agg_group_perf = pd.DataFrame(index=[900], columns=labels)
for position, lowest_rating in enumerate(values):
    agg_group_perf.iloc[0, position] = lowest_rating

agg_group_perf.head()

Unnamed: 0,1,10,32,34,36,39,44,47,48,50,...,176101,177763,178615,179401,179819,181719,182793,183635,184997,188301
900,3.91562,3.30498,2.76677,3.58366,2.88234,2.68096,2.71284,3.54012,3.11985,3.5,...,2.5,2.96542,3.25412,3.61766,3.43233,3.25412,3.25412,3.25412,3.75412,3.62642


In [8]:
# Turn the aggregated row into a list of {'rating', 'movieID'} records and
# rank it from the highest to the lowest group rating.
group_pref_dict = [
    {'rating': agg_group_perf.loc[900, movie_col], 'movieID': movie_col}
    for movie_col in agg_group_perf.columns
]
group_pref_dict.sort(key=lambda entry: entry['rating'], reverse=True)
group_pref_dict

[{'rating': 4.75411772510059, 'movieID': 33649},
 {'rating': 4.504739461827735, 'movieID': 57669},
 {'rating': 4.5, 'movieID': 43376},
 {'rating': 4.3411542378725185, 'movieID': 6350},
 {'rating': 4.3257482471522195, 'movieID': 38061},
 {'rating': 4.255150510862534, 'movieID': 122882},
 {'rating': 4.25411772510059, 'movieID': 4342},
 {'rating': 4.25411772510059, 'movieID': 7615},
 {'rating': 4.25411772510059, 'movieID': 111913},
 {'rating': 4.25411772510059, 'movieID': 171917},
 {'rating': 4.230338384811943, 'movieID': 76093},
 {'rating': 4.213262436731571, 'movieID': 2160},
 {'rating': 4.197721827235527, 'movieID': 3703},
 {'rating': 4.190486499639643, 'movieID': 5444},
 {'rating': 4.188178818695839, 'movieID': 1356},
 {'rating': 4.17918959630176, 'movieID': 1387},
 {'rating': 4.178572729377778, 'movieID': 2028},
 {'rating': 4.171559281320705, 'movieID': 122886},
 {'rating': 4.166061331251394, 'movieID': 166528},
 {'rating': 4.159179460795177, 'movieID': 51255},
 {'rating': 4.13980369

In [9]:
# Build the title and genre cosine-similarity matrices that get_similar_movies() reads.
refinedMyAlgo.calc_similarity_matrix()

In [10]:
# Use the group's top-ranked movies as references instead of ids copied by
# hand from an earlier run, so the recommendations follow the current ranking.
N_REFERENCES = 3
references = [entry['movieID'] for entry in group_pref_dict[:N_REFERENCES]]
recs = refinedMyAlgo.get_similar_movies(references)

In [25]:
# Print each reference movie followed by the movies most similar to it.
# zip() pairs every reference with its own recommendation list, replacing the
# manual counter.
for reference, similar_movies in zip(references, recs):
    reference_row = refinedMyAlgo.movies[refinedMyAlgo.movies['movieId'] == reference].iloc[0]
    print('Referência: {}\t gêneros: {}'.format(reference_row['title'], reference_row['genres']))

    for position, score in similar_movies:
        # The recommendation holds a row position in the similarity matrix,
        # so resolve it positionally (.iloc) rather than by index label (.loc).
        movie_row = refinedMyAlgo.movies.iloc[position]
        print('\tScore: {},\tmovieId: {},\ttitle: {},\tgenres: {}'.format(
            score, movie_row['movieId'], movie_row['title'], movie_row['genres']))

Referência: Saving Face	 gêneros: Comedy, Drama, Romance
	Score: 0.6280458576942632,	movieId: 4149,	title: Saving Silverman (Evil Woman),	genres: Comedy, Romance
	Score: 0.6143249334852072,	movieId: 156,	title: Blue in the Face,	genres: Comedy, Drama
	Score: 0.5578633128156909,	movieId: 107141,	title: Saving Mr. Banks,	genres: Comedy, Drama
	Score: 0.5283440968834665,	movieId: 491,	title: Man Without a Face, The,	genres: Drama
	Score: 0.5059725740736212,	movieId: 3831,	title: Saving Grace,	genres: Comedy
	Score: 0.5,	movieId: 4,	title: Waiting to Exhale,	genres: Comedy, Drama, Romance
	Score: 0.5,	movieId: 11,	title: American President, The,	genres: Comedy, Drama, Romance
	Score: 0.5,	movieId: 52,	title: Mighty Aphrodite,	genres: Comedy, Drama, Romance
	Score: 0.5,	movieId: 58,	title: Postman, The (Postino, Il),	genres: Comedy, Drama, Romance
	Score: 0.5,	movieId: 94,	title: Beautiful Girls,	genres: Comedy, Drama, Romance
Referência: In Bruges	 gêneros: Comedy, Crime, Drama, Thriller
	

In [25]:
# Inspect the raw (row position, combined similarity score) pairs returned for the first reference movie.
recs[0]

[(3091, 0.6280458576942632),
 (129, 0.6143249334852072),
 (8317, 0.5578633128156909),
 (428, 0.5283440968834665),
 (2863, 0.5059725740736212),
 (3, 0.5),
 (10, 0.5),
 (47, 0.5),
 (52, 0.5),
 (83, 0.5)]