In [1]:
from collections import defaultdict
import pandas as pd
from surprise import Reader, Dataset
from surprise import KNNWithMeans, KNNBasic, SVD
from surprise import accuracy
from surprise.model_selection import train_test_split, cross_validate

import matplotlib.pyplot as plt
import numpy as np
import json
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.metrics import precision_score

In [2]:
class RefinedMyAlgo():
    def __init__(self, rating_data='', data_frame='', movie_data=''):
        if rating_data:
            reader = Reader(line_format='user item rating timestamp', sep=',')
            self.ratings = Dataset.load_from_file(rating_data, reader)
#             self.trainset, self.testset = train_test_split(self.ratings, test_size=0.25)
            self.trainset = self.ratings.build_full_trainset()
            self.sim_options = {'name': 'cosine','user_based': False}
            self.df_ratings = pd.read_csv(rating_data, low_memory=False, names=['userId', 'movieId', 'rating','timestamp'])
        elif not data_frame.empty:
            reader = Reader(rating_scale=(0, 5))
            self.ratings = Dataset.load_from_df(data_frame[['userId', 'movieId', 'rating']], reader)
            self.trainset = self.ratings.build_full_trainset()
            self.sim_options = {'name': 'cosine','user_based': False}
            
        if movie_data:
            self.movies = pd.read_csv(movie_data, low_memory=False)
            self.movies['year'] = self.movies['title'].apply(lambda x: x[-5:-1])
            self.movies['title'] = self.movies['title'].apply(lambda x: x[:-7])
            self.movies['genres'] = self.movies['genres'].apply(lambda x: x.replace('|',', '))
            
    
    def random_group(self, n):
        self.users_list = list(self.df_ratings['userId'])
        random_group = random.sample(self.users_list,n)
        return random_group
        

        
    def set_k(self, k_value=''):
        if k_value:
            algo = KNNWithMeans(k=k_value, sim_options=self.sim_options)
            self.algo = algo
            self.algo.fit(self.trainset)
        else:
            algo = SVD()
            self.algo = algo
            self.algo.fit(self.trainset)
        
        
    def find_best_k(self, k_value=''):
        if k_value:
            print('K = {}'.format(k_value))
            algo = KNNWithMeans(k=k_value, sim_options=self.sim_options)
            return cross_validate(algo, self.ratings, measures=['RMSE', 'MAE'], cv=10, verbose=True)
        else:
            aux = []
            for k_value in [3, 5, 7, 10, 15, 20, 30, 40]:
                print('K = {}'.format(k_value))
                algo = KNNWithMeans(k=k_value, sim_options=self.sim_options)
                my_dict = cross_validate(algo, self.ratings, measures=['RMSE', 'MAE'], cv=10, verbose=False)
                my_dict['k_value'] = k_value
                aux.append(my_dict)
            return aux
    
    
    def set_testset(self, users):
        if users:
            user_ratings = self.trainset.ur
            movies_ids = list(self.movies['movieId'])
            global_mean=self.trainset.global_mean
            my_testset = []
            
            for user in users:
                iuid = self.trainset.to_inner_uid(str(user))
                for movie in movies_ids:
                    is_in = False
                    for rating in user_ratings[iuid]:
#                         print( 'MOVIE: {}, RATING: {}'.format(movie,bla.trainset.to_raw_iid(rating[0])) )
                        if int(movie) == int(self.trainset.to_raw_iid(int(rating[0]))):
                            is_in = True
                            break
                    if not is_in:
                        my_tuple = (str(user),str(movie),global_mean)
                        my_testset.append(my_tuple)
                        
            self.testset = my_testset
        else:
            testset = self.trainset.build_anti_testset()
            self.testset = testset
        return self.testset


    def predict_ratings(self,users=''):
        # # Predict ratings for all pairs (u, i) that are NOT in the training set.
#         testset = self.trainset.build_anti_testset()
#         self.testset = testset
        testset = self.set_testset(users)
        predictions = self.algo.test(testset)
        self.predictions = predictions
        
        
    def set_perfil_movies(self, users):
        metadata = pd.read_csv('../datasets/ml-latest-small/ratings.csv', low_memory=False, names=['userId', 'movieId', 'rating','timestamp'])
        metadata = metadata.drop(columns="timestamp")

        metadata_filtered = metadata[metadata.userId.isin(users)]

        self.group_sparse_mtx = pd.pivot_table(metadata_filtered, values='rating', index=['userId'], columns=['movieId'], fill_value=0)
        
        self.perfil_movies = list(self.group_sparse_mtx)
        
    
    ### You must call self.set_perfil_movies() before
    def set_candidate_movies(self):
        candidate_movies = []
        for item in refinedMyAlgo.movies.iterrows():
        #     get the movieId of each movie in movies dataframe
            if item[1].values[0] not in self.perfil_movies:
                candidate_movies.append(item[1].values[0])
        self.candidate_movies = candidate_movies
        
        
    def calc_similarity_matrix(self):
        #Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
        tfidf = TfidfVectorizer(stop_words='english')
        
        #Replace NaN with an empty string
        self.movies['title'] = self.movies['title'].fillna('')
        self.movies['genres'] = self.movies['genres'].fillna('')
        
        #Construct the required TF-IDF matrix by fitting and transforming the data
        tfidf_matrix_title = tfidf.fit_transform(self.movies['title'])
        tfidf_matrix_genres = tfidf.fit_transform(self.movies['genres'])
        
        #Compute the cosine similarity matrix
        self.cosine_sim_movies_title = cosine_similarity(tfidf_matrix_title, tfidf_matrix_title)
        self.cosine_sim_movies_genres = cosine_similarity(tfidf_matrix_genres, tfidf_matrix_genres)
        
        
    def get_similar_movies(self, references, title_weight=0.8):
        recs = []
        for movie in references:
            # Get the pairwsie similarity scores of all movies with that movie
            movie_idx = int(self.movies[self.movies['movieId']==movie['movieID']].index[0])
            sim_scores_title = list(enumerate(self.cosine_sim_movies_title[movie_idx]))
            sim_scores_genres = list(enumerate(self.cosine_sim_movies_genres[movie_idx]))
            
            # Calculate total similarity based on title and genres
            total_sim_score = []
            for i in range(len(sim_scores_title)):
#                 print("sim_score_title= {}\t sim_score_genres= {}".format(sim_scores_title[i][1], sim_scores_genres[i][1]))
                aux = (sim_scores_title[i][1]*title_weight) + (sim_scores_genres[i][1]*(1-title_weight))
                total_sim_score.append((i, aux))
#                 print("sim_score_total= {}".format(total_sim_score))
                
            # Sort the movies based on the similarity scores
            total_sim_score = sorted(total_sim_score, key=lambda x: x[1], reverse=True)
            self.total_sim_score = total_sim_score
            
            candidates_sim_score = []
            for item in total_sim_score:
                if self.movies.loc[item[0]].values[0] not in self.perfil_movies:
                    candidates_sim_score.append(item)
            
            # Get the scores of the 10 most similar movies
            candidates_sim_score = candidates_sim_score[1:11]
            
            recs.append(candidates_sim_score)
            
        return recs
    
    
    def get_relevance_score(self, recs, references):
        count = 0
        recs_dict = []
        for reference in references:
        #     print('Referência: {}\t gêneros: {}'.format(refinedMyAlgo.movies[refinedMyAlgo.movies['movieId']==reference['movieID']].values[0][1], refinedMyAlgo.movies[refinedMyAlgo.movies['movieId']==reference['movieID']].values[0][2]))

            for movie in recs[count]:
                aux = {}

                movie_id = self.movies.loc[movie[0]].values[0]
                movie_title = self.movies.loc[movie[0]].values[1]
                movie_genres = self.movies.loc[movie[0]].values[2]
                movie_similarity = movie[1]
                movie_relevance = round(((reference['rating']/5.0)+movie_similarity)/2, 3)

                aux['movie_id'] = movie_id
                aux['movie_title'] = movie_title
                aux['movie_genres'] = movie_genres
                aux['movie_similarity'] = movie_similarity
                aux['movie_relevance'] = movie_relevance

                recs_dict.append(aux)

        #         print('\tSim: {},\trelevance: {},\tmovieId: {},\ttitle: {}'.format(aux['movie_similarity'], aux['movie_relevance'], aux['movie_id'], aux['movie_title']))

            count=count+1

        recs_dict = sorted(recs_dict, key = lambda i: i['movie_relevance'],reverse=True)

        return recs_dict
    
    
    def calc_distance_item_in_list(self, item, this_list, title_weight=0.8):

        idx_i = int(self.movies[self.movies['movieId']==item['movie_id']].index[0])

        total_dist = 0
        for movie in this_list:
            
            idx_j = int(self.movies[self.movies['movieId']==int(movie['movie_id'])].index[0])

            sim_i_j = (self.cosine_sim_movies_title[idx_i][idx_j]*title_weight) + (self.cosine_sim_movies_genres[idx_i][idx_j]*(1-title_weight))
            dist_i_j = 1 - sim_i_j
            total_dist = total_dist + dist_i_j

        result = total_dist/len(this_list)
        return result
    
    
    def calc_diversity_score(self, actual_list, candidates_list, alfa=0.5):
        '''
        This function implemented here was based on MARIUS KAMINSKAS and DEREK BRIDGE paper: Diversity, Serendipity, Novelty, and Coverage: A Survey and Empirical Analysis of Beyond-Accuracy Objectives in Recommender Systems
        func(i,R) = (relevance[i]*alfa) + (dist_i_R(i,R)*(1-alfa))
        '''
        diversity_score = []
        count = 0

        for item in candidates_list:

            aux = {}
            dist_item_R = self.calc_distance_item_in_list(item=item, this_list=actual_list)
            aux['div_score'] = (item['movie_relevance']*alfa) + (dist_item_R*(1-alfa))
            aux['idx'] = count
            diversity_score.append(aux)
            count = count + 1

        return diversity_score
    
    
    def diversify_recs_list(self, recs, n=10):
        '''
        This function implemented here was based on MARIUS KAMINSKAS and DEREK BRIDGE paper: Diversity, Serendipity, Novelty, and Coverage: A Survey and Empirical Analysis of Beyond-Accuracy Objectives in Recommender Systems
        The Greedy Reranking Algorithm.
        '''
        diversified_list = []
        
        while len(diversified_list) < n:
            if len(diversified_list) == 0:
                diversified_list.append(recs[0])
                recs.pop(0)
            else:
                diversity_score = self.calc_diversity_score(actual_list=diversified_list, candidates_list=recs)
                diversity_score = sorted(diversity_score, key = lambda i: i['div_score'],reverse=True)
#               #  Add the item that maximize diversity in the list 
                item = diversity_score[0]
                diversified_list.append(recs[item['idx']])
#               #  Remove this item from the candidates list
                recs.pop(item['idx'])
    
        return diversified_list
    
    
    def divesify_recs_list_bounded_random(self, recs, n=10):
        '''
        This function implemented here was based on KEITH BRADLEY and BARRY SMYTH paper: Improving Recommendation Diversity
        The Bounded Random Selection Algorithm.
        '''
        diversified_list = random.sample(recs,n)

        return diversified_list
                
        
#     def get_ILS(self, final_recs, title_weight=0.8):
#         big_list = []
#         for i in final_recs:
#             movie_idx_i = int(self.movies[self.movies['movieId']==i['movie_id']].index[0])
#             small_list = []
#             for j in final_recs:
#                 movie_idx_j = int(self.movies[self.movies['movieId']==j['movie_id']].index[0])
#                 sim_genre = self.cosine_sim_movies_genres[movie_idx_i][movie_idx_j]
#                 sim_title = self.cosine_sim_movies_title[movie_idx_i][movie_idx_j]
#                 total_sim = (sim_title*title_weight) + (sim_genre*(1-title_weight))
#                 small_list.append(total_sim)
#             big_list.append(small_list)

#         return big_list

    def calc_dist_i_j(self, idx_i, idx_j, title_weight=0.8):
        sim_genre = self.cosine_sim_movies_genres[idx_i][idx_j]
        sim_title = self.cosine_sim_movies_title[idx_i][idx_j]
        total_sim = (sim_title*title_weight) + (sim_genre*(1-title_weight))
        dist_score = 1 - total_sim

        return dist_score
    
    
    def get_distance_matrix(self, final_recs, title_weight=0.8):
        dist_matrix = []
        for i in final_recs:
            aux = []
            movie_idx_i = int(self.movies[self.movies['movieId']==i['movie_id']].index[0])
            for j in final_recs:
                movie_idx_j = int(self.movies[self.movies['movieId']==j['movie_id']].index[0])
                dist_i_j = self.calc_dist_i_j(movie_idx_i, movie_idx_j, title_weight=0.8)
                aux.append(dist_i_j)
            dist_matrix.append(aux)
            
        return dist_matrix
    
    def get_ILD_score(self, final_recs, title_weight=0.8):
        dist_matrix = self.get_distance_matrix(final_recs, title_weight=0.8)
        np_dist_mtx = np.array(dist_matrix)
        upper_right = np.triu_indices(np_dist_mtx.shape[0], k=1)

        ild_score = np.mean(np_dist_mtx[upper_right])
        
        return ild_score
    
    
    
    # # # # # # # # # # PRECISION Module # # # # # # # # # #
    
    def get_mean(self, movie):
        converted_values = []
        for item in movie['ratings']:
            for bla in item:
                aux = float(bla)
                converted_values.append(aux)

        my_mean = sum(converted_values) / len(converted_values)
        my_mean = round(my_mean, 3)
        return my_mean
    
    
    def get_movies_means(self, movies_list, at):
        my_copy = self.df_ratings.copy()

        df_movies_ratings = my_copy.groupby('movieId')['rating'].apply(list).reset_index(name='ratings')

        movies_means = []

        for item in movies_list[:at]:
            movie = df_movies_ratings[df_movies_ratings['movieId']==item['movie_id']]
            movies_means.append(self.get_mean(movie))

        return movies_means
    
    
    def binary_mean(self, movies_mean, cutoff):
        binary_mean = []
        returned_movies = []
        for item in movies_mean:
            if item >= cutoff:
                binary_mean.append(1)
            else:
                binary_mean.append(0)

            returned_movies.append(1)

        return precision_score(binary_mean, returned_movies)
    
    
    def precision_at_offline(self, movies_list, at):
    
        global_mean = self.trainset.global_mean
        movies_list_mean = self.get_movies_means(movies_list, at)

        print("Global mean: {}, movies_list_mean: {}".format(global_mean, movies_list_mean))

        precision = self.binary_mean(movies_list_mean, global_mean)
        return precision

In [3]:
refinedMyAlgo = RefinedMyAlgo(rating_data='../datasets/ml-latest-small/ratings.csv', movie_data='../datasets/ml-latest-small/movies.csv')
refinedMyAlgo.set_k()

In [4]:
refinedMyAlgo.calc_similarity_matrix()

In [5]:
# # # FIXED GROUP
# my_group = [77,596,452,243,420]

# # # RANDOM GROUP
my_group = refinedMyAlgo.random_group(5)
print(my_group)

refinedMyAlgo.predict_ratings(users=my_group)
print(len(refinedMyAlgo.predictions))

[211, 368, 297, 469, 135]
47336


In [6]:
# metadata = pd.read_csv('ml-latest-small/ratings.csv', low_memory=False, names=['userId', 'movieId', 'rating','timestamp'])
# metadata = metadata.drop(columns="timestamp")

# metadata_filtered = metadata[metadata.userId.isin(my_group)]

# my_group_sparse = pd.pivot_table(metadata_filtered, values='rating', index=['userId'], columns=['movieId'], fill_value=0)
# my_group_sparse.head()

refinedMyAlgo.set_perfil_movies(users=my_group)
refinedMyAlgo.set_candidate_movies()

# print(refinedMyAlgo.perfil_movies)
# print(refinedMyAlgo.candidate_movies)
refinedMyAlgo.group_sparse_mtx.head()

movieId,1,2,3,6,10,11,16,21,22,29,...,82461,85414,87232,89745,89864,90405,91500,91529,91542,95510
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
135,4,3,0,0,0,0,0,3,0,0,...,0,0,0,0,0.0,0,0,0,0,0.0
211,0,0,0,0,0,0,0,0,0,0,...,2,2,4,5,3.5,4,4,5,4,4.5
297,0,0,0,5,0,0,0,0,2,0,...,0,0,0,0,0.0,0,0,0,0,0.0
368,0,0,3,4,3,0,4,3,3,0,...,0,0,0,0,0.0,0,0,0,0,0.0
469,4,0,0,3,2,3,0,0,0,4,...,0,0,0,0,0.0,0,0,0,0,0.0


In [7]:
group_filled_mtx = refinedMyAlgo.group_sparse_mtx.copy()

for index, row in group_filled_mtx.iterrows():
    for col in list(group_filled_mtx):
        if(group_filled_mtx.loc[index,col] == 0.0):
            aux = list(filter(lambda x: x.uid==str(index) and x.iid==str(col), refinedMyAlgo.predictions))
            group_filled_mtx.loc[index,col] = aux[0].est

group_filled_mtx = group_filled_mtx.round(decimals=3)
group_filled_mtx.head()

movieId,1,2,3,6,10,11,16,21,22,29,...,82461,85414,87232,89745,89864,90405,91500,91529,91542,95510
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
135,4.0,3.0,3.574,3.851,3.382,3.73,4.211,3.0,3.448,4.004,...,3.548,3.303,4.254,4.125,3.907,3.004,3.489,4.261,3.789,3.47
211,4.188,3.721,3.54,3.772,3.57,3.643,3.968,3.503,3.409,4.14,...,2.0,2.0,4.0,5.0,3.5,4.0,4.0,5.0,4.0,4.5
297,3.201,2.657,2.516,5.0,2.46,2.577,3.195,2.771,2.0,3.145,...,2.291,2.725,3.046,3.047,2.899,2.671,2.59,3.116,3.073,2.458
368,3.138,2.889,3.0,4.0,3.0,2.8,4.0,3.0,3.0,3.168,...,2.806,2.916,3.22,3.432,2.893,2.788,2.944,3.399,3.11,2.7
469,4.0,3.603,3.211,3.0,2.0,3.0,4.185,3.342,3.244,4.0,...,3.265,3.457,3.913,4.252,3.685,3.432,3.799,4.131,3.913,3.641


In [8]:
def apply_aggregation_strategy(group_filled_mtx, technique = 'AWM'):
    values = []
    labels = []
    for i in range(0,len(list(group_filled_mtx))):
        my_col = group_filled_mtx.iloc[ : ,i]
        label = my_col.name
        my_col = list(my_col)

        labels.append(label)
        values.append(0.0)
        
        
        if technique is 'LM':
            values.append( float(min(my_col)) )
        elif technique is 'MP':
            values.append( float(max(my_col)) )
        else:
            if float(min(my_col)) <= 2 :
                values.append( float(min(my_col)) )
            else:
                values.append( float( sum(my_col) / len(my_col) ) )
                

    print('\n-- -- --  -- > Aggregation Technique: {}\n'.format(technique))
    
    # print('Array values: {}, Array labels: {}'.format(values, labels))
    agg_group_perf = pd.DataFrame(index=[900], columns=labels)

    for i in range(0,len(list(agg_group_perf))):
        agg_group_perf.iloc[0, i] = values[i]

    agg_group_perf = agg_group_perf.round(decimals=3)
    
    return agg_group_perf

In [9]:
g_profile = apply_aggregation_strategy(group_filled_mtx, technique = 'AWM')
g_profile.head()


-- -- --  -- > Aggregation Technique: AWM



Unnamed: 0,1,2,3,6,10,11,16,21,22,29,...,82461,85414,87232,89745,89864,90405,91500,91529,91542,95510
900,0,3.7054,0,3.174,0,3.1682,0,3.9246,0,2,...,2.8516,0,2,0,3.2112,0,3.1984,0,3.2458,0


In [10]:
########################################################################
# # Implementing LEAST MISERY ending-up in a dataframe
########################################################################
values = []
labels = []
for i in range(0,len(list(group_filled_mtx))):
    my_col = group_filled_mtx.iloc[ : ,i]
    label = my_col.name
    my_col = list(my_col)
    
    labels.append(label)
    values.append( float(min(my_col)) )
    
# print('Array values: {}, Array labels: {}'.format(values, labels))
agg_group_perf = pd.DataFrame(index=[900], columns=labels)

for i in range(0,len(list(agg_group_perf))):
    agg_group_perf.iloc[0, i] = values[i]

    
agg_group_perf = agg_group_perf.round(decimals=3)
agg_group_perf.head()

Unnamed: 0,1,2,3,6,10,11,16,21,22,29,...,82461,85414,87232,89745,89864,90405,91500,91529,91542,95510
900,3.138,2.657,2.516,3,2,2.577,3.195,2.771,2,3.145,...,2,2,3.046,3.047,2.893,2.671,2.59,3.116,3.073,2.458


In [11]:
########################################################################
# # Implementing MOST PLEASURE ending-up in a dataframe
########################################################################
values = []
labels = []
for i in range(0,len(list(group_filled_mtx))):
    my_col = group_filled_mtx.iloc[ : ,i]
    label = my_col.name
    my_col = list(my_col)
    
    labels.append(label)
    values.append( float(max(my_col)) )
    
# print('Array values: {}, Array labels: {}'.format(values, labels))
agg_group_perf = pd.DataFrame(index=[900], columns=labels)

for i in range(0,len(list(agg_group_perf))):
    agg_group_perf.iloc[0, i] = values[i]

    
agg_group_perf = agg_group_perf.round(decimals=3)
agg_group_perf.head()

Unnamed: 0,1,2,3,6,10,11,16,21,22,29,...,82461,85414,87232,89745,89864,90405,91500,91529,91542,95510
900,4.188,3.721,3.574,5,3.57,3.73,4.211,3.503,3.448,4.14,...,3.548,3.457,4.254,5,3.907,4,4,5,4,4.5


In [12]:
########################################################################
# # Implementing AVERAGE WITHOUT MISERY: treshold=2  ending-up in a dataframe
########################################################################
values = []
labels = []
for i in range(0,len(list(group_filled_mtx))):
    my_col = group_filled_mtx.iloc[ : ,i]
    label = my_col.name
    my_col = list(my_col)
    
    labels.append(label)
    if float(min(my_col)) <= 2 :
        values.append( float(min(my_col)) )
    else:
        values.append( float( sum(my_col) / len(my_col) ) )
    
# print('Array values: {}, Array labels: {}'.format(values, labels))
agg_group_perf = pd.DataFrame(index=[900], columns=labels)

for i in range(0,len(list(agg_group_perf))):
    agg_group_perf.iloc[0, i] = values[i]

    
agg_group_perf = agg_group_perf.round(decimals=3)
agg_group_perf.head()

Unnamed: 0,1,2,3,6,10,11,16,21,22,29,...,82461,85414,87232,89745,89864,90405,91500,91529,91542,95510
900,3.7054,3.174,3.1682,3.9246,2,3.15,3.9118,3.1232,2,3.6914,...,2,2,3.6866,3.9712,3.3768,3.179,3.3644,3.9814,3.577,3.3538


In [13]:
group_pref_dict = []
for col in list(agg_group_perf):
    my_dict = {}
#     print('Valor: {}, Coluna: {}'.format(agg_group_perf.loc[900,col], col))
    my_dict['rating'] = agg_group_perf.loc[900,col]
    my_dict['movieID'] = col
    group_pref_dict.append(my_dict)
    
group_pref_dict = sorted(group_pref_dict, key = lambda i: i['rating'],reverse=True)
group_pref_dict

[{'rating': 4.6202, 'movieID': 1089},
 {'rating': 4.5248, 'movieID': 318},
 {'rating': 4.5214, 'movieID': 1208},
 {'rating': 4.4962, 'movieID': 1197},
 {'rating': 4.4542, 'movieID': 1193},
 {'rating': 4.4361999999999995, 'movieID': 1252},
 {'rating': 4.430400000000001, 'movieID': 1387},
 {'rating': 4.4194, 'movieID': 296},
 {'rating': 4.4044, 'movieID': 1617},
 {'rating': 4.3976, 'movieID': 858},
 {'rating': 4.3774, 'movieID': 750},
 {'rating': 4.3706000000000005, 'movieID': 2959},
 {'rating': 4.369, 'movieID': 527},
 {'rating': 4.26, 'movieID': 593},
 {'rating': 4.2524, 'movieID': 2571},
 {'rating': 4.239199999999999, 'movieID': 50},
 {'rating': 4.216199999999999, 'movieID': 1201},
 {'rating': 4.2076, 'movieID': 1136},
 {'rating': 4.2058, 'movieID': 4226},
 {'rating': 4.2054, 'movieID': 47},
 {'rating': 4.2022, 'movieID': 1259},
 {'rating': 4.1988, 'movieID': 1199},
 {'rating': 4.1968000000000005, 'movieID': 1358},
 {'rating': 4.1903999999999995, 'movieID': 1200},
 {'rating': 4.1818, 

In [14]:
references = group_pref_dict[0:10]
# references = group_pref_dict

for item in references:
    print(item)
# print(references)

{'rating': 4.6202, 'movieID': 1089}
{'rating': 4.5248, 'movieID': 318}
{'rating': 4.5214, 'movieID': 1208}
{'rating': 4.4962, 'movieID': 1197}
{'rating': 4.4542, 'movieID': 1193}
{'rating': 4.4361999999999995, 'movieID': 1252}
{'rating': 4.430400000000001, 'movieID': 1387}
{'rating': 4.4194, 'movieID': 296}
{'rating': 4.4044, 'movieID': 1617}
{'rating': 4.3976, 'movieID': 858}


In [15]:
# references=[904, 318, 38061, 31658, 57669]
recs = refinedMyAlgo.get_similar_movies(references)

In [16]:
# print(refinedMyAlgo.total_sim_score)

In [17]:
candidates_list = refinedMyAlgo.get_relevance_score(recs=recs, references=references)
print(len(candidates_list))

for item in candidates_list[0:20]:
    print('relevance: {}, title:{}'.format(item['movie_relevance'], item['movie_title']))

100
relevance: 0.71, title:Resident Evil: Apocalypse
relevance: 0.705, title:Bride Wars
relevance: 0.699, title:Undisputed III: Redemption
relevance: 0.697, title:Crimson Rivers 2: Angels of the Apocalypse (Rivières pourpres II - Les anges de l'apocalypse, Les)
relevance: 0.697, title:Art School Confidential
relevance: 0.69, title:Runaway Bride
relevance: 0.687, title:Redemption (Hummingbird)
relevance: 0.68, title:Father of the Bride
relevance: 0.68, title:Father of the Bride
relevance: 0.677, title:Princess Diaries, The
relevance: 0.675, title:Decoy Bride, The
relevance: 0.671, title:Scouts Guide to the Zombie Apocalypse
relevance: 0.671, title:Corpse Bride
relevance: 0.668, title:Little Princess, A
relevance: 0.668, title:Little Princess, The
relevance: 0.668, title:Bride & Prejudice
relevance: 0.651, title:Straw Dogs
relevance: 0.647, title:Hearts of Darkness: A Filmmakers Apocalypse
relevance: 0.646, title:War Dogs
relevance: 0.637, title:All Dogs Go to Heaven 2


In [18]:
my_candidates = candidates_list.copy()
final_recs_greedy = refinedMyAlgo.diversify_recs_list(recs=my_candidates)
for item in final_recs_greedy:
    print('relevance: {}, title:{}'.format(item['movie_relevance'], item['movie_title']))

relevance: 0.71, title:Resident Evil: Apocalypse
relevance: 0.705, title:Bride Wars
relevance: 0.699, title:Undisputed III: Redemption
relevance: 0.668, title:Little Princess, A
relevance: 0.697, title:Art School Confidential
relevance: 0.637, title:All Dogs Go to Heaven 2
relevance: 0.697, title:Crimson Rivers 2: Angels of the Apocalypse (Rivières pourpres II - Les anges de l'apocalypse, Les)
relevance: 0.69, title:Runaway Bride
relevance: 0.647, title:Hearts of Darkness: A Filmmakers Apocalypse
relevance: 0.687, title:Redemption (Hummingbird)


In [19]:
my_candidates = candidates_list.copy()
final_recs_random = refinedMyAlgo.divesify_recs_list_bounded_random(recs=my_candidates)
for item in final_recs_random:
    print('relevance: {}, title:{}'.format(item['movie_relevance'], item['movie_title']))

relevance: 0.552, title:Black Hawk Down
relevance: 0.668, title:Little Princess, The
relevance: 0.537, title:Foreign Correspondent
relevance: 0.545, title:Basketball Diaries, The
relevance: 0.543, title:Underworld: Blood Wars
relevance: 0.545, title:Georgia
relevance: 0.612, title:Isle of Dogs
relevance: 0.68, title:Father of the Bride
relevance: 0.552, title:Above the Rim
relevance: 0.535, title:Brick


In [20]:
# dist_i_R = refinedMyAlgo.calc_distance_item_in_list(item=candidates_list[1],this_list=candidates_list[0:5])
# dist_i_R

In [21]:
# print(refinedMyAlgo.calc_diversity_score(actual_list=candidates_list[0:5], candidates_list=candidates_list[10:20]))

In [22]:
# my_candidates = candidates_list.copy()
# final_recs = refinedMyAlgo.diversify_recs_list(recs=my_candidates)
# for item in final_recs:
#     print('relevance: {}, title:{}'.format(item['movie_relevance'], item['movie_title']))

In [23]:
standard_recs = candidates_list[0:10]
# refinedMyAlgo.get_ILD_score(standard_recs, title_weight=0.8)

In [24]:
standard_recs = candidates_list[0:10]
print('ILD - standard recs: {}'.format(refinedMyAlgo.get_ILD_score(standard_recs, title_weight=0.8)))
print('ILD - div greedy algo: {}'.format(refinedMyAlgo.get_ILD_score(final_recs_greedy, title_weight=0.8)))
print('ILD - div random algo: {}'.format(refinedMyAlgo.get_ILD_score(final_recs_random, title_weight=0.8)))

ILD - standard recs: 0.8684364395203267
ILD - div greedy algo: 0.9369153258100315
ILD - div random algo: 0.9675219742174661


In [25]:
print('P@10 - standard recs: {}'.format(refinedMyAlgo.precision_at_offline(standard_recs, 10)))
print('P@10 - div greedy algo: {}'.format(refinedMyAlgo.precision_at_offline(final_recs_greedy, 10)))
print('P@10 - div random algo: {}'.format(refinedMyAlgo.precision_at_offline(final_recs_random, 10)))

Global mean: 3.504348811095132, movies_list_mean: [2.929, 3.5, 4.0, 2.5, 3.333, 2.833, 2.0, 3.25, 3.5, 3.037]
P@10 - standard recs: 0.1
Global mean: 3.504348811095132, movies_list_mean: [2.929, 3.5, 4.0, 3.905, 3.333, 3.182, 2.5, 2.833, 3.8, 2.0]
P@10 - div greedy algo: 0.3
Global mean: 3.504348811095132, movies_list_mean: [3.81, 3.5, 3.5, 3.467, 3.0, 4.0, 3.5, 3.5, 2.0, 3.875]
P@10 - div random algo: 0.3


In [26]:
print(refinedMyAlgo.precision_at_offline(final_recs, 10))

NameError: name 'final_recs' is not defined

In [27]:
'''Runing evaluation metrics on BASELINE outputs'''

def parse_to_evaluate(movie_id_list):
    output = []
    for i in movie_id_list:
        cube = {}
        cube['movie_id'] = i
        output.append(cube)
        
    return output

In [28]:
# for item in output_1:
# #     movie = df_movies_ratings[df_movies_ratings['movieId']==item['movie_id']]
#     print(item['movie_id'])

In [29]:
output_1 = parse_to_evaluate([4437,2118,5244,50740,1251,27773,1261,6385,57528,3400])
output_2 = parse_to_evaluate([66934,1223,2498,8605,50685,70183,1381,6155,111743,8493])
output_3 = parse_to_evaluate([93721,55118,54962,58025,111844,2405,33646,5026,53129,51084])
output_4 = parse_to_evaluate([2393,2657,1373,352,1643,1527,4002,3094,1275,3225])
output_5 = parse_to_evaluate([65261,94018,7132,2648,6870,86290,3623,59725,808,3635])

# print(output_1)
output_lm_1 = parse_to_evaluate([4437,5570,50806,1273,27773,57640,2110,1227,6331,1237])
output_lm_2 = parse_to_evaluate([2846,932,6155,69844,64957,8633,111781,46530,1193,53464])
output_lm_3 = parse_to_evaluate([93721,55118,58025,111844,2405,122882,5225,122924,60487,41863])
output_lm_4 = parse_to_evaluate([2393,2657,352,1372,7260,1643,1527,5009,165483,2338])
output_lm_5 = parse_to_evaluate([65261,94018,7132,2648,6870,61240,3435,79592,5151,104863])

In [36]:
## ## ## ## EVALUATING ILD SCORE - ONILNE EXPERIMENT ## ## ## ##

g31_prop = parse_to_evaluate([130520,2006,828,3675,1253,33493,69529,31698,77798,26467])
g32_prop = parse_to_evaluate([4526,27075,4322,8591,1690,104944,84942,26038,6835,104925])
g33_prop = parse_to_evaluate([33493,6810,122092,27441,172591,4641,1466,2628,3634,120783])
g51_prop = parse_to_evaluate([45186,89386,405,2709,2628,189333,3634,107723,94122,44243])
g52_prop = parse_to_evaluate([4921,136297,122092,51927,3675,115170,26467,8261,479,147142])
g53_prop = parse_to_evaluate([25746,1377,3909,3821,2145,6810,153,72605,90603,110771])


g31_bl = parse_to_evaluate([1278,180777,313,422,325,46530,3825,11,515,531])
g32_bl = parse_to_evaluate([61,494,1199,48783,1061,531,1225,168,4967,332])
g33_bl = parse_to_evaluate([181,458,158872,3953,5601,5597,26547,27008,40578,45648])
g51_bl = parse_to_evaluate([513,1021,455,49272,11,3625,5384,5198,26413,26854])
g52_bl = parse_to_evaluate([471,118,4148,3861,710,610,1201,3802,132660,272])
g53_bl = parse_to_evaluate([140110,4,747,152081,31658,475,513,5893,999,112])

In [39]:
print("Proposal:")
score_g31_prop = refinedMyAlgo.get_ILD_score(g31_prop, title_weight=0.8)
score_g32_prop = refinedMyAlgo.get_ILD_score(g32_prop, title_weight=0.8)
score_g33_prop = refinedMyAlgo.get_ILD_score(g33_prop, title_weight=0.8)
score_g51_prop = refinedMyAlgo.get_ILD_score(g51_prop, title_weight=0.8)
score_g52_prop = refinedMyAlgo.get_ILD_score(g52_prop, title_weight=0.8)
score_g53_prop = refinedMyAlgo.get_ILD_score(g53_prop, title_weight=0.8)

print(score_g31_prop)
print(score_g32_prop)
print(score_g33_prop)
print(score_g51_prop)
print(score_g52_prop)
print(score_g53_prop)

print("Proposal mean: {}".format( (score_g31_prop+score_g32_prop+score_g33_prop+score_g51_prop+score_g52_prop+score_g53_prop)/6 ))


print("\n\nBaseline:")
score_g31_bl = refinedMyAlgo.get_ILD_score(g31_bl, title_weight=0.8)
score_g32_bl = refinedMyAlgo.get_ILD_score(g32_bl, title_weight=0.8)
score_g33_bl = refinedMyAlgo.get_ILD_score(g33_bl, title_weight=0.8)
score_g51_bl = refinedMyAlgo.get_ILD_score(g51_bl, title_weight=0.8)
score_g52_bl = refinedMyAlgo.get_ILD_score(g52_bl, title_weight=0.8)
score_g53_bl = refinedMyAlgo.get_ILD_score(g53_bl, title_weight=0.8)

print(score_g31_bl)
print(score_g32_bl)
print(score_g33_bl)
print(score_g51_bl)
print(score_g52_bl)
print(score_g53_bl)

print("Baseline mean: {}".format( (score_g31_bl+score_g32_bl+score_g33_bl+score_g51_bl+score_g52_bl+score_g53_bl)/6 ))

Proposal:
0.9277894816548834
0.9299749807661832
0.9660435976315773
0.9282406790453305
0.9665268917393663
0.917158732584889
Proposal mean: 0.9392890605703718


Baseline:
0.9733954205309023
0.9652587847434008
0.9675618349572882
0.9722793984282531
0.9467043528568628
0.9601956035642658
Baseline mean: 0.9642325658468288


In [30]:
refinedMyAlgo.get_ILD_score(output_1, title_weight=0.8)

0.9694159990485626

In [31]:
refinedMyAlgo.get_ILD_score(output_2, title_weight=0.8)

0.9442230039447174

In [32]:
refinedMyAlgo.get_ILD_score(output_3, title_weight=0.8)

0.9593941978720311

In [33]:
refinedMyAlgo.get_ILD_score(output_4, title_weight=0.8)

0.9354006319845902

In [34]:
refinedMyAlgo.get_ILD_score(output_5, title_weight=0.8)

0.97624502398981

In [None]:
refinedMyAlgo.precision_at_offline(output_1,10)

In [None]:
refinedMyAlgo.precision_at_offline(output_2,10)

In [None]:
refinedMyAlgo.precision_at_offline(output_3,10)

In [None]:
refinedMyAlgo.precision_at_offline(output_4,10)

In [None]:
refinedMyAlgo.precision_at_offline(output_5,10)

In [None]:
baseline_p3 = []
p3_g1 = refinedMyAlgo.precision_at_offline(output_1,3)
p3_g2 = refinedMyAlgo.precision_at_offline(output_2,3)
p3_g3 = refinedMyAlgo.precision_at_offline(output_3,3)
p3_g4 = refinedMyAlgo.precision_at_offline(output_4,3)
p3_g5 = refinedMyAlgo.precision_at_offline(output_5,3)

baseline_p3.append(p3_g1)
baseline_p3.append(p3_g2)
baseline_p3.append(p3_g3)
baseline_p3.append(p3_g4)
baseline_p3.append(p3_g5)

print('G1 = {}\nG2 = {}\nG3 = {}\nG4 = {}\nG5 = {}'.format(p3_g1, p3_g2, p3_g3, p3_g4, p3_g5))

In [None]:
# with open('baseline_p3.json', 'w') as json_file:  
#     json.dump(baseline_p3, json_file)

In [None]:
baseline_p5 = []
p5_g1 = refinedMyAlgo.precision_at_offline(output_1,5)
p5_g2 = refinedMyAlgo.precision_at_offline(output_2,5)
p5_g3 = refinedMyAlgo.precision_at_offline(output_3,5)
p5_g4 = refinedMyAlgo.precision_at_offline(output_4,5)
p5_g5 = refinedMyAlgo.precision_at_offline(output_5,5)

baseline_p5.append(p5_g1)
baseline_p5.append(p5_g2)
baseline_p5.append(p5_g3)
baseline_p5.append(p5_g4)
baseline_p5.append(p5_g5)

print('G1={}\nG2={}\nG3={}\nG4={}\nG5={}'.format(p5_g1, p5_g2, p5_g3, p5_g4, p5_g5))

In [None]:
# with open('baseline_p5.json', 'w') as json_file:  
#     json.dump(baseline_p5, json_file)

In [None]:
baseline_p10 = []
p10_g1 = refinedMyAlgo.precision_at_offline(output_1,10)
p10_g2 = refinedMyAlgo.precision_at_offline(output_2,10)
p10_g3 = refinedMyAlgo.precision_at_offline(output_3,10)
p10_g4 = refinedMyAlgo.precision_at_offline(output_4,10)
p10_g5 = refinedMyAlgo.precision_at_offline(output_5,10)

baseline_p10.append(p10_g1)
baseline_p10.append(p10_g2)
baseline_p10.append(p10_g3)
baseline_p10.append(p10_g4)
baseline_p10.append(p10_g5)

print('G1={}\nG2={}\nG3={}\nG4={}\nG5={}'.format(p10_g1, p10_g2, p10_g3, p10_g4, p10_g5))

In [None]:
# with open('baseline_p10.json', 'w') as json_file:  
#     json.dump(baseline_p10, json_file)

In [None]:
baseline_lm_ild = []

output_lm_1_ild = refinedMyAlgo.get_ILD_score(output_lm_1, title_weight=0.8)
output_lm_2_ild = refinedMyAlgo.get_ILD_score(output_lm_2, title_weight=0.8)
output_lm_3_ild = refinedMyAlgo.get_ILD_score(output_lm_3, title_weight=0.8)
output_lm_4_ild = refinedMyAlgo.get_ILD_score(output_lm_4, title_weight=0.8)
output_lm_5_ild = refinedMyAlgo.get_ILD_score(output_lm_5, title_weight=0.8)

baseline_lm_ild.append(output_lm_1_ild)
baseline_lm_ild.append(output_lm_2_ild)
baseline_lm_ild.append(output_lm_3_ild)
baseline_lm_ild.append(output_lm_4_ild)
baseline_lm_ild.append(output_lm_5_ild)

print(baseline_lm_ild)

# with open('baseline_lm_ild.json', 'w') as json_file:  
#     json.dump(baseline_lm_ild, json_file)

In [None]:
baseline_lm_p3 = []
p3_g1 = refinedMyAlgo.precision_at_offline(output_lm_1,3)
p3_g2 = refinedMyAlgo.precision_at_offline(output_lm_2,3)
p3_g3 = refinedMyAlgo.precision_at_offline(output_lm_3,3)
p3_g4 = refinedMyAlgo.precision_at_offline(output_lm_4,3)
p3_g5 = refinedMyAlgo.precision_at_offline(output_lm_5,3)

baseline_lm_p3.append(p3_g1)
baseline_lm_p3.append(p3_g2)
baseline_lm_p3.append(p3_g3)
baseline_lm_p3.append(p3_g4)
baseline_lm_p3.append(p3_g5)

print('G1={}\nG2={}\nG3={}\nG4={}\nG5={}'.format(p3_g1, p3_g2, p3_g3, p3_g4, p3_g5))

# with open('baseline_lm_p3.json', 'w') as json_file:  
#     json.dump(baseline_lm_p3, json_file)

In [None]:
baseline_lm_p5 = []
p5_g1 = refinedMyAlgo.precision_at_offline(output_lm_1,5)
p5_g2 = refinedMyAlgo.precision_at_offline(output_lm_2,5)
p5_g3 = refinedMyAlgo.precision_at_offline(output_lm_3,5)
p5_g4 = refinedMyAlgo.precision_at_offline(output_lm_4,5)
p5_g5 = refinedMyAlgo.precision_at_offline(output_lm_5,5)

baseline_lm_p5.append(p3_g1)
baseline_lm_p5.append(p3_g2)
baseline_lm_p5.append(p3_g3)
baseline_lm_p5.append(p3_g4)
baseline_lm_p5.append(p3_g5)

print('G1={}\nG2={}\nG3={}\nG4={}\nG5={}'.format(p5_g1, p5_g2, p5_g3, p5_g4, p5_g5))

# with open('baseline_lm_p5.json', 'w') as json_file:  
#     json.dump(baseline_lm_p5, json_file)

In [None]:
baseline_lm_p10 = []
p10_g1 = refinedMyAlgo.precision_at_offline(output_lm_1,10)
p10_g2 = refinedMyAlgo.precision_at_offline(output_lm_2,10)
p10_g3 = refinedMyAlgo.precision_at_offline(output_lm_3,10)
p10_g4 = refinedMyAlgo.precision_at_offline(output_lm_4,10)
p10_g5 = refinedMyAlgo.precision_at_offline(output_lm_5,10)

baseline_lm_p10.append(p10_g1)
baseline_lm_p10.append(p10_g2)
baseline_lm_p10.append(p10_g3)
baseline_lm_p10.append(p10_g4)
baseline_lm_p10.append(p10_g5)

print('G1={}\nG2={}\nG3={}\nG4={}\nG5={}'.format(p10_g1, p10_g2, p10_g3, p10_g4, p10_g5))

# with open('baseline_lm_p10.json', 'w') as json_file:  
#     json.dump(baseline_lm_p10, json_file)