In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [30]:
class DivClass():
    '''Content-based distances and intra-list diversity for a movie table.

    The table is read from a CSV that must provide the columns ``movieId``,
    ``title`` and ``genres`` (genres separated by ``|``).  After construction
    ``self.metadata`` is indexed by movieId and carries two extra columns:

    * ``year``  - release year parsed from a trailing "(YYYY)" in the title,
      as a string (missing when the title has no such suffix);
    * ``index`` - 0-based row position, used to address the similarity
      matrices built by ``calculate_cosine_similarity``.

    If ``movie_data`` is empty nothing is loaded and ``self.metadata`` is not
    created.
    '''

    # Relative weight of each component in the combined similarity score.
    TITLE_WEIGHT = 0.5
    GENRES_WEIGHT = 0.5

    # Trailing "(YYYY)" or "(YYYY-YYYY)" suffix (hyphen or en dash), including
    # the whitespace before it.  The captured group is the last year.
    _YEAR_SUFFIX = r'\s*\((?:\d{4}\s*[-\u2013]\s*)?(\d{4})\)$'

    def __init__(self, movie_data=''):
        '''Load and normalise the movie table.

        Parameters
        ----------
        movie_data : str or file-like, optional
            Anything accepted by ``pandas.read_csv``.  Falsy values skip
            loading entirely.

        Raises
        ------
        ValueError
            If the ``movieId`` column contains duplicates (the id is used as
            the row label, so it has to be unique).
        '''
        if movie_data:
            metadata = pd.read_csv(movie_data, low_memory=False)

            if not metadata['movieId'].is_unique:
                raise ValueError('movieId values must be unique')

            # Parse the year with a pattern instead of fixed slice offsets:
            # titles with trailing whitespace or without a year would otherwise
            # get a garbage year and lose the end of their title.  The .str
            # accessor also lets missing values pass through as NaN.
            titles = metadata['title'].str.strip()
            metadata['year'] = titles.str.extract(self._YEAR_SUFFIX, expand=False)
            metadata['title'] = titles.str.replace(self._YEAR_SUFFIX, '', regex=True)
            metadata['genres'] = metadata['genres'].str.replace('|', ', ', regex=False)

            # Label rows by movieId (passed as a plain array so the index stays
            # unnamed and the movieId column is kept).
            metadata = metadata.set_index(metadata['movieId'].values)

            # Positional column: row/column number in the similarity matrices.
            self.metadata = metadata.assign(index=list(range(len(metadata))))


    def calculate_cosine_similarity(self):
        '''Build the title and genres cosine-similarity matrices.

        Stores ``self.cosine_sim_title`` and ``self.cosine_sim_genres``; both
        are addressed by the positional ``index`` column of ``self.metadata``.
        Missing titles/genres are replaced by empty strings in
        ``self.metadata`` as a side effect.  Must be called before any of the
        distance methods.
        '''
        # Replace NaN with an empty string so the vectorizer accepts every row
        self.metadata['title'] = self.metadata['title'].fillna('')
        self.metadata['genres'] = self.metadata['genres'].fillna('')

        # One TF-IDF vectorizer per text field; English stop words such as
        # 'the', 'a' are removed.
        tfidf_matrix_title = TfidfVectorizer(stop_words='english').fit_transform(self.metadata['title'])
        tfidf_matrix_genres = TfidfVectorizer(stop_words='english').fit_transform(self.metadata['genres'])

        # Pairwise cosine similarity of every movie with every other movie
        self.cosine_sim_title = cosine_similarity(tfidf_matrix_title, tfidf_matrix_title)
        self.cosine_sim_genres = cosine_similarity(tfidf_matrix_genres, tfidf_matrix_genres)


    def get_pairwsie_similarity(self, movieIndex):
        '''Return the distance from one movie to every movie.

        (The misspelled method name is kept so existing callers keep working.)

        Parameters
        ----------
        movieIndex : int
            Row position of the movie (the ``index`` column, not the movieId).

        Returns
        -------
        list of (int, float)
            ``(position, distance)`` for every movie, where
            ``distance = 1 - (TITLE_WEIGHT * title_sim + GENRES_WEIGHT * genres_sim)``.
        '''
        combined_similarity = (
            (self.cosine_sim_title[movieIndex] * self.TITLE_WEIGHT)
            + (self.cosine_sim_genres[movieIndex] * self.GENRES_WEIGHT)
        )
        # Computed on the whole row at once rather than element by element.
        return list(enumerate((1 - combined_similarity).tolist()))


    def get_distance_x_y(self, movieIdX, movieIdY):
        '''Return the distance between two movies identified by movieId.

        Raises ``KeyError`` if either id is not in ``self.metadata``.
        '''
        row = self.metadata.loc[movieIdX, 'index']
        col = self.metadata.loc[movieIdY, 'index']

        # Read the single matrix cell directly instead of materialising the
        # distances to every movie just to pick one of them.
        combined_similarity = (
            (self.cosine_sim_title[row, col] * self.TITLE_WEIGHT)
            + (self.cosine_sim_genres[row, col] * self.GENRES_WEIGHT)
        )
        return 1 - combined_similarity


    def get_list_diversity(self, my_list):
        '''Function based on the article: Diversity, Serendipity, Novelty, and Coverage: A Survey and Empirical
        Analysis of Beyond-Accuracy Objectives in Recommender Systems by MARIUS KAMINSKAS and DEREK BRIDGE

        Computes the average distance over all ordered pairs of distinct
        entries of ``my_list``.

        Parameters
        ----------
        my_list : sequence
            Entries whose first element is a movieId, e.g. ``(movieId, score)``
            tuples; only element ``[0]`` is read.

        Returns
        -------
        float
            Sum of pairwise distances divided by ``n * (n - 1)``.  A list with
            fewer than two entries has no pairs and yields ``0.0``.
        '''
        item_count = len(my_list)
        if item_count < 2:
            # No pairs to compare; avoids dividing by zero.
            return 0.0

        distance_sum = 0
        for i in my_list:
            for j in my_list:
                if i != j:
                    distance_sum = distance_sum + self.get_distance_x_y(movieIdX=i[0], movieIdY=j[0])

        divisor = item_count * (item_count - 1)

        return distance_sum / divisor

In [31]:
# Load the movie table from the CSV (path is relative to the notebook's working directory)
# and preview the last rows: the frame is labelled by movieId and carries the derived
# 'year' and positional 'index' columns.
divClass = DivClass(movie_data='ml-latest-small/movies.csv')
divClass.metadata.tail()

Unnamed: 0,movieId,title,genres,year,index
193581,193581,Black Butler: Book of the Atlantic,"Action, Animation, Comedy, Fantasy",2017,9737
193583,193583,No Game No Life: Zero,"Animation, Comedy, Fantasy",2017,9738
193585,193585,Flint,Drama,2017,9739
193587,193587,Bungo Stray Dogs: Dead Apple,"Action, Animation",2018,9740
193609,193609,Andrew Dice Clay: Dice Rules,Comedy,1991,9741


In [32]:
# Build the title and genres cosine-similarity matrices (stored on divClass as
# cosine_sim_title / cosine_sim_genres). The distance calls in the cells below read
# these attributes, so this cell has to run first.
divClass.calculate_cosine_similarity()

In [33]:
# Distances from the movie at row position 0 (a position, not a movieId) to every movie,
# as (position, distance) tuples; show the first ten.
distances = divClass.get_pairwsie_similarity(movieIndex=0)
distances[0:10]

[(0, 0.0),
 (1, 0.5932111296099281),
 (2, 0.9236153788938614),
 (3, 0.9324323488573905),
 (4, 0.8662067615542999),
 (5, 1.0),
 (6, 0.9236153788938614),
 (7, 0.6726510797561449),
 (8, 1.0),
 (9, 0.8687933010966684)]

In [34]:
# Distance between two movies addressed by movieId (here ids 1 and 3); the ids are
# translated to row positions through the 'index' column of divClass.metadata.
dist_1_3 = divClass.get_distance_x_y(movieIdX=1, movieIdY=3)
dist_1_3

0.9236153788938614

In [35]:
# Sample list to score for diversity: one (movieId, 5) tuple per movie.
# Only the first element (the movieId) is read by get_list_diversity; the constant 5
# looks like a rating/score - confirm against the producer of such lists.
my_list = [
    (movie_id, 5)
    for movie_id in (
        131724, 5746, 6835, 8804, 26350,
        31522, 1140, 99636, 2969, 141718,
    )
]
my_list

[(131724, 5),
 (5746, 5),
 (6835, 5),
 (8804, 5),
 (26350, 5),
 (31522, 5),
 (1140, 5),
 (99636, 5),
 (2969, 5),
 (141718, 5)]

In [40]:
# distance_sum = 0

# for i in my_list:
#     for j in my_list:
#         if i != j:
#             distance_sum = distance_sum + divClass.get_distance_x_y(movieIdX=i[0], movieIdY=j[0])

# distance_sum

In [41]:
# divisor = len(my_list) * (len(my_list) - 1)
# divisor

In [42]:
# total = distance_sum/divisor
# total

In [39]:
# Intra-list diversity of my_list: the summed distance over ordered pairs of distinct
# entries divided by n * (n - 1).
print(divClass.get_list_diversity(my_list=my_list))

0.8487959406942316
