In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [2]:
# Load the MovieLens movie catalogue. The steps below read the 'title' and
# 'genres' columns of this file.
metadata = pd.read_csv('ml-latest-small/movies.csv', low_memory=False)

# Raw titles normally end in " (YYYY)". Cutting fixed character offsets off the
# end corrupts any row that deviates from that shape (trailing whitespace, a
# year range, or no year at all), so the suffix is matched with an anchored
# regex instead. The capture group is the (last) four-digit year.
YEAR_SUFFIX = r'\s*\((?:\d{4}\s*[-\u2013]\s*)?(\d{4})\)\s*$'

raw_title = metadata['title']
# 'year' stays a string column; rows without a recognisable year become NaN
# instead of receiving an arbitrary slice of the title.
metadata['year'] = raw_title.str.extract(YEAR_SUFFIX, expand=False)
# Remove only the matched year suffix; titles without one are left intact.
metadata['title'] = raw_title.str.replace(YEAR_SUFFIX, '', regex=True).str.strip()
# Genres arrive pipe-separated; turn them into a comma-separated string.
metadata['genres'] = metadata['genres'].str.replace('|', ', ', regex=False)

metadata.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"Adventure, Animation, Children, Comedy, Fantasy",1995
1,2,Jumanji,"Adventure, Children, Fantasy",1995
2,3,Grumpier Old Men,"Comedy, Romance",1995
3,4,Waiting to Exhale,"Comedy, Drama, Romance",1995
4,5,Father of the Bride Part II,Comedy,1995


In [3]:
# Distinct movie ids, in order of first appearance in the 'movieId' column.
movie_id_column = metadata['movieId']
movie_ids = movie_id_column.unique()

In [4]:
# Use the movie ids as row labels. Because an array (not a column name) is
# passed, the 'movieId' column itself is kept alongside the new labels.
metadata = metadata.set_index(keys=movie_ids)
metadata.tail()

Unnamed: 0,movieId,title,genres,year
193581,193581,Black Butler: Book of the Atlantic,"Action, Animation, Comedy, Fantasy",2017
193583,193583,No Game No Life: Zero,"Animation, Comedy, Fantasy",2017
193585,193585,Flint,Drama,2017
193587,193587,Bungo Stray Dogs: Dead Apple,"Action, Animation",2018
193609,193609,Andrew Dice Clay: Dice Rules,Comedy,1991


In [8]:
# Running counter 0, 1, ..., n-1 with one entry per row of `metadata`.
my_column = list(range(len(metadata['movieId'])))

len(my_column)

9742

In [11]:
# Attach the 0-based row counter as an 'index' column.
# Note: attribute access `metadata.index` still returns the row labels, so this
# column has to be read as metadata['index'].
metadata = metadata.assign(index=my_column)
metadata.tail()

Unnamed: 0,movieId,title,genres,year,index
193581,193581,Black Butler: Book of the Atlantic,"Action, Animation, Comedy, Fantasy",2017,9737
193583,193583,No Game No Life: Zero,"Animation, Comedy, Fantasy",2017,9738
193585,193585,Flint,Drama,2017,9739
193587,193587,Bungo Stray Dogs: Dead Apple,"Action, Animation",2018,9740
193609,193609,Andrew Dice Clay: Dice Rules,Comedy,1991,9741


In [30]:
# Direct, label-based access to the movie with the desired id
# (the row labels were set to the movie ids earlier in the notebook).
metadata.loc[100]

movieId                100
title            City Hall
genres     Drama, Thriller
year                  1996
index                   88
Name: 100, dtype: object

In [15]:
# Positional access into the movies DataFrame. For example, the row at
# position 100 (0-based) is the movie with id 113, "Before and After".
metadata.iloc[100]

movieId                 113
title      Before and After
genres       Drama, Mystery
year                   1996
index                   100
Name: 113, dtype: object

In [16]:
# TF-IDF vectorizer configured to drop English stop words such as 'the', 'a'.
tfidf = TfidfVectorizer(stop_words='english')

# Missing text becomes an empty string in both text columns.
for text_column in ('title', 'genres'):
    metadata[text_column] = metadata[text_column].fillna('')

In [17]:
# Build one TF-IDF matrix per text column by fitting and transforming it.
# NOTE(review): both calls refit the same `tfidf` object, so after this cell it
# only holds what was learned from 'genres'. The call order therefore matters;
# use a separate vectorizer if the fitted title vocabulary is needed later.
tfidf_matrix_title = tfidf.fit_transform(metadata['title'])
tfidf_matrix_genres = tfidf.fit_transform(metadata['genres'])

In [18]:
# Cosine similarity between every pair of rows of each TF-IDF matrix.
# Called with a single argument, cosine_similarity compares the matrix with itself.
cosine_sim_title = cosine_similarity(tfidf_matrix_title)
cosine_sim_genres = cosine_similarity(tfidf_matrix_genres)

In [19]:
# Pairwise similarity scores of every movie against the query movie, as
# (position, score) tuples. QUERY_ROW is a row position in the similarity
# matrices, not a movieId.
QUERY_ROW = 7
sim_scores_title = [(pos, score) for pos, score in enumerate(cosine_sim_title[QUERY_ROW])]
sim_scores_genres = [(pos, score) for pos, score in enumerate(cosine_sim_genres[QUERY_ROW])]

In [20]:
# Blend the title and genre similarities with equal weight.
# Each entry is a (position, blended similarity) tuple.
TITLE_WEIGHT = 0.5
GENRE_WEIGHT = 0.5

total_sim_score = [
    (pos, (sim_scores_title[pos][1]*TITLE_WEIGHT) + (sim_scores_genres[pos][1]*GENRE_WEIGHT))
    for pos in range(len(sim_scores_title))
]

In [22]:
# Convert each blended similarity into a distance (1 - similarity), keeping the
# (position, value) tuple layout.
distance_score = [(pos, 1 - entry[1]) for pos, entry in enumerate(total_sim_score)]

distance_score[0:10]

[(0, 0.6726510797561449),
 (1, 0.597642728118448),
 (2, 1.0),
 (3, 1.0),
 (4, 1.0),
 (5, 1.0),
 (6, 1.0),
 (7, 0.0),
 (8, 1.0),
 (9, 0.7995919784834015)]

In [32]:
# Distance value of the entry at position 2 (entries are (position, distance) tuples).
distance_score[2][1]

1.0

In [24]:
# Hand-written list of (id, value) pairs; every pair carries the same value, 5.
# NOTE(review): presumably (movieId, rating) tuples - confirm against the consumer.
_listed_ids = (131724, 5746, 6835, 8804, 26350, 31522, 1140, 99636, 2969, 141718)
my_list = [(listed_id, 5) for listed_id in _listed_ids]
my_list

[(131724, 5),
 (5746, 5),
 (6835, 5),
 (8804, 5),
 (26350, 5),
 (31522, 5),
 (1140, 5),
 (99636, 5),
 (2969, 5),
 (141718, 5)]