In [3]:
import numpy as np
import pandas as pd
import difflib          # Libreria utilizada para comparar la coincidencia cercana entre palabras
from sklearn.feature_extraction.text import TfidfVectorizer     # Para transformar los datos textuales en vectores de caracteristicas numericas
from sklearn.metrics.pairwise import cosine_similarity          # Se usa para hallar el valor de similaridad

In [4]:
# Cargando los datos del archivo csv a un dataframe de pandas
peliculas = pd.read_csv('PeliculasCreditos.csv')
peliculas.head(5)

Unnamed: 0,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,...,spoken_languages,status,tagline,title,vote_average,vote_count,Director,actores,release_year,return
0,30000000,"Animation, Comedy, Family",862.0,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,Pixar Animation Studios,US,1995-10-30,373554033,...,English,Released,,Toy Story,7.7,5415.0,John Lasseter,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...",1995,12.451801
1,65000000,"Adventure, Fantasy, Family",8844.0,en,When siblings Judy and Peter discover an encha...,17.015539,"TriStar Pictures, Teitler Film, Interscope Com...",US,1995-12-15,262797249,...,"English, Français",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,Joe Johnston,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",1995,4.043035
2,0,"Romance, Comedy",15602.0,en,A family wedding reignites the ancient feud be...,11.7129,"Warner Bros., Lancaster Gate",US,1995-12-22,0,...,English,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,Howard Deutch,"Walter Matthau, Jack Lemmon, Ann-Margret, Soph...",1995,0.0
3,16000000,"Comedy, Drama, Romance",31357.0,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,Twentieth Century Fox Film Corporation,US,1995-12-22,81452156,...,English,Released,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,Forest Whitaker,"Whitney Houston, Angela Bassett, Loretta Devin...",1995,5.09076
4,0,Comedy,11862.0,en,Just when George Banks has recovered from his ...,8.387519,"Sandollar Productions, Touchstone Pictures",US,1995-02-10,76578911,...,English,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,Charles Shyer,"Steve Martin, Diane Keaton, Martin Short, Kimb...",1995,0.0


In [5]:
# Agregar una nueva columna de ID
peliculas['ID'] = range(1, len(peliculas) + 1)

In [6]:
peliculas.shape

(45345, 22)

In [7]:
# Reemplazar las comas en todas las columnas
peliculas = peliculas.replace(',', '', regex=True)

In [8]:
# Seleccionando las caracteristicas importantes para la recomendacion
selected_features = ['genres','tagline','actores','Director']
print(selected_features)

['genres', 'tagline', 'actores', 'Director']


In [9]:
# Remplazando los valores faltantes con tipo de datos 'string' vacios
for feature in selected_features:
  peliculas[feature] = peliculas[feature].fillna('')

In [10]:
# Combinamos las 5 caracteristicas
#   combined_features = movies_data['genres']+' '+movies_data['keywords']+' '+movies_data['tagline']+' '+movies_data['cast']+' '+movies_data['director']
peliculas_combinadas = peliculas[selected_features].apply(lambda row: ' '.join(row.astype(str)), axis=1)
print(peliculas_combinadas)

0        Animation Comedy Family  Tom Hanks Tim Allen D...
1        Adventure Fantasy Family Roll the dice and unl...
2        Romance Comedy Still Yelling. Still Fighting. ...
3        Comedy Drama Romance Friends are the people wh...
4        Comedy Just When His World Is Back To Normal.....
                               ...                        
45340    Drama Action Romance  Patrick Bergin Uma Thurm...
45341    Drama  Angel Aquino Perry Dizon Hazel Orencio ...
45342    Action Drama Thriller A deadly game of wits. E...
45343      Iwan Mosschuchin Nathalie Lissenko Pavel Pav...
45344                                        Daisy Asquith
Length: 45345, dtype: object


In [11]:
# Convirtiendo el texto a vectores
vectorizer = TfidfVectorizer()

In [13]:
feature_vectors = vectorizer.fit_transform(peliculas_combinadas)
print(feature_vectors)

  (0, 62496)	0.2537027938996513
  (0, 52924)	0.2506761692894138
  (0, 84577)	0.18424543304859278
  (0, 37410)	0.16454794480966775
  (0, 96112)	0.13553395124779805
  (0, 33256)	0.23076457050782964
  (0, 63211)	0.10510113448406101
  (0, 72913)	0.22738109205615015
  (0, 62715)	0.18076730421534185
  (0, 27790)	0.26535033251628315
  (0, 116537)	0.14805566523902036
  (0, 33192)	0.1669369667138081
  (0, 75589)	0.15536738300205347
  (0, 87296)	0.22378560994984056
  (0, 4443)	0.17326932584513785
  (0, 90305)	0.2261319228664019
  (0, 53212)	0.23163885537906814
  (0, 99610)	0.16370091445798296
  (0, 117202)	0.15861457888451524
  (0, 114616)	0.24205525528301947
  (0, 52934)	0.12697785503854503
  (0, 91884)	0.23808669826003928
  (0, 29385)	0.12325844193456223
  (0, 2961)	0.1359280090705521
  (0, 110073)	0.129129066891467
  :	:
  (45342, 52660)	0.11372564761853413
  (45342, 109731)	0.06261094932891685
  (45342, 993)	0.06575229469965801
  (45342, 80801)	0.07807047957417428
  (45342, 30020)	0.04063378

In [13]:
print(feature_vectors.shape)

(45345, 123678)


Coseno de Similaridad

In [None]:
# Solo usamos los 25000 primeros datos para entrenar el modelo, ya que no se cuenta con memoria RAM suficiente

similarity = cosine_similarity(feature_vectors[:25000,:])
# similarity = cosine_similarity(feature_vectors)

In [None]:
print(similarity.shape)

(25000, 25000)


In [None]:
# obteniendo el nombre del usuario con la funcion input
movie_name = input('Ingrese el nombre de la pelicula : ')

In [None]:
# Convirtiendo todos los nombres de pelicula a una lista
lista_de_todas_las_peliculas = peliculas['title'].tolist()
print((lista_de_todas_las_peliculas))
print(len(lista_de_todas_las_peliculas))

45345


In [None]:
# Encontrando la coincidencia cercana para el nombre de la pelicula dada por el usuario
find_close_match = difflib.get_close_matches(movie_name,lista_de_todas_las_peliculas)
print(find_close_match)

['Iron Man', 'Iron Man', 'Iron Man']


In [None]:
# Seleccionando la coincidencia mas cercana
close_match = find_close_match[0]

In [None]:
# Buscando el índice de la película con el título
index_of_the_movie = peliculas[peliculas['title'] == close_match]['ID'].values[0]
print(index_of_the_movie)

12580


In [None]:
# Obteniendo una lista de peliculas similares
similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)

[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0), (10, 0.0), (11, 0.007750330499553545), (12, 0.0), (13, 0.0), (14, 0.0), (15, 0.0), (16, 0.0), (17, 0.030573210185926614), (18, 0.0), (19, 0.0), (20, 0.0), (21, 0.0), (22, 0.0), (23, 0.0), (24, 0.0), (25, 0.0), (26, 0.0), (27, 0.0), (28, 0.0), (29, 0.0), (30, 0.0), (31, 0.0), (32, 0.0), (33, 0.0), (34, 0.0), (35, 0.0), (36, 0.0), (37, 0.0), (38, 0.0), (39, 0.0), (40, 0.0), (41, 0.0), (42, 0.0), (43, 0.0), (44, 0.0), (45, 0.0), (46, 0.0), (47, 0.0), (48, 0.0), (49, 0.0), (50, 0.0), (51, 0.0), (52, 0.0), (53, 0.0), (54, 0.0), (55, 0.0), (56, 0.0), (57, 0.0), (58, 0.0), (59, 0.0), (60, 0.0), (61, 0.0), (62, 0.0), (63, 0.0), (64, 0.0), (65, 0.0), (66, 0.0), (67, 0.0), (68, 0.0), (69, 0.004148096968364987), (70, 0.0), (71, 0.0), (72, 0.0), (73, 0.0), (74, 0.0), (75, 0.0075216815153791115), (76, 0.0), (77, 0.0), (78, 0.0), (79, 0.0), (80, 0.0), (81, 0.0), (82, 0.0), (83, 0.0), (84, 0.0), (85, 

In [None]:
# Ordenando las peliculas basado en su puntaje
sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 
print(sorted_similar_movies)

[(12580, 1.0000000000000002), (6156, 0.2432123954501209), (11178, 0.22634875735407944), (10309, 0.2247246781462082), (12790, 0.21902483114299387), (9387, 0.18398198211411368), (5283, 0.17598003162260123), (12391, 0.1726614243521341), (7309, 0.1666573653911406), (14243, 0.1613861407686677), (9809, 0.1609641594120565), (4203, 0.15932735213770635), (6048, 0.13615656720561398), (3199, 0.13580738899360878), (15655, 0.13089696181320484), (24804, 0.12767058442713627), (9247, 0.1240260676807476), (11556, 0.12366574334734964), (16344, 0.12204008733974585), (20235, 0.1197144228046247), (17926, 0.11736545300496012), (11518, 0.11622861449550416), (11566, 0.1152659532498451), (9970, 0.11488210703361945), (9127, 0.1116358223731315), (802, 0.10996127971803836), (19832, 0.10930088223131033), (13087, 0.10558192129751538), (18243, 0.1054020980724039), (10402, 0.10245377487098786), (12791, 0.10194571270868288), (7599, 0.10164781735680166), (20444, 0.098497070257935), (9917, 0.09817249304366739), (15542, 

In [None]:
# Creamos un bucle para generar las 10 primeras peliculas parecidas a partir de ID
i = 1
for movie_name in sorted_similar_movies:
    index = movie_name[0]
    matching_rows = peliculas[peliculas.ID == index]
    if len(matching_rows) > 0:
        title_from_index = matching_rows['title'].values[0]
        print(i, ':', title_from_index)
        if i >= 10:
            break
        i += 1

1 : Iron Man
2 : Winged Migration
3 : G-Men
4 : The Key
5 : Breath
6 : Bukowski - Born into This
7 : The Dangerous Lives of Altar Boys
8 : Deep Water
9 : The Three Faces of Eve
10 : The Killing of John Lennon


# Código compacto

In [None]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
peliculas = pd.read_csv('PeliculasCreditos.csv')

In [None]:
peliculas['ID'] = range(1, len(peliculas) + 1)

peliculas = peliculas.replace(',', '', regex=True)

selected_features = ['genres','tagline','actores','Director']

for feature in selected_features:
  peliculas[feature] = peliculas[feature].fillna('')

peliculas_combinadas = peliculas[selected_features].apply(lambda row: ' '.join(row.astype(str)), axis=1)

vectorizer = TfidfVectorizer()

feature_vectors = vectorizer.fit_transform(peliculas_combinadas)

similarity = cosine_similarity(feature_vectors[:25000,:])
# similarity = cosine_similarity(feature_vectors)

In [1]:
movie_name = input('Ingrese el nombre de la pelicula : ')
lista_de_todas_las_peliculas = peliculas['title'].tolist()
find_close_match = difflib.get_close_matches(movie_name,lista_de_todas_las_peliculas)
close_match = find_close_match[0]
index_of_the_movie = peliculas[peliculas['title'] == close_match]['ID'].values[0]
similarity_score = list(enumerate(similarity[index_of_the_movie]))
sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 
i = 1
for movie_name in sorted_similar_movies:
    index = movie_name[0]
    matching_rows = peliculas[peliculas.ID == index]
    if len(matching_rows) > 0:
        title_from_index = matching_rows['title'].values[0]
        print(i, ':', title_from_index)
        if i >= 5:
            break
        i += 1

NameError: name 'peliculas' is not defined