In [1]:
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

# Carga el dataset
credits=pd.read_csv('tmdb_5000_credits.csv')
movies=pd.read_csv('tmdb_5000_movies.csv')

credits.columns = ['id','titulo','cast','crew']
movies= movies.merge(credits,on='id')

movies.head(5)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,titulo,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [2]:
from ast import literal_eval

In [3]:
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    movies[feature] = movies[feature].apply(literal_eval)  

In [4]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i['name'].replace(" ", "")) for i in x]  # Asume que los datos relevantes están en una clave 'name'
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name'].replace(" ", "").lower()  # Extraer el nombre del director está bajo la llave 'name'
    return ''

# Obteniendo el nombre del director y almacenándolo en una nueva columna
movies['director'] = movies['crew'].apply(get_director)

In [5]:
# Obteniendo el nombre del director y almacenándolo en una nueva columna
movies['director'] = movies['crew'].apply(get_director)

features_clean = ['cast', 'keywords', 'director', 'genres', 'overview']

# Limpia todas las características relevantes
for feature in features_clean:
    movies[feature] = movies[feature].apply(clean_data)


In [6]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres']) + ' ' + x['overview']

# Crear el 'soup' para cada película
movies['soup'] = movies.apply(create_soup, axis=1)

In [7]:
# Utilizar TfidfVectorizer para crear la matriz de características
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['soup'])

# Calcular la matriz de similitud de coseno
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [10]:
# Resetear índices para una correspondencia más fácil y creación del mapeo inverso
movies = movies.reset_index()
indices = pd.Series(movies.index, index=movies['title'])

In [11]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Obtiene los scores de las 10 películas más similares
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]

In [12]:
get_recommendations('The Dark Knight Rises', cosine_sim)

96                    Inception
65              The Dark Knight
119               Batman Begins
2060         Out of the Furnace
95                 Interstellar
298     The Wolf of Wall Street
428              Batman Returns
210              Batman & Robin
1196               The Prestige
1424                 Concussion
Name: title, dtype: object

<h1>Filtro colaborativo</h1>

In [15]:
ratings = pd.read_csv('ratings_small.csv')

ratings.head(3)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182


In [19]:

# Utilizando Surprise para el filtrado colaborativo
from surprise import KNNBasic, Dataset, Reader
from surprise.model_selection import cross_validate

def knn_recommendations(user_id, movie_indices):
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    
    knn = KNNBasic(k=20, sim_options={'name': 'cosine', 'user_based': True})
    trainset = data.build_full_trainset()
    knn.fit(trainset)
    
    predictions = []
    for idx in movie_indices:
        movie_id = movies.iloc[idx]['id']
        pred = knn.predict(uid=user_id, iid=str(movie_id))
        predictions.append((idx, pred.est))
    
    predictions = sorted(predictions, key=lambda x: x[1], reverse=True)
    return [x[0] for x in predictions]

In [20]:
def hybrid_recommendations(title, user_id):
    content_recommendations = get_recommendations(title)
    knn_recommendations_indices = knn_recommendations(user_id, content_recommendations)
    
    recommended_movies = movies['title'].iloc[knn_recommendations_indices]
    return recommended_movies

In [21]:
# Obtener recomendaciones híbridas
print(hybrid_recommendations('The Exorcist', 1))

Computing the cosine similarity matrix...
Done computing similarity matrix.


TypeError: Cannot index by location index with a non-integer key