# Explore here

In [1]:
import pandas as pd
import sqlite3
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the raw TMDB movie and credit tables.
movies_path = "../data/raw/tmdb_5000_movies.csv"
credits_path = "../data/raw/tmdb_5000_credits.csv"

# The movies table names its key "id"; rename it to "movie_id" so it can be
# joined with the credits table on that column.
df_m = pd.read_csv(movies_path).rename(columns={'id': 'movie_id'})
df_c = pd.read_csv(credits_path)

# Join both tables on "movie_id" (default inner join) and keep only the
# columns used by the rest of the script.
df_full = df_m.merge(df_c, on="movie_id")[
    ['movie_id', 'original_title', 'overview', 'genres', 'keywords', 'cast', 'crew']
]

# Transformación de datos
def extract_names(json_col, limit=None, filter_job=None):
    """Return the ``name`` of each entry in a JSON-encoded list, spaces removed.

    Args:
        json_col: JSON text expected to decode to a list of objects, each
            carrying a ``"name"`` key (and optionally a ``"job"`` key).
        limit: Maximum number of leading entries to return. ``None`` means
            no limit. Ignored when ``filter_job`` is given.
        filter_job: When truthy, return only the entries whose ``"job"``
            equals this value.

    Returns:
        A list of names with every space removed (e.g. ``"Tom Hanks"`` becomes
        ``"TomHanks"``). An empty list is returned when the input is not
        valid JSON, is not a string/bytes value (e.g. NaN), or does not
        decode to a list. Entries that are not objects with a string
        ``"name"`` are skipped instead of discarding the whole result.
    """
    try:
        data = json.loads(json_col)
    except (TypeError, ValueError):
        # TypeError: input is not str/bytes (e.g. a NaN cell).
        # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
        return []

    if not isinstance(data, list):
        return []

    # Keep only well-formed entries so one bad record does not wipe the rest.
    entries = [
        entry for entry in data
        if isinstance(entry, dict) and isinstance(entry.get('name'), str)
    ]

    if filter_job:
        return [
            entry['name'].replace(" ", "")
            for entry in entries
            if entry.get('job') == filter_job
        ]
    return [entry['name'].replace(" ", "") for entry in entries[:limit]]

# Decode each JSON-encoded column into a list of space-free name tokens.
df_full['genres'] = df_full['genres'].apply(extract_names)
df_full['keywords'] = df_full['keywords'].apply(extract_names)
# Cast: keep only the first three listed entries.
df_full['cast'] = df_full['cast'].apply(extract_names, limit=3)
# Crew: keep only the entries whose job is "Director".
df_full['crew'] = df_full['crew'].apply(extract_names, filter_job="Director")
# Overview: missing text becomes "", then every overview is split on whitespace.
df_full['overview'] = df_full['overview'].fillna('').apply(str.split)

# Build the "tags" column: concatenate the five token lists of each row and
# join them into one space-separated string.
df_full['tags'] = (
    df_full['overview']
    + df_full['genres']
    + df_full['keywords']
    + df_full['cast']
    + df_full['crew']
).apply(' '.join)

# Build the similarity model: TF-IDF vectors over the "tags" text, compared
# pairwise with cosine similarity (no KNN estimator is fitted; neighbours are
# found later by sorting rows of this matrix).
tfidf = TfidfVectorizer(max_features=5000)  # keep at most 5000 vocabulary terms
tfidf_matrix = tfidf.fit_transform(df_full['tags'])  # one row per row of df_full
# similarity[i][j] is the cosine similarity between the tag vectors of rows i and j.
similarity = cosine_similarity(tfidf_matrix)

def recommend(movie_title, top_n=5):
    """Print the titles of the movies most similar to ``movie_title``.

    The title is looked up by exact match in the ``original_title`` column
    of the module-level ``df_full``. That movie's row of the module-level
    ``similarity`` matrix is then ranked and the ``top_n`` highest-scoring
    other movies are printed. If the title is absent, a notice is printed
    instead.

    Args:
        movie_title: Exact ``original_title`` of the reference movie. When
            several rows share the title, the first one is used.
        top_n: Number of recommendations to print (default 5). Values
            below zero are treated as zero.

    Returns:
        None. All output goes to stdout.
    """
    title_matches = (df_full['original_title'] == movie_title).to_numpy()
    if not title_matches.any():
        print(f"La película '{movie_title}' no se encuentra en la base de datos.")
        return

    # Use the row *position*, not the index label: `similarity` and `.iloc`
    # are both positional, so a label would pick the wrong row whenever
    # df_full's index is not 0..n-1.
    movie_pos = int(title_matches.argmax())
    scores = similarity[movie_pos]

    # Exclude the query movie explicitly instead of assuming it sorts first;
    # that assumption breaks for rows with empty tags (self-similarity 0) or
    # when another row has identical tags.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(scores) if pos != movie_pos),
        key=lambda pair: pair[1],
        reverse=True,
    )[:max(top_n, 0)]

    print(f"Basado en '{movie_title}', te recomendamos:")
    for pos, _ in ranked:
        print(f"- {df_full['original_title'].iloc[pos]}")

# Example: print the recommendations for "Man on Fire".
recommend("Man on Fire")


Basado en 'Man on Fire', te recomendamos:
- The November Man
- Double Take
- Wild Card
- We Have Your Husband
- The Bodyguard
