# Sistema de Recomendación de Películas

Este notebook crea un sistema de recomendación que sugiere películas del Top 250 de IMDb que aún no has visto, a partir de:
- Tus ratings personales en Letterboxd
- Año de la película
- Rating de IMDb
- Géneros
- Actores involucrados
- Directores

## 1. Configuración Inicial
Primero, instalamos e importamos las librerías necesarias.

In [None]:
# Instalación de paquetes (ejecutar solo una vez)
!pip install imdbpy scikit-learn pandas numpy matplotlib seaborn tqdm

In [1]:
# Importaciones
import ast
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imdb import IMDb
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from tqdm import tqdm

# Configuration
# NOTE(review): this silences every warning category for the rest of the notebook,
# including pandas' SettingWithCopyWarning — confirm that is intended.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)  # show all columns when a DataFrame is displayed
sns.set_style("whitegrid")
ia = IMDb()  # shared IMDb client; obtener_info_imdb uses it for searches and lookups

## 2. Carga de Datos

Cargamos tus ratings de Letterboxd y el Top 250 de IMDB.

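**Nota**: Se asume que los archivos `letterboxd_ratings.csv` (exportado desde Letterboxd) e `imdb_top250.csv` están en el mismo directorio que este notebook.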

In [2]:
def cargar_datos(letterboxd_path='letterboxd_ratings.csv', imdb_path='imdb_top250.csv'):
    """Load the Letterboxd ratings and the IMDb Top 250 list from CSV files.

    Parameters
    ----------
    letterboxd_path : str or path-like, default 'letterboxd_ratings.csv'
        CSV with the user's ratings. Its ``Name``, ``Year`` and ``Rating``
        columns are renamed to ``title``, ``year`` and ``rating``.
    imdb_path : str or path-like, default 'imdb_top250.csv'
        CSV with the IMDb Top 250. Its ``name`` and ``rating`` columns are
        renamed to ``title`` and ``imdb_rating``; ``year`` keeps its name.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(letterboxd, imdb_top)`` with the renamed columns. Columns that are
        not listed above are passed through untouched.

    Raises
    ------
    FileNotFoundError
        If either CSV file does not exist (raised by ``pd.read_csv``).
    """
    # Letterboxd export: harmonise column names with the rest of the notebook.
    letterboxd = pd.read_csv(letterboxd_path).rename(columns={
        'Name': 'title',
        'Year': 'year',
        'Rating': 'rating',
    })

    # IMDb Top 250: 'rating' becomes 'imdb_rating' so it cannot be confused
    # with the user's own rating.
    imdb_top = pd.read_csv(imdb_path).rename(columns={
        'name': 'title',
        'rating': 'imdb_rating',
    })

    return letterboxd, imdb_top

In [3]:
# Load both datasets once; later cells read letterboxd_ratings and imdb_top250.
letterboxd_ratings, imdb_top250 = cargar_datos()

## 3. Enriquecimiento de Datos

Vamos a obtener información adicional de IMDB para las películas calificadas.

In [4]:
def _split_csv_list(value):
    """Turn a comma-separated string into a list of stripped, non-empty items."""
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        # Missing cell (NaN/None): treat as "no items" instead of crashing.
        return []
    return [item.strip() for item in value.split(',') if item.strip()]


def _runtime_to_minutes(runtime):
    """Convert a runtime such as '2h 22m', '2h' or '45m' to minutes (None if unparseable)."""
    if not isinstance(runtime, str):
        return None
    total = 0
    matched = False
    for token in runtime.split():
        amount, unit = token[:-1], token[-1:]
        if not amount.isdigit() or unit not in ('h', 'm'):
            return None
        total += int(amount) * (60 if unit == 'h' else 1)
        matched = True
    return total if matched else None


def serializar(df):
    """Normalise the raw IMDb Top 250 table for the rest of the notebook.

    Keeps the columns ``title``, ``year``, ``imdb_rating``, ``genre``,
    ``directors``, ``casts`` and ``run_time``; splits the three
    comma-separated text columns into lists; converts the runtime text to
    integer minutes; and renames ``genre``/``casts``/``run_time`` to
    ``genres``/``actors``/``runtime``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing at least the seven columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. ``runtime`` is None/NaN
        for values that are not of the form ``'<H>h'``, ``'<M>m'`` or
        ``'<H>h <M>m'``.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    columns = ['title', 'year', 'imdb_rating', 'genre', 'directors', 'casts', 'run_time']
    # Explicit copy: the column assignments below must not write into a slice of `df`.
    out = df[columns].copy()

    for column in ('genre', 'directors', 'casts'):
        out[column] = out[column].apply(_split_csv_list)

    # Runtime text -> minutes
    out['run_time'] = out['run_time'].apply(_runtime_to_minutes)

    return out.rename(columns={
        'genre': 'genres',
        'casts': 'actors',
        'run_time': 'runtime',
    })
  

In [5]:
# Module-level slot for the local lookup table: enriquecer_datos sets it before starting
# its worker threads and resets it to None afterwards; enriquecer_fila reads it.
df_top_global = None

In [6]:
def obtener_info_local(title, year, df_top):
    """Look a film up in the local table by exact title.

    When several rows share the title, the row whose ``year`` equals the
    requested year is preferred. If no row matches the year (or the year
    cannot be interpreted as a number), the first title match is used.

    Parameters
    ----------
    title : str
        Film title, compared for exact equality against ``df_top['title']``.
    year : int, float or str
        Release year used to disambiguate duplicate titles, and as the
        fallback value when the matched row has no ``year`` entry.
    df_top : pandas.DataFrame
        Lookup table with a ``title`` column and, optionally, ``year``,
        ``imdb_rating``, ``genres``, ``directors``, ``actors`` and ``runtime``.

    Returns
    -------
    dict or None
        A dict with the keys ``title``, ``year``, ``imdb_rating``, ``genres``,
        ``directors``, ``actors`` and ``runtime``; ``None`` when no row has
        the requested title.
    """
    candidates = df_top[df_top['title'] == title]
    if candidates.empty:
        return None

    movie = candidates.iloc[0]
    if len(candidates) > 1 and 'year' in candidates.columns:
        try:
            wanted_year = float(year)
        except (TypeError, ValueError):
            wanted_year = None
        if wanted_year is not None:
            # A NaN year compares unequal to everything, so it falls through
            # to the first title match, like any other non-matching year.
            years = pd.to_numeric(candidates['year'], errors='coerce')
            same_year = candidates[years == wanted_year]
            if not same_year.empty:
                movie = same_year.iloc[0]

    return {
        'title': movie['title'],
        'year': movie.get('year', year),
        'imdb_rating': movie.get('imdb_rating', np.nan),
        'genres': movie.get('genres', []),
        'directors': movie.get('directors', []),
        'actors': movie.get('actors', []),
        'runtime': movie.get('runtime', 0)
    }

In [7]:
def _year_as_int(value):
    """Return value as an int year, or None when it is missing or not numeric."""
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return None


def obtener_info_imdb(title, year):
    """Fetch a film's metadata through the shared IMDb client ``ia``.

    Parameters
    ----------
    title : str
        Film title to search for.
    year : int, float or str
        Release year. When usable it is appended to the search query and used
        to pick among the search hits; when missing (e.g. NaN) the search is
        done by title only.

    Returns
    -------
    dict or None
        A dict with the keys ``title``, ``year``, ``imdb_rating``, ``genres``,
        ``directors``, ``actors`` and ``runtime`` (int minutes, 0 if unknown).
        ``None`` when the search has no results or any error occurs.
    """
    try:
        wanted_year = _year_as_int(year)
        # Avoid queries such as "Heat nan" or "Heat 1995.0".
        query = f"{title} {wanted_year}" if wanted_year is not None else str(title)

        resultados = ia.search_movie(query)
        if not resultados:
            return None

        # Prefer the first hit released in the requested year; otherwise fall
        # back to the top hit.
        elegido = resultados[0]
        if wanted_year is not None:
            for candidato in resultados:
                if _year_as_int(candidato.get('year')) == wanted_year:
                    elegido = candidato
                    break

        pelicula = ia.get_movie(elegido.movieID)

        # The runtime entry is indexed as a sequence; its first element is coerced
        # to int so it matches the integer minutes produced by the local lookup.
        # NOTE(review): presumably the elements are numeric strings — confirm.
        runtimes = pelicula.get('runtime') or []
        try:
            runtime = int(runtimes[0]) if runtimes else 0
        except (TypeError, ValueError):
            runtime = 0

        return {
            'title': pelicula.get('title', title),
            'year': pelicula.get('year', year),
            'imdb_rating': pelicula.get('rating', np.nan),
            'genres': pelicula.get('genres', []),
            'directors': [d['name'] for d in pelicula.get('directors', [])],
            'actors': [a['name'] for a in pelicula.get('actors', [])],
            'runtime': runtime
        }
    except Exception:
        # Deliberate best-effort: a failed lookup for one film must not abort
        # the whole enrichment run; the caller skips films that return None.
        return None

In [8]:
def enriquecer_fila(fila):
    """Enrich one ratings row with film metadata: local table first, then the IMDb API.

    Parameters
    ----------
    fila : mapping or pandas.Series
        Row with the keys ``title``, ``year`` and ``rating``.

    Returns
    -------
    dict or None
        The metadata dict returned by ``obtener_info_local`` or
        ``obtener_info_imdb`` with an extra ``user_rating`` key holding
        ``fila['rating']``; ``None`` when neither source finds the film.
    """
    title = str(fila['title'])
    year = fila['year']

    # Local lookup is only possible while a table is set in df_top_global;
    # without one, go straight to the API instead of raising a TypeError.
    info = None
    if df_top_global is not None:
        info = obtener_info_local(title, year, df_top_global)

    # Not found locally: ask the API
    if not info:
        info = obtener_info_imdb(title, year)

    if info:
        info['user_rating'] = fila['rating']
    return info


In [9]:
def enriquecer_datos(df, df_top, max_workers=20):
    """Enrich every row of ``df`` with film metadata, using a thread pool.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table; each row is passed to ``enriquecer_fila``.
    df_top : pandas.DataFrame
        Local lookup table, exposed to the worker threads through the
        module-level ``df_top_global`` for the duration of the call.
    max_workers : int, default 20
        Maximum number of worker threads.

    Returns
    -------
    pandas.DataFrame
        One row per film that could be enriched, in the same order as ``df``.
        Rows for which ``enriquecer_fila`` returned nothing are skipped.
    """
    global df_top_global
    df_top_global = df_top  # make the lookup table visible inside the threads

    filas = [fila for _, fila in df.iterrows()]
    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # executor.map yields results in submission order, so the output
            # row order (and therefore the seeded model trained on it) is the
            # same on every run, unlike iterating over as_completed().
            resultados = list(tqdm(executor.map(enriquecer_fila, filas), total=len(filas)))
    finally:
        # Always clear the global, even if a worker raised.
        df_top_global = None

    return pd.DataFrame([resultado for resultado in resultados if resultado])

In [10]:
# Enrich the data: first normalise the Top 250 table with serializar
imdb_enriched = serializar(imdb_top250)

print("\nEnriqueciendo tus ratings de Letterboxd...")
# Attach film metadata to every rated film, passing imdb_enriched as the local lookup table
letterboxd_enriched = enriquecer_datos(letterboxd_ratings, imdb_enriched)

# Show how many films were enriched and preview both tables
print(f"\nPelículas enriquecidas: {len(letterboxd_enriched)}/{len(letterboxd_ratings)}")
display(letterboxd_enriched.head(2))
display(imdb_enriched.head(2))


Enriqueciendo tus ratings de Letterboxd...


100%|██████████| 184/184 [00:49<00:00,  3.72it/s]


Películas enriquecidas: 184/184





Unnamed: 0,title,year,imdb_rating,genres,directors,actors,runtime,user_rating
0,The Incredibles,2004,8.0,"[Animation, Action, Adventure]",[Brad Bird],"[Craig T. Nelson, Samuel L. Jackson, Holly Hun...",115,3.0
1,The Truman Show,1998,8.2,"[Comedy, Drama]",[Peter Weir],"[Jim Carrey, Ed Harris, Laura Linney, Noah Emm...",103,4.0


Unnamed: 0,title,year,imdb_rating,genres,directors,actors,runtime
0,The Shawshank Redemption,1994,9.3,[Drama],[Frank Darabont],"[Tim Robbins, Morgan Freeman, Bob Gunton, Will...",142
1,The Godfather,1972,9.2,"[Crime, Drama]",[Francis Ford Coppola],"[Marlon Brando, Al Pacino, James Caan, Diane K...",175


In [11]:
# Split the Top 250 by whether the title already appears in the Letterboxd history.
# The mask is computed once, and .copy() makes each subset an independent frame so
# that columns added to it later are not written into a slice of imdb_enriched.
already_rated = imdb_enriched['title'].isin(letterboxd_enriched['title'])

watched_movies = imdb_enriched[already_rated].copy()

unwatched_movies = imdb_enriched[~already_rated].copy()

## 4. Preprocesamiento de Datos

In [12]:
mlb_genres = MultiLabelBinarizer()
mlb_directors = MultiLabelBinarizer()
mlb_actors = MultiLabelBinarizer()


def _as_list(value):
    """Return ``value`` unchanged if it is a list; otherwise parse its Python-literal text.

    ast.literal_eval only accepts literals, so text read from a file cannot
    execute arbitrary code the way eval() would.
    """
    return value if isinstance(value, list) else ast.literal_eval(value)


# Make sure the list-valued columns really hold lists in every frame the model reads.
# unwatched_movies is included because it was derived from imdb_enriched before this
# cell runs, so normalising imdb_enriched alone would not reach it.
for frame in (letterboxd_enriched, imdb_enriched, unwatched_movies):
    for column in ('genres', 'directors', 'actors'):
        frame[column] = frame[column].apply(_as_list)


In [13]:
# Build the training features and target from the films the user has rated:
# binary indicator columns for genres, directors and actors (the binarizers are
# fitted here), followed by the numeric year and IMDb rating.
train_feature_blocks = [
    mlb_genres.fit_transform(letterboxd_enriched['genres']),
    mlb_directors.fit_transform(letterboxd_enriched['directors']),
    mlb_actors.fit_transform(letterboxd_enriched['actors']),
    letterboxd_enriched[['year', 'imdb_rating']].to_numpy(),
]
X_train = np.hstack(train_feature_blocks)

# Target: the user's own rating
y_train = letterboxd_enriched['user_rating'].to_numpy()

## 5. Modelado y Recomendación

In [14]:
# Train the model: a random forest regressor with 100 trees and a fixed random_state
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [15]:
# Feature matrix for the unwatched Top 250 films, encoded with the binarizers
# already fitted on the training data and laid out in the same column order.
test_feature_blocks = [
    mlb_genres.transform(unwatched_movies['genres']),
    mlb_directors.transform(unwatched_movies['directors']),
    mlb_actors.transform(unwatched_movies['actors']),
    unwatched_movies[['year', 'imdb_rating']].to_numpy(),
]
X_test = np.hstack(test_feature_blocks)

In [16]:
# Score every unwatched Top 250 film and keep the 100 highest predictions.
# assign() returns a new frame with the extra column instead of writing into a
# subset of imdb_enriched, and re-running the cell gives the same result.
y_pred = model.predict(X_test)
unwatched_movies = unwatched_movies.assign(predicted_rating=y_pred)
recommendations = unwatched_movies.sort_values(by='predicted_rating', ascending=False).head(100)

# Quick look at the top recommendations
display(recommendations.head(5))

Unnamed: 0,title,year,imdb_rating,genres,directors,actors,runtime,predicted_rating
6,The Lord of the Rings: The Return of the King,2003,9.0,"[Action, Adventure, Drama]",[Peter Jackson],"[Elijah Wood, Viggo Mortensen, Ian McKellen, O...",201,4.6
4,12 Angry Men,1957,9.0,"[Crime, Drama]",[Sidney Lumet],"[Henry Fonda, Lee J. Cobb, Martin Balsam, John...",96,4.53
113,A Separation,2011,8.3,[Drama],[Asghar Farhadi],"[Payman Maadi, Leila Hatami, Sareh Bayat, Shah...",123,4.225
108,Incendies,2010,8.3,"[Drama, Mystery]",[Denis Villeneuve],"[Lubna Azabal, Mélissa Désormeaux-Poulin, Maxi...",131,4.22
93,The Hunt,2012,8.3,[Drama],[Thomas Vinterberg],"[Mads Mikkelsen, Thomas Bo Larsen, Annika Wedd...",115,4.21


## 6. Exportación de datos

In [17]:
# Keep only the first three actors per film so the .xlsx stays readable;
# recommendations itself is left untouched.
top_billed = recommendations['actors'].apply(lambda cast: cast[:3])
export = recommendations.assign(actors=top_billed)

export.to_excel('letterboxd_recomendaciones.xlsx', index=False)