# Proyecto PLN  - Chatbot_IMDB

In [1]:
# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)


## Diccionario de traducción de géneros (IDs de género de la API de TMDB)

In [2]:
# Genre id -> display name. The ids match the `genre_ids` values returned by
# the TMDB API that this notebook downloads from (despite the IMDB name).
GENRES_IMDB = {
    28: "Action",
    12: "Adventure",
    16: "Animation",
    35: "Comedy",
    80: "Crime",
    99: "Documentary",
    18: "Drama",
    10751: "Family",
    14: "Fantasy",
    36: "History",
    27: "Horror",
    10402: "Music",
    9648: "Mystery",
    10749: "Romance",
    878: "Sci-Fi",
    10770: "TV Movie",
    53: "Thriller",
    10752: "War",
    37: "Western"
}

# Reverse lookup (display name -> genre id). Derived from GENRES_IMDB so the
# two mappings cannot drift apart when a genre is added or renamed.
GENRES_IMDB_INVERTED = {
    nombre: genero_id for genero_id, nombre in GENRES_IMDB.items()
}



# Scraping de información con la API de TMDB

In [3]:
import requests
import json
import time
import pandas as pd
import os

# API configuration
# The key is read from the environment so that it is never committed with the
# notebook. NOTE(review): the key that used to be hard-coded here has been
# exposed and should be revoked in the TMDB account settings.
API_KEY = os.environ.get("TMDB_API_KEY", "")
if not API_KEY:
    print("⚠ Define la variable de entorno TMDB_API_KEY con tu API Key de TMDB")
BASE_URL = "https://api.themoviedb.org/3/movie/popular"
NUM_PELICULAS = 10000  # Total number of movies to download
PELICULAS_POR_PAGINA = 20  # TMDB returns 20 movies per page
# Ceiling division: exactly the number of pages needed (500 for 10000 movies)
# instead of always requesting one page more than necessary.
paginas_a_descargar = (NUM_PELICULAS + PELICULAS_POR_PAGINA - 1) // PELICULAS_POR_PAGINA

PATH_SAVE = "./peliculasPopulares10k.csv"

# Ensure the output directory exists. Fall back to "." because
# os.path.dirname() returns "" for a bare file name and os.makedirs("") raises.
os.makedirs(os.path.dirname(PATH_SAVE) or ".", exist_ok=True)

def obtener_peliculas(n=NUM_PELICULAS):
    """Download up to ``n`` movies from the TMDB "popular" endpoint.

    Pages are requested one after another starting at page 1. Downloading
    stops as soon as ``n`` movies have been collected, a request fails, the
    API answers with a non-200 status, or a page comes back without results.

    :param n: Maximum number of movies to return.
    :return: List with at most ``n`` movie dicts, in the order the API
        returned them. It is shorter than ``n`` when downloading stopped early.
    """
    if n <= 0:
        return []

    # Derive the page count from ``n`` itself (ceiling division) so the
    # argument is honoured instead of the module-level page count.
    paginas_necesarias = (n + PELICULAS_POR_PAGINA - 1) // PELICULAS_POR_PAGINA

    peliculas = []
    # A session reuses the HTTP connection across the many page requests.
    with requests.Session() as sesion:
        for pagina in range(1, paginas_necesarias + 1):
            # NOTE(review): "en-EN" is kept from the original code; "EN" does
            # not look like a country code, confirm whether "en-US" was meant.
            parametros = {"api_key": API_KEY, "language": "en-EN", "page": pagina}
            try:
                # The timeout keeps a stalled connection from hanging the cell.
                respuesta = sesion.get(BASE_URL, params=parametros, timeout=10)
            except requests.RequestException as error:
                print(f"⚠ Error en la petición: {error}")
                break

            if respuesta.status_code != 200:
                print(f"⚠ Error en la petición: {respuesta.status_code}")
                break

            resultados = respuesta.json().get("results", [])
            if not resultados:
                # No more data available; asking for further pages is pointless.
                break
            peliculas.extend(resultados)

            # Stop once the requested amount has been reached
            if len(peliculas) >= n:
                break

            # Short pause between requests to stay within the API rate limits
            time.sleep(0.1)

    return peliculas[:n]

# Download the most popular movies
peliculas = obtener_peliculas(n=NUM_PELICULAS)

# Build a DataFrame with one row per movie and persist it as CSV
df_peliculas = pd.DataFrame(data=peliculas)
df_peliculas.to_csv(
    PATH_SAVE,
    encoding="utf-8",
    index=False,
)

print(f"✅ Se han guardado {len(peliculas)} películas en {PATH_SAVE}")

✅ Se han guardado 10000 películas en ./peliculasPopulares10k.csv


# Limpieza del dataset

In [4]:

PATH_LOAD = "./peliculasPopulares10k.csv"
PATH_SAVE_PROCESADO = "./peliculasPopulares10k_Procesado.csv"

# Columns kept for the rest of the notebook
columnas = [ "title", "release_date", "popularity","original_language", "overview", "genre_ids", "adult"]

# Single method chain instead of a chunked read plus in-place edits: the
# chunks were concatenated immediately (so chunking saved nothing), and
# in-place operations on a column selection can trigger SettingWithCopyWarning.
# Each step returns a new frame, which also makes the cell safe to re-run.
df_peliculas = (
    pd.read_csv(PATH_LOAD, encoding="utf-8", sep=",")
    .loc[:, columnas]
    .drop_duplicates()          # rows identical in every kept column
    .dropna()                   # rows with a missing value in any kept column
    .reset_index(drop=True)     # contiguous index after removing rows
)

# Save the cleaned DataFrame to a CSV file
df_peliculas.to_csv(PATH_SAVE_PROCESADO, index=False, encoding="utf-8")
print(f"✅ Archivo CSV guardado en {PATH_SAVE_PROCESADO}")




✅ Archivo CSV guardado en ./peliculasPopulares10k_Procesado.csv


## Funciones de filtro


In [5]:
def getDataFrameGenero(genero):
  """
  Filters the DataFrame to include only the movies tagged with a genre.

  Args:
    genero: Genre id (int) or genre name (str). A name is translated to its
      id through GENRES_IMDB_INVERTED.

  Returns:
    A filtered DataFrame with the rows of df_peliculas whose 'genre_ids'
    contain the genre id.

  Raises:
    KeyError: If genero is a name that is not in GENRES_IMDB_INVERTED.
  """
  if isinstance(genero, str):
    genero = GENRES_IMDB_INVERTED[genero]

  def _tiene_genero(genre_ids):
    # df_peliculas is loaded from CSV, where 'genre_ids' is stored as text
    # such as "[28, 12]". Testing `int in str` raises TypeError, so the text
    # is decoded into a real list before the membership check.
    if isinstance(genre_ids, str):
      genre_ids = json.loads(genre_ids)
    return genero in genre_ids

  # astype(bool) keeps the mask boolean even when the DataFrame is empty.
  mascara = df_peliculas['genre_ids'].apply(_tiene_genero).astype(bool)
  return df_peliculas[mascara]

def getDataFrameLanguage(language):
  """
  Filters the DataFrame to include only movies in a given original language.

  Args:
    language: Value compared for equality against the 'original_language'
      column (the sample output shows codes such as "en" or "es").

  Returns:
    A filtered DataFrame with the rows of df_peliculas whose
    'original_language' equals language.
  """
  same_language = df_peliculas['original_language'] == language
  return df_peliculas[same_language]

def getDataFrameYearRange(start_year, end_year):
  """
  Selects the movies whose release year lies inside a closed year interval.

  The year is taken as the first four characters of 'release_date' and is
  compared as text against str(start_year) and str(end_year).

  Args:
    start_year: First year of the interval (inclusive).
    end_year: Last year of the interval (inclusive).

  Returns:
    A filtered DataFrame with the rows of df_peliculas released in the interval.
  """
  first_year, last_year = str(start_year), str(end_year)

  # Leading four characters of the date string, i.e. the year part
  release_years = df_peliculas['release_date'].str.slice(0, 4)

  # between() includes both bounds
  in_range = release_years.between(first_year, last_year)
  return df_peliculas[in_range]

# Example: keep the movies released from 2020 through 2023 (both included)
df_2020_to_2023_movies = getDataFrameYearRange(start_year=2020, end_year=2023)

# Preview the first rows of the filtered DataFrame
df_2020_to_2023_movies.head()


Unnamed: 0,title,release_date,popularity,original_language,overview,genre_ids,adult
42,My Fault,2023-06-08,442.55,es,"Noah must leave her city, boyfriend, and frien...","[10749, 18]",False
55,Sonic the Hedgehog 2,2022-03-30,305.582,en,"After settling in Green Hills, Sonic is eager ...","[28, 12, 10751, 35]",False
57,Sex Game 6969,2022-01-27,338.296,ko,Three married women had always been dissatisfi...,"[35, 18, 10749]",False
78,Fast X,2023-05-17,250.448,en,Over many missions and against impossible odds...,"[28, 80, 53]",False
83,365 Days: This Day,2022-04-26,231.718,pl,Laura and Massimo are back and hotter than eve...,"[10749, 18]",False


## Construcción de Dense Retriever Class

In [6]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import os

class DenseRetriever:
    """Dense retriever over a DataFrame of movies.

    Every row is embedded from its title and overview with a
    SentenceTransformer model; queries are embedded with the same model and
    ranked against the rows by cosine similarity.
    """

    def __init__(self, df, model_name="sentence-transformers/all-MiniLM-L6-v2",
                 embeddings_path="./movie_embeddings.npy"):
        """
        Load the embedding model and prepare the embeddings of every row.

        Side effect: a "text" column is added to ``df`` itself, because the
        DataFrame is stored by reference rather than copied.

        :param df: DataFrame with the columns ["title", "overview"].
        :param model_name: Name of the Hugging Face model.
        :param embeddings_path: .npy file used as embedding cache; pass None
            to keep the embeddings in memory only.
        """
        self.df = df
        self.model = SentenceTransformer(model_name)
        self.embeddings = None

        # Concatenate "title - overview" as the text to embed. Missing values
        # become "" so the model never receives NaN (a float) instead of text.
        self.df["text"] = (
            self.df["title"].fillna("").astype(str)
            + " - "
            + self.df["overview"].fillna("").astype(str)
        )

        # Load the embeddings from the cache or compute them
        self._generate_embeddings(embeddings_path)

    def _generate_embeddings(self, pathEmbeddings=None):
        """Fill ``self.embeddings``, reusing the cache file when it is valid.

        The cache is reused only if it holds one row per DataFrame row. This
        catches a DataFrame that changed size, but not one whose rows changed
        while the count stayed the same; delete the file in that case.

        :param pathEmbeddings: .npy cache file to load from and save to. When
            None, the embeddings are computed and nothing is written to disk.
        """
        if pathEmbeddings and os.path.exists(pathEmbeddings):
            cached = np.load(pathEmbeddings)
            if cached.ndim == 2 and cached.shape[0] == len(self.df):
                self.embeddings = cached
                print(f"Embeddings cargados desde el archivo: {pathEmbeddings}")
                return
            # A stale cache would pair rows with the wrong vectors in search().
            print("⚠ Los embeddings en caché no coinciden con el DataFrame; se regeneran.")

        print("🔹 Generando embeddings...")
        self.embeddings = self.model.encode(self.df["text"].tolist(), convert_to_numpy=True)
        if pathEmbeddings:
            # Save to the same path that is checked on load, so the cache is
            # actually found on the next run.
            np.save(pathEmbeddings, self.embeddings)
        print("✅ Embeddings generados.")

    def save_embeddings(self, path):
        """Save the current embeddings to a .npy file.

        :param path: Destination file.
        """
        np.save(path, self.embeddings)
        print(f"Embeddings guardados en: {path}")

    def search(self, query, top_k=5):
        """
        Run a cosine-similarity search.

        :param query: Search text.
        :param top_k: Maximum number of results to return.
        :return: Copy of the best matching rows plus a "similarity" column,
            ordered from most to least similar. Empty when ``top_k`` is not
            positive or the DataFrame has no rows.
        """
        print(f"🔍 Buscando: {query}")

        if top_k <= 0 or len(self.df) == 0:
            sin_resultados = self.df.iloc[0:0].copy()
            sin_resultados["similarity"] = np.array([], dtype=float)
            return sin_resultados

        # Embed the query with the same model used for the movies
        query_embedding = self.model.encode([query], convert_to_numpy=True)

        # Cosine similarity between the query and every movie embedding
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]

        # Positions of the top_k highest similarities, already in descending
        # order; the stable sort keeps tied rows in DataFrame order.
        best_indices = np.argsort(-similarities, kind="stable")[:top_k]

        # iloc is positional, matching the row order the embeddings were built in
        results = self.df.iloc[best_indices].copy()
        results["similarity"] = similarities[best_indices]

        return results



## Preparación de embeddings

### Carga del dataset procesado

In [7]:
# Location of the cleaned dataset written by the cleaning step
PATH_SAVE_PROCESADO = "./peliculasPopulares10k_Procesado.csv"

# Load the cleaned dataset that will be indexed by the retriever
df = pd.read_csv(filepath_or_buffer=PATH_SAVE_PROCESADO)

### Carga de la clase Dense Retriever y generación de embeddings para el dataframe 

In [8]:
# Build the retriever over the processed dataset
denseR = DenseRetriever(df=df)

🔹 Generando embeddings...
✅ Embeddings generados.


## Prueba de búsqueda densa basada en query

In [9]:
# Free-text description of the movie we are looking for
query = "a kid who learns kung fu with a old man on China"

# Retrieve the 5 most similar movies and display them
results = denseR.search(query, top_k=5)
results

🔍 Buscando: a kid who learns kung fu with a old man on China


Unnamed: 0,title,release_date,popularity,original_language,overview,genre_ids,adult,text,similarity
2094,Karate Kid: Legends,2025-05-28,40.495,en,"After a family tragedy, kung fu prodigy Li Fon...","[28, 18, 10751]",False,"Karate Kid: Legends - After a family tragedy, ...",0.691103
1327,Karate Kid: Legends,2025-05-28,40.495,en,"After a family tragedy, kung fu prodigy Li Fon...","[28, 18, 10751]",False,"Karate Kid: Legends - After a family tragedy, ...",0.691103
6475,Man of Tai Chi,2013-07-04,15.281,en,"In Beijing, a young martial artist's skill pla...","[28, 18]",False,"Man of Tai Chi - In Beijing, a young martial a...",0.597481
4962,Kung Fu Jungle,2014-10-31,18.453,zh,A martial arts instructor working at a police ...,"[28, 53, 80, 12]",False,Kung Fu Jungle - A martial arts instructor wor...,0.589305
90,Kung Fu Panda 4,2024-03-02,215.393,en,Po is gearing up to become the spiritual leade...,"[16, 10751, 14, 28]",False,Kung Fu Panda 4 - Po is gearing up to become t...,0.573977


In [10]:
# Free-text description of the movie we are looking for
query = "A weapons businessman is kidnapped and becomes an armored superhero"

# Retrieve the 10 most similar movies and display them
results = denseR.search(query, top_k=10)
results

🔍 Buscando: A weapons businessman is kidnapped and becomes an armored superhero


Unnamed: 0,title,release_date,popularity,original_language,overview,genre_ids,adult,text,similarity
3769,Super,2010-11-26,20.506,en,After his wife falls under the influence of a ...,"[35, 28, 18]",False,Super - After his wife falls under the influen...,0.587949
299,Iron Man,2008-04-30,100.273,en,"After being held captive in an Afghan cave, bi...","[28, 878, 12]",False,Iron Man - After being held captive in an Afgh...,0.563994
6262,The Great Arms Robbery,2022-04-09,14.7,zh,Agent Wen goes undercover to locate weapons fo...,"[28, 80, 18]",False,The Great Arms Robbery - Agent Wen goes underc...,0.54686
2097,Commando,1985-10-03,28.813,en,"John Matrix, the former leader of a special co...","[28, 12, 53]",False,"Commando - John Matrix, the former leader of a...",0.492411
72,Armor,2024-10-30,252.688,en,Armored truck security guard James Brody is wo...,"[28, 80, 53, 18]",False,Armor - Armored truck security guard James Bro...,0.480697
397,Iron Man 2,2010-04-28,83.981,en,With the world now aware of his dual life as t...,"[12, 28, 878]",False,Iron Man 2 - With the world now aware of his d...,0.472264
3007,Ransom,1996-11-08,24.251,en,"When a rich man's son is kidnapped, he coopera...","[28, 53]",False,"Ransom - When a rich man's son is kidnapped, h...",0.469037
5350,6 Bullets,2012-09-11,16.15,en,An ex-mercenary known for finding missing chil...,"[53, 28, 80]",False,6 Bullets - An ex-mercenary known for finding ...,0.468657
4640,Iron Man & Captain America: Heroes United,2014-07-29,17.686,en,Iron Man and Captain America battle to keep th...,"[12, 16, 28]",False,Iron Man & Captain America: Heroes United - Ir...,0.467726
2179,Watchmen: Chapter I,2024-08-12,29.722,en,"In 1985, the murder of a government-sponsored ...","[16, 18, 878]",False,"Watchmen: Chapter I - In 1985, the murder of a...",0.46721
