## Preparación de embeddings

### Carga del dataset procesado

In [9]:
import pandas as pd
PATH_SAVE_PROCESADO = "./peliculasPopulares10k_Procesado.csv"
df = pd.read_csv(PATH_SAVE_PROCESADO)

### Carga de la clase Dense Retriever y generación de embeddings para el dataframe 

In [None]:
import DenseRetriever
denseR=DenseRetriever.DenseRetriever(df)


## Prueba de busqueda densa basada en query

In [None]:
query = "a kid who learns kung fu from a old sensei"
results = denseR.search(query, 5)
results

🔍 Buscando: a kid who learns kung fu from a old sensei


Unnamed: 0,title,release_date,popularity,original_language,overview,genre_ids,adult,text,similarity
616,The Karate Kid,2010-06-10,52.662,en,Twelve-year-old Dre Parker could have been the...,"[28, 12, 18, 10751]",False,The Karate Kid - Twelve-year-old Dre Parker co...,0.64602
2050,Karate Kid: Legends,2025-05-28,35.522,en,"After a family tragedy, kung fu prodigy Li Fon...","[28, 18, 10751]",False,"Karate Kid: Legends - After a family tragedy, ...",0.617543
2019,The Karate Kid,1984-06-22,34.382,en,Daniel LaRusso moves to Los Angeles with his m...,"[28, 18, 10751]",False,The Karate Kid - Daniel LaRusso moves to Los A...,0.55807
4216,Bulletproof Monk,2003-03-28,18.89,en,A mysterious and immortal Tibetan kung fu mast...,"[28, 35, 14]",False,Bulletproof Monk - A mysterious and immortal T...,0.539531
7672,Kung Fu Jungle,2014-10-31,20.819,zh,A martial arts instructor working at a police ...,"[28, 53, 80, 12]",False,Kung Fu Jungle - A martial arts instructor wor...,0.528238


In [None]:
query = "A young guy who is bitten by a radioactive spider"
results = denseR.search(query, 10)
results

🔍 Buscando: A young guy who is bitten by a radioactive spider


Unnamed: 0,title,release_date,popularity,original_language,overview,genre_ids,adult,text,similarity
293,Spider-Man,2002-05-01,98.615,en,After being bitten by a genetically altered sp...,"[28, 878]",False,Spider-Man - After being bitten by a genetical...,0.548529
230,Spider-Man: Into the Spider-Verse,2018-12-06,96.228,en,Struggling to find his place in the world whil...,"[16, 28, 12, 878]",False,Spider-Man: Into the Spider-Verse - Struggling...,0.523907
2518,Hulk,2003-06-19,33.935,en,"Bruce Banner, a genetics researcher with a tra...","[878, 12, 28]",False,"Hulk - Bruce Banner, a genetics researcher wit...",0.482946
1407,Sting,2024-04-12,57.227,en,After raising an unnervingly talented spider i...,"[27, 878, 53]",False,Sting - After raising an unnervingly talented ...,0.478088
353,The Amazing Spider-Man,2012-06-23,79.934,en,Peter Parker is an outcast high schooler aband...,"[28, 12, 878]",False,The Amazing Spider-Man - Peter Parker is an ou...,0.460183
3396,Eight Legged Freaks,2002-07-17,22.878,en,The residents of a rural mining town discover ...,"[28, 35, 27, 53]",False,Eight Legged Freaks - The residents of a rural...,0.424509
192,Venom,2018-09-28,120.395,en,Investigative journalist Eddie Brock attempts ...,"[878, 28]",False,Venom - Investigative journalist Eddie Brock a...,0.41895
6617,The Girl in the Spider's Web,2018-10-25,22.795,en,After being enlisted to recover a dangerous co...,"[28, 80, 53]",False,The Girl in the Spider's Web - After being enl...,0.418223
6133,Spider-Man,1977-09-14,14.916,en,When an extortionist threatens to force a mult...,"[878, 28, 80, 10770]",False,Spider-Man - When an extortionist threatens to...,0.416715
2820,Infested,2023-12-27,27.163,fr,Residents of a rundown French apartment buildi...,"[27, 53]",False,Infested - Residents of a rundown French apart...,0.414117


## Preparación del modelo generativo - TinyLlama/TinyLlama-1.1B-Chat-v1.0

In [None]:
# Install transformers from source - only needed for versions <= v4.34
# pip install git+https://github.com/huggingface/transformers.git
# pip install accelerate

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import torch
import os

device= "cpu" #if not torch.cuda.is_available() else "cuda"


datosTrain = df["overview"].tolist()
dataset = Dataset.from_list([{"text": text} for text in datosTrain])

print("🔹 Entrenando el modelo de lenguaje...")

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
modelo = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizador = AutoTokenizer.from_pretrained(model_name)


tokenizador.pad_token = tokenizador.eos_token
modelo.config.pad_token_id = tokenizador.eos_token_id

def procesar_datos(ejemplo):
    tokenizado = tokenizador(
        ejemplo["text"], max_length=128, truncation=True, padding="max_length", return_tensors="pt"
    )
    return {key: tensor.squeeze() for key, tensor in tokenizado.items()}


dataset_procesado = dataset.map(procesar_datos)

# Usar DataCollatorForLanguageModeling (para gestionar el padding correctamente)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizador, mlm=False  # mlm=False porque es modelado causal, no enmascarado
)


# Ajustar las dimensiones del modelo al tokenizador
modelo.resize_token_embeddings(len(tokenizador))

# argumentos = TrainingArguments(
#     output_dir="./resultados",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=2,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     save_total_limit=1,
#     logging_steps=10,
#     report_to="none"  # Desactiva W&B
# )

# # Crear el Trainer
# trainer = Trainer(
#     model=modelo,
#     args=argumentos,
#     data_collator=data_collator,
#     train_dataset=dataset_procesado,
#     tokenizer=tokenizador,
#     eval_dataset=dataset_procesado
# )

def generar_texto(prompt, modelo, tokenizador, max_length=100, contexto: str = None):
    """
    Función para generar texto con el modelo actual.
    """
    inputs = tokenizador(prompt, return_tensors="pt").to(device)
    modelo.to(device)

    with torch.no_grad():
        if contexto:
            contexto_ids = tokenizador(contexto, return_tensors="pt").input_ids.to(device)
            inputs.input_ids = torch.cat([contexto_ids, inputs.input_ids], dim=-1)
            inputs.attention_mask = torch.cat([torch.ones(contexto_ids.shape, device=device), inputs.attention_mask], dim=-1)
        output = modelo.generate(**inputs, max_length=max_length, do_sample=True, top_k=50, top_p=0.95, temperature=0.7)
        
     

    return tokenizador.decode(output[0], skip_special_tokens=True)





🔹 Entrenando el modelo de lenguaje...


Map:   0%|          | 0/9682 [00:00<?, ? examples/s]

In [8]:
# Prueba antes del ajuste fino
query = "A young guy who is bitten by a radioactive spider"
results = denseR.search(query, 10)



peliculaMasParecida = results.iloc[0]
titulo = peliculaMasParecida["title"]
contexto = peliculaMasParecida["overview"]



print(f"Prompt: {titulo}\nContexto: {contexto}")

textoGenerado = generar_texto(titulo, modelo, tokenizador, contexto=contexto, max_length=100)

print(f"Texto generado\n: {textoGenerado}")

🔍 Buscando: A young guy who is bitten by a radioactive spider
Prompt: Spider-Man
Contexto: After being bitten by a genetically altered spider at Oscorp, nerdy but endearing high school student Peter Parker is endowed with amazing powers to become the superhero known as Spider-Man.
Texto generado
: Descripción creativa: Spider-Man: Into the Spider-Verse (2018)

4. The Incredibles (2004)
5. The Lion King (1994)
6. Toy Story (1995)
7. The Princess Bride (1987)
8. The Nightmare Before Christmas (1993)
9. The Lion King 2: Simba's Pride (1
