# Procesamiento de datos - Fase 1
## DSRP - Especialización de Machine Learning - Curso MLOps e Ingeniería de Software para ML


In [1]:
import json
import polars as pl
import numpy as np
import flytekit

from sentence_transformers import CrossEncoder, SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


## Preparación Dataset Base

In [3]:

@flytekit.task
def load_imdb_data() -> pl.DataFrame:
    """Build the base movie dataset enriched with OMDb metadata.

    Reads ``data/omdb_raw.jsonl`` (one JSON object per line, each holding an
    ``imdb_id`` and a ``raw`` mapping), extracts a fixed set of text fields and
    joins them onto ``data/movies_base.parquet`` by ``imdb_id`` using the
    default join type of ``DataFrame.join``.

    Returns:
        ``movies_base`` extended with the string columns ``Runtime``,
        ``Director``, ``Actors``, ``Plot``, ``Country`` and ``Language``.
        A field missing from a record's ``raw`` mapping becomes null.

    Raises:
        KeyError: if a record lacks ``imdb_id`` or ``raw``.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    omdb_fields = ["Runtime", "Director", "Actors", "Plot", "Country", "Language"]

    # Explicit UTF-8: the data contains non-ASCII titles and names, and the
    # platform default encoding is not UTF-8 everywhere.
    with open("data/omdb_raw.jsonl", "r", encoding="utf-8") as json_file:
        # Blank lines (e.g. a trailing newline at end of file) are skipped
        # instead of crashing json.loads.
        records = [json.loads(line) for line in json_file if line.strip()]

    complementary_imdb_data = pl.DataFrame(
        [
            [record["imdb_id"], *(record["raw"].get(field) for field in omdb_fields)]
            for record in records
        ],
        schema={"imdb_id": str, **{field: str for field in omdb_fields}},
        orient="row",
    )

    movies_base = pl.read_parquet("data/movies_base.parquet")

    return movies_base.join(
        complementary_imdb_data,
        on="imdb_id",
    )

In [4]:
# Build the joined dataset and persist it as Parquet so it can be reloaded from disk.
complete_database = load_imdb_data()
complete_database.write_parquet("data/complete_imdb_database.parquet")

In [5]:
# Preview the joined dataset.
complete_database

imdb_id,title,year,genres,imdb_rating,imdb_votes,Runtime,Director,Actors,Plot,Country,Language
str,str,i32,str,f64,i64,str,str,str,str,str,str
"""tt0002423""","""Passion""",1919,"""Biography,Drama,Romance""",6.7,1105,"""113 min""","""Ernst Lubitsch""","""Pola Negri, Emil Jannings, Har…","""The story of Madame DuBarry, t…","""Germany""","""None, German"""
"""tt0004181""","""Judith of Bethulia""",1914,"""Drama""",6.2,1525,"""61 min""","""D.W. Griffith""","""Blanche Sweet, Henry B. Waltha…","""A fascinating work of high art…","""United States""","""None, English"""
"""tt0004465""","""The Perils of Pauline""",1914,"""Action,Adventure,Drama""",6.3,1116,"""199 min""","""Louis J. Gasnier, Donald MacKe…","""Pearl White, Crane Wilbur, Pau…","""Young Pauline is left a lot of…","""United States""","""None, English"""
"""tt0003643""","""The Avenging Conscience: or 'T…",1914,"""Crime,Drama,Horror""",6.4,1557,"""78 min""","""D.W. Griffith""","""Henry B. Walthall, Spottiswood…","""Thwarted by his despotic uncle…","""United States""","""English"""
"""tt0002844""","""Fantômas: In the Shadow of the…",1913,"""Crime,Drama""",6.9,2686,"""54 min""","""Louis Feuillade""","""René Navarre, Edmund Breon, Ge…","""Princess Sonia Danidoff is sta…","""France""","""None, French"""
…,…,…,…,…,…,…,…,…,…,…,…
"""tt9907782""","""The Cursed""",2021,"""Fantasy,Horror,Mystery""",6.2,22054,"""111 min""","""Sean Ellis""","""Boyd Holbrook, Kelly Reilly, A…","""In the late nineteenth century…","""United Kingdom, France, United…","""English, Romanian"""
"""tt9908390""","""Le lion""",2020,"""Comedy""",5.5,1600,"""95 min""","""Ludovic Colbeau-Justin""","""Dany Boon, Philippe Katerine, …","""A psychiatric hospital patient…","""France, Belgium""","""French"""
"""tt9911196""","""The Marriage Escape""",2020,"""Comedy,Drama""",7.4,3520,"""103 min""","""Johan Nijenhuis""","""Herman Finkers, Johanna ter St…","""Jan and Gedda have been marrie…","""Netherlands""","""Low German, Dutch"""
"""tt9916362""","""Coven""",2020,"""Drama,History,Horror""",6.4,6167,"""92 min""","""Pablo Agüero""","""Amaia Aberasturi, Alex Brendem…","""1609, Basque Country (north to…","""Spain, France, Argentina""","""Spanish, Basque"""


# 2. Feature Engineering
## 2.1 Generación de embeddings

In [None]:
# Reload the persisted dataset from disk.
complete_database = pl.read_parquet("data/complete_imdb_database.parquet")

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# One text per movie: "<title>. <plot>".
# Nulls are replaced by empty strings first: concatenating a string with a null
# yields null, which would put a None entry in the list handed to the encoder.
# Filling (rather than dropping) keeps one text per row of complete_database.
texts = (
    complete_database["title"].fill_null("")
    + ". "
    + complete_database["Plot"].fill_null("")
).to_list()

embs = model.encode(texts, batch_size=64)
embs = np.asarray(embs, dtype="float32")

np.save("data/movie_embs.npy", embs)

## 2.2 Creación de nuevas features

In [6]:
# Derived features, each computed from the original columns.
REFERENCE_YEAR = 2025  # fixed "current year" used to compute movie_age
year_col = pl.col("year")

complete_database = complete_database.with_columns(
    # log(1 + votes)
    imdb_votes_log=pl.col("imdb_votes").log1p(),
    # z-score of the release year over the whole frame
    year_norm=(year_col - year_col.mean()) / year_col.std(),
    movie_age=REFERENCE_YEAR - year_col,
    # number of characters in the plot text
    plot_length=pl.col("Plot").str.len_chars(),
    # one-hot style flag: 1 when the genres string contains "Action"
    genre_action=pl.col("genres").str.contains("Action").cast(pl.Int8()),
)

complete_database

imdb_id,title,year,genres,imdb_rating,imdb_votes,Runtime,Director,Actors,Plot,Country,Language,imdb_votes_log,year_norm,movie_age,plot_length,genre_action
str,str,i32,str,f64,i64,str,str,str,str,str,str,f64,f64,i32,u32,i8
"""tt0002423""","""Passion""",1919,"""Biography,Drama,Romance""",6.7,1105,"""113 min""","""Ernst Lubitsch""","""Pola Negri, Emil Jannings, Har…","""The story of Madame DuBarry, t…","""Germany""","""None, German""",7.008505,-3.574081,106,116,0
"""tt0004181""","""Judith of Bethulia""",1914,"""Drama""",6.2,1525,"""61 min""","""D.W. Griffith""","""Blanche Sweet, Henry B. Waltha…","""A fascinating work of high art…","""United States""","""None, English""",7.330405,-3.79282,111,4666,0
"""tt0004465""","""The Perils of Pauline""",1914,"""Action,Adventure,Drama""",6.3,1116,"""199 min""","""Louis J. Gasnier, Donald MacKe…","""Pearl White, Crane Wilbur, Pau…","""Young Pauline is left a lot of…","""United States""","""None, English""",7.018402,-3.79282,111,368,1
"""tt0003643""","""The Avenging Conscience: or 'T…",1914,"""Crime,Drama,Horror""",6.4,1557,"""78 min""","""D.W. Griffith""","""Henry B. Walthall, Spottiswood…","""Thwarted by his despotic uncle…","""United States""","""English""",7.351158,-3.79282,111,332,0
"""tt0002844""","""Fantômas: In the Shadow of the…",1913,"""Crime,Drama""",6.9,2686,"""54 min""","""Louis Feuillade""","""René Navarre, Edmund Breon, Ge…","""Princess Sonia Danidoff is sta…","""France""","""None, French""",7.896181,-3.836568,112,2579,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""tt9907782""","""The Cursed""",2021,"""Fantasy,Horror,Mystery""",6.2,22054,"""111 min""","""Sean Ellis""","""Boyd Holbrook, Kelly Reilly, A…","""In the late nineteenth century…","""United Kingdom, France, United…","""English, Romanian""",10.001295,0.88819,4,393,0
"""tt9908390""","""Le lion""",2020,"""Comedy""",5.5,1600,"""95 min""","""Ludovic Colbeau-Justin""","""Dany Boon, Philippe Katerine, …","""A psychiatric hospital patient…","""France, Belgium""","""French""",7.378384,0.844442,5,157,0
"""tt9911196""","""The Marriage Escape""",2020,"""Comedy,Drama""",7.4,3520,"""103 min""","""Johan Nijenhuis""","""Herman Finkers, Johanna ter St…","""Jan and Gedda have been marrie…","""Netherlands""","""Low German, Dutch""",8.1665,0.844442,5,364,0
"""tt9916362""","""Coven""",2020,"""Drama,History,Horror""",6.4,6167,"""92 min""","""Pablo Agüero""","""Amaia Aberasturi, Alex Brendem…","""1609, Basque Country (north to…","""Spain, France, Argentina""","""Spanish, Basque""",8.72713,0.844442,5,1760,0


## 2.3 Creación de Queries Sintéticas

In [7]:
# Number of most frequent genres kept for synthetic query generation.
TOP_N_GENRES = 15

# One row per (movie, genre): split the comma-separated "genres" string,
# explode the resulting list, trim whitespace and drop null/empty entries.
genre_df = (
    complete_database
    .select(pl.col("genres").str.split(",").alias("genres_list"))
    .explode("genres_list")
    .with_columns(
        pl.col("genres_list").str.strip_chars().alias("genre")
    )
    .filter(pl.col("genre").is_not_null() & (pl.col("genre") != ""))
)

top_genres = (
    genre_df
    .group_by("genre")
    .len()
    # Secondary key on the genre name makes the order of equally frequent
    # genres deterministic; group_by does not guarantee any output order.
    .sort(["len", "genre"], descending=[True, False])
    .head(TOP_N_GENRES)
    .get_column("genre")
    .to_list()
)

top_genres

['Drama',
 'Comedy',
 'Action',
 'Romance',
 'Crime',
 'Thriller',
 'Horror',
 'Adventure',
 'Mystery',
 'Fantasy',
 'Biography',
 'Documentary',
 'Sci-Fi',
 'Family',
 'History']

In [8]:
# Release years with nulls removed, plus their observed range.
years = complete_database["year"].drop_nulls()
min_year, max_year = years.min(), years.max()

# Decade buckets: floor each year to a multiple of 10, then keep the
# distinct values in ascending order.
decades = (years // 10 * 10).unique().sort().to_list()
decades

[1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020]

In [10]:
from itertools import product
# ---------------------------------------------------------
# 3. Generar queries sintéticas con 'emphasis'
#    - emphasis = "rating"      → queries con BEST / TOP
#    - emphasis = "popularity"  → queries con POPULAR
#    - emphasis = "neutral"     → queries neutras
# ---------------------------------------------------------

def generate_synthetic_queries(top_genres: list[str], decades: list[int]) -> list[dict]:
    """Generate the synthetic query catalogue.

    Three query families are emitted, in this order:

    1. genre only (four templates per genre),
    2. genre + decade (two templates per genre/decade pair),
    3. mood + genre (three moods per genre).

    Args:
        top_genres: Genre names; the lower-cased name is used in the query
            text, the original spelling is stored under ``"genre"``.
        decades: Decade start years (e.g. ``1990``), rendered as ``"1990s"``.

    Returns:
        A list of dicts with keys ``query_id`` (sequential, starting at 1),
        ``query_text``, ``intent_type``, ``genre``, ``decade`` (``None`` when
        the query has no decade) and ``emphasis`` (``"rating"``,
        ``"popularity"`` or ``"neutral"``).
    """
    # (template, intent_type, emphasis)
    genre_only_specs = (
        ("best {genre} movies", "genre_only", "rating"),
        ("top rated {genre} movies", "genre_only", "rating"),
        ("classic {genre} movies", "genre_only", "neutral"),
        ("popular {genre} movies", "genre_only", "popularity"),
    )
    genre_decade_specs = (
        ("best {genre} movies from the {decade}s", "genre_decade", "rating"),
        ("popular {genre} movies from the {decade}s", "genre_decade", "popularity"),
    )
    # (mood text, intent_type, emphasis)
    mood_specs = (
        ("feel good", "mood_feel_good", "neutral"),
        ("dark", "mood_dark", "neutral"),
        ("family friendly", "mood_family", "neutral"),
    )

    collected: list[dict] = []

    def _emit(text, intent, genre, decade, emphasis):
        # The id is the 1-based position of the query in the output list.
        collected.append(
            {
                "query_id": len(collected) + 1,
                "query_text": text,
                "intent_type": intent,
                "genre": genre,
                "decade": decade,
                "emphasis": emphasis,
            }
        )

    # Family 1: genre only.
    for genre in top_genres:
        for template, intent, emphasis in genre_only_specs:
            _emit(template.format(genre=genre.lower()), intent, genre, None, emphasis)

    # Family 2: every genre/decade combination.
    for genre, decade in product(top_genres, decades):
        for template, intent, emphasis in genre_decade_specs:
            _emit(
                template.format(genre=genre.lower(), decade=decade),
                intent,
                genre,
                decade,
                emphasis,
            )

    # Family 3: mood + genre.
    for genre in top_genres:
        for mood_text, mood_tag, emphasis in mood_specs:
            _emit(f"{mood_text} {genre.lower()} movies", mood_tag, genre, None, emphasis)

    return collected

synthetic_queries = generate_synthetic_queries(top_genres, decades)
print(f"Total synthetic queries: {len(synthetic_queries)}")

# Convert the list of query dicts into a Polars DataFrame (one row per query)
# and show the first ten rows.
queries_df = pl.DataFrame(synthetic_queries)
queries_df.head(10)

Total synthetic queries: 495


query_id,query_text,intent_type,genre,decade,emphasis
i64,str,str,str,i64,str
1,"""best drama movies""","""genre_only""","""Drama""",,"""rating"""
2,"""top rated drama movies""","""genre_only""","""Drama""",,"""rating"""
3,"""classic drama movies""","""genre_only""","""Drama""",,"""neutral"""
4,"""popular drama movies""","""genre_only""","""Drama""",,"""popularity"""
5,"""best comedy movies""","""genre_only""","""Comedy""",,"""rating"""
6,"""top rated comedy movies""","""genre_only""","""Comedy""",,"""rating"""
7,"""classic comedy movies""","""genre_only""","""Comedy""",,"""neutral"""
8,"""popular comedy movies""","""genre_only""","""Comedy""",,"""popularity"""
9,"""best action movies""","""genre_only""","""Action""",,"""rating"""
10,"""top rated action movies""","""genre_only""","""Action""",,"""rating"""


In [19]:
# Preview the synthetic-query table.
queries_df

query_id,query_text,intent_type,genre,decade,emphasis
i64,str,str,str,i64,str
1,"""best drama movies""","""genre_only""","""Drama""",,"""rating"""
2,"""top rated drama movies""","""genre_only""","""Drama""",,"""rating"""
3,"""classic drama movies""","""genre_only""","""Drama""",,"""neutral"""
4,"""popular drama movies""","""genre_only""","""Drama""",,"""popularity"""
5,"""best comedy movies""","""genre_only""","""Comedy""",,"""rating"""
…,…,…,…,…,…
491,"""dark family movies""","""mood_dark""","""Family""",,"""neutral"""
492,"""family friendly family movies""","""mood_family""","""Family""",,"""neutral"""
493,"""feel good history movies""","""mood_feel_good""","""History""",,"""neutral"""
494,"""dark history movies""","""mood_dark""","""History""",,"""neutral"""


In [16]:
# Sentence encoder used below to embed query text.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Load the movie embedding matrix from disk and display it.
embs = np.load("data/movie_embs.npy")
embs

array([[-0.02956097,  0.00746341,  0.02317246, ...,  0.09302959,
         0.04051323, -0.04066506],
       [ 0.0043345 ,  0.0106766 , -0.00462149, ...,  0.06287877,
         0.01587654, -0.05517145],
       [-0.05460579,  0.06455793, -0.01173891, ..., -0.02748005,
         0.03246444, -0.01696188],
       ...,
       [-0.03798081,  0.09147337,  0.04804617, ..., -0.04112181,
         0.06490831,  0.02602935],
       [ 0.02010926,  0.02452525, -0.10270256, ..., -0.00185721,
         0.0069349 , -0.04031993],
       [-0.06547838,  0.05664021, -0.05948014, ...,  0.03930681,
        -0.00102902, -0.04971379]], shape=(47203, 384), dtype=float32)

In [17]:
# L2-normalise every embedding row; the small epsilon avoids a division by
# zero when a row has zero norm.
movie_norms = np.linalg.norm(embs, axis=1, keepdims=True)
movie_embs_norm = embs / (movie_norms + 1e-9)

def get_candidates_for_query(
    q_row: dict,
    movies_df: pl.DataFrame,
    movie_embs_norm: np.ndarray,
    k: int = 100,
) -> pl.DataFrame:
    """Return the ``k`` movies whose embeddings score highest for a query.

    The query text is encoded with the module-level ``model``, L2-normalised
    and scored against every row of ``movie_embs_norm`` with a dot product.

    Args:
        q_row: Query record; ``query_id`` and ``query_text`` are read.
        movies_df: Movie table. Row ``i`` is taken to describe the movie whose
            embedding is row ``i`` of ``movie_embs_norm``.
        movie_embs_norm: 2-D embedding matrix. Rows are expected to be
            L2-normalised so that the dot product is a cosine similarity.
        k: Maximum number of candidates. Values larger than the number of
            movies are capped; values below zero are treated as zero.

    Returns:
        A frame sorted by descending ``sim_embedding`` with the columns
        ``query_id``, ``query_text``, ``imdb_id``, ``title``,
        ``sim_embedding``, followed by ``imdb_rating``, ``imdb_votes_log``,
        ``year`` and ``genres`` when those exist in ``movies_df``.
    """
    qid = q_row["query_id"]
    qtext = q_row["query_text"]

    # 1) Query embedding, normalised to unit length.
    q_emb = model.encode([qtext]).astype("float32")[0]
    q_emb = q_emb / (np.linalg.norm(q_emb) + 1e-9)

    # 2) Similarity against all movies: (N,) = (N, d) @ (d,)
    scores = movie_embs_norm @ q_emb

    # 3) Indices of the top-k scores.
    n_movies = scores.shape[0]
    k = max(0, min(k, n_movies))
    if k == 0:
        idxs = np.empty(0, dtype=np.intp)
    elif k < n_movies:
        # kth = k - 1 places the k largest scores in the first k slots.
        # kth = k would be out of bounds whenever k == n_movies.
        idxs = np.argpartition(-scores, k - 1)[:k]
    else:
        # Every movie is a candidate; no partitioning needed.
        idxs = np.arange(n_movies)
    idxs = idxs[np.argsort(-scores[idxs])]  # highest score first

    # 4) Candidate frame with the similarity and the query identity attached.
    cand = movies_df[idxs].with_columns(
        pl.Series("sim_embedding", scores[idxs]),
        pl.lit(qid).alias("query_id"),
        pl.lit(qtext).alias("query_text"),
    )

    # 5) Output columns: fixed ones first, optional ones only when present.
    cols = ["query_id", "query_text", "imdb_id", "title", "sim_embedding"]
    cols.extend(
        extra
        for extra in ["imdb_rating", "imdb_votes_log", "year", "genres"]
        if extra in cand.columns
    )

    return cand.select(cols)


# Retrieve the top-100 candidates for every synthetic query, then stack the
# per-query frames into a single candidates table.
all_candidates = [
    get_candidates_for_query(
        q_row=query_record,
        movies_df=complete_database,
        movie_embs_norm=movie_embs_norm,
        k=100,
    )
    for query_record in queries_df.iter_rows(named=True)
]

candidates_df = pl.concat(all_candidates)

candidates_df.head()

query_id,query_text,imdb_id,title,sim_embedding,imdb_rating,imdb_votes_log,year,genres
i32,str,str,str,f32,f64,f64,i32,str
1,"""best drama movies""","""tt1592292""","""Zero 2""",0.53556,7.5,8.669743,2010,"""Comedy,Thriller"""
1,"""best drama movies""","""tt26445483""","""Fukrey 3""",0.518885,5.5,10.158672,2023,"""Comedy,Crime,Drama"""
1,"""best drama movies""","""tt8165192""","""Raging Fire""",0.518501,6.5,8.817446,2021,"""Action,Adventure,Crime"""
1,"""best drama movies""","""tt13049760""","""The Last Matinee""",0.505715,5.8,7.948032,2020,"""Horror,Thriller"""
1,"""best drama movies""","""tt18163024""","""Chaaruseela""",0.501941,8.5,7.604396,2022,


## 2.4 Cálculo de Relevancia

In [18]:
# Preview the candidates table.
candidates_df

query_id,query_text,imdb_id,title,sim_embedding,imdb_rating,imdb_votes_log,year,genres
i32,str,str,str,f32,f64,f64,i32,str
1,"""best drama movies""","""tt1592292""","""Zero 2""",0.53556,7.5,8.669743,2010,"""Comedy,Thriller"""
1,"""best drama movies""","""tt26445483""","""Fukrey 3""",0.518885,5.5,10.158672,2023,"""Comedy,Crime,Drama"""
1,"""best drama movies""","""tt8165192""","""Raging Fire""",0.518501,6.5,8.817446,2021,"""Action,Adventure,Crime"""
1,"""best drama movies""","""tt13049760""","""The Last Matinee""",0.505715,5.8,7.948032,2020,"""Horror,Thriller"""
1,"""best drama movies""","""tt18163024""","""Chaaruseela""",0.501941,8.5,7.604396,2022,
…,…,…,…,…,…,…,…,…
495,"""family friendly history movies""","""tt28491891""","""His Three Daughters""",0.447149,7.1,9.924809,2023,"""Drama"""
495,"""family friendly history movies""","""tt11847972""","""Breaking Up in Rome""",0.446048,6.5,7.079184,2021,"""Comedy,Romance"""
495,"""family friendly history movies""","""tt0053134""","""Good Morning""",0.445699,7.8,9.401374,1959,"""Comedy,Drama,Family"""
495,"""family friendly history movies""","""tt14826022""","""You People""",0.445516,5.5,11.104009,2023,"""Comedy,Romance"""


In [24]:
def add_rel_score_for_query(
    cand: pl.DataFrame,
    q_row: dict,
    max_log_votes: float,
) -> pl.DataFrame:
    """Add a ``rel_score`` column blending similarity, rating and votes.

    ``rel_score = w_sim * sim_embedding
                + w_rating * imdb_rating / 10
                + w_votes * imdb_votes_log / max_log_votes``

    The weights depend on ``q_row["emphasis"]``:

    * ``"rating"``      -> (0.4, 0.4, 0.2)
    * ``"popularity"``  -> (0.3, 0.2, 0.5)
    * anything else     -> (0.2, 0.6, 0.2)

    Args:
        cand: Candidates with ``sim_embedding``, ``imdb_rating`` and
            ``imdb_votes_log`` columns.
        q_row: Query record; only ``emphasis`` is read.
        max_log_votes: Divisor applied to ``imdb_votes_log``. It was
            previously accepted but ignored in favour of a hard-coded ``15``.

    Returns:
        ``cand`` with an extra ``rel_score`` column.

    Raises:
        ValueError: if ``max_log_votes`` is not positive.
    """
    if max_log_votes <= 0:
        raise ValueError(f"max_log_votes must be positive, got {max_log_votes!r}")

    # (w_sim, w_rating, w_votes) per emphasis.
    # NOTE(review): the fallback puts more weight on rating (0.6) than the
    # "rating" emphasis does (0.4) - confirm this is intended.
    weights_by_emphasis = {
        "rating": (0.4, 0.4, 0.2),
        "popularity": (0.3, 0.2, 0.5),
    }
    default_weights = (0.2, 0.6, 0.2)
    w_sim, w_rating, w_votes = weights_by_emphasis.get(q_row["emphasis"], default_weights)

    return cand.with_columns(
        (
            w_sim * pl.col("sim_embedding") +
            w_rating * (pl.col("imdb_rating") / 10.0) +
            w_votes * (pl.col("imdb_votes_log") / max_log_votes)
        ).alias("rel_score")
    )


def add_label_from_rel_score(cand: pl.DataFrame, n_bins: int = 4) -> pl.DataFrame:
    """Turn ``rel_score`` into a graded integer label in ``0 .. n_bins - 1``.

    Rows are sorted by descending ``rel_score`` and cut into ``n_bins``
    consecutive groups of ``max(1, n_rows // n_bins)`` rows. The best group
    gets label ``n_bins - 1`` and the last group gets ``0``. When the row
    count is not a multiple of ``n_bins``, the surplus rows fall into the
    last group.

    Args:
        cand: Candidates for a single query, with a ``rel_score`` column.
        n_bins: Number of distinct label values.

    Returns:
        ``cand`` sorted by descending ``rel_score`` with an ``Int32`` ``label``
        column added. An empty input yields an empty frame with the same
        ``label`` column, so the output schema does not depend on the input
        size.

    Raises:
        ValueError: if ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError(f"n_bins must be >= 1, got {n_bins!r}")

    # with_row_index replaces the deprecated with_row_count.
    ranked = cand.sort("rel_score", descending=True).with_row_index("rank")

    bin_size = max(1, ranked.height // n_bins)

    # Preliminary bucket from the 0-based rank. The cast to a signed type
    # keeps the subtraction below clear of unsigned arithmetic.
    raw_bucket = pl.col("rank").cast(pl.Int64) // bin_size

    # Cap at the last bucket so surplus rows do not create extra buckets.
    bucket = (
        pl.when(raw_bucket > (n_bins - 1))
        .then(n_bins - 1)
        .otherwise(raw_bucket)
    )

    # Invert so that the best-ranked rows receive the highest label.
    return (
        ranked
        .with_columns((n_bins - 1 - bucket).cast(pl.Int32).alias("label"))
        .drop("rank")
    )


# Labelling settings.
MAX_LOG_VOTES = 15  # divisor used to scale imdb_votes_log inside rel_score
N_LABEL_BINS = 5    # labels take the values 0 .. N_LABEL_BINS - 1 (0..4)

# Index queries by id for direct lookup.
queries_by_id = {row["query_id"]: row for row in queries_df.iter_rows(named=True)}

ltr_chunks = []

# Split the candidates once per query instead of re-filtering the whole frame
# for every query id; maintain_order keeps groups in first-appearance order.
for cand in candidates_df.partition_by("query_id", maintain_order=True):
    qid = cand["query_id"][0]
    q_row = queries_by_id.get(qid)
    if q_row is None:
        # Candidates whose query is not in queries_df are ignored.
        continue

    # 1) rel_score from sim_embedding + rating + votes, weighted by emphasis
    cand = add_rel_score_for_query(cand, q_row, MAX_LOG_VOTES)

    # 2) discrete label 0 .. N_LABEL_BINS - 1
    cand = add_label_from_rel_score(cand, n_bins=N_LABEL_BINS)

    ltr_chunks.append(cand)

ltr_df = pl.concat(ltr_chunks)

ltr_df.head(30)

  cand = cand.sort("rel_score", descending=True).with_row_count("rank")


query_id,query_text,imdb_id,title,sim_embedding,imdb_rating,imdb_votes_log,year,genres,rel_score,label
i32,str,str,str,f32,f64,f64,i32,str,f64,i32
1,"""best drama movies""","""tt9179430""","""Vikram""",0.463414,8.3,11.37896,2022,"""Action,Crime,Thriller""",0.669085,4
1,"""best drama movies""","""tt18163024""","""Chaaruseela""",0.501941,8.5,7.604396,2022,,0.642168,4
1,"""best drama movies""","""tt0100998""","""Dreams""",0.483811,7.7,10.370048,1990,"""Drama,Fantasy""",0.639792,4
1,"""best drama movies""","""tt0187231""","""Barking at the Stars""",0.425586,8.6,9.091332,1998,"""Comedy,Romance""",0.635452,4
1,"""best drama movies""","""tt9006564""","""2018""",0.427564,8.3,9.691222,2023,"""Action,Drama,Thriller""",0.632242,4
…,…,…,…,…,…,…,…,…,…,…
1,"""best drama movies""","""tt0281724""","""Cinemania""",0.446192,7.1,7.800163,2002,"""Documentary""",0.566479,3
1,"""best drama movies""","""tt0424992""","""Emma's Bliss""",0.437226,7.2,7.671361,2006,"""Drama,Romance""",0.565175,3
1,"""best drama movies""","""tt19886242""","""Rorschach""",0.424209,6.9,8.903543,2022,"""Action,Mystery,Thriller""",0.564397,3
1,"""best drama movies""","""tt26445483""","""Fukrey 3""",0.518885,5.5,10.158672,2023,"""Comedy,Crime,Drama""",0.563003,3


In [135]:
# Persist the labelled dataset (ltr_df) as Parquet.
ltr_df.write_parquet("data/ltr_imdb_dataset.parquet")

# 3. Cliente Qdrant para búsqueda semántica

In [31]:
import numpy as np
import polars as pl
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

# embs: (N, dim) float32
movie_embs = np.asarray(embs, dtype="float32")
num_items, dim = movie_embs.shape

# The embedding matrix and the movie table must have the same number of rows,
# because row positions are used to link the two.
assert complete_database.height == num_items, "movies_df y embs no tienen el mismo número de filas"

# Qdrant client backed by a local folder.
client = QdrantClient(path="qdrant_data")

collection_name = "movies"

# Start from an empty collection: drop it when it already exists, then create
# it. This matches what the deprecated recreate_collection call did.
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)

client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=dim,
        distance=Distance.COSINE,
    ),
)


  client.recreate_collection(


True

In [33]:
# Pull the payload columns out of the DataFrame once, instead of doing two
# scalar DataFrame lookups per movie inside the loop.
imdb_ids = complete_database["imdb_id"].to_list()
titles = complete_database["title"].to_list()

# One point per movie. The point id and the "row_index" payload field are both
# the row position, which is how search results are mapped back to the table.
# zip stops at the shortest input, so the three sequences must be equally long.
points = [
    PointStruct(
        id=i,
        vector=vec.tolist(),
        payload={
            "row_index": i,
            "imdb_id": imdb_id,
            "title": title,
        },
    )
    for i, (vec, imdb_id, title) in enumerate(zip(movie_embs, imdb_ids, titles))
]

client.upsert(
    collection_name=collection_name,
    points=points,
)


  client.upsert(


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [26]:
from sentence_transformers import SentenceTransformer

# Load the sentence encoder used to embed search queries.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

from qdrant_client.models import Filter
from qdrant_client import QdrantClient

def search_movies_qdrant(
    query_text: str,
    model: SentenceTransformer,
    client: QdrantClient,
    collection_name: str,
    movies_df: pl.DataFrame,
    k: int = 10,
) -> pl.DataFrame:
    """Run a semantic search in Qdrant and return the matching movie rows.

    Args:
        query_text: Free-text query to embed.
        model: Sentence encoder used to embed ``query_text``.
        client: Qdrant client holding the collection.
        collection_name: Collection to query.
        movies_df: Movie table. Each hit's ``row_index`` payload field is used
            as a row position into this frame.
        k: Maximum number of hits requested from Qdrant.

    Returns:
        A frame sorted by descending ``sim_embedding`` with the columns
        ``imdb_id``, ``title``, ``sim_embedding``, followed by
        ``imdb_rating``, ``imdb_votes``, ``year`` and ``Plot`` when those
        exist in ``movies_df``.
    """
    # 1) Query embedding.
    q_emb = model.encode([query_text]).astype("float32")[0]

    # 2) Nearest-neighbour search in Qdrant.
    hits = client.query_points(
        collection_name=collection_name,
        query=q_emb.tolist(),
        limit=k,
        with_payload=True,
    )

    # 3) Collect row positions and scores from the hits.
    row_indices = []
    scores = []
    for hit in hits.points:
        # The score is computed by Qdrant under the distance metric the
        # collection was created with.
        scores.append(hit.score)
        # Row position stored in the payload when the point was inserted.
        row_indices.append(hit.payload.get("row_index"))

    row_indices = np.array(row_indices, dtype=int)
    scores = np.array(scores, dtype="float32")

    # 4) Map back to movies_df and attach the similarity.
    cand = movies_df[row_indices].with_columns(
        pl.Series("sim_embedding", scores)
    ).sort("sim_embedding", descending=True)

    # 5) Output columns: fixed ones first, optional ones only when present.
    # The plot column is named "Plot" (capital P); the previous lower-case
    # "plot" never matched, so the plot was silently left out.
    cols = ["imdb_id", "title", "sim_embedding"]
    for extra in ["imdb_rating", "imdb_votes", "year", "Plot"]:
        if extra in cand.columns:
            cols.append(extra)

    return cand.select(cols)



In [34]:
# Open the local Qdrant storage folder and run a sample semantic search.
# NOTE(review): an earlier cell also creates a QdrantClient on this same path -
# confirm that opening a second client is intended rather than reusing it.
client = QdrantClient(path="qdrant_data")
collection_name = "movies"

# Top-10 hits for a sample query, mapped back to rows of complete_database.
results = search_movies_qdrant(
    "top rated comedy movies",
    model=model,
    client=client,
    collection_name=collection_name,
    movies_df=complete_database,
    k=10,
)

results

  client = QdrantClient(path="qdrant_data")
