# Modelo LTR - Learning To Rank

In [4]:
import dagshub
import mlflow

import polars as pl
import numpy as np

import lightgbm as lgb

from sklearn.model_selection import train_test_split



# Initialise the DagsHub integration for this repository; mlflow=True asks
# DagsHub to wire up MLflow tracking (the run links printed further down
# point at this repository's hosted MLflow UI).
dagshub.init(repo_owner='abdala9512', repo_name='dsrp-machine-learning-engineering-4', mlflow=True)

In [5]:
# Sort by query so that every query's documents are contiguous: LightGBM's
# `group` argument is a list of consecutive block sizes, so the row order of X
# and the order of the group sizes must agree.
ltr_df = pl.read_parquet("data/ltr_imdb_dataset.parquet").sort(
    "query_id", maintain_order=True
)

# Feature columns (pick whichever ones you want to include).
feature_cols = [c for c in ltr_df.columns if c.startswith("emb_")] + [
    "sim_embedding",
    "imdb_rating",
    "imdb_votes_log",
]

# Feature matrix X
X = ltr_df.select(feature_cols).to_numpy()

# Label vector y
y = ltr_df["label"].to_numpy()

# Groups = number of documents per query, in the same order as the rows of X.
# maintain_order=True is required: a plain group_by returns the groups in
# arbitrary order, which would silently misalign sizes and rows.
# `.len()` replaces the deprecated `.count()`.
groups = (
    ltr_df
    .group_by("query_id", maintain_order=True)
    .len()
    ["len"]
    .to_numpy()
)

# Every row must belong to exactly one group.
assert groups.sum() == X.shape[0], "group sizes do not add up to the number of rows"

print("X shape:", X.shape)
print("y shape:", y.shape)
print("num groups:", len(groups))


X shape: (21000, 3)
y shape: (21000,)
num groups: 210


  .count()


In [None]:
# Create the experiment only when it is missing: mlflow.create_experiment
# raises if the name is already taken, which would break a re-run of the notebook.
if mlflow.get_experiment_by_name("LTR - Claser 15 Diciembre") is None:
    mlflow.create_experiment("LTR - Claser 15 Diciembre")

In [62]:
# Name under which the model is registered. Defined in this cell so that it
# runs on a fresh kernel (Restart & Run All).
MODEL_NAME = "lts-dsrpflix-prd"

mlflow.set_experiment("LTR - Claser 15 Diciembre")

mlflow.autolog()

with mlflow.start_run(run_name="LGBM") as run:

    ranker = lgb.LGBMRanker(
        objective="lambdarank",
        boosting_type="gbdt",  # gbdt + lambdarank objective = LambdaMART
        n_estimators=50,       # small on purpose: this is a proof of concept
        learning_rate=0.1,
        num_leaves=31,
    )

    # `groups` holds the number of consecutive rows of X that belong to each query.
    ranker.fit(
        X,
        y,
        group=groups,
    )

    mlflow.lightgbm.log_model(
        lgb_model=ranker,
        name="ltr-model",
        # A few rows are enough to infer the model signature; passing the
        # whole training matrix would store all of it as the example artifact.
        input_example=X[:5],
        registered_model_name=MODEL_NAME,
    )

    print("Modelo entrenado!")

2025/12/15 21:44:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/12/15 21:44:00 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000252 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 586
[LightGBM] [Info] Number of data points in the train set: 21000, number of used features: 3


Registered model 'lts-dsrpflix-prd' already exists. Creating a new version of this model...
2025/12/15 21:44:26 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lts-dsrpflix-prd, version 5
Created version '5' of model 'lts-dsrpflix-prd'.


Modelo entrenado!
üèÉ View run LGBM at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-4.mlflow/#/experiments/0/runs/b53ab2a2b25e4f95be57802ee49c5299
üß™ View experiment at: https://dagshub.com/abdala9512/dsrp-machine-learning-engineering-4.mlflow/#/experiments/0


In [57]:
# Pick the run with the largest `num_leaves`.
# - MLflow returns params as strings, so they are compared numerically here;
#   as text, "9" would sort above "31".
# - .iloc[0] selects by position. The previous [0] selected by index label,
#   i.e. the row that was first *before* sorting.
# NOTE(review): num_leaves is a hyperparameter, not a quality metric — consider
# ranking the runs on a logged evaluation metric instead.
runs_df = mlflow.search_runs()
if runs_df.empty:
    raise ValueError("mlflow.search_runs() returned no runs")

best_run = (
    runs_df
    .sort_values(
        by="params.num_leaves",
        key=lambda col: col.astype(float),
        ascending=False,
    )
    ["run_id"]
    .iloc[0]
)

In [47]:
# Build the URI of `model.pkl` under the selected run's artifact location.
run_info = mlflow.get_run(run_id=best_run).info
model_uri = f"{run_info.artifact_uri}/model.pkl"
model_uri

'mlflow-artifacts:/1f75ef0243bc4acda854471c6c253611/6acef7b054dc40459d872304dc021420/artifacts/model.pkl'

In [48]:
# Name of the registered model in the MLflow Model Registry; the registration,
# alias and loading cells below all refer to it, and so does the training cell
# earlier in the notebook.
MODEL_NAME = "lts-dsrpflix-prd"

In [54]:
# Register the artifact location of the selected run instead of a URI pasted
# from a previous session, so the cell keeps working when `best_run` changes.
# NOTE(review): the training cell already registers a version through
# `registered_model_name`; confirm this extra registration is still wanted.
mlflow.register_model(
    name=MODEL_NAME,
    model_uri=mlflow.get_run(run_id=best_run).info.artifact_uri,
)

Registered model 'lts-dsrpflix-prd' already exists. Creating a new version of this model...
2025/12/15 21:39:42 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lts-dsrpflix-prd, version 3
Created version '3' of model 'lts-dsrpflix-prd'.


<ModelVersion: aliases=[], creation_timestamp=1765852782443, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1765852782443, metrics=None, model_id=None, name='lts-dsrpflix-prd', params=None, run_id='', run_link='', source='mlflow-artifacts:/1f75ef0243bc4acda854471c6c253611/6acef7b054dc40459d872304dc021420/artifacts', status='READY', status_message=None, tags={}, user_id='', version='3'>

In [63]:
# Registry version that the "champion" alias should point to.
CHAMPION_VERSION = 5

client = mlflow.MlflowClient()
client.set_registered_model_alias(MODEL_NAME, "champion", CHAMPION_VERSION)

In [64]:
# Resolve the "champion" alias in the registry and load that version as a
# generic pyfunc model.
champion_uri = f"models:/{MODEL_NAME}@champion"
prod_model = mlflow.pyfunc.load_model(champion_uri)

Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:07<00:00,  1.02s/it]


In [60]:
# Display the loaded model's summary (artifact path, flavor, run id).
prod_model

mlflow.pyfunc.loaded_model:
  artifact_path: mlflow-artifacts:/1f75ef0243bc4acda854471c6c253611/models/m-5aa51eacd5f34593a1c31baf41119a57/artifacts
  flavor: mlflow.lightgbm
  run_id: 6044248016254b838f01d127b713dfed

In [65]:
def test_single_query(qid, model, top_k=10):
    """Score the documents of one query and return the best-ranked ones.

    Reads the notebook-level ``ltr_df`` and ``feature_cols``.

    Args:
        qid: Value of ``query_id`` whose rows are selected.
        model: Object exposing ``predict(matrix)`` that returns one score per row.
        top_k: Number of rows to keep (default 10).

    Returns:
        A polars DataFrame with the query's rows plus a ``ranker_score``
        column, sorted by that score in descending order and cut to ``top_k`` rows.
    """
    query_df = ltr_df.filter(pl.col("query_id") == qid)
    scores = model.predict(query_df.select(feature_cols).to_numpy())

    return (
        query_df
        .with_columns(pl.Series("ranker_score", scores))
        .sort("ranker_score", descending=True)
        .head(top_k)
    )

test_single_query(3, model=prod_model)




query_id,query_text,imdb_id,title,sim_embedding,imdb_rating,imdb_votes_log,year,genres,rel_score,label,ranker_score
i32,str,str,str,f32,f64,f64,i32,str,f64,i32,f64
3,"""classic drama movies""","""tt0088930""","""Clue""",0.451998,7.3,11.696346,1985,"""Comedy,Crime,Mystery""",0.601407,3,5.524147
3,"""classic drama movies""","""tt9179430""","""Vikram""",0.437424,8.3,11.37896,2022,"""Action,Crime,Thriller""",0.619876,3,5.519233
3,"""classic drama movies""","""tt0105151""","""The Player""",0.454028,7.5,11.17052,1992,"""Comedy,Crime,Drama""",0.60139,3,5.462067
3,"""classic drama movies""","""tt1462764""","""Indiana Jones and the Dial of ‚Ä¶",0.442078,6.5,12.333731,2023,"""Action,Adventure,Sci-Fi""",0.58097,3,5.445153
3,"""classic drama movies""","""tt0048281""","""The Ladykillers""",0.434312,7.6,10.42106,1955,"""Comedy,Crime""",0.58451,3,5.092517
3,"""classic drama movies""","""tt0100998""","""Dreams""",0.431395,7.7,10.370048,1990,"""Drama,Fantasy""",0.58537,3,4.752521
3,"""classic drama movies""","""tt0097108""","""The Cook, the Thief, His Wife ‚Ä¶",0.431122,7.5,10.699913,1989,"""Crime,Drama""",0.583644,3,4.685198
3,"""classic drama movies""","""tt0808279""","""Funny Games""",0.460056,6.5,11.60111,2007,"""Crime,Drama,Thriller""",0.580163,3,4.667681
3,"""classic drama movies""","""tt0485510""","""Killing the Shadows""",0.465337,7.5,9.557682,2006,"""Comedy,Drama,History""",0.585477,3,4.577074
3,"""classic drama movies""","""tt0187231""","""Barking at the Stars""",0.434227,8.6,9.091332,1998,"""Comedy,Romance""",0.596687,3,4.361507


In [5]:
import numpy as np
import polars as pl


# -----------------------------------------------------------
# Ranking metric utilities
# -----------------------------------------------------------

def dcg(relevances):
    """Discounted Cumulative Gain of relevances given in ranked order.

    The item at 0-based position ``i`` contributes ``rel_i * 1 / log2(i + 2)``,
    so the first item is undiscounted. An empty input sums to 0.
    """
    gains = np.array(relevances)
    ranks = np.arange(len(relevances)) + 2  # 2, 3, ..., n + 1
    weights = 1 / np.log2(ranks)
    return (gains * weights).sum()


def ndcg_at_k(true_rels, pred_scores, k=10):
    """Compute NDCG@k for a single query.

    Args:
        true_rels: Ground-truth relevance per document (any array-like).
        pred_scores: Predicted score per document, same length and order as
            ``true_rels`` (any array-like; higher means ranked earlier).
        k: Cutoff; only the ``k`` highest-scored documents are considered.

    Returns:
        DCG of the predicted top-k divided by the DCG of the ideal top-k,
        or 0.0 when the ideal DCG is not positive.
    """
    # Accept plain lists as well as arrays (negating a list would raise).
    true_rels = np.asarray(true_rels)
    pred_scores = np.asarray(pred_scores)

    # Stable sort: documents with tied scores keep their input order, so the
    # metric is reproducible from run to run.
    top_idx = np.argsort(-pred_scores, kind="stable")[:k]
    dcg_k = dcg(true_rels[top_idx])

    # Ideal DCG: relevances sorted in descending order, cut at k.
    ideal_top = np.sort(true_rels)[::-1][:k]
    idcg_k = dcg(ideal_top)

    return dcg_k / idcg_k if idcg_k > 0 else 0.0


def precision_at_k(true_rels, pred_scores, k=10, threshold=1):
    """Precision@k for a single query.

    A document counts as relevant when its relevance is ``>= threshold``.
    With the default ``threshold=1``: 0 is irrelevant, >= 1 is relevant.

    Args:
        true_rels: Ground-truth relevance per document (any array-like).
        pred_scores: Predicted score per document, aligned with ``true_rels``.
        k: Cutoff; only the ``k`` highest-scored documents are considered.
        threshold: Minimum relevance for a document to count as relevant.

    Returns:
        Fraction of relevant documents among the retrieved top-k (the
        denominator is the number of documents actually retrieved, which is
        below ``k`` for short lists). 0.0 when nothing is retrieved.
    """
    # Accept plain lists as well as arrays (negating a list would raise).
    true_rels = np.asarray(true_rels)
    pred_scores = np.asarray(pred_scores)

    # Stable sort keeps tied scores in input order => reproducible result.
    top_idx = np.argsort(-pred_scores, kind="stable")[:k]
    if top_idx.size == 0:
        # Averaging an empty selection would yield NaN plus a RuntimeWarning.
        return 0.0

    return np.mean(true_rels[top_idx] >= threshold)


def average_precision(true_rels, pred_scores):
    """Average precision (AP) for a single query.

    A document is relevant when its relevance is ``> 0``. AP is the mean of
    precision@i taken at every rank ``i`` that holds a relevant document.

    Args:
        true_rels: Ground-truth relevance per document (any array-like).
        pred_scores: Predicted score per document, aligned with ``true_rels``.

    Returns:
        The average precision, or 0.0 when the query has no relevant document.
    """
    # Accept plain lists as well as arrays (negating a list would raise).
    true_rels = np.asarray(true_rels)
    pred_scores = np.asarray(pred_scores)

    # Stable sort keeps tied scores in input order => reproducible result.
    order = np.argsort(-pred_scores, kind="stable")
    is_relevant = true_rels[order] > 0
    if not is_relevant.any():
        return 0.0

    hits_so_far = np.cumsum(is_relevant)           # relevant docs seen up to each rank
    ranks = np.arange(1, is_relevant.size + 1)     # 1-based rank positions
    return np.mean(hits_so_far[is_relevant] / ranks[is_relevant])


# -----------------------------------------------------------
# Main function to evaluate the WHOLE dataset
# -----------------------------------------------------------

def evaluate_ranker(
    ltr_df: pl.DataFrame,
    ranker,
    feature_cols: list[str],
    k=10,
):
    """Compute mean NDCG@k, Precision@k and MAP over all queries.

    Args:
        ltr_df: Dataset with ``query_id``, ``label`` and the feature columns.
        ranker: Fitted model exposing ``predict(matrix)``.
        feature_cols: Feature column names, in the order the model expects.
        k: Cutoff for the top-k metrics.

    Returns:
        Dict with keys ``"NDCG@<k>"``, ``"Precision@<k>"``, ``"MAP"`` (means
        over queries) and ``"Num Queries"``. When ``ltr_df`` has no rows the
        three means are NaN and ``"Num Queries"`` is 0.
    """
    per_query = []

    # One iteration per query; the group key itself is not needed.
    for _, query_df in ltr_df.group_by("query_id"):
        features = query_df.select(feature_cols).to_numpy()
        y_true = query_df["label"].to_numpy()
        y_pred = ranker.predict(features)

        ndcg = ndcg_at_k(y_true, y_pred, k=k)
        prec = precision_at_k(y_true, y_pred, k=k)
        ap = average_precision(y_true, y_pred)

        per_query.append((ndcg, prec, ap))

    ndcg_key = "NDCG@{}".format(k)
    precision_key = "Precision@{}".format(k)

    # With no queries the score table would be 1-D and column indexing would
    # fail; report explicit NaNs instead.
    if not per_query:
        nan = float("nan")
        return {ndcg_key: nan, precision_key: nan, "MAP": nan, "Num Queries": 0}

    # Rows = queries, columns = (ndcg, precision, ap).
    scores = np.array(per_query)

    return {
        ndcg_key: float(np.mean(scores[:, 0])),
        precision_key: float(np.mean(scores[:, 1])),
        "MAP": float(np.mean(scores[:, 2])),
        "Num Queries": len(scores),
    }


In [6]:
# NOTE(review): `ltr_df` is the same data the ranker was fitted on, so these
# numbers describe fit on the training set, not generalisation — evaluate on
# held-out queries before drawing conclusions.
metrics = evaluate_ranker(ltr_df, ranker, feature_cols, k=10)
metrics




{'NDCG@10': 1.0,
 'Precision@10': 1.0,
 'MAP': 0.9886046084368605,
 'Num Queries': 210}