In [1]:
# --- imports & config ---
import json, numpy as np, pandas as pd
from pathlib import Path
DATA_PATH = "dataset_Recomendacion_villa_de_leyva_eleccion (2).csv"   # ajústalo
CAT_PATH  = "catalogo_vdl_lugares_unico.csv"               # ajústalo
SEP, ENC  = ";", "utf-8-sig"
RANDOM_SEED = 42
K_LIST = [3, 5, 10]
TEST_USER_FRAC = 0.20     # % de usuarios para hold-out honesto

# --- normalizador de encabezados con caracteres raros ---
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    ren = {
        "compa¤¡a_viaje": "compania_viaje",
        "‚poca_visita": "epoca_visita",
    }
    ren = {k:v for k,v in ren.items() if k in df.columns}
    return df.rename(columns=ren)

# --- métricas Top-K ---
def _dcg_at_k(rels): return float(np.sum([r/np.log2(i+2) for i,r in enumerate(rels)]))

def recall_at_k(g, k, score_col, rel_col):
    g = g.sort_values(score_col, ascending=False)
    topk = g.head(k)
    tot = g[rel_col].sum()
    return float("nan") if tot==0 else float(topk[rel_col].sum()/tot)

def ndcg_at_k(g, k, score_col, rel_col):
    g = g.sort_values(score_col, ascending=False)
    dcg  = _dcg_at_k(g.head(k)[rel_col].tolist())
    idcg = _dcg_at_k(sorted(g[rel_col].tolist(), reverse=True)[:k])
    return float("nan") if idcg==0 else float(dcg/idcg)

def coverage_at_k(df, k, score_col, item_col="nombre_sitio"):
    topk = (df.sort_values(["id_usuario", score_col], ascending=[True, False])
              .groupby("id_usuario").head(k))
    return float(topk[item_col].nunique() / df[item_col].nunique())


In [2]:
# --- Config de lectura ---
ENC = "utf-8-sig"
SEP = ";"
DATA_PATH = "dataset_Recomendacion_villa_de_leyva_eleccion (2).csv"  # <-- usa el tuyo

import pandas as pd
import numpy as np

# (opcional) inspección rápida
# with open(DATA_PATH, "r", encoding=ENC) as f:
#     for _ in range(3): print(f.readline().rstrip("\n"))

# --- carga ---
df = pd.read_csv(DATA_PATH, sep=SEP, encoding=ENC)

# Normaliza nombres (tu función)
df = normalize_columns(df)        # asegura minúsculas, sin acentos/espacios
df.columns = df.columns.str.strip()

# ---- Parche de esquema: renombrar sinónimos / grafías esperadas ----
CANDIDATES = {
    "compania_viaje":    ["compania_viaje", "compañia_viaje", "compan_a_viaje", "companaviaje"],
    "costo_entrada":     ["costo_entrada", "costoentrada", "precio_entrada", "costo"],
    "afluencia_promedio":["afluencia_promedio", "afluencia", "afluencia_prom"],
    "duracion_esperada": ["duracion_esperada", "duracion_estimada", "duracion", "tiempo_esperado"],
    "presupuesto_estimado": ["presupuesto_estimado","presupuesto","budget"],
    "tipo_turista_preferido": ["tipo_turista_preferido","preferencia","perfil_preferido"],
}

for tgt, opts in CANDIDATES.items():
    if tgt not in df.columns:
        for c in opts:
            if c in df.columns:
                df = df.rename(columns={c: tgt})
                break

# Validación mínima antes de features:
REQ = ["costo_entrada","presupuesto_estimado","tipo_sitio",
       "tipo_turista_preferido","epoca_visita"]
missing = [c for c in REQ if c not in df.columns]
if missing:
    raise KeyError(f"Faltan columnas obligatorias: {missing}\nDisponibles: {list(df.columns)}")

# Asegura tipos numéricos (por si llegaron como texto)
for c in ["costo_entrada","presupuesto_estimado","edad","frecuencia_viaje",
          "sitios_visitados","calificacion_sitios_previos",
          "tiempo_estancia_promedio","afluencia_promedio",
          "duracion_esperada","admite_mascotas","rating_usuario"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# quita posibles fugas si llegara a existir
if "sitio_recomendado" in df.columns and df["sitio_recomendado"].dtype == object:
    # si por error viene con nombres de sitio, repara usando rating>=4
    df["sitio_recomendado"] = (df["rating_usuario"] >= 4.0).astype(int)

# --- columnas por tipo (ya con nombres normalizados) ---
CAT_COLS = [
    "nacionalidad","origen","tipo_turista_preferido","compania_viaje",
    "restricciones_movilidad","nombre_sitio","tipo_sitio","accesibilidad_general",
    "idioma_info","ubicacion_geografica","clima_predominante","epoca_visita"
]
NUM_COLS = [
    "edad","frecuencia_viaje","presupuesto_estimado","sitios_visitados",
    "calificacion_sitios_previos","tiempo_estancia_promedio","costo_entrada",
    "afluencia_promedio","duracion_esperada","admite_mascotas"
]

# --- AFINIDAD (actualizado a tus tipos reales) ---
AFINIDAD = {
    "cultural": {"museo":0.9,"centro_historico":0.9,"arquitectura":0.85,"arqueologico":0.85,"plaza":0.7,"religioso":0.7},
    "naturaleza": {"naturaleza":0.95,"senderismo":0.9,"mirador":0.8},
    "aventura": {"senderismo":0.9,"parque_tematico":0.75,"mirador":0.75,"naturaleza":0.7},
    "gastronomico": {"gastronomico":0.95},
    "relax": {"mirador":0.9,"plaza":0.8,"naturaleza":0.75,"arquitectura":0.8},
}

def make_features(X: pd.DataFrame) -> pd.DataFrame:
    X = X.copy()
    # Costo relativo (evita división por cero)
    denom = (X["presupuesto_estimado"]*0.15).replace(0, np.nan)
    X["ratio_costo_presu"] = (X["costo_entrada"] / denom).clip(0, 3).fillna(0)
    # Afinidad perfil–tipo
    X["afinidad_tipo"] = X.apply(
        lambda r: AFINIDAD.get(str(r["tipo_turista_preferido"]), {}).get(str(r["tipo_sitio"]), 0.5), axis=1
    )
    # Cruces categóricos
    X["x_tipoTur__tipoSit"] = X["tipo_turista_preferido"].astype(str) + "×" + X["tipo_sitio"].astype(str)
    X["x_epoca__tipoSit"]   = X["epoca_visita"].astype(str) + "×" + X["tipo_sitio"].astype(str)
    return X

df = make_features(df)

# Target binario para la tarea de clasificación
df["y_like"] = (df["rating_usuario"] >= 4.0).astype(int)

# Columnas extendidas para el modelado
CAT_COLS_X = CAT_COLS + ["x_tipoTur__tipoSit","x_epoca__tipoSit"]
NUM_COLS_X = NUM_COLS + ["ratio_costo_presu","afinidad_tipo"]

print("✅ Esquema OK. Ejemplo columnas:", df.columns[:15].tolist())
print("Shape:", df.shape)


✅ Esquema OK. Ejemplo columnas: ['id_usuario', 'edad', 'nacionalidad', 'origen', 'tipo_turista_preferido', 'compania_viaje', 'frecuencia_viaje', 'restricciones_movilidad', 'presupuesto_estimado', 'sitios_visitados', 'calificacion_sitios_previos', 'tiempo_estancia_promedio', 'nombre_sitio', 'tipo_sitio', 'costo_entrada']
Shape: (100000, 30)


In [None]:
# --- carga ---
df = pd.read_csv(DATA_PATH, sep=SEP, encoding=ENC)
df = normalize_columns(df)

# quita posibles fugas
if "sitio_recomendado" in df.columns:
    df = df.drop(columns=["sitio_recomendado"])

# columnas por tipo (ajústalas si cambian en tu dataset)
CAT_COLS = [
    "nacionalidad","origen","tipo_turista_preferido","compañia_viaje",
    "restricciones_movilidad","nombre_sitio","tipo_sitio","accesibilidad_general",
    "idioma_info","ubicacion_geografica","clima_predominante","epoca_visita"
]
NUM_COLS = [
    "edad","frecuencia_viaje","presupuesto_estimado","sitios_visitados",
    "calificacion_sitios_previos","tiempo_estancia_promedio","costo_entrada",
    "afluencia_promedio","duracion_esperada","admite_mascotas"
]

# --- features de interacción usuario×sitio ---
AFINIDAD = {
    "cultural": {"museo":0.9,"histórico":0.9,"religioso":0.7,"arquitectura":0.85,"museo_religioso":0.8,"arqueologico":0.85,"plaza":0.7},
    "naturaleza": {"natural":0.95,"senderismo":0.9,"mirador":0.8,"parque_urbano":0.6},
    "aventura": {"aventura":0.95,"senderismo":0.85,"parque_tematico":0.7},
    "gastronómico": {"gastronomico":0.95,"enoturismo":0.9,"artesanal":0.6,"plaza":0.6},
    "relax_fotografía": {"mirador":0.9,"plaza":0.8,"arquitectura":0.8,"natural":0.75},
}

def make_features(X: pd.DataFrame) -> pd.DataFrame:
    X = X.copy()
    # costo relativo al 15% del presupuesto diario
    X["ratio_costo_presu"] = (X["costo_entrada"] / (X["presupuesto_estimado"]*0.15)).clip(0, 3)
    # afinidad perfil-tipo_sitio
    X["afinidad_tipo"] = X.apply(lambda r: AFINIDAD.get(r["tipo_turista_preferido"],{}).get(r["tipo_sitio"],0.5), axis=1)
    # cruces categóricos (ayuda a modelos lineales y deja explícita la interacción)
    X["x_tipoTur__tipoSit"] = X["tipo_turista_preferido"] + "×" + X["tipo_sitio"]
    X["x_epoca__tipoSit"]    = X["epoca_visita"] + "×" + X["tipo_sitio"]
    return X

df = make_features(df)

# añade variable binaria "like" para clasificación
df["y_like"] = (df["rating_usuario"] >= 4.0).astype(int)

# columnas extendidas
CAT_COLS_X = CAT_COLS + ["x_tipoTur__tipoSit","x_epoca__tipoSit"]
NUM_COLS_X = NUM_COLS + ["ratio_costo_presu","afinidad_tipo"]

In [None]:
rng = np.random.default_rng(RANDOM_SEED)
users = df["id_usuario"].drop_duplicates().to_numpy()
test_users = set(rng.choice(users, size=int(len(users)*TEST_USER_FRAC), replace=False))

train_df = df[~df["id_usuario"].isin(test_users)].reset_index(drop=True)
test_df  = df[ df["id_usuario"].isin(test_users)].reset_index(drop=True)

print("Usuarios train/test:", train_df["id_usuario"].nunique(), test_df["id_usuario"].nunique())
print("Filas train/test:", train_df.shape, test_df.shape)

Usuarios train/test: 16000 4000
Filas train/test: (80004, 29) (19996, 29)


In [5]:
from pycaret.classification import (
    setup, compare_models, tune_model, blend_models,
    finalize_model, predict_model, pull, save_model
)
import pandas as pd
import numpy as np

# 1) Copias profundas y tipos explícitos (evita vistas read-only)
train_df = train_df.copy(deep=True)

# Asegura que id_usuario esté como string (pero NO como feature)
train_df["id_usuario"] = train_df["id_usuario"].astype("string")

for c in CAT_COLS_X:
    if c in train_df.columns:
        train_df[c] = train_df[c].astype("string").copy()

for c in NUM_COLS_X:
    if c in train_df.columns:
        train_df[c] = pd.to_numeric(train_df[c], errors="coerce").astype("float64").copy()

# 2) Setup — sin SMOTE y sin paralelismo en la primera corrida
setup_cls = setup(
    data = train_df[["id_usuario"] + CAT_COLS_X + NUM_COLS_X + ["y_like"]].copy(),
    target = "y_like",
    fold = 5,
    fold_strategy = "groupkfold",
    fold_groups = "id_usuario",          # usa el id para agrupar
    categorical_features = CAT_COLS_X,
    ignore_features = ["id_usuario"],     # << clave: no lo pases como feature
    remove_multicollinearity = True,
    multicollinearity_threshold = 0.95,
    imputation_type = "simple",
    fix_imbalance = True,                # << desactivar por ahora
    n_jobs = 1,                           # << sin paralelismo (evita bug loky/writeable)
    verbose = False
)

best3 = compare_models(n_select=3, sort="AUC")
tuned = [tune_model(m, optimize="AUC") for m in best3]
blend = blend_models(tuned)
final_cls = finalize_model(blend)
save_model(final_cls, "modelo_cls_like_v2")

# Si esto corre OK, ya puedes volver a activar fix_imbalance=True y/o subir n_jobs




Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8499,0.9227,0.762,0.7814,0.7715,0.6598,0.6599,27.512
ridge,Ridge Classifier,0.8317,0.9221,0.8584,0.7022,0.7725,0.641,0.6492,3.596
lda,Linear Discriminant Analysis,0.8318,0.922,0.8585,0.7022,0.7725,0.6411,0.6493,4.354
lightgbm,Light Gradient Boosting Machine,0.8493,0.9216,0.7514,0.786,0.7683,0.6566,0.657,7.046
ada,Ada Boost Classifier,0.8387,0.9193,0.8181,0.7299,0.7714,0.6474,0.65,10.054
catboost,CatBoost Classifier,0.847,0.9193,0.7468,0.7831,0.7645,0.6512,0.6516,120.774
rf,Random Forest Classifier,0.8472,0.9069,0.7583,0.777,0.7675,0.6537,0.6539,16.224
et,Extra Trees Classifier,0.8313,0.9005,0.6625,0.7961,0.7232,0.6033,0.6085,22.644
lr,Logistic Regression,0.7373,0.8287,0.7736,0.5789,0.6622,0.4544,0.4672,11.216
dt,Decision Tree Classifier,0.7857,0.7608,0.6864,0.675,0.6806,0.5193,0.5194,4.774


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


KeyboardInterrupt: 

In [9]:
# ================= PARCHE REGRESIÓN =================
from pycaret.regression import (
    setup as setup_reg, compare_models as compare_reg, tune_model as tune_reg,
    blend_models as blend_reg, finalize_model as finalize_reg, predict_model as predict_reg,
    pull as pull_reg, save_model as save_reg
)
import pandas as pd, numpy as np

# 0) Preparar train con copias profundas y tipos explícitos
train_reg = train_df[["id_usuario"] + CAT_COLS_X + NUM_COLS_X + ["rating_usuario"]].copy(deep=True)

# Asegurar tipos
train_reg["id_usuario"] = train_reg["id_usuario"].astype("string")
for c in CAT_COLS_X:
    if c in train_reg.columns:
        train_reg[c] = train_reg[c].astype("string").copy()
for c in NUM_COLS_X:
    if c in train_reg.columns:
        train_reg[c] = pd.to_numeric(train_reg[c], errors="coerce").astype("float64").copy()

# Target numérico y sin NaNs / inf
train_reg["rating_usuario"] = pd.to_numeric(train_reg["rating_usuario"], errors="coerce").astype("float64")
train_reg = train_reg.replace([np.inf, -np.inf], np.nan)

# 1) Setup — sin paralelismo, ignorando id_usuario (y opcionalmente nombre_sitio)
IGNORE_FEATS = ["id_usuario"]  # agrega "nombre_sitio" si quieres evitar OHE de alta cardinalidad
setup_reg(
    data = train_reg,
    target = "rating_usuario",
    session_id = RANDOM_SEED,
    fold = 5,
    fold_strategy = "groupkfold",
    fold_groups = "id_usuario",
    categorical_features = CAT_COLS_X,
    ignore_features = IGNORE_FEATS,
    remove_multicollinearity = True,
    multicollinearity_threshold = 0.95,
    imputation_type = "simple",
    n_jobs = 1,            # evita errores de loky/writeable en 1ra pasada
    verbose = False
)

# 2) Benchmark, tuning y ensamble
best3r = compare_reg(n_select=3, sort="RMSE")
lb_reg = pull_reg(); lb_reg.to_csv("leaderboard_regresion.csv", index=False, encoding="utf-8-sig")

tunedr = [tune_reg(m, optimize="RMSE") for m in best3r]
blendr = blend_reg(tunedr)
final_reg = finalize_reg(blendr)
save_reg(final_reg, "modelo_reg_rating_v2")

# 3) Evaluación en TEST (alineamiento seguro para métricas Top-K)
# --- evaluación en TEST (sin columnas duplicadas) ---
Xtest = test_df[["id_usuario"] + CAT_COLS_X + NUM_COLS_X].copy(deep=True)
Xtest["id_usuario"] = Xtest["id_usuario"].astype("string")
for c in CAT_COLS_X:
    if c in Xtest.columns:
        Xtest[c] = Xtest[c].astype("string")
for c in NUM_COLS_X:
    if c in Xtest.columns:
        Xtest[c] = pd.to_numeric(Xtest[c], errors="coerce").astype("float64")

# Guardamos ytest SOLO con la verdad (rating_usuario) para evitar duplicados
ytest = test_df[["rating_usuario"]].reset_index(drop=True)

# Predicción (predict_model devuelve X original + columna 'prediction_label')
pred = predict_reg(final_reg, data=Xtest).reset_index(drop=True)
pred = pred.rename(columns={"prediction_label": "rating_prev"})

# Nos quedamos con UNA SOLA columna de id y de item:
# - Si 'nombre_sitio' está en pred (porque lo usamos como feature), lo conservamos de ahí.
# - Si NO está (porque lo ignoraste), lo traemos desde test_df.
cols_keep = ["id_usuario", "rating_prev"]
if "nombre_sitio" in pred.columns:
    cols_keep.append("nombre_sitio")
test_pred = pred[cols_keep].copy()

if "nombre_sitio" not in test_pred.columns:
    test_pred = pd.concat([test_pred, test_df[["nombre_sitio"]].reset_index(drop=True)], axis=1)

# Añadimos la verdad del rating sin duplicar id/ítem
test_pred["rating_usuario"] = ytest["rating_usuario"].values
test_pred["y_true_rel"] = (test_pred["rating_usuario"] >= 4.0).astype(int)

# (opcional) sanity check: que sólo haya UNA columna 'id_usuario' y 'nombre_sitio'
# print([c for c in test_pred.columns if c == "id_usuario"])
# print([c for c in test_pred.columns if c == "nombre_sitio"])

# --- Métricas Top-K ---
metrics_reg = {}
for K in K_LIST:
    recalls = [recall_at_k(g, K, "rating_prev", "y_true_rel") for _, g in test_pred.groupby("id_usuario")]
    ndcgs   = [ndcg_at_k(g,   K, "rating_prev", "y_true_rel") for _, g in test_pred.groupby("id_usuario")]
    cov     = coverage_at_k(test_pred, K, "rating_prev", "nombre_sitio")
    metrics_reg[K] = {"recall": float(np.nanmean(recalls)),
                      "ndcg":   float(np.nanmean(ndcgs)),
                      "coverage": cov}

import json
print("Top-K REGRESIÓN (global):", json.dumps(metrics_reg, indent=2, ensure_ascii=False))
with open("metrics_regresion_topk.json","w",encoding="utf-8") as f:
    json.dump(metrics_reg, f, ensure_ascii=False, indent=2)

# ================= FIN PARCHE =================



Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.3808,0.2256,0.475,0.5995,0.1161,0.1252,9.48
lightgbm,Light Gradient Boosting Machine,0.3808,0.2257,0.4751,0.5992,0.1161,0.1251,2.642
catboost,CatBoost Regressor,0.3824,0.2279,0.4774,0.5953,0.1166,0.1255,17.666
br,Bayesian Ridge,0.3857,0.2326,0.4822,0.5871,0.1182,0.1272,2.492
lr,Linear Regression,0.3859,0.2328,0.4824,0.5867,0.1182,0.1273,2.376
ridge,Ridge Regression,0.3859,0.2327,0.4824,0.5868,0.1182,0.1273,2.198
rf,Random Forest Regressor,0.3872,0.2339,0.4836,0.5847,0.1179,0.1269,48.604
omp,Orthogonal Matching Pursuit,0.3908,0.2389,0.4888,0.5758,0.1199,0.1291,2.19
ada,AdaBoost Regressor,0.3972,0.2436,0.4936,0.5675,0.1202,0.13,6.812
et,Extra Trees Regressor,0.3994,0.2496,0.4996,0.5569,0.1215,0.1307,48.084


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3783,0.2232,0.4724,0.6028,0.1151,0.1239
1,0.3808,0.2266,0.4761,0.6023,0.1163,0.125
2,0.3837,0.2277,0.4771,0.5972,0.1165,0.1262
3,0.379,0.2217,0.4708,0.6025,0.1148,0.124
4,0.3855,0.2315,0.4811,0.5878,0.1181,0.1278
Mean,0.3815,0.2261,0.4755,0.5985,0.1162,0.1254
Std,0.0028,0.0034,0.0036,0.0057,0.0012,0.0015


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3782,0.2233,0.4726,0.6026,0.1151,0.1238
1,0.3818,0.2276,0.477,0.6007,0.1165,0.1253
2,0.3836,0.2283,0.4779,0.596,0.1168,0.1263
3,0.3788,0.2219,0.4711,0.602,0.1148,0.1239
4,0.3851,0.2316,0.4813,0.5875,0.1181,0.1277
Mean,0.3815,0.2266,0.476,0.5978,0.1163,0.1254
Std,0.0027,0.0035,0.0037,0.0056,0.0012,0.0015


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1413
[LightGBM] [Info] Number of data points in the train set: 35814, number of used features: 95
[LightGBM] [Info] Start training from score 3.317253
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007377 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1411
[LightGBM] [Info] Number of data points in the train set: 35814, number of used features: 95
[LightGBM] [Info] Start training from score 3.317398
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007305 seconds.
You can set `force_row_

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3771,0.222,0.4712,0.6049,0.1149,0.1235
1,0.3807,0.2259,0.4753,0.6037,0.1161,0.125
2,0.3832,0.2273,0.4768,0.5979,0.1166,0.1262
3,0.3776,0.221,0.4702,0.6036,0.1147,0.1237
4,0.3838,0.2305,0.4802,0.5894,0.1181,0.1276
Mean,0.3805,0.2254,0.4747,0.5999,0.1161,0.1252
Std,0.0028,0.0035,0.0037,0.0058,0.0012,0.0015


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.377,0.2219,0.4711,0.6051,0.1148,0.1235
1,0.38,0.2252,0.4745,0.6049,0.1159,0.1247
2,0.3827,0.227,0.4764,0.5985,0.1165,0.126
3,0.3776,0.2207,0.4698,0.6042,0.1146,0.1236
4,0.3832,0.2294,0.479,0.5914,0.1176,0.1271
Mean,0.3801,0.2248,0.4742,0.6008,0.1159,0.125
Std,0.0025,0.0032,0.0034,0.0053,0.0011,0.0014


Transformation Pipeline and Model Successfully Saved


Top-K REGRESIÓN (global): {
  "3": {
    "recall": 0.888768695456969,
    "ndcg": 0.8813271402023607,
    "coverage": 1.0
  },
  "5": {
    "recall": 0.9686028565816427,
    "ndcg": 0.9078232291455232,
    "coverage": 1.0
  },
  "10": {
    "recall": 0.9984819148637651,
    "ndcg": 0.9199722878177368,
    "coverage": 1.0
  }
}


In [33]:
import numpy as np
import pandas as pd
from pycaret.classification import load_model as load_cls, predict_model as predict_cls
from pycaret.regression import     load_model as load_reg,  predict_model as predict_reg

CLS = load_cls("modelo_cls_like_v2")
REG = load_reg("modelo_reg_rating_v2")
CAT = pd.read_csv(CAT_PATH, sep=SEP, encoding=ENC)

IGNORED_AT_TRAIN = ["id_usuario"]  # se ignoró en setup -> NO pasarla al pipeline

def _build_candidates(user: dict) -> pd.DataFrame:
    cand = CAT.copy()
    for k, v in user.items():
        cand[k] = v
    cand = normalize_columns(cand)
    cand = make_features(cand)                 # mismas features que en train
    return cand

def _prepare_for_pipeline(X: pd.DataFrame) -> pd.DataFrame:
    X = X.copy()
    # quitar columnas ignoradas en setup
    X.drop(columns=[c for c in IGNORED_AT_TRAIN if c in X.columns], inplace=True, errors="ignore")
    # (opcional) asegurar tipos como en train
    for c in CAT_COLS_X:
        if c in X.columns: X[c] = X[c].astype("string")
    for c in NUM_COLS_X:
        if c in X.columns: X[c] = pd.to_numeric(X[c], errors="coerce").astype("float64")
    return X

def _prob_like_from_hard_voting(pipeline, Xdf: pd.DataFrame) -> pd.Series:
    # 1) preprocesa con el pipeline sin el último paso (modelo)
    pre = pipeline[:-1]
    Xenc = pre.transform(Xdf)

    clf = pipeline.named_steps.get('trained_model', pipeline.steps[-1][1])
    proba_list = []

    for est in getattr(clf, 'estimators_', []):
        if hasattr(est, "predict_proba"):
            p = est.predict_proba(Xenc)
            classes = getattr(est, "classes_", None)
            if classes is not None:
                cls_list = list(classes)
                if 1 in cls_list: pos = cls_list.index(1)
                elif "1" in cls_list: pos = cls_list.index("1")
                else: pos = p.shape[1]-1
            else:
                pos = p.shape[1]-1
            proba_list.append(p[:, pos])
        elif hasattr(est, "decision_function"):
            df = est.decision_function(Xenc)
            if df.ndim == 1:
                proba_list.append(1.0/(1.0 + np.exp(-df)))            # logística
            else:
                col = 1 if df.shape[1] > 1 else 0
                proba_list.append(1.0/(1.0 + np.exp(-df[:, col])))

    if proba_list:
        return pd.Series(np.mean(np.column_stack(proba_list), axis=1), index=Xdf.index)

    # último recurso: usar predict_model por si expone alguna columna de score
    scored = predict_cls(CLS, data=Xdf, raw_score=True)
    if "Score_1" in scored.columns: return scored["Score_1"]
    if "Score" in scored.columns and "prediction_label" in scored.columns:
        return pd.Series(np.where(scored["prediction_label"]==1, scored["Score"], 1.0-scored["Score"]), index=Xdf.index)
    if "prediction_score" in scored.columns and "prediction_label" in scored.columns:
        return pd.Series(np.where(scored["prediction_label"]==1, scored["prediction_score"], 1.0-scored["prediction_score"]), index=Xdf.index)

    raise AttributeError("No hay forma de obtener probas del VotingClassifier (hard). Considera reentrenar con voting='soft'.")

def recomendar_top3_cls(user: dict) -> pd.DataFrame:
    X0 = _build_candidates(user)
    X  = _prepare_for_pipeline(X0)                 # << quita id_usuario y ajusta tipos

    # Intento directo (por si NO es 'hard'); si falla, uso agregación de base estimators
    try:
        proba = CLS.predict_proba(X)
        classes = getattr(CLS, "classes_", None)
        if classes is not None and (1 in list(classes) or "1" in list(classes)):
            pos = list(classes).index(1) if 1 in list(classes) else list(classes).index("1")
        else:
            pos = proba.shape[1]-1
        prob_like = pd.Series(proba[:, pos], index=X.index)
    except Exception:
        prob_like = _prob_like_from_hard_voting(CLS, X)

    scored = X0.assign(prob_like=prob_like.values)  # unimos a los campos legibles (tipo_sitio, etc.)

    cols = ["nombre_sitio","tipo_sitio","costo_entrada","accesibilidad_general",
            "afinidad_tipo","ratio_costo_presu","prob_like"]
    cols = [c for c in cols if c in scored.columns]
    return scored[cols].nlargest(3, "prob_like").reset_index(drop=True)

def recomendar_top3_reg(user: dict) -> pd.DataFrame:
    X0 = _build_candidates(user)
    X  = _prepare_for_pipeline(X0)                 # opcional (predict_model ya lo ignora)
    scored = predict_reg(REG, data=X)
    pred_col = "prediction_label" if "prediction_label" in scored.columns else ("Label" if "Label" in scored.columns else None)
    if pred_col is None:
        raise KeyError(f"No encuentro columna de predicción en regresión. Tengo: {list(scored.columns)[:20]}")
    scored = scored.rename(columns={pred_col: "rating_prev"})

    cols = ["nombre_sitio","tipo_sitio","costo_entrada","accesibilidad_general",
            "afinidad_tipo","ratio_costo_presu","rating_prev"]
    cols = [c for c in cols if c in X0.columns] + ["rating_prev"]
    out = X0.join(scored[["rating_prev"]]).nlargest(3, "rating_prev")[cols].reset_index(drop=True)
    return out



usuario_demo = {
    "id_usuario":"U_demo","edad":30,"nacionalidad":"Colombia","origen":"Bogotá",
    "tipo_turista_preferido":"gastronomía","compañia_viaje":"solo",
    "frecuencia_viaje":2,"restricciones_movilidad":"ninguna",
    "presupuesto_estimado":10000,"sitios_visitados":6,
    "calificacion_sitios_previos":4.3,"tiempo_estancia_promedio":90,
    "epoca_visita":"fin_de_semana_puente"
}

print("Top-3 (clasificación):")
print(recomendar_top3_cls(usuario_demo).to_string(index=False))

print("\nTop-3 (regresión):")
print(recomendar_top3_reg(usuario_demo).to_string(index=False))



Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Top-3 (clasificación):
                                nombre_sitio      tipo_sitio  costo_entrada accesibilidad_general  afinidad_tipo  ratio_costo_presu  prob_like
Claustro de San Agustín (Instituto Humboldt)       religioso           5000                  alta            0.5                3.0   0.885336
         Eco Parque Conscientia Universallis parque_tematico          27066                  alta            0.5                3.0   0.883514
             Iglesia Colonial de Santa Sofía       religioso           5000                  alta            0.5                3.0   0.883024

Top-3 (regresión):
               nombre_sitio   tipo_sitio  costo_entrada accesibilidad_general  afinidad_tipo  ratio_costo_presu  rating_prev
Jardín Paleobotánico (UNAL)    histórico              0                 media            0.5                0.0     3.638123
Antigua Estación de Policía

In [21]:
# Clasificación
clf = final_cls.named_steps["trained_model"]
print(clf)               # tipo (VotingClassifier)
print([type(e).__name__ for e in clf.estimators_])  # modelos base

# Regresión
reg = final_reg.named_steps["trained_model"]
print(reg)               # tipo (VotingRegressor)
print([type(e).__name__ for e in reg.estimators_])  # modelos base


KeyError: 'trained_model'

In [None]:
from sklearn.ensemble import VotingClassifier, VotingRegressor, StackingClassifier, StackingRegressor

def inspect_pycaret_pipeline(pipe):
    # 1) nombres de pasos
    step_names = [name for name, _ in pipe.steps]
    print("Pasos del Pipeline:", step_names)

    # 2) último paso = estimador final
    last_name, last_step = pipe.steps[-1]
    print(f"Paso final: {last_name}  →  {type(last_step).__name__}")

    # 3) según el tipo, muestra su composición
    if isinstance(last_step, (VotingClassifier, VotingRegressor)):
        # estimadores base (usamos estimators_ si está disponible; si no, estimators)
        base = getattr(last_step, "estimators_", None)
        if base is None:
            base = [est for _, est in getattr(last_step, "estimators", [])]
        print("Tipo Voting:", getattr(last_step, "voting", "—"))
        print("Estimadores base:", [type(e).__name__ for e in base])

    elif isinstance(last_step, (StackingClassifier, StackingRegressor)):
        base = getattr(last_step, "estimators_", None)
        if base is None:
            base = [est for _, est in getattr(last_step, "estimators", [])]
        meta = getattr(last_step, "final_estimator_", getattr(last_step, "final_estimator", None))
        print("Estimadores base (stacking):", [type(e).__name__ for e in base])
        print("Meta-modelo:", type(meta).__name__ if meta is not None else "—")

    else:
        print("Modelo único (no es ensamble).")

# Inspecciona tus modelos guardados:
inspect_pycaret_pipeline(CLS)   # clasificación
inspect_pycaret_pipeline(REG)   # regresión


Pasos del Pipeline: ['numerical_imputer', 'categorical_imputer', 'ordinal_encoding', 'onehot_encoding', 'rest_encoding', 'remove_multicollinearity', 'clean_column_names', 'actual_estimator']
Paso final: actual_estimator  →  VotingClassifier
Tipo Voting: hard
Estimadores base: ['GradientBoostingClassifier', 'RidgeClassifier', 'LinearDiscriminantAnalysis']
Pasos del Pipeline: ['numerical_imputer', 'categorical_imputer', 'ordinal_encoding', 'onehot_encoding', 'rest_encoding', 'remove_multicollinearity', 'clean_column_names', 'actual_estimator']
Paso final: actual_estimator  →  VotingRegressor
Tipo Voting: —
Estimadores base: ['GradientBoostingRegressor', 'LGBMRegressor', 'CatBoostRegressor']


In [23]:
_, est_final = CLS.steps[-1]
print(list(est_final.get_params().keys())[:20])  # algunas claves


['estimators', 'flatten_transform', 'n_jobs', 'verbose', 'voting', 'weights', 'Gradient Boosting Classifier', 'Ridge Classifier', 'Linear Discriminant Analysis', 'Gradient Boosting Classifier__ccp_alpha', 'Gradient Boosting Classifier__criterion', 'Gradient Boosting Classifier__init', 'Gradient Boosting Classifier__learning_rate', 'Gradient Boosting Classifier__loss', 'Gradient Boosting Classifier__max_depth', 'Gradient Boosting Classifier__max_features', 'Gradient Boosting Classifier__max_leaf_nodes', 'Gradient Boosting Classifier__min_impurity_decrease', 'Gradient Boosting Classifier__min_samples_leaf', 'Gradient Boosting Classifier__min_samples_split']


In [24]:
from pycaret.classification import (
    setup, compare_models, tune_model, blend_models,
    calibrate_model, finalize_model, save_model, pull
)

# Usa el mismo train_df y el MISMO setup que ya te funcionó (groupkfold + ignore id_usuario)
setup(
    data=train_df[["id_usuario"] + CAT_COLS_X + NUM_COLS_X + ["y_like"]],
    target="y_like",
    session_id=RANDOM_SEED,
    fold=5, fold_strategy="groupkfold", fold_groups="id_usuario",
    categorical_features=CAT_COLS_X,
    ignore_features=["id_usuario"],
    remove_multicollinearity=True, multicollinearity_threshold=0.95,
    imputation_type="simple",
    fix_imbalance=False,   # activa luego si lo necesitas
    verbose=False
)

# candidatos y tuning
cands = compare_models(n_select=5, sort="AUC")
tuned = [tune_model(m, optimize="AUC") for m in cands]
# quédate con modelos que soportan predict_proba (requisito para 'soft')
tuned = [m for m in tuned if hasattr(m, "predict_proba")]

# ensamble 'soft' (promedia probabilidades)
soft = blend_models(estimator_list=tuned, method="soft", choose_better=True)

# (opcional pero recomendado) calibra probabilidades
soft_cal = calibrate_model(soft, method="isotonic")

# cierra y guarda
final_soft = finalize_model(soft_cal)
save_model(final_soft, "modelo_cls_like_soft_v1")


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.9077,0.9343,0.7363,0.7985,0.7661,0.7087,0.7096,3.168
lightgbm,Light Gradient Boosting Machine,0.9071,0.9332,0.7367,0.7957,0.765,0.7072,0.7081,1.178
ridge,Ridge Classifier,0.906,0.9331,0.7064,0.8115,0.7552,0.6974,0.7,0.91
lda,Linear Discriminant Analysis,0.9063,0.933,0.7285,0.7978,0.7615,0.7033,0.7045,1.37
ada,Ada Boost Classifier,0.9052,0.9325,0.752,0.7789,0.7651,0.7057,0.706,1.842
catboost,CatBoost Classifier,0.9048,0.9324,0.7306,0.7901,0.7591,0.6999,0.7008,8.844
rf,Random Forest Classifier,0.9042,0.9247,0.7242,0.7918,0.7564,0.697,0.6981,2.272
et,Extra Trees Classifier,0.9004,0.9207,0.7185,0.7795,0.7477,0.6858,0.6867,2.916
lr,Logistic Regression,0.9008,0.9001,0.6744,0.8106,0.7361,0.6757,0.6801,12.15
dt,Decision Tree Classifier,0.8573,0.788,0.6696,0.6476,0.6583,0.5681,0.5684,1.094


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9142,0.9384,0.7521,0.8094,0.7797,0.7265,0.7273
1,0.9032,0.927,0.7249,0.774,0.7486,0.6887,0.6893
2,0.907,0.9348,0.7495,0.7956,0.7718,0.7135,0.714
3,0.9083,0.9334,0.7348,0.804,0.7679,0.7109,0.712
4,0.9047,0.9352,0.7524,0.7846,0.7681,0.7082,0.7085
Mean,0.9075,0.9338,0.7427,0.7935,0.7672,0.7096,0.7102
Std,0.0038,0.0038,0.011,0.0129,0.0102,0.0122,0.0122


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9124,0.9392,0.7377,0.8113,0.7728,0.7187,0.7199
1,0.9043,0.9276,0.7176,0.7831,0.7489,0.6899,0.6909
2,0.9064,0.9338,0.7314,0.805,0.7664,0.7081,0.7094
3,0.9084,0.9337,0.7311,0.807,0.7672,0.7103,0.7117
4,0.9045,0.9341,0.7487,0.7859,0.7668,0.7068,0.7072
Mean,0.9072,0.9337,0.7333,0.7985,0.7644,0.7068,0.7078
Std,0.003,0.0037,0.0101,0.0116,0.0081,0.0094,0.0095


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9111,0.937,0.7133,0.8226,0.7641,0.7097,0.7124
1,0.9012,0.9273,0.6721,0.7991,0.7301,0.6702,0.6739
2,0.9057,0.9349,0.7096,0.8174,0.7597,0.7014,0.7041
3,0.9058,0.9336,0.7002,0.8174,0.7543,0.6965,0.6997
4,0.9053,0.9339,0.7215,0.8065,0.7617,0.7028,0.7045
Mean,0.9058,0.9333,0.7033,0.8126,0.754,0.6961,0.6989
Std,0.0032,0.0032,0.0171,0.0086,0.0124,0.0136,0.0132


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9123,0.9367,0.7261,0.819,0.7697,0.7158,0.7178
1,0.9027,0.9272,0.699,0.788,0.7409,0.6812,0.6831
2,0.9073,0.935,0.7362,0.8056,0.7693,0.7115,0.7126
3,0.9072,0.9336,0.7165,0.8118,0.7611,0.7038,0.7059
4,0.9029,0.9336,0.7343,0.7885,0.7604,0.6996,0.7003
Mean,0.9065,0.9332,0.7224,0.8026,0.7603,0.7024,0.704
Std,0.0035,0.0032,0.0136,0.0125,0.0105,0.012,0.012


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9134,0.9373,0.7277,0.8229,0.7724,0.7192,0.7213
1,0.9022,0.9256,0.712,0.7774,0.7433,0.683,0.684
2,0.9064,0.9314,0.7213,0.812,0.7639,0.7058,0.7078
3,0.9077,0.9315,0.7181,0.8131,0.7626,0.7057,0.7078
4,0.9066,0.9343,0.7348,0.8033,0.7675,0.7092,0.7103
Mean,0.9073,0.932,0.7228,0.8057,0.762,0.7046,0.7062
Std,0.0036,0.0039,0.0079,0.0155,0.0099,0.0119,0.0122


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9137,0.9394,0.736,0.818,0.7748,0.7216,0.7232
1,0.9029,0.928,0.7069,0.7839,0.7434,0.6838,0.6852
2,0.9074,0.934,0.7399,0.8036,0.7704,0.7126,0.7135
3,0.9082,0.9343,0.7284,0.8079,0.7661,0.7092,0.7106
4,0.9037,0.9349,0.7417,0.787,0.7637,0.7033,0.7038
Mean,0.9072,0.9341,0.7306,0.8001,0.7637,0.7061,0.7073
Std,0.0038,0.0037,0.0127,0.0128,0.0108,0.0126,0.0127


Original model was better than the blended model, hence it will be returned. NOTE: The display metrics are for the blended model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.913,0.9393,0.7322,0.8177,0.7726,0.719,0.7206
1,0.9042,0.9282,0.6979,0.7953,0.7434,0.6848,0.687
2,0.9069,0.934,0.725,0.8113,0.7657,0.7078,0.7096
3,0.9073,0.9343,0.7403,0.7963,0.7672,0.7095,0.7102
4,0.9063,0.9354,0.7455,0.795,0.7694,0.7107,0.7113
Mean,0.9075,0.9342,0.7282,0.8031,0.7637,0.7064,0.7077
Std,0.0029,0.0036,0.0167,0.0095,0.0104,0.0114,0.0111


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['edad', 'frecuencia_viaje',
                                              'presupuesto_estimado',
                                              'sitios_visitados',
                                              'calificacion_sitios_previos',
                                              'tiempo_estancia_promedio',
                                              'costo_entrada',
                                              'afluencia_promedio',
                                              'duracion_esperada',
                                              'admite_mascotas',
                                              'ratio_costo_presu',
                                              'afinidad_tipo'],
                                     transf...
                                                                    

In [25]:
from pycaret.classification import load_model
CLS_SOFT = load_model("modelo_cls_like_soft_v1")

def recomendar_top3_cls_soft(user: dict) -> pd.DataFrame:
    X0 = _build_candidates(user)
    X  = _prepare_for_pipeline(X0)          # quita id_usuario y asegura tipos
    proba = CLS_SOFT.predict_proba(X)       # ahora sí disponible
    pos = list(CLS_SOFT.classes_).index(1) if 1 in CLS_SOFT.classes_ else proba.shape[1]-1
    X0["prob_like"] = proba[:, pos]
    cols = ["nombre_sitio","tipo_sitio","costo_entrada","accesibilidad_general",
            "afinidad_tipo","ratio_costo_presu","prob_like"]
    return X0[cols].nlargest(3, "prob_like").reset_index(drop=True)

# probar:
print(recomendar_top3_cls_soft(usuario_demo))


Transformation Pipeline and Model Successfully Loaded
                nombre_sitio tipo_sitio  costo_entrada accesibilidad_general  \
0  Antigua Estación del Tren  histórico           5168                 media   
1    Claustro de San Agustín  histórico           5000                 media   
2            Museo del Fósil      museo          10000                  alta   

   afinidad_tipo  ratio_costo_presu  prob_like  
0            0.9           0.149797   0.957099  
1            0.9           0.144928   0.957099  
2            0.9           0.289855   0.920571  
