In [20]:
# --- imports & config ---
import json, numpy as np, pandas as pd
from pathlib import Path
DATA_PATH = "dataset_Recomendacion_villa_de_leyva_eleccion (2).csv"   # ajústalo
CAT_PATH  = "catalogo_vdl_lugares_unico.csv"               # ajústalo
SEP, ENC  = ";", "utf-8-sig"
RANDOM_SEED = 42
K_LIST = [3, 5, 10]
TEST_USER_FRAC = 0.20     # % de usuarios para hold-out honesto

# --- normalizador de encabezados con caracteres raros ---
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    ren = {
        "compa¤¡a_viaje": "compania_viaje",
        "‚poca_visita": "epoca_visita",
    }
    ren = {k:v for k,v in ren.items() if k in df.columns}
    return df.rename(columns=ren)

# --- métricas Top-K ---
def _dcg_at_k(rels): return float(np.sum([r/np.log2(i+2) for i,r in enumerate(rels)]))

def recall_at_k(g, k, score_col, rel_col):
    g = g.sort_values(score_col, ascending=False)
    topk = g.head(k)
    tot = g[rel_col].sum()
    return float("nan") if tot==0 else float(topk[rel_col].sum()/tot)

def ndcg_at_k(g, k, score_col, rel_col):
    g = g.sort_values(score_col, ascending=False)
    dcg  = _dcg_at_k(g.head(k)[rel_col].tolist())
    idcg = _dcg_at_k(sorted(g[rel_col].tolist(), reverse=True)[:k])
    return float("nan") if idcg==0 else float(dcg/idcg)

def coverage_at_k(df, k, score_col, item_col="nombre_sitio"):
    topk = (df.sort_values(["id_usuario", score_col], ascending=[True, False])
              .groupby("id_usuario").head(k))
    return float(topk[item_col].nunique() / df[item_col].nunique())

In [21]:
# =======================
# Config y carga inicial
# =======================
import pandas as pd
import numpy as np
import unicodedata

ENC = "utf-8-sig"
SEP = ";"
DATA_PATH = "dataset_Recomendacion_villa_de_leyva_eleccion (2).csv"  # <-- ajusta si aplica

df = pd.read_csv(DATA_PATH, sep=SEP, encoding=ENC)

# 1) Normaliza encabezados y espacios
# (Se asume que ya tienes normalize_columns; si no, te la paso)
df = normalize_columns(df)
df.columns = df.columns.str.strip()

# ================================
# 2) Parche de esquema (sinónimos)
# ================================
CANDIDATES = {
    "compania_viaje":        ["compania_viaje","compañia_viaje","compan_a_viaje","companaviaje"],
    "costo_entrada":         ["costo_entrada","costoentrada","precio_entrada","costo"],
    "afluencia_promedio":    ["afluencia_promedio","afluencia","afluencia_prom"],
    "duracion_esperada":     ["duracion_esperada","duracion_estimada","duracion","tiempo_esperado"],
    "presupuesto_estimado":  ["presupuesto_estimado","presupuesto","budget"],
    "tipo_turista_preferido":["tipo_turista_preferido","preferencia","perfil_preferido"],
}
for tgt, opts in CANDIDATES.items():
    if tgt not in df.columns:
        for c in opts:
            if c in df.columns:
                df = df.rename(columns={c: tgt})
                break

# ==============================================================
# 3) Canonicalización de valores categóricos (acentos, espacios)
# ==============================================================
CANON_MAP = {
    "gastronomia": "gastronomico",
    "gastronomía": "gastronomico",
    "relax_fotografia": "relax_fotografia",
    "relax_fotografía": "relax_fotografia",
    "parque tematico": "parque_tematico",
    "enoturismo": "gastronomico",
}
def canon(s: str) -> str:
    s = str(s)
    s = unicodedata.normalize("NFKD", s).encode("ascii","ignore").decode("ascii")
    s = s.strip().lower().replace(" ", "_")
    return CANON_MAP.get(s, s)

def normalize_values(df_in: pd.DataFrame) -> pd.DataFrame:
    df2 = df_in.copy()
    cols = ["tipo_turista_preferido","tipo_sitio","epoca_visita",
            "accesibilidad_general","ubicacion_geografica","idioma_info"]
    for c in cols:
        if c in df2.columns:
            df2[c] = df2[c].astype(str).map(canon)
    return df2

df = normalize_values(df)

# ====================================
# 4) Validación mínima de requeridos
# ====================================
REQ = ["costo_entrada","presupuesto_estimado","tipo_sitio",
       "tipo_turista_preferido","epoca_visita","rating_usuario"]  # <-- añade rating_usuario
missing = [c for c in REQ if c not in df.columns]
if missing:
    raise KeyError(f"Faltan columnas obligatorias: {missing}\nDisponibles: {list(df.columns)}")

# ==========================================================
# 5) Limpieza específica (admite_mascotas texto -> numérico)
# ==========================================================
if "admite_mascotas" in df.columns:
    # Mapea variantes comunes a 0/1, preserva si ya es numérica
    _map = {"si":1,"sí":1,"SI":1,"Sí":1,"Si":1,"no":0,"NO":0,"No":0,1:1,0:0,"1":1,"0":0,True:1,False:0}
    try:
        df["admite_mascotas"] = df["admite_mascotas"].map(_map).astype("float")
    except Exception:
        pass  # si ya es numérica, sigue

# ============================================
# 6) Coerción numérica segura (si existen)
# ============================================
for c in ["costo_entrada","presupuesto_estimado","edad","frecuencia_viaje",
          "sitios_visitados","calificacion_sitios_previos","tiempo_estancia_promedio",
          "afluencia_promedio","duracion_esperada","admite_mascotas","rating_usuario"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# ============================================
# 7) Anti-leakage (si llega objetivo filtrado)
# ============================================
if "sitio_recomendado" in df.columns and df["sitio_recomendado"].dtype == object:
    # Si por error incluye nombres de sitios como "objetivo", rehace como binario
    df["sitio_recomendado"] = (df["rating_usuario"] >= 4.0).astype(int)

# ===================================================
# 8) Definición de columnas por tipo (para modelado)
# ===================================================
CAT_COLS = [
    "nacionalidad","origen","tipo_turista_preferido","compania_viaje",
    "restricciones_movilidad","nombre_sitio","tipo_sitio","accesibilidad_general",
    "idioma_info","ubicacion_geografica","clima_predominante","epoca_visita"
]
NUM_COLS = [
    "edad","frecuencia_viaje","presupuesto_estimado","sitios_visitados",
    "calificacion_sitios_previos","tiempo_estancia_promedio","costo_entrada",
    "afluencia_promedio","duracion_esperada","admite_mascotas"
]

# ================================================
# 9) Afinidad (perfil × tipo de sitio) canónica
# ================================================
AFINIDAD = {
    "cultural":   {"museo":0.9,"centro_historico":0.9,"arquitectura":0.85,"arqueologico":0.85,"plaza":0.7,"religioso":0.7},
    "naturaleza": {"naturaleza":0.95,"senderismo":0.9,"mirador":0.8},
    "aventura":   {"senderismo":0.9,"parque_tematico":0.75,"mirador":0.75,"naturaleza":0.7},
    "gastronomico":{"gastronomico":0.95},
    "relax":      {"mirador":0.9,"plaza":0.8,"naturaleza":0.75,"arquitectura":0.8},
}

# =======================================
# 10) Feature engineering determinístico
# =======================================
def make_features(X: pd.DataFrame) -> pd.DataFrame:
    X = X.copy()
    # a) Ratio costo / (15% del presupuesto) con protecciones
    denom = (X["presupuesto_estimado"] * 0.15).replace(0, np.nan)
    X["ratio_costo_presu"] = (X["costo_entrada"] / denom).clip(0, 3).fillna(0)
    # b) Afinidad (fallback=0.5 si no hay match)
    X["afinidad_tipo"] = X.apply(
        lambda r: AFINIDAD.get(str(r["tipo_turista_preferido"]), {}).get(str(r["tipo_sitio"]), 0.5), axis=1
    )
    # c) Interacciones categóricas
    X["x_tipoTur__tipoSit"] = X["tipo_turista_preferido"].astype(str) + "×" + X["tipo_sitio"].astype(str)
    X["x_epoca__tipoSit"]   = X["epoca_visita"].astype(str) + "×" + X["tipo_sitio"].astype(str)
    return X

df = make_features(df)

# =======================================
# 11) Etiqueta binaria para clasificación
# =======================================
df["y_like"] = (df["rating_usuario"] >= 4.0).astype(int)

# ==========================================================
# 12) Conjunto final de columnas de entrada (X) extendidas
# ==========================================================
CAT_COLS_X = CAT_COLS + ["x_tipoTur__tipoSit","x_epoca__tipoSit"]
NUM_COLS_X = NUM_COLS + ["ratio_costo_presu","afinidad_tipo"]

print("✅ Esquema OK. Ejemplo columnas:", df.columns[:15].tolist())
print("Shape:", df.shape)


✅ Esquema OK. Ejemplo columnas: ['id_usuario', 'edad', 'nacionalidad', 'origen', 'tipo_turista_preferido', 'compania_viaje', 'frecuencia_viaje', 'restricciones_movilidad', 'presupuesto_estimado', 'sitios_visitados', 'calificacion_sitios_previos', 'tiempo_estancia_promedio', 'nombre_sitio', 'tipo_sitio', 'costo_entrada']
Shape: (100000, 30)


In [22]:
from sklearn.model_selection import GroupShuffleSplit

# --- Versión con tus líneas + checks ---
rng = np.random.default_rng(RANDOM_SEED)

users = df["id_usuario"].drop_duplicates().to_numpy()
n_test = max(1, int(round(len(users) * TEST_USER_FRAC)))  # evita 0
test_users = set(rng.choice(users, size=n_test, replace=False))

train_df = df[~df["id_usuario"].isin(test_users)].reset_index(drop=True)
test_df  = df[ df["id_usuario"].isin(test_users)].reset_index(drop=True)

print("Usuarios train/test:", train_df["id_usuario"].nunique(), test_df["id_usuario"].nunique())
print("Filas train/test:", train_df.shape, test_df.shape)

# --- Sanity checks útiles ---
# 1) No hay usuarios compartidos
overlap = set(train_df["id_usuario"]).intersection(set(test_df["id_usuario"]))
assert len(overlap) == 0, f"Fuga de usuarios entre splits: {len(overlap)}"

# 2) Proporción de usuarios en test (aprox TEST_USER_FRAC)
print("Frac usuarios test real:", round(test_df["id_usuario"].nunique() / len(users), 4))

# 3) Distribución de interacciones por usuario en cada split
def _stats_per_user(df_):
    g = df_.groupby("id_usuario").size()
    return {"min": int(g.min()), "p25": int(g.quantile(0.25)), "median": int(g.median()),
            "p75": int(g.quantile(0.75)), "max": int(g.max())}

print("Interacciones por usuario (train):", _stats_per_user(train_df))
print("Interacciones por usuario (test): ", _stats_per_user(test_df))

# 4) (Opcional) Balance de la etiqueta en cada split si existe y_like
if "y_like" in df.columns:
    def _label_rate(d):
        return float(d["y_like"].mean()) if "y_like" in d.columns else float("nan")
    print("Tasa y_like train/test:", round(_label_rate(train_df),4), "/", round(_label_rate(test_df),4))

# --- Alternativa con GroupShuffleSplit (recomendable y equivalente) ---
# gss = GroupShuffleSplit(n_splits=1, test_size=TEST_USER_FRAC, random_state=RANDOM_SEED)
# idx_train, idx_test = next(gss.split(df, groups=df["id_usuario"]))
# train_df = df.iloc[idx_train].reset_index(drop=True)
# test_df  = df.iloc[idx_test].reset_index(drop=True)


Usuarios train/test: 16000 4000
Filas train/test: (80004, 30) (19996, 30)
Frac usuarios test real: 0.2
Interacciones por usuario (train): {'min': 1, 'p25': 3, 'median': 5, 'p75': 6, 'max': 16}
Interacciones por usuario (test):  {'min': 1, 'p25': 3, 'median': 5, 'p75': 6, 'max': 14}
Tasa y_like train/test: 0.3326 / 0.3234


In [16]:
rng = np.random.default_rng(RANDOM_SEED)
users = df["id_usuario"].drop_duplicates().to_numpy()
test_users = set(rng.choice(users, size=int(len(users)*TEST_USER_FRAC), replace=False))

train_df = df[~df["id_usuario"].isin(test_users)].reset_index(drop=True)
test_df  = df[ df["id_usuario"].isin(test_users)].reset_index(drop=True)

print("Usuarios train/test:", train_df["id_usuario"].nunique(), test_df["id_usuario"].nunique())
print("Filas train/test:", train_df.shape, test_df.shape)

Usuarios train/test: 16000 4000
Filas train/test: (80004, 29) (19996, 29)


In [24]:
from pycaret.classification import (
    setup, compare_models, tune_model, blend_models,
    finalize_model, save_model
)
import pandas as pd
import numpy as np
import inspect

# --------- Tipos explícitos (tu parte) ----------
train_df = train_df.copy(deep=True)
train_df["id_usuario"] = train_df["id_usuario"].astype("string")

for c in CAT_COLS_X:
    if c in train_df.columns:
        train_df[c] = train_df[c].astype("string").copy()

for c in NUM_COLS_X:
    if c in train_df.columns:
        train_df[c] = pd.to_numeric(train_df[c], errors="coerce").astype("float64").copy()

data_in = train_df[["id_usuario"] + CAT_COLS_X + NUM_COLS_X + ["y_like"]].copy()

# --------- Kwargs “deseables” (algunos pueden no existir en tu versión) ----------
desired_kwargs = dict(
    data = data_in,
    target = "y_like",
    session_id = RANDOM_SEED,
    fold = 5,
    fold_strategy = "groupkfold",
    fold_groups = "id_usuario",
    categorical_features = CAT_COLS_X,
    ignore_features = ["id_usuario"],
    remove_multicollinearity = True,
    multicollinearity_threshold = 0.95,

    # Imputación explícita (version-friendly)
    numeric_imputation = "mean",
    categorical_imputation = "most_frequent",

    # Alta cardinalidad
    high_cardinality_features = ["nombre_sitio","x_tipoTur__tipoSit","x_epoca__tipoSit"],
    high_cardinality_method = "frequency",

    # Otros
    fix_imbalance = False,
    n_jobs = 1,
    verbose = False,
    use_gpu = False,
)

# --------- Filtra kwargs según la firma real de setup ----------
sig = inspect.signature(setup)
allowed = set(sig.parameters.keys())
safe_kwargs = {k: v for k, v in desired_kwargs.items() if k in allowed}

# (Por si tu versión tiene “html” o “imputation_type” válidos, puedes añadirlos a mano:)
extra_maybe = {}
for k, v in [("html", False), ("imputation_type", "simple")]:
    if k in allowed:
        extra_maybe[k] = v
safe_kwargs.update(extra_maybe)

# --------- Llamada segura a setup ----------
setup_cls = setup(**safe_kwargs)

# --------- Selección, tuning, blending, final ----------
best3 = compare_models(n_select=3, sort="AUC")
tuned = [tune_model(m, optimize="AUC") for m in best3]
blend = blend_models(tuned)
final_cls = finalize_model(blend)
save_model(final_cls, "modelo_cls_like_v2")
print("✅ Modelo guardado: modelo_cls_like_v2")



                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
gbc          Gradient Boosting Classifier    0.8484  0.9215  0.7377  0.7923   
ridge                    Ridge Classifier    0.8493  0.9213  0.7304  0.7995   
lda          Linear Discriminant Analysis    0.8489  0.9212  0.7404  0.7919   
ada                  Ada Boost Classifier    0.8485  0.9208  0.7541  0.7827   
lightgbm  Light Gradient Boosting Machine    0.8483  0.9203  0.7382  0.7917   
catboost              CatBoost Classifier    0.8465  0.9191  0.7358  0.7887   
xgboost         Extreme Gradient Boosting    0.8418  0.9140  0.7308  0.7798   
rf               Random Forest Classifier    0.8464  0.9067  0.7338  0.7898   
et                 Extra Trees Classifier    0.8194  0.8887  0.5882  0.8178   
lr                    Logistic Regression    0.7586  0.8129  0.5739  0.6573   
dt               Decision Tree Classifier    0.7818  0.7551  0.6753  0.6710   
nb                            Naive Bayes    0.6674 

Processing:  14%|█▍        | 1/7 [00:00<00:01,  4.16it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


                                                          

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.8463  0.9190  0.7499  0.7816  0.7654  0.6511  0.6514
1       0.8506  0.9232  0.7521  0.7777  0.7647  0.6553  0.6555
2       0.8466  0.9186  0.7399  0.7864  0.7624  0.6493  0.6500
3       0.8453  0.9182  0.7255  0.8010  0.7614  0.6473  0.6491
4       0.8522  0.9246  0.7492  0.7954  0.7716  0.6626  0.6632
Mean    0.8482  0.9207  0.7433  0.7884  0.7651  0.6531  0.6538
Std     0.0027  0.0027  0.0098  0.0086  0.0036  0.0054  0.0052


Processing:  14%|█▍        | 1/7 [00:00<00:01,  4.02it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


                                                         

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.8481  0.9205  0.7338  0.7961  0.7637  0.6521  0.6533
1       0.8518  0.9242  0.7410  0.7872  0.7634  0.6557  0.6563
2       0.8484  0.9196  0.7228  0.8020  0.7603  0.6499  0.6518
3       0.8454  0.9190  0.7145  0.8090  0.7588  0.6457  0.6484
4       0.8520  0.9253  0.7355  0.8036  0.7680  0.6597  0.6610
Mean    0.8491  0.9217  0.7295  0.7996  0.7629  0.6526  0.6542
Std     0.0025  0.0025  0.0096  0.0074  0.0032  0.0048  0.0043


Processing:  14%|█▍        | 1/7 [00:00<00:01,  3.90it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


                                                         

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.8469  0.9199  0.7413  0.7882  0.7641  0.6509  0.6516
1       0.8522  0.9237  0.7526  0.7814  0.7667  0.6587  0.6589
2       0.8482  0.9193  0.7335  0.7945  0.7628  0.6514  0.6526
3       0.8451  0.9186  0.7261  0.8002  0.7613  0.6470  0.6487
4       0.8523  0.9248  0.7487  0.7960  0.7716  0.6626  0.6633
Mean    0.8490  0.9213  0.7404  0.7921  0.7653  0.6541  0.6550
Std     0.0029  0.0025  0.0097  0.0066  0.0036  0.0057  0.0053


                                                         

      Accuracy  AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                       
0       0.8473  0.0  0.7387  0.7910  0.7639  0.6513  0.6522
1       0.8523  0.0  0.7493  0.7836  0.7661  0.6583  0.6586
2       0.8489  0.0  0.7300  0.7986  0.7628  0.6523  0.6537
3       0.8451  0.0  0.7221  0.8028  0.7603  0.6464  0.6483
4       0.8525  0.0  0.7428  0.8002  0.7704  0.6620  0.6630
Mean    0.8492  0.0  0.7366  0.7952  0.7647  0.6540  0.6552
Std     0.0029  0.0  0.0096  0.0070  0.0034  0.0055  0.0051
Transformation Pipeline and Model Successfully Saved
✅ Modelo guardado: modelo_cls_like_v2


In [30]:
# ============================================
# Evaluación del modelo guardado en test_df
# ============================================
import pandas as pd
import numpy as np
from pycaret.classification import load_model, predict_model

# --- 0) Asegúrate de tener en memoria (o vuelve a definir) ---
#     normalize_columns, canon, normalize_values, make_features,
#     CAT_COLS_X, NUM_COLS_X, K_LIST, SEP/ENC si las usas, etc.
#     (Si vienes del mismo notebook ya están).

# --- 1) Copia y preprocesa test_df igual que train ---
test_proc = test_df.copy(deep=True)

# Normaliza encabezados y espacios (si aplican en tu flujo)
test_proc = normalize_columns(test_proc)
test_proc.columns = test_proc.columns.str.strip()

# Canonicaliza valores categóricos (acentos/espacios a forma canónica)
test_proc = normalize_values(test_proc)

# Feature engineering (mismas transformaciones que en train)
test_proc = make_features(test_proc)

# Tipos: categóricas → string; numéricas → float
test_proc["id_usuario"] = test_proc["id_usuario"].astype("string")
for c in CAT_COLS_X:
    if c in test_proc.columns:
        test_proc[c] = test_proc[c].astype("string")
for c in NUM_COLS_X:
    if c in test_proc.columns:
        test_proc[c] = pd.to_numeric(test_proc[c], errors="coerce").astype("float64")

# Objetivo binario (por si aún no existe en test)
if "y_like" not in test_proc.columns and "rating_usuario" in test_proc.columns:
    test_proc["rating_usuario"] = pd.to_numeric(test_proc["rating_usuario"], errors="coerce")
    test_proc["y_like"] = (test_proc["rating_usuario"] >= 4.0).astype(int)

# --- 2) Carga el modelo PyCaret guardado ---
model = load_model("modelo_cls_like_v2")

# --- 3) Obtener scores SIN pasar id_usuario como feature ---

# 3.0) Guarda el id para reconstruir df_scores luego
user_ids = test_proc["id_usuario"].astype("string").values

# IMPORTANTE: el modelo se entrenó SIN 'id_usuario'
cols_infer = CAT_COLS_X + NUM_COLS_X
data_infer = test_proc[cols_infer].copy()

score_vec = None

# Intento 1: probabilidades directamente del pipeline completo
try:
    proba = model.predict_proba(data_infer)
    # localizar índice de la clase positiva (1); si no existe, usa la última columna
    pos_idx = None
    if hasattr(model, "classes_"):
        import numpy as np
        classes = np.array(model.classes_)
        where_1 = np.where(classes == 1)[0]
        if len(where_1) > 0:
            pos_idx = int(where_1[0])
    if pos_idx is None:
        pos_idx = -1
    score_vec = proba[:, pos_idx].astype(float)
except Exception:
    pass

# Intento 2: decision_function → sigmoide
if score_vec is None:
    try:
        import numpy as np
        dfun = model.decision_function(data_infer)
        dfun = np.asarray(dfun, dtype=float)
        if dfun.ndim == 2 and dfun.shape[1] > 1:
            pos_idx = -1
            if hasattr(model, "classes_"):
                classes = np.array(model.classes_)
                where_1 = np.where(classes == 1)[0]
                if len(where_1) > 0:
                    pos_idx = int(where_1[0])
            dfun = dfun[:, pos_idx]
        score_vec = 1.0 / (1.0 + np.exp(-dfun))  # sigmoide para ranking
    except Exception:
        pass

# Intento 3: etiquetas (fallback; menos ideal para ranking)
if score_vec is None:
    labels = model.predict(data_infer)
    score_vec = (pd.Series(labels).astype(str).isin(["1","True","true"])).astype(float).values

# Construye df_scores (recuerda volver a poner el id guardado)
df_scores = pd.DataFrame({
    "id_usuario": user_ids,
    "nombre_sitio": test_proc["nombre_sitio"].values,
    "score": score_vec,
    "relevancia": test_proc["y_like"].astype(float).values
})




# --- 4) Métricas Top-K ---
def _dcg_at_k(rels):
    return float(np.sum([r/np.log2(i+2) for i, r in enumerate(rels)]))

def recall_at_k(g, k, score_col, rel_col):
    g = g.sort_values(score_col, ascending=False)
    topk = g.head(k)
    tot = g[rel_col].sum()
    return float("nan") if tot==0 else float(topk[rel_col].sum()/tot)

def ndcg_at_k(g, k, score_col, rel_col):
    g = g.sort_values(score_col, ascending=False)
    dcg  = _dcg_at_k(g.head(k)[rel_col].tolist())
    idcg = _dcg_at_k(sorted(g[rel_col].tolist(), reverse=True)[:k])
    return float("nan") if idcg==0 else float(dcg/idcg)

def coverage_at_k(df, k, score_col, item_col="nombre_sitio"):
    topk = (df.sort_values(["id_usuario", score_col], ascending=[True, False])
              .groupby("id_usuario").head(k))
    return float(topk[item_col].nunique() / df[item_col].nunique())

# Calcula Recall@K, NDCG@K por usuario y cobertura global
results = {}
for k in K_LIST:
    rec_k = (df_scores.groupby("id_usuario")
                     .apply(lambda g: recall_at_k(g, k, "score", "relevancia"))
                     .mean(skipna=True))
    ndcg_k = (df_scores.groupby("id_usuario")
                      .apply(lambda g: ndcg_at_k(g, k, "score", "relevancia"))
                      .mean(skipna=True))
    cov_k = coverage_at_k(df_scores, k, "score", item_col="nombre_sitio")
    results[k] = {"Recall@K": rec_k, "NDCG@K": ndcg_k, "Coverage@K": cov_k}

eval_df = pd.DataFrame(results).T
print("\n=== Métricas Top-K en TEST ===")
print(eval_df.round(4))

# --- 5) (Opcional) AUC global para referencia de clasificación ---
try:
    from sklearn.metrics import roc_auc_score
    auc_global = roc_auc_score(df_scores["relevancia"], df_scores["score"])
    print("\nAUC global (binario):", round(float(auc_global), 4))
except Exception as e:
    print("\nAUC no disponible:", e)

# --- 6) (Opcional) Quick sanity checks ---
print("\nUsuarios test eval:", df_scores["id_usuario"].nunique())
print("Items únicos en test:", df_scores["nombre_sitio"].nunique())
print("Tasa de positivos (y_like=1):", round(float(df_scores["relevancia"].mean()), 4))


Transformation Pipeline and Model Successfully Loaded

=== Métricas Top-K en TEST ===
    Recall@K  NDCG@K  Coverage@K
3     0.6430  0.6875         1.0
5     0.8791  0.7640         1.0
10    0.9980  0.8055         1.0

AUC global (binario): 0.8095

Usuarios test eval: 4000
Items únicos en test: 87
Tasa de positivos (y_like=1): 0.3234


In [26]:
# ============================================
# Simulación: recomendar Top-N a un usuario nuevo
# ============================================
import pandas as pd
import numpy as np
from pycaret.classification import load_model, predict_model

# --- Config comunes (ajusta si aplica) ---
ENC, SEP = "utf-8-sig", ";"
CAT_PATH = "catalogo_vdl_lugares_unico.csv"   # <-- ruta a tu catálogo
MODEL_NAME = "modelo_cls_like_v2"             # <-- nombre del modelo guardado
TOP_N = 5                                     # cuántas recomendaciones
DIVERSITY_LAMBDA = 0.25                       # penalización por repetir tipo_sitio (0 = nada, 0.2-0.4 razonable)

# --- Reutiliza tus utilidades (asegúrate de tenerlas definidas antes en el notebook) ---
#   - normalize_columns(df)
#   - canon(s)
#   - normalize_values(df)  -> aplica canon a columnas categóricas clave
#   - make_features(df)     -> crea ratio_costo_presu, afinidad_tipo, cruces
#   - CAT_COLS_X, NUM_COLS_X

# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
def _ensure_catalog_schema(cat: pd.DataFrame) -> pd.DataFrame:
    """
    Garantiza columnas mínimas del catálogo y tipos numéricos.
    Completa faltantes con defaults razonables.
    """
    cat = cat.copy()
    needed = [
        "nombre_sitio","tipo_sitio","ubicacion_geografica","clima_predominante",
        "costo_entrada","afluencia_promedio","accesibilidad_general",
        "duracion_esperada","admite_mascotas","idioma_info"
    ]
    for col in needed:
        if col not in cat.columns:
            # Defaults mínimos
            if col in ["costo_entrada","afluencia_promedio","duracion_esperada","admite_mascotas"]:
                cat[col] = 0
            else:
                cat[col] = ""
    # tipos numéricos
    for c in ["costo_entrada","afluencia_promedio","duracion_esperada","admite_mascotas"]:
        cat[c] = pd.to_numeric(cat[c], errors="coerce")
    # valores categóricos a forma canónica
    cat = normalize_values(cat)
    return cat

def _broadcast_user_over_catalog(user_profile: dict, catalog: pd.DataFrame) -> pd.DataFrame:
    """
    Crea el producto cartesiano: duplica el perfil del usuario sobre todos los sitios del catálogo.
    """
    user_df = pd.DataFrame([user_profile])
    # canoniza valores de usuario
    user_df = normalize_values(user_df)
    # repetimos el usuario tantas filas como sitios
    user_expanded = pd.concat([user_df]*len(catalog), ignore_index=True)
    # combinamos columnas de usuario + columnas del catálogo
    X = pd.concat([user_expanded.reset_index(drop=True), catalog.reset_index(drop=True)], axis=1)
    return X

def _ensure_model_dtypes(X: pd.DataFrame) -> pd.DataFrame:
    """
    Asegura dtypes compatibles con el pipeline de entrenamiento.
    """
    X = X.copy()
    # categóricas a string
    for c in CAT_COLS_X:
        if c in X.columns:
            X[c] = X[c].astype("string")
    # numéricas a float
    for c in NUM_COLS_X:
        if c in X.columns:
            X[c] = pd.to_numeric(X[c], errors="coerce").astype("float64")
    return X

def _mmr_diversify(df_scored: pd.DataFrame, score_col: str, tipo_col: str = "tipo_sitio",
                   top_n: int = 5, lam: float = 0.25) -> pd.DataFrame:
    """
    Reranking muy simple estilo MMR por 'tipo_sitio':
    - Escoge greedy el mayor score
    - Penaliza candidatos del mismo tipo: score' = score - lam * (#seleccionados de ese tipo)
    """
    work = df_scored.copy()
    work["_sel"] = 0
    chosen_idx = []
    type_counts = {}
    for _ in range(min(top_n, len(work))):
        # penalización dinámica
        penal = work[tipo_col].map(lambda t: lam * type_counts.get(t, 0))
        work["_adj"] = work[score_col] - penal
        pick = work["_adj"].idxmax()
        chosen_idx.append(pick)
        t = work.at[pick, tipo_col]
        type_counts[t] = type_counts.get(t, 0) + 1
        work = work.drop(index=pick)
    return df_scored.loc[chosen_idx]

def recommend_for_user(user_profile: dict, model_name: str = MODEL_NAME,
                       catalog_path: str = CAT_PATH, top_n: int = TOP_N,
                       diversity_lambda: float = DIVERSITY_LAMBDA) -> pd.DataFrame:
    """
    Carga catálogo + modelo, arma el set usuario×sitio, predice y devuelve Top-N (diversificado).
    """
    # 1) Lee y normaliza catálogo
    cat = pd.read_csv(catalog_path, sep=SEP, encoding=ENC)
    cat = normalize_columns(cat)
    cat.columns = cat.columns.str.strip()
    cat = _ensure_catalog_schema(cat)

    # 2) Ensambla matriz usuario×sitio
    X = _broadcast_user_over_catalog(user_profile, cat)

    # 3) Feature engineering igual que en train
    X = make_features(X)

    # 4) Dtypes como en train
    X = _ensure_model_dtypes(X)

    # 5) Carga modelo y predice score de clase positiva
    model = load_model(model_name)
    preds = predict_model(model, data=X[CAT_COLS_X + NUM_COLS_X])

    # buscar columna de probabilidad
    candidates = ["prediction_score","Score","score","prediction_score_1","Score_1"]
    score_col = next((c for c in candidates if c in preds.columns), None)
    if score_col is None:
        # fallback genérico
        proba_1 = [c for c in preds.columns if str(c).lower().endswith(("_1","class_1","score_1"))]
        score_col = proba_1[0] if proba_1 else None
    if score_col is None:
        raise RuntimeError(f"No se encontró columna de score/probabilidad en {list(preds.columns)}")

    # 6) Arma dataframe de salida con metadatos del catálogo + score
    out = pd.DataFrame({
        "nombre_sitio": X["nombre_sitio"].values,
        "tipo_sitio":   X["tipo_sitio"].values,
        "ubicacion_geografica": X.get("ubicacion_geografica", pd.Series([""]*len(X))).values,
        "costo_entrada": pd.to_numeric(X.get("costo_entrada", pd.Series([0]*len(X))), errors="coerce"),
        "admite_mascotas": pd.to_numeric(X.get("admite_mascotas", pd.Series([0]*len(X))), errors="coerce"),
        "idioma_info": X.get("idioma_info", pd.Series([""]*len(X))).values,
        "score_like": preds[score_col].astype(float).values
    })

    # 7) Orden inicial por score y re-ranking diversificado
    out = out.sort_values("score_like", ascending=False).reset_index(drop=True)
    out_div = _mmr_diversify(out, score_col="score_like", tipo_col="tipo_sitio",
                             top_n=top_n, lam=diversity_lambda)

    # 8) Formatea
    out_div = out_div.copy()
    out_div["score_like"] = out_div["score_like"].round(4)
    return out_div.reset_index(drop=True)

# ------------------------------------------------------------
# Ejemplo de simulación con 2 perfiles
# ------------------------------------------------------------

# Perfil 1: turista cultural en pareja, presupuesto medio, visita en temporada seca
usuario_cultural = {
    "id_usuario": "SIMU-001",
    "nacionalidad": "Colombia",
    "origen": "Bogota",
    "tipo_turista_preferido": "cultural",
    "compania_viaje": "pareja",
    "restricciones_movilidad": "ninguna",
    "epoca_visita": "temporada_seca",
    "presupuesto_estimado": 250000,     # COP totales del viaje (ajusta a tu definición)
    "frecuencia_viaje": 3,
    "sitios_visitados": 5,
    "calificacion_sitios_previos": 4.2,
    "tiempo_estancia_promedio": 2,      # días
    "edad": 28,
    "admite_mascotas": 0
}

# Perfil 2: naturaleza/aventura en familia, bajo costo, visita en festivos
usuario_naturaleza = {
    "id_usuario": "SIMU-002",
    "nacionalidad": "Colombia",
    "origen": "Tunja",
    "tipo_turista_preferido": "naturaleza",
    "compania_viaje": "familia",
    "restricciones_movilidad": "ninguna",
    "epoca_visita": "puente_festivo",
    "presupuesto_estimado": 120000,
    "frecuencia_viaje": 1,
    "sitios_visitados": 2,
    "calificacion_sitios_previos": 3.8,
    "tiempo_estancia_promedio": 1,
    "edad": 36,
    "admite_mascotas": 1
}

print("\n=== Recomendaciones para PERFIL 1 (cultural, pareja) ===")
print(recommend_for_user(usuario_cultural, top_n=TOP_N, diversity_lambda=DIVERSITY_LAMBDA))

print("\n=== Recomendaciones para PERFIL 2 (naturaleza, familia) ===")
print(recommend_for_user(usuario_naturaleza, top_n=TOP_N, diversity_lambda=DIVERSITY_LAMBDA))



=== Recomendaciones para PERFIL 1 (cultural, pareja) ===


TypeError: arg must be a list, tuple, 1-d array, or Series

In [5]:
from pycaret.classification import (
    setup, compare_models, tune_model, blend_models,
    finalize_model, predict_model, pull, save_model
)
import pandas as pd
import numpy as np

# 1) Copias profundas y tipos explícitos (evita vistas read-only)
train_df = train_df.copy(deep=True)

# Asegura que id_usuario esté como string (pero NO como feature)
train_df["id_usuario"] = train_df["id_usuario"].astype("string")

for c in CAT_COLS_X:
    if c in train_df.columns:
        train_df[c] = train_df[c].astype("string").copy()

for c in NUM_COLS_X:
    if c in train_df.columns:
        train_df[c] = pd.to_numeric(train_df[c], errors="coerce").astype("float64").copy()

# 2) Setup — sin SMOTE y sin paralelismo en la primera corrida
setup_cls = setup(
    data = train_df[["id_usuario"] + CAT_COLS_X + NUM_COLS_X + ["y_like"]].copy(),
    target = "y_like",
    fold = 5,
    fold_strategy = "groupkfold",
    fold_groups = "id_usuario",          # usa el id para agrupar
    categorical_features = CAT_COLS_X,
    ignore_features = ["id_usuario"],     # << clave: no lo pases como feature
    remove_multicollinearity = True,
    multicollinearity_threshold = 0.95,
    imputation_type = "simple",
    fix_imbalance = True,                # << desactivar por ahora
    n_jobs = 1,                           # << sin paralelismo (evita bug loky/writeable)
    verbose = False
)

best3 = compare_models(n_select=3, sort="AUC")
tuned = [tune_model(m, optimize="AUC") for m in best3]
blend = blend_models(tuned)
final_cls = finalize_model(blend)
save_model(final_cls, "modelo_cls_like_v2")

# Si esto corre OK, ya puedes volver a activar fix_imbalance=True y/o subir n_jobs

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8495,0.9223,0.7571,0.7831,0.7699,0.6581,0.6583,29.238
ridge,Ridge Classifier,0.8309,0.9217,0.8554,0.7016,0.7709,0.6389,0.6468,3.968
lda,Linear Discriminant Analysis,0.8307,0.9216,0.8552,0.7014,0.7707,0.6386,0.6465,4.704
lightgbm,Light Gradient Boosting Machine,0.8481,0.9215,0.7458,0.7866,0.7656,0.6534,0.6539,5.29
ada,Ada Boost Classifier,0.8394,0.9193,0.8177,0.7313,0.7721,0.6487,0.6511,10.932
catboost,CatBoost Classifier,0.8463,0.9192,0.743,0.7838,0.7628,0.6493,0.6498,101.59
xgboost,Extreme Gradient Boosting,0.8425,0.9158,0.7358,0.7786,0.7565,0.6403,0.6409,7.994
rf,Random Forest Classifier,0.8447,0.9082,0.7506,0.7753,0.7627,0.6473,0.6475,17.376
et,Extra Trees Classifier,0.8318,0.901,0.6568,0.8018,0.722,0.6031,0.6094,20.928
lr,Logistic Regression,0.7261,0.8166,0.7688,0.5649,0.6512,0.4343,0.4483,12.404


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8498,0.9239,0.7765,0.7744,0.7754,0.6626,0.6626
1,0.8435,0.9156,0.7563,0.7677,0.762,0.6454,0.6455
2,0.8498,0.9244,0.7478,0.7873,0.767,0.6563,0.6568
3,0.8494,0.9241,0.7755,0.7761,0.7758,0.6624,0.6624
4,0.8497,0.9191,0.7565,0.7828,0.7694,0.658,0.6582
Mean,0.8485,0.9214,0.7625,0.7777,0.7699,0.657,0.6571
Std,0.0025,0.0035,0.0115,0.0068,0.0052,0.0063,0.0063


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8299,0.9241,0.8676,0.6971,0.7731,0.6396,0.6494
1,0.8264,0.9168,0.8407,0.6975,0.7624,0.6276,0.6344
2,0.8348,0.9253,0.8609,0.7048,0.7751,0.6466,0.6547
3,0.8298,0.9245,0.8621,0.7006,0.773,0.6392,0.648
4,0.8314,0.9195,0.8502,0.7032,0.7698,0.6387,0.6458
Mean,0.8305,0.922,0.8563,0.7006,0.7707,0.6383,0.6465
Std,0.0027,0.0033,0.0096,0.0031,0.0045,0.0061,0.0067


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8307,0.9237,0.8668,0.6987,0.7737,0.641,0.6505
1,0.8262,0.9166,0.8394,0.6974,0.7618,0.6268,0.6335
2,0.8362,0.9251,0.8609,0.7072,0.7765,0.6491,0.657
3,0.8292,0.9239,0.8589,0.7005,0.7717,0.6375,0.6459
4,0.8315,0.9188,0.8513,0.703,0.7701,0.639,0.6463
Mean,0.8308,0.9216,0.8555,0.7014,0.7708,0.6387,0.6466
Std,0.0033,0.0033,0.0095,0.0035,0.005,0.0072,0.0077


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8325,0.0,0.862,0.7033,0.7746,0.6435,0.652
1,0.8279,0.0,0.8345,0.702,0.7626,0.6291,0.6349
2,0.8371,0.0,0.8553,0.7109,0.7764,0.65,0.6569
3,0.8317,0.0,0.856,0.7058,0.7737,0.6417,0.6493
4,0.8329,0.0,0.847,0.707,0.7707,0.641,0.6474
Mean,0.8324,0.0,0.851,0.7058,0.7716,0.6411,0.6481
Std,0.003,0.0,0.0095,0.0031,0.0049,0.0068,0.0073


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['edad', 'frecuencia_viaje',
                                              'presupuesto_estimado',
                                              'sitios_visitados',
                                              'calificacion_sitios_previos',
                                              'tiempo_estancia_promedio',
                                              'costo_entrada',
                                              'afluencia_promedio',
                                              'duracion_esperada',
                                              'admite_mascotas',
                                              'ratio_costo_presu',
                                              'afinidad_tipo'],
                                     transf...
                                                                fit_