# 📒 Notebook: Recomendador Villa de Leyva — entrenamiento, evaluación y simulación

## 0) Configuración inicial y utilidades (ejecuta primero)

In [1]:

# ===============================
# 0) Configuración y utilidades
# ===============================
import pandas as pd
import numpy as np
import unicodedata
from pathlib import Path

# --- Rutas y parámetros ---
ENC = "utf-8-sig"
SEP = ";"
DATA_PATH = "dataset_Recomendacion_villa_de_leyva_eleccion (2).csv"
CAT_PATH  = "catalogo_vdl_lugares_unico.csv"
RANDOM_SEED = 42
K_LIST = [3, 5, 10]
TEST_USER_FRAC = 0.20

# --- Normalizador de encabezados con caracteres raros ---
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* with known mis-encoded or accented headers renamed.

    Only headers that are actually present are renamed; every other column
    is left as is. The input frame is not modified (``rename`` returns a
    new frame).
    """
    known_fixes = (
        ("compa¤¡a_viaje", "compania_viaje"),
        ("‚poca_visita", "epoca_visita"),
        ("compañia_viaje", "compania_viaje"),
    )
    applicable = {}
    for broken, clean in known_fixes:
        if broken in df.columns:
            applicable[broken] = clean
    return df.rename(columns=applicable)

# --- Canonicalización de valores categóricos ---
# Synonym table consulted by canon() AFTER the text has been normalized
# (accents stripped, lower-cased, spaces replaced by underscores). Keys that
# still contain accents or spaces can therefore never match; they are kept
# so the table's contents stay unchanged for any other reader.
CANON_MAP = {
    "gastronomia": "gastronomico",
    "gastronomía": "gastronomico",
    "relax_fotografia": "relax_fotografia",
    "relax_fotografía": "relax_fotografia",
    "parque tematico": "parque_tematico",
    "enoturismo": "gastronomico",
    "historico": "centro_historico",
    "histórico": "centro_historico",
    "natural": "naturaleza",
    # useful additions:
    "cultura": "cultural",
    "historia": "cultural",
    "fotografia": "relax_fotografia",
}

def canon(s: str) -> str:
    """Return the canonical form of a categorical value.

    The value is converted with ``str()``, stripped of accents (NFKD + ASCII),
    trimmed, lower-cased and has its spaces replaced by underscores; the
    result is then looked up in ``CANON_MAP`` and returned unchanged when it
    has no synonym entry.
    """
    s = str(s)
    s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
    s = s.strip().lower().replace(" ", "_")
    return CANON_MAP.get(s, s)

def normalize_values(df_in: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df_in* with the key categorical columns canonicalized.

    Only the columns listed below are touched, and only when they exist.
    Missing values stay missing: casting with ``astype(str)`` alone would
    turn them into the literal categories "nan"/"None"/"<NA>", which hides
    them from any later missing-value imputation.
    """
    df2 = df_in.copy()
    # (restricted to the columns that exist in the current data)
    cols = ["tipo_turista_preferido", "tipo_sitio", "epoca_visita",
            "accesibilidad_general", "ubicacion_geografica"]
    for c in cols:
        if c not in df2.columns:
            continue
        present = df2[c].notna()
        canonical = df2[c].astype(str).map(canon)
        # Keep canonical text where a value existed; leave the rest as NaN.
        df2[c] = canonical.where(present)
    return df2

# --- Columnas base (ajustadas al CSV actual) ---
# Base categorical feature columns (user, trip and site attributes).
CAT_COLS = [
    "nacionalidad","origen","tipo_turista_preferido","compania_viaje",
    "nombre_sitio","tipo_sitio","accesibilidad_general",
    "ubicacion_geografica","epoca_visita","restricciones_movilidad","idioma_info","clima_predominante"
]
# Base numeric feature columns; the notebook coerces these to numbers before
# modelling.
NUM_COLS = [
    "edad","presupuesto_estimado","costo_entrada","admite_mascotas","frecuencia_viaje","sitios_visitados","calificacion_sitios_previos","tiempo_estancia_promedio","afluencia_promedio","duracion_esperada"
]

# --- Afinidad perfil × tipo (canónica) ---
# Affinity between a tourist profile (outer key) and a site type (inner key),
# both in canonical form. Pairs that are not listed fall back to 0.5 in
# make_features.
AFINIDAD = {
    "cultural":   {"museo":0.9,"centro_historico":0.9,"religioso":0.7,"arquitectura":0.85,"museo_religioso":0.8,"arqueologico":0.85,"plaza":0.7},
    "naturaleza": {"naturaleza":0.95,"senderismo":0.9,"mirador":0.8,"parque_urbano":0.6},
    "aventura":   {"aventura":0.95,"senderismo":0.85,"parque_tematico":0.7},
    "gastronomico":{"gastronomico":0.95,"enoturismo":0.9,"artesanal":0.6,"plaza":0.6},
    "relax_fotografia": {"mirador":0.9,"plaza":0.8,"arquitectura":0.8,"naturaleza":0.75},
}

# --- Feature engineering ---
def make_features(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *X* with the engineered features appended.

    Added columns:
      * ``ratio_costo_presu``: entry cost divided by 15% of the budget,
        clipped to [0, 3]; 0 when it cannot be computed.
      * ``afinidad_tipo``: AFINIDAD lookup for (tourist type, site type),
        0.5 when the pair is not listed.
      * ``x_tipoTur__tipoSit`` / ``x_epoca__tipoSit``: string interactions
        joined with "×".

    Input columns that are absent are treated as 0 (numeric) or "" (text)
    instead of raising.
    """
    X = X.copy()

    def _column(name, default):
        # A scalar default from DataFrame.get has no Series methods, so build
        # an index-aligned Series for missing columns.
        if name in X.columns:
            return X[name]
        return pd.Series(default, index=X.index)

    # Avoid division by zero and NaN: a zero budget yields NaN, then 0.
    presu = pd.to_numeric(_column("presupuesto_estimado", 0), errors="coerce")
    costo = pd.to_numeric(_column("costo_entrada", 0), errors="coerce")
    denom = (presu * 0.15).replace(0, np.nan)
    X["ratio_costo_presu"] = (costo / denom).clip(0, 3).fillna(0)

    tipo_tur = _column("tipo_turista_preferido", "").astype(str)
    tipo_sit = _column("tipo_sitio", "").astype(str)
    epoca = _column("epoca_visita", "").astype(str)

    # Pairwise dictionary lookup instead of a row-wise DataFrame.apply.
    afinidad = [AFINIDAD.get(t, {}).get(s, 0.5) for t, s in zip(tipo_tur, tipo_sit)]
    X["afinidad_tipo"] = pd.Series(afinidad, index=X.index, dtype="float64")
    X["x_tipoTur__tipoSit"] = tipo_tur + "×" + tipo_sit
    X["x_epoca__tipoSit"] = epoca + "×" + tipo_sit
    return X

# --- Columnas extendidas para modelado ---
# Base columns plus the engineered features produced by make_features:
# the two string interactions are categorical, the ratio and affinity numeric.
CAT_COLS_X = CAT_COLS + ["x_tipoTur__tipoSit","x_epoca__tipoSit"]
NUM_COLS_X = NUM_COLS + ["ratio_costo_presu","afinidad_tipo"]

# --- Helpers anti-fugas / duplicados (útiles más adelante) ---
# Columns that carry (or derive from) the target and must not be features.
LEAK_COLS = ["y_like","rating_usuario","sitio_recomendado"]

def drop_leaks(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* without any of the LEAK_COLS that it contains."""
    leaking = df.columns.intersection(LEAK_COLS)
    return df.drop(columns=leaking)

def dedupe_columns(df: pd.DataFrame, keep="last") -> pd.DataFrame:
    """Return a copy of *df* keeping one column per duplicated name.

    ``keep`` is forwarded to ``Index.duplicated`` and selects which occurrence
    survives (useful after concatenating user and catalog columns).
    """
    is_repeat = df.columns.duplicated(keep=keep)
    surviving_positions = np.flatnonzero(~is_repeat)
    return df.iloc[:, surviving_positions].copy()

# --- Métricas Top-K ---
def _dcg_at_k(rels):
    return float(np.sum([r/np.log2(i+2) for i, r in enumerate(rels)]))

def recall_at_k(g, k, score_col, rel_col):
    """Return the share of the group's relevance captured in the top *k* rows.

    Rows are ranked by ``score_col`` descending. NaN is returned when the
    group has no relevance at all (the denominator would be zero).
    """
    ranked = g.sort_values(score_col, ascending=False)
    total_relevant = ranked[rel_col].sum()
    if total_relevant == 0:
        return float("nan")
    found = ranked.head(k)[rel_col].sum()
    return float(found / total_relevant)

def ndcg_at_k(g, k, score_col, rel_col):
    """Return NDCG@k for one group ranked by ``score_col`` descending.

    The ideal DCG uses the group's relevances sorted from highest to lowest.
    NaN is returned when the ideal DCG is zero.
    """
    ranked = g.sort_values(score_col, ascending=False)
    relevances = ranked[rel_col].tolist()
    achieved = _dcg_at_k(relevances[:k])
    ideal = _dcg_at_k(sorted(relevances, reverse=True)[:k])
    if ideal == 0:
        return float("nan")
    return float(achieved / ideal)

def coverage_at_k(df, k, score_col, item_col="nombre_sitio", user_col="id_usuario"):
    """Return the fraction of distinct items that appear in any user's top *k*.

    Args:
        df: scored rows, one per (user, item) pair.
        k: number of top-scored rows kept per user.
        score_col: column used for ranking (descending).
        item_col: column that identifies the item.
        user_col: column that identifies the user (defaults to "id_usuario").

    Returns:
        Coverage as a float, or NaN when *df* contains no items, matching the
        NaN convention of recall_at_k / ndcg_at_k instead of dividing by zero.
    """
    n_items = df[item_col].nunique()
    if n_items == 0:
        return float("nan")
    topk = (df.sort_values([user_col, score_col], ascending=[True, False])
              .groupby(user_col).head(k))
    return float(topk[item_col].nunique() / n_items)


## 1) Carga, limpieza, features y etiqueta

In [2]:

# ===============================
# 1) Carga, limpieza y features
# ===============================
df_raw = pd.read_csv(DATA_PATH, sep=SEP, encoding=ENC)
# Strip header whitespace BEFORE renaming: normalize_columns matches exact
# header text, so a padded mis-encoded header would otherwise escape the fix.
df_raw.columns = df_raw.columns.str.strip()
df_raw = normalize_columns(df_raw)

# Canonicalize the key categorical values
df_raw = normalize_values(df_raw)

# Numeric coercion (only for columns that exist)
for c in ["costo_entrada","presupuesto_estimado","edad","frecuencia_viaje",
          "sitios_visitados","calificacion_sitios_previos","tiempo_estancia_promedio",
          "afluencia_promedio","duracion_esperada","admite_mascotas","rating_usuario"]:
    if c in df_raw.columns:
        df_raw[c] = pd.to_numeric(df_raw[c], errors="coerce")

# Obvious anti-leakage
df_raw = df_raw.drop(columns=["sitio_recomendado"], errors="ignore")

# Remove columns duplicated by name
df_raw = dedupe_columns(df_raw, keep="last")

# Minimal validation before feature engineering (what make_features and the
# model need)
REQUIRED = ["tipo_turista_preferido","tipo_sitio","epoca_visita",
            "presupuesto_estimado","costo_entrada","nombre_sitio","id_usuario"]
missing = [c for c in REQUIRED if c not in df_raw.columns]
if missing:
    raise KeyError(f"Faltan columnas obligatorias: {missing}")

# Features
df_feat = make_features(df_raw)

# Binary label (rating_usuario is kept as a reference column)
if "rating_usuario" not in df_feat.columns:
    raise KeyError("Falta 'rating_usuario' para crear y_like.")
# NOTE(review): a NaN rating compares False here and therefore becomes
# y_like = 0 — confirm that unrated rows should count as negatives.
df_feat["y_like"] = (df_feat["rating_usuario"] >= 4.0).astype(int)

# Keep id_usuario as a string for group-wise CV (never use it as a feature)
df_feat["id_usuario"] = df_feat["id_usuario"].astype("string")

print("Shape:", df_feat.shape)
df_feat.head(3)


Shape: (100000, 29)


Unnamed: 0,id_usuario,edad,nacionalidad,origen,tipo_turista_preferido,compania_viaje,frecuencia_viaje,restricciones_movilidad,presupuesto_estimado,sitios_visitados,...,idioma_info,ubicacion_geografica,clima_predominante,epoca_visita,rating_usuario,ratio_costo_presu,afinidad_tipo,x_tipoTur__tipoSit,x_epoca__tipoSit,y_like
0,U03462,40,Colombia,Duitama,gastronomico,familia,2,ninguna,572831,8,...,es,raquira,seco,fin_de_semana,3.9,0.081467,0.95,gastronomico×gastronomico,fin_de_semana×gastronomico,0
1,U13281,31,Colombia,Cali,naturaleza,solo,1,ninguna,191317,5,...,es,gachantiva,templado_humedo,fin_de_semana,3.6,0.420593,0.5,naturaleza×parque_tematico,fin_de_semana×parque_tematico,0
2,U12443,16,Colombia,Cúcuta,gastronomico,solo,2,ninguna,172745,21,...,es/en,villa_de_leyva,templado_seco,temporada_baja,2.9,0.077185,0.5,gastronomico×centro_historico,temporada_baja×centro_historico,0


## 2) Split honesto por usuario (train/test)

In [3]:
# ================================================
# 2) Split por usuario (sin fuga de información)
# ================================================
base = df_feat.copy()

# Make id_usuario usable as a grouping key and drop rows without one
base["id_usuario"] = base["id_usuario"].astype("string")
base = base[base["id_usuario"].notna()]

# Reproducible sample of held-out users
rng = np.random.default_rng(RANDOM_SEED)
users = base["id_usuario"].drop_duplicates().to_numpy()
n_test = max(1, int(round(len(users) * TEST_USER_FRAC)))
test_users = set(rng.choice(users, size=n_test, replace=False))

# User-level hold-out: every row of a user lands in exactly one partition
in_test = base["id_usuario"].isin(test_users)
train_df = base[~in_test].reset_index(drop=True)
test_df  = base[in_test].reset_index(drop=True)

print("Usuarios train/test:", train_df["id_usuario"].nunique(), "/", test_df["id_usuario"].nunique())
print("Filas train/test:", train_df.shape, test_df.shape)

# No user may appear in both partitions
overlap = set(train_df["id_usuario"]) & set(test_df["id_usuario"])
assert len(overlap) == 0, f"Fuga de usuarios entre splits: {len(overlap)}"

# (Optional) sanity check of the positive rate per split
if "y_like" in train_df.columns:
    train_rate = round(train_df["y_like"].mean(), 4)
    test_rate = round(test_df["y_like"].mean(), 4)
    print("Positivos (train/test):", train_rate, "/", test_rate)



Usuarios train/test: 16000 / 4000
Filas train/test: (80004, 29) (19996, 29)
Positivos (train/test): 0.3438 / 0.3456


## 3) Entrenamiento con PyCaret (compare → tune → blend → save)

In [4]:

# ========================================
# 3) Entrenamiento con PyCaret (robusto)
# ========================================
from pycaret.classification import (
    setup, compare_models, tune_model, blend_models,
    finalize_model, save_model, pull, get_config
)
import inspect

# Explicit dtypes: categoricals as pandas "string", numerics as float64
train_df = train_df.copy(deep=True)
train_df["id_usuario"] = train_df["id_usuario"].astype("string")

cat_present = [c for c in CAT_COLS_X if c in train_df.columns]
num_present = [c for c in NUM_COLS_X if c in train_df.columns]
for c in cat_present:
    train_df[c] = train_df[c].astype("string")
for c in num_present:
    train_df[c] = pd.to_numeric(train_df[c], errors="coerce").astype("float64")

# data_in: ONLY id_usuario + features + y_like (target last)
model_columns = ["id_usuario", *CAT_COLS_X, *NUM_COLS_X, "y_like"]
data_in = train_df[model_columns].copy()

# === 1) setup with kwargs filtered against the installed setup() signature ===
desired_kwargs = dict(
    data = data_in,
    target = "y_like",
    session_id = RANDOM_SEED,
    fold = 2,
    fold_strategy = "groupkfold",
    fold_groups = "id_usuario",                 # group CV folds by user
    categorical_features = CAT_COLS_X,
    ignore_features = ["id_usuario"],           # never use the id as a feature
    remove_multicollinearity = True,
    multicollinearity_threshold = 0.95,
    numeric_imputation = "mean",
    # "mode" is PyCaret's documented most-frequent strategy. PyCaret 3
    # documents any other string as a constant fill value, so the previous
    # "most_frequent" would have been used as the literal filler category.
    categorical_imputation = "mode",
    high_cardinality_features = ["nombre_sitio","x_tipoTur__tipoSit","x_epoca__tipoSit"],
    high_cardinality_method = "frequency",
    fix_imbalance = True,                       # if you see overfitting, try False
    n_jobs = 1,                                 # avoids loky / dtype issues
    verbose = False,
    use_gpu = False,
    html = True,                                # rich tables
)
sig = inspect.signature(setup)
allowed = set(sig.parameters.keys())
safe_kwargs = {k: v for k, v in desired_kwargs.items() if k in allowed}

# Report what the installed version does not accept instead of dropping it
# silently, so a missing preprocessing option is visible in the cell output.
dropped_kwargs = sorted(set(desired_kwargs) - allowed)
if dropped_kwargs:
    print("⚠️ setup() no acepta estos argumentos en esta versión; se omiten:", dropped_kwargs)

setup(**safe_kwargs)

# === 2) compare -> tune -> blend ===
best3 = compare_models(n_select=3, sort="AUC")
tuned = []
for candidate in best3:
    tuned.append(tune_model(candidate, optimize="AUC"))
blend = blend_models(tuned)

# === 3) finalize and persist the pipeline ===
MODEL_NAME = "modelo_cls_like_v3"
final_cls = finalize_model(blend)
save_model(final_cls, MODEL_NAME)
print(f"✅ Modelo guardado: {MODEL_NAME}")

# === 4) Post-checks to avoid problems at inference time ===
# 4.1) feature columns exposed by PyCaret's X_train
X_cols = get_config("X_train").columns.tolist()
print("Features usadas por el pipeline (prep):", len(X_cols))
pd.Series(X_cols).to_csv("features_usadas_por_modelo.csv", index=False, encoding="utf-8-sig")

# 4.2) sanity check: the target must NOT appear among the features
target_like_columns = ("y_like", "rating_usuario", "sitio_recomendado")
bad_targets = [c for c in target_like_columns if c in X_cols]
if bad_targets:
    print("⚠️ Aviso: el prep contiene columnas de target como features:", bad_targets,
          "\n   (seguirá funcionando con el parche de inferencia en 2 etapas,",
          "   pero considera re-entrenar para que no aparezcan ahí).")

# 4.3) save a minimal "manifest" for the simulation step
manifest = {
    "cat_cols_x": CAT_COLS_X,
    "num_cols_x": NUM_COLS_X,
    "features_prep_seen": X_cols,
    "model_name": MODEL_NAME,
    "fold_strategy": "groupkfold",
    "fold_groups": "id_usuario",
}
manifest_as_text = {key: str(value) for key, value in manifest.items()}
pd.Series(manifest_as_text).to_csv("manifest_modelo.csv", encoding="utf-8-sig")



Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7916,0.8552,0.6168,0.7344,0.6705,0.5198,0.524,28.19
lightgbm,Light Gradient Boosting Machine,0.7919,0.8552,0.6043,0.7425,0.6663,0.5174,0.5232,5.335
catboost,CatBoost Classifier,0.7881,0.8521,0.6121,0.7283,0.6652,0.5119,0.516,78.86
xgboost,Extreme Gradient Boosting,0.7825,0.8443,0.6118,0.7145,0.6591,0.5008,0.504,7.35
rf,Random Forest Classifier,0.7877,0.8439,0.6018,0.733,0.6609,0.5086,0.5138,22.965
ridge,Ridge Classifier,0.7738,0.8417,0.6874,0.6657,0.6763,0.5025,0.5027,5.885
lda,Linear Discriminant Analysis,0.7738,0.8416,0.6874,0.6657,0.6764,0.5026,0.5028,3.575
ada,Ada Boost Classifier,0.7783,0.8384,0.5778,0.7221,0.6417,0.484,0.4905,14.8
et,Extra Trees Classifier,0.7805,0.8243,0.5901,0.7209,0.649,0.4916,0.4969,12.99
lr,Logistic Regression,0.7161,0.7853,0.7091,0.5701,0.632,0.4053,0.4117,12.02


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7908,0.8555,0.6062,0.7363,0.6649,0.5149,0.52
1,0.7915,0.8536,0.6115,0.7395,0.6694,0.5192,0.5242
Mean,0.7912,0.8545,0.6088,0.7379,0.6672,0.517,0.5221
Std,0.0004,0.001,0.0026,0.0016,0.0023,0.0021,0.0021


Fitting 2 folds for each of 10 candidates, totalling 20 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7903,0.8551,0.6073,0.7342,0.6648,0.5141,0.519
1,0.7915,0.8555,0.6062,0.7425,0.6675,0.5178,0.5235
Mean,0.7909,0.8553,0.6068,0.7384,0.6661,0.516,0.5213
Std,0.0006,0.0002,0.0006,0.0042,0.0014,0.0018,0.0022


Fitting 2 folds for each of 10 candidates, totalling 20 fits
[LightGBM] [Info] Number of positive: 18334, number of negative: 18334
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020797 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 21262
[LightGBM] [Info] Number of data points in the train set: 36668, number of used features: 94
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 18413, number of negative: 18413
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020020 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 21583
[LightGBM] [Info] Number of data points in the train set: 36826, number of used features: 94
[LightGBM]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7939,0.8578,0.6092,0.7428,0.6694,0.5218,0.5272
1,0.7937,0.8564,0.6073,0.7476,0.6702,0.5224,0.5284
Mean,0.7938,0.8571,0.6083,0.7452,0.6698,0.5221,0.5278
Std,0.0001,0.0007,0.0009,0.0024,0.0004,0.0003,0.0006


Fitting 2 folds for each of 10 candidates, totalling 20 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7935,0.8573,0.6127,0.7396,0.6702,0.5218,0.5267
1,0.7929,0.8562,0.6097,0.7441,0.6702,0.5214,0.5269
Mean,0.7932,0.8567,0.6112,0.7418,0.6702,0.5216,0.5268
Std,0.0003,0.0006,0.0015,0.0023,0.0,0.0002,0.0001


Transformation Pipeline and Model Successfully Saved
✅ Modelo guardado: modelo_cls_like_v3
Features usadas por el pipeline (prep): 26


In [5]:
def _transform_sin_sampler(model_pipeline, X_df):
    """
    Aplica todos los pasos previos al estimador final, saltando cualquier paso
    que requiera 'y' (samplers como FixImbalancer/SMOTE) o que tenga fit_resample.
    Devuelve la matriz transformada (numpy o DataFrame según el último paso).
    """
    Xt = X_df
    # recorre todas las etapas menos la última (el estimador)
    for name, trans in model_pipeline.steps[:-1]:
        try:
            # si el paso tiene fit_resample, es sampler -> saltar en inferencia
            if hasattr(trans, "fit_resample"):
                continue
            # intenta transform(X)
            Xt = trans.transform(Xt)
        except TypeError as e:
            # algunos transform piden y: intenta con y=None
            try:
                Xt = trans.transform(Xt, None)
            except Exception:
                # si aun así truena, lo saltamos
                continue
    return Xt


## 4) Evaluación del modelo guardado en TEST

In [6]:
# =========================================
# 4) Evaluación en TEST (prep → estimador)
# =========================================
import numpy as np
import pandas as pd
from pycaret.classification import load_model, get_config
from sklearn.metrics import roc_auc_score

# --- Helper: aplica el prep del pipeline saltando samplers (no van en inferencia) ---
def _transform_sin_sampler(model_pipeline, X_df):
    """
    Recorre todas las etapas del pipeline menos el estimador final y aplica transformaciones.
    Si la etapa es un sampler (tiene fit_resample) se omite.
    Si una etapa pide y en transform(), intenta con y=None; si no aplica, se omite.
    Devuelve la matriz transformada (numpy/DF según el último transformador).
    """
    Xt = X_df
    for name, trans in model_pipeline.steps[:-1]:
        # 1) Omitir samplers / re-balanceadores
        if hasattr(trans, "fit_resample"):
            continue
        # 2) Intentar transform
        try:
            Xt = trans.transform(Xt)
        except TypeError:
            # algunos wrappers piden y en transform; intentamos con y=None
            try:
                Xt = trans.transform(Xt, None)
            except Exception:
                # si tampoco aplica a inferencia, se omite
                continue
    return Xt

# --- 1) Preprocess test exactly like train ---
test_proc = normalize_columns(test_df.copy(deep=True))
test_proc.columns = test_proc.columns.str.strip()
test_proc = make_features(normalize_values(test_proc))

# Consistent dtypes for the columns that exist
cat_in_test = [c for c in CAT_COLS_X if c in test_proc.columns]
num_in_test = [c for c in NUM_COLS_X if c in test_proc.columns]
for c in cat_in_test:
    test_proc[c] = test_proc[c].astype(object)
for c in num_in_test:
    test_proc[c] = pd.to_numeric(test_proc[c], errors="coerce").astype("float64")

# Rebuild the binary target only if it is absent and the rating is available
label_missing = "y_like" not in test_proc.columns
if label_missing and "rating_usuario" in test_proc.columns:
    ratings = pd.to_numeric(test_proc["rating_usuario"], errors="coerce")
    test_proc["y_like"] = (ratings >= 4).astype(int)

# --- 2) Load the model and isolate its final estimator ---
model = load_model("modelo_cls_like_v3")
assert hasattr(model, "steps") and len(model.steps) >= 1, "Pipeline inválido (sin steps)."
final_est = model.steps[-1][1]

# --- 3) Align columns with what the preprocessing saw during training ---
# The feature list was written as a one-column CSV with a header row.
# `squeeze=True` no longer exists in pandas 2.x, and the previous broad
# `except Exception` hid that failure, so the saved file was never used.
try:
    feat_used = (pd.read_csv("features_usadas_por_modelo.csv", encoding="utf-8-sig")
                   .iloc[:, 0].astype(str).tolist())
except (OSError, ValueError, IndexError):
    # fallback that only works inside the same PyCaret session
    feat_used = list(get_config("X_train").columns)

# reindex() creates all-NaN columns for absent features; make that visible.
missing_feats = [c for c in feat_used if c not in test_proc.columns]
if missing_feats:
    print("⚠️ Features ausentes en test (se rellenan con NaN):", missing_feats)

Xinfer = test_proc.reindex(columns=feat_used).copy()

# --- 4) Transform (skipping samplers) + scoring ---
def _positive_class_column(estimator):
    """Return the position of class 1 in ``estimator.classes_``, else -1."""
    if hasattr(estimator, "classes_"):
        matches = np.where(np.array(estimator.classes_) == 1)[0]
        if len(matches):
            return int(matches[0])
    return -1

X_trans = _transform_sin_sampler(model, Xinfer)

if hasattr(final_est, "predict_proba"):
    # probability of the positive class
    proba = final_est.predict_proba(X_trans)
    pos_idx = _positive_class_column(final_est)
    score = proba[:, pos_idx].astype(float)

elif hasattr(final_est, "decision_function"):
    # squash the decision margin into (0, 1) with a logistic function
    dfun = np.asarray(final_est.decision_function(X_trans), dtype=float)
    if dfun.ndim == 2 and dfun.shape[1] > 1:
        pos_idx = _positive_class_column(final_est)
        dfun = dfun[:, pos_idx]
    score = 1.0/(1.0+np.exp(-dfun))

else:
    # last resort: hard labels mapped to 0.0 / 1.0
    labels = final_est.predict(X_trans)
    is_positive = pd.Series(labels).astype(str).isin(["1","True","true"])
    score = is_positive.astype(float).values

# --- 5) Top-K metrics + AUC ---
df_scores = pd.DataFrame({
    "id_usuario": test_proc["id_usuario"].astype(str).values,
    "nombre_sitio": test_proc["nombre_sitio"].values,
    "score": score,
    "relevancia": test_proc["y_like"].astype(float).values
})

by_user = df_scores.groupby("id_usuario")
results = {}
for k in K_LIST:
    # per-user metric, then mean over users (NaN users are ignored)
    rec_k = by_user.apply(lambda g: recall_at_k(g, k, "score", "relevancia")).mean(skipna=True)
    ndcg_k = by_user.apply(lambda g: ndcg_at_k(g, k, "score", "relevancia")).mean(skipna=True)
    cov_k = coverage_at_k(df_scores, k, "score", "nombre_sitio")
    results[k] = {"Recall@K": rec_k, "NDCG@K": ndcg_k, "Coverage@K": cov_k}

eval_df = pd.DataFrame(results).T
print("\n=== Métricas Top-K en TEST ===")
print(eval_df.round(4))

try:
    auc_global = roc_auc_score(df_scores["relevancia"], df_scores["score"])
    print("\nAUC global (binario):", round(float(auc_global), 4))
except Exception as e:
    print("\nAUC no disponible:", e)

n_users_eval = df_scores["id_usuario"].nunique()
n_items_eval = df_scores["nombre_sitio"].nunique()
positive_rate = round(float(df_scores["relevancia"].mean()), 4)
print("\nUsuarios test eval:", n_users_eval)
print("Items únicos en test:", n_items_eval)
print("Tasa de positivos (y_like=1):", positive_rate)


Transformation Pipeline and Model Successfully Loaded

=== Métricas Top-K en TEST ===
    Recall@K  NDCG@K  Coverage@K
3     0.7662  0.7954         1.0
5     0.9313  0.8438         1.0
10    0.9992  0.8688         1.0

AUC global (binario): 0.8643

Usuarios test eval: 4000
Items únicos en test: 87
Tasa de positivos (y_like=1): 0.3456


## 5) Análisis adicionales (distribución de positivos, métricas por segmento, MAP/MRR)

In [7]:

# %%
# 5.1) Distribución de positivos por usuario
# Number of positive rows per test user, summarized by its quartiles
g = df_scores.groupby("id_usuario")["relevancia"].sum().rename("positivos_user")
resumen_positivos = {
    "min": int(g.min()),
    "p25": float(g.quantile(0.25)),
    "mediana": float(g.median()),
    "p75": float(g.quantile(0.75)),
    "max": int(g.max()),
}
print(resumen_positivos)


{'min': 0, 'p25': 0.0, 'mediana': 1.0, 'p75': 3.0, 'max': 11}


In [8]:

import numpy as np

# 5.2) Metrics per segment
SEG = "tipo_turista_preferido"  # also try 'epoca_visita', 'ubicacion_geografica', etc.

# df_scores was built row by row from test_proc, so the segment is attached
# positionally. Merging on id_usuario alone is many-to-many whenever a user
# has more than one value of SEG (always the case for row-level attributes),
# which duplicates score rows and distorts every per-segment metric.
if len(df_scores) != len(test_proc):
    raise ValueError(
        "df_scores y test_proc no están alineados fila a fila; "
        "vuelve a ejecutar la celda de evaluación."
    )
scores_seg = df_scores.copy()
scores_seg[SEG] = test_proc[SEG].to_numpy()

def eval_segment(seg_name):
    """Return ``{k: DataFrame}`` with mean Recall@K / NDCG@K per segment value.

    Reads the module-level ``scores_seg`` frame. Metrics are computed per
    (segment, user) group for k in 3, 5 and 10, then averaged per segment
    with NaN users ignored.
    """
    grouped = scores_seg.groupby([seg_name, "id_usuario"], sort=False)
    per_k = {}

    for k in (3, 5, 10):
        recall_by_user = grouped.apply(lambda g: recall_at_k(g, k, "score", "relevancia"))
        ndcg_by_user = grouped.apply(lambda g: ndcg_at_k(g, k, "score", "relevancia"))

        # level 0 of the hierarchical index is the segment value
        table = pd.DataFrame({
            "Recall@K": recall_by_user.groupby(level=0).agg(np.nanmean),
            "NDCG@K": ndcg_by_user.groupby(level=0).agg(np.nanmean),
        })
        per_k[k] = table.sort_index()

    return per_k

# Print one table per k, best NDCG first
seg_metrics = eval_segment(SEG)
for k in seg_metrics:
    dfk = seg_metrics[k]
    ranked_table = dfk.round(3).sort_values("NDCG@K", ascending=False)
    print(f"\n== {SEG} @ {k} ==")
    print(ranked_table)




== tipo_turista_preferido @ 3 ==
                        Recall@K  NDCG@K
tipo_turista_preferido                  
gastronomico               0.848   0.941
cultural                   0.764   0.842
relax_fotografia           0.715   0.635
aventura                   0.687   0.610
naturaleza                 0.700   0.604

== tipo_turista_preferido @ 5 ==
                        Recall@K  NDCG@K
tipo_turista_preferido                  
gastronomico               0.966   0.957
cultural                   0.938   0.882
relax_fotografia           0.889   0.710
naturaleza                 0.901   0.696
aventura                   0.877   0.692

== tipo_turista_preferido @ 10 ==
                        Recall@K  NDCG@K
tipo_turista_preferido                  
gastronomico               1.000   0.968
cultural                   0.999   0.902
relax_fotografia           1.000   0.754
aventura                   1.000   0.747
naturaleza                 0.998   0.737


In [9]:

# %%
# 5.3) Métricas extra: HitRate, MAP@K, MRR@K
def hitrate_at_k(g, k, score_col, rel_col):
    """Return 1.0 when the top *k* rows by score hold any relevance, else 0.0."""
    top_rows = g.sort_values(score_col, ascending=False).head(k)
    return float(top_rows[rel_col].sum() > 0)

def apk(g, k, score_col, rel_col):
    """Return the mean precision over the hit positions within the top *k*.

    Rows are ranked by score descending; 0.0 is returned when the top *k*
    holds no relevance.
    """
    ranked = g.sort_values(score_col, ascending=False)
    rels = ranked[rel_col].values[:k]
    if rels.sum() == 0:
        return 0.0
    hit_positions = np.flatnonzero(rels == 1)
    # precision at each position where a relevant item was found
    precisions = [rels[:pos + 1].sum() / (pos + 1) for pos in hit_positions]
    return float(np.mean(precisions))

def mrr_at_k(g, k, score_col, rel_col):
    """Return the reciprocal rank of the first relevant row in the top *k*.

    0.0 is returned when none of the top *k* rows has relevance equal to 1.
    """
    top_rows = g.sort_values(score_col, ascending=False).head(k)
    hit_positions = np.flatnonzero(top_rows[rel_col].values == 1)
    if hit_positions.size == 0:
        return 0.0
    return 1.0 / (hit_positions[0] + 1)

# Per-user metric averaged over all test users, for each cutoff
users_grouped = df_scores.groupby("id_usuario")
more = {}
for k in [3,5,10]:
    hr = users_grouped.apply(lambda g: hitrate_at_k(g,k,"score","relevancia")).mean()
    mapk = users_grouped.apply(lambda g: apk(g,k,"score","relevancia")).mean()
    mrrk = users_grouped.apply(lambda g: mrr_at_k(g,k,"score","relevancia")).mean()
    more[k] = {"HitRate@K":hr, "MAP@K":mapk, "MRR@K":mrrk}
pd.DataFrame(more).T.round(4)


Unnamed: 0,HitRate@K,MAP@K,MRR@K
3,0.6325,0.5527,0.5576
5,0.6665,0.5517,0.5655
10,0.677,0.5482,0.5671


## 6) Simulación: recomendaciones Top-N para un usuario nuevo (con diversidad)

In [10]:
# Diagnostic: compare the first transformed column names with the names the
# final estimator was fitted on, and check that the column counts agree.
# NOTE(review): assumes X_trans is a DataFrame and that final_est exposes
# feature_names_in_ — confirm for the estimator type in use.
print("Columnas X_trans:", X_trans.columns[:10])
print("Columnas que el modelo espera:", final_est.feature_names_in_[:10])
print("Cantidad de columnas (X_trans vs modelo):", X_trans.shape[1], len(final_est.feature_names_in_))


Columnas X_trans: Index(['nacionalidad_Reino Unido', 'nacionalidad_Colombia',
       'nacionalidad_España', 'nacionalidad_Canadá', 'nacionalidad_Chile',
       'nacionalidad_Brasil', 'nacionalidad_Perú', 'nacionalidad_México',
       'nacionalidad_Estados Unidos', 'nacionalidad_Italia'],
      dtype='object')
Columnas que el modelo espera: ['nacionalidad_Reino_Unido' 'nacionalidad_Colombia' 'nacionalidad_España'
 'nacionalidad_Canadá' 'nacionalidad_Chile' 'nacionalidad_Brasil'
 'nacionalidad_Perú' 'nacionalidad_México' 'nacionalidad_Estados_Unidos'
 'nacionalidad_Italia']
Cantidad de columnas (X_trans vs modelo): 94 94


In [12]:
# ============================================================
# 6) Simulación: recomendaciones Top-N para un usuario (MMR)
#    Versión con perfil mínimo + defaults e inferencia robusta
# ============================================================
# Default number of recommendations (default of recommend_for_user's top_n).
TOP_N = 5
# Default of recommend_for_user's diversity_lambda.
# NOTE(review): presumably forwarded to _mmr_diversify as the per-repeat type
# penalty; that call is outside this excerpt — confirm.
DIVERSITY_LAMBDA = 1.00

from pycaret.classification import load_model, get_config
import numpy as np
import pandas as pd

# --------------------------
# PERFIL BASE (defaults) + helper para fusionar perfiles cortos
# --------------------------
# Baseline profile used to fill in whatever the caller does not provide.
USER_DEFAULTS = {
    "id_usuario": "ANON",
    "nacionalidad": "Colombia",
    "origen": "Tunja",
    "tipo_turista_preferido": "cultura",
    "compania_viaje": "pareja",
    "restricciones_movilidad": "ninguna",
    "epoca_visita": "fin_de_semana",
    "presupuesto_estimado": 100000,
    "frecuencia_viaje": np.nan,
    "sitios_visitados": np.nan,
    "calificacion_sitios_previos": np.nan,
    "tiempo_estancia_promedio": np.nan,
    "edad": 30,
}

# the minimum set of fields requested from the UI / user input
MIN_KEYS = {"tipo_turista_preferido","compania_viaje","epoca_visita","presupuesto_estimado","edad"}

def build_user_profile(user_partial: dict) -> dict:
    """Merge a partial user profile with USER_DEFAULTS.

    Entries whose value is None are ignored. Any MIN_KEYS entry that ends up
    absent, None or an empty string is reset to its default. Keys that are
    not in USER_DEFAULTS pass through unchanged.
    """
    profile = dict(USER_DEFAULTS)
    for key, value in (user_partial or {}).items():
        if value is not None:
            profile[key] = value
    # basic back-fill when a required field was left blank
    for key in MIN_KEYS:
        if key not in profile or profile[key] in (None, ""):
            profile[key] = USER_DEFAULTS[key]
    return profile

# --- (si no lo usas, puedes borrar este helper) ---
def _transform_sin_sampler(model_pipeline, X_df):
    """
    Aplica transformaciones previas al estimador final, saltando samplers.
    (No se usa en la ruta recomendada porque inferimos con toda la Pipeline)
    """
    Xt = X_df
    for name, trans in model_pipeline.steps[:-1]:
        if hasattr(trans, "fit_resample"):
            continue
        try:
            Xt = trans.transform(Xt)
        except TypeError:
            try:
                Xt = trans.transform(Xt, None)
            except Exception:
                continue
    return Xt

def _ensure_catalog_schema(cat: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the catalog with every column the recommender needs.

    Missing numeric columns are created as 0 and missing text columns as "";
    numeric columns are coerced with ``pd.to_numeric`` (invalid values become
    NaN) and categorical values are canonicalized with normalize_values.

    Columns duplicated by name are collapsed first (last occurrence wins):
    selecting a duplicated name yields a DataFrame rather than a Series, which
    breaks the per-column operations below.
    """
    numeric_cols = ["costo_entrada","afluencia_promedio","duracion_esperada","admite_mascotas"]
    needed = [
        "nombre_sitio","tipo_sitio","ubicacion_geografica","clima_predominante",
        "costo_entrada","afluencia_promedio","accesibilidad_general",
        "duracion_esperada","admite_mascotas","idioma_info"
    ]
    cat = cat.loc[:, ~cat.columns.duplicated(keep='last')].copy()
    for col in needed:
        if col not in cat.columns:
            cat[col] = 0 if col in numeric_cols else ""
    for c in numeric_cols:
        cat[c] = pd.to_numeric(cat[c], errors="coerce")
    return normalize_values(cat)

def _broadcast_user_over_catalog(user_profile: dict, catalog: pd.DataFrame) -> pd.DataFrame:
    """Return one row per catalog item with the user's fields attached.

    The single-row user frame (canonicalized with normalize_values) is
    repeated once per catalog row and joined column-wise with the catalog.
    When a name exists on both sides the catalog column wins (last
    occurrence is kept).

    An empty catalog yields an empty frame that still carries all columns;
    concatenating an empty list of frames would raise instead.
    """
    user_df = normalize_values(pd.DataFrame([user_profile]))
    items = catalog.reset_index(drop=True)
    # Repeat row 0 len(items) times; positional selection also works for 0.
    user_expanded = user_df.iloc[[0] * len(items)].reset_index(drop=True)
    X = pd.concat([user_expanded, items], axis=1)
    # After this selection no duplicated names can remain, so no further
    # duplicate check is required.
    return X.loc[:, ~X.columns.duplicated(keep='last')].copy()

def _mmr_diversify(df_scored: pd.DataFrame, score_col: str, tipo_col: str = "tipo_sitio",
                   top_n: int = 5, lam: float = 0.25) -> pd.DataFrame:
    work = df_scored.copy()
    chosen_idx, type_counts = [], {}
    for _ in range(min(top_n, len(work))):
        penal = work[tipo_col].map(lambda t: lam * type_counts.get(t, 0))
        work["_adj"] = work[score_col] - penal
        pick = work["_adj"].idxmax()
        chosen_idx.append(pick)
        t = work.at[pick, tipo_col]
        type_counts[t] = type_counts.get(t, 0) + 1
        work = work.drop(index=pick)
    return df_scored.loc[chosen_idx]

def _positive_class_index(model) -> int:
    """Return the position of class ``1`` in ``model.classes_``.

    Falls back to ``-1`` (the last column) when the model exposes no
    ``classes_`` attribute or none of its classes equals ``1``.
    """
    if hasattr(model, "classes_"):
        hits = np.where(np.array(model.classes_) == 1)[0]
        if len(hits):
            return int(hits[0])
    return -1


def recommend_for_user(user_profile: dict, model_name: str = "modelo_cls_like_v3",
                       catalog_path: str = CAT_PATH, top_n: int = TOP_N,
                       diversity_lambda: float = DIVERSITY_LAMBDA) -> pd.DataFrame:
    """Score every catalog site for one user and return a diversified Top-N.

    Args:
        user_profile: Possibly partial profile; it is completed by
            ``build_user_profile`` before scoring.
        model_name: Name passed to ``load_model``.
        catalog_path: CSV catalog read with the file-level ``SEP`` / ``ENC``.
        top_n: Maximum number of recommendations returned.
        diversity_lambda: Per-repeat type penalty forwarded to
            ``_mmr_diversify``.

    Returns:
        A frame with columns ``nombre_sitio``, ``tipo_sitio``,
        ``ubicacion_geografica``, ``costo_entrada``, ``admite_mascotas``,
        ``idioma_info`` and ``score_like`` (rounded to 4 decimals), one row per
        recommended site, in recommendation order with a fresh 0..k-1 index.
    """
    # 1) Catalog + normalization. Header whitespace is stripped BEFORE the
    #    rename so that padded variants of the garbled headers are fixed too.
    cat = pd.read_csv(catalog_path, sep=SEP, encoding=ENC)
    cat.columns = cat.columns.str.strip()
    cat = normalize_columns(cat)
    cat = _ensure_catalog_schema(cat)

    # 2) Profile: accept partial input and let build_user_profile complete it.
    user_full = build_user_profile(user_profile)

    # 3) User x catalog product + engineered features.
    X = _broadcast_user_over_catalog(user_full, cat)
    X = make_features(X)
    X.columns = X.columns.map(str)

    # The user id is not a feature; drop it so it cannot clash with the model input.
    if "id_usuario" in X.columns:
        X = X.drop(columns=["id_usuario"])

    # 4) Consistent dtypes, in case make_features did not set them.
    for c in CAT_COLS_X:
        if c in X.columns:
            X[c] = X[c].astype(object)
    for c in NUM_COLS_X:
        if c in X.columns:
            X[c] = pd.to_numeric(X[c], errors="coerce").astype("float64")

    # 5) Load the model (full pipeline).
    model = load_model(model_name)

    # 6) Clean, strictly aligned input for the pipeline.
    Xinp = X.copy()
    Xinp = Xinp.loc[:, ~Xinp.columns.duplicated()].copy()
    Xinp.columns = Xinp.columns.map(str)

    # Expected feature names: the training frame from get_config when it is
    # available, otherwise the first pipeline step that recorded
    # feature_names_in_.
    # NOTE(review): get_config presumably reads the active experiment's state,
    # which may not belong to the model just loaded - confirm.
    expected = None
    try:
        expected = list(get_config("X_train").columns)
    except Exception:
        for _, step in getattr(model, "steps", []):
            if hasattr(step, "feature_names_in_"):
                expected = list(step.feature_names_in_)
                break

    if expected:
        expected = [str(c) for c in expected]
        extras = [c for c in Xinp.columns if c not in expected]
        missing = [c for c in expected if c not in Xinp.columns]
        if extras:
            Xinp = Xinp.drop(columns=extras, errors="ignore")
        # Features absent from the input are passed as NaN columns.
        for m in missing:
            Xinp[m] = np.nan
        Xinp = Xinp[expected]

    # 7) Score with the full pipeline: probability of class 1 when available,
    #    else a sigmoid of the decision function, else the hard 0/1 label.
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(Xinp)
        score = proba[:, _positive_class_index(model)].astype(float)
    elif hasattr(model, "decision_function"):
        dfun = np.asarray(model.decision_function(Xinp), dtype=float)
        if dfun.ndim == 2 and dfun.shape[1] > 1:
            dfun = dfun[:, _positive_class_index(model)]
        score = 1.0 / (1.0 + np.exp(-dfun))
    else:
        labels = model.predict(Xinp)
        score = (pd.Series(labels).astype(str).isin(["1", "True", "true"])).astype(float).values

    # 8) Output + diversified re-rank. Every column is passed as a plain array
    #    so rows pair up positionally; mixing index-carrying Series with arrays
    #    makes the constructor align on the Series index, which can fail when X
    #    has a non-default index and a fallback Series is used.
    blanks = pd.Series([""] * len(X))
    zeros = pd.Series([0] * len(X))
    out = pd.DataFrame({
        "nombre_sitio": X["nombre_sitio"].values,
        "tipo_sitio": X["tipo_sitio"].values,
        "ubicacion_geografica": X.get("ubicacion_geografica", blanks).values,
        "costo_entrada": pd.to_numeric(X.get("costo_entrada", zeros), errors="coerce").values,
        "admite_mascotas": pd.to_numeric(X.get("admite_mascotas", zeros), errors="coerce").values,
        "idioma_info": X.get("idioma_info", blanks).values,
        "score_like": score,
    })
    out = out.sort_values("score_like", ascending=False).reset_index(drop=True)
    out_div = _mmr_diversify(out, score_col="score_like", tipo_col="tipo_sitio",
                             top_n=top_n, lam=diversity_lambda).reset_index(drop=True)
    out_div["score_like"] = out_div["score_like"].round(4)
    return out_div

# --------------------------
# Test profiles (MINIMAL)
# --------------------------
# NOTE(review): the variable names say "cultural" / "naturaleza", but the
# profiles below set tipo_turista_preferido to "gastronomico" / "aventura".
# Confirm which one is intended before relying on these as examples.
usuario_cultural_min = {
    "tipo_turista_preferido": "gastronomico",
    "compania_viaje": "pareja",
    "epoca_visita": "temporada_alta",
    "presupuesto_estimado": 1000000,
    "edad": 20
    # 'restricciones_movilidad' is optional; per the original note it falls
    # back to "ninguna" when omitted (the default is set outside this cell).
}

usuario_naturaleza_min = {
    "tipo_turista_preferido": "aventura",
    "compania_viaje": "solo",
    "epoca_visita": "temporada_baja",
    "presupuesto_estimado": 120000,
    "edad": 36
}

# Show the recommendations for both minimal profiles using the default
# model, catalog, top_n and diversity settings of recommend_for_user.
print("\n=== Recomendaciones PERFIL 1 (mínimo) ===")
display(recommend_for_user(usuario_cultural_min))
print("\n=== Recomendaciones PERFIL 2 (mínimo) ===")
display(recommend_for_user(usuario_naturaleza_min))





=== Recomendaciones PERFIL 1 (mínimo) ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Calle de las Artesanías (Ráquira),gastronomico,raquira,7000.0,0.0,es,0.9708
1,Plaza Principal de Ráquira,plaza,raquira,2000.0,1.0,es,0.6118
2,Casa Terracota,arquitectura,villa_de_leyva,0.0,1.0,es,0.5177
3,Museo Paleontológico (Molino de la Osada),museo,villa_de_leyva,8306.0,0.0,es/en,0.4962
4,Paso del Ángel,centro_historico,santa_sofia,7217.0,1.0,es,0.4935



=== Recomendaciones PERFIL 2 (mínimo) ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Plaza Principal de Ráquira,plaza,raquira,2000.0,1.0,es,0.4589
1,Pueblito de los Artesanos,gastronomico,villa_de_leyva,0.0,1.0,es/en,0.4321
2,Museo Paleontológico (Molino de la Osada),museo,villa_de_leyva,8306.0,0.0,es/en,0.3557
3,Ecoparque Guatoc,parque_tematico,gachantiva,12070.0,1.0,es,0.3442
4,Casa Terracota,arquitectura,villa_de_leyva,0.0,1.0,es,0.3429


In [13]:
# --------------------------
# Batch of test profiles
# --------------------------

# Each entry is a (possibly partial) user profile dict that is passed as-is to
# recommend_for_user. The file's CANON_MAP maps "cultura" and "historia" to
# "cultural"; presumably that canonicalization is applied to these values
# before scoring - confirm against build_user_profile.
PERFILES_PRUEBA = [
    # --- MINIMAL (only 5-6 fields) ---
    {"tipo_turista_preferido": "cultura",      "compania_viaje": "solo",
     "epoca_visita": "fin_de_semana",          "presupuesto_estimado": 80000,   "edad": 22},

    {"tipo_turista_preferido": "aventura",     "compania_viaje": "pareja",
     "epoca_visita": "puente_festivo",         "presupuesto_estimado": 150000,  "edad": 29},

    {"tipo_turista_preferido": "gastronomico", "compania_viaje": "familia",
     "epoca_visita": "temporada_alta",         "presupuesto_estimado": 200000,  "edad": 35},

    {"tipo_turista_preferido": "naturaleza",   "compania_viaje": "grupo",
     "epoca_visita": "temporada_baja",         "presupuesto_estimado": 60000,   "edad": 27},

    {"tipo_turista_preferido": "historia",     "compania_viaje": "pareja",
     "epoca_visita": "fin_de_semana",          "presupuesto_estimado": 90000,   "edad": 41},

    # --- MINIMAL with budget/age extremes (stress the imputation) ---
    {"tipo_turista_preferido": "aventura",     "compania_viaje": "solo",
     "epoca_visita": "temporada_alta",         "presupuesto_estimado": 30000,   "edad": 18},  # low budget

    {"tipo_turista_preferido": "cultura",      "compania_viaje": "familia",
     "epoca_visita": "puente_festivo",         "presupuesto_estimado": 500000,  "edad": 68},  # high budget, older traveller

    # --- MINIMAL with mobility restrictions (everything else left to defaults) ---
    {"tipo_turista_preferido": "gastronomico", "compania_viaje": "pareja",
     "epoca_visita": "fin_de_semana",          "presupuesto_estimado": 120000,  "edad": 33,
     "restricciones_movilidad": "leve"},

    {"tipo_turista_preferido": "naturaleza",   "compania_viaje": "grupo",
     "epoca_visita": "puente_festivo",         "presupuesto_estimado": 110000,  "edad": 24,
     "restricciones_movilidad": "ninguna"},

    {"tipo_turista_preferido": "historia",     "compania_viaje": "familia",
     "epoca_visita": "temporada_baja",         "presupuesto_estimado": 140000,  "edad": 52,
     "restricciones_movilidad": "alta"},

    # --- "ADVANCED" (add optional fields to exercise the pipeline) ---
    {"tipo_turista_preferido": "cultura",      "compania_viaje": "pareja",
     "epoca_visita": "fin_de_semana",          "presupuesto_estimado": 130000,  "edad": 31,
     "frecuencia_viaje": 3, "sitios_visitados": 5, "calificacion_sitios_previos": 4.2,
     "tiempo_estancia_promedio": 2},

    {"tipo_turista_preferido": "naturaleza",   "compania_viaje": "solo",
     "epoca_visita": "temporada_alta",         "presupuesto_estimado": 95000,   "edad": 26,
     "frecuencia_viaje": 1, "sitios_visitados": 1, "calificacion_sitios_previos": 3.0,
     "tiempo_estancia_promedio": 1},

    {"tipo_turista_preferido": "gastronomico", "compania_viaje": "familia",
     "epoca_visita": "puente_festivo",         "presupuesto_estimado": 180000,  "edad": 45,
     "frecuencia_viaje": 2, "sitios_visitados": 8, "calificacion_sitios_previos": 4.8,
     "tiempo_estancia_promedio": 2},

    {"tipo_turista_preferido": "aventura",     "compania_viaje": "grupo",
     "epoca_visita": "temporada_baja",         "presupuesto_estimado": 70000,   "edad": 20,
     "frecuencia_viaje": 4, "sitios_visitados": 10, "calificacion_sitios_previos": 4.0,
     "tiempo_estancia_promedio": 3},

    {"tipo_turista_preferido": "historia",     "compania_viaje": "pareja",
     "epoca_visita": "fin_de_semana",          "presupuesto_estimado": 160000,  "edad": 38,
     "frecuencia_viaje": 2, "sitios_visitados": 3, "calificacion_sitios_previos": 2.9,
     "tiempo_estancia_promedio": 1},
]

# Run every profile (numbered from 1) and display its Top-N recommendations.
for i, perfil in enumerate(PERFILES_PRUEBA, 1):
    print(f"\n=== PERFIL #{i} ===")
    display(recommend_for_user(perfil))



=== PERFIL #1 ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Antigua Estación de Policía,centro_historico,villa_de_leyva,0.0,1.0,es,0.9183
1,Museo Paleontológico (Molino de la Osada),museo,villa_de_leyva,8306.0,0.0,es/en,0.882
2,Casa Terracota,arquitectura,villa_de_leyva,0.0,1.0,es,0.8416
3,Pueblito de los Artesanos,gastronomico,villa_de_leyva,0.0,1.0,es/en,0.6421
4,Plaza Principal de Ráquira,plaza,raquira,2000.0,1.0,es,0.5606



=== PERFIL #2 ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Pueblito de los Artesanos,gastronomico,villa_de_leyva,0.0,1.0,es/en,0.6334
1,Plaza Principal de Ráquira,plaza,raquira,2000.0,1.0,es,0.5811
2,Caminos Reales (tramo Villa de Leyva),centro_historico,villa_de_leyva,2433.0,1.0,es/en,0.5137
3,Casa Terracota,arquitectura,villa_de_leyva,0.0,1.0,es,0.5072
4,Museo Paleontológico (Molino de la Osada),museo,villa_de_leyva,8306.0,0.0,es/en,0.4755



=== PERFIL #3 ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Plaza de Mercado de Villa de Leyva,gastronomico,villa_de_leyva,5000.0,1.0,es,0.9581
1,Plaza Principal de Ráquira,plaza,raquira,2000.0,1.0,es,0.5986
2,Museo del Chocolate,museo,villa_de_leyva,10730.0,1.0,es/en,0.5102
3,Paso del Ángel,centro_historico,santa_sofia,7217.0,1.0,es,0.4702
4,Plazuela del Carmen,arquitectura,villa_de_leyva,2000.0,1.0,es/en,0.4454



=== PERFIL #4 ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Humedal El Carrizal,naturaleza,chiquiza,5225.0,1.0,es,0.4557
1,Plaza Principal de Ráquira,plaza,raquira,2000.0,1.0,es,0.4041
2,Calle de las Artesanías (Ráquira),gastronomico,raquira,7000.0,0.0,es,0.3874
3,Museo Paleontológico (Molino de la Osada),museo,villa_de_leyva,8306.0,0.0,es/en,0.3603
4,Casa Terracota,arquitectura,villa_de_leyva,0.0,1.0,es,0.3455



=== PERFIL #5 ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Antigua Estación de Policía,centro_historico,villa_de_leyva,0.0,1.0,es,0.9231
1,Museo Paleontológico (Molino de la Osada),museo,villa_de_leyva,8306.0,0.0,es/en,0.875
2,Casa Terracota,arquitectura,villa_de_leyva,0.0,1.0,es,0.8604
3,Pueblito de los Artesanos,gastronomico,villa_de_leyva,0.0,1.0,es/en,0.738
4,Plaza Principal de Ráquira,plaza,raquira,2000.0,1.0,es,0.6368



=== PERFIL #6 ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Pueblito de los Artesanos,gastronomico,villa_de_leyva,0.0,1.0,es/en,0.5008
1,Plaza Principal de Ráquira,plaza,raquira,2000.0,1.0,es,0.4953
2,Museo Paleontológico (Molino de la Osada),museo,villa_de_leyva,8306.0,0.0,es/en,0.448
3,Casa Terracota,arquitectura,villa_de_leyva,0.0,1.0,es,0.4244
4,Ecoparque Guatoc,parque_tematico,gachantiva,12070.0,1.0,es,0.4186



=== PERFIL #7 ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Antigua Estación de Policía,centro_historico,villa_de_leyva,0.0,1.0,es,0.931
1,Museo Paleontológico (Molino de la Osada),museo,villa_de_leyva,8306.0,0.0,es/en,0.8845
2,Casa Terracota,arquitectura,villa_de_leyva,0.0,1.0,es,0.8411
3,Plaza Principal de Ráquira,plaza,raquira,2000.0,1.0,es,0.6233
4,Plaza de Mercado de Villa de Leyva,gastronomico,villa_de_leyva,5000.0,1.0,es,0.5581



=== PERFIL #8 ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Pueblito de los Artesanos,gastronomico,villa_de_leyva,0.0,1.0,es/en,0.981
1,Portales de la Plaza (Los Portales),plaza,villa_de_leyva,0.0,1.0,es,0.7234
2,Molino del Mesopotamia,arquitectura,villa_de_leyva,2000.0,0.0,es,0.6195
3,Museo Comunitario veredal (Monquirá),museo,villa_de_leyva,10517.0,0.0,es,0.5888
4,Sendero Dunas y Secos,senderismo,villa_de_leyva,7368.0,1.0,es,0.5478



=== PERFIL #9 ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Humedal El Carrizal,naturaleza,chiquiza,5225.0,1.0,es,0.5556
1,Viñedo Umaña Dajud,gastronomico,sachica,3000.0,1.0,es,0.5151
2,Reserva Rogitama Biodiversidad,centro_historico,arcabuco,1735.0,0.0,es,0.4805
3,Museo Paleontológico (Molino de la Osada),museo,villa_de_leyva,8306.0,0.0,es/en,0.4656
4,Plaza Principal de Ráquira,plaza,raquira,2000.0,1.0,es,0.4615



=== PERFIL #10 ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Antigua Estación de Policía,centro_historico,villa_de_leyva,0.0,1.0,es,0.8979
1,Museo de Arte Religioso El Carmen,museo,villa_de_leyva,11913.0,0.0,es,0.8681
2,Molino del Mesopotamia,arquitectura,villa_de_leyva,2000.0,0.0,es,0.8072
3,Plaza Mayor de Villa de Leyva,plaza,villa_de_leyva,0.0,1.0,es,0.6126
4,Mirador Alto de los Migueles,mirador,villa_de_leyva,2000.0,1.0,es/en,0.4261



=== PERFIL #11 ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Antigua Estación de Policía,centro_historico,villa_de_leyva,0.0,1.0,es,0.9116
1,Casa Terracota,arquitectura,villa_de_leyva,0.0,1.0,es,0.8407
2,Museo Paleontológico (Molino de la Osada),museo,villa_de_leyva,8306.0,0.0,es/en,0.8288
3,Pueblito de los Artesanos,gastronomico,villa_de_leyva,0.0,1.0,es/en,0.6751
4,Plaza Principal de Ráquira,plaza,raquira,2000.0,1.0,es,0.5317



=== PERFIL #12 ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Plaza Principal de Ráquira,plaza,raquira,2000.0,1.0,es,0.0762
1,Humedal El Carrizal,naturaleza,chiquiza,5225.0,1.0,es,0.0755
2,Pueblito de los Artesanos,gastronomico,villa_de_leyva,0.0,1.0,es/en,0.0722
3,Museo Paleontológico (Molino de la Osada),museo,villa_de_leyva,8306.0,0.0,es/en,0.0581
4,Casa Terracota,arquitectura,villa_de_leyva,0.0,1.0,es,0.0533



=== PERFIL #13 ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Pueblito de los Artesanos,gastronomico,villa_de_leyva,0.0,1.0,es/en,0.9628
1,Plaza Principal de Ráquira,plaza,raquira,2000.0,1.0,es,0.5359
2,FIBAS Jardín de Desierto,centro_historico,villa_de_leyva,6771.0,1.0,es/en,0.476
3,Museo del Chocolate,museo,villa_de_leyva,10730.0,1.0,es/en,0.463
4,Plazuela del Carmen,arquitectura,villa_de_leyva,2000.0,1.0,es/en,0.4172



=== PERFIL #14 ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Plaza Principal de Ráquira,plaza,raquira,2000.0,1.0,es,0.4444
1,Pueblito de los Artesanos,gastronomico,villa_de_leyva,0.0,1.0,es/en,0.3824
2,Museo Paleontológico (Molino de la Osada),museo,villa_de_leyva,8306.0,0.0,es/en,0.2817
3,Casa Terracota,arquitectura,villa_de_leyva,0.0,1.0,es,0.2641
4,Ecoparque Guatoc,parque_tematico,gachantiva,12070.0,1.0,es,0.2565



=== PERFIL #15 ===
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,nombre_sitio,tipo_sitio,ubicacion_geografica,costo_entrada,admite_mascotas,idioma_info,score_like
0,Antigua Estación de Policía,centro_historico,villa_de_leyva,0.0,1.0,es,0.7303
1,Museo Paleontológico (Molino de la Osada),museo,villa_de_leyva,8306.0,0.0,es/en,0.6075
2,Casa Terracota,arquitectura,villa_de_leyva,0.0,1.0,es,0.5639
3,Pueblito de los Artesanos,gastronomico,villa_de_leyva,0.0,1.0,es/en,0.1796
4,Plaza Mayor de Villa de Leyva,plaza,villa_de_leyva,0.0,1.0,es,0.099
