In [2]:
import pandas as pd

# Leer archivos CSV
bookings = pd.read_csv("bookings_train.csv")
hotels = pd.read_csv("hotels.csv")

# Convertir hotel_id a string para evitar problemas de tipos
bookings["hotel_id"] = bookings["hotel_id"].astype(str)
hotels["hotel_id"] = hotels["hotel_id"].astype(str)

# Hacer el merge
df = bookings.merge(hotels, on="hotel_id", how="left")

# Ver las primeras filas y columnas disponibles
print(df.shape)
print(df.columns.tolist())
df


(50741, 22)
['board', 'country_x', 'market_segment', 'distribution_channel', 'room_type', 'required_car_parking_spaces', 'special_requests', 'stay_nights', 'rate', 'total_guests', 'hotel_id', 'arrival_date', 'booking_date', 'reservation_status_date', 'reservation_status', 'hotel_type', 'country_y', 'parking', 'total_rooms', 'restaurant', 'pool_and_spa', 'avg_review']


Unnamed: 0,board,country_x,market_segment,distribution_channel,room_type,required_car_parking_spaces,special_requests,stay_nights,rate,total_guests,...,booking_date,reservation_status_date,reservation_status,hotel_type,country_y,parking,total_rooms,restaurant,pool_and_spa,avg_review
0,BB,SPA,Corporate,Corporate,A,,1.0,1.0,65.00,3.0,...,2016-12-09,2016-12-13,Check-Out,City Hotel,SPA,False,126,True,False,4.75
1,,SPA,Corporate,Corporate,A,,1.0,2.0,130.00,3.0,...,2016-10-17,2016-10-19,Check-Out,City Hotel,SPA,False,126,True,False,4.75
2,BB,SPA,Corporate,Corporate,A,,1.0,1.0,65.00,2.0,...,2016-09-30,2016-10-04,Check-Out,City Hotel,SPA,True,98,True,True,4.04
3,BB,SPA,Corporate,Corporate,A,,1.0,1.0,65.00,3.0,...,2016-07-22,2016-07-26,Check-Out,City Hotel,SPA,False,126,True,False,4.75
4,BB,SPA,Corporate,Corporate,A,,0.0,2.0,130.00,,...,2016-07-09,2016-07-12,Check-Out,City Hotel,SPA,True,98,True,True,4.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50736,HB,POR,Direct,Direct,D,1.0,,5.0,1350.00,4.0,...,2016-07-30,2016-08-27,Check-Out,Resort Hotel,POR,True,64,True,True,4.66
50737,HB,SPA,Direct,Direct,E,1.0,0.0,7.0,1642.20,4.0,...,2016-07-10,2016-08-27,Check-Out,Resort Hotel,POR,True,64,True,True,4.66
50738,HB,SPA,Offline TA/TO,TA/TO,A,,1.0,7.0,869.40,4.0,...,2016-04-23,2016-08-27,Check-Out,Resort Hotel,SPA,False,45,True,False,4.54
50739,HB,SPA,Online TA,TA/TO,D,1.0,,6.0,1183.20,4.0,...,2016-07-03,2016-08-27,Check-Out,Resort Hotel,POR,True,64,True,True,4.66


In [5]:
import pandas as pd

# ------------------------------
# Función base_transform
# ------------------------------
def base_transform(df):
    df_hoteles = pd.read_csv("hotels.csv")
    df["hotel_id"] = df["hotel_id"].astype(str)
    df_hoteles["hotel_id"] = df_hoteles["hotel_id"].astype(str)

    df = df.merge(df_hoteles, on="hotel_id", how="left", suffixes=('', '_hotel'))

    # Fechas y variables nuevas
    df["arrival_date"] = pd.to_datetime(df["arrival_date"], errors="coerce")
    df["booking_date"] = pd.to_datetime(df["booking_date"], errors="coerce")
    df["days_in_advance"] = (df["arrival_date"] - df["booking_date"]).dt.days
    df["revenue"] = df["rate"] * df["stay_nights"]
    df["rel_days_stay"] = df["days_in_advance"] / (df["stay_nights"] + 1)

    df["country"] = df["country_x"]
    df["country_hotel"] = df["country_y"]
    df["country"] = df["country"].fillna("Unknown")
    df["country_hotel"] = df["country_hotel"].fillna("Unknown")
    df["is_foreign"] = (df["country"] != df["country_hotel"]).astype(int)

    df["extras"] = (
            df["pool_and_spa"].astype(int) +
            df["restaurant"].astype(int) +
            df["parking"].astype(int)
    )

    # Filtro de outliers y valores inválidos
    q05 = df["rate"].quantile(0.05)
    q95 = df["rate"].quantile(0.95)
    df = df[df["rate"].between(q05, q95)]
    df = df[df["reservation_status"] != "Booked"]
    df = df[df["total_guests"] != 0]

    df = df.reset_index(drop=True)
    return df

# ------------------------------
# Función get_target
# ------------------------------
def get_target(df):
    df = base_transform(df)

    df["reservation_status_date"] = pd.to_datetime(df["reservation_status_date"], errors="coerce")
    delta_dias = (df["arrival_date"] - df["reservation_status_date"]).dt.days

    y = ((df["reservation_status"] == "Canceled") & (delta_dias <= 30) & (delta_dias >= 0)).astype(int)
    return y


In [6]:
# Creamos copia del DataFrame limpio
X = df.copy()

# Transformaciones adicionales
X["rel_days_stay"] = X["days_in_advance"] / (X["stay_nights"] + 1)
X["is_foreign"] = (X["country_x"] != X["country_y"]).astype(int)
X["extras"] = (
        X["pool_and_spa"].astype(int) +
        X["restaurant"].astype(int) +
        X["parking"].astype(int)
)

# Eliminamos columnas que no son útiles como input
cols_to_drop = [
    "arrival_date", "booking_date", "reservation_status_date",
    "country_x", "country_y", "reservation_status", "hotel_id"
]
X = X.drop(columns=cols_to_drop)

# Confirmamos la forma
print(X.shape)
print(X.columns.tolist())


KeyError: 'days_in_advance'

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Separar por tipo de variable
cat_cols = ["board", "market_segment", "distribution_channel", "room_type", "hotel_type"]
num_cols = [
    "required_car_parking_spaces", "special_requests", "stay_nights", "rate",
    "total_guests", "total_rooms", "avg_review", "days_in_advance", "revenue",
    "rel_days_stay"
]
bin_cols = ["parking", "restaurant", "pool_and_spa", "is_foreign", "extras"]

# Pipelines para cada tipo
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

bin_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent"))  # por si acaso hay nulos
])

# ColumnTransformer con todo
preprocessor = ColumnTransformer([
    ("cat", cat_pipe, cat_cols),
    ("num", num_pipe, num_cols),
    ("bin", bin_pipe, bin_cols)
])


In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.linear_model import LogisticRegression
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline

def get_pipeline(clf):
    drop_columns = [
        "arrival_date", "booking_date", "country", "country_hotel",
        "hotel_id", "parking", "pool_and_spa", "restaurant"
    ]

    col_remover = ColumnTransformer(
        transformers=[("drop", "drop", drop_columns)],
        remainder="passthrough",
        verbose_feature_names_out=False
    )

    cat_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop="first", sparse_output=False, handle_unknown='ignore'))
    ])

    num_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    transformer = ColumnTransformer(
        transformers=[
            ('num', num_transformer, selector(dtype_exclude="object")),
            ('cat', cat_transformer, selector(dtype_include="object"))
        ],
        verbose_feature_names_out=True
    )

    pipe = ImbPipeline(steps=[
        ('col', col_remover),
        ('cast', FunctionTransformer(lambda x: x.infer_objects(), validate=False)),
        ('transformer', transformer),
        ('variance_threshold', VarianceThreshold()),
        ('resampler', SMOTETomek(random_state=42)),
        ('select_features', SelectFromModel(
            LogisticRegression(penalty="l1", solver="liblinear", max_iter=5000)
        )),
        ('clf', clf)
    ])

    return pipe



In [8]:
from sklearn.model_selection import cross_validate, StratifiedKFold, KFold
from sklearn.base import clone
import numpy as np

X = base_transform(df.copy())
y = get_target(df.copy())

pipe = get_pipeline()

# Cambiado a F1-score
scoring_dict = {
    "accuracy": "accuracy",
    "f1": "f1"
}

print("🚧 Probando el pipeline fuera de la validación cruzada...")
try:
    pipe.fit(X, y)
    print("✅ Pipeline entrenado correctamente fuera de CV.")
except Exception as e:
    print(f"❌ Error en el ajuste del pipeline:\n{e}")

# Validación cruzada con F1-score
for cv in (KFold(3, shuffle=True, random_state=42), StratifiedKFold(3, shuffle=True, random_state=42)):
    print(f"\n🔁 Cross-Validation: {cv.__class__.__name__}")

    try:
        cv_score = cross_validate(
            clone(pipe),
            X,
            y,
            scoring=scoring_dict,
            cv=cv,
            return_train_score=True,
            n_jobs=-1
        )

        print(f"✅ Accuracy Train: {np.mean(cv_score['train_accuracy']):.4f} // Test: {np.mean(cv_score['test_accuracy']):.4f}")
        print(f"📊 F1 Train: {np.mean(cv_score['train_f1']):.4f} // Test: {np.mean(cv_score['test_f1']):.4f}")

    except Exception as e:
        print(f"❌ Error en la validación cruzada:\n{e}")


TypeError: get_pipeline() missing 1 required positional argument: 'clf'