In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score,
                             mean_absolute_error, r2_score)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
                              GradientBoostingClassifier, GradientBoostingRegressor)
from sklearn.neural_network import MLPClassifier, MLPRegressor
from xgboost import XGBClassifier, XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns

#### Train and Encode

In [3]:
train_df = pd.read_csv("../data/hotel_bookings.csv")
cat_cols = train_df.select_dtypes(include='object').columns
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    encoders[col] = le

Cancellation (Classification)

In [14]:
X_c = train_df.drop('is_canceled', axis=1)
X_c = X_c.fillna(-1)      # numeric NaNs to -1
X_c = X_c.astype(float)   # ensure numeric dtype after fill
y_c = train_df['is_canceled'].astype(int)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_cols = X_c.select_dtypes(include='number').columns
cat_cols = X_c.select_dtypes(exclude='number').columns

pre = ColumnTransformer([
    ('num', Pipeline([
        ('imp', SimpleImputer(strategy='median')),
        ('sc',  StandardScaler())            # optional
    ]), num_cols),
    ('cat', Pipeline([
        ('imp', SimpleImputer(strategy='most_frequent'))
    ]), cat_cols)
])

# rebuild the models dictionary with pipelines
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

Xc_tr, Xc_val, yc_tr, yc_val = train_test_split(X_c, y_c, test_size=0.2, random_state=42, stratify=y_c)

base_clfs = {
    'LogReg':  LogisticRegression(max_iter=1000),
    'RF'    :  RandomForestClassifier(n_estimators=200, random_state=42),
    'GB'    :  GradientBoostingClassifier(random_state=42),
    'XGB'   :  XGBClassifier(n_estimators=300, learning_rate=0.05,
                             max_depth=6, subsample=0.8,
                             eval_metric='logloss', random_state=42),
    'MLP'   :  MLPClassifier(hidden_layer_sizes=(64,32),
                             max_iter=300, random_state=42)
}

class_models = {
    name: Pipeline([('pre', pre), ('clf', clf)])
    for name, clf in base_clfs.items()
}

c_metrics = {}
for name, mdl in class_models.items():
    mdl.fit(Xc_tr, yc_tr)
    preds = mdl.predict(Xc_val)
    proba = mdl.predict_proba(Xc_val)[:,1]
    c_metrics[name] = {
        'ACC': accuracy_score(yc_val, preds),
        'F1' : f1_score(yc_val, preds),
        'AUC': roc_auc_score(yc_val, proba)
    }

best_clf_name = max(c_metrics, key=lambda k: c_metrics[k]['AUC'])
print("🏆 Best cancellation model:", best_clf_name)
print(pd.DataFrame(c_metrics).T)

🏆 Best cancellation model: RF
             ACC        F1       AUC
LogReg  0.988776  0.984630  0.974169
RF      0.999958  0.999943  1.000000
GB      1.000000  1.000000  1.000000
XGB     1.000000  1.000000  1.000000
MLP     1.000000  1.000000  1.000000


ADR (Classification)

In [20]:

# 1️⃣  keep rows with a valid, positive ADR
df_reg = train_df[(train_df['adr'].notna()) & (train_df['adr'] > 0)].copy()

# 2️⃣  features (fill NaN in X) and target (no NaN now)
X_r = df_reg.drop('adr', axis=1).fillna(-1)
y_r = np.log1p(df_reg['adr'])

# 3️⃣  train / validation split
Xr_tr, Xr_val, yr_tr, yr_val = train_test_split(
    X_r, y_r, test_size=0.2, random_state=42)

reg_models = {
    'LinReg': LinearRegression(),
    'RF'    : RandomForestRegressor(n_estimators=200, random_state=42),
    'GB'    : GradientBoostingRegressor(random_state=42),
    'XGB'   : XGBRegressor(n_estimators=300, learning_rate=0.05,
                           max_depth=6, subsample=0.8, random_state=42),
    'MLP'   : MLPRegressor(hidden_layer_sizes=(128,64),
                           max_iter=300, random_state=42)
}

r_metrics = {}
for name, mdl in reg_models.items():
    mdl.fit(Xr_tr, yr_tr)
    pred = np.expm1(mdl.predict(Xr_val))
    true = np.expm1(yr_val)
    r_metrics[name] = {
        'R2' : r2_score(true, pred),
        'MAE': mean_absolute_error(true, pred)
    }

best_reg_name = max(r_metrics, key=lambda k: r_metrics[k]['R2'])
print("🏆 Best ADR model →", best_reg_name)
print(pd.DataFrame(r_metrics).T)

🏆 Best ADR model → RF
              R2        MAE
LinReg  0.264343  25.824479
RF      0.599159   6.593829
GB      0.485593  16.319676
XGB     0.578913  10.220724
MLP     0.121199  30.557214
