In [41]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss, f1_score
from xgboost import XGBClassifier
import seaborn as sns
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [42]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import log_loss, accuracy_score

In [43]:
from sklearn.naive_bayes import CategoricalNB


--- 
Data cleaning 
---

In [44]:
data = pd.read_csv("train_data.csv")
data_test = pd.read_csv('test_data.csv')

In [45]:
data = data.iloc[:, 1:-1]
data_test = data_test.iloc[:, 1: ]
data_test.head()

Unnamed: 0,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,...,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,row_id
0,Resort Hotel,335,2017,June,26,26,1,3,2,0.0,...,A,A,0,No Deposit,0,Transient,74.25,0,2,68898
1,City Hotel,71,2016,June,25,14,0,3,1,0.0,...,A,A,0,Non Refund,0,Transient,120.0,0,0,68899
2,Resort Hotel,207,2017,May,20,19,2,2,2,0.0,...,D,D,2,No Deposit,0,Transient,117.0,0,1,68900
3,City Hotel,109,2016,August,33,8,2,5,1,0.0,...,A,A,0,No Deposit,0,Transient,108.9,0,1,68901
4,City Hotel,28,2017,April,17,27,0,2,2,0.0,...,A,A,0,No Deposit,0,Transient,116.2,0,0,68902


In [46]:
data.head()

Unnamed: 0,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,...,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status
0,City Hotel,47,2016,March,13,20,2,2,2,0.0,...,A,D,0,No Deposit,0,Transient,85.0,0,0,2
1,Resort Hotel,221,2017,April,18,30,2,5,2,0.0,...,A,A,0,No Deposit,0,Transient-Party,71.43,0,0,1
2,City Hotel,11,2016,February,6,1,2,5,2,0.0,...,A,A,0,No Deposit,0,Transient,79.0,0,0,1
3,Resort Hotel,88,2015,November,48,28,2,4,2,0.0,...,A,A,0,No Deposit,0,Transient,32.4,0,0,1
4,Resort Hotel,250,2017,August,33,13,2,2,2,0.0,...,A,A,1,No Deposit,0,Transient,106.85,0,1,0


In [47]:
data_bis = data.copy()

In [48]:
data_bis['arrival_date_month'] = pd.to_datetime(data_bis['arrival_date_month'], format = '%B').dt.month
data_test['arrival_date_month'] =  pd.to_datetime(data_test['arrival_date_month'], format = '%B').dt.month
data_test.head()

Unnamed: 0,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,...,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,row_id
0,Resort Hotel,335,2017,6,26,26,1,3,2,0.0,...,A,A,0,No Deposit,0,Transient,74.25,0,2,68898
1,City Hotel,71,2016,6,25,14,0,3,1,0.0,...,A,A,0,Non Refund,0,Transient,120.0,0,0,68899
2,Resort Hotel,207,2017,5,20,19,2,2,2,0.0,...,D,D,2,No Deposit,0,Transient,117.0,0,1,68900
3,City Hotel,109,2016,8,33,8,2,5,1,0.0,...,A,A,0,No Deposit,0,Transient,108.9,0,1,68901
4,City Hotel,28,2017,4,17,27,0,2,2,0.0,...,A,A,0,No Deposit,0,Transient,116.2,0,0,68902


In [49]:
data_test = data_test.drop(columns=['row_id'])

In [50]:
### Get the columns to be processed by the previous pre-processing
non_numeric = data_bis.select_dtypes(exclude='number').columns
transform = [c for c in non_numeric if data_bis[c].unique().shape[0] >= 3]

### Get the columns to be one-hot encoded 
one_hot = [c for c in non_numeric if data_bis[c].unique().shape[0] < 3]

----

Hard to find any significant correlation with regards to the numerical values. We clearly see that the categorial variables are really significant here.

----

#  First strategy:

--- 
Description:
--- 

**Pipeline**: 


- Replace categorial data with certain statistial behaviour by the vector : (Pr(reservation status = 0 | country), Pr(reservation status = 1 | country), Pr(reservation status = 2 | country) smoothed out.
- Standard scaler for numerical values.
- Custom loss taking into account that predicting *check_out* and having *no_show* is something to be more penalized
  than any other error

In [51]:
train_df, test_df = train_test_split(data_bis, train_size=0.8, random_state=13)

In [52]:
train_df.shape, test_df.shape

((55118, 28), (13780, 28))

---

Data Transformation:
---

---

In the next sections, the following transformation will be applied to our data:

Let $X$ a categorical column which takes values in $\{x_1,...,x_p\}$ such that $p \ge 3$. If $X[i]$ is the value of $X$ at the row $i$, we
compute an estimate of the following probabilities:

$$
\hat{X}_{\text{check_out}}[i] = \hat{\mathbb{P}}(\text{check_out} | X = X[i])
$$

$$
\hat{X}_{\text{cancel}}[i] = \hat{\mathbb{P}}(\text{cancel} | X = X[i])
$$

$$
\hat{X}_{\text{no_show}}[i] = \hat{\mathbb{P}}(\text{no_show} | X = X[i])
$$

$$
\hat{n}[i] = \sum_{j = 1}^N \mathbb{1}_{X[j] = X[i]}
$$


One can replace the column $X$ by the column $\hat{X}_{\text{check_out}}, \hat{X}_{\text{cancel}}, \hat{X}_{\text{no_show}}, \hat{n}$


---
Advantages:

- Avoid one hot encoding where there are more than 3 different values that could be taken by a column
- Capture information related to the probability of an outcome knowing the value of the column
- Facilitate the learning : instead of heavily relying on the model to capture information, the features are added consciously
---
Drawbacks:

- Since there are some categories which are over-represented, and some under-represented, there is an unbalance.
- Relies heavily of the collected data, which means that a bias is likely to appear.
---
Find a compromise:

Compute the following estimate:

$$
\hat{p}_{\text{check_out}} = \hat{\mathbb{P}}(\text{check_out})
$$

$$
\hat{p}_{\text{cancel}} = \hat{\mathbb{P}}(\text{cancel})
$$

$$
\hat{p}_{\text{no_show}} = \hat{\mathbb{P}}(\text{no_show})
$$

Then transform data in the following way:

Let $X[j]$ for value of $X$ in the $j^{th}$ row, we update the follwing rows

$$
\hat{X}_{\text{check_out}}[j] \leftarrow \frac{n[j] \times \hat{X}_{\text{check_out}}[j] + K \times \hat{p}_{\text{check_out}}}{n[j] + K}
$$

$$
\hat{X}_{\text{cance}}[j] \leftarrow \frac{n[j] \times \hat{X}_{\text{cancel}}[j] + K \times \hat{p}_{\text{cancel}}}{n[j] + K}
$$

$$
\hat{X}_{\text{no_show}}[j] \leftarrow \frac{n[j] \times \hat{X}_{\text{no_show}}[j] + K \times \hat{p}_{\text{no_show}}}{n[j] + K}
$$

Where $K$ is a smothing hyper-parameter
 
---

For the categorical variables having unique values less or equal to two should be one-hot encoded.

---

In [53]:
### Get the columns to be processed by the previous pre-processing
non_numeric = data_bis.select_dtypes(exclude='number').columns
transform = [c for c in non_numeric if data_bis[c].unique().shape[0] >= 3]

### Get the columns to be one-hot encoded 
one_hot = [c for c in non_numeric if data_bis[c].unique().shape[0] < 3]

train_df = pd.get_dummies(train_df, columns = one_hot, drop_first=True, dtype=int)
test_df = pd.get_dummies(test_df, columns = one_hot, drop_first=True, dtype=int)

Pre-processing of the *transform* columns:
---


---
Create a Cross-Validation dataset *folds* from *train_df* on which we train multiple models on train datasets
and tune an expert aggregation on the validation sets.

---

In [47]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
num_classes = train_df['reservation_status'].nunique()

In [48]:
K = 5
folds = []
for train_idx, val_idx in kf.split(train_df, train_df['reservation_status']):
    ### Split folds:
    t = train_df.iloc[train_idx].copy()
    s = train_df.iloc[val_idx].copy()
    u = test_df.copy()
    
    
    ### Get global variables:
    g_probs = pd.DataFrame(t['reservation_status'].value_counts(normalize = True).reset_index())


    for k in range(len(transform)):
    
        c = transform[k] 

        ### first phase: compute conditional probabilities and n
        probs = t.groupby(c)['reservation_status'].value_counts(normalize = True).unstack().reset_index()
        probs = probs.fillna(0)
        cols = [f'{c}_{col}' for col in probs.columns if col != c]
        cols = [c] + cols

        probs = probs.merge(pd.DataFrame(t.groupby(c).size()), on = c, how = 'left')
        probs.columns = cols + [f'{c}_count']

        

        ### second phase : smothed probabilities:
        for j in t['reservation_status'].unique():
            gp = g_probs.loc[(g_probs['reservation_status'] == j), 'proportion'].values[0]
            probs[f'{c}_{j}'] = (probs[f'{c}_{j}'] * probs[f'{c}_count'] +  gp * K) / (probs[f'{c}_count'] + K)
        

        probs = probs.drop(f'{c}_count', axis = 1)
        

        t = t.merge(probs, on = c, how = 'left')
        t = t.drop(c, axis = 1)
        
        
        
        # merge into val fold
        s = s.merge(probs, on=c, how='left')
        for j in t['reservation_status'].unique():
            gp = g_probs.loc[g_probs['reservation_status']==j, 'proportion'].values[0]
            s[f'{c}_{j}'] = s[f'{c}_{j}'].fillna(gp)
        s = s.drop(c, axis = 1)
        
        
        u = u.merge(probs, on = c, how = 'left')
        for j in t['reservation_status'].unique():
            gp = g_probs.loc[g_probs['reservation_status']==j, 'proportion'].values[0]
            u[f'{c}_{j}'] = u[f'{c}_{j}'].fillna(gp)
        u = u.drop(c, axis = 1)
        
        
        
    
    # Prepare X, y
    y_train = t['reservation_status']
    X_train = t.drop(columns=['reservation_status'])
    y_val = s['reservation_status']
    X_val = s.drop(columns=['reservation_status'])
    y_test = u['reservation_status']
    X_test = u.drop(columns = 'reservation_status')
    
    
    # Scale numeric features for linear models
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    folds.append(
        (X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test)
    )

In [62]:
# === More diverse base models ===
# models = {
#     "fast_rf" : HistGradientBoostingClassifier(max_depth=8, learning_rate=0.1, max_iter=200, random_state=42),
#     "RandomForest": RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
#     "ExtraTrees": ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=42),
#     "LogisticRegression": LogisticRegression(max_iter=1000, multi_class='multinomial', random_state=42),
#     "GradientBoosting": GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42),
#     "LightGBM": LGBMClassifier(n_estimators=200, num_leaves=31, learning_rate=0.1, random_state=42),
#     "CatBoost": CatBoostClassifier(n_estimators=200, depth=6, learning_rate=0.1, verbose=0, random_state=42)

# }

models = {
    "fast_rf" : HistGradientBoostingClassifier(max_depth=8, learning_rate=0.1, max_iter=200, random_state=42),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, multi_class='multinomial', random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    "CatBoost": CatBoostClassifier(n_estimators=200, depth=6, learning_rate=0.1, verbose=0, random_state=42)

}

---
Learning phase:
---

Train each model of the transformed data

In [63]:
# Containers
oof_preds = {name: [] for name in models.keys()}
oof_labels = []
trained_models_per_fold = []
fold_test_sets = []

for fold_idx, (X_train, y_train, X_val, y_val, X_test_fold, y_test_fold) in enumerate(folds):
    print(f"\n=== Fold {fold_idx+1} ===")
    
    fold_test_sets.append((X_test_fold, y_test_fold))
    
    trained_models = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        trained_models[name] = model

        val_pred = np.clip(model.predict_proba(X_val), 1e-15, 1-1e-15)
        oof_preds[name].append(val_pred)

        print(f"{name}: Fold log-loss = {log_loss(y_val, val_pred):.4f}")
        
    trained_models_per_fold.append(trained_models)
    oof_labels.append(y_val)



=== Fold 1 ===
fast_rf: Fold log-loss = 0.3124
ExtraTrees: Fold log-loss = 0.4331
LogisticRegression: Fold log-loss = 0.4244
RandomForest: Fold log-loss = 0.3750
CatBoost: Fold log-loss = 0.3338

=== Fold 2 ===
fast_rf: Fold log-loss = 0.3159
ExtraTrees: Fold log-loss = 0.4346
LogisticRegression: Fold log-loss = 0.4310
RandomForest: Fold log-loss = 0.3759
CatBoost: Fold log-loss = 0.3371

=== Fold 3 ===
fast_rf: Fold log-loss = 0.3139
ExtraTrees: Fold log-loss = 0.4331
LogisticRegression: Fold log-loss = 0.4276
RandomForest: Fold log-loss = 0.3762
CatBoost: Fold log-loss = 0.3386

=== Fold 4 ===
fast_rf: Fold log-loss = 0.3167
ExtraTrees: Fold log-loss = 0.4440
LogisticRegression: Fold log-loss = 0.4329
RandomForest: Fold log-loss = 0.3800
CatBoost: Fold log-loss = 0.3383

=== Fold 5 ===
fast_rf: Fold log-loss = 0.3044
ExtraTrees: Fold log-loss = 0.4348
LogisticRegression: Fold log-loss = 0.4288
RandomForest: Fold log-loss = 0.3735
CatBoost: Fold log-loss = 0.3293


---
Meta-learner: train phase
---

Train a meta-learner for an expert aggregation

In [68]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import log_loss, f1_score
import pandas as pd
import numpy as np

# === Construction du jeu méta ===
# On empile les prédictions OOF (une par modèle de base)
meta_features = []

for name, preds_list in oof_preds.items():
    # Concatène les folds verticalement
    stacked_preds = np.vstack(preds_list)
    
    # Renomme les colonnes pour garder une trace du modèle et de la classe
    cols = [f"{name}_class{i}" for i in range(stacked_preds.shape[1])]
    
    meta_features.append(pd.DataFrame(stacked_preds, columns=cols))

# Fusionne toutes les features OOF
meta_X = pd.concat(meta_features, axis=1)

# Les vraies étiquettes associées
meta_y = np.concatenate(oof_labels)

print(f"Meta-features shape: {meta_X.shape}")
print(f"Meta-labels shape: {meta_y.shape}")

# === Meta-learner ===
meta_model = HistGradientBoostingClassifier(
    max_iter=300,
    learning_rate=0.05,
    max_depth=5,
    l2_regularization=0.1,
    early_stopping=True,
    validation_fraction=0.1,
    random_state=42
)

# Entraînement du meta-learner
print("\n=== Training Meta-Learner ===")
meta_model.fit(meta_X, meta_y)

# Évaluation sur les données OOF (sur lesquelles il a été entraîné)
meta_preds = meta_model.predict_proba(meta_X)
meta_preds = np.clip(meta_preds, 1e-15, 1 - 1e-15)

meta_logloss = log_loss(meta_y, meta_preds)
meta_f1 = f1_score(meta_y, np.argmax(meta_preds, axis=1), average='weighted')

print(f"\n=== Meta-Learner Performance (OOF) ===")
print(f"Log-loss: {meta_logloss:.4f}")
print(f"Weighted F1: {meta_f1:.4f}")


Meta-features shape: (55118, 15)
Meta-labels shape: (55118,)

=== Training Meta-Learner ===

=== Meta-Learner Performance (OOF) ===
Log-loss: 0.2846
Weighted F1: 0.8711


---
Testing phase: see how effective the aggregation:
---

Testing on the testing sets if the aggregation performed better than each model.

In [71]:
from sklearn.metrics import log_loss, f1_score
import numpy as np
import pandas as pd

# === Test final : prédiction avec le meta-learner sur les fold test sets ===
logloss_scores = []
f1_scores = []

for fold_idx, (trained_models, (X_test_fold, y_test_fold)) in enumerate(zip(trained_models_per_fold, fold_test_sets)):
    print(f"\n=== Fold {fold_idx+1} Test Evaluation ===")
    
    fold_base_preds = []
    
    for name, model in trained_models.items():
        preds_proba = np.clip(model.predict_proba(X_test_fold), 1e-15, 1-1e-15)
        cols = [f"{name}_class{i}" for i in range(preds_proba.shape[1])]
        fold_base_preds.append(pd.DataFrame(preds_proba, columns=cols))
    
    X_meta_test_fold = pd.concat(fold_base_preds, axis=1)
    
    test_meta_proba = meta_model.predict_proba(X_meta_test_fold)
    test_meta_preds = np.argmax(test_meta_proba, axis=1)
    
    test_logloss = log_loss(y_test_fold, test_meta_proba)
    test_f1 = f1_score(y_test_fold, test_meta_preds, average='weighted')
    
    logloss_scores.append(test_logloss)
    f1_scores.append(test_f1)
    
    print(f"Meta-learner - Fold log-loss: {test_logloss:.4f} | Weighted F1: {test_f1:.4f}")

# === Statistiques globales sur toutes les folds ===
print("\n=== Meta-learner - Global Test Performance ===")
print(f"Log-loss: mean = {np.mean(logloss_scores):.4f}, std = {np.std(logloss_scores):.4f}")
print(f"Weighted F1: mean = {np.mean(f1_scores):.4f}, std = {np.std(f1_scores):.4f}")



=== Fold 1 Test Evaluation ===
Meta-learner - Fold log-loss: 0.3320 | Weighted F1: 0.8478

=== Fold 2 Test Evaluation ===
Meta-learner - Fold log-loss: 0.3362 | Weighted F1: 0.8485

=== Fold 3 Test Evaluation ===
Meta-learner - Fold log-loss: 0.3401 | Weighted F1: 0.8465

=== Fold 4 Test Evaluation ===
Meta-learner - Fold log-loss: 0.3329 | Weighted F1: 0.8505

=== Fold 5 Test Evaluation ===
Meta-learner - Fold log-loss: 0.3084 | Weighted F1: 0.8656

=== Meta-learner - Global Test Performance ===
Log-loss: mean = 0.3299, std = 0.0111
Weighted F1: mean = 0.8518, std = 0.0070


---
Analysis:
---

First we se that Random Forest like models perform well. So we can choose to stick only to these kind of models. Then, what is to be noticed is the fact that the aggregator trained *across* the folds, that is, an aggregator that looks at the *universal* performence of *type* of model, does better than each model separately. The idea of looking across the folds imposed itself as a necessity to reduce bias.

---
Adapt strategy to the entire dataset:
---

In [None]:
### Split folds:
t = train_df.copy()
s = test_df.copy()



### Get global variables:
g_probs = pd.DataFrame(t['reservation_status'].value_counts(normalize = True).reset_index())


for k in range(len(transform)):

    c = transform[k] 

    ### first phase: compute conditional probabilities and n
    probs = t.groupby(c)['reservation_status'].value_counts(normalize = True).unstack().reset_index()
    probs = probs.fillna(0)
    cols = [f'{c}_{col}' for col in probs.columns if col != c]
    cols = [c] + cols

    probs = probs.merge(pd.DataFrame(t.groupby(c).size()), on = c, how = 'left')
    probs.columns = cols + [f'{c}_count']



    ### second phase : smothed probabilities:
    for j in t['reservation_status'].unique():
        gp = g_probs.loc[(g_probs['reservation_status'] == j), 'proportion'].values[0]
        probs[f'{c}_{j}'] = (probs[f'{c}_{j}'] * probs[f'{c}_count'] +  gp * K) / (probs[f'{c}_count'] + K)


    probs = probs.drop(f'{c}_count', axis = 1)


    t = t.merge(probs, on = c, how = 'left')
    t = t.drop(c, axis = 1)



    # merge into val fold
    s = s.merge(probs, on=c, how='left')
    for j in t['reservation_status'].unique():
        gp = g_probs.loc[g_probs['reservation_status']==j, 'proportion'].values[0]
        s[f'{c}_{j}'] = s[f'{c}_{j}'].fillna(gp)
    s = s.drop(c, axis = 1)


    




# Prepare X, y
y_train = t['reservation_status']
X_train = t.drop(columns=['reservation_status'])
y_test = s['reservation_status']
X_test = s.drop(columns=['reservation_status'])

# Scale numeric features for linear models
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
trained_models_full = {}
meta_features_train_full = []

print("=== Phase 1: Retrain base models on full training set ===")
for name, model in models.items():
    print(f"Training {name}")
    model.fit(X_train_scaled, y_train)  # full training set
    trained_models_full[name] = model

    # Predictions for meta-learner (train set)
    preds_train = np.clip(model.predict_proba(X_train_scaled), 1e-15, 1-1e-15)
    meta_features_train_full.append(preds_train)

    # === Optional: Evaluate this base model on test set ===
    preds_test = np.clip(model.predict_proba(X_test_scaled), 1e-15, 1-1e-15)
    preds_test_labels = np.argmax(preds_test, axis=1)
    
    test_logloss = log_loss(y_test, preds_test)
    test_f1 = f1_score(y_test, preds_test_labels, average='weighted')
    
    print(f"{name} - Test log-loss: {test_logloss:.4f} | F1-weighted: {test_f1:.4f}")

print("Phase 1 completed.\n")


In [None]:
meta_features_test = []

print("=== Phase 2: Predict base model probabilities on test set ===")
for name, model in trained_models_full.items():
    preds_test = np.clip(model.predict_proba(X_test_scaled), 1e-15, 1-1e-15)
    meta_features_test.append(preds_test)

# Stack predictions for meta-learner
X_meta_test = np.hstack(meta_features_test)

print("Phase 2 completed.\n")


In [None]:
print("=== Phase 3: Aggregate predictions with universal meta-learner ===")

# Use the meta-learner learned from CV
test_meta_proba = meta_model.predict_proba(X_meta_test)
test_meta_preds = np.argmax(test_meta_proba, axis=1)

from sklearn.metrics import log_loss, f1_score
test_logloss = log_loss(y_test, test_meta_proba)
test_f1 = f1_score(y_test, test_meta_preds, average='weighted')

print(f"Aggregated model - Test log-loss: {test_logloss:.4f} | F1-weighted: {test_f1:.4f}")


# Final:

---
Without data transformation:
---

In [54]:
train_df, test_df = train_test_split(data_bis, train_size=0.95, random_state=13)
y_train = train_df['reservation_status']
X_train = train_df.drop(columns = ['reservation_status'])
X_train = pd.get_dummies(X_train, columns=one_hot, drop_first=True, dtype=int)
data_test = pd.get_dummies(data_test, columns=one_hot, drop_first = True, dtype=int)

In [55]:
X_train.shape, y_train.shape

((65453, 27), (65453,))

In [56]:
X_online, X_test = train_test_split(test_df, train_size = 0.5)

y_online = X_online['reservation_status']
X_online = X_online.drop(columns = ['reservation_status'])
X_online = pd.get_dummies(X_online, columns=one_hot, drop_first=True, dtype=int)

y_test = X_test['reservation_status']
X_test = X_test.drop(columns = ['reservation_status'])
X_test = pd.get_dummies(X_test, columns=one_hot, drop_first=True, dtype=int)

X_online.shape, X_test.shape, y_online.shape, y_test.shape

((1722, 27), (1723, 27), (1722,), (1723,))

In [57]:
hb = HistGradientBoostingClassifier(max_depth=8, learning_rate=0.1, max_iter=200, random_state=42)

### Split folds:
t = X_train.copy()
t['reservation_status'] = y_train
s = X_test.copy()
u = X_online.copy()
hb_final_test = data_test.copy()

In [58]:
K = 5
### Get global variables:
g_probs = pd.DataFrame(t['reservation_status'].value_counts(normalize = True).reset_index())


for k in range(len(transform)):

    c = transform[k] 

    ### first phase: compute conditional probabilities and n
    probs = t.groupby(c)['reservation_status'].value_counts(normalize = True).unstack().reset_index()
    probs = probs.fillna(0)
    cols = [f'{c}_{col}' for col in probs.columns if col != c]
    cols = [c] + cols

    probs = probs.merge(pd.DataFrame(t.groupby(c).size()), on = c, how = 'left')
    probs.columns = cols + [f'{c}_count']



    ### second phase : smothed probabilities:
    for j in t['reservation_status'].unique():
        gp = g_probs.loc[(g_probs['reservation_status'] == j), 'proportion'].values[0]
        probs[f'{c}_{j}'] = (probs[f'{c}_{j}'] * probs[f'{c}_count'] +  gp * K) / (probs[f'{c}_count'] + K)


    probs = probs.drop(f'{c}_count', axis = 1)


    t = t.merge(probs, on = c, how = 'left')
    t = t.drop(c, axis = 1)



    # merge into val fold
    s = s.merge(probs, on=c, how='left')
    for j in t['reservation_status'].unique():
        gp = g_probs.loc[g_probs['reservation_status']==j, 'proportion'].values[0]
        s[f'{c}_{j}'] = s[f'{c}_{j}'].fillna(gp)
    s = s.drop(c, axis = 1)


    # merge into val fold
    u = u.merge(probs, on=c, how='left')
    for j in t['reservation_status'].unique():
        gp = g_probs.loc[g_probs['reservation_status']==j, 'proportion'].values[0]
        u[f'{c}_{j}'] = u[f'{c}_{j}'].fillna(gp)
    u = u.drop(c, axis = 1)
    
    
    hb_final_test = hb_final_test.merge(probs, on=c, how='left')
    for j in t['reservation_status'].unique():
        gp = g_probs.loc[g_probs['reservation_status']==j, 'proportion'].values[0]
        hb_final_test[f'{c}_{j}'] = hb_final_test[f'{c}_{j}'].fillna(gp)
    hb_final_test = hb_final_test.drop(c, axis = 1)
    
t = t.drop(columns = ['reservation_status'])

In [59]:
name = "Histo"

# Scale numeric features for linear models
scaler = StandardScaler()
hb_train = scaler.fit_transform(t)
hb_test = scaler.transform(s)
hb_online = scaler.transform(u)
hb_final_test = scaler.transform(hb_final_test)


hb.fit(hb_train, y_train)
proba = hb.predict_proba(hb_test)
preds = np.argmax(proba, axis=1)
loss = log_loss(y_test, proba)
f1 = f1_score(y_test, preds, average='weighted')
print(f"{name} - Test log-loss: {loss:.4f} | F1-weighted: {f1:.4f}")

Histo - Test log-loss: 0.3100 | F1-weighted: 0.8620


In [66]:
CatBoost = CatBoostClassifier(verbose = 0, random_state = 42)

In [67]:
lgbm = lgb.LGBMClassifier(
    boosting_type='gbdt',   # classique, basé sur histogramme
    n_estimators=100,
    max_depth=7,
    num_leaves=31,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=42
)

In [68]:
categorical_cols = X_train.select_dtypes(exclude='number').columns.to_list()
categorical_cols

['meal',
 'country',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'assigned_room_type',
 'deposit_type',
 'customer_type']

In [69]:
for col in categorical_cols:
    X_train[col] = X_train[col].astype('category')
    X_online[col] = X_online[col].astype('category')
    X_test[col] = X_test[col].astype('category')

In [70]:
numeric_cols = X_train.select_dtypes(exclude='category').columns.tolist()
X_train[numeric_cols] = X_train[numeric_cols].astype(float)
X_test[numeric_cols] = X_test[numeric_cols].astype(float)
numeric_cols

['lead_time',
 'arrival_date_year',
 'arrival_date_month',
 'arrival_date_week_number',
 'arrival_date_day_of_month',
 'stays_in_weekend_nights',
 'stays_in_week_nights',
 'adults',
 'children',
 'babies',
 'is_repeated_guest',
 'previous_cancellations',
 'previous_bookings_not_canceled',
 'booking_changes',
 'days_in_waiting_list',
 'adr',
 'required_car_parking_spaces',
 'total_of_special_requests',
 'hotel_Resort Hotel']

In [71]:
name = "CatBoost"
CatBoost.fit(X_train, y_train, cat_features=categorical_cols)
proba = CatBoost.predict_proba(X_test)
preds = np.argmax(proba, axis=1)
loss = log_loss(y_test, proba)
f1 = f1_score(y_test, preds, average='weighted')
print(f"{name} - Test log-loss: {loss:.4f} | F1-weighted: {f1:.4f}")

CatBoost - Test log-loss: 0.3085 | F1-weighted: 0.8660


In [72]:
name = "LGBM"
lgbm.fit(X_train, y_train, categorical_feature = categorical_cols)
proba = lgbm.predict_proba(X_test)
preds = np.argmax(proba, axis=1)
loss = log_loss(y_test, proba)
f1 = f1_score(y_test, preds, average='weighted')
print(f"{name} - Test log-loss: {loss:.4f} | F1-weighted: {f1:.4f}")



LGBM - Test log-loss: 0.3148 | F1-weighted: 0.8602


In [73]:
models = {
    'catboost': CatBoost,
    'lgbm' : lgbm,
    'hb' : hb
}

In [74]:
# base_models: dict of trained models (XGBoost, LightGBM, CatBoost, etc.)
# X_online, y_online: small slice for online learning
# X_test, y_test: final test set

eta = 0.5  # learning rate for exponential weighting
n_experts = len(models)

# Ensure labels are NumPy arrays for positional indexing
y_online = np.asarray(y_online)
y_test   = np.asarray(y_test)

weights = np.ones(n_experts) / n_experts  # initial uniform weights


In [75]:
expert_probas_online = []

for name, model in models.items():
    if name == 'hb':
        proba = model.predict_proba(hb_online)
    else:
        proba = model.predict_proba(X_online)
    expert_probas_online.append(proba)

expert_probas_online = np.array(expert_probas_online)  # shape: (n_experts, n_samples, n_classes)


In [79]:
# Collect expert predictions on test set
expert_probas_test = []
for name, model in models.items():
    if name == 'hb':
        proba = model.predict_proba(hb_test)
    else:
        proba = model.predict_proba(X_test)
    expert_probas_test.append(proba)

expert_probas_test = np.array(expert_probas_test)

# Normalize final weights
final_weights = weights / weights.sum()

# Weighted aggregation for test set
agg_probas_test = np.tensordot(final_weights, expert_probas_test, axes=(0, 0))  # shape (n_samples, n_classes)
agg_preds_test = np.argmax(agg_probas_test, axis=1)

# Test evaluation
print("Test set log-loss:", log_loss(y_test, agg_probas_test))
print("Test set F1-weighted:", f1_score(y_test, agg_preds_test, average='weighted'))


Test set log-loss: 0.30402657019077556
Test set F1-weighted: 0.8641351183805687


In [None]:
for col in categorical_cols:
    data_test[col] = data_test[col].astype('category')

In [83]:
expert_probas_test = []
for name, model in models.items():
    if name == 'hb':
        proba = model.predict_proba(hb_final_test)
    else:
        proba = model.predict_proba(X_test)
    expert_probas_test.append(proba)

expert_probas_test = np.array(expert_probas_test)
# Weighted aggregation for test set
agg_probas_test = np.tensordot(final_weights, expert_probas_test, axes=(0, 0))  # shape (n_samples, n_classes)
agg_preds_test = np.argmax(agg_probas_test, axis=1)


Test set log-loss: 0.2994246203229059
Test set F1-weighted: 0.8704161752594727


In [None]:
f = pd.read_csv('test_data.csv')['row_id']
f

In [None]:
res = pd.DataFrame(f)
res['reservation_status'] = agg_preds_test

res = res.set_index('row_id')
res.to_csv('result.csv')

In [None]:
res.to_csv("result.csv")