## CART Model - Hotel Booking Cancellation Prediction

In [1]:
#Importing Libraries

import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

In [2]:
#Loading processed features and target

X_train = pd.read_csv("X_train_processed.csv")
X_test  = pd.read_csv("X_test_processed.csv")
y_train = pd.read_csv("y_train.csv").squeeze()
y_test  = pd.read_csv("y_test.csv").squeeze()

print("X_train shape:", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape: ", y_test.shape)

print("\nDtypes of first few columns:")
print(X_train.dtypes.head())

X_train shape: (95512, 231)
X_test shape:  (23878, 231)
y_train shape: (95512,)
y_test shape:  (23878,)

Dtypes of first few columns:
0    float64
1    float64
2    float64
3    float64
4    float64
dtype: object


In [3]:
#Handling class imbalance

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("Before SMOTE:\n", y_train.value_counts())
print("\nAfter SMOTE:\n", y_train_balanced.value_counts())

Before SMOTE:
 is_canceled
0    60259
1    35253
Name: count, dtype: int64

After SMOTE:
 is_canceled
1    60259
0    60259
Name: count, dtype: int64


In [4]:
USE_SMOTE = True

if USE_SMOTE:
    X_train_cart = X_train_balanced
    y_train_cart = y_train_balanced
else:
    X_train_cart = X_train
    y_train_cart = y_train

In [5]:
#CART model

cart = DecisionTreeClassifier(
    criterion="gini",
    random_state=42,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1
)

#Training the model
cart.fit(X_train_cart, y_train_cart)

#Predicting on test data
y_pred = cart.predict(X_test)
y_proba = cart.predict_proba(X_test)[:, 1]

#Evaluation
print("CART Baseline Results")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("ROC-AUC  :", roc_auc_score(y_test, y_proba))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

CART Baseline Results
Accuracy : 0.8448781304966915
ROC-AUC  : 0.8402571101532509

Confusion Matrix:
 [[12867  2040]
 [ 1664  7307]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.86      0.87     14907
           1       0.78      0.81      0.80      8971

    accuracy                           0.84     23878
   macro avg       0.83      0.84      0.84     23878
weighted avg       0.85      0.84      0.85     23878



In [6]:
#Hyperparameter tuning for CART

param_grid = {
    "max_depth": [None, 5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5],
    "max_features": [None, "sqrt", "log2"]
}

cart_base = DecisionTreeClassifier(
    criterion="gini",
    random_state=42
)

grid_search_cart = GridSearchCV(
    estimator=cart_base,
    param_grid=param_grid,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=2
)

grid_search_cart.fit(X_train_cart, y_train_cart)

print("Best parameters:", grid_search_cart.best_params_)
print("Best CV ROC-AUC:", grid_search_cart.best_score_)

Fitting 3 folds for each of 135 candidates, totalling 405 fits
Best parameters: {'max_depth': 15, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 2}
Best CV ROC-AUC: 0.9213021214434652


In [7]:
#Evaluatin Best Cart

best_cart = grid_search_cart.best_estimator_

print("\nBest Logistic Regression model (estimator):")
print(best_cart)

y_pred_best = best_cart.predict(X_test)
y_proba_best = best_cart.predict_proba(X_test)[:, 1]

print("Best CART Results on Test Set")
print("Accuracy :", accuracy_score(y_test, y_pred_best))
print("ROC-AUC  :", roc_auc_score(y_test, y_proba_best))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))


Best Logistic Regression model (estimator):
DecisionTreeClassifier(max_depth=15, min_samples_leaf=5, random_state=42)
Best CART Results on Test Set
Accuracy : 0.8412346092637575
ROC-AUC  : 0.9200908599167774

Confusion Matrix:
 [[12837  2070]
 [ 1721  7250]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.86      0.87     14907
           1       0.78      0.81      0.79      8971

    accuracy                           0.84     23878
   macro avg       0.83      0.83      0.83     23878
weighted avg       0.84      0.84      0.84     23878

