## C4.5 Model - Hotel Booking Cancellation Prediction

In [1]:
#Importing required libraries
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [2]:
# Load the pre-processed feature matrices and their target vectors from CSV.

# Feature files are kept as DataFrames.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ("X_train_processed.csv", "X_test_processed.csv")
)

# Target files are read as frames and squeezed down to one dimension.
y_train, y_test = (
    pd.read_csv(csv_path).squeeze()
    for csv_path in ("y_train.csv", "y_test.csv")
)

# Shape report; labels are padded so the printed tuples line up.
for _label, _loaded in (
    ("X_train shape:", X_train),
    ("X_test shape: ", X_test),
    ("y_train shape:", y_train),
    ("y_test shape: ", y_test),
):
    print(_label, _loaded.shape)

# Quick sanity check of the feature dtypes.
print("\nDtypes of first few columns:")
print(X_train.dtypes.head())

X_train shape: (95512, 231)
X_test shape:  (23878, 231)
y_train shape: (95512,)
y_test shape:  (23878,)

Dtypes of first few columns:
0    float64
1    float64
2    float64
3    float64
4    float64
dtype: object


In [3]:
# Handle class imbalance by oversampling the training set with SMOTE.
# Only the training data is resampled; the test set is left untouched.

smote = SMOTE(random_state=42)  # fixed seed so the synthetic samples are reproducible
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling.
class_counts_before = y_train.value_counts()
class_counts_after = y_train_balanced.value_counts()

print("Before SMOTE:\n", class_counts_before)
print("\nAfter SMOTE:\n", class_counts_after)

Before SMOTE:
 is_canceled
0    60259
1    35253
Name: count, dtype: int64

After SMOTE:
 is_canceled
1    60259
0    60259
Name: count, dtype: int64


In [4]:
# Switch between the SMOTE-balanced training set and the original one.
USE_SMOTE = True

# Training data actually fed to the C4.5-style models below.
X_train_c45, y_train_c45 = (
    (X_train_balanced, y_train_balanced) if USE_SMOTE else (X_train, y_train)
)

In [5]:
# Baseline C4.5-style decision tree: entropy split criterion, no depth limit.

baseline_tree_params = {
    "criterion": "entropy",
    "random_state": 42,
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
}

# Train on the selected training set (fit() returns the estimator itself).
c45 = DecisionTreeClassifier(**baseline_tree_params).fit(X_train_c45, y_train_c45)

# Hard predictions plus the second predict_proba column, which is used as the
# score for ROC-AUC (the class-1 probability for the 0/1 labels seen above).
y_pred = c45.predict(X_test)
y_proba = c45.predict_proba(X_test)[:, 1]

# Compute the hold-out metrics first, then report them.
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_auc = roc_auc_score(y_test, y_proba)
baseline_confusion = confusion_matrix(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)

print("C4.5-style Tree")
print("Accuracy :", baseline_accuracy)
print("ROC-AUC  :", baseline_auc)
print("\nConfusion Matrix:\n", baseline_confusion)
print("\nClassification Report:\n", baseline_report)

C4.5-style Tree
Accuracy : 0.8465533126727531
ROC-AUC  : 0.8423360868297874

Confusion Matrix:
 [[12878  2029]
 [ 1635  7336]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.86      0.88     14907
           1       0.78      0.82      0.80      8971

    accuracy                           0.85     23878
   macro avg       0.84      0.84      0.84     23878
weighted avg       0.85      0.85      0.85     23878



In [None]:
#Hyperparameter tuning

# SMOTE has to run *inside* each cross-validation training fold. Fitting the
# search on the already-balanced X_train_c45 lets synthetic points that were
# interpolated from validation-fold rows end up in the training folds, which
# inflates the CV ROC-AUC and biases the search toward over-fit trees.
# The imblearn Pipeline resamples only while fitting; predict / predict_proba
# skip the sampler, so the tuned model is used exactly like a plain tree.
# NOTE: SMOTE is now re-run for every fit, so the search takes longer.
tuning_steps = []
if USE_SMOTE:
    tuning_steps.append(("smote", SMOTE(random_state=42)))
tuning_steps.append(
    ("clf", DecisionTreeClassifier(criterion="entropy", random_state=42))
)
c45_base = Pipeline(tuning_steps)

# Same search space as before; the "clf__" prefix routes each parameter to the
# tree step of the pipeline.
param_grid = {
    "clf__max_depth": [None, 5, 10, 15, 20],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 5],
    "clf__max_features": [None, "sqrt", "log2"]
}

grid_search_c45 = GridSearchCV(
    estimator=c45_base,
    param_grid=param_grid,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=2
)

# Fit on the original (un-resampled) training data: validation folds then hold
# only real bookings, and oversampling is applied to the training folds alone.
grid_search_c45.fit(X_train, y_train)

print("Best parameters:", grid_search_c45.best_params_)
print("Best CV ROC-AUC:", grid_search_c45.best_score_)


Fitting 3 folds for each of 135 candidates, totalling 405 fits


In [None]:
#Evaluating best C4.5 model

# Best configuration found by the grid search, already refit on the full
# training data by GridSearchCV (refit is left at its default).
best_c45 = grid_search_c45.best_estimator_

print("\nBest C4.5 model (estimator):")
# Previously printed `best_log_reg`, a name that is never defined in this
# notebook, so the cell failed with NameError before any evaluation ran.
print(best_c45)

# Hard predictions and class-1 scores on the untouched test set.
y_pred_best = best_c45.predict(X_test)
y_proba_best = best_c45.predict_proba(X_test)[:, 1]

print("\n")
print("Accuracy :", accuracy_score(y_test, y_pred_best))
print("ROC-AUC  :", roc_auc_score(y_test, y_proba_best))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))