## Logistic Regression - Hotel Booking Cancellation Prediction

In [1]:
#Importing Libraries

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

In [2]:
#Loading processed features and target

X_train = pd.read_csv("X_train_processed.csv")
X_test  = pd.read_csv("X_test_processed.csv")
y_train = pd.read_csv("y_train.csv").squeeze()
y_test  = pd.read_csv("y_test.csv").squeeze()

print("X_train shape:", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape: ", y_test.shape)

print("\nDtypes of first few columns:")
print(X_train.dtypes.head())

X_train shape: (95512, 231)
X_test shape:  (23878, 231)
y_train shape: (95512,)
y_test shape:  (23878,)

Dtypes of first few columns:
0    float64
1    float64
2    float64
3    float64
4    float64
dtype: object


In [3]:
#Handling class imbalance

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("Before SMOTE:\n", y_train.value_counts())
print("\nAfter SMOTE:\n", y_train_balanced.value_counts())

Before SMOTE:
 is_canceled
0    60259
1    35253
Name: count, dtype: int64

After SMOTE:
 is_canceled
1    60259
0    60259
Name: count, dtype: int64


In [4]:
USE_SMOTE = True

if USE_SMOTE:
    X_train_log = X_train_balanced
    y_train_log = y_train_balanced
else:
    X_train_log = X_train
    y_train_log = y_train

In [5]:
#Logistic Regression model

log_reg = LogisticRegression(
    penalty="l2",
    solver="lbfgs",
    max_iter=1000
)

#Training the model
log_reg.fit(X_train_log, y_train_log)

#Predicting on test set
y_pred = log_reg.predict(X_test)
y_proba = log_reg.predict_proba(X_test)[:, 1]

#Evaluation
print("Logistic Regression - Baseline Results")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("ROC-AUC  :", roc_auc_score(y_test, y_proba))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Logistic Regression - Baseline Results
Accuracy : 0.8081078817321383
ROC-AUC  : 0.8931205413518483

Confusion Matrix:
 [[12066  2841]
 [ 1741  7230]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.81      0.84     14907
           1       0.72      0.81      0.76      8971

    accuracy                           0.81     23878
   macro avg       0.80      0.81      0.80     23878
weighted avg       0.82      0.81      0.81     23878



In [6]:
#Logistic Regression with class_weight='balanced' (no SMOTE)

log_reg_balanced = LogisticRegression(
    penalty="l2",
    solver="lbfgs",
    max_iter=1000,
    class_weight="balanced"
)

log_reg_balanced.fit(X_train, y_train)

y_pred_bal = log_reg_balanced.predict(X_test)
y_proba_bal = log_reg_balanced.predict_proba(X_test)[:, 1]

print("Logistic Regression - class_weight='balanced' (no SMOTE)")
print("Accuracy :", accuracy_score(y_test, y_pred_bal))
print("ROC-AUC  :", roc_auc_score(y_test, y_proba_bal))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_bal))
print("\nClassification Report:\n", classification_report(y_test, y_pred_bal))


Logistic Regression - class_weight='balanced' (no SMOTE)
Accuracy : 0.8074378088617137
ROC-AUC  : 0.8930113368062382

Confusion Matrix:
 [[12056  2851]
 [ 1747  7224]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.81      0.84     14907
           1       0.72      0.81      0.76      8971

    accuracy                           0.81     23878
   macro avg       0.80      0.81      0.80     23878
weighted avg       0.81      0.81      0.81     23878



In [7]:
#Hyperparameter tuning for C

param_grid = {
    "C": [0.01, 0.1, 1.0, 10.0],
    "penalty": ["l2"],
    "solver": ["lbfgs"]
}

log_reg_base = LogisticRegression(
    max_iter=1000
)

grid_search_log = GridSearchCV(
    estimator=log_reg_base,
    param_grid=param_grid,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=2
)

grid_search_log.fit(X_train_log, y_train_log)

print("Best parameters:", grid_search_log.best_params_)
print("Best CV ROC-AUC:", grid_search_log.best_score_)


Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters: {'C': 10.0, 'penalty': 'l2', 'solver': 'lbfgs'}
Best CV ROC-AUC: 0.8889602289395101


In [9]:
#Evaluating best Logistic Regression model

best_log_reg = grid_search_log.best_estimator_

print("\nBest Logistic Regression model (estimator):")
print(best_log_reg)

y_pred_best = best_log_reg.predict(X_test)
y_proba_best = best_log_reg.predict_proba(X_test)[:, 1]

print("\n")
print("Accuracy :", accuracy_score(y_test, y_pred_best))
print("ROC-AUC  :", roc_auc_score(y_test, y_proba_best))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))


Best Logistic Regression model (estimator):
LogisticRegression(C=10.0, max_iter=1000)


Accuracy : 0.8079403635145322
ROC-AUC  : 0.8931825166513566

Confusion Matrix:
 [[12064  2843]
 [ 1743  7228]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.81      0.84     14907
           1       0.72      0.81      0.76      8971

    accuracy                           0.81     23878
   macro avg       0.80      0.81      0.80     23878
weighted avg       0.82      0.81      0.81     23878

