## KNN Model - Hotel Booking Cancellation Prediction

In [5]:
#Importing Required Libraries
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score
)

from imblearn.over_sampling import SMOTE


In [6]:
#Loading processed features and target
X_train = pd.read_csv("X_train_processed.csv")
X_test  = pd.read_csv("X_test_processed.csv")
y_train = pd.read_csv("y_train.csv")
y_test  = pd.read_csv("y_test.csv")

print(X_train.dtypes.head())

#Flatten target to 1D arrays (sklearn expects 1D y so)
y_train = y_train.squeeze()
y_test  = y_test.squeeze()

print("X_train shape:", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape: ", y_test.shape)


0    float64
1    float64
2    float64
3    float64
4    float64
dtype: object
X_train shape: (95512, 231)
X_test shape:  (23878, 231)
y_train shape: (95512,)
y_test shape:  (23878,)


In [7]:
#Handling class imbalance with SMOTE

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("\nAfter SMOTE:\n", y_train_balanced.value_counts())

Before SMOTE: is_canceled
0    60259
1    35253
Name: count, dtype: int64

After SMOTE:
 is_canceled
1    60259
0    60259
Name: count, dtype: int64


In [8]:
#KNN Model
USE_SMOTE = True

if USE_SMOTE:
    X_train_knn = X_train_balanced
    y_train_knn = y_train_balanced
else:
    X_train_knn = X_train
    y_train_knn = y_train
    
k_values = [1, 3, 5, 7, 9, 11, 13, 15]
results = []

for k in k_values:
    knn = KNeighborsClassifier(
        n_neighbors=k,
        weights='distance',
        metric='minkowski',
        p=2,
        n_jobs=-1
    )

    #Training
    knn.fit(X_train_knn, y_train_knn)
    
    #Predicting on the test set
    y_pred = knn.predict(X_test)
    y_proba = knn.predict_proba(X_test)[:, 1]
    
    #Metrics
    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)
    
    results.append({
        "k": k,
        "accuracy": acc,
        "roc_auc": roc
    })

results_df = pd.DataFrame(results)
print(results_df)

    k  accuracy   roc_auc
0   1  0.832231  0.827305
1   3  0.832859  0.890967
2   5  0.835832  0.910883
3   7  0.839392  0.920788
4   9  0.840774  0.925216
5  11  0.841318  0.928853
6  13  0.841737  0.931449
7  15  0.842449  0.933187


In [9]:
#Picking best k by ROC-AOC

best_row = results_df.loc[results_df["roc_auc"].idxmax()]
best_k = int(best_row["k"])

print("Best k:", best_k)
print("Best ROC-AUC:", best_row["roc_auc"])
print("Accuracy at best k:", best_row["accuracy"])

Best k: 15
Best ROC-AUC: 0.9331865368203385
Accuracy at best k: 0.8424491163414022


In [10]:
#Training with the best k

from sklearn.metrics import classification_report, confusion_matrix

best_knn = KNeighborsClassifier(
    n_neighbors=best_k,
    weights='distance',
    metric='minkowski',
    p=2,
    n_jobs=-1
)

best_knn.fit(X_train_knn, y_train_knn)

y_pred_best = best_knn.predict(X_test)
y_proba_best = best_knn.predict_proba(X_test)[:, 1]

print("Final KNN (k = {})".format(best_k))
print("Accuracy :", accuracy_score(y_test, y_pred_best))
print("ROC-AUC  :", roc_auc_score(y_test, y_proba_best))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))

Final KNN (k = 15)
Accuracy : 0.8424491163414022
ROC-AUC  : 0.9331865368203385

Confusion Matrix:
 [[12382  2525]
 [ 1237  7734]]

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.83      0.87     14907
           1       0.75      0.86      0.80      8971

    accuracy                           0.84     23878
   macro avg       0.83      0.85      0.84     23878
weighted avg       0.85      0.84      0.84     23878

