In [None]:
# Machine Learning Project - Hotel Cancellations
# Ensemble Models: Random Forest & Gradient Boosting

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)

# Load dataset
df = pd.read_csv("hotel_bookings_updated_2024.csv")

# Columns that must not reach the model or the numeric correlation plot:
#   - reservation_status_date: raw timestamp string, not usable as a feature as-is
#   - reservation_status: the booking's final outcome ("Check-Out" in the rows
#     where is_canceled == 0); it is only known after the fact and effectively
#     restates the target, so keeping it leaks the label into the features
#   - Unnamed: 0: the row index written out when the CSV was saved, not a
#     booking attribute
drop_cols = ['reservation_status_date', 'reservation_status', 'Unnamed: 0']
df_clean = df.drop(columns=[c for c in drop_cols if c in df.columns])

# Sample to speed training (fixed seed keeps the subset reproducible)
if len(df_clean) > 20000:
    df_clean = df_clean.sample(n=20000, random_state=42)

# Target & Features
y = df_clean["is_canceled"]
X = df_clean.drop(columns=["is_canceled"])

# One-hot encode categorical variables
X_encoded = pd.get_dummies(X, drop_first=True)

# Train-test split. Done BEFORE imputation so the fill values below are learned
# from the training rows only and no test-set statistics leak into training.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, stratify=y, random_state=42
)

# Fill missing values with the training-split median of each affected column.
# A column that is entirely NaN in the training split has a NaN median, so fall
# back to 0 to guarantee that no NaN is handed to the estimators.
nan_cols = X_encoded.columns[X_encoded.isnull().any()]
if len(nan_cols) > 0:
    fill_values = X_train[nan_cols].median().fillna(0)
    X_train = X_train.fillna(fill_values)
    X_test = X_test.fillna(fill_values)
    # Keep the full encoded frame consistent with the two splits taken from it.
    X_encoded = X_encoded.fillna(fill_values)

# Ensemble Models
# Hyper-parameters are kept in plain dicts so both configurations can be read
# (and tweaked) side by side.
rf_params = dict(n_estimators=150, max_depth=None, random_state=42, n_jobs=-1)
gb_params = dict(n_estimators=80, learning_rate=0.1, max_depth=3, random_state=42)

rf = RandomForestClassifier(**rf_params)
gb = GradientBoostingClassifier(**gb_params)

# Train both ensembles on the same training split (forest first, then boosting)
for estimator in (rf, gb):
    estimator.fit(X_train, y_train)

# Predictions on the held-out split
rf_pred, gb_pred = rf.predict(X_test), gb.predict(X_test)

# Evaluation function
def get_metrics(y_true, y_pred):
    """Score one set of predictions against the true labels.

    Returns a dict with the keys "Accuracy", "Precision", "Recall" and
    "F1 Score" (in that order). Precision, recall and F1 are computed with
    ``zero_division=0``; accuracy is computed with default arguments.
    """
    scores = {"Accuracy": accuracy_score(y_true, y_pred)}
    guarded_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for metric_name, scorer in guarded_scorers:
        scores[metric_name] = scorer(y_true, y_pred, zero_division=0)
    return scores

# Score both models on the same held-out labels, then report them.
rf_metrics, gb_metrics = (
    get_metrics(y_test, predictions) for predictions in (rf_pred, gb_pred)
)

print("Random Forest:", rf_metrics)
print("Gradient Boosting:", gb_metrics)

# Visualizations

# 1. Cancellation Distribution
cancel_ax = df_clean["is_canceled"].value_counts().plot.bar()
cancel_ax.set_title("Booking Cancellation Distribution")
cancel_ax.set_xlabel("is_canceled (0 = Not Canceled, 1 = Canceled)")
cancel_ax.set_ylabel("Count")
plt.show()

# 2. Lead Time Distribution
plt.hist(df_clean["lead_time"], bins=30)
lead_ax = plt.gca()
lead_ax.set_title("Lead Time Distribution")
lead_ax.set_xlabel("Lead Time (days)")
lead_ax.set_ylabel("Frequency")
plt.show()

# 3. Monthly Bookings (skipped when the month column is absent)
if "arrival_date_month" in df_clean.columns:
    # Calendar order; value_counts() alone would sort the bars by frequency.
    month_order = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ]
    monthly_counts = df_clean["arrival_date_month"].value_counts().reindex(month_order)
    month_ax = monthly_counts.plot.bar()
    month_ax.set_title("Bookings by Arrival Month")
    month_ax.set_xlabel("Month")
    month_ax.set_ylabel("Count")
    plt.show()

# 4. Correlation Heatmap
# NOTE: despite its name, `numeric_cols` holds the pairwise correlation matrix
# of the numeric columns, not the columns themselves.
numeric_cols = df_clean.select_dtypes(include=[np.number]).corr()
heat_labels = numeric_cols.columns
heat_ticks = range(len(heat_labels))
plt.imshow(numeric_cols, aspect='auto', cmap='viridis')
plt.colorbar()
plt.title("Correlation Heatmap")
plt.xticks(heat_ticks, heat_labels, rotation=90)
plt.yticks(heat_ticks, heat_labels)
plt.show()

# 5. Confusion Matrices
def plot_confusion(cm, title, labels=("Not Canceled", "Canceled")):
    """Draw an annotated heat map of a confusion matrix and show it.

    Parameters
    ----------
    cm : 2-D array-like
        Counts with actual classes on the rows and predicted classes on the
        columns. Nested lists are accepted as well as arrays.
    title : str
        Figure title.
    labels : sequence of str, optional
        Tick labels, one per class, in row/column order. When their number
        does not match a dimension of ``cm``, that axis falls back to the
        numeric class positions rather than showing misaligned names.

    Raises
    ------
    ValueError
        If ``cm`` is not a non-empty 2-D matrix.
    """
    cm = np.asarray(cm)
    if cm.ndim != 2 or cm.size == 0:
        raise ValueError("cm must be a non-empty 2-D matrix")
    n_rows, n_cols = cm.shape

    def tick_labels(count):
        # Only use the class names when there is exactly one per tick.
        return list(labels) if len(labels) == count else list(range(count))

    plt.imshow(cm, cmap='plasma')
    plt.title(title)
    plt.colorbar()
    plt.xticks(range(n_cols), tick_labels(n_cols))
    plt.yticks(range(n_rows), tick_labels(n_rows))

    # plasma renders low values dark and high values bright, so switch the
    # annotation colour at the midpoint of the value range to keep it legible.
    midpoint = (cm.max() + cm.min()) / 2
    for i in range(n_rows):
        for j in range(n_cols):
            text_color = "white" if cm[i, j] < midpoint else "black"
            plt.text(j, i, cm[i, j], ha="center", va="center", color=text_color)

    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

# labels=[0, 1] pins each matrix to 2x2 in (Not Canceled, Canceled) order, so the
# tick labels stay correct even if a class is missing from the test split or
# from a model's predictions.
plot_confusion(confusion_matrix(y_test, rf_pred, labels=[0, 1]), "Random Forest - Confusion Matrix")
plot_confusion(confusion_matrix(y_test, gb_pred, labels=[0, 1]), "Gradient Boosting - Confusion Matrix")

# 6. Feature Importance
def plot_feature_importance(model, X_data, title, top_n=10):
    """Plot the largest feature importances of a fitted model as horizontal bars.

    Parameters
    ----------
    model : object
        Must expose ``feature_importances_`` with one value per feature.
    X_data : object
        Must expose ``columns``: the feature names, in the same order as the
        importances (i.e. the frame the model was trained on).
    title : str
        Figure title.
    top_n : int, default 10
        How many of the most important features to show.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1, or if the number of columns does not
        match the number of importances (the bars would be mislabelled).
    """
    if top_n < 1:
        # A slice of [-0:] would silently select every feature.
        raise ValueError("top_n must be at least 1")

    importances = np.asarray(model.feature_importances_)
    feature_names = np.asarray(X_data.columns)
    if len(feature_names) != len(importances):
        raise ValueError(
            f"X_data has {len(feature_names)} columns but the model reports "
            f"{len(importances)} feature importances"
        )

    # argsort is ascending, so the tail of the ordering holds the largest values.
    idx = np.argsort(importances)[-top_n:]
    positions = range(len(idx))
    plt.barh(positions, importances[idx])
    plt.yticks(positions, feature_names[idx])
    plt.title(title)
    plt.xlabel("Importance")
    plt.show()

# One chart per fitted ensemble; feature names come from the encoded frame.
for fitted_model, chart_title in (
    (rf, "Random Forest - Top 10 Features"),
    (gb, "Gradient Boosting - Top 10 Features"),
):
    plot_feature_importance(fitted_model, X_encoded, chart_title)

In [4]:
# pandas is imported here as well: the CSV has to be uploaded before the
# modelling cell can read it, so this cell must work on a fresh kernel without
# relying on `pd` having been imported by another cell.
import pandas as pd
from google.colab import files

# Upload the file (opens Colab's file picker; returns a mapping keyed by file name)
uploaded = files.upload()

# Check if the file was uploaded
if 'hotel_bookings_updated_2024.csv' in uploaded:
    print('File hotel_bookings_updated_2024.csv uploaded successfully.')
    # Read the uploaded CSV into a DataFrame
    df = pd.read_csv('hotel_bookings_updated_2024.csv')
    print('DataFrame loaded successfully. Displaying head:')
    display(df.head())
else:
    print('hotel_bookings_updated_2024.csv was not uploaded.')
    # Show what did arrive so a wrong or renamed file is easy to spot.
    print('Files received:', sorted(uploaded))

Saving hotel_bookings_updated_2024.csv to hotel_bookings_updated_2024.csv
File hotel_bookings_updated_2024.csv uploaded successfully.
DataFrame loaded successfully. Displaying head:


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,city
0,Resort Hotel - Chandigarh,0,342,2024,July,30,27,0,0,2,...,,,0,Transient,0.0,0,0,Check-Out,2024-07-27 22:16:40.916332324,Chandigarh
1,Resort Hotel - Mumbai,0,737,2024,April,17,28,0,0,2,...,,,0,Transient,0.0,0,0,Check-Out,2024-04-28 21:56:21.507509066,Mumbai
2,Resort Hotel - Delhi,0,7,2024,September,37,10,0,1,1,...,,,0,Transient,75.0,0,0,Check-Out,2024-09-10 03:46:25.734029096,Delhi
3,Resort Hotel - Kolkata,0,13,2024,August,33,14,0,1,1,...,304.0,,0,Transient,75.0,0,0,Check-Out,2024-08-14 18:07:10.049669568,Kolkata
4,Resort Hotel - Lucknow,0,14,2024,September,37,14,0,2,2,...,240.0,,0,Transient,98.0,0,1,Check-Out,2024-09-14 14:27:32.473846000,Lucknow
