In [1]:
# Step 1: Load and preprocess the dataset
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
pd.set_option('display.max_columns', None)

In [2]:

# Read the hotel bookings CSV from its absolute path into `hotel_data`
hotel_data = pd.read_csv(filepath_or_buffer='/content/hotel_bookings.csv')


In [3]:
# Drop the columns that are excluded from modelling
hotel_data = hotel_data.drop(columns=['company', 'reservation_status', 'reservation_status_date'])

# Remove every row that still has a missing value in any remaining column
hotel_data = hotel_data.dropna()

# One-hot encode the categorical columns (each listed column is replaced by
# one indicator column per distinct value)
hotel_data = pd.get_dummies(
    hotel_data,
    columns=['hotel', 'meal', 'country', 'market_segment', 'distribution_channel',
             'is_repeated_guest', 'deposit_type', 'customer_type'],
)

# Assemble the arrival date from its year / month-name / day-of-month parts.
# The explicit format (4-digit year, full month name, day) makes the parse
# deterministic instead of relying on pandas' per-call format inference.
arrival_date = pd.to_datetime(
    hotel_data['arrival_date_year'].astype(str)
    + '-' + hotel_data['arrival_date_month']
    + '-' + hotel_data['arrival_date_day_of_month'].astype(str),
    format='%Y-%B-%d',
)

# Store the date as seconds elapsed since 1970-01-01 so the column is numeric
hotel_data['arrival_date'] = (arrival_date - pd.to_datetime('1970-01-01')).dt.total_seconds()

In [4]:
# Report every column that pandas cannot coerce to a numeric dtype
def _coerces_to_numeric(series):
    """Return True when pd.to_numeric accepts `series` without a ValueError."""
    try:
        pd.to_numeric(series)
    except ValueError:
        return False
    return True


for column in hotel_data.columns:
    if not _coerces_to_numeric(hotel_data[column]):
        print(f"Column '{column}' contains non-numeric values.")

Column 'arrival_date_month' contains non-numeric values.
Column 'reserved_room_type' contains non-numeric values.
Column 'assigned_room_type' contains non-numeric values.


In [5]:
# Show the distinct values of each column that was reported as non-numeric
for text_column in ('arrival_date_month', 'reserved_room_type', 'assigned_room_type'):
    print(hotel_data[text_column].unique())

['July' 'August' 'September' 'October' 'November' 'December' 'January'
 'February' 'March' 'April' 'May' 'June']
['A' 'C' 'D' 'E' 'G' 'F' 'H' 'B']
['A' 'C' 'D' 'E' 'G' 'F' 'I' 'B' 'H' 'K']


# Changing categorical to numeric

In [6]:
# Months: calendar order, January = 1 ... December = 12
month_numbers = {'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
                 'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12}
hotel_data['arrival_date_month_num'] = hotel_data['arrival_date_month'].map(month_numbers)

# Room types: every single-letter code maps to its alphabet position
# (A = 1, B = 2, ..., I = 9, J = 10, K = 11, ...). One shared map is used for
# both columns so the same letter always gets the same number and no two
# letters share a number (previously I/J/K were given the codes of F/G/H).
# NOTE(review): this is an ordinal encoding of what looks like a nominal
# category - confirm that an implied ordering is acceptable for the models.
room_type_numbers = {
    letter: position
    for position, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ', start=1)
}
hotel_data['reserved_room_type_num'] = hotel_data['reserved_room_type'].map(room_type_numbers)
hotel_data['assigned_room_type_num'] = hotel_data['assigned_room_type'].map(room_type_numbers)

# `.map` turns any value missing from the dictionary into NaN; fail here with
# a clear message rather than later inside model fitting.
encoded_columns = ['arrival_date_month_num', 'reserved_room_type_num', 'assigned_room_type_num']
unmapped_counts = hotel_data[encoded_columns].isna().sum()
assert not unmapped_counts.any(), (
    f"Unmapped category values produced NaN:\n{unmapped_counts[unmapped_counts > 0]}"
)

# The original text columns are no longer needed once the numeric versions exist
hotel_data = hotel_data.drop(['assigned_room_type', 'reserved_room_type', 'arrival_date_month'], axis=1)

In [7]:
# Separate the target column (`is_canceled`) from the predictor columns
y = hotel_data['is_canceled']
X = hotel_data.drop(columns=['is_canceled'])

In [8]:
# Scale the features to zero mean / unit variance.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split, so test-set statistics influence the training features.
# Fitting on the training split only (and transforming the test split with
# that fitted scaler) would avoid this leakage.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 2: Perform exploratory data analysis (EDA) to gain insights into the dataset
# For example, you can plot the distribution of the target variable

# Pass the vector by keyword: seaborn treats the only positional argument as
# `data`, so `countplot(y)` does not plot the class counts as intended.
sns.countplot(x=y)

# Step 3: Perform feature engineering to prepare the data for modeling
# No feature engineering performed in this code


In [10]:

# Step 4: Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# NOTE: This step can take a long time, depending on the available compute power.
# Step 5: Train and evaluate different classification models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, solver='lbfgs'),
    'SVM': SVC(),
    # Seed the randomised estimators so repeated runs give the same scores
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}


def _positive_class_scores(model, features):
    """Return a continuous score per row for the positive class.

    Uses the probability of ``model.classes_[1]`` when the fitted model
    exposes ``predict_proba``; otherwise falls back to ``decision_function``
    (e.g. ``SVC()`` without ``probability=True``).
    """
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(features)[:, 1]
    return model.decision_function(features)


# One tuple per model: (name, accuracy, precision, recall, f1, roc_auc, confusion matrix)
results = []
for name, model in tqdm(models.items()):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    # ROC AUC ranks rows by a continuous score; feeding it thresholded 0/1
    # predictions collapses the curve to a single operating point.
    roc_auc = roc_auc_score(y_test, _positive_class_scores(model, X_test))
    confusion = confusion_matrix(y_test, y_pred)
    results.append((name, accuracy, precision, recall, f1, roc_auc, confusion))


In [12]:

# Step 6: Tune the hyperparameters of the best-performing model
# 3 values for each of 4 parameters = 81 combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Seed the forest so the search result is reproducible between runs.
rf = RandomForestClassifier(random_state=42)
# 81 combinations x 10 folds = 810 fits; n_jobs=-1 runs them on all available
# cores instead of one after another.
grid_search = GridSearchCV(rf, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Parameter combination with the highest mean cross-validated accuracy
best_params = grid_search.best_params_

# Step 7: Evaluate the final model on the test set

# `best_model` was previously never assigned (NameError). With the default
# refit=True, GridSearchCV retrains the best parameter combination on the
# whole training set and exposes it as `best_estimator_`.
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
# Probability of the positive class (second column), used for ROC AUC, which
# needs continuous scores rather than thresholded 0/1 predictions
y_score = best_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
confusion = confusion_matrix(y_test, y_pred)

print(f'Test accuracy: {accuracy}')
print(f'Test precision: {precision}')
print(f'Test recall: {recall}')
print(f'Test F1 score: {f1}')
print(f'Test ROC AUC score: {roc_auc}')
print(f'Test confusion matrix:\n{confusion}')

KeyboardInterrupt: ignored