In [17]:
import pandas as pd 
import numpy as np

In [18]:
RANDOM_STATE = 42

In [19]:
# Import dataset
hotel_df = pd.read_csv("./data/cleaned-hotel-reservations.csv")
hotel_df.head(3)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled


In [20]:
from sklearn.preprocessing import LabelEncoder

# Encode non-integer categorical data
non_int_categorical_cols = ['type_of_meal_plan','room_type_reserved','market_segment_type','booking_status']

hotel_df[non_int_categorical_cols] = hotel_df[non_int_categorical_cols].apply(LabelEncoder().fit_transform)

## Examine the class label imbalance

Let's look at the dataset imbalance:

In [21]:
# Count the occurrences of each class in the oversampled data
cancelled, not_cancelled = np.bincount(hotel_df['booking_status'])

# Calculate the total number of samples
total = cancelled + not_cancelled

# Print the class distribution
print('Samples:\n    Total: {}\n    Canceled: {} ({:.2f}% of total)'.format(
    total, cancelled, 100 * cancelled / total))

Samples:
    Total: 36275
    Canceled: 11885 (32.76% of total)


This shows that the original dataset has a slightly lower fraction of positive samples, resulting in an imbalanced dataset which may lead to biases when training models for prediction. Hence, we explore techniques to balance the original dataset.

## Random Oversampling

We perform random oversampling on the minority class.

In [22]:
from imblearn.over_sampling import RandomOverSampler

In [23]:
# Split X and y columns
X = hotel_df.drop('booking_status', axis=True)
y = hotel_df['booking_status']

In [24]:
# Create the RandomOverSampler
random_oversampler = RandomOverSampler(random_state=RANDOM_STATE)

# Oversample the data
X_random_oversampled, y_random_oversampled = random_oversampler.fit_resample(X, y)

In [25]:
# Count the occurrences of each class in the oversampled data
cancelled_random_oversampled, not_cancelled_random_oversampled = np.bincount(y_random_oversampled)

# Calculate the total number of samples
total_random_oversampled = cancelled_random_oversampled + not_cancelled_random_oversampled

# Print the class distribution
print('Samples after oversampling:\n Total: {}\n Canceled: {} ({:.2f}% of total)'.format(total_random_oversampled, cancelled_random_oversampled, 100 * cancelled_random_oversampled / total_random_oversampled))

Samples after oversampling:
 Total: 48780
 Canceled: 24390 (50.00% of total)


The dataset is now balanced.

In [26]:
# Convert to a dataframe
random_oversampled_data = pd.concat([pd.DataFrame(X_random_oversampled), pd.DataFrame(y_random_oversampled)], axis=1)

In [27]:
# Save as csv
random_oversampled_data.to_csv("./data/hotel-reservations-random-oversampled.csv")

## Synthetic Minority Over-sampling Technique (SMOTE)

We perform SMOTE on the minority class.

In [28]:
from imblearn.over_sampling import SMOTE

In [29]:
smote = SMOTE(random_state=RANDOM_STATE)

# Perform SMOTE
X_smote, y_smote = smote.fit_resample(X, y)

In [30]:
# Count the occurrences of each class in the SMOTEd data
cancelled_smote, not_cancelled_smote = np.bincount(y_smote)

# Calculate the total number of samples
total_smote= cancelled_smote + not_cancelled_smote

# Print the class distribution
print('Samples after oversampling:\n Total: {}\n Canceled: {} ({:.2f}% of total)'.format(total_smote, cancelled_smote, 100 * cancelled_smote / total_smote))

Samples after oversampling:
 Total: 48780
 Canceled: 24390 (50.00% of total)


The dataset is now balanced.

In [31]:
# Convert to a dataframe
smote_data = pd.concat([pd.DataFrame(X_smote), pd.DataFrame(y_smote)], axis=1)

In [32]:
# Save as csv
smote_data.to_csv("./data/hotel-reservations-SMOTE.csv")