## Importing Libraries

In [1]:
#Libraries Import
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

!pip install imbalanced-learn

#For dealing with class imbalance
from imblearn.over_sampling import SMOTE

Defaulting to user installation because normal site-packages is not writeable


## Loading Dataset

In [2]:
#Loading the dataset
file_path = 'hotel_bookings.csv'
hotel_data = pd.read_csv(file_path)

#Displaying the first few rows and general info
hotel_data.head(), hotel_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

(          hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  \
 0  Resort Hotel            0        342               2015               July   
 1  Resort Hotel            0        737               2015               July   
 2  Resort Hotel            0          7               2015               July   
 3  Resort Hotel            0         13               2015               July   
 4  Resort Hotel            0         14               2015               July   
 
    arrival_date_week_number  arrival_date_day_of_month  \
 0                        27                          1   
 1                        27                          1   
 2                        27                          1   
 3                        27                          1   
 4                        27                          1   
 
    stays_in_weekend_nights  stays_in_week_nights  adults  ...  deposit_type  \
 0                        0                     0       2  ...    No D

## Handle Missing Values

In [3]:
#Replacing missing values with '0' in Children; Country, Agent, Company missing values with 'Unknown'
hotel_data['children'] = hotel_data['children'].fillna(0)
hotel_data['country'] = hotel_data['country'].fillna('Unknown')

hotel_data['agent'] = hotel_data['agent'].fillna('Unknown')
hotel_data['company'] = hotel_data['company'].fillna('Unknown')

#Checking if any more remaining missing values
hotel_data.isnull().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
agent                             0
company                           0
days_in_waiting_list              0
customer_type                     0
adr                         

## Feature Engineering

In [4]:
#Creating new features, such as the total number of nights, and combine date columns to a single datetime feature.

#'total_nights' feature by combining stays in weekend and week nights
hotel_data['total_nights'] = hotel_data['stays_in_weekend_nights'] + hotel_data['stays_in_week_nights']

#A combined 'date' feature using year, month, and day
hotel_data['date'] = pd.to_datetime(hotel_data[['arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month']].astype(str).agg('-'.join, axis=1))

#Removing the individual date columns now that we have the combined 'date'
hotel_data.drop(columns=['arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month'], inplace=True)

#Checking the new features
hotel_data[['total_nights', 'date']].head()

Unnamed: 0,total_nights,date
0,0,2015-07-01
1,0,2015-07-01
2,1,2015-07-01
3,1,2015-07-01
4,2,2015-07-01


In [5]:
#Separate features (X) and target (y)
X = hotel_data.drop(columns=['is_canceled'])
y = hotel_data['is_canceled']

#Spliting the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Check the split dimensions
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((95512, 30), (23878, 30), (95512,), (23878,))

## Encoding Categorical Features

In [6]:
#One-Hot Encoding for multi-category columns
categorical_cols = ['hotel', 'meal', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'customer_type']
hotel_data_encoded = pd.get_dummies(hotel_data, columns=categorical_cols, drop_first=True)

#Label Encoding for binary categorical features
label_encoders = ['deposit_type', 'is_repeated_guest']
le = LabelEncoder()
for col in label_encoders:
    hotel_data_encoded[col] = le.fit_transform(hotel_data_encoded[col])

#Checking the updated dataset
hotel_data_encoded.head()


Unnamed: 0,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,country,is_repeated_guest,...,assigned_room_type_F,assigned_room_type_G,assigned_room_type_H,assigned_room_type_I,assigned_room_type_K,assigned_room_type_L,assigned_room_type_P,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
0,0,342,27,0,0,2,0.0,0,PRT,0,...,False,False,False,False,False,False,False,False,True,False
1,0,737,27,0,0,2,0.0,0,PRT,0,...,False,False,False,False,False,False,False,False,True,False
2,0,7,27,0,1,1,0.0,0,GBR,0,...,False,False,False,False,False,False,False,False,True,False
3,0,13,27,0,1,1,0.0,0,GBR,0,...,False,False,False,False,False,False,False,False,True,False
4,0,14,27,0,2,2,0.0,0,GBR,0,...,False,False,False,False,False,False,False,False,True,False


## Normalization

In [7]:
#List of categorical columns
categorical_cols = ['hotel', 'meal', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'customer_type', 'country']
hotel_data_encoded = pd.get_dummies(hotel_data, columns=categorical_cols, drop_first=True)

#List of numerical features to scale
numerical_cols = ['lead_time', 'total_nights', 'adults', 'children', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces', 'total_of_special_requests']

X = hotel_data_encoded.drop(columns=['is_canceled'])
y = hotel_data_encoded['is_canceled']

## Pipeline setup

In [8]:
#Preprocessing pipeline for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values in numerical columns
    ('scaler', StandardScaler())  # Scale numerical columns
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),  # Handle missing values in categorical columns
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical columns
])

# Combine the two transformers into one preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Fit and transform the training data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Check the processed data dimensions
X_train_processed.shape, X_test_processed.shape

((95512, 231), (23878, 231))

## Class Imbalance Handling

In [9]:
#Applying SMOTE on numerical features
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_processed, y_train)

#Check the class distribution after SMOTE
y_resampled.value_counts()

is_canceled
1    60259
0    60259
Name: count, dtype: int64

## Saving Test and Train Splits

In [10]:
#Saving the processed train-test splits as CSV files
X_train_processed_df = pd.DataFrame(X_train_processed)
X_test_processed_df = pd.DataFrame(X_test_processed)
y_train_df = pd.DataFrame(y_train)
y_test_df = pd.DataFrame(y_test)

#Save to CSV
X_train_processed_df.to_csv('X_train_processed.csv', index=False)
X_test_processed_df.to_csv('X_test_processed.csv', index=False)
y_train_df.to_csv('y_train.csv', index=False)
y_test_df.to_csv('y_test.csv', index=False)

#Verify that they are saved correctly
print("Files saved successfully!")


Files saved successfully!
