In [45]:
# import libraries
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
import datetime

In [46]:
# Load the hotel booking dataset from `hotel_booking.csv`.
# The path is relative to the current working directory.
hotel_booking = pd.read_csv('hotel_booking.csv')

In [47]:
# Work on a copy so that preprocessing does not modify the raw `hotel_booking` frame.
hotel_booking_df = hotel_booking.copy()

In [48]:
# Inspect column dtypes and non-null counts; in the output shown below,
# `children` is the column with fewer non-null entries than rows.
hotel_booking_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 36 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [49]:
# Preprocess the raw bookings into the numeric frame `hotel_booking_df`.
# The pipeline starts from the untouched `hotel_booking` frame, which makes the
# cell safe to re-run: feeding it its own output would fail because the dropped
# and one-hot-encoded source columns no longer exist.

# drop unuseful columns
drop_columns = ['arrival_date_year', 'name', 'email', 'phone-number', 'credit_card', 'agent', 'company', 'country']
hotel_booking_df = hotel_booking.drop(columns=drop_columns)

# replace missing value with mode
# This runs on the raw counts, before clipping and scaling, so the imputed rows
# take part in the scaling statistics. Plain assignment is used because
# `Series.fillna(..., inplace=True)` on a column selection is chained
# assignment: recent pandas versions warn about it and it does not update the
# frame when copy-on-write is enabled.
hotel_booking_df['children'] = hotel_booking_df['children'].fillna(hotel_booking_df['children'].mode()[0])

# transform categorical variables
# NOTE(review): 'reservation_status' appears to describe the booking outcome;
# if `is_canceled` is the prediction target, its dummy columns may leak the
# label and should be excluded from the features - confirm before modelling.
categorical_columns = ['hotel', 'meal', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type', 'reservation_status']
hotel_booking_df = pd.get_dummies(hotel_booking_df, columns=categorical_columns, drop_first=True)

# transform ordinal variables
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}
hotel_booking_df['arrival_date_month'] = hotel_booking_df['arrival_date_month'].map(month_mapping)

# feature clipping: cap each count at the given upper bound (lower bound is 0)
clip_upper_bounds = {
    'stays_in_weekend_nights': 3,
    'stays_in_week_nights': 5,
    'adults': 5,
    'children': 2,
    'babies': 2,
}
for col, upper in clip_upper_bounds.items():
    hotel_booking_df[col] = hotel_booking_df[col].clip(0, upper)

# Apply z-score normalization
feature_col = [
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'lead_time',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'booking_changes',
    'days_in_waiting_list',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests',
    'arrival_date_month',
    'arrival_date_week_number',
    'arrival_date_day_of_month'
]
scaler = preprocessing.StandardScaler()

# Fit once over all feature columns. StandardScaler standardizes each column
# independently, so the values match a per-column fit, but `scaler` now keeps
# the statistics of every feature instead of only the last one fitted.
# NOTE(review): the scaler is fitted on the full dataset; if a train/test split
# follows, consider fitting on the training portion only.
hotel_booking_df[feature_col] = scaler.fit_transform(hotel_booking_df[feature_col])

In [50]:
# Frequency of each distinct value in `required_car_parking_spaces`.
# The column was z-score normalized in the preprocessing cell, so the index
# shows standardized values rather than raw parking-space counts.
value_counts = hotel_booking_df['required_car_parking_spaces'].value_counts()
print(value_counts)

required_car_parking_spaces
-0.254873     111974
 3.821932       7383
 7.898737         28
 11.975542         3
 32.359567         2
Name: count, dtype: int64


In [51]:
# Re-inspect dtypes and non-null counts of the frame after preprocessing.
hotel_booking_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 62 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   is_canceled                     119390 non-null  int64  
 1   lead_time                       119390 non-null  float64
 2   arrival_date_month              119390 non-null  float64
 3   arrival_date_week_number        119390 non-null  float64
 4   arrival_date_day_of_month       119390 non-null  float64
 5   stays_in_weekend_nights         119390 non-null  float64
 6   stays_in_week_nights            119390 non-null  float64
 7   adults                          119390 non-null  float64
 8   children                        119390 non-null  float64
 9   babies                          119390 non-null  float64
 10  is_repeated_guest               119390 non-null  int64  
 11  previous_cancellations          119390 non-null  float64
 12  previous_booking