In [449]:
import numpy as np
import pandas as pd
import pickle

from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler      # Remove Mean and scale to Unit Variance
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier


import warnings
warnings.filterwarnings('ignore')


In [450]:
# Load the historical bookings (path is relative to the notebook's working directory).
train_df = pd.read_csv('../../Dataset/hotel_booking/hotel_bookings.csv')
#test_df = pd.read_csv('hotel_reservations_updated.csv')

# Data cleaning to make the 2015-2016 dataset match the incoming streaming data:
#   * an 'Undefined' meal plan is recoded as 'SC'
#   * the number of required parking spaces is capped at 1
undefined_meal = train_df['meal'] == 'Undefined'
train_df.loc[undefined_meal, 'meal'] = 'SC'

several_spaces = train_df['required_car_parking_spaces'] > 1
train_df.loc[several_spaces, 'required_car_parking_spaces'] = 1

In [451]:
# Preview the first rows of the loaded frame after the meal / parking-space recoding.
train_df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [452]:
# Count missing values per column, then display only the columns that have any.
null_counts = train_df.isnull().sum()
null_counts[null_counts > 0]

children         4
country        488
agent        16340
company     112593
dtype: int64

In [453]:
# Fill the missing values of the children column with 0.
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on the attribute-accessed column: that form is chained
# assignment on an intermediate Series, which pandas warns about and which no
# longer updates train_df once Copy-on-Write is active.
train_df['children'] = train_df['children'].fillna(0)

In [454]:
# Columns that are removed from the frame and therefore never reach the models.
unused_cols = [
    'hotel',
    'arrival_date_week_number',
    'babies',
    'country',
    'distribution_channel',
    'assigned_room_type',
    'booking_changes',
    'deposit_type',
    'agent',
    'company',
    'days_in_waiting_list',
    'customer_type',
    'reservation_status',
    'reservation_status_date',
]
train_df.drop(columns=unused_cols, inplace=True)

# Show the remaining columns together with their dtypes.
print(train_df.dtypes)

is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
meal                               object
market_segment                     object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
adr                               float64
required_car_parking_spaces         int64
total_of_special_requests           int64
dtype: object


In [455]:
# Preview the frame again now that the unused columns have been dropped.
train_df.head()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,meal,market_segment,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,adr,required_car_parking_spaces,total_of_special_requests
0,0,342,2015,July,1,0,0,2,0.0,BB,Direct,0,0,0,C,0.0,0,0
1,0,737,2015,July,1,0,0,2,0.0,BB,Direct,0,0,0,C,0.0,0,0
2,0,7,2015,July,1,0,1,1,0.0,BB,Direct,0,0,0,A,75.0,0,0
3,0,13,2015,July,1,0,1,1,0.0,BB,Corporate,0,0,0,A,75.0,0,0
4,0,14,2015,July,1,0,2,2,0.0,BB,Online TA,0,0,0,A,98.0,0,1


In [456]:
# Rank the numeric columns by the absolute value of their correlation with the
# is_canceled target, strongest first.
# train_df still contains object (string) columns at this point, so the numeric
# columns are selected explicitly: DataFrame.corr() no longer drops non-numeric
# columns silently and raises on them in pandas >= 2.0.
correlation = (
    train_df.select_dtypes(include='number')
    .corr()['is_canceled']
    .abs()
    .sort_values(ascending=False)
)
correlation

is_canceled                       1.000000
lead_time                         0.293123
total_of_special_requests         0.234658
required_car_parking_spaces       0.197399
previous_cancellations            0.110133
is_repeated_guest                 0.084793
adults                            0.060017
previous_bookings_not_canceled    0.057358
adr                               0.047557
stays_in_week_nights              0.024765
arrival_date_year                 0.016660
arrival_date_day_of_month         0.006130
children                          0.005036
stays_in_weekend_nights           0.001791
Name: is_canceled, dtype: float64

In [457]:
# Drop the arrival year and day-of-month (the month is kept); the notebook
# treats these two as not useful/relevant to the cancellation.
arrival_parts_to_drop = ['arrival_date_year', 'arrival_date_day_of_month']
train_df.drop(columns=arrival_parts_to_drop, inplace=True)
train_df.head()

Unnamed: 0,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,meal,market_segment,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,adr,required_car_parking_spaces,total_of_special_requests
0,0,342,July,0,0,2,0.0,BB,Direct,0,0,0,C,0.0,0,0
1,0,737,July,0,0,2,0.0,BB,Direct,0,0,0,C,0.0,0,0
2,0,7,July,0,1,1,0.0,BB,Direct,0,0,0,A,75.0,0,0
3,0,13,July,0,1,1,0.0,BB,Corporate,0,0,0,A,75.0,0,0
4,0,14,July,0,2,2,0.0,BB,Online TA,0,0,0,A,98.0,0,1


In [458]:
# Features that will be one-hot encoded.
categorical_cols = [
    'arrival_date_month',
    'meal',
    'reserved_room_type',
    'market_segment',
    'is_repeated_guest',
]
# Features that will be standardised.
numerical_cols = [
    'lead_time',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests',
]

# Each frame is train_df minus the other group's columns. The is_canceled
# target is in neither list, so it is still present in both frames here.
numerical_df = train_df.drop(columns=categorical_cols)
categorical_df = train_df.drop(columns=numerical_cols)



In [459]:
# Print the distinct values of every column in categorical_df so the encoding
# can be chosen with the full set of categories in view.
for column_name in categorical_df:
    distinct_values = categorical_df[column_name].unique()
    print(f"{column_name}: {distinct_values}")

is_canceled: [0 1]
arrival_date_month: ['July' 'August' 'September' 'October' 'November' 'December' 'January'
 'February' 'March' 'April' 'May' 'June']
meal: ['BB' 'FB' 'HB' 'SC']
market_segment: ['Direct' 'Corporate' 'Online TA' 'Offline TA/TO' 'Complementary' 'Groups'
 'Undefined' 'Aviation']
is_repeated_guest: [0 1]
reserved_room_type: ['C' 'A' 'D' 'E' 'G' 'F' 'H' 'L' 'P' 'B']


In [460]:
# One-hot encode the categorical features.
# A single get_dummies call replaces five copy-pasted
# get_dummies / concat / drop sequences. Each listed column is removed and
# replaced by indicator columns named "<column>_<value>", appended in the order
# of the list below; is_canceled is left untouched and stays first.
# dtype is pinned to uint8 because newer pandas releases emit bool indicators
# by default, which would not match the dtypes recorded in this notebook.
one_hot_cols = [
    'arrival_date_month',
    'meal',
    'reserved_room_type',
    'market_segment',
    'is_repeated_guest',
]
categorical_df = pd.get_dummies(categorical_df, columns=one_hot_cols, dtype=np.uint8)

categorical_df.head()

Unnamed: 0,is_canceled,arrival_date_month_April,arrival_date_month_August,arrival_date_month_December,arrival_date_month_February,arrival_date_month_January,arrival_date_month_July,arrival_date_month_June,arrival_date_month_March,arrival_date_month_May,...,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,market_segment_Undefined,is_repeated_guest_0,is_repeated_guest_1
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [461]:
# Remove the target so that only feature columns remain in the numeric frame.
numerical_df = numerical_df.drop(columns='is_canceled')
numerical_df.head()

Unnamed: 0,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,previous_cancellations,previous_bookings_not_canceled,adr,required_car_parking_spaces,total_of_special_requests
0,342,0,0,2,0.0,0,0,0.0,0,0
1,737,0,0,2,0.0,0,0,0.0,0,0
2,7,0,1,1,0.0,0,0,75.0,0,0
3,13,0,1,1,0.0,0,0,75.0,0,0
4,14,0,2,2,0.0,0,0,98.0,0,1


In [462]:
# Inspect the per-column variance of the unscaled numeric features.
numerical_df.var()

lead_time                         11419.721511
stays_in_weekend_nights               0.997229
stays_in_week_nights                  3.641554
adults                                0.335543
children                              0.158846
previous_cancellations                0.712904
previous_bookings_not_canceled        2.242317
adr                                2553.866100
required_car_parking_spaces           0.058258
total_of_special_requests             0.628529
dtype: float64

In [463]:
# Standardise every numeric feature (remove the mean, scale to unit variance).
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split further down, so test-row statistics influence the scaling. Consider
# fitting on the training rows only.
scaler = StandardScaler()
num_normalized_df = pd.DataFrame(
    scaler.fit_transform(numerical_df),
    columns=numerical_df.columns,
    # fit_transform returns a bare array; restore the row labels so the later
    # axis=1 concat with categorical_df aligns row-for-row even if the frame's
    # index is not a default 0..n-1 range.
    index=numerical_df.index,
)
# Sanity check: every column should now have (approximately) unit variance.
num_normalized_df.var()

lead_time                         1.000008
stays_in_weekend_nights           1.000008
stays_in_week_nights              1.000008
adults                            1.000008
children                          1.000008
previous_cancellations            1.000008
previous_bookings_not_canceled    1.000008
adr                               1.000008
required_car_parking_spaces       1.000008
total_of_special_requests         1.000008
dtype: float64

In [464]:
# Preview the standardised numeric features.
num_normalized_df.head()

Unnamed: 0,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,previous_cancellations,previous_bookings_not_canceled,adr,required_car_parking_spaces,total_of_special_requests
0,2.227051,-0.92889,-1.31024,0.247897,-0.260659,-0.10318,-0.091555,-2.015038,-0.257351,-0.720694
1,5.923385,-0.92889,-1.31024,0.247897,-0.260659,-0.10318,-0.091555,-2.015038,-0.257351,-0.720694
2,-0.907814,-0.92889,-0.786207,-1.478447,-0.260659,-0.10318,-0.091555,-0.530935,-0.257351,-0.720694
3,-0.851667,-0.92889,-0.786207,-1.478447,-0.260659,-0.10318,-0.091555,-0.530935,-0.257351,-0.720694
4,-0.842309,-0.92889,-0.262174,0.247897,-0.260659,-0.10318,-0.091555,-0.07581,-0.257351,0.540666


In [472]:
# Target vector, taken from the cleaned training frame.
y = train_df['is_canceled']

# Feature matrix: one-hot columns side by side with the standardised numeric
# columns, with the target column (carried along in categorical_df) removed.
df = pd.concat([categorical_df, num_normalized_df], axis=1).drop(columns=['is_canceled'])
x = df
x.head()

Unnamed: 0,arrival_date_month_April,arrival_date_month_August,arrival_date_month_December,arrival_date_month_February,arrival_date_month_January,arrival_date_month_July,arrival_date_month_June,arrival_date_month_March,arrival_date_month_May,arrival_date_month_November,...,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,previous_cancellations,previous_bookings_not_canceled,adr,required_car_parking_spaces,total_of_special_requests
0,0,0,0,0,0,1,0,0,0,0,...,2.227051,-0.92889,-1.31024,0.247897,-0.260659,-0.10318,-0.091555,-2.015038,-0.257351,-0.720694
1,0,0,0,0,0,1,0,0,0,0,...,5.923385,-0.92889,-1.31024,0.247897,-0.260659,-0.10318,-0.091555,-2.015038,-0.257351,-0.720694
2,0,0,0,0,0,1,0,0,0,0,...,-0.907814,-0.92889,-0.786207,-1.478447,-0.260659,-0.10318,-0.091555,-0.530935,-0.257351,-0.720694
3,0,0,0,0,0,1,0,0,0,0,...,-0.851667,-0.92889,-0.786207,-1.478447,-0.260659,-0.10318,-0.091555,-0.530935,-0.257351,-0.720694
4,0,0,0,0,0,1,0,0,0,0,...,-0.842309,-0.92889,-0.262174,0.247897,-0.260659,-0.10318,-0.091555,-0.07581,-0.257351,0.540666


In [87]:
#test_df = test_df.drop(['Unnamed: 0', 'Booking_ID', 'arrival_date_year', 'arrival_date_day_of_month'], axis = 1)
#print(test_df.dtypes)

In [88]:
#cat_test_df = test_df[categorical_cols]
#cat_test_df.head()

In [89]:
#Encoding of Categorical variables of test data
#cat_test_df['arrival_date_month'] = cat_test_df['arrival_date_month'].map({'January': 0, 'February': 1, 'March': 2,
                                                                                 #'April': 3, 'May': 4, 'June': 5, 'July': 6,
                                                                                 #'August': 7, 'September': 8, 'October': 9, 
                                                                                 #'November': 10, 'December': 11})
#cat_test_df['meal'] = cat_test_df['meal'].map({'BB' : 0, 'FB': 1, 'HB': 2, 'SC': 3})
#cat_test_df['reserved_room_type'] = cat_test_df['reserved_room_type'].map({'A': 0, 'B': 1, 'D': 2, 'F': 3, 'E': 4, 'G': 5, 'C': 6,
                                                                   #'P': 7})
#cat_test_df['market_segment'] = cat_test_df['market_segment'].map({'Offline TA/TO': 0, 'Online TA': 1, 'Groups': 2, 'Complementary': 3,
                                                           #'Direct': 4, 'Corporate': 5, 'Undefined': 6, 'Aviation': 7})
#cat_test_df.head()


In [90]:
#Encoding of Numerical variables of test data
#num_test_df = test_df.drop(columns = categorical_cols, axis = 1)
#num_test_df.drop('is_canceled', axis = 1, inplace = True)
#scaler = StandardScaler()
#num_normalized_test_df = pd.DataFrame(scaler.fit_transform(num_test_df), columns = num_test_df.columns)
#num_normalized_test_df.head()


In [473]:
#y_test = test_df['is_canceled']
#x_test = pd.concat([cat_test_df, num_normalized_test_df], axis = 1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)


In [481]:
# List every column of the feature matrix together with its dtype.
print(x.dtypes)

arrival_date_month_April            uint8
arrival_date_month_August           uint8
arrival_date_month_December         uint8
arrival_date_month_February         uint8
arrival_date_month_January          uint8
arrival_date_month_July             uint8
arrival_date_month_June             uint8
arrival_date_month_March            uint8
arrival_date_month_May              uint8
arrival_date_month_November         uint8
arrival_date_month_October          uint8
arrival_date_month_September        uint8
meal_BB                             uint8
meal_FB                             uint8
meal_HB                             uint8
meal_SC                             uint8
reserved_room_type_A                uint8
reserved_room_type_B                uint8
reserved_room_type_C                uint8
reserved_room_type_D                uint8
reserved_room_type_E                uint8
reserved_room_type_F                uint8
reserved_room_type_G                uint8
reserved_room_type_H              

### Models

#### Logistic Regression


In [474]:
# Logistic regression with scikit-learn's default settings, scored on the held-out rows.
logreg = LogisticRegression().fit(x_train, y_train)
logreg_pred = logreg.predict(x_test)
logreg_accuracy = accuracy_score(y_test, logreg_pred)
print(f"Accuracy: {logreg_accuracy}")

Accuracy: 0.7541670156629533


#### CatBoost Classifier

In [475]:
# CatBoost limited to 100 boosting iterations, scored on the held-out rows.
cat = CatBoostClassifier(iterations=100)
cat.fit(x_train, y_train)

y_pred_cat = cat.predict(x_test)
cat_accuracy = accuracy_score(y_test, y_pred_cat)
print(f"Accuracy: {cat_accuracy}")

Learning rate set to 0.5
0:	learn: 0.5337066	total: 72.3ms	remaining: 7.16s
1:	learn: 0.5023145	total: 82.2ms	remaining: 4.03s
2:	learn: 0.4876712	total: 94.1ms	remaining: 3.04s
3:	learn: 0.4784788	total: 104ms	remaining: 2.5s
4:	learn: 0.4727464	total: 114ms	remaining: 2.17s
5:	learn: 0.4677948	total: 126ms	remaining: 1.97s
6:	learn: 0.4639203	total: 136ms	remaining: 1.81s
7:	learn: 0.4608041	total: 147ms	remaining: 1.69s
8:	learn: 0.4588126	total: 157ms	remaining: 1.59s
9:	learn: 0.4574205	total: 167ms	remaining: 1.5s
10:	learn: 0.4550140	total: 178ms	remaining: 1.44s
11:	learn: 0.4523263	total: 188ms	remaining: 1.38s
12:	learn: 0.4511281	total: 198ms	remaining: 1.32s
13:	learn: 0.4493903	total: 210ms	remaining: 1.29s
14:	learn: 0.4480038	total: 220ms	remaining: 1.25s
15:	learn: 0.4457576	total: 231ms	remaining: 1.21s
16:	learn: 0.4426619	total: 243ms	remaining: 1.18s
17:	learn: 0.4411147	total: 253ms	remaining: 1.15s
18:	learn: 0.4394574	total: 264ms	remaining: 1.12s
19:	learn: 0.43

#### Random Forest Classifier

In [476]:
# Random forest with default hyper-parameters, scored on the held-out rows.
# The seed is fixed (same value as the train/test split) because the forest is
# randomised: without it the accuracy, and the model that is pickled at the
# end of the notebook, change from run to run.
rd_clf = RandomForestClassifier(random_state=42)
rd_clf.fit(x_train, y_train)
y_pred_rd_clf = rd_clf.predict(x_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_rd_clf)}")

Accuracy: 0.853505318703409


#### Gradient Boosting Classifier

In [477]:
# Gradient boosting with default hyper-parameters, scored on the held-out rows.
# random_state seeds the individual trees and the feature permutation used at
# each split, so fixing it keeps the reported accuracy repeatable across runs.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(x_train, y_train)
y_pred_gb = gb.predict(x_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_gb)}")

Accuracy: 0.7826032331015998


#### Ada Boost Classifier

In [478]:
# AdaBoost with default hyper-parameters, scored on the held-out rows.
# random_state seeds the base estimators built at each boosting round, so
# fixing it keeps the reported accuracy repeatable across runs.
ada = AdaBoostClassifier(random_state=42)
ada.fit(x_train, y_train)
y_pred_ada = ada.predict(x_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_ada)}")

Accuracy: 0.7694949325739174


#### K Nearest Neighbours Classifier

In [479]:
# k-nearest neighbours with scikit-learn's default settings, scored on the held-out rows.
knn = KNeighborsClassifier().fit(x_train, y_train)
knn_pred = knn.predict(x_test)
knn_accuracy = accuracy_score(y_test, knn_pred)
print(f"Accuracy: {knn_accuracy}")

Accuracy: 0.810788173213837


### Model Comparison

Ranking Models By Accuracy

1. 0.853505 Random Forest Classifier
2. 0.820504 CatBoost Classifier
3. 0.810788 K Nearest Neighbours Classifier
4. 0.782603 Gradient Boosting Classifier
5. 0.769494 Ada Boost Classifier
6. 0.754167 Logistic Regression

In [480]:
# Dump the fitted random forest (the top-ranked model) into a pickle file.
# A context manager is used so the file is flushed and closed deterministically;
# passing a bare open(...) to pickle.dump leaves the handle open.
# NOTE(review): only the classifier is persisted. The fitted StandardScaler and
# the one-hot column layout used to build x are not saved, so any consumer of
# this file has to reproduce that preprocessing exactly. Confirm this is
# handled wherever the model is loaded.
with open('cancellation_model.pkl', 'wb') as model_file:
    pickle.dump(rd_clf, model_file)