In [584]:
import numpy as np
import pandas as pd
import pickle

from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler      # Remove Mean and scale to Unit Variance
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier


import warnings
warnings.filterwarnings('ignore')


In [585]:
train_df = pd.read_csv('../../Dataset/batch_data/city_hotel_bookings_updated.csv')
#test_df = pd.read_csv('hotel_reservations_updated.csv')

#Data cleaning to make 2015-2016 dataset match the incoming streaming data
train_df.loc[(train_df['meal'] == 'Undefined'), 'meal']= 'SC'
train_df.loc[ (train_df['required_car_parking_spaces'] > 1), 'required_car_parking_spaces']= 1

In [586]:
train_df.head()

Unnamed: 0,Booking_ID,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,40060,0,6,2015,July,27,1,0,2,1,...,No Deposit,6.0,,0,Transient,0.0,0,0,Check-Out,2015-07-03
1,40061,1,88,2015,July,27,1,0,4,2,...,No Deposit,9.0,,0,Transient,76.5,0,1,Canceled,2015-07-01
2,40062,1,65,2015,July,27,1,0,4,1,...,No Deposit,9.0,,0,Transient,68.0,0,1,Canceled,2015-04-30
3,40063,1,92,2015,July,27,1,2,4,2,...,No Deposit,9.0,,0,Transient,76.5,0,2,Canceled,2015-06-23
4,40064,1,100,2015,July,27,2,0,2,2,...,No Deposit,9.0,,0,Transient,76.5,0,1,Canceled,2015-04-02


In [587]:
#Checking null values
train_df.isnull().sum()[train_df.isnull().sum() > 0]

country       24
agent       8131
company    75641
dtype: int64

In [588]:
#Filling in NA values for children column
train_df.children.fillna(0, inplace = True)

In [589]:
train_df.drop(['arrival_date_week_number', 'babies', 'country', 'distribution_channel', 
                               'assigned_room_type', 'booking_changes', 'deposit_type', 'agent', 'company', 
                               'days_in_waiting_list', 'customer_type', 'reservation_status', 'reservation_status_date'], 
                              axis = 1, inplace = True)

print(train_df.dtypes)

Booking_ID                          int64
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                            int64
meal                               object
market_segment                     object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
adr                               float64
required_car_parking_spaces         int64
total_of_special_requests           int64
dtype: object


In [590]:
train_df.head()

Unnamed: 0,Booking_ID,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,meal,market_segment,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,adr,required_car_parking_spaces,total_of_special_requests
0,40060,0,6,2015,July,1,0,2,1,0,HB,Offline TA/TO,0,0,0,A,0.0,0,0
1,40061,1,88,2015,July,1,0,4,2,0,BB,Online TA,0,0,0,A,76.5,0,1
2,40062,1,65,2015,July,1,0,4,1,0,BB,Online TA,0,0,0,A,68.0,0,1
3,40063,1,92,2015,July,1,2,4,2,0,BB,Online TA,0,0,0,A,76.5,0,2
4,40064,1,100,2015,July,2,0,2,2,0,BB,Online TA,0,0,0,A,76.5,0,1


In [591]:
correlation = train_df.corr()['is_canceled'].abs().sort_values(ascending = False)
correlation

is_canceled                       1.000000
Booking_ID                        0.647099
lead_time                         0.309242
total_of_special_requests         0.293889
previous_cancellations            0.166643
required_car_parking_spaces       0.133481
is_repeated_guest                 0.065840
previous_bookings_not_canceled    0.053134
adults                            0.053054
stays_in_week_nights              0.048704
children                          0.027089
adr                               0.011965
stays_in_weekend_nights           0.007343
arrival_date_day_of_month         0.004331
arrival_date_year                 0.003004
Name: is_canceled, dtype: float64

In [592]:
#Dropping the arrival date variables (excluding month) which are not useful/relevant to the cancellation
train_df.drop(['arrival_date_year', 'arrival_date_day_of_month'], axis = 1, inplace = True)
train_df.head()

Unnamed: 0,Booking_ID,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,meal,market_segment,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,adr,required_car_parking_spaces,total_of_special_requests
0,40060,0,6,July,0,2,1,0,HB,Offline TA/TO,0,0,0,A,0.0,0,0
1,40061,1,88,July,0,4,2,0,BB,Online TA,0,0,0,A,76.5,0,1
2,40062,1,65,July,0,4,1,0,BB,Online TA,0,0,0,A,68.0,0,1
3,40063,1,92,July,2,4,2,0,BB,Online TA,0,0,0,A,76.5,0,2
4,40064,1,100,July,0,2,2,0,BB,Online TA,0,0,0,A,76.5,0,1


In [593]:
categorical_cols = ['arrival_date_month', 'meal', 'reserved_room_type', 'market_segment', 'is_repeated_guest']
numerical_cols = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'previous_cancellations', 'previous_bookings_not_canceled',
                  'adr', 'required_car_parking_spaces', 'total_of_special_requests']
booking_df = train_df['Booking_ID']
train_df = train_df.drop(['Booking_ID'], axis=1)
numerical_df = train_df.drop(categorical_cols, axis = 1)
categorical_df = train_df.drop(numerical_cols, axis = 1)




In [594]:
#Checking for all unique values of the categorical columns in the training dataframe for encoding
for col in categorical_df.columns:
    print(f"{col}: {categorical_df[col].unique()}")

is_canceled: [0 1]
arrival_date_month: ['July' 'August' 'September' 'October' 'November' 'December' 'January'
 'February' 'March' 'April' 'May' 'June']
meal: ['HB' 'BB' 'SC' 'FB']
market_segment: ['Offline TA/TO' 'Online TA' 'Groups' 'Complementary' 'Direct' 'Corporate'
 'Undefined' 'Aviation']
is_repeated_guest: [0 1]
reserved_room_type: ['A' 'B' 'D' 'F' 'E' 'G' 'C' 'P']


In [595]:
#Encoding categorical variables
#categorical_df['arrival_date_month'] = categorical_df['arrival_date_month'].map({'January': 0, 'February': 1, 'March': 2,
                                                                                 #'April': 3, 'May': 4, 'June': 5, 'July': 6,
                                                                                 #'August': 7, 'September': 8, 'October': 9, 
                                                                                 #'November': 10, 'December': 11})
#categorical_df['meal'] = categorical_df['meal'].map({'BB' : 0, 'FB': 1, 'HB': 2, 'SC': 3})
#categorical_df['reserved_room_type'] = categorical_df['reserved_room_type'].map({'A': 0, 'B': 1, 'D': 2, 'F': 3, 'E': 4, 'G': 5, 'C': 6,
                                                                   #'P': 7})
#categorical_df['market_segment'] = categorical_df['market_segment'].map({'Offline TA/TO': 0, 'Online TA': 1, 'Groups': 2, 'Complementary': 3,
                                                           #'Direct': 4, 'Corporate': 5, 'Undefined': 6, 'Aviation': 7})

# perform one-hot encoding for the arrival_date_month column
arrival_date_month_one_hot = pd.get_dummies(categorical_df['arrival_date_month'], prefix='arrival_date_month')
# concatenate the one-hot encoded columns to the original dataset
categorical_df = pd.concat([categorical_df, arrival_date_month_one_hot], axis=1)
# drop the original arrival_date_month column
categorical_df.drop('arrival_date_month', axis=1, inplace=True)

# perform one-hot encoding for the meal column
meal_one_hot = pd.get_dummies(categorical_df['meal'], prefix='meal')
# concatenate the one-hot encoded columns to the original dataset
categorical_df = pd.concat([categorical_df, meal_one_hot], axis=1)
# drop the original meal column
categorical_df.drop('meal', axis=1, inplace=True)

# perform one-hot encoding for the reserved_room_type column
reserved_room_type_one_hot = pd.get_dummies(categorical_df['reserved_room_type'], prefix='reserved_room_type')
# concatenate the one-hot encoded columns to the original dataset
categorical_df = pd.concat([categorical_df, reserved_room_type_one_hot], axis=1)
# drop the original reserved_room_type column
categorical_df.drop('reserved_room_type', axis=1, inplace=True)

# perform one-hot encoding for the market_segment column
market_segment_one_hot = pd.get_dummies(categorical_df['market_segment'], prefix='market_segment')
# concatenate the one-hot encoded columns to the original dataset
categorical_df = pd.concat([categorical_df, market_segment_one_hot], axis=1)
# drop the original customer_type column
categorical_df.drop('market_segment', axis=1, inplace=True)

categorical_df.head()

Unnamed: 0,is_canceled,is_repeated_guest,arrival_date_month_April,arrival_date_month_August,arrival_date_month_December,arrival_date_month_February,arrival_date_month_January,arrival_date_month_July,arrival_date_month_June,arrival_date_month_March,...,reserved_room_type_G,reserved_room_type_P,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,market_segment_Undefined
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [596]:
numerical_df.drop('is_canceled', axis = 1, inplace = True)
numerical_df.head()

Unnamed: 0,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,previous_cancellations,previous_bookings_not_canceled,adr,required_car_parking_spaces,total_of_special_requests
0,6,0,2,1,0,0,0,0.0,0,0
1,88,0,4,2,0,0,0,76.5,0,1
2,65,0,4,1,0,0,0,68.0,0,1
3,92,2,4,2,0,0,0,76.5,0,2
4,100,0,2,2,0,0,0,76.5,0,1


In [597]:
numerical_df.var()

lead_time                         12309.575369
stays_in_weekend_nights               0.783272
stays_in_week_nights                  2.121148
adults                                0.259379
children                              0.138509
previous_cancellations                0.172617
previous_bookings_not_canceled        2.867639
adr                                1901.217583
required_car_parking_spaces           0.023689
total_of_special_requests             0.609612
dtype: float64

In [598]:
scaler = StandardScaler()
num_normalized_df = pd.DataFrame(scaler.fit_transform(numerical_df), columns = numerical_df.columns)
num_normalized_df.var()

lead_time                         1.000013
stays_in_weekend_nights           1.000013
stays_in_week_nights              1.000013
adults                            1.000013
children                          1.000013
previous_cancellations            1.000013
previous_bookings_not_canceled    1.000013
adr                               1.000013
required_car_parking_spaces       1.000013
total_of_special_requests         1.000013
dtype: float64

In [599]:
num_normalized_df.head()

Unnamed: 0,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,previous_cancellations,previous_bookings_not_canceled,adr,required_car_parking_spaces,total_of_special_requests
0,-0.934996,-0.898493,-0.125622,-1.670911,-0.245496,-0.191934,-0.078169,-2.415092,-0.157742,-0.700484
1,-0.195909,-0.898493,1.24762,0.29261,-0.245496,-0.191934,-0.078169,-0.660612,-0.157742,0.580301
2,-0.403214,-0.898493,1.24762,-1.670911,-0.245496,-0.191934,-0.078169,-0.855554,-0.157742,0.580301
3,-0.159856,1.361341,1.24762,0.29261,-0.245496,-0.191934,-0.078169,-0.660612,-0.157742,1.861085
4,-0.08775,-0.898493,-0.125622,0.29261,-0.245496,-0.191934,-0.078169,-0.660612,-0.157742,0.580301


In [600]:
df = pd.concat([booking_df, categorical_df, num_normalized_df], axis = 1)
y = train_df['is_canceled']
df.drop(['is_canceled'], axis=1, inplace=True)
x = df
x.head()

Unnamed: 0,Booking_ID,is_repeated_guest,arrival_date_month_April,arrival_date_month_August,arrival_date_month_December,arrival_date_month_February,arrival_date_month_January,arrival_date_month_July,arrival_date_month_June,arrival_date_month_March,...,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,previous_cancellations,previous_bookings_not_canceled,adr,required_car_parking_spaces,total_of_special_requests
0,40060,0,0,0,0,0,0,1,0,0,...,-0.934996,-0.898493,-0.125622,-1.670911,-0.245496,-0.191934,-0.078169,-2.415092,-0.157742,-0.700484
1,40061,0,0,0,0,0,0,1,0,0,...,-0.195909,-0.898493,1.24762,0.29261,-0.245496,-0.191934,-0.078169,-0.660612,-0.157742,0.580301
2,40062,0,0,0,0,0,0,1,0,0,...,-0.403214,-0.898493,1.24762,-1.670911,-0.245496,-0.191934,-0.078169,-0.855554,-0.157742,0.580301
3,40063,0,0,0,0,0,0,1,0,0,...,-0.159856,1.361341,1.24762,0.29261,-0.245496,-0.191934,-0.078169,-0.660612,-0.157742,1.861085
4,40064,0,0,0,0,0,0,1,0,0,...,-0.08775,-0.898493,-0.125622,0.29261,-0.245496,-0.191934,-0.078169,-0.660612,-0.157742,0.580301


In [601]:
#test_df = test_df.drop(['Unnamed: 0', 'Booking_ID', 'arrival_date_year', 'arrival_date_day_of_month'], axis = 1)
#print(test_df.dtypes)

In [602]:
#cat_test_df = test_df[categorical_cols]
#cat_test_df.head()

In [603]:
#Encoding of Categorical variables of test data
#cat_test_df['arrival_date_month'] = cat_test_df['arrival_date_month'].map({'January': 0, 'February': 1, 'March': 2,
                                                                                 #'April': 3, 'May': 4, 'June': 5, 'July': 6,
                                                                                 #'August': 7, 'September': 8, 'October': 9, 
                                                                                 #'November': 10, 'December': 11})
#cat_test_df['meal'] = cat_test_df['meal'].map({'BB' : 0, 'FB': 1, 'HB': 2, 'SC': 3})
#cat_test_df['reserved_room_type'] = cat_test_df['reserved_room_type'].map({'A': 0, 'B': 1, 'D': 2, 'F': 3, 'E': 4, 'G': 5, 'C': 6,
                                                                   #'P': 7})
#cat_test_df['market_segment'] = cat_test_df['market_segment'].map({'Offline TA/TO': 0, 'Online TA': 1, 'Groups': 2, 'Complementary': 3,
                                                           #'Direct': 4, 'Corporate': 5, 'Undefined': 6, 'Aviation': 7})
#cat_test_df.head()


In [604]:
#Encoding of Numerical variables of test data
#num_test_df = test_df.drop(columns = categorical_cols, axis = 1)
#num_test_df.drop('is_canceled', axis = 1, inplace = True)
#scaler = StandardScaler()
#num_normalized_test_df = pd.DataFrame(scaler.fit_transform(num_test_df), columns = num_test_df.columns)
#num_normalized_test_df.head()


In [605]:
#y_test = test_df['is_canceled']
#x_test = pd.concat([cat_test_df, num_normalized_test_df], axis = 1)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2, random_state=42)


In [606]:
print(x.dtypes)

Booking_ID                          int64
is_repeated_guest                   int64
arrival_date_month_April            uint8
arrival_date_month_August           uint8
arrival_date_month_December         uint8
arrival_date_month_February         uint8
arrival_date_month_January          uint8
arrival_date_month_July             uint8
arrival_date_month_June             uint8
arrival_date_month_March            uint8
arrival_date_month_May              uint8
arrival_date_month_November         uint8
arrival_date_month_October          uint8
arrival_date_month_September        uint8
meal_BB                             uint8
meal_FB                             uint8
meal_HB                             uint8
meal_SC                             uint8
reserved_room_type_A                uint8
reserved_room_type_B                uint8
reserved_room_type_C                uint8
reserved_room_type_D                uint8
reserved_room_type_E                uint8
reserved_room_type_F              

### Models

#### Logistic Regression


In [607]:
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
logreg_pred = logreg.predict(x_test)
print(f"Accuracy: {accuracy_score(y_test, logreg_pred)}")

Accuracy: 0.5843312744232951


#### CatBoost Classifier

In [608]:
cat = CatBoostClassifier(iterations=100)
cat.fit(x_train, y_train)
y_pred_cat = cat.predict(x_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_cat)}")

Learning rate set to 0.5
0:	learn: 0.2166892	total: 20.5ms	remaining: 2.03s
1:	learn: 0.1371537	total: 32ms	remaining: 1.57s
2:	learn: 0.1239886	total: 43.1ms	remaining: 1.39s
3:	learn: 0.1175463	total: 56.2ms	remaining: 1.35s
4:	learn: 0.1125018	total: 70.5ms	remaining: 1.34s
5:	learn: 0.1063130	total: 83.8ms	remaining: 1.31s
6:	learn: 0.1009875	total: 96.9ms	remaining: 1.29s
7:	learn: 0.0974025	total: 110ms	remaining: 1.27s
8:	learn: 0.0951751	total: 126ms	remaining: 1.27s
9:	learn: 0.0911580	total: 139ms	remaining: 1.25s
10:	learn: 0.0893913	total: 152ms	remaining: 1.23s
11:	learn: 0.0876165	total: 164ms	remaining: 1.2s
12:	learn: 0.0859635	total: 177ms	remaining: 1.18s
13:	learn: 0.0831951	total: 190ms	remaining: 1.17s
14:	learn: 0.0812187	total: 203ms	remaining: 1.15s
15:	learn: 0.0802166	total: 216ms	remaining: 1.13s
16:	learn: 0.0788103	total: 228ms	remaining: 1.11s
17:	learn: 0.0777236	total: 238ms	remaining: 1.08s
18:	learn: 0.0767012	total: 249ms	remaining: 1.06s
19:	learn: 0

#### Random Forest Classifier

In [609]:
rd_clf = RandomForestClassifier()
rd_clf.fit(x_train, y_train)
y_pred_rd_clf = rd_clf.predict(x_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_rd_clf)}")

Accuracy: 0.9803983360645405


#### Gradient Boosting Classifier

In [610]:
gb = GradientBoostingClassifier()
gb.fit(x_train, y_train)
y_pred_gb = gb.predict(x_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_gb)}")

Accuracy: 0.9630026471700491


#### Ada Boost Classifier

In [611]:
ada = AdaBoostClassifier()
ada.fit(x_train, y_train)
y_pred_ada = ada.predict(x_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_ada)}")

Accuracy: 0.9529812176982226


#### K Nearest Neighbours Classifier

In [612]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)
print(f"Accuracy: {accuracy_score(y_test, knn_pred)}")

Accuracy: 0.9537375519979832


### Model Comparison

Ranking Models By Accuracy/ TO EDIT

1. 0.980398 Random Forest Classifier
2. 0.979957 CatBoost Classifier
3. 0.963002 Gradient Boosting Classifier
4. 0.953737 K Nearest Neighbours Classifier
5. 0.952981 Ada Boost Classifier
6. 0.584331 Logistic Regression

In [613]:
#Dumping model into Pickle File
pickle.dump(rd_clf, open('cancellation_model.pkl', 'wb'))