In [769]:
import numpy as np
import pandas as pd
import pickle

from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler      # Remove Mean and scale to Unit Variance
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier


import warnings
# Installs a blanket 'ignore' filter: from here on every warning issued through
# the warnings module is suppressed for the rest of the session.
warnings.filterwarnings('ignore')


In [770]:
# Load the batch bookings file used for training.
train_df = pd.read_csv('../../Dataset/batch_data/city_hotel_bookings_updated (4).csv')
#test_df = pd.read_csv('hotel_reservations_updated.csv')

# Data cleaning so the 2015-2016 dataset matches the incoming streaming data:
#   * an 'Undefined' meal plan is recoded as 'SC'
#   * required_car_parking_spaces is capped at 1
train_df['meal'] = train_df['meal'].mask(train_df['meal'] == 'Undefined', 'SC')
train_df['required_car_parking_spaces'] = train_df['required_car_parking_spaces'].clip(upper=1)

In [771]:
# Preview the first five rows of the raw frame
train_df.head(n=5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,Booking_ID
0,City Hotel,1,88,2015,July,27,1,0,4,2,...,9.0,,0,Transient,76.5,0,1,Canceled,2015-07-01,40061
1,City Hotel,1,65,2015,July,27,1,0,4,1,...,9.0,,0,Transient,68.0,0,1,Canceled,2015-04-30,40062
2,City Hotel,1,92,2015,July,27,1,2,4,2,...,9.0,,0,Transient,76.5,0,2,Canceled,2015-06-23,40063
3,City Hotel,1,100,2015,July,27,2,0,2,2,...,9.0,,0,Transient,76.5,0,1,Canceled,2015-04-02,40064
4,City Hotel,1,79,2015,July,27,2,0,3,2,...,9.0,,0,Transient,76.5,0,1,Canceled,2015-06-25,40065


In [772]:
# Missing-value count per column, keeping only the columns that have at least one
train_df.isnull().sum().loc[lambda counts: counts > 0]

children        1
country        16
agent        7388
company     74138
dtype: int64

In [773]:
# Fill missing values in the children column with 0.
# The result is assigned back to the column instead of calling an in-place
# method on the `train_df.children` accessor: with pandas Copy-on-Write that
# accessor is a temporary object, so an in-place fill on it leaves train_df
# unchanged (and the accompanying warning is hidden by the global filter).
train_df['children'] = train_df['children'].fillna(0)

In [774]:
# Columns removed from the frame before modelling
unused_cols = [
    'arrival_date_week_number', 'babies', 'country', 'distribution_channel',
    'assigned_room_type', 'booking_changes', 'deposit_type', 'agent', 'company',
    'days_in_waiting_list', 'customer_type', 'reservation_status',
    'reservation_status_date', 'hotel',
]
train_df = train_df.drop(columns=unused_cols)

print(train_df.dtypes)

is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
meal                               object
market_segment                     object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
adr                               float64
required_car_parking_spaces         int64
total_of_special_requests           int64
Booking_ID                          int64
dtype: object


In [775]:
# Preview the first five rows after the column drop
train_df.head(n=5)

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,meal,market_segment,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,adr,required_car_parking_spaces,total_of_special_requests,Booking_ID
0,1,88,2015,July,1,0,4,2,0.0,BB,Online TA,0,0,0,A,76.5,0,1,40061
1,1,65,2015,July,1,0,4,1,0.0,BB,Online TA,0,0,0,A,68.0,0,1,40062
2,1,92,2015,July,1,2,4,2,0.0,BB,Online TA,0,0,0,A,76.5,0,2,40063
3,1,100,2015,July,2,0,2,2,0.0,BB,Online TA,0,0,0,A,76.5,0,1,40064
4,1,79,2015,July,2,0,3,2,0.0,BB,Online TA,0,0,0,A,76.5,0,1,40065


In [776]:
# Absolute correlation of each numeric column with is_canceled, strongest first.
# Only numeric columns are handed to corr(): train_df still contains object
# columns (arrival_date_month, meal, market_segment, reserved_room_type), and
# pandas >= 2.0 raises on them instead of silently dropping them.
correlation = (
    train_df.select_dtypes(include='number')
    .corr()['is_canceled']
    .abs()
    .sort_values(ascending=False)
)
correlation

is_canceled                       1.000000
Booking_ID                        0.653194
lead_time                         0.305611
total_of_special_requests         0.297411
previous_cancellations            0.170302
required_car_parking_spaces       0.134466
previous_bookings_not_canceled    0.047835
adr                               0.047745
is_repeated_guest                 0.047165
stays_in_week_nights              0.045164
adults                            0.042420
children                          0.027290
stays_in_weekend_nights           0.010552
arrival_date_day_of_month         0.006041
arrival_date_year                 0.005535
Name: is_canceled, dtype: float64

In [777]:
# Arrival year and day-of-month are dropped as not useful/relevant to the
# cancellation; the arrival month is kept.
train_df = train_df.drop(columns=['arrival_date_year', 'arrival_date_day_of_month'])
train_df.head(n=5)

Unnamed: 0,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,meal,market_segment,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,adr,required_car_parking_spaces,total_of_special_requests,Booking_ID
0,1,88,July,0,4,2,0.0,BB,Online TA,0,0,0,A,76.5,0,1,40061
1,1,65,July,0,4,1,0.0,BB,Online TA,0,0,0,A,68.0,0,1,40062
2,1,92,July,2,4,2,0.0,BB,Online TA,0,0,0,A,76.5,0,2,40063
3,1,100,July,0,2,2,0.0,BB,Online TA,0,0,0,A,76.5,0,1,40064
4,1,79,July,0,3,2,0.0,BB,Online TA,0,0,0,A,76.5,0,1,40065


In [778]:
# Feature groups used to split the frame
categorical_cols = [
    'arrival_date_month', 'meal', 'reserved_room_type', 'market_segment', 'is_repeated_guest',
]
numerical_cols = [
    'lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children',
    'previous_cancellations', 'previous_bookings_not_canceled',
    'adr', 'required_car_parking_spaces', 'total_of_special_requests',
]

# Keep the booking identifier aside, then remove it from the frame
booking_df = train_df['Booking_ID']
train_df = train_df.drop(columns=['Booking_ID'])

# Each frame is "everything except the other group", so both still carry is_canceled
numerical_df = train_df.drop(columns=categorical_cols)
categorical_df = train_df.drop(columns=numerical_cols)




In [779]:
# List the distinct values of every column in categorical_df, as a guide for encoding
for col_name, col_values in categorical_df.items():
    print(f"{col_name}: {col_values.unique()}")

is_canceled: [1 0]
arrival_date_month: ['July' 'August' 'September' 'October' 'November' 'December' 'January'
 'February' 'March' 'April' 'May' 'June']
meal: ['BB' 'HB' 'SC' 'FB']
market_segment: ['Online TA' 'Groups' 'Offline TA/TO' 'Direct' 'Corporate' 'Aviation'
 'Complementary']
is_repeated_guest: [0 1]
reserved_room_type: ['A' 'D' 'B' 'F' 'E' 'G' 'C']


In [780]:
# Encoding categorical variables.
# One get_dummies call replaces the four copy-pasted encode / concat / drop
# passes: each listed column is removed and replaced by indicator columns named
# "<column>_<value>", appended after the untouched columns (is_canceled,
# is_repeated_guest) in the order the columns are listed here.
one_hot_cols = ['arrival_date_month', 'meal', 'reserved_room_type', 'market_segment']
categorical_df = pd.get_dummies(categorical_df, columns=one_hot_cols)

categorical_df.head()

Unnamed: 0,is_canceled,is_repeated_guest,arrival_date_month_April,arrival_date_month_August,arrival_date_month_December,arrival_date_month_February,arrival_date_month_January,arrival_date_month_July,arrival_date_month_June,arrival_date_month_March,...,reserved_room_type_E,reserved_room_type_F,reserved_room_type_G,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [781]:
# Remove the target so that only feature columns remain in the numeric frame
numerical_df = numerical_df.drop(columns=['is_canceled'])
numerical_df.head(n=5)

Unnamed: 0,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,previous_cancellations,previous_bookings_not_canceled,adr,required_car_parking_spaces,total_of_special_requests
0,88,0,4,2,0.0,0,0,76.5,0,1
1,65,0,4,1,0.0,0,0,68.0,0,1
2,92,2,4,2,0.0,0,0,76.5,0,2
3,100,0,2,2,0.0,0,0,76.5,0,1
4,79,0,3,2,0.0,0,0,76.5,0,1


In [782]:
# Per-column sample variance of the unscaled numeric features
numerical_df.var(ddof=1)

lead_time                         12350.144041
stays_in_weekend_nights               0.771775
stays_in_week_nights                  2.029103
adults                                0.249451
children                              0.138215
previous_cancellations                0.165923
previous_bookings_not_canceled        2.646068
adr                                1355.201352
required_car_parking_spaces           0.023509
total_of_special_requests             0.607053
dtype: float64

In [783]:
# Standardise the numeric features (remove mean, scale to unit variance).
scaler = StandardScaler()
# fit_transform returns a bare ndarray, so the frame is rebuilt with the
# ORIGINAL row index as well as the column names. Without the index the result
# gets a fresh 0..n-1 index, and the later axis=1 concat with categorical_df
# would misalign rows if any had been filtered out of train_df beforehand.
num_normalized_df = pd.DataFrame(
    scaler.fit_transform(numerical_df),
    columns=numerical_df.columns,
    index=numerical_df.index,
)
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so held-out rows influence the scaling statistics - consider fitting on the
# training split only.
num_normalized_df.var()

lead_time                         1.000013
stays_in_weekend_nights           1.000013
stays_in_week_nights              1.000013
adults                            1.000013
children                          1.000013
previous_cancellations            1.000013
previous_bookings_not_canceled    1.000013
adr                               1.000013
required_car_parking_spaces       1.000013
total_of_special_requests         1.000013
dtype: float64

In [784]:
# Preview the first five rows of the scaled numeric features
num_normalized_df.head(n=5)

Unnamed: 0,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,previous_cancellations,previous_bookings_not_canceled,adr,required_car_parking_spaces,total_of_special_requests
0,-0.209301,-0.911265,1.265828,0.276311,-0.246163,-0.190506,-0.06995,-0.841931,-0.157111,0.582224
1,-0.416265,-0.911265,1.265828,-1.7259,-0.246163,-0.190506,-0.06995,-1.072829,-0.157111,0.582224
2,-0.173307,1.365339,1.265828,0.276311,-0.246163,-0.190506,-0.06995,-0.841931,-0.157111,1.865705
3,-0.10132,-0.911265,-0.138217,0.276311,-0.246163,-0.190506,-0.06995,-0.841931,-0.157111,0.582224
4,-0.290287,-0.911265,0.563805,0.276311,-0.246163,-0.190506,-0.06995,-0.841931,-0.157111,0.582224


In [785]:
# Target vector
y = train_df['is_canceled']

# Feature matrix: encoded categoricals next to the scaled numerics, target removed
df = pd.concat([categorical_df, num_normalized_df], axis=1).drop(columns=['is_canceled'])
x = df
x.head(n=5)

Unnamed: 0,is_repeated_guest,arrival_date_month_April,arrival_date_month_August,arrival_date_month_December,arrival_date_month_February,arrival_date_month_January,arrival_date_month_July,arrival_date_month_June,arrival_date_month_March,arrival_date_month_May,...,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,previous_cancellations,previous_bookings_not_canceled,adr,required_car_parking_spaces,total_of_special_requests
0,0,0,0,0,0,0,1,0,0,0,...,-0.209301,-0.911265,1.265828,0.276311,-0.246163,-0.190506,-0.06995,-0.841931,-0.157111,0.582224
1,0,0,0,0,0,0,1,0,0,0,...,-0.416265,-0.911265,1.265828,-1.7259,-0.246163,-0.190506,-0.06995,-1.072829,-0.157111,0.582224
2,0,0,0,0,0,0,1,0,0,0,...,-0.173307,1.365339,1.265828,0.276311,-0.246163,-0.190506,-0.06995,-0.841931,-0.157111,1.865705
3,0,0,0,0,0,0,1,0,0,0,...,-0.10132,-0.911265,-0.138217,0.276311,-0.246163,-0.190506,-0.06995,-0.841931,-0.157111,0.582224
4,0,0,0,0,0,0,1,0,0,0,...,-0.290287,-0.911265,0.563805,0.276311,-0.246163,-0.190506,-0.06995,-0.841931,-0.157111,0.582224


In [799]:
# Print the dtype of every column in the feature matrix x
print(x.dtypes)

is_repeated_guest                   int64
arrival_date_month_April            uint8
arrival_date_month_August           uint8
arrival_date_month_December         uint8
arrival_date_month_February         uint8
arrival_date_month_January          uint8
arrival_date_month_July             uint8
arrival_date_month_June             uint8
arrival_date_month_March            uint8
arrival_date_month_May              uint8
arrival_date_month_November         uint8
arrival_date_month_October          uint8
arrival_date_month_September        uint8
meal_BB                             uint8
meal_FB                             uint8
meal_HB                             uint8
meal_SC                             uint8
reserved_room_type_A                uint8
reserved_room_type_B                uint8
reserved_room_type_C                uint8
reserved_room_type_D                uint8
reserved_room_type_E                uint8
reserved_room_type_F                uint8
reserved_room_type_G              

In [786]:
#test_df = test_df.drop(['Unnamed: 0', 'Booking_ID', 'arrival_date_year', 'arrival_date_day_of_month'], axis = 1)
#print(test_df.dtypes)

In [787]:
#cat_test_df = test_df[categorical_cols]
#cat_test_df.head()

In [788]:
#Encoding of Categorical variables of test data
#cat_test_df['arrival_date_month'] = cat_test_df['arrival_date_month'].map({'January': 0, 'February': 1, 'March': 2,
                                                                                 #'April': 3, 'May': 4, 'June': 5, 'July': 6,
                                                                                 #'August': 7, 'September': 8, 'October': 9, 
                                                                                 #'November': 10, 'December': 11})
#cat_test_df['meal'] = cat_test_df['meal'].map({'BB' : 0, 'FB': 1, 'HB': 2, 'SC': 3})
#cat_test_df['reserved_room_type'] = cat_test_df['reserved_room_type'].map({'A': 0, 'B': 1, 'D': 2, 'F': 3, 'E': 4, 'G': 5, 'C': 6,
                                                                   #'P': 7})
#cat_test_df['market_segment'] = cat_test_df['market_segment'].map({'Offline TA/TO': 0, 'Online TA': 1, 'Groups': 2, 'Complementary': 3,
                                                           #'Direct': 4, 'Corporate': 5, 'Undefined': 6, 'Aviation': 7})
#cat_test_df.head()


In [789]:
#Encoding of Numerical variables of test data
#num_test_df = test_df.drop(columns = categorical_cols, axis = 1)
#num_test_df.drop('is_canceled', axis = 1, inplace = True)
#scaler = StandardScaler()
#num_normalized_test_df = pd.DataFrame(scaler.fit_transform(num_test_df), columns = num_test_df.columns)
#num_normalized_test_df.head()


In [790]:
#y_test = test_df['is_canceled']
#x_test = pd.concat([cat_test_df, num_normalized_test_df], axis = 1)

# 80/20 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)


### Models

#### Logistic Regression


In [792]:
# Logistic regression with default settings; fit() returns the estimator itself
logreg = LogisticRegression().fit(x_train, y_train)
logreg_pred = logreg.predict(x_test)
logreg_accuracy = accuracy_score(y_test, logreg_pred)
print(f"Accuracy: {logreg_accuracy}")

Accuracy: 0.7497579552055768


#### CatBoost Classifier

In [793]:
# CatBoost classifier limited to 100 boosting iterations
cat = CatBoostClassifier(iterations=100)
cat.fit(x_train, y_train)
y_pred_cat = cat.predict(x_test)
cat_accuracy = accuracy_score(y_test, y_pred_cat)
print(f"Accuracy: {cat_accuracy}")

Learning rate set to 0.49567
0:	learn: 0.5503867	total: 17.9ms	remaining: 1.77s
1:	learn: 0.5179588	total: 28.2ms	remaining: 1.38s
2:	learn: 0.4968040	total: 38.5ms	remaining: 1.24s
3:	learn: 0.4897530	total: 48.8ms	remaining: 1.17s
4:	learn: 0.4837234	total: 60.1ms	remaining: 1.14s
5:	learn: 0.4770447	total: 71.3ms	remaining: 1.12s
6:	learn: 0.4739647	total: 81.3ms	remaining: 1.08s
7:	learn: 0.4688132	total: 93.2ms	remaining: 1.07s
8:	learn: 0.4664956	total: 105ms	remaining: 1.06s
9:	learn: 0.4635077	total: 116ms	remaining: 1.04s
10:	learn: 0.4590240	total: 126ms	remaining: 1.02s
11:	learn: 0.4550469	total: 137ms	remaining: 1s
12:	learn: 0.4521552	total: 147ms	remaining: 982ms
13:	learn: 0.4488682	total: 158ms	remaining: 970ms
14:	learn: 0.4468435	total: 167ms	remaining: 949ms
15:	learn: 0.4453910	total: 177ms	remaining: 927ms
16:	learn: 0.4438700	total: 189ms	remaining: 923ms
17:	learn: 0.4396125	total: 198ms	remaining: 903ms
18:	learn: 0.4359143	total: 210ms	remaining: 893ms
19:	lea

#### Random Forest Classifier

In [794]:
# Random forest with default hyper-parameters.
# The seed matches the one used for the train/test split so that the reported
# scores - and the model that is pickled afterwards - are reproducible.
rd_clf = RandomForestClassifier(random_state=42)
rd_clf.fit(x_train, y_train)
y_pred_rd_clf = rd_clf.predict(x_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_rd_clf)}")
# RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and later removed, so relying on it breaks
# on current releases. np.sqrt works on every version.
print(f"Root Mean Squared Error: {np.sqrt(mean_squared_error(y_test, y_pred_rd_clf))}")

Accuracy: 0.8467695088104306
Root Mean Squared Error: 0.391446664552873


#### Gradient Boosting Classifier

In [795]:
# Gradient boosting with default hyper-parameters; fit() returns the estimator itself
gb = GradientBoostingClassifier().fit(x_train, y_train)
y_pred_gb = gb.predict(x_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print(f"Accuracy: {gb_accuracy}")

Accuracy: 0.7743497063189828


#### Ada Boost Classifier

In [796]:
# AdaBoost with default hyper-parameters; fit() returns the estimator itself
ada = AdaBoostClassifier().fit(x_train, y_train)
y_pred_ada = ada.predict(x_test)
ada_accuracy = accuracy_score(y_test, y_pred_ada)
print(f"Accuracy: {ada_accuracy}")

Accuracy: 0.7605370167172272


#### K Nearest Neighbours Classifier

In [797]:
# K-nearest-neighbours with default hyper-parameters; fit() returns the estimator itself
knn = KNeighborsClassifier().fit(x_train, y_train)
knn_pred = knn.predict(x_test)
knn_accuracy = accuracy_score(y_test, knn_pred)
print(f"Accuracy: {knn_accuracy}")

Accuracy: 0.8107532434002452


### Model Comparison

Ranking Models By Accuracy

1. 0.846769 Random Forest Classifier
2. 0.820886 CatBoost Classifier
3. 0.810753 K Nearest Neighbours Classifier
4. 0.774349 Gradient Boosting Classifier
5. 0.760537 Ada Boost Classifier
6. 0.749757 Logistic Regression

In [798]:
# Dump the fitted random forest into a pickle file.
# The context manager closes (and therefore flushes) the file handle as soon
# as the dump finishes; a bare open() leaves that to garbage collection.
# NOTE(review): only the classifier is saved - the fitted `scaler` and the
# one-hot column layout used to build `x` are not. Confirm that whatever loads
# this model reproduces the same preprocessing.
with open('cancellation_model.pkl', 'wb') as model_file:
    pickle.dump(rd_clf, model_file)