In [65]:
import numpy as np
import pandas as pd
import sklearn.preprocessing

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler      # Remove Mean and scale to Unit Variance
from sklearn.preprocessing import PowerTransformer    # Log Transformation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [66]:
train_df = pd.read_csv('city_hotel_bookings_updated.csv')
test_df = pd.read_csv('hotel_reservations_updated.csv')

#Data cleaning to make 2015-2016 dataset match the incoming streaming data
train_df.loc[(train_df['meal'] == 'Undefined'), 'meal']= 'SC'
train_df.loc[ (train_df['required_car_parking_spaces'] > 1), 'required_car_parking_spaces']= 1

In [67]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,40060,City Hotel,0,6,2015,July,27,1,0,2,...,No Deposit,6.0,,0,Transient,0.0,0,0,Check-Out,2015-07-03
1,40061,City Hotel,1,88,2015,July,27,1,0,4,...,No Deposit,9.0,,0,Transient,76.5,0,1,Canceled,2015-07-01
2,40062,City Hotel,1,65,2015,July,27,1,0,4,...,No Deposit,9.0,,0,Transient,68.0,0,1,Canceled,2015-04-30
3,40063,City Hotel,1,92,2015,July,27,1,2,4,...,No Deposit,9.0,,0,Transient,76.5,0,2,Canceled,2015-06-23
4,40064,City Hotel,1,100,2015,July,27,2,0,2,...,No Deposit,9.0,,0,Transient,76.5,0,1,Canceled,2015-04-02


In [68]:
#Checking null values
train_df.isnull().sum()[train_df.isnull().sum() > 0]

children        4
country        24
agent        8131
company     75641
dtype: int64

In [69]:
#Filling in NA values for children column
train_df.children.fillna(0, inplace = True)

In [70]:
train_df.drop(['hotel', 'arrival_date_week_number', 'babies', 'country', 'distribution_channel', 
                               'assigned_room_type', 'booking_changes', 'deposit_type', 'agent', 'company', 
                               'days_in_waiting_list', 'customer_type', 'reservation_status', 'reservation_status_date', 'Unnamed: 0'], 
                              axis = 1, inplace = True)

print(train_df.dtypes)

is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
meal                               object
market_segment                     object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
adr                               float64
required_car_parking_spaces         int64
total_of_special_requests           int64
dtype: object


In [71]:
train_df.head()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,meal,market_segment,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,adr,required_car_parking_spaces,total_of_special_requests
0,0,6,2015,July,1,0,2,1,0.0,HB,Offline TA/TO,0,0,0,A,0.0,0,0
1,1,88,2015,July,1,0,4,2,0.0,BB,Online TA,0,0,0,A,76.5,0,1
2,1,65,2015,July,1,0,4,1,0.0,BB,Online TA,0,0,0,A,68.0,0,1
3,1,92,2015,July,1,2,4,2,0.0,BB,Online TA,0,0,0,A,76.5,0,2
4,1,100,2015,July,2,0,2,2,0.0,BB,Online TA,0,0,0,A,76.5,0,1


In [72]:
correlation = train_df.corr()['is_canceled'].abs().sort_values(ascending = False)
correlation

  correlation = train_df.corr()['is_canceled'].abs().sort_values(ascending = False)


is_canceled                       1.000000
lead_time                         0.309242
total_of_special_requests         0.293889
previous_cancellations            0.166643
required_car_parking_spaces       0.133481
is_repeated_guest                 0.065840
previous_bookings_not_canceled    0.053134
adults                            0.053054
stays_in_week_nights              0.048704
children                          0.027089
adr                               0.011965
stays_in_weekend_nights           0.007343
arrival_date_day_of_month         0.004331
arrival_date_year                 0.003004
Name: is_canceled, dtype: float64

In [73]:
#Dropping the arrival date variables (excludind month) which are not useful/relevant to the cancellation
train_df.drop(['arrival_date_year', 'arrival_date_day_of_month'], axis = 1, inplace = True)
train_df.head()

Unnamed: 0,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,meal,market_segment,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,adr,required_car_parking_spaces,total_of_special_requests
0,0,6,July,0,2,1,0.0,HB,Offline TA/TO,0,0,0,A,0.0,0,0
1,1,88,July,0,4,2,0.0,BB,Online TA,0,0,0,A,76.5,0,1
2,1,65,July,0,4,1,0.0,BB,Online TA,0,0,0,A,68.0,0,1
3,1,92,July,2,4,2,0.0,BB,Online TA,0,0,0,A,76.5,0,2
4,1,100,July,0,2,2,0.0,BB,Online TA,0,0,0,A,76.5,0,1


In [74]:
#Prepping a dataframe of Categorical variables to encode
categorical_cols = ['arrival_date_month', 'meal', 'reserved_room_type', 'market_segment']
categorical_df = train_df[categorical_cols]
categorical_df.head()

Unnamed: 0,arrival_date_month,meal,reserved_room_type,market_segment
0,July,HB,A,Offline TA/TO
1,July,BB,A,Online TA
2,July,BB,A,Online TA
3,July,BB,A,Online TA
4,July,BB,A,Online TA


In [76]:
#Checking for all unique values of the categorical columns in the training dataframe for encoding
for col in categorical_df.columns:
    print(f"{col}: {categorical_df[col].unique()}")

arrival_date_month: ['July' 'August' 'September' 'October' 'November' 'December' 'January'
 'February' 'March' 'April' 'May' 'June']
meal: ['HB' 'BB' 'SC' 'FB']
reserved_room_type: ['A' 'B' 'D' 'F' 'E' 'G' 'C' 'P']
market_segment: ['Offline TA/TO' 'Online TA' 'Groups' 'Complementary' 'Direct' 'Corporate'
 'Undefined' 'Aviation']


In [None]:
#Encoding categorical variables
categorical_df['arrival_date_month'] = categorical_df['arrival_date_month'].map({'January': 0, 'February': 1, 'March': 2,
                                                                                 'April': 3, 'May': 4, 'June': 5, 'July': 6,
                                                                                 'August': 7, 'September': 8, 'October': 9, 
                                                                                 'November': 10, 'December': 11})
categorical_df['meal'] = categorical_df['meal'].map({'BB' : 0, 'FB': 1, 'HB': 2, 'SC': 3})
categorical_df['reserved_room_type'] = categorical_df['reserved_room_type'].map({'A': 0, 'B': 1, 'D': 2, 'F': 3, 'E': 4, 'G': 5, 'C': 6,
                                                                   'P': 7})
categorical_df['market_segment'] = categorical_df['market_segment'].map({'Offline TA/TO': 0, 'Online TA': 1, 'Groups': 2, 'Complementary': 3,
                                                           'Direct': 4, 'Corporate': 5, 'Undefined': 6, 'Aviation': 7})


In [79]:
categorical_df.head()

Unnamed: 0,arrival_date_month,meal,reserved_room_type,market_segment
0,6,2,0,0
1,6,0,0,1
2,6,0,0,1
3,6,0,0,1
4,6,0,0,1


In [85]:
numerical_df = train_df.drop(columns = categorical_cols, axis = 1)
numerical_df.drop('is_canceled', axis = 1, inplace = True)
numerical_df.head()

Unnamed: 0,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,adr,required_car_parking_spaces,total_of_special_requests
0,6,0,2,1,0.0,0,0,0,0.0,0,0
1,88,0,4,2,0.0,0,0,0,76.5,0,1
2,65,0,4,1,0.0,0,0,0,68.0,0,1
3,92,2,4,2,0.0,0,0,0,76.5,0,2
4,100,0,2,2,0.0,0,0,0,76.5,0,1


In [86]:
numerical_df.var()

lead_time                         12309.575369
stays_in_weekend_nights               0.783272
stays_in_week_nights                  2.121148
adults                                0.259379
children                              0.138509
is_repeated_guest                     0.024959
previous_cancellations                0.172617
previous_bookings_not_canceled        2.867639
adr                                1901.217583
required_car_parking_spaces           0.023689
total_of_special_requests             0.609612
dtype: float64

In [88]:
scaler = StandardScaler()
num_normalized_df = pd.DataFrame(scaler.fit_transform(numerical_df), columns = numerical_df.columns)
num_normalized_df.var()

lead_time                         1.000013
stays_in_weekend_nights           1.000013
stays_in_week_nights              1.000013
adults                            1.000013
children                          1.000013
is_repeated_guest                 1.000013
previous_cancellations            1.000013
previous_bookings_not_canceled    1.000013
adr                               1.000013
required_car_parking_spaces       1.000013
total_of_special_requests         1.000013
dtype: float64

In [89]:
num_normalized_df.head()

Unnamed: 0,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,adr,required_car_parking_spaces,total_of_special_requests
0,-0.934996,-0.898493,-0.125622,-1.670911,-0.245496,-0.162135,-0.191934,-0.078169,-2.415092,-0.157742,-0.700484
1,-0.195909,-0.898493,1.24762,0.29261,-0.245496,-0.162135,-0.191934,-0.078169,-0.660612,-0.157742,0.580301
2,-0.403214,-0.898493,1.24762,-1.670911,-0.245496,-0.162135,-0.191934,-0.078169,-0.855554,-0.157742,0.580301
3,-0.159856,1.361341,1.24762,0.29261,-0.245496,-0.162135,-0.191934,-0.078169,-0.660612,-0.157742,1.861085
4,-0.08775,-0.898493,-0.125622,0.29261,-0.245496,-0.162135,-0.191934,-0.078169,-0.660612,-0.157742,0.580301


In [93]:
x_train = pd.concat([categorical_df, num_normalized_df], axis = 1)
y_train = train_df['is_canceled']

print(x_train.dtypes)

arrival_date_month                  int64
meal                                int64
reserved_room_type                  int64
market_segment                      int64
lead_time                         float64
stays_in_weekend_nights           float64
stays_in_week_nights              float64
adults                            float64
children                          float64
is_repeated_guest                 float64
previous_cancellations            float64
previous_bookings_not_canceled    float64
adr                               float64
required_car_parking_spaces       float64
total_of_special_requests         float64
dtype: object


In [100]:
test_df = test_df.drop(['Unnamed: 0', 'Booking_ID', 'arrival_date_year', 'arrival_date_day_of_month'], axis = 1)
print(test_df.dtypes)

adults                              int64
children                            int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
meal                               object
required_car_parking_spaces         int64
reserved_room_type                 object
lead_time                           int64
arrival_date_month                 object
market_segment                     object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
adr                               float64
total_of_special_requests           int64
is_canceled                         int64
dtype: object


In [104]:
cat_test_df = test_df[categorical_cols]
cat_test_df.head()

Unnamed: 0,arrival_date_month,meal,reserved_room_type,market_segment
0,October,BB,A,Offline TA/TO
1,November,SC,A,Online TA
2,February,BB,A,Online TA
3,May,BB,A,Online TA
4,April,SC,A,Online TA


In [None]:
#Encoding of Categorical variables of test data
cat_test_df['arrival_date_month'] = cat_test_df['arrival_date_month'].map({'January': 0, 'February': 1, 'March': 2,
                                                                                 'April': 3, 'May': 4, 'June': 5, 'July': 6,
                                                                                 'August': 7, 'September': 8, 'October': 9, 
                                                                                 'November': 10, 'December': 11})
cat_test_df['meal'] = cat_test_df['meal'].map({'BB' : 0, 'FB': 1, 'HB': 2, 'SC': 3})
cat_test_df['reserved_room_type'] = cat_test_df['reserved_room_type'].map({'A': 0, 'B': 1, 'D': 2, 'F': 3, 'E': 4, 'G': 5, 'C': 6,
                                                                   'P': 7})
cat_test_df['market_segment'] = cat_test_df['market_segment'].map({'Offline TA/TO': 0, 'Online TA': 1, 'Groups': 2, 'Complementary': 3,
                                                           'Direct': 4, 'Corporate': 5, 'Undefined': 6, 'Aviation': 7})
cat_test_df.head()


In [106]:
#Encoding of Numerical variables of test data
num_test_df = test_df.drop(columns = categorical_cols, axis = 1)
num_test_df.drop('is_canceled', axis = 1, inplace = True)
scaler = StandardScaler()
num_normalized_test_df = pd.DataFrame(scaler.fit_transform(num_test_df), columns = num_test_df.columns)
num_normalized_test_df.head()

Unnamed: 0,adults,children,stays_in_weekend_nights,stays_in_week_nights,required_car_parking_spaces,lead_time,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,adr,total_of_special_requests
0,0.298893,-0.26147,0.217401,-0.144803,-0.178819,1.614896,-0.16221,-0.063393,-0.087456,-1.095033,-0.78814
1,0.298893,-0.26147,1.365993,0.563972,-0.178819,-0.933701,-0.16221,-0.063393,-0.087456,0.092806,0.48376
2,-1.628975,-0.26147,1.365993,-0.853578,-0.178819,-0.98025,-0.16221,-0.063393,-0.087456,-1.237528,-0.78814
3,0.298893,-0.26147,-0.93119,-0.144803,-0.178819,1.46361,-0.16221,-0.063393,-0.087456,-0.097567,-0.78814
4,0.298893,-0.26147,0.217401,-0.853578,-0.178819,-0.433291,-0.16221,-0.063393,-0.087456,-0.254312,-0.78814


In [None]:
x_test = pd.concat([])