In [104]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn import metrics 
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.metrics import make_scorer, accuracy_score #Import scikit-learn metrics module for accuracy calculation
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Initial Data Cleaning

Our data set is from Kaggle and fairly large, with over 100,000 rows and 32 columns. Each row is a different reservation. Our target variable is whether or not they ended up cancelling their reservation. Our classes are imbalanced towards non cancelations but not to a huge extent (75,000 - 45,000). This may affect a logistic regression, but a more powerful model with this much data will be fine.

In [105]:
df = pd.read_csv("data/hotel_bookings.csv")
pd.set_option('display.max_columns', None)

In [106]:
df.shape

(119390, 32)

In [107]:
df.is_canceled.value_counts()

0    75166
1    44224
Name: is_canceled, dtype: int64

These are variable from after someone has checked in so they are not suitable for a predictive model. 

In [108]:
df = df.drop('reservation_status', axis = 1)

In [109]:
df = df.drop('reservation_status_date', axis = 1)

Theres exactly one family with 10 children, the rest have 3 oe less, lets just get rid of that outlier.

In [110]:
df.children.value_counts()


0.0     110796
1.0       4861
2.0       3652
3.0         76
10.0         1
Name: children, dtype: int64

In [111]:
df.children = df[df.children < 10]

We do not need the precision that day of the month or the week number provides to make a generalizable model 

In [112]:
df = df.drop('arrival_date_week_number', axis = 1)
df = df.drop('arrival_date_day_of_month', axis =1)

There are over 200 countries where people are visiting from in this data set, that will be a nightmare to make dummy variables out of and train models on, and may overfit to countries with few visitors, lets narrow it down to the top 20 and have the rest be "other".

In [113]:
df['one'] = 1
top_20_countries = df.groupby('country').sum().sort_values('one', ascending = False)[:20].index
df['top_20_c'] = df.country.apply(lambda x: x in top_20_countries)
df['country'] = np.where(df.top_20_c == True, df.country, 'other')

In [114]:
df.country.value_counts()

PRT      48590
GBR      12129
FRA      10415
ESP       8568
other     7371
DEU       7287
ITA       3766
IRL       3375
BEL       2342
BRA       2224
NLD       2104
USA       2097
CHE       1730
CN        1279
AUT       1263
SWE       1024
CHN        999
POL        919
ISR        669
RUS        632
NOR        607
Name: country, dtype: int64

In [115]:
df = df.drop('top_20_c', axis = 1)

Lets do a similar thing to company as we did for countries but only the top 10, companies with only a few reservations do not seem very predictive and could over fit the model. We will also set the NaNs (the majority of the dataset) to "no company."

In [116]:
df['company'] = df.company.astype(str)
df['company'] = np.where(df.company == 'nan', "no_company", df.company)

In [117]:
df.company.value_counts()

no_company    112593
40.0             927
223.0            784
67.0             267
45.0             250
               ...  
18.0               1
420.0              1
54.0               1
313.0              1
8.0                1
Name: company, Length: 353, dtype: int64

In [118]:
top_11_companies = df.groupby('company').sum().sort_values('one', ascending = False)[:11].index
df['top_10_co'] = df.company.apply(lambda x: x in top_11_companies)

In [119]:
df['company'] = np.where(df.top_10_co == True, df.company, 'other')

In [120]:
df.company.value_counts()

no_company    112593
other           3674
40.0             927
223.0            784
67.0             267
45.0             250
153.0            215
174.0            149
219.0            141
281.0            138
154.0            133
405.0            119
Name: company, dtype: int64

# Exploratory Data Analysis

Lets figure out how much money our hotels lost on cancellations.

In [121]:
df['total_nights'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']

In [122]:
df[df.is_canceled == 1]['total_nights'].sum() * df[df.is_canceled == 1]['adr'].sum()

715866472739.3901

Thats a lot of cash! At the end we will show how much of this our model could save. This does not even go into the labor involved in switching reservations around or preparing different rooms. 

# Feature Engineering 

First lets make dummy variable out of the bevy of categorical data to make our data suited for any kind of model we choose to run.

In [None]:
df = pd.get_dummies(df, columns = ['deposit_type'], drop_first = True)
df = pd.get_dummies(df, columns = ['customer_type'], drop_first = True)
df = pd.get_dummies(df, columns = ['arrival_date_month'], drop_first = True)
df = pd.get_dummies(df, columns = ['reserved_room_type'], drop_first = True)
df = pd.get_dummies(df, columns = ['assigned_room_type'], drop_first = True)
df = pd.get_dummies(df, columns = ['meal'], drop_first = True)
df = pd.get_dummies(df, columns = ['market_segment'], drop_first = True)
df = pd.get_dummies(df, columns = ['country'], drop_first = True)
df = pd.get_dummies(df, columns = ['distribution_channel'], drop_first = True)
df = pd.get_dummies(df, columns = ['children'], drop_first = True)
df = pd.get_dummies(df, columns = ['hotel'], drop_first = True)
df = pd.get_dummies(df, columns = ['company'], drop_first = True)

Lets add a feature where somebody got a different room type than the one they reserved, as this may make them likely to cancel. 

In [None]:
df['dif_room_than_res'] = np.where(df.reserved_room_type == df.assigned_room_type, 0, 1)

Rather than make dummys out of the different agent IDs lets just make this column into had an agent when they booked the room or did not.

In [None]:
df['agent'] = np.where((df['agent'].astype('str') == 'NaN')|(df['agent'].astype('str') == 'nan'), 0, 1)

# Model iterations

Since overbooking a room where someone did not cancel would be a headache, we tailored our models towards precision. AKA if we say someone is going to cancel, there is a very high chance they are. This might dip into profitabality since less rooms can be overbooked but that could be adjusted on a client by client basis. 

In [None]:
X = df.drop('is_canceled', axis =1)
y = df.is_canceled
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10920)


In [None]:
lr = LogisticRegression (penalty = 'l1', solver='saga', max_iter = 1000)
lr.fit(X_train, y_train)
test_pred = lr.predict(scaled_test)


# Model Interpretation 