# ML Hotel Project - Team Indecision Tree

## Problem Statement
Given extensive information on around 120000 hotel bookings, create a binary classification model to predict whether a booking will be cancelled. In the case of cancelled bookings, predict how many days in advance the guest cancels

## The Code

### Necessary Imports

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import LeaveOneOut, cross_val_score, KFold, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn import svm
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge
from numpy.typing import ArrayLike
from typing import List
from sklearn.metrics import mean_squared_error, r2_score
import math
from sklearn.preprocessing import FunctionTransformer
from utility import *
import pickle as pkl



### Reading data

In [2]:
data = pd.read_csv('../data.csv')

### Exploratory Data Analysis

In [13]:
years = data['arrival_date_year'].unique().tolist()
years.sort()
x_ticks = []
booking_counts = []
cancellation_counts = []
months = ['January','February','March','April','May','June','July','August','September','October','November','December']
for year in years:
    for month in months:
        x_ticks.append(month + " " + str(year))
        booking_counts.append(len(data[(data['arrival_date_year'] == year) & (data['arrival_date_month'] == month)]))
        cancellation_counts.append(len(data[(data['arrival_date_year'] == year) & (data['arrival_date_month'] == month) & (data['is_canceled'] == 1)]))

# Removing time periods that are out of range of the data
x_ticks = x_ticks[6:-4]
booking_counts = booking_counts[6:-4]
cancellation_counts = cancellation_counts[6:-4]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=x_ticks, 
        y=booking_counts, 
        name="Bookings"
    )
)
fig.add_trace(
    go.Scatter(
        x=x_ticks, 
        y=cancellation_counts, 
        name="Cancellations"
    )
)
fig.update_layout(
    title="Booking and Cancellation Counts over Time", 
    xaxis_title="Time", 
    yaxis_title="Count"
)
fig.show()


It can be seen from the above graph that the Cancellations line maintains a similar trend as the Bookings line. This means that the proportion of cancellations has remained more or less the same throughout the time period

In [14]:
hotels = data['hotel'].unique()
booking_counts = []
cancellation_counts = []
for hotel in hotels:
    booking_counts.append(len(data[(data['hotel'] == hotel)]))
    cancellation_counts.append(len(data[(data['hotel'] == hotel) & (data['is_canceled'] == 1)]))

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=hotels,
        y=booking_counts,
        name="Bookings",
        width=0.5
    )
)
fig.add_trace(
    go.Bar(
        x=hotels,
        y=cancellation_counts,
        name="Cancellations",
        width=0.3
    )
)

fig.update_layout(
    title="Booking and Cancellation Counts against Hotel", 
    xaxis_title="Hotel", 
    yaxis_title="Count",
    barmode="overlay"
)
fig.show()


Total bookings in the given time period of a City Hotel is nearly twice as that of Resort Hotel.
Cancellation rate of City Hotel bookings is approx. 1.5 times as that of the Resort Hotel bookings 


In [15]:
# Sunburst graph
import plotly.express as px
fig = px.sunburst(data, path=['deposit_type', 'is_canceled'])
fig.update_traces(textinfo="label+percent parent")
fig.show()

The plot above depicts the cancellations by deposit types. 

In [16]:
data["reserved_is_assigned"] = np.where(data["reserved_room_type"] == data["assigned_room_type"], True, False)
data["reserved_is_assigned"].replace({True: 1, False:0}, inplace=True)
data[["assigned_room_type", "reserved_room_type", "reserved_is_assigned"]].head(30)

Unnamed: 0,assigned_room_type,reserved_room_type,reserved_is_assigned
0,C,C,1
1,C,C,1
2,C,A,0
3,A,A,1
4,A,A,1
5,A,A,1
6,C,C,1
7,C,C,1
8,A,A,1
9,D,D,1


In [34]:
room_type_based_group = data.groupby("is_canceled").reserved_is_assigned.value_counts()
df_room_type = pd.DataFrame(
    dict(
        ReservedIsAssigned=[1, 0, 1, 0],
        isCanceled=[0, 0, 1, 1],
        CategorySize=list(room_type_based_group.values),
    )
)

s0=df_room_type.query('isCanceled==0')
s1=df_room_type.query('isCanceled==1')

#layout = go.Layout(title= 'Pclass-Survived', xaxis = dict(title = 'Pclass'), yaxis = dict(title = 'CategorySize'),barmode='group' )
fig = go.Figure()

df_room_type=df_room_type['ReservedIsAssigned']

fig.add_trace(go.Bar(x=s0['ReservedIsAssigned'], y = s0['CategorySize'],
                    name='Not Canceled'
                    )
             )

fig.add_trace(go.Bar(x=s1['ReservedIsAssigned'], y = s1['CategorySize'],
                    name='Canceled'
                    )
             )

fig.update_layout(barmode='group', xaxis_title="Reserved is Assigned", yaxis_title="Count")

fig.show()
# room_type_based_group.is_canceled = room_type_based_group.is_canceled.map({0: "Not Canceled", 1: "Canceled",})


The above plot groups the cancellations by whether or not the reserved room type was assigned. 42% people who got their reserved room type canceled their bookings. 5.3% people who did not get their reserved room type cancelled their bookings. Since the data points in one category is manifold times larger than the other, we cannot conclude correlation between cancellations & reserved_is_assigned. 

In [9]:
data.drop("reserved_is_assigned", axis=1, inplace=True)

In [13]:
#Histogram between No. of special requests and cancellation
ratio = {}

for i in data['total_of_special_requests'].unique():
    ratio[i] = len(data[(data['total_of_special_requests']==i) & (data['is_canceled']==1)]) / len(data[data['total_of_special_requests']==i])

In [14]:
ratio

{0: 0.4772035609658978,
 1: 0.2202492024318305,
 3: 0.17861433720464556,
 2: 0.22098851106484693,
 4: 0.10588235294117647,
 5: 0.05}

In [15]:
reqd_df = pd.DataFrame({"total_of_special_requests": ratio.keys(), "Cancellation ratio": ratio.values()})
fig = px.bar(reqd_df, x="total_of_special_requests", y="Cancellation ratio", width=1000)
fig.update_traces(width=0.5)
fig.show()

In [20]:
import plotly.express as px
fig = px.sunburst(data, path=['market_segment', 'is_canceled'])
fig.show()

### Preprocessing

#### Pipeline

In [3]:
pipeline = make_pipeline(
    FunctionTransformer(DataImputer(column_to_value={"children":0})),
    FunctionTransformer(ArrivalDateTransformer()),
    FunctionTransformer(ColumnRemover(['country','agent','company','month','day','distribution_channel'])),
    FunctionTransformer(RoomTypeTransformer()),
    FunctionTransformer(MealTypeTransformer()),
    FunctionTransformer(UncleanDataPointsRemover()),
    FunctionTransformer(CancellationsDaysInserter()),
    FunctionTransformer(ColumnRemover(['reservation_status','reservation_status_date','arrival_date']))
)

pipeline.fit_transform(data)


Unnamed: 0,hotel,is_canceled,lead_time,year,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,previous_bookings_not_canceled,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reserved_assigned_match,cancellation_days
2,Resort Hotel,0,7,2015,27,0,1,1,0.0,0,...,0,0,No Deposit,0,Transient,75.00,0,0,1,-1
3,Resort Hotel,0,13,2015,27,0,1,1,0.0,0,...,0,0,No Deposit,0,Transient,75.00,0,0,0,-1
4,Resort Hotel,0,14,2015,27,0,2,2,0.0,0,...,0,0,No Deposit,0,Transient,98.00,0,1,0,-2
5,Resort Hotel,0,14,2015,27,0,2,2,0.0,0,...,0,0,No Deposit,0,Transient,98.00,0,1,0,-2
6,Resort Hotel,0,0,2015,27,0,2,2,0.0,0,...,0,0,No Deposit,0,Transient,107.00,0,0,0,-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,35,2,5,2,0.0,0,...,0,0,No Deposit,0,Transient,96.14,0,0,0,-7
119386,City Hotel,0,102,2017,35,2,5,3,0.0,0,...,0,0,No Deposit,0,Transient,225.43,0,2,0,-7
119387,City Hotel,0,34,2017,35,2,5,2,0.0,0,...,0,0,No Deposit,0,Transient,157.71,0,4,0,-7
119388,City Hotel,0,109,2017,35,2,5,2,0.0,0,...,0,0,No Deposit,0,Transient,104.40,0,0,0,-7


### Data Splitting

In [4]:
# Splitting data into train and test sets (80% train, 20% test)
data_train_and_validation, data_test = train_test_split(data, test_size=0.2, stratify=data['is_canceled'])

# Splitting data into features and label for classification
X_train_and_validation = data_train_and_validation.drop(['is_canceled','cancellation_days'], axis='columns')
X_train_and_validation_reg = data_train_and_validation[data_train_and_validation['is_canceled']==1]
X_train_and_validation_reg = X_train_and_validation_reg.drop(['is_canceled','cancellation_days'], axis='columns')

y_train_and_validation = data_train_and_validation.is_canceled
y_train_and_validation_reg = data_train_and_validation[data_train_and_validation['is_canceled']==1]
y_train_and_validation_reg = y_train_and_validation_reg.cancellation_days

X_test = data_test.drop(['is_canceled','cancellation_days'], axis='columns')
X_test_reg = data_test[data_test['is_canceled']==1]
X_test_reg = X_test_reg.drop(['is_canceled','cancellation_days'], axis='columns')

y_test = data_test.is_canceled
y_test_reg = data_test[data_test['is_canceled']==1]
y_test_reg = y_test_reg.cancellation_days

### Classification

#### Logistic Regression Model

In [5]:
def get_LR_classifier(features):
    
    # Deposit Type, Customer Type, Hotel and Market Segment will be one hot encoded
    categorical_features = list(
        set(features) & set(
            [
                'deposit_type',
                'customer_type',
                'hotel',
                'market_segment',
            ]
        )
    )
    numerical_features = list(set(features) - set(categorical_features))

    columnTransformer = ColumnTransformer(
        transformers = [
            (
                'categorical', 
                OneHotEncoder(
                    handle_unknown="ignore"
                ),
                categorical_features
            ),
            (
                'numerical',
                StandardScaler(),
                numerical_features
            )
        ]
    )

    classifier = make_pipeline(
        columnTransformer,
        LogisticRegression(
            max_iter=100000
        )
    )
    
    return classifier


##### Grid Search with different values for solver, penalty and c value

In [6]:
features = X_train_and_validation.columns
pipeline = get_LR_classifier(features)

param_grid = [
    {
        'logisticregression__C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
        'logisticregression__solver': ['newton-cg', 'lbfgs', 'liblinear']
    }
]

grid_pipeline = GridSearchCV(pipeline,param_grid)
grid_pipeline.fit(X=X_train_and_validation, y=y_train_and_validation)


In [7]:
best_params = grid_pipeline.best_params_
best_model = grid_pipeline.best_estimator_
predictions = best_model.predict(X_test)
plot_confusion_matrix(y_test, predictions)
print("Best Score: ", grid_pipeline.best_score_)

Best Score:  0.8132788255908734


In [11]:
pkl.dump(best_model,open("classifier.p","wb"))

#### SVM Model

In [12]:
def get_SVM_classifier(features):
    
    # Deposit Type, Customer Type, Hotel and Market Segment will be one hot encoded
    categorical_features = list(
        set(features) & set(
            [
                'deposit_type',
                'customer_type',
                'hotel',
                'market_segment',
            ]
        )
    )
    numerical_features = list(set(features) - set(categorical_features))

    columnTransformer = ColumnTransformer(
        transformers = [
            (
                'categorical', OneHotEncoder(
                    handle_unknown="ignore"
                ), 
                categorical_features
            ),
            (
                'numerical', 
                StandardScaler(), 
                numerical_features
            )
        ]
    )

    classifier = make_pipeline(
        columnTransformer,
        svm.SVC()
    )
    
    return classifier

In [11]:
features = X_train_and_validation.columns
pipeline = get_SVM_classifier(features)

param_grid = [
    {
        'svc__C':[0.0001],
        'svc__kernel': ['linear'],
    }
]

grid_pipeline = GridSearchCV(pipeline,param_grid,verbose=100)
grid_pipeline.fit(X=X_train_and_validation, y=y_train_and_validation)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START svc__C=0.0001, svc__kernel=linear...........................
[CV 1/5; 1/1] END svc__C=0.0001, svc__kernel=linear;, score=0.756 total time=14.5min
[CV 2/5; 1/1] START svc__C=0.0001, svc__kernel=linear...........................
[CV 2/5; 1/1] END svc__C=0.0001, svc__kernel=linear;, score=0.752 total time=14.9min
[CV 3/5; 1/1] START svc__C=0.0001, svc__kernel=linear...........................
[CV 3/5; 1/1] END svc__C=0.0001, svc__kernel=linear;, score=0.750 total time=16.0min
[CV 4/5; 1/1] START svc__C=0.0001, svc__kernel=linear...........................
[CV 4/5; 1/1] END svc__C=0.0001, svc__kernel=linear;, score=0.751 total time=17.5min
[CV 5/5; 1/1] START svc__C=0.0001, svc__kernel=linear...........................
[CV 5/5; 1/1] END svc__C=0.0001, svc__kernel=linear;, score=0.749 total time=16.8min


In [50]:
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
print(cross_val_score(estimator=get_SVM_classifier(features), X=X, y=y, cv=kfold))

[0.97281759 0.97281759 0.97446521 0.9751839  0.97391562]


### Regression

In [13]:
features = X_train_and_validation_reg.columns

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.2)

In [14]:
def get_regressor(features, type):
    # Deposit Type, Customer Type, Hotel and Market Segment will be one hot encoded
    categorical_features = list(set(features) & set([
        'deposit_type',
        'customer_type',
        'hotel',
        'market_segment',
    ]))
    numerical_features = list(set(features) - set(categorical_features))

    columnTransformer = ColumnTransformer(
        transformers = [
            ('categorical', OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ('numerical', StandardScaler(), numerical_features)
        ]
    )

    regressor = make_pipeline(
        columnTransformer,
        type
    )
    
    return regressor

#### Feature Selection by Backward Enumeration does not help

In [88]:
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
class Solution:
    features: List[int]
    mse: float

    def __init__(self, y: ArrayLike):
        self.features = list()
        self.mse = mean_squared_error(y, [y.mean()]*len(y))

    def update(self, features: List[int], mse: float) -> bool:
        if(mse < self.mse):
            print(', '.join(str(x) for x in features))
            print(f"\tNew Error: {mse:.3f} better than {self.mse:.3f}")
            self.features = features
            self.mse = mse
            return True

        return False
all_features = features
current_features = all_features
best = Solution(y_reg)
while len(current_features)>0:
    selected_feature = None

    for feature in current_features:
        new_features = current_features.drop(feature)
        Xr = X_reg[new_features]
        mses = cross_val_score(estimator=get_regressor(new_features, Ridge()), X=Xr, y=y_reg, cv=kfold, scoring="neg_mean_squared_error")
        mse = -np.average(mses)

        if(best.update(features=new_features, mse=mse)):
            selected_feature = feature
        
    if(selected_feature):
        current_features = current_features.drop(selected_feature)
    else:
        break
best_features = list(best.features)

lead_time, year, arrival_date_week_number, stays_in_weekend_nights, stays_in_week_nights, adults, children, babies, meal, market_segment, is_repeated_guest, previous_cancellations, previous_bookings_not_canceled, booking_changes, deposit_type, days_in_waiting_list, customer_type, adr, required_car_parking_spaces, total_of_special_requests, reserved_assigned_match
	New Error: 3231.538 better than 8226.009
hotel, lead_time, year, arrival_date_week_number, stays_in_weekend_nights, stays_in_week_nights, children, babies, meal, market_segment, is_repeated_guest, previous_cancellations, previous_bookings_not_canceled, booking_changes, deposit_type, days_in_waiting_list, customer_type, adr, required_car_parking_spaces, total_of_special_requests, reserved_assigned_match
	New Error: 3230.607 better than 3231.538
hotel, lead_time, year, arrival_date_week_number, stays_in_weekend_nights, stays_in_week_nights, children, babies, meal, market_segment, is_repeated_guest, previous_cancellations, previ

#### Grid Search

In [16]:
lasso_alpha = np.logspace(start=-3, stop=0, num=20)
ridge_alpha = np.logspace(start=-1, stop=2, num=20)

In [17]:
linear_regression_estimator=get_regressor(features,LinearRegression())

lasso_cv = GridSearchCV(
    estimator=get_regressor(features,Lasso(max_iter=10000)),
    param_grid={
        'lasso__alpha': lasso_alpha
    },
    cv=5,
    scoring='neg_mean_squared_error')

ridge_cv = GridSearchCV(
    estimator=get_regressor(features,Ridge()),
    param_grid={
        'ridge__alpha': ridge_alpha
    },
    cv=5,
    scoring='neg_mean_squared_error')

In [18]:
linear_regression_estimator.fit(X_train_and_validation_reg, y_train_and_validation_reg)

In [19]:
lasso_cv.fit(X_train_and_validation_reg, y_train_and_validation_reg)

In [20]:
ridge_cv.fit(X_train_and_validation_reg, y_train_and_validation_reg)

In [21]:
lasso_cv.best_params_

# R2 with training data
print("R2 with training data: ",r2_score(y_train_and_validation_reg, lasso_cv.predict(X_train_and_validation_reg)))
print("Mean Squared Error: ", math.sqrt(mean_squared_error(y_train_and_validation_reg, lasso_cv.predict(X_train_and_validation_reg))))

# R2 with testing data
print("R2 with testing data",r2_score(y_test_reg, lasso_cv.predict(X_test_reg)))
print("Mean Squared Error: ", math.sqrt(mean_squared_error(y_test_reg, lasso_cv.predict(X_test_reg))))


R2 with training data:  0.6087160672698395
Mean Squared Error:  56.686837358535975
R2 with testing data 0.6041969737911312
Mean Squared Error:  57.248149652316414


In [22]:
ridge_cv.best_params_

# R2 with training data
print("R2 with training data: ",r2_score(y_train_and_validation_reg, ridge_cv.predict(X_train_and_validation_reg)))
print("Mean Squared Error: ", math.sqrt(mean_squared_error(y_train_and_validation_reg, ridge_cv.predict(X_train_and_validation_reg))))

# R2 with testing data
print("R2 with testing data",r2_score(y_test_reg, ridge_cv.predict(X_test_reg)))
print("Mean Squared Error: ", math.sqrt(mean_squared_error(y_test_reg, ridge_cv.predict(X_test_reg))))

R2 with training data:  0.6087184762679315
Mean Squared Error:  56.68666285776836
R2 with testing data 0.6042039879851717
Mean Squared Error:  57.24764239064513


In [23]:
# R2 with training data
print("R2 with training data: ",r2_score(y_train_and_validation_reg, linear_regression_estimator.predict(X_train_and_validation_reg)))
print("Mean Squared Error: ", math.sqrt(mean_squared_error(y_train_and_validation_reg, linear_regression_estimator.predict(X_train_and_validation_reg))))

# R2 with testing data
print("R2 with testing data",r2_score(y_test_reg, linear_regression_estimator.predict(X_test_reg)))
print("Mean Squared Error: ", math.sqrt(mean_squared_error(y_test_reg, linear_regression_estimator.predict(X_test_reg))))


R2 with training data:  0.608659579825185
Mean Squared Error:  56.69092898965261
R2 with testing data 0.6042386045960926
Mean Squared Error:  57.24513887540601


In [None]:
pkl.dump(ridge_cv,open("regression.p","wb"))

#### Polynomial regression

In [24]:
def get_poly_regressor(features):
    # Deposit Type, Customer Type, Hotel and Market Segment will be one hot encoded
    categorical_features = list(set(features) & set([
        'deposit_type',
        'customer_type',
        'hotel',
        'market_segment',
    ]))
    numerical_features = list(set(features) - set(categorical_features))

    columnTransformer = ColumnTransformer(
        transformers = [
            ('categorial', OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ('numerical', StandardScaler(), numerical_features)
        ]
    )

    poly_regressor = make_pipeline(
        columnTransformer,
        PolynomialFeatures(degree= 2),
        LinearRegression(),
            
    )
    
    return poly_regressor

In [26]:
poly_regressor = get_poly_regressor(features)
poly_regressor.fit(X_train_and_validation_reg, y_train_and_validation_reg)

In [27]:
# R2 with training data
print("R2 with training data: ",r2_score(y_train_and_validation_reg, poly_regressor.predict(X_train_and_validation_reg)))
print("Mean Squared Error: ", math.sqrt(mean_squared_error(y_train_and_validation_reg, poly_regressor.predict(X_train_and_validation_reg))))

# R2 with testing data
print("R2 with testing data",r2_score(y_test_reg, poly_regressor.predict(X_test_reg)))
print("Mean Squared Error: ", math.sqrt(mean_squared_error(y_test_reg, poly_regressor.predict(X_test_reg))))


R2 with training data:  0.7078576425609187
Mean Squared Error:  48.98165838413787
R2 with testing data -1.2218652180917206e+17
Mean Squared Error:  31807794003.47226
