# ML Hotel Project - Team Indecision Tree

## Problem Statement
Given extensive information on around 120000 hotel bookings, create a binary classification model to predict whether a booking will be cancelled. In the case of cancelled bookings, predict how many days in advance the guest cancels

## The Code

### Necessary Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import LeaveOneOut, cross_val_score, KFold, train_test_split, RepeatedStratifiedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn import svm
from itertools import combinations
from sklearn.metrics import mean_squared_error
import numpy as np
from numpy.typing import ArrayLike
from sklearn.model_selection import KFold
from typing import List
import plotly.express as px
import plotly.graph_objects as go

### Reading data and preprocessing

In [2]:
data = pd.read_csv('../data.csv')

### Exploratory Data Analysis

In [3]:
years = data['arrival_date_year'].unique().tolist()
years.sort()
x_ticks = []
booking_counts = []
cancellation_counts = []
months = ['January','February','March','April','May','June','July','August','September','October','November','December']
for year in years:
    for month in months:
        x_ticks.append(month + " " + str(year))
        booking_counts.append(len(data[(data['arrival_date_year'] == year) & (data['arrival_date_month'] == month)]))
        cancellation_counts.append(len(data[(data['arrival_date_year'] == year) & (data['arrival_date_month'] == month) & (data['is_canceled'] == 1)]))

# Removing time periods that are out of range of the data
x_ticks = x_ticks[6:-4]
booking_counts = booking_counts[6:-4]
cancellation_counts = cancellation_counts[6:-4]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=x_ticks, 
        y=booking_counts, 
        name="Bookings"
    )
)
fig.add_trace(
    go.Scatter(
        x=x_ticks, 
        y=cancellation_counts, 
        name="Cancellations"
    )
)
fig.update_layout(
    title="Booking and Cancellation Counts over Time", 
    xaxis_title="Time", 
    yaxis_title="Count"
)
fig.show()


It can be seen from the above graph that the Cancellations line maintains a similar trend as the Bookings line. This means that the proportion of cancellations has remained more or less the same throughout the time period

In [4]:
hotels = data['hotel'].unique()
booking_counts = []
cancellation_counts = []
for hotel in hotels:
    booking_counts.append(len(data[(data['hotel'] == hotel)]))
    cancellation_counts.append(len(data[(data['hotel'] == hotel) & (data['is_canceled'] == 1)]))

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=hotels,
        y=booking_counts,
        name="Bookings",
        width=0.5
    )
)
fig.add_trace(
    go.Bar(
        x=hotels,
        y=cancellation_counts,
        name="Cancellations",
        width=0.3
    )
)

fig.update_layout(
    title="Booking and Cancellation Counts against Hotel", 
    xaxis_title="Hotel", 
    yaxis_title="Count",
    barmode="overlay"
)
fig.show()


TODO: **EXPLAIN!**

In [15]:
lead_times = data['lead_time'].to_list()
cancelled_lead_times = data[(data['is_canceled'] == 1)]
cancelled_lead_times = cancelled_lead_times['lead_time'].to_list()

fig = go.Figure()
fig = px.ecdf(x=lead_times)
fig.update_layout(
    title="CDF of Lead Time", 
    xaxis_title="Lead Time",
    yaxis_title="Percentage" 
)
fig.show()

fig = go.Figure()
fig = px.ecdf(x=cancelled_lead_times)
fig.update_layout(
    title="CDF of Lead Time of Canceled Bookings", 
    xaxis_title="Lead Time of Canceled Bookings",
    yaxis_title="Percentage" 
)
fig.show()

TODO: **EXPLAIN!**

In [157]:
# We can fill NaN values for number of children with 0 (only 4 NaN values)
data['children'] = data['children'].fillna(0)

In [158]:
# Converting arrival_date_month values from string to integer
data['arrival_date_month'].replace(
    ['July', 'August', 'September', 'October', 'November', 'December', 'January', 'February', 'March', 'April', 'May', 'June'],
    [7,8,9,10,11,12,1,2,3,4,5,6],
    inplace=True
)

data.rename(
    columns = {
        'arrival_date_year': 'year', 
        'arrival_date_month': 'month',
        'arrival_date_day_of_month': 'day'
    }, 
    inplace=True
)

# Creating new arrival_date column of type datetime
data.insert(3,'arrival_date',pd.to_datetime(data[['year', 'month', 'day']]))

In [159]:
# Removing country column for now or we can use target encoding
data = data.drop(['country'], axis=1)

In [160]:
# Removing agent and company columns since market_segment gives same information. 
data = data.drop(['agent','company'], axis=1)

In [161]:
# Removing all columns relating to arrival date except week number and year.
data = data.drop(['month','day'], axis=1)

In [162]:
# We can combine reserved_room_type and assigned_room_type columns into one that has boolean values for whether reserved and assigned matched
# 0 -> Reserved and assigned room types matched
# 1 -> Reserved and assigned room types didn't match
data['reserved_assigned_match'] = np.where(data['reserved_room_type'] == data['assigned_room_type'], 0, 1)
data = data.drop(['reserved_room_type','assigned_room_type'], axis=1)

In [163]:
# We can change the values in the meal types column. 0, 0.33, 0.67, 1 since the meal types are incremental
data['meal'].replace(['Undefined', 'SC', 'BB', 'HB', 'FB'], [0, 0, 0.33, 0.67, 1], inplace=True)

In [164]:
# We can remove the distribution channels column as market_segment column gives us same information with more granularity
data = data.drop(['distribution_channel'], axis=1)

In [165]:
# Remove records where adults, babies, children, stays_in_weekend_nights, stays_in_week_nights, is_canceled values are 0
data.drop(
    data[
        (data['adults']==0) &
        (data['children']==0) &
        (data['babies']==0) &
        (data['stays_in_weekend_nights']==0) & 
        (data['stays_in_week_nights']==0) &
        (data['is_canceled']==0)
    ].index,
    inplace=True
)

In [166]:
# Remove records with adr and is_canceled values as 0 and whose market segment is neither Complementary, Corporate nor Aviation
data.drop(
    data[
        (data["adr"]==0) &  
        (data["is_canceled"]==0)  &
        (data["market_segment"] != "Complementary") &
        (data["market_segment"] != "Corporate") &
        (data["market_segment"] != "Aviation") 
    ].index, 
    inplace = True
)

In [167]:
# Remove records with stays_in_weekend_nights and stays_in_week_nights values as 0 and arrival and reservation status date are the same
# and reservation status is not "Check Out"
data.drop(
    data[
        (data["stays_in_week_nights"]==0) &
        (data["stays_in_weekend_nights"]==0) &
        (data["arrival_date"] == data["reservation_status_date"]) &
        (data["reservation_status"]!="Check-Out")
    ].index,
    inplace=True
)

In [168]:
# Creating cancellation_days column which represents how many days in advance the guest cancels 
data['reservation_status_date'] = pd.to_datetime(data['reservation_status_date'], format = '%Y-%m-%d')
data['cancellation_days'] = data['arrival_date'] - data['reservation_status_date']
data['cancellation_days'] = data['cancellation_days'].dt.days

In [169]:
# Reservation Status can be ignored since it is being predicted     -> classification
# Reservation Status can be ignored since we will only be training  -> regression
# on those points where the booking has been cancelled
data = data.drop(['reservation_status','reservation_status_date','arrival_date'], axis=1)

# Maintaining a copy of data at this point for regression
regression_data = data.copy()

### Classification

In [170]:
# Splitting data into features and label for classification
X = data.drop('is_canceled', axis='columns')
y = data.is_canceled

In [171]:
# Splitting data into train and test sets (80% train, 20% test)
X_train_and_validation, X_test, y_train_and_validation, y_test = train_test_split(X, y, test_size=0.2)

In [172]:
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

#### Logistic Regression Model

In [173]:
def get_LR_classifier(features, solver, penalty, c_val):
    
    # Deposit Type, Customer Type, Hotel and Market Segment will be one hot encoded
    categorical_features = list(
        set(features) & set(
            [
                'deposit_type',
                'customer_type',
                'hotel',
                'market_segment',
            ]
        )
    )
    numerical_features = list(set(features) - set(categorical_features))

    columnTransformer = ColumnTransformer(
        transformers = [
            (
                'categorial', 
                OneHotEncoder(
                    handle_unknown="ignore"
                ),
                categorical_features
            ),
            (
                'numerical',
                StandardScaler(),
                numerical_features
            )
        ]
    )

    classifier = make_pipeline(
        columnTransformer,
        LogisticRegression(
            max_iter=1000000,
            solver=solver,
            penalty=penalty, 
            C=c_val
        )
    )
    
    return classifier


#### Grid Search with different values for solver, penalty and c value

In [174]:
features = X.columns
X_train, X_validation, y_train, y_validation = train_test_split(X_train_and_validation, y_train_and_validation, test_size=0.2)

solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalties = ['l2']
c_values = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
list_string = list(map(str, c_values))

In [177]:
solvers = ['lbfgs']
c_values = [100000]

In [184]:
from sklearn.metrics import confusion_matrix

best_model = None
best_score = 0
best_predictions = None

for solver in solvers:
    for penalty in penalties:
        scores = []
        for c_val in c_values:
            classifier = get_LR_classifier(features, solver, penalty, c_val)
            classifier.fit(X=X_train, y=y_train)
            scores.append(classifier.score(X_validation,y_validation))
        mean_score = sum(scores)/len(scores)
        predictions = classifier.predict(X_validation)
        if( mean_score > best_score ):
            best_score = mean_score
            best_model = classifier
            best_predictions = predictions
        
        print("Confusion Matrix for the best model:-")
        print(confusion_matrix(y_validation, best_predictions))
        # plot_data = {'x':np.asarray(list_string),'y':np.asarray(scores)}
        # plot_data = pd.DataFrame(plot_data)
        # plt.figure()
        # sns.scatterplot(data=plot_data, x='x', y='y')
        # print(scores)

Confusion Matrix for the best model:-
[[11725     8]
 [    4  7187]]


In [185]:
best_model.fit(X_train_and_validation, y_train_and_validation)
print(f"Score:- {best_model.score(X_train_and_validation, y_train_and_validation)}")
y_pred = best_model.predict(X_test)
print("Confusion Matrix on the test set:-")
print(confusion_matrix(y_test, y_pred))

Score:- 0.9995243983639304
Confusion Matrix on the test set:-
[[14798    10]
 [    1  8846]]


#### SVM Model

In [49]:
def get_SVM_classifier(features):
    
    # Deposit Type, Customer Type, Hotel and Market Segment will be one hot encoded
    categorical_features = list(
        set(features) & set(
            [
                'deposit_type',
                'customer_type',
                'hotel',
                'market_segment',
            ]
        )
    )
    numerical_features = list(set(features) - set(categorical_features))

    columnTransformer = ColumnTransformer(
        transformers = [
            (
                'categorial', OneHotEncoder(
                    handle_unknown="ignore"
                ), 
                categorical_features
            ),
            (
                'numerical', 
                StandardScaler(), 
                numerical_features
            )
        ]
    )

    classifier = make_pipeline(
        columnTransformer,
        svm.SVC()
    )
    
    return classifier

In [50]:
print(cross_val_score(estimator=get_SVM_classifier(features), X=X, y=y, cv=kfold))

[0.97281759 0.97281759 0.97446521 0.9751839  0.97391562]


### Regression

In [None]:
regression_data = regression_data[regression_data['is_canceled'] == 1]
regression_data = regression_data.drop(['is_canceled'], axis = 1)
regression_data.corr()['cancellation_days']
#regression_data=regression_data[['hotel', 'lead_time', 'year', 'arrival_date_week_number', 'stays_in_weekend_nights', 'stays_in_week_nights', 'children', 'babies', 'meal', 'market_segment', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'deposit_type', 'days_in_waiting_list', 'customer_type', 'adr', 'total_of_special_requests', 'reserved_assigned_match','cancellation_days']]

In [None]:
X_reg = regression_data.drop('cancellation_days', axis='columns')
y_reg = regression_data.cancellation_days


In [None]:
def get_LR_regressor(features):
    # Deposit Type, Customer Type, Hotel and Market Segment will be one hot encoded
    categorical_features = list(set(features) & set([
        'deposit_type',
        'customer_type',
        'hotel',
        'market_segment',
    ]))
    numerical_features = list(set(features) - set(categorical_features))

    columnTransformer = ColumnTransformer(
        transformers = [
            ('categorial', OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ('numerical', StandardScaler(), numerical_features)
        ]
    )

    regressor = make_pipeline(
        columnTransformer,
        LinearRegression()
    )
    
    return regressor

In [None]:
features = X_reg.columns
X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.2)

kfold = KFold(n_splits=5, shuffle=True, random_state=0)

In [None]:


regressor = get_LR_regressor(features)
regressor.fit(X=X_train, y=y_train)
print("EQN: ", regressor[1].coef_)
print("Score: ", regressor.score(X_test, y_test))
print(cross_val_score(estimator=get_LR_regressor(features), X=X_reg, y=y_reg, cv=kfold))

## Feature selection by backward enumeration

In [None]:
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
class Solution:
    features: List[int]
    mse: float

    def __init__(self, y: ArrayLike):
        self.features = list()
        self.mse = mean_squared_error(y, [y.mean()]*len(y))

    def update(self, features: List[int], mse: float) -> bool:
        if(mse < self.mse):
            print(', '.join(str(x) for x in features))
            print(f"\tNew Error: {mse:.3f} better than {self.mse:.3f}")
            self.features = features
            self.mse = mse
            return True

        return False

In [None]:
all_features = features
current_features = all_features
best = Solution(y)

In [None]:
while len(current_features)>0:
    selected_feature = None

    for feature in current_features:
        new_features = current_features.drop(feature)
        Xr = X[new_features]
        mses = cross_val_score(estimator=get_LR_regressor(new_features), X=Xr, y=y, cv=kfold)
        mse = -np.average(mses)

        if(best.update(features=new_features, mse=mse)):
            selected_feature = feature
        
    if(selected_feature):
        current_features = current_features.drop(selected_feature)
    else:
        break

In [None]:
best_features = list(best.features)


## Polynomial regression

In [None]:
def get_poly_regressor(features):
    # Deposit Type, Customer Type, Hotel and Market Segment will be one hot encoded
    categorical_features = list(set(features) & set([
        'deposit_type',
        'customer_type',
        'hotel',
        'market_segment',
    ]))
    numerical_features = list(set(features) - set(categorical_features))

    columnTransformer = ColumnTransformer(
        transformers = [
            ('categorial', OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ('numerical', StandardScaler(), numerical_features)
        ]
    )
    regressor = make_pipeline(
        columnTransformer,
        PolynomialFeatures(degree= 2),
        LinearRegression()
    )
    
    return regressor

In [None]:
poly_regressor = get_poly_regressor(features)
poly_regressor.fit(X=X_train, y=y_train)
print("Score: ", poly_regressor.score(X_test, y_test))
print(cross_val_score(estimator=get_poly_regressor(features), X=X_reg, y=y_reg, cv=kfold))

In [None]:
from sklearn.metrics import r2_score
y_pred_poly = poly_regressor.predict(X_train)
y_pred = regressor.predict(X_train)

print("R squared quadratic: {}".format(r2_score(y_true=y_train,y_pred=y_pred_poly)))
print("R squared: {}".format(r2_score(y_true=y_train,y_pred=y_pred)))
