In [None]:
# common imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# pandas imports
from pandas.plotting import scatter_matrix

# machine learning imports
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.pipeline import Pipeline
from sklearn.pipeline import TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.base import BaseEstimator
from sklearn import metrics

# display setup
pd.set_option("display.max_columns", None) # the None parameter displays unlimited columns
sns.set(style="whitegrid") # for plots

## 1. Getting the Data

In [None]:
# read the csv file
df = pd.read_csv(r"hotel_bookings.csv")

In [None]:
# display the first 5 rows for a quick look
df.head()

In [None]:
# DataFrame shape (rows, columns)
# understand the amount of data we are working with
df.shape

In [None]:
# description of data
df.info()

> In a first observation it is clear that some features have
> missing values (i.e. "company" and "agent" columns).
> We will need to take care of this later.

In [None]:
# summary of the numerical attributes
# null values are ignored
df.describe()

> ### Features in the DataFrame:
>> 1. hotel: Resort Hotel or City Hotel
>> 2. is_canceled: Value indicating if the booking was canceled (1) or not (0)
>> 3. lead_time: Number of days between the booking date to the arrival date
>> 4. arrival_date_year: Year of arrival date
>> 5. arrival_date_month: Month of arrival date
>> 6. arrival_date_week_number: Week number according to year of arrival
>> 7. arrival_date_day_of_month: Day of arrival date
>> 8. stays_in_weekend_nights: Number of weekend nights booked (Saturday or Sunday)
>> 9. stays_in_week_nights: Number of week nights booked (Monday to Friday)
>> 10. adults: Number of adults
>> 11. children: Number of children
>> 12. babies: Number of babies
>> 13. meal: Type of meal booked
>> 14. country: Country of origin
>> 15. market_segment: Market segment designation, typically influences the price sensitivity
>> 16. distribution_channel: Booking distribution channel, refers to how the booking was made
>> 17. is_repeated_guest: Value indication if booking name was from a repeated guest (1) or not (0)
>> 18. previous_cancellations: Number of previous cancellations prior to current booking
>> 19. previous_bookings_not_canceled: Number of previous booking not canceled prior to current booking
>> 20. reserved_room_type: Code of room type reserved
>> 21. assigned_room_type: Code for the type of room assigned to the booking
>> 22. booking_changes: Number of changes made to the booking since entering the hotel management system
>> 23. deposit_type: Type of deposit made for the reservation
>> 24. agent: ID of the travel agency that made the booking
>> 25. company: ID of the company/organization that made the booking or is responsible for payment
>> 26. days_in_waiting_list: Number of days booking was in the waiting list until it was confirmed
>> 27. customer_type: Type of booking
>> 28. adr: Average Daily Rate (the sum of transactions divided by the number of nights stayed)
>> 29. required_car_parking_spaces: Number of car parking spaces requested
>> 30. total_of_special_requests: Number of special requests made by the customer
>> 31. reservation_status: Last reservation status (Canceled, Check-Out, No-Show)
>> 32. reservation_status_date: Date at which the last status was set
>>
>>> ##### *Understanding the features could help gain insight on how to treat null values.*

In [None]:
# a histogram plot for each numerical attribute
df.hist(bins=50, figsize=(20,15))
plt.tight_layout()
plt.show()

> Initial observations from the histograms:
>> 1. Some weeks have more bookings. This could be because of holiday or summer seasons, when people tend to travel more.
>> 2. According to the lead_time plot most bookings were made shortly before arrival.
>> 3. Booking tend to be without children or babies.
>> 4. It seems that the most accommodations are two weeks long or shorter.
>> 5. While most bookings were not canceled, there are thousands of instances that were.

> # Objective:
> ## Predicting if a booking will be canceled.
>> ### Chosen Feature:
>> #### *is_canceled* column
>>> 0 means the booking was not canceled
>>>
>>> 1 means the booking was canceled
>> ### Motive:
>> Like any business, hotels are also looking to gain profit. A model that predicts if the booking
>> is likely to be canceled could be a good indication for hotels, as they
>> may prefer to accept the lower risk bookings first.

> ### Splitting the Data:
>> Before further analysis let's split the data into a training set and a testing set.
>> This will ensure avoidance of bias that could occur from learning the data as a whole.

In [None]:
# use sklearn train_test_split function to split the data
# the reason for selecting 0.15 as the test size is because the dataset is very large
# the random state parameter ensures that data will be shuffled and split the same way in each run
train_set, test_set = train_test_split(df, test_size=0.15, random_state=42)

In [None]:
print("Number of instances in training set: ", len(train_set))
print("Number of instances in testing set: ", len(test_set))

## 2. Understanding and Visualizing the Data
> ##### *The motivation for this section is to gain more insights.*

In [None]:
# deep copy of the training set
df2 = train_set.copy()

In [None]:
df2.head(2)

> ### Missing Features:

In [None]:
# the methods below calculate the number of missing values
missing_values = df2.isna().sum()
missing_values = missing_values[missing_values != 0]
missing_values

In [None]:
# replace missing values

# can assume that there were no children
df2.fillna({"children": 0}, inplace=True)

# missing countries can be labeled unknown
df2.fillna({"country": "Unknown"}, inplace=True)

# missing agent ID can be zero, likely booked privately
df2.fillna({"agent": 0}, inplace=True)

# missing company ID can be zero, likely personal booking
df2.fillna({"company": 0}, inplace=True)

In [None]:
# check that the values were filled
df2.isna().sum()

> ### Numerical Attributes:

In [None]:
# method creates a correlations matrix
corr_matrix = df2.corr()

In [None]:
# looking at attributes correlation with is_canceled feature
corr_matrix["is_canceled"].sort_values(ascending=False)

In [None]:
# experimenting with attribute combinations

# adds column with total amount of guests that stayed
df2["guests_stayed"] = df2["adults"] + df2["children"] + df2["babies"]

# adds column with total nights stayed
df2["nights_stayed"] = df2["stays_in_week_nights"] + df2["stays_in_weekend_nights"]

In [None]:
# looking at the correlation matrix again with the added columns
corr_matrix = df2.corr()
corr_matrix["is_canceled"].sort_values(ascending=False)

> ### Correlations with is_canceled Attribute - Overview:
> The strongest positive correlations (0.1 or more) are:
> * lead_time
> * previous_cancellations
>
> The strongest negative correlations (-0.1 or less) are:
> * total_of_special_requests
> * required_car_parking_spaces
> * booking_changes
>
> The attribute combinations tested (guests stayed and nights stayed) both had weak correlations.

> ### Cancellations According to Lead Time

In [None]:
# density plot of lead time
# shows the distribution and highest concentration points
plt.figure(figsize=(10,5))
lead_time = df2['lead_time']
lead_time = pd.DataFrame(sorted(lead_time, reverse = True), columns = ['Lead'])
sns.distplot(lead_time)
plt.title("Lead Time", size=20)
plt.xlabel("lead time days", size=15)
plt.ylabel("density", size=15)
plt.tight_layout()
plt.show()

In [None]:
# divides lead time by less than 100 days, 100-355 days and 365 or more days
lead_time_1 = df2[df2["lead_time"] < 100]
lead_time_2 = df2[(df2["lead_time"] >= 100) & (df2["lead_time"] < 365)]
lead_time_3 = df2[df2["lead_time"] >= 365]

In [None]:
# calculates cancellations according to lead time groups
lead_cancel_1 = lead_time_1["is_canceled"].value_counts()
lead_cancel_2 = lead_time_2["is_canceled"].value_counts()
lead_cancel_3 = lead_time_3["is_canceled"].value_counts()

In [None]:
# density plot for each lead time group
fig, bx = plt.subplots(1,3,figsize=(21,6))
sns.distplot(lead_time_1["lead_time"], ax = bx[0])
bx[0].set_title("lead_time [0,100) days", size=20)
sns.distplot(lead_time_2["lead_time"], ax = bx[1])
bx[1].set_title("lead_time [100,365) days", size=20)
sns.distplot(lead_time_3["lead_time"], ax = bx[2])
bx[2].set_title("lead_time [365,max) days", size=20)
plt.tight_layout()
plt.show()

In [None]:
# total count of lead time according to cancellation
total_lead_days_cancel = pd.DataFrame(data=[lead_cancel_1,lead_cancel_2,lead_cancel_3],
             index=["[0,100) days", "[100,365) days", "[365,max) days"])
total_lead_days_cancel

In [None]:
fig, ax = plt.subplots(1,3, figsize=(21,6))
ax[0].pie(np.array([total_lead_days_cancel[0][0], total_lead_days_cancel[1][0]]),
          labels=["not_canceled", "canceled"], autopct='%1.1f%%', startangle=90,
          colors=['forestgreen', 'firebrick'])
ax[0].set_title("lead_time [0,100) days", size=20)
ax[1].pie(np.array([total_lead_days_cancel[0][1], total_lead_days_cancel[1][1]]),
          labels=["not_canceled", "canceled"], autopct='%1.1f%%', startangle=90,
          colors=['forestgreen', 'firebrick'])
ax[1].set_title("lead_time [100,365) days", size=20)
ax[2].pie(np.array([total_lead_days_cancel[0][2], total_lead_days_cancel[1][2]]),
          labels=["not_canceled", "canceled"], autopct='%1.1f%%', startangle=90,
          colors=['forestgreen', 'firebrick'])
ax[2].set_title("lead_time [365,max) days", size=20)
plt.tight_layout()
plt.show()

> #### Observations:
>> * Most bookings occur about 5 days prior to arrival.
>> * When the lead time is larger the chances for cancellation increase.
>> * The amount of bookings is steady overall between 20-100 days, then drops.

> ### Cancellations According to Previous Cancellations

In [None]:
# gets previous cancellation column
prev_cancel = df2["previous_cancellations"]

In [None]:
# sorts index values
prev_cancel.value_counts().sort_index()

In [None]:
print("Cancellation Rates:\n")
print('Never canceled =' ,str(round(df2[df2['previous_cancellations']==0]
                                            ['is_canceled'].mean()*100,2))+' %')
print('Canceled once =' ,str(round(df2[df2['previous_cancellations']==1]
                                            ['is_canceled'].mean()*100,2))+' %')
print('Canceled more than 10 times:',str(round(df2[df2['previous_cancellations']>10]
                                            ['is_canceled'].mean()*100,2))+' %')
print('Canceled more than 11 times:' ,str(round(df2[df2['previous_cancellations']>11]
                                            ['is_canceled'].mean()*100,2))+' %')

In [None]:
# creates a list with previous cancellations indices
prev_cancel_index = df2["previous_cancellations"].value_counts().index.to_list()
# sorts the list
prev_cancel_index.sort()

# calculates the average percentage of cancellations for each value in DataFrame
percentage_prev_cancel= []
for i in prev_cancel_index:
    percentage_prev_cancel.append((round(df2[df2["previous_cancellations"]==i]
                                        ["is_canceled"].mean()*100,2)))
percentage_prev_cancel

In [None]:
# creates a DataFrame with the results
df_prev_cancel = pd.DataFrame(percentage_prev_cancel, index=prev_cancel_index, columns=["Previous Cancellations %"])
df_prev_cancel

In [None]:
# plots previous cancellations by percentages
df_prev_cancel.plot(figsize= (10,5))
plt.title("Previous Cancellations", size=20)
plt.xlabel("Number of Previous Cancellations", size=15)
plt.ylabel("%", size=15)
plt.tight_layout()
plt.show()

> ### Observations:
>> The percentages show that when there are more previous cancellations, there is
>> a substantially higher chance the customer will cancel again.

> ### Cancellations According to Total of Special Requests

In [None]:
# number of instances for each value
df2["total_of_special_requests"].value_counts()

In [None]:
# group by cancellations
is_canceled = df2.groupby(by="is_canceled")

In [None]:
# get groups according to binary outcomes
canceled = is_canceled.get_group(1)
not_canceled = is_canceled.get_group(0)

In [None]:
# counts values for each outcome
special_requests_0 = not_canceled["total_of_special_requests"].value_counts()
special_requests_1 = canceled["total_of_special_requests"].value_counts()

In [None]:
# creates a DataFrame for each outcome
df_special_requests_0 = pd.DataFrame(special_requests_0.values, index=special_requests_0.index,
                                     columns=["not_canceled"])
df_special_requests_1 = pd.DataFrame(special_requests_1.values, index=special_requests_1.index,
                                     columns=["canceled"])

In [None]:
# joins both DataFrames side by side
df_special_requests= df_special_requests_0.join(df_special_requests_1)

In [None]:
# adds total of both outcomes
special_requests_total = df_special_requests["not_canceled"] + df_special_requests["canceled"]

# calculates percentage of cancellations for each number of requests value individually
special_requests_percentage = []
for i in special_requests_total.index:
    special_requests_percentage.append(round((special_requests_1[i]/special_requests_total[i])*100,2))
special_requests_percentage

In [None]:
# add percentages as new column in DataFrame
df_special_requests.join(pd.DataFrame(special_requests_percentage, index=df_special_requests.index,
             columns=["cancellations %"]))

In [None]:
# plots special requests according to cancellations
plt.figure(figsize=(10,5))
sns.countplot(x=df2["total_of_special_requests"], hue=df2["is_canceled"])
plt.title("Special Requests", size=20)
plt.xlabel("Number of Special Requests", size=15)
plt.legend(["not canceled", "canceled"])
plt.tight_layout()
plt.show()

> ### Observations:
>> * Nearly half of the bookings without special requests are canceled.
>> * There are fewer cancellations when the number of special requests increases.

> ### Cancellations According to Required Car Parking Spaces

In [None]:
df2["required_car_parking_spaces"].value_counts().sort_index()

In [None]:
# counts values for each outcome
parking_spaces_0 = not_canceled["required_car_parking_spaces"].value_counts()
parking_spaces_1 = canceled["required_car_parking_spaces"].value_counts()

In [None]:
# value counts for non canceled instances
parking_spaces_0

In [None]:
# value counts for canceled instances
parking_spaces_1

In [None]:
# pie plot of cancellations with zero required parking spaces
plt.pie(x=[parking_spaces_0[0], parking_spaces_1[0]], labels=["not_canceled", "canceled"],
        autopct='%1.1f%%', startangle=90, colors=['forestgreen', 'firebrick'])
plt.title("Zero Required Parking Spaces Cancellations", size=20)
plt.tight_layout()
plt.show()

> ### Observations:
>> * Dividing the instances into groups according to cancellations shows that all canceled
>> bookings were ones without required parking spaces.
>> * This could potentially be a bad indication for cancellations. The model could learn
>> that a booking can be canceled **only** if no parking spaces were required, which does not
>> necessarily have to be the case.

> ### Cancellations According to Booking Changes

In [None]:
df2["booking_changes"].value_counts().sort_index()

In [None]:
# counts values for each outcome
booking_changes_0 = not_canceled["booking_changes"].value_counts()
booking_changes_1 = canceled["booking_changes"].value_counts()

In [None]:
# sorts index numbers by value
booking_changes_0.sort_index()

In [None]:
# sorts index numbers by value
booking_changes_1.sort_index()

In [None]:
df_booking_changes_1 = pd.DataFrame(booking_changes_1, index=booking_changes_0.index)
df_booking_changes_1.fillna({"booking_changes": 0}, inplace=True)
booking_changes_1 = pd.Series(df_booking_changes_1["booking_changes"])

In [None]:
# adds total of both outcomes
booking_changes_total = booking_changes_0 + booking_changes_1

# calculates percentage of cancellations for each number of booking changes individually
percentage_booking_changes = []
for i in booking_changes_total.index:
    percentage_booking_changes.append(round((booking_changes_1[i]/booking_changes_total[i])*100,2))

In [None]:
# create a DataFrame with the percentage of cancellations
df_percentage_booking_changes = pd.DataFrame(percentage_booking_changes, index=booking_changes_total.index,
                                             columns=["cancellations %"])

In [None]:
# creates a DataFrame for each outcome
df_booking_changes_0 = pd.DataFrame(booking_changes_0.values, index=booking_changes_0.index,
             columns=["not_canceled"])
df_booking_changes_1 = pd.DataFrame(booking_changes_1.values, index=booking_changes_1.index,
             columns=["canceled"])

In [None]:
# joins all three DataFrames side by side
df_booking_changes = df_booking_changes_0.join\
    ([df_booking_changes_1, df_percentage_booking_changes])

# remove rows with 0% cancellations
df_booking_changes = df_booking_changes[df_booking_changes["cancellations %"]!=0]
df_booking_changes

> ### Observations:
>> * While a large amount of bookings with no changes were canceled, this category can change overtime
>> which could possibly be a source of leakage.

> ### ADR
>

In [None]:
df2[df2["adr"]==0]["guests_stayed"].value_counts().sort_index()

In [None]:
df2[df2["adr"]==0]["reservation_status"].value_counts()

In [None]:
df2["adr"].sort_values()

> ### Categorical Attributes:

> ### Cancellations According to Hotels and Arrival Month

In [None]:
df2["hotel"].value_counts()

In [None]:
# a plot of the number of instances for each hotel according to cancellations
plt.figure(figsize=(10,5))
sns.countplot(x=df2["hotel"], hue=df2["is_canceled"])
plt.title("Hotel Cancellations", size=20)
plt.legend(["not canceled", "canceled"])
plt.tight_layout()
plt.show()

In [None]:
ordered_months = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

resort_canceled_percent = []
city_canceled_percent = []

# dividing cancellation outcome by hotel and month of arrival
resort_1 = canceled[canceled["hotel"]=="Resort Hotel"]["arrival_date_month"].value_counts()
resort_0 = not_canceled[not_canceled["hotel"]=="Resort Hotel"]["arrival_date_month"].value_counts()
city_1 = canceled[canceled["hotel"]=="City Hotel"]["arrival_date_month"].value_counts()
city_0 = not_canceled[not_canceled["hotel"]=="City Hotel"]["arrival_date_month"].value_counts()

# calculating cancellation percentage according to hotel
for i in ordered_months:
    resort_canceled_percent.append(round((resort_1[i] / (resort_0[i]+resort_1[i]))*100,2))
    city_canceled_percent.append(round((city_1[i]/(city_0[i]+city_1[i]))*100,2))

# creates a DataFrame with the cancellation percentage of each hotel
df_resort_cancel = pd.DataFrame(resort_canceled_percent, index=ordered_months,
                                       columns=["Resort Hotel Canceled %"])
df_city_cancel = pd.DataFrame(city_canceled_percent, index=ordered_months,
                                       columns=["City Hotel Canceled %"])

# joins DataFrames
df_hotel_cancel = df_resort_cancel.join(df_city_cancel)
df_hotel_cancel

> ### Observations:
>> * There are more instances for City Hotel than Resort Hotel in the data.
>> * City Hotel has a higher cancellation rate according to arrival months.

> ### Cancellations According to Meal Booked

In [None]:
# plots meal according to cancellations
plt.figure(figsize=(10,5))
sns.countplot(x=df2["meal"], hue=df2["is_canceled"])
plt.title("Cancellations According to Meal Booked", size=20)
plt.xlabel("meal", size=15)
plt.legend(["not canceled", "canceled"])
plt.tight_layout()
plt.show()

> ### Observations:
>> * The BB (Bed & Breakfast) meal is most common. It is also most frequently canceled.

> ### Cancellations According to Market Segment, Distribution Channel, Customer Type and Room Type

In [None]:
# using groupby to show onl non canceled instances
reserved_room_1 = not_canceled[["reserved_room_type", "market_segment", "customer_type",
                            "distribution_channel", "adr", "guests_stayed"]]

# remove instances without guests
reserved_room_1 = reserved_room_1.loc[(reserved_room_1["guests_stayed"]>0)]
# calculate average adr per guest
# dividing adr by guests stayed
reserved_room_1["adr_per_guest"] = reserved_room_1["adr"] / reserved_room_1["guests_stayed"]

# plot of adr according to market segment and room type
plt.figure(figsize=(10,5))
sns.barplot(x=reserved_room_1["market_segment"], y= reserved_room_1["adr_per_guest"],
            hue=reserved_room_1["reserved_room_type"])
plt.title("ADR According to Market Segment and Room Type", size=20)
plt.legend(loc=2)
plt.tight_layout()
plt.show()

In [None]:
df2["market_segment"].value_counts()

In [None]:
# calculate cancellation percentage according to market segment
market_segment_percent = []

market_segment_1 = canceled["market_segment"].value_counts()
market_segment_total = df2["market_segment"].value_counts()

for i in market_segment_total.index:
    market_segment_percent.append(str(i+": ") +
                    str(round((market_segment_1[i]/market_segment_total[i])*100,2)))
market_segment_percent

In [None]:
df2["distribution_channel"].value_counts()

In [None]:
# calculate cancellation percentage according to distribution channel
distribution_channel_percent = []

distribution_channel_1 = canceled["distribution_channel"].value_counts()
distribution_channel_total = df2["distribution_channel"].value_counts()

for i in distribution_channel_total.index:
    distribution_channel_percent.append(str(i+": ") +
                    str(round((distribution_channel_1[i]/distribution_channel_total[i])*100,2)))
distribution_channel_percent

In [None]:
df2["customer_type"].value_counts()

In [None]:
# calculate cancellation percentage according to customer type
customer_type_percent = []

customer_type_1 = canceled["customer_type"].value_counts()
customer_type_total = df2["customer_type"].value_counts()

for i in customer_type_total.index:
    customer_type_percent.append(str(i+": ") +
                    str(round((customer_type_1[i]/customer_type_total[i])*100,2)))
customer_type_percent

In [None]:
# plot of cancellations according to room type
plt.figure(figsize=(10,5))
sns.countplot(x=df2["reserved_room_type"], hue=df2["is_canceled"])
plt.title("Cancellations According to Room Type", size=20)
plt.legend(["not canceled", "canceled"], loc=1)
plt.tight_layout()
plt.show()

> ### Observations:
>> * Aviation (airline staff) have the highest adr (relative cost of a room when the booking would be made for).
>> * Market segment cancellation rates are highest amongst travel agencies and tour operators.
>> * Distribution channel cancellation rates are highest amongst groups, travel agencies and tour operators.
>> * Customer type cancellation rates are highest amongst transient
>> (meaning the booking is not part of a group or contract and is not associated to another transient booking).
>> * The room type "A" is canceled most frequently.

> ### Cancellations According to Deposit Type

In [None]:
df2["deposit_type"].value_counts()

In [None]:
# calculate deposit type instances percentage in data
deposit_percent = round(df2["deposit_type"].value_counts()/len(df["deposit_type"])*100,4)
deposit_percent

In [None]:
# using groupby to divide according to deposit types
deposit = df2.groupby(by="deposit_type")
non_refund = deposit.get_group("Non Refund")
refundable = deposit.get_group("Refundable")
no_deposit = deposit.get_group("No Deposit")

In [None]:
# calculate number of cancellations according to deposit type
no_deposit_0 = (no_deposit["is_canceled"]==0).sum()
no_deposit_1 = (no_deposit["is_canceled"]==1).sum()
non_refund_0 = (non_refund["is_canceled"]==0).sum()
non_refund_1 = (non_refund["is_canceled"]==1).sum()
refundable_0 = (refundable["is_canceled"]==0).sum()
refundable_1 = (refundable["is_canceled"]==1).sum()
all_canceled = no_deposit_1 + non_refund_1 + refundable_1
all_not_canceled = no_deposit_0 + non_refund_0 + refundable_0

In [None]:
# check that all values were calculated
all_canceled + all_not_canceled == df2["deposit_type"].size

In [None]:
# create a DataFrame with the number of instances for each deposit type
df_deposit_type = pd.DataFrame(index=["Not Canceled", "Canceled"])
df_deposit_type["no_deposit"] = [no_deposit_0, no_deposit_1]
df_deposit_type["non_refund"] = [non_refund_0, non_refund_1]
df_deposit_type["refundable"] = [refundable_0, refundable_1]
df_deposit_type

In [None]:
cancel_labels = ["cancelled", "not Cancelled"]
fig, dx = plt.subplots(1,3, figsize=(21,6))
dx[0].pie(np.array([no_deposit_1, no_deposit_0]), labels=cancel_labels,
            autopct='%1.1f%%', startangle=90, colors=['firebrick', 'forestgreen'])
dx[0].set_title("No Deposit Cancellations", size=20)
dx[1].pie(np.array([non_refund_1, non_refund_0]), labels=cancel_labels,
            autopct='%1.1f%%', startangle=90, colors=['firebrick', 'forestgreen'])
dx[1].set_title("Non Refund Cancellations", size=20)
dx[2].pie(np.array([refundable_1, refundable_0]), labels=cancel_labels,
            autopct='%1.1f%%', startangle=90, colors=['firebrick', 'forestgreen'])
dx[2].set_title("Refundable Cancellations", size=20)
plt.tight_layout()
plt.show()

> #### Observations:
>> * The non refund values and graph looks a bit off. It almost seems as if the values
>> for cancellation were switched!
>> In light of this, it might be better to evaluate the model both with and without this
>> feature.

> ### Cancellations According to Country of Origin

In [None]:
df2["country"].unique().size

In [None]:
canceled["country"].value_counts()

In [None]:
# calculate countries by number of instances that appear in data
country_1 = (df2["country"].value_counts() <= 1).sum()
country_10 = (df2["country"].value_counts() <= 10).sum()
country_50 = (df2["country"].value_counts() <= 50).sum()
country_100 = (df2["country"].value_counts() <= 100).sum()
country_1000 = (df2["country"].value_counts() <= 1000).sum()

print("Number of countries with one or less instances:", country_1,
      "\nNumber of countries with 10 or less instances:", country_10,
      "\nNumber of countries with 50 or less instances:", country_50,
      "\nNumber of countries with 100 or less instances:", country_100,
      "\nNumber of countries with 1000 or less instances:", country_1000)

> ### Observations:
>> * There are 175 unique countries. This indicates that the data is representative
>> worldwide, contrary to a specific region.
>> * More than half of the instances have 50 or fewer observations in the DataFrame.
>> * A model would likely generalize better if we avoid using this column.

## 3. Data Cleaning

In [None]:
# clean copy of training set
df3 = train_set.copy()

In [None]:
# removes instances with zero guests

class RemoveZeroGuests(TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        XData = X.loc[((X["adults"]) + (X["children"]) + (X["babies"])) > 0]
        return XData

In [None]:
df3.shape

In [None]:
# use class to remove instances with zero guests that stayed
df3 = RemoveZeroGuests().fit_transform(df3)

In [None]:
df3.shape

In [None]:
# separate predictors from target values

# drop creates a copy without changing the training set
X_train = df3.drop("is_canceled", axis=1)

# create a deep copy of the target values
y_train = df3["is_canceled"].copy()

> ### Removing the Following Columns:
>> #### Numerical Attributes:
>> * arrival_date_year: This category references towards certain years. This could be
>> problematic for instances during years that do not appear in the training data, or
>> perhaps have bias towards certain years specifically due to different amounts in the
>> training data.
>> * arrival_date_day_of_month: The column arrival date week of month generalizes this.
>> * booking_changes: Could change over time, potentially causing data leakage.
>> * days_in_waiting_list: Could constantly change over time. Additionally, there are many
>> instances. This could prevent the model from generalizing.
>> * agent & company: Represented by an ID. These columns are uninformative since they
>> contain a substantial amount of various numerical values without having an actual
>> numerical meaning. Since other columns (such as market segment) indicate the type of
>> reservation, these columns won't be of much additional use.
>>
>> #### Categorical Attributes:
>> * country: There are many categories, most with few instances. In order to make a model
>> that generalizes, it is better to dismiss this category.
>> * assigned_room_type: Similar to reserved_room_type and seems like the reserved room is
>> a more suitable choice.
>> * reservation_status: Major data leakage! The categories are Check-Out, Canceled and No-Show.
>> This is exactly what we are trying to predict.
>> * reservation_status_date: This is the date when the reservation status was last changed,
>> and therefore is irrelevant.

In [None]:
num_features = ["lead_time", "stays_in_weekend_nights", "stays_in_week_nights", "adults",
                "children", "babies", "is_repeated_guest", "previous_cancellations",
                "previous_bookings_not_canceled", "adr", "required_car_parking_spaces",
                "total_of_special_requests"]

cat_features = ["hotel", "arrival_date_week_number", "arrival_date_month", "meal",
                "market_segment", "distribution_channel", "reserved_room_type",
                "deposit_type", "customer_type"]

In [None]:
# Undefined/SC both represent no meal package and can be combined

class ReplaceMeal(TransformerMixin):

    def fit(self,X, y=None):
        return self

    def transform(self, X):
        XData = X.copy()
        XData["meal"].replace("Undefined", "SC", inplace=True)
        return XData

In [None]:
# SimpleImputer constant default fills values with zero
# MinMaxScaler normalize data (rescale between 0-1)
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant")),
    ("min_max", MinMaxScaler())
])

In [None]:
# SimpleImputer fills missing values with 'Unknown'
# OneHotEncoder converts categories to a numeric dummy array
# (one binary attribute per category)
cat_pipeline = Pipeline([
    ("meal", ReplaceMeal()),
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("one_hot", OneHotEncoder(handle_unknown="ignore"))
])

In [None]:
# column transformer:
# features generated by each transformer will be concatenated to form a single feature space
# columns of the original feature matrix that are not specified are dropped
full_pipeline = ColumnTransformer([
    ("numerical", num_pipeline, num_features),
    ("categorical", cat_pipeline, cat_features)
])

In [None]:
# transform training data using pipeline
X_train_prepared = full_pipeline.fit_transform(X_train)

## 4. Training and Evaluating Models

Accuracy is less relevant for an imbalanced classification problem.
Evaluating by a metric that represents the data better is important.

In [None]:
# function prints scores, mean and std
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [None]:
def display_evaluation(train, pred):
    print("Accuracy:", metrics.accuracy_score(train, pred))
    print("Confusion Matrix:\n", metrics.confusion_matrix(train, pred))
    print("Precision:", metrics.precision_score(train, pred))
    print("Recall:", metrics.recall_score(train, pred))
    print("F1 Score:", metrics.f1_score(train, pred))
    print("ROC AUC Score:", metrics.roc_auc_score(train, pred))

### KNN

In [None]:
# instantiate classifier
# default k=5
knn = KNeighborsClassifier()

In [None]:
# train the model on the training set
knn.fit(X_train_prepared, y_train)

In [None]:
# test on a few instances from training data
some_data = X_train.iloc[:10]
some_labels = y_train.iloc[:10]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", knn.predict(some_data_prepared))
print("Labels:", list(some_labels))

In [None]:
knn_pred_1 = knn.predict(X_train_prepared)

In [None]:
display_evaluation(y_train, knn_pred_1)

In [None]:
# instantiate KNN model using distance instead of uniform
# distance means closer instances have a larger weight
# uniform weighs all instances equally
# default k=5
knn = KNeighborsClassifier(weights="distance")

In [None]:
knn.fit(X_train_prepared, y_train)

In [226]:
# test on a few instances from training data
some_data = X_train.iloc[:10]
some_labels = y_train.iloc[:10]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", knn.predict(some_data_prepared))
print("Labels:", list(some_labels))

Predictions: [0 0 0 0 0 0 1 1 1 0]
Labels: [0, 0, 0, 0, 0, 0, 1, 1, 1, 0]


In [None]:
knn_pred_2 = knn.predict(X_train_prepared)

In [227]:
display_evaluation(y_train, knn_pred_2)

Accuracy: 0.9895289508225843
Confusion Matrix:
 [[63548   372]
 [  689 36718]]
Precision: 0.9899703424103532
Recall: 0.9815809875157057
F1 Score: 0.98575781575097
ROC AUC Score: 0.9878806063986538


> So far, the performance of the KNN model using distance weights instead of uniform
> drastically improved the results.
>
> We will test on another model and look for the best hyperparameters for most promising
> model.

> ### What is the Random Forest Classification Model?

Forests are based on multiple decision trees, so it is vital to first understand how decision
trees work.

A decision tree is a non-linear model built by constructing many linear boundaries.
The tree works as a sequence of yes or no, true or false questions that progress down
the tree until reaching a predicted class. The data is split into nodes based on
feature values. This model is good for occasions when there is no single linear line that
can divide the data. Gini Impurity of a node represents the probability that a randomly chosen
sample would be incorrectly classified, so the goal is to reduce this as much as possible.

Using a single decision tree could cause overfitting of the training data. For example,
a decision tree could create a leaf node (the predicted class) for each instance.
Using a forest could help generalize better to new data. The random forest model
samples random point and subsets of features when training. Then, the predictions are made
by averaging the predictions of each decision tree.