In [1]:
pip install altair

Note: you may need to restart the kernel to use updated packages.


In [2]:
import altair as alt
import numpy as np
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

## Predicting a Canceled Hotel Booking

## Introduction 

Every year, many hotel bookings are made and sometimes these bookings are canceled. This can cause the hotel to lose money and business. This data set contains information about hotel bookings, details about guests, their reservations, hotel attributes, and whether the reservations were canceled. We will use the columns "lead_time" (the number of days between when the reservation is made and the booking itself), "arrival_date_month" (the month of the year the booking is made for), and "adr" (the average daily rate of the room) to classify a booking as either canceled or not canceled using the "is_canceled" column. We will use the results of our analysis to suggest ways in which hotels can improve their business model so less guests cancel reservations.

[Hotel Booking Dataset on Kaggle](https://www.kaggle.com/datasets/saadharoon27/hotel-booking-dataset)

Data Atrributes:

- hotel: The type of hotel, either "City Hotel" or "Resort Hotel."
- is_canceled: Binary value indicating whether the booking was cancelled (1) or not (0).
- lead_time: Number of days between booking and arrival.
- arrival_date_year: Year of arrival date.
- arrival_date_month: Month of arrival date.
- arrival_date_week_number: Week number of arrival date.
- arrival_date_day_of_month: Day of the month of arrival date.
- stays_in_weekend_nights: Number of weekend nights (Saturday or Sunday) the guest stays.
- stays_in_week_nights: Number of weekday nights (Monday to Friday) the guest stays.
- adults: Number of adults.
- children: Number of children.
- babies: Number of babies.
- meal: Type of meal booked.
- country: Country of origin.
- market_segment: Market segment designation.
- distribution_channel: Booking distribution channel.
- is_repeated_guest: Binary value indicating whether the guest is a repeated guest (1) or not (0).
- previous_cancellations: Number of previous booking cancellations.
- previous_bookings_not_canceled: Number of previous bookings not cancelled.
- reserved_room_type: Code of room type reserved.
- assigned_room_type: Code of room type assigned at check-in.
- booking_changes: Number of changes/amendments made to the booking.
- deposit_type: Type of deposit made.
- agent: ID of the travel agency.
- company: ID of the company.
- days_in_waiting_list: Number of days in the waiting list before booking.
- customer_type: Type of booking.
- adr: Average daily rate.
- required_car_parking_spaces: Number of car parking spaces required.
- total_of_special_requests: Number of special requests made.
- reservation_status: Reservation last status.
- reservation_status_date: Date of the last status.
- name: Guest's name. (Not Real)
- email: Guest's email address.(Not Real)
- phone-number: Guest's phone number. (Not Real)
- credit_card: Guest's credit card details. (Not Real)

## Methods and Results

### Preliminary exploratory data analysis:

#### Importing data:

In [4]:
# Import the data from a google docs spreadsheet
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vTdwTKkB_43NV_73UYihNEO66dAc4V_7cMmO77qsMsSjuZdXhqRiiauabSfHMmoKL70SMLpBYZecdbQ/pub?gid=1065236226&single=true&output=csv"
hotel_booking_full = pd.read_csv(url)
hotel_booking_full.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,name,email,phone-number,credit_card
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,Transient,0.0,0,0,Check-Out,2015-07-01,Ernest Barnes,Ernest.Barnes31@outlook.com,669-792-1661,************4322
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,Transient,0.0,0,0,Check-Out,2015-07-01,Andrea Baker,Andrea_Baker94@aol.com,858-637-6955,************9157
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,Transient,75.0,0,0,Check-Out,2015-07-02,Rebecca Parker,Rebecca_Parker@comcast.net,652-885-2745,************3734
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,Transient,75.0,0,0,Check-Out,2015-07-02,Laura Murray,Laura_M@gmail.com,364-656-8427,************5677
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,Transient,98.0,0,1,Check-Out,2015-07-03,Linda Hines,LHines@verizon.com,713-226-5883,************5498


##### Altair can only take data up to 5000 entries so we will scale down our data

In [5]:
# Scale down our data to 5000 entries
hotel_booking = hotel_booking_full[hotel_booking_full['lead_time'].apply(lambda x: isinstance(x, int))][0:5000]

In [6]:
# Display the amount of columns and rows
hotel_booking.shape

(5000, 36)

In [7]:
hotel_booking['lead_time'].value_counts()

0      198
1       97
78      79
2       72
36      68
      ... 
382      1
344      1
248      1
336      1
342      1
Name: lead_time, Length: 272, dtype: int64

In [8]:
# Display the data types in the data frame
hotel_booking.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

#### Nulls:

In [9]:
# Display the number of null values in the data frame
hotel_booking.isna().sum()

hotel                                0
is_canceled                          0
lead_time                            0
arrival_date_year                    0
arrival_date_month                   0
arrival_date_week_number             0
arrival_date_day_of_month            0
stays_in_weekend_nights              0
stays_in_week_nights                 0
adults                               0
children                             0
babies                               0
meal                                 0
country                              2
market_segment                       0
distribution_channel                 0
is_repeated_guest                    0
previous_cancellations               0
previous_bookings_not_canceled       0
reserved_room_type                   0
assigned_room_type                   0
booking_changes                      0
deposit_type                         0
agent                              814
company                           4708
days_in_waiting_list     

There are only null values in columns "agent", "company", and "country". Since we will not use these columns for our analysis, rather than removing all the bookings with null values, we will remove these columns instead.

In [10]:
# Remove the columns "agent" and "company"
hotel_booking = hotel_booking.drop(columns=['agent', 'company', 'country'])
hotel_booking.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'market_segment', 'distribution_channel', 'is_repeated_guest',
       'previous_cancellations', 'previous_bookings_not_canceled',
       'reserved_room_type', 'assigned_room_type', 'booking_changes',
       'deposit_type', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date', 'name', 'email',
       'phone-number', 'credit_card'],
      dtype='object')

#### Infomation about the FULL dataset:

In [11]:
# Display the columns with the number of non null values, null values and its data type
hotel_booking_info = pd.DataFrame({"name": hotel_booking.columns, 
                                   "non-nulls": len(hotel_booking)-hotel_booking.isnull().sum().values, 
                                   "nulls": hotel_booking.isnull().sum().values, 
                                   "type": hotel_booking.dtypes.values})
hotel_booking_info

Unnamed: 0,name,non-nulls,nulls,type
0,hotel,5000,0,object
1,is_canceled,5000,0,int64
2,lead_time,5000,0,int64
3,arrival_date_year,5000,0,int64
4,arrival_date_month,5000,0,object
5,arrival_date_week_number,5000,0,int64
6,arrival_date_day_of_month,5000,0,int64
7,stays_in_weekend_nights,5000,0,int64
8,stays_in_week_nights,5000,0,int64
9,adults,5000,0,int64


#### Distribution of cancelled vs not cancelled reservations:

In [13]:
# Display the number of canceled and not canceled bookings
hotel_booking['is_canceled'].value_counts()

0    2702
1    2298
Name: is_canceled, dtype: int64

In [14]:
# Display the percentage of cancelled and not cancelled bookings
notcan_dist = round((hotel_booking['is_canceled'].value_counts()[0]/hotel_booking['is_canceled'].count())*100,2)
can_dist = round((hotel_booking['is_canceled'].value_counts()[1]/hotel_booking['is_canceled'].count())*100,2)
print("Not Canceled Bookings:", str(notcan_dist)+"%" "\n"
      "Canceled Bookings:" , str(can_dist)+"%" )

Not Canceled Bookings: 54.04%
Canceled Bookings: 45.96%


We have a relatively even split between not canceled and canceled bookings

### Splitting into Training and Testing Data

In [15]:
# Split the data into training and testing data
hotel_training, hotel_testing = train_test_split(hotel_booking, test_size = 0.25, random_state=64)
hotel_training.shape

(3750, 33)

In [16]:
notcan_train_dist = round((hotel_training['is_canceled'].value_counts()[0]/hotel_training['is_canceled'].count())*100,2)
can_train_dist = round((hotel_training['is_canceled'].value_counts()[1]/hotel_training['is_canceled'].count())*100,2)
notcan_test_dist = round((hotel_testing['is_canceled'].value_counts()[0]/hotel_testing['is_canceled'].count())*100,2)
can_test_dist = round((hotel_testing['is_canceled'].value_counts()[1]/hotel_testing['is_canceled'].count())*100,2)
print("Training Distribution" "\n"
      "\t" "Not Canceled Bookings:", str(notcan_train_dist)+"%" "\n"
      "\t" "Canceled Bookings:" , str(can_train_dist)+"%" "\n"
      "Testing Distribution" "\n"
      "\t" "Not Canceled Bookings:", str(notcan_test_dist)+"%" "\n"
      "\t" "Canceled Bookings:" , str(can_test_dist)+"%" "\n")

Training Distribution
	Not Canceled Bookings: 53.79%
	Canceled Bookings: 46.21%
Testing Distribution
	Not Canceled Bookings: 54.8%
	Canceled Bookings: 45.2%



### Training Data Information:

In [17]:
# Display the columns of the training data along with the number of non null values, null values and data types
hotel_training_info = pd.DataFrame({"name": hotel_training.columns, 
                                   "non-nulls": len(hotel_training)-hotel_training.isnull().sum().values, 
                                   "nulls": hotel_training.isnull().sum().values, 
                                   "type": hotel_training.dtypes.values})
hotel_training_info

Unnamed: 0,name,non-nulls,nulls,type
0,hotel,3750,0,object
1,is_canceled,3750,0,int64
2,lead_time,3750,0,int64
3,arrival_date_year,3750,0,int64
4,arrival_date_month,3750,0,object
5,arrival_date_week_number,3750,0,int64
6,arrival_date_day_of_month,3750,0,int64
7,stays_in_weekend_nights,3750,0,int64
8,stays_in_week_nights,3750,0,int64
9,adults,3750,0,int64


### Exploratory Analysis of Lead time and Arrival Month

In [48]:
# Display the number of cancelled vs. not cancelled bookings in a plot for months and lead time

With the following graph...

## Part 1: Training to Predict with Lead Time and Arrival Month

### Cleaning Data for Training

In [21]:
def month_converter(month):
    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    return months.index(month) + 1

In [22]:
hotel_booking['arrival_date_month'] = hotel_booking['arrival_date_month'].map(month_converter)

In [23]:
hotel_booking['lead_time'].value_counts()

0      198
1       97
78      79
2       72
36      68
      ... 
382      1
344      1
248      1
336      1
342      1
Name: lead_time, Length: 272, dtype: int64

### Training

In [24]:
clean_hotel_training, clean_hotel_testing = train_test_split(hotel_booking, test_size = 0.25, random_state=64)

In [25]:
notcan_train_dist = round((clean_hotel_training['is_canceled'].value_counts()[0]/clean_hotel_training['is_canceled'].count())*100,2)
can_train_dist = round((clean_hotel_training['is_canceled'].value_counts()[1]/clean_hotel_training['is_canceled'].count())*100,2)
notcan_test_dist = round((clean_hotel_testing['is_canceled'].value_counts()[0]/clean_hotel_testing['is_canceled'].count())*100,2)
can_test_dist = round((clean_hotel_testing['is_canceled'].value_counts()[1]/clean_hotel_testing['is_canceled'].count())*100,2)
print("Training Distribution" "\n"
      "\t" "Not Canceled Bookings:", str(notcan_train_dist)+"%" "\n"
      "\t" "Canceled Bookings:" , str(can_train_dist)+"%" "\n"
      "Testing Distribution" "\n"
      "\t" "Not Canceled Bookings:", str(notcan_test_dist)+"%" "\n"
      "\t" "Canceled Bookings:" , str(can_test_dist)+"%" "\n")

Training Distribution
	Not Canceled Bookings: 53.79%
	Canceled Bookings: 46.21%
Testing Distribution
	Not Canceled Bookings: 54.8%
	Canceled Bookings: 45.2%



In [26]:
hotel_preprocessor = make_column_transformer(
    (StandardScaler(), ["lead_time", "arrival_date_month"]),
)

In [27]:
knn = KNeighborsClassifier(n_neighbors=3) 

X = clean_hotel_training[["lead_time", "arrival_date_month"]]
y = clean_hotel_training["is_canceled"]

X_test = clean_hotel_testing[["lead_time", "arrival_date_month"]]
y_test = clean_hotel_testing["is_canceled"]

hotel_pipe = make_pipeline(hotel_preprocessor, knn).fit(X, y)

hotel_pipe

### Finetuning

In [28]:
param_grid = {"kneighborsclassifier__n_neighbors": range(1,1000,50)}

hotel_tune_grid = GridSearchCV(
    estimator=hotel_pipe,
    param_grid=param_grid,
    cv=5
)

In [29]:
acc_grid = pd.DataFrame(hotel_tune_grid.fit(X,y).cv_results_)
acc_grid = acc_grid.rename(columns={"param_kneighborsclassifier__n_neighbors":"n_neighbors"})

In [30]:
accuracy_vs_k = alt.Chart(acc_grid).mark_line(point=True).encode(
    x=alt.X("n_neighbors"),
    y=alt.Y("mean_test_score", scale=alt.Scale(domain=(0.6, 0.75)))
)

accuracy_vs_k

In [31]:
acc_grid = acc_grid.sort_values(by="mean_test_score",ascending=True)
best_n = acc_grid.iloc[0].n_neighbors
best_n

351

In [32]:
knn_tuned = KNeighborsClassifier(n_neighbors=351) 
hotel_pipe_tuned = make_pipeline(hotel_preprocessor, knn).fit(X, y)

hotel_pipe_tuned

## Part 2: Training to Predict with ADR (Average Daily Rate)

### Training

In [33]:
hotel_preprocessor_adr = make_column_transformer(
    (StandardScaler(), ["adr"]),
)

In [34]:
knn_adr = KNeighborsClassifier(n_neighbors=3) 

X_adr = clean_hotel_training[["adr"]]
y_adr = clean_hotel_training["is_canceled"]

X_adr_test = clean_hotel_testing[["adr"]]
y_adr_test = clean_hotel_testing["is_canceled"]

hotel_pipe_adr = make_pipeline(hotel_preprocessor_adr, knn_adr).fit(X_adr, y_adr)
hotel_pipe_adr

### Finetuning

In [35]:
param_grid_adr = {"kneighborsclassifier__n_neighbors": range(1,1200,50)}

hotel_tune_grid_adr = GridSearchCV(
    estimator=hotel_pipe_adr,
    param_grid=param_grid_adr,
    cv=5
)

In [36]:
acc_grid_adr = pd.DataFrame(hotel_tune_grid_adr.fit(X_adr,y_adr).cv_results_)
acc_grid_adr = acc_grid_adr.rename(columns={"param_kneighborsclassifier__n_neighbors":"n_neighbors"})

In [37]:
accuracy_vs_k_adr = alt.Chart(acc_grid_adr).mark_line(point=True).encode(
    x=alt.X("n_neighbors"),
    y=alt.Y("mean_test_score", scale=alt.Scale(domain=(0.5, 0.65)))
)
accuracy_vs_k_adr

In [38]:
acc_grid_adr = acc_grid_adr.sort_values(by="mean_test_score",ascending=True)
best_n_adr = acc_grid_adr.iloc[0].n_neighbors
best_n_adr

951

In [39]:
knn_tuned_adr = KNeighborsClassifier(n_neighbors=951) 
hotel_pipe_tuned_adr = make_pipeline(hotel_preprocessor_adr, knn_adr).fit(X_adr, y_adr)
hotel_pipe_tuned_adr

## Part 3: Classifying a new Data Point

## Discussion




-------------------------------------------------------------------------------------------------------------------------------
Discussion:
summarize what you found
discuss whether this is what you expected to find?
discuss what impact could such findings have?
discuss what future questions could this lead to?


## References

Saad Haron. 2023. Hotel Booking Dataset Version 1 [Data File]. 
Retrieved from https://www.kaggle.com/datasets/saadharoon27/hotel-booking-dataset/data