# Predicting Hotel Booking Cancellations

In this project, I'll use a hotel occupancy dataset. After data cleaning and exploratory analysis, I'll build predictive models with Logistic Regression, Random Forest, Decision Tree, KNN, and Naive Bayes to determine whether a booking will be canceled or not. By comparing their accuracy, I'll identify the most effective model for this prediction task.

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Import data

In [2]:
df = pd.read_csv("./Datasets/hotel_bookings.csv")
display(df.head(), df.shape)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,7/1/2015
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,7/1/2015
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,7/2/2015
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,7/2/2015
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,7/3/2015


(119390, 32)

### Data Cleaning

In [3]:
# Check missing/null data
df.isnull().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

Dealing with 'Agent', 'Company' and 'Country' null values

In [5]:
# Drop 'agent' and 'company' as they hold too many NaN values.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it once the columns are gone is a no-op rather
# than a KeyError.
df = df.drop(columns=['agent', 'company'], errors='ignore')

In [4]:
# For 'Country' column, check the most frequent value to fill NaN values.
df['country'].value_counts()

PRT    48590
GBR    12129
FRA    10415
ESP     8568
DEU     7287
       ...  
DJI        1
BWA        1
HND        1
VGB        1
NAM        1
Name: country, Length: 177, dtype: int64

In [None]:
# Our most frequent value for 'Country'
df['country'].value_counts().index[0]

In [None]:
# Fill missing 'country' values with the most frequent country.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on `df['country']` mutates an intermediate object and is not guaranteed to
# update `df` itself (pandas warns about this chained in-place pattern).
most_frequent_country = df['country'].value_counts().index[0]
df['country'] = df['country'].fillna(most_frequent_country)

In [None]:
# Fill extra NaN with 0
df.fillna(0, inplace=True)

In [None]:
# Check if there are any remaining NaN values
print('Missing values in our df: ', df.isnull().sum().sum())

- Irrelevant data

Adults, Children & Babies cannot be 0 at the same time

In [None]:
df[df['children']==0].head()

In [None]:
filter1 = (df['children']==0) & (df['adults']==0) & (df['babies']==0)
display(df[filter1].head(), df[filter1].shape)

In [None]:
# Drop the rows flagged by `filter1` (no adults, no children and no babies) and
# keep the rest in a new DataFrame. .copy() makes `data` independent of `df`,
# so writes to `data` can never be mistaken for writes to a slice of `df`.
data = df[~filter1].copy()
print('Original data shape: ', df.shape)
print('New data shape: ', data.shape)

### Analyse Demand of Hotels

Where do the guests come from?

In [None]:
data['is_canceled'].unique()

0 is not canceled, 1 means canceled

In [None]:
# Not canceled data
data[data['is_canceled']==0].head()

In [None]:
len_not_canceled = len(data[data['is_canceled']==0])

In [None]:
# Get % of customers of each country
data[data['is_canceled']==0]['country'].value_counts()/len_not_canceled

In [None]:
country_wise_data = data[data['is_canceled']==0]['country'].value_counts().reset_index()
# Modify cols name
country_wise_data.columns=['country','no_of_guests']
country_wise_data

In [None]:
# pip install plotly
# pip install chart_studio

In [None]:
import plotly
import chart_studio.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected = True)

In [None]:
import plotly.express as px

In [None]:
map_guest = px.choropleth(country_wise_data,
                          locations=country_wise_data['country'],
                          color=country_wise_data['no_of_guests'],
                          hover_name = country_wise_data['country'],
                          title= 'home country of guests')

map_guest.show()

### Analyse Price of Hotel across year

How much do guests pay for a room per night?

- Avg/Mean (if there is no presence of outliers) Vs Median

- Quartiles (Q1 = 25th percentile, Q2 = 50th percentile = median, Q3 = 75th percentile) --> boxplot

In [None]:
# Keep only the bookings that were not canceled. .copy() gives an independent
# frame, so adding columns to `data2` later (e.g. 'weekend_or_weekday') does
# not raise SettingWithCopyWarning or write to a temporary object.
data2 = data[data['is_canceled'] == 0].copy()

In [None]:
data2.columns

In [None]:
plt.figure(figsize=(12,8))
sns.boxplot(x='reserved_room_type',y='adr', hue='hotel',data=data2)
plt.title('Price of room per night and person')
plt.xlabel('room types')
plt.ylabel('price (EUR)')

In [None]:
# Dots above the upper whisker = high outliers
# Dots below the lower whisker = low outliers
# Line inside the box = median (Q2)
# Bottom edge of the box = Q1
# Top edge of the box = Q3
# Whiskers (upper & lower fences) = most extreme points within 1.5 * IQR of the box (seaborn default)

City Hotel room type G has the highest median value, whereas it's type H room for Resort Hotel.

### Analyse Demand of Hotels

Which are the most busy months?

In [None]:
data['hotel'].unique()

In [None]:
# Distinct dfs for Resort and City hotel
data_resort= data[(data['hotel']=='Resort Hotel')&(data['is_canceled']==0)]
data_city= data[(data['hotel']=='City Hotel')&(data['is_canceled']==0)]

In [None]:
print('Resort Hotel data shape: ', data_resort.shape)
print('City Hotel data shape: ', data_city.shape)

In [None]:
# Resort
rush_resort = data_resort['arrival_date_month'].value_counts().reset_index()
rush_resort.columns=['month', 'no_of_guests']

# City
rush_city = data_city['arrival_date_month'].value_counts().reset_index()
rush_city.columns=['month', 'no_of_guests']

display(rush_resort, rush_city)

In [None]:
# Merge
final_rush = rush_resort.merge(rush_city, on='month')
final_rush.columns=['month', 'guests_resort', 'guests_city']
final_rush

In [None]:
# pip install sorted-months-weekdays
# pip install sort_dataframeby_monthorweek

In [None]:
import sort_dataframeby_monthorweek as sd

In [None]:
final_rush = sd.Sort_Dataframeby_Month(final_rush, 'month')
final_rush

In [None]:
final_rush.columns

In [None]:
px.line(data_frame=final_rush,
       x='month',
       y=['guests_resort', 'guests_city'],
       title = 'Number of guests by month')

### Which month has the highest avg daily rate (adr)?

In [None]:
data = sd.Sort_Dataframeby_Month(data, 'arrival_date_month')

In [None]:
sns.barplot(x='arrival_date_month', y='adr', data=data, hue='is_canceled')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.boxplot(x='arrival_date_month', y='adr', data=data, hue='is_canceled')
plt.xticks(rotation='vertical')
plt.show()

As there is an extreme outlier, let's limit the y-axis of the plot.

In [None]:
plt.figure(figsize=(12,8))
sns.boxplot(x='arrival_date_month', y='adr', data=data, hue='is_canceled')
plt.xticks(rotation='vertical')
plt.ylim(0,800) # new limit
plt.show()

### More Analysis

Analyse whether bookings were made only for weekdays, only for weekends, or for both.

In [None]:
pd.crosstab(index=data['stays_in_weekend_nights'],columns=data['stays_in_week_nights'])

Examples:
- 16436 guests have stayed 1 weeknight and 0 weekend nights.
- 6531 guests have stayed 1 weeknight and 2 weekend nights.

But what about the first cell 645 guests (0 nights) ?

In [None]:
def week_function(row):
    """Classify a booking by the kind of nights it covers.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'stays_in_weekend_nights' and
        'stays_in_week_nights' (e.g. a DataFrame row).

    Returns
    -------
    str
        'stay_just_weekend', 'stay_just_weekdays',
        'stay_both_weekdays_weekends', or 'undefined_data' when none of the
        three cases applies (e.g. zero nights of both kinds).
    """
    weekend_nights = row['stays_in_weekend_nights']
    week_nights = row['stays_in_week_nights']

    has_weekend = weekend_nights > 0
    has_week = week_nights > 0

    if has_week and has_weekend:
        return 'stay_both_weekdays_weekends'
    if has_weekend and week_nights == 0:
        return 'stay_just_weekend'
    if has_week and weekend_nights == 0:
        return 'stay_just_weekdays'
    # Neither count is positive, or one of them is not a plain non-negative number.
    return 'undefined_data'

In [None]:
data2['weekend_or_weekday']=data2.apply(week_function, axis=1)
data2.head()

In [None]:
data2['weekend_or_weekday'].value_counts()

In [None]:
data2 = sd.Sort_Dataframeby_Month(data2,'arrival_date_month')

In [None]:
# Won't be sorted as it's not a df
data2.groupby(['arrival_date_month', 'weekend_or_weekday']).size()

In [None]:
group_data = data2.groupby(['arrival_date_month', 'weekend_or_weekday']).size().unstack().reset_index()

In [None]:
sorted_data = sd.Sort_Dataframeby_Month(group_data,'arrival_date_month')
sorted_data

In [None]:
sorted_data.set_index('arrival_date_month', inplace=True)
sorted_data

In [None]:
sorted_data.plot(kind='bar', stacked=True, figsize=(15,10))

### Features for Machine Learning models

Create new features and drop redundant ones to improve our models

In [None]:
data2.columns

Example: "family": "adults" & ("children" or "babies")

In [None]:
def family(row):
    """Return 1 when the booking looks like a family, otherwise 0.

    A booking counts as a family when it has at least one adult together with
    at least one child or baby.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'adults', 'children' and 'babies'
        (e.g. a DataFrame row).

    Returns
    -------
    int
        1 if the booking is a family, 0 otherwise.
    """
    has_adult = row['adults'] > 0
    has_minor = row['children'] > 0 or row['babies'] > 0
    # Logical `and` (not bitwise `&`) is the right operator for scalar truth values.
    return 1 if has_adult and has_minor else 0

In [None]:
data['is_family'] = data.apply(family, axis=1)

In [None]:
data['total_customers'] = data['adults'] + data['children'] + data['babies']

In [None]:
# Total length of stay = weekend nights + week nights.
data['total_nights'] = data['stays_in_weekend_nights'] + data['stays_in_week_nights']

In [None]:
data.head(3)

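"deposit_type": encode it as a binary `deposit_given` flag. Only a non-refundable deposit counts as a deposit given; "No Deposit" and "Refundable" are both mapped to 0.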

In [None]:
data['deposit_type'].unique()

In [None]:
# Dictionary approach
dict1={'No Deposit': 0, 'Non Refund': 1, 'Refundable': 0}

In [None]:
# Map the dictionary
data['deposit_given'] = data['deposit_type'].map(dict1)

data['deposit_given']

In [None]:
data.columns

In [None]:
# Drop columns
data.drop(['adults', 'children', 'babies','deposit_type'], axis=1, inplace=True)

In [None]:
data.columns

In [None]:
data.head()

### Encoding categorical data

Mean encoding: 

In [None]:
# List comprehension to get categorical columns
cate_features =[col for col in data.columns if data[col].dtype == 'object']

cate_features

In [None]:
# Work on an independent copy of the categorical columns: the mean-encoding
# steps that follow add and overwrite columns on `data_cat`, which must not be
# a slice of `data` (that is what triggers SettingWithCopyWarning).
data_cat = data[cate_features].copy()
data_cat.head()

In [None]:
# List comprehension to get numerical columns
num_features =[col for col in data.columns if data[col].dtype != 'object']

data[num_features].head()

In [None]:
data.groupby(['hotel'])['is_canceled'].mean()

Mean encoding:

For each hotel, a mean of `is_canceled` below 0.5 means there are more 0s (not canceled) than 1s (canceled):

- City Hotel = count(1) < count(0)
- Resort Hotel = count(1) < count(0)

Now we can replace values for each corresponding mean.

In [None]:
import warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
data_cat['cancellation'] = data['is_canceled']

In [None]:
data_cat.head()

In [None]:
cols=data_cat.columns
cols

In [None]:
# Get all columns but 'cancellation'
cols=cols[0:-1]
cols

In [None]:
# Mean (target) encoding, hand-rolled instead of using a package: every
# category is replaced by the cancellation rate observed for that category.
# NOTE(review): the rates are computed on all rows, i.e. before any train/test
# split, so the encoded features carry information from rows later used for
# testing — consider fitting the encoding on the training rows only.
for col in cols:
    cancel_rate_by_category = data_cat.groupby(col)['cancellation'].mean()
    data_cat[col] = data_cat[col].map(cancel_rate_by_category)

In [None]:
data_cat.head()

### Handle Outliers

In [None]:
data[num_features].head()

In [None]:
# Merge cat & num dfs
dataframe = pd.concat([data_cat, data[num_features]], axis=1)
dataframe

In [None]:
# Remove duplicate info: 'is_canceled' and 'cancellations'
dataframe.drop(['cancellation'], axis=1, inplace=True)
dataframe.shape

In [None]:
# Normal Distribution / Gaussian --> Best for ML
# Left/Right skeweness --> low/high outliers --> logtransformation

In [None]:
sns.distplot(dataframe['lead_time'])

In [None]:
#log1p (issue with negative values)
def handle_outlier(col):
    """Replace column `col` of the global `dataframe` with log(1 + value).

    The transform is applied in place on the module-level `dataframe`, so
    calling this twice on the same column applies the logarithm twice.
    np.log1p is only defined for values greater than -1; smaller values
    become NaN.

    Parameters
    ----------
    col : str
        Name of the column in `dataframe` to transform.
    """
    transformed = np.log1p(dataframe[col])
    dataframe[col] = transformed

In [None]:
handle_outlier('lead_time')

In [None]:
sns.distplot(dataframe['lead_time'])

In [None]:
# ADR
sns.distplot(dataframe['adr'])

In [None]:
# Filter: negative values (adr)
dataframe[dataframe['adr']<0]

In [None]:
# Apply function
handle_outlier('adr')

In [None]:
# 1 missing value: negative value cannot be handled by a log operation.
dataframe['adr'].isnull().sum()

In [None]:
# Plot and drop negative value
sns.distplot(dataframe['adr'].dropna())

### Important features using correlation & univariate analysis

Univariate Distribution Example:
    
For 'lead_time':

    - find distribution when 'is_canceled' == 0
    - find distribution when 'is_canceled' == 1
    
Rule of thumb: the less overlapping between both distributions, the better for ML.

In [None]:
sns.FacetGrid(data, hue='is_canceled', xlim=(0,500)).map(sns.kdeplot, 'lead_time',shade=True).add_legend()

sns.FacetGrid(data, hue='is_canceled', xlim=(0, 500)):

    - data: The input DataFrame or data source.
    - hue: Categorical variable in the data that will be used for color-mapping. In this case, it is used to distinguish between different levels of the 'is_canceled' variable.
    - xlim: A tuple specifying the limits of the x-axis in the plot. It sets the minimum and maximum values displayed on the x-axis, in this case, from 0 to 500.


.map(sns.kdeplot, 'lead_time', shade=True):

    - .map(): This function is used to apply the sns.kdeplot function to the data in the FacetGrid.
    - sns.kdeplot: This is the kernel density estimation plot function, which estimates the probability density function of a continuous random variable (in this case, 'lead_time').
    - 'lead_time': The column name in the data DataFrame that will be plotted on the x-axis.
    - shade=True: This parameter enables shading under the KDE curve, creating a filled area that helps visualize the distribution better.
    - .add_legend(): This function adds a legend to the plot, which helps interpret the different hues (colors) used to represent the levels of the 'is_canceled' variable.



So, the code creates a FacetGrid plot using seaborn, where the 'lead_time' column from the data DataFrame is represented by a kernel density estimation plot. The plot is colored based on the 'is_canceled' variable. The x-axis limits are set from 0 to 500, and the KDE curve is shaded. Finally, a legend is added to explain the colors used for different cancellation levels.

Up to some extent, this feature will play an important role.

Important features using correlation

In [None]:
corr = dataframe.corr()
corr

In [None]:
corr['is_canceled'].sort_values(ascending=False)

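A feature whose correlation with the target is close to 1 or -1 (here `reservation_status`, with a correlation of 1.000) effectively gives away the answer and must be dropped, otherwise the model's scores will be unrealistically high. Features whose correlation with the target is close to 0 carry almost no linear signal, so we drop those as well.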

    - reservation_status: 1.000
    - arrival_date_year: 0.016
    - arrival_date_week_number: 0.008
    - stays_in_weekend_nights: -0.001
    - arrival_date_day_of_month: -0.005
    
Since we drop 'reservation_status', we can also drop 'reservation_status_date'.

'is_canceled' will be our target feature.

In [None]:
corr['is_canceled'].sort_values(ascending=False).index

In [None]:
features_to_drop = ['reservation_status', 'reservation_status_date',
                    'arrival_date_year', 'arrival_date_week_number',
                    'stays_in_weekend_nights','arrival_date_day_of_month']

In [None]:
dataframe.drop(features_to_drop, axis=1, inplace=True)

In [None]:
# 24 features
dataframe.shape

### Techniques of Feature Importance (for model building)

Feature selection

In [None]:
dataframe.head(2)

In [None]:
# Check null values first
dataframe.isnull().sum()

In [None]:
# Drop null values
dataframe.dropna(inplace=True)

In [None]:
# Drop target feature
x = dataframe.drop('is_canceled', axis=1) # do not update, 'inplace=True'

In [None]:
y= dataframe['is_canceled']

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [None]:
# Initialize Lasso
## Lasso(alpha=0.005)
# penalty parameter (the bigger the alpha value, the fewer features this model will select)

In [None]:
feature_sel_model = SelectFromModel(Lasso(alpha=0.005))

In [None]:
feature_sel_model.fit(x,y)

In [None]:
feature_sel_model.get_support()

    - False = not selected
    - True = selected

In [None]:
cols = x.columns
cols

In [None]:
# Filter: selected features
selected_feature = cols[feature_sel_model.get_support()]
selected_feature

In [None]:
x = x[selected_feature]
x

In [None]:
# Those features from X will be the ones used to predict 'is_canceled' (y)
y

### Building Machine Learning Model: Logistic Regression

Regression, **Classification**, Clustering model


For Classification cases:

    - Logistic Regression
    - Random Forest
    - Decision Tree
    - KNN
    - Boosting

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for testing. A fixed random_state makes the split,
# and therefore every score computed from it, reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [None]:
X_train.shape

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Initialize Logistic Regression Model
logreg=LogisticRegression()

In [None]:
# Fit the model with train data
logreg.fit(X_train, y_train)

In [None]:
# Predict array
pred=logreg.predict(X_test)
pred

#### Evaluate: Confusion Matrix, Accuracy Score, Classification Report...

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Confusion Matrix
confusion_matrix(y_test, pred)

    - Rows are the actual classes, columns are the predicted classes (order: 0, 1)
    - Top Left: True Negative
    - Top Right: False Positive
    - Bottom Left: False Negative
    - Bottom Right: True Positive

In [None]:
# Accuracy score
acsc= accuracy_score(y_test, pred)
print('Accuracy score: ', round(acsc,3))

### Cross Validation

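Cross validation gives a more reliable estimate of a model's accuracy than a single train/test split; it does not increase accuracy by itself. Related techniques: Kfold, GridSearch, RandomizedSearch, Genetic Algo...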

Example (specific random_state?):
    
    - Cross Validation (CV) = 5
        - CV of data 5 times, imagine 1000 records;
            - 1 CV (last 20% of data as test / rest 80% train)
            - 2 CV (data from 60 to 80% as test / 0 to 60 and 80 to 100% as train)
            - 3 CV (data from 40 to 60% as test / 0 to 40 and 60 to 100% as train)
            - 4 CV (data from 20 to 40 as test / 0 to 20 and 40 to 100% as train)
            - 5 CV (first 20% of data as test / rest 80% train)

Accuracy: 
    - Mean of {1 CV, 2 CV... 5 CV} --> final accuracy --> Cross Validate

ML Algorithm (both reg/class). Ex Random Forest Regressor/Classifier.

Parameters (default):

    - n_estimators
    - max_features
    - max_depth
    
How to optimize our hyperparameters / model?

GridSearchCV:

    - Define Dict:
        'n_estimators' = [100, 200, 300]
        'max_features' = ['auto', log2, sqrt]
        'max_depth' = [.   ]
        
    - Then pass the Dict to GridSearchCV()
        - It will perform each and every combination determined in the Dict
        - It will return the best model / parameters (highest accuracy)
        
 RandomizedSearchCV:
 
     - It will pick random parameters and check which has higher accuracy

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
score = cross_val_score(logreg, x, y, cv=10)
score

In [None]:
score.mean()

Decision Tree:

    - From the decision tree, which feature will be selected as the main "road"
    - Entropy, Information Gain
    - Gini (?) Index, Impurity
    - Pre-pruning & post-pruning --> if overfitting: high accuracy on train data, low on test data
 
Entropy:

    - How random the data is?
    - Probability of occurence
    - Impurity
    
Information gain:

    - Based on entropy, which feature will provide the highest info gain (parent node)?
    
Gini (Impurity) Index:

    - ...

### Applying multiple algorithms and check on accuracy

In [None]:
score.mean()

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# (name, estimator) pairs to compare. The tree-based estimators are seeded so
# that their scores are reproducible from run to run.
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Naive bayes', GaussianNB()),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('KNN', KNeighborsClassifier()),
]

In [None]:
models

In [None]:
# Imported once per cell rather than on every loop iteration.
from sklearn.metrics import confusion_matrix, accuracy_score

# Fit each candidate on the training set and report its confusion matrix and
# accuracy on the held-out test set.
for name, model in models:
    print(name)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    # scikit-learn's argument order is (y_true, y_pred): rows are the actual
    # classes and columns the predicted ones. Swapping them transposes the matrix.
    print(confusion_matrix(y_test, predictions))
    print('\n')
    print(accuracy_score(y_test, predictions))
    print('\n')

In [None]:
# Fit every candidate model, score it on the held-out test set and collect one
# record per model for a side-by-side comparison table.
columns = ['Model', 'True Positive', 'False Positive', 'False Negative', 'True Negative', 'Accuracy Score']
results = []

for name, model in models:
    # fit() returns the fitted estimator, so fitting and predicting can be chained.
    predictions = model.fit(X_train, y_train).predict(X_test)
    # For a binary target, ravel() yields the cells in the order tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    accuracy = accuracy_score(y_test, predictions)
    results.append([name, tp, fp, fn, tn, accuracy])

results_df = pd.DataFrame(results, columns=columns)

results_df

In [None]:
# Highlight the best model in the comparison table built in the previous cell.
# Reusing `results_df` avoids fitting every model yet again; a re-fit of an
# unseeded estimator could also produce numbers that differ from the table
# shown above.
best_idx = results_df['Accuracy Score'].idxmax()
highest_accuracy_row = results_df.loc[best_idx]


def _highlight_best(row):
    """Return a purple background for every cell of the best-scoring row."""
    # `row.name` is the row's index label, so this matches exactly one row.
    style = 'background: purple' if row.name == best_idx else ''
    return [style] * len(row)


highlighted_results_df = results_df.style.apply(_highlight_best, axis=1)

highlighted_results_df