In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import preprocessing

In [None]:
# Load the hotel bookings dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="hotel_bookings.csv")

In [None]:
# Quick sanity check: dimensions, fully duplicated rows, and nulls per column
print("Shape:", df.shape)
print("Number of duplicated rows:", df.duplicated().sum())
print("Number of Null value:\n", df.isna().sum())

In [None]:
# Keep only the first occurrence of every fully duplicated row
df = df[~df.duplicated()]

In [None]:
# Replace missing values in the agent, company and children columns with 0
fill = {"agent": 0, "company": 0}
df = df.fillna({**fill, "children": 0})

In [None]:
# Re-check the remaining nulls per column after filling
print("Number of Null value:\n", df.isna().sum())

In [None]:
# Discard rows whose country is missing
df = df[df['country'].notna()]

In [None]:
# Column names of the dataset as a plain Python list
list(df.columns)

In [None]:
# Feature set for Model 1: drop unneeded columns but keep reservation_status
drop_cols1 = [
    'arrival_date_year', 'arrival_date_month',
    'arrival_date_week_number', 'arrival_date_day_of_month',
    'agent', 'company',
]

data1 = df.drop(columns=drop_cols1)

In [None]:
# Feature set for Model 2: same columns dropped as for Model 1, plus reservation_status
drop_cols2 = [
    'arrival_date_year', 'arrival_date_month',
    'arrival_date_week_number', 'arrival_date_day_of_month',
    'agent', 'company',
    'reservation_status',
]

data2 = df.drop(columns=drop_cols2)

In [None]:
# Show the first 5 rows
df.head(n=5)

In [None]:
# Show the last 5 rows
df.tail(n=5)

In [None]:
# Descriptive statistics of the cleaned DataFrame (pandas default settings)
df.describe()

In [None]:
# Concise summary of the DataFrame: columns, non-null counts and dtypes
df.info()

In [None]:
# Reservation status (canceled vs. not canceled) for each hotel type
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(8, 8), facecolor="azure")

# Map the 0/1 flag to readable labels BEFORE plotting so each legend entry is
# tied to its hue level explicitly, instead of relabelling by position with
# plt.legend([...]) after the fact (order-dependent and easy to mix up).
status_order = ['Not Canceled', 'Canceled']
booking_status = (
    df['is_canceled']
    .map({0: 'Not Canceled', 1: 'Canceled'})
    .rename('Status')
)
sns.countplot(x=df['hotel'], hue=booking_status, hue_order=status_order, ax=ax)

ax.set_title('Reservation Status in Different Hotels', fontsize=18)
ax.set_xlabel('Hotel')
ax.set_ylabel('Number of Reservations')
ax.tick_params(axis='both', labelsize=12)

# seaborn normally uses the hue Series name as the legend title; set it
# explicitly as well so the title does not depend on that behaviour.
legend = ax.get_legend()
if legend is not None:
    legend.set_title('Status')

plt.tight_layout()
plt.show()

In [None]:
# Top 10 countries by number of reservations
country_count = df['country'].value_counts().iloc[:10]

# Readable names for the country codes; codes without an entry are kept unchanged
country_name = {
    "PRT": "Portugal",
    "GBR": "United Kingdom",
    "FRA": "France",
    "ESP": "Spain",
    "DEU": "Germany",
    "ITA": "Italy",
    "IRL": "Ireland",
    "BEL": "Belgium",
    "BRA": "Brazil",
    "NLD": "Netherlands",
}
country_count = country_count.rename(index=lambda code: country_name.get(code, code))

fig, ax = plt.subplots(figsize=(10, 8), facecolor="azure")
sns.barplot(x=country_count.index, y=country_count.values, ax=ax)
ax.set_title("Top 10 Countries by Number of Reservations", fontsize=18)
ax.set_xlabel("Countries")
ax.set_ylabel("Number of Reservations")
plt.tight_layout()
plt.show()

In [None]:
# Top 10 countries by number of reservation cancellations
# Keep only the canceled reservations
cancelled_data = df.loc[df['is_canceled'].eq(1)]

# The ten countries with the most cancellations
top_10_country = cancelled_data['country'].value_counts().iloc[:10]

fig, ax = plt.subplots(figsize=(8, 8), facecolor="lightblue")
ax.set_title('Top 10 Countries with Reservation Cancellations', fontsize=18)

# Pie chart: each slice is a country's share among these ten countries
ax.pie(top_10_country, labels=top_10_country.index, autopct='%.2f%%')

plt.tight_layout()
plt.show()

In [None]:
# Number of monthly customers in each hotel
# columns used - arrival_date_month, arrival_date_year, hotel
monthly_customer = (
    df.groupby(["hotel", "arrival_date_year", "arrival_date_month"])
    .size()
    .reset_index(name="customer_count")
    .assign(
        # Short month label, e.g. "Jul"
        month_str=lambda t: t["arrival_date_month"].str[:3],
        # Axis label, e.g. "Jul 2015"
        month_year_label=lambda t: t["month_str"] + " " + t["arrival_date_year"].astype(str),
        # Real datetime used only to order the months chronologically
        sort_date=lambda t: pd.to_datetime(
            t["arrival_date_year"].astype(str) + "-" + t["arrival_date_month"],
            format="%Y-%B",
        ),
    )
    .sort_values("sort_date")
)

# One bar chart per hotel type
for hotel_name in ("City Hotel", "Resort Hotel"):
    hotel_rows = monthly_customer[monthly_customer["hotel"] == hotel_name]
    fig, ax = plt.subplots(figsize=(14, 6), facecolor="azure")
    sns.barplot(data=hotel_rows, x="month_year_label", y="customer_count", ax=ax)
    ax.set_title(f"{hotel_name} Monthly Customers", fontsize=18)
    ax.set_xlabel("Date")
    ax.set_ylabel("Customer Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Distribution of the Average Daily Rate (ADR)
fig, ax = plt.subplots(figsize=(8, 8), facecolor='azure')

sns.histplot(df['adr'], kde=True, color='skyblue', ax=ax)
ax.set_title('Histogram of Average Daily Rate', fontsize=18)
ax.set_xlabel("Average Daily Rate")
ax.set_ylabel("Frequence")
# Restrict the visible x-range to 0-500
ax.set_xlim(0, 500)
plt.tight_layout()
plt.show()

In [None]:
# Boxplot of ADR to spot outliers
fig, ax = plt.subplots(figsize=(8, 8), facecolor='azure')

sns.boxplot(x=df['adr'], color="lightblue", ax=ax)
ax.set_title("Boxplot of Average Daily Rate", fontsize=18)
ax.set_xlabel("Average Daily Rate")
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation matrix of the numeric columns
numeric_columns = df.select_dtypes(include="number")
numeric_data = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(15, 8), facecolor='azure')
sns.heatmap(numeric_data, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap Between Numeric Variables', fontsize=18)
plt.tight_layout()
plt.show()

In [None]:
# Line chart of the overall monthly reservation trend (both hotels combined)
#
# monthly_customer holds one row per hotel per month. Plotting it directly makes
# seaborn aggregate the two hotel rows for each month into their mean and draw a
# confidence band around it, which is not the overall reservation count.
# Sum across hotels first so each month contributes exactly one total.
monthly_total = (
    monthly_customer
    .groupby(["sort_date", "month_year_label"], as_index=False)["customer_count"]
    .sum()
    .sort_values("sort_date")
)

fig, ax = plt.subplots(figsize=(14, 6), facecolor="azure")
# sort=False keeps the chronological row order established above instead of
# letting seaborn reorder the string labels.
sns.lineplot(data=monthly_total, x="month_year_label", y="customer_count", sort=False, ax=ax)
ax.set_title("Customer Reservation Trend", fontsize=18)
ax.set_xlabel("Date")
ax.set_ylabel("Customer Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Model 1 With Reservation Status

In [None]:
# Pick out the object-dtype (categorical) columns of data1
df_categorical1 = data1.select_dtypes(include='object')
df_categorical1.head(n=5)

In [None]:
# Label-encode every categorical column.
# The single encoder is re-fitted for each column, so afterwards it only
# remembers the classes of the last column processed.
enc = preprocessing.LabelEncoder()
df_categorical1 = pd.DataFrame(
    {column: enc.fit_transform(df_categorical1[column]) for column in df_categorical1.columns},
    index=df_categorical1.index,
)
df_categorical1.head()


In [None]:
# Swap the original categorical columns for their label-encoded versions
non_categorical1 = data1.drop(columns=df_categorical1.columns)
data1 = pd.concat([non_categorical1, df_categorical1], axis=1)
data1.head()

In [None]:
# Features are every column except the target; the target is is_canceled.
# NOTE(review): data1 still contains reservation_status, which presumably
# encodes the booking outcome itself - confirm this leakage is intended
# for the "with reservation status" comparison.
X1 = data1.drop(columns=['is_canceled'])
y1 = data1.loc[:, 'is_canceled']

In [None]:
# Hold out 30% of the rows for testing (70% train), with a fixed seed.
# Note: the Model 2 section later rebinds these same four names.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.3,
    random_state=42,
)

In [None]:
# 1. Logistic Regression
# Build the classifier and fit it on the training split in one step
log1 = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
log1

In [None]:
# Class predictions of the logistic regression model on the held-out test split
log1_pred = log1.predict(X=X_test)

In [None]:
# Performance metrics for Logistic Regression, rounded to 3 decimals
print("Logistic Regression Matrices:")
for label, scorer in (
    ("Accuracy Score:", metrics.accuracy_score),
    ('Precission_score:', metrics.precision_score),
    ('Recall score:', metrics.recall_score),
    ('F1-score:', metrics.f1_score),
):
    print(label, round(scorer(y_test, log1_pred), 3))

In [None]:
# 2. Random Forest Classifier
# Build the classifier (fixed seed) and fit it on the training split
rand1 = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rand1

In [None]:
# Class predictions of the random forest model on the held-out test split
rand1_pred = rand1.predict(X=X_test)

In [None]:
# Performance metrics for the Random Forest Classifier, rounded to 3 decimals
print("Random Forest Classifier Matrices:")
for label, scorer in (
    ("Accuracy Score:", metrics.accuracy_score),
    ('Precission_score:', metrics.precision_score),
    ('Recall score:', metrics.recall_score),
    ('F1-score:', metrics.f1_score),
):
    print(label, round(scorer(y_test, rand1_pred), 3))

In [None]:
# Confusion matrices of both Model 1 classifiers, side by side
log1_cm = metrics.confusion_matrix(y_test, log1_pred)
rand1_cm = metrics.confusion_matrix(y_test, rand1_pred)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), facecolor='azure')

# (axes, matrix, title, draw colour bar?) - only the left panel gets a colour bar
panels = (
    (ax1, log1_cm, 'Logistic Regression Confusion Matrix', True),
    (ax2, rand1_cm, 'Random Forest Classifier Confusion Matrix', False),
)
for axis, matrix, heading, show_cbar in panels:
    sns.heatmap(matrix, annot=True, fmt='d', ax=axis, cmap='Blues', cbar=show_cbar)
    axis.set_title(heading)
    axis.set_xlabel('Predicted')
    axis.set_ylabel('Actual')

plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap over all columns of data1 (categoricals are label-encoded)
corr_matrix1 = data1.corr().round(2)

fig, ax = plt.subplots(figsize=(15, 8), facecolor='azure')
sns.heatmap(corr_matrix1, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap Between Numeric Variables', fontsize=18)
plt.tight_layout()
plt.show()

# Model 2 Without Reservation Status

In [None]:
# Pick out the object-dtype (categorical) columns of data2
df_categorical2 = data2.select_dtypes(include='object')
df_categorical2.head(n=5)

In [None]:
# Label-encode every categorical column, reusing the encoder object created
# for Model 1 (it is re-fitted on each column in turn)
df_categorical2 = pd.DataFrame(
    {column: enc.fit_transform(df_categorical2[column]) for column in df_categorical2.columns},
    index=df_categorical2.index,
)
df_categorical2.head()

In [None]:
# Swap the original categorical columns for their label-encoded versions
non_categorical2 = data2.drop(columns=df_categorical2.columns)
data2 = pd.concat([non_categorical2, df_categorical2], axis=1)
data2.head()

In [None]:
# Features are every column except the target; the target is is_canceled
X2 = data2.drop(columns=['is_canceled'])
y2 = data2.loc[:, 'is_canceled']

In [None]:
# Hold out 30% of the rows for testing (70% train), with a fixed seed.
# Note: this rebinds the split variables that Model 1 used earlier.
X_train, X_test, y_train, y_test = train_test_split(
    X2,
    y2,
    test_size=0.3,
    random_state=42,
)

In [None]:
# 1. Logistic Regression
# Build the classifier and fit it on the training split in one step
log2 = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
log2

In [None]:
# Class predictions of the logistic regression model on the held-out test split
log2_pred = log2.predict(X=X_test)

In [None]:
# Performance metrics for Logistic Regression, rounded to 3 decimals
print("Logistic Regression Matrices:")
for label, scorer in (
    ("Accuracy Score:", metrics.accuracy_score),
    ('Precission_score:', metrics.precision_score),
    ('Recall score:', metrics.recall_score),
    ('F1-score:', metrics.f1_score),
):
    print(label, round(scorer(y_test, log2_pred), 3))

In [None]:
# 2. Random Forest Classifier
# Build the classifier (fixed seed) and fit it on the training split
rand2 = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rand2

In [None]:
# Class predictions of the random forest model on the held-out test split
rand2_pred = rand2.predict(X=X_test)

In [None]:
# Performance metrics for the Random Forest Classifier, rounded to 3 decimals
print("Random Forest Classifier Matrices:")
for label, scorer in (
    ("Accuracy Score:", metrics.accuracy_score),
    ('Precission_score:', metrics.precision_score),
    ('Recall score:', metrics.recall_score),
    ('F1-score:', metrics.f1_score),
):
    print(label, round(scorer(y_test, rand2_pred), 3))

In [None]:
# Confusion matrices of both Model 2 classifiers, side by side
log2_cm = metrics.confusion_matrix(y_test, log2_pred)
rand2_cm = metrics.confusion_matrix(y_test, rand2_pred)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), facecolor='azure')

# (axes, matrix, title, draw colour bar?) - only the left panel gets a colour bar
panels = (
    (ax1, log2_cm, 'Logistic Regression Confusion Matrix', True),
    (ax2, rand2_cm, 'Random Forest Classifier Confusion Matrix', False),
)
for axis, matrix, heading, show_cbar in panels:
    sns.heatmap(matrix, annot=True, fmt='d', ax=axis, cmap='Blues', cbar=show_cbar)
    axis.set_title(heading)
    axis.set_xlabel('Predicted')
    axis.set_ylabel('Actual')

plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap over all columns of data2 (categoricals are label-encoded)
corr_matrix2 = data2.corr().round(2)

fig, ax = plt.subplots(figsize=(15, 8), facecolor='azure')
sns.heatmap(corr_matrix2, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap Between Numeric Variables', fontsize=18)
plt.tight_layout()
plt.show()