In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the hotel bookings CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="hotel_bookings.csv")

In [None]:
# Quick sanity check: dimensions, fully duplicated rows, and missing values per column.
print("Shape:", df.shape)
print("Number of duplicated rows:", df.duplicated().sum())
print("Number of Null value:\n", df.isna().sum())

In [None]:
# Keep only the first occurrence of each fully duplicated row.
df = df[~df.duplicated()]

In [None]:
# Replace missing values in the children column with 0.
df["children"] = df["children"].fillna(value=0)

In [None]:
# Drop rows whose country is missing.
# `subset` is passed as a list: a bare string label is only accepted by newer
# pandas releases, while a list of labels works on every version.
df = df.dropna(subset=["country"])

In [None]:
# Show every column name of the dataset as a plain Python list.
df.columns.tolist()

In [None]:
# Preview the first 5 rows.
df.head(n=5)

In [None]:
# Preview the last 5 rows.
df.tail(n=5)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Print a concise summary of the DataFrame: index range, column names,
# non-null counts, dtypes, and memory usage.
df.info()

In [None]:
# Reservation status (canceled vs. not canceled) for each hotel type.
sns.set_style("whitegrid")  # applied before the axes are created so the grid style takes effect
fig, ax = plt.subplots(figsize=(8, 8), facecolor="azure")

# hue_order pins the bar order (0 = not canceled, 1 = canceled) so the legend
# labels assigned below are guaranteed to line up with the right bars.
sns.countplot(x='hotel', hue='is_canceled', hue_order=[0, 1], data=df, ax=ax)

ax.set_title('Reservation Status in Different Hotels', fontsize=18)
ax.set_xlabel('Hotel')
ax.set_ylabel('Number of Reservations')
ax.tick_params(axis='both', labelsize=12)

# Relabel the legend with readable names, reusing the handles seaborn created
# instead of passing bare labels and relying on the implicit artist order.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles, ['Not Canceled', 'Canceled'], title='Status')

fig.tight_layout()
plt.show()

In [None]:
# Top 10 countries by number of reservations.
country_count = df["country"].value_counts().head(10)

# Three-letter country codes mapped to readable names; codes that are not
# listed here are left unchanged.
country_name = {
    "PRT": "Portugal",
    "GBR": "United Kingdom",
    "FRA": "France",
    "ESP": "Spain",
    "DEU": "Germany",
    "ITA": "Italy",
    "IRL": "Ireland",
    "BEL": "Belgium",
    "BRA": "Brazil",
    "NLD": "Netherlands",
}
country_count = country_count.rename(index=country_name)

fig, ax = plt.subplots(figsize=(10, 8), facecolor="azure")
sns.barplot(x=country_count.index, y=country_count.to_numpy(), ax=ax)
ax.set_title("Top 10 Countries by Number of Reservations", fontsize=18)
ax.set_xlabel("Countries")
ax.set_ylabel("Number of Reservations")
fig.tight_layout()
plt.show()

In [None]:
# Top 10 countries by number of canceled reservations.
# Keep only the bookings flagged as canceled.
cancelled_data = df.loc[df["is_canceled"].eq(1)]

# The ten most frequent countries among the canceled bookings.
top_10_country = cancelled_data["country"].value_counts().head(10)

fig, ax = plt.subplots(figsize=(8, 8), facecolor="lightblue")
ax.set_title('Top 10 Countries with Reservation Cancellations', fontsize=18)

# Pie chart: each slice is a country's share of the top-10 total, not of all
# cancellations.
ax.pie(top_10_country, labels=top_10_country.index, autopct='%.2f%%')

fig.tight_layout()
plt.show()

In [None]:
# Number of bookings per month for each hotel.
# Columns used: hotel, arrival_date_year, arrival_date_month.
monthly_customer = (
    df.groupby(["hotel", "arrival_date_year", "arrival_date_month"])
    .size()
    .reset_index(name="customer_count")
)

# Short axis label: first three letters of the month name followed by the year.
monthly_customer["month_str"] = monthly_customer["arrival_date_month"].str[:3]
monthly_customer["month_year_label"] = (
    monthly_customer["month_str"] + " " + monthly_customer["arrival_date_year"].astype(str)
)

# Real datetime built from year + full month name, used only to order the rows
# chronologically (the text labels would otherwise not sort by date).
monthly_customer["sort_date"] = pd.to_datetime(
    monthly_customer["arrival_date_year"].astype(str) + "-" + monthly_customer["arrival_date_month"],
    format="%Y-%B",
)
monthly_customer = monthly_customer.sort_values("sort_date")


def plot_monthly_customers(monthly_data, hotel_name):
    """Draw a bar chart of the monthly booking counts of one hotel.

    Parameters
    ----------
    monthly_data : pandas.DataFrame
        Frame with the columns ``hotel``, ``month_year_label`` and
        ``customer_count``; rows are plotted in the order they appear.
    hotel_name : str
        Value of the ``hotel`` column to plot; also used in the chart title.
    """
    hotel_rows = monthly_data[monthly_data["hotel"] == hotel_name]

    fig, ax = plt.subplots(figsize=(14, 6), facecolor="azure")
    sns.barplot(data=hotel_rows, x="month_year_label", y="customer_count", ax=ax)
    ax.set_title(f"{hotel_name} Monthly Customers", fontsize=18)
    ax.set_xlabel("Date")
    ax.set_ylabel("Customer Count")
    ax.tick_params(axis="x", labelrotation=45)
    fig.tight_layout()
    plt.show()


# One chart per hotel, sharing a single plotting routine.
for hotel_name in ("City Hotel", "Resort Hotel"):
    plot_monthly_customers(monthly_customer, hotel_name)

In [None]:
# Histogram (with KDE curve) of the Average Daily Rate (ADR).
fig, ax = plt.subplots(figsize=(8, 8), facecolor='azure')

sns.histplot(df['adr'], kde=True, color='skyblue', ax=ax)
ax.set_title('Histogram of Average Daily Rate', fontsize=18)
ax.set_xlabel("Average Daily Rate")
ax.set_ylabel("Frequency")  # axis label previously misspelled as "Frequence"
# Only the 0-500 range is displayed; values outside it are not shown.
ax.set_xlim(0, 500)
fig.tight_layout()
plt.show()

In [None]:
# Boxplot of the Average Daily Rate, used to spot outliers.
fig, ax = plt.subplots(figsize=(8, 8), facecolor='azure')

sns.boxplot(data=df, x='adr', color="lightblue", ax=ax)
ax.set_title("Boxplot of Average Daily Rate", fontsize=18)
ax.set_xlabel("Average Daily Rate")
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the numeric columns.
# Note: despite its name, numeric_data holds the correlation matrix itself.
numeric_columns = df.select_dtypes(include=np.number)
numeric_data = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(15, 8), facecolor='azure')
sns.heatmap(numeric_data, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap Between Numeric Variables', fontsize=18)
fig.tight_layout()
plt.show()

In [None]:
# Line chart of total reservations per month across all hotels.
# monthly_customer holds one row per hotel and month, so plotting it directly
# would make seaborn average the hotels and draw a confidence band around that
# mean. Sum the hotels first so every month is a single, exact total.
monthly_total = (
    monthly_customer
    .groupby(["sort_date", "month_year_label"], as_index=False)["customer_count"]
    .sum()
    .sort_values("sort_date")
)

fig, ax = plt.subplots(figsize=(14, 6), facecolor="azure")
# estimator=None / sort=False: draw the rows exactly as given, in chronological order.
sns.lineplot(
    data=monthly_total,
    x="month_year_label",
    y="customer_count",
    estimator=None,
    sort=False,
    ax=ax,
)
ax.set_title("Customer Reservation Trend", fontsize=18)
ax.set_xlabel("Date")
ax.set_ylabel("Customer Count")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()