#### https://www.kaggle.com/marcuswingen/eda-of-bookings-and-ml-to-predict-cancelations

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import warnings
warnings.simplefilter(action="ignore")

In [None]:
# Load the raw bookings CSV into `df` and preview the first rows.
# NOTE(review): this is an absolute, machine-specific Windows path (it also mixes
# "\" and "/" separators); adjust it before running on another machine.
df=pd.read_csv(r'C:\Users\HP\Downloads\Project 2 Hotel Booking Data Analysis/hotel_bookings.csv')
df.head()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.isnull().values.any()

In [None]:
df.isnull().sum()

In [None]:
# Replace missing values:
# agent: If no agency is given, booking was most likely made without one.
# company: If none given, it was most likely private.

In [None]:
# `country` is used further down as a label column (pie chart, choropleth
# locations). Give its missing entries (if any) a textual placeholder rather
# than the integer 0, which would mix an int into the country labels.
df['country'] = df['country'].fillna('Unknown')
# Every remaining missing value (e.g. agent, company) is treated as "none" -> 0.
df.fillna(0,inplace=True)

In [None]:
df.isnull().sum()

In [None]:
df['meal'].value_counts()

In [None]:
df['children'].unique()

In [None]:
df['adults'].unique()

In [None]:
df['babies'].unique()

In [None]:
### The data seems to contain some dirty rows: adults, babies and children cannot all be zero at the same time

In [None]:
df.shape

In [None]:
len(df[df['adults']==0])

In [None]:
# Boolean mask of bookings that list no children, no adults and no babies.
guest_columns = ['children', 'adults', 'babies']
filter = df[guest_columns].eq(0).all(axis=1)
df[filter]

In [None]:
### Visualise Entire Dataframe where adult,children & babies are 0

In [None]:
pd.set_option('display.max_columns',32)

In [None]:
# Rebuild the "no guests at all" mask (same expression as in the earlier cell)
# and display the matching rows again, now that display.max_columns was raised.
# NOTE(review): the name `filter` shadows the Python builtin of the same name.
filter=(df['children']==0) & (df['adults']==0) & (df['babies']==0)
df[filter]

In [None]:
# Keep only the rows NOT matched by the mask, i.e. bookings with at least one guest.
data=df[~filter]

In [None]:
data.shape

In [None]:
data.head()

#### After cleaning, separate Resort and City hotel

In [None]:
# Split the non-canceled bookings by hotel type.
not_canceled = data["is_canceled"] == 0
resort = data[not_canceled & (data["hotel"] == "Resort Hotel")]
city = data[not_canceled & (data["hotel"] == "City Hotel")]

In [None]:
resort.shape

In [None]:
city.shape

## Where do the guests come from?

In [None]:
resort.head()

In [None]:
import plotly.graph_objs as go
from plotly.offline import iplot
import plotly.express as px

In [None]:
# Count resort bookings per country once and reuse the result for both
# the pie labels (country codes) and the pie values (booking counts).
resort_country_counts = resort['country'].value_counts()
labels = resort_country_counts.index
values = resort_country_counts

In [None]:
# Pie trace: slices show the raw count, hovering shows label and percentage.
pie_options = dict(hoverinfo='label+percent', textinfo='value')
trace = go.Pie(labels=labels, values=values, **pie_options)

In [None]:
iplot([trace])

## Home country of Guests, perform Spatial Analysis

In [None]:
# Number of non-canceled bookings per home country, as a two-column frame.
guest_countries = data.loc[data['is_canceled'] == 0, 'country']
country_wise_data = guest_countries.value_counts().reset_index()
country_wise_data.columns = ['country', 'No of guests']
country_wise_data

In [None]:
import folium
from folium.plugins import HeatMap

In [None]:
basemap=folium.Map()

In [None]:
country_wise_data.dtypes

In [None]:
# World map coloured by the number of guests per home country.
# Columns are referenced by name; plotly express looks them up in the frame.
map_guest = px.choropleth(
    country_wise_data,
    locations='country',
    color='No of guests',
    hover_name='country',
    title="Home country of guests",
)
map_guest.show()

#### People from all over the world are staying in these two hotels. Most guests are from Portugal and other countries in Europe

## How much do guests pay for a room per night?

In [None]:
data.head()

#### Both hotels have different room types and different meal arrangements. Seasonal factors are also important. So the prices vary a lot. Since no currency information is given, but Portugal is part of the European Monetary Union, I assume that all prices are in EUR.

In [None]:
data2=data[data['is_canceled']==0]

In [None]:
# Boxplot of the nightly rate (adr) per reserved room type, split by hotel,
# using only the bookings that were not canceled (data2).
plt.figure(figsize=(12, 8))
sns.boxplot(x="reserved_room_type",
            y="adr",
            hue="hotel",
            data=data2)
# adr is plotted as-is (it is never divided by the number of guests), so the
# title must not claim a per-person price.
plt.title("Price of room types per night", fontsize=16)
plt.xlabel("Room type", fontsize=16)
plt.ylabel("Price [EUR]", fontsize=16)
plt.legend(loc="upper right")
# Limit the visible y-range; points above 600 are not shown.
plt.ylim(0, 600)
plt.show()

#### This figure shows the distribution of the price per room (median, quartiles and outliers), depending on its type. Note that due to data anonymization rooms with the same type letter may not necessarily be the same across hotels.

## How does the price per night vary over the year?

In [None]:
data_resort=resort[resort['is_canceled']==0]

In [None]:
data_city=city[city['is_canceled']==0]

In [None]:
data_resort.head()

In [None]:
# Mean adr per arrival month for the resort hotel, month kept as a column.
resort_by_month = data_resort.groupby(['arrival_date_month'], as_index=False)
resort_hotel = resort_by_month['adr'].mean()
resort_hotel

In [None]:
# Mean adr per arrival month for the city hotel, month kept as a column.
city_by_month = data_city.groupby(['arrival_date_month'], as_index=False)
city_hotel = city_by_month['adr'].mean()
city_hotel

In [None]:
# Put both hotels' monthly mean prices side by side, one row per month.
final = pd.merge(resort_hotel, city_hotel, on='arrival_date_month')
# After the merge the columns are: month key, resort adr, city adr.
final.columns = ['month', 'price_for_resort', 'price_for_city_hotel']
final

#### Notice that the month column is not in calendar order; if we visualise it as is, we will draw improper conclusions.
#### So first we have to put the month column into the right (calendar) order.

In [None]:
## !pip install sort-dataframeby-monthorweek

## Dependency package needs to be installed
## pip install sorted-months-weekdays

In [None]:
!pip install sorted_months_weekdays

In [None]:
import sort_dataframeby_monthorweek as sd

In [None]:
final=sd.Sort_Dataframeby_Month(df=final,monthcolumnname='month')
final

In [None]:
px.line(final, x='month', y=['price_for_resort','price_for_city_hotel'], title='Room price per night over the Months')

### Conclusion-->> This clearly shows that the prices in the Resort hotel are much higher during the summer (no surprise here). The price of the city hotel varies less and is most expensive during spring and autumn.

In [None]:

# Monthly mean room price for both hotels drawn on one set of axes.
sns.lineplot(x = "month", y="price_for_resort", data=final,label='Resort')
sns.lineplot(x = "month", y="price_for_city_hotel", data=final,label='City_hotel')
# The plotted columns are plain monthly adr means (never divided by the number
# of guests), so the title describes a per-night price only.
plt.title("Room price per night over the year", fontsize=16)
plt.xlabel("Month", fontsize=16)
# Month names are long; rotate them so they do not overlap.
plt.xticks(rotation=45)
plt.ylabel("Price [EUR]", fontsize=16)
plt.legend()
plt.show()


## Distribution of Nights Spent at Hotels by Market Segment and Hotel Type

In [None]:
data.head()

In [None]:
# Spread of week-night stays per market segment, one box per hotel type.
plt.figure(figsize=(15, 10))
sns.boxplot(
    data=data,
    x="market_segment",
    y="stays_in_week_nights",
    hue="hotel",
    palette='Set1',
);


#### Conclusion-->>
    It can be seen that most of the groups are normally distributed, while some of them are highly skewed. Looking at the distribution, most people do not seem to prefer to stay at the hotel for more than 1 week, but it seems normal to stay in resort hotels for up to 12-13 days.
    It is obvious that when people go to resort hotels, they prefer to stay longer.

## Analysing Preference of Guests, what they basically Prefer?

In [None]:
# Donut chart of bookings per meal type; the counts are computed once and
# supply both the slice names and the slice sizes.
meal_counts = data['meal'].value_counts()
px.pie(data, names=meal_counts.index, values=meal_counts.values, hole=0.5)

#### Conclusion-->>
    The donut chart above shows the meal categories. There is a big difference between the Bed&Breakfast category and the others. Almost 80% of bookings are reserved for Bed&Breakfast.

## Simplify your analysis on the basis of different types of hotels & meals

In [None]:
data.groupby(['hotel','meal']).agg({'meal':'count'}).unstack()

## Analyse Special Requests made by Customers

In [None]:
# Bar chart with one bar per value of total_of_special_requests, showing how
# many rows of `data` have that value.
plt.figure(figsize=(8,5))
sns.countplot(x='total_of_special_requests', data=data, palette = 'ocean_r')
plt.title('Total Special Request')

#### Conclusion-->> 
    Around 55% of bookings do not have any special requests

In [None]:
data.head()

In [None]:
# Booking counts per number of special requests (rows), with one column per
# is_canceled value after unstacking.
request_groups = data.groupby(['total_of_special_requests', 'is_canceled'])
request_counts = request_groups.agg({'total_of_special_requests': 'count'})
pivot = request_counts.rename(columns={'total_of_special_requests': 'count'}).unstack()
pivot

In [None]:
pivot.plot(kind='bar')

#### Conclusion-->> 
    This graph shows the relationship between special requests and booking cancellation status. Nearly half of the bookings without any special requests have been canceled, and the other half have not.

## Which are the busiest months, i.e. in which months is the number of guests highest?

In [None]:
data_resort.head()

In [None]:
# Bookings per arrival month for the resort hotel, as a (month, count) frame.
resort_month_counts = data_resort['arrival_date_month'].value_counts()
rush_resort = resort_month_counts.rename_axis('month').reset_index(name='no of guests')
rush_resort

In [None]:
# Bookings per arrival month for the city hotel, as a (month, count) frame.
city_month_counts = data_city['arrival_date_month'].value_counts()
rush_city = city_month_counts.rename_axis('month').reset_index(name='no of guests')
rush_city

In [None]:
# Join the two per-month counts on the month name; the merged columns come
# out as: month, resort count, city count.
final_rush = pd.merge(rush_resort, rush_city, on='month')
final_rush.columns = ['month', 'no of guests in resort', 'no of guest in city hotel']
final_rush

In [None]:
final_rush=sd.Sort_Dataframeby_Month(df=final_rush,monthcolumnname='month')
final_rush

In [None]:
final_rush.dtypes

In [None]:
final_rush.columns

In [None]:
px.line(data_frame=final_rush, x='month', y=['no of guests in resort','no of guest in city hotel'], title='Total no of guests per Months')

### Conclusion
     The City hotel has more guests during spring and autumn, when the prices are also highest.
    In July and August there are less visitors, although prices are lower.

    Guest numbers for the Resort hotel go down slightly from June to September, which is also when the prices are highest.
    Both hotels have the fewest guests during the winter.

## How long do people stay at the hotels?

In [None]:
# Keep only the bookings that were not canceled.
filter=data['is_canceled']==0
# Take an explicit copy: a new column (total_nights) is assigned onto
# clean_data below, and doing that on an uncopied selection of `data`
# triggers pandas' SettingWithCopyWarning.
clean_data=data[filter].copy()

In [None]:
clean_data.head()

In [None]:
clean_data["total_nights"] = clean_data["stays_in_weekend_nights"] + clean_data["stays_in_week_nights"]

In [None]:
clean_data.head()

In [None]:
# Number of bookings per (total_nights, hotel). Count the is_canceled column
# explicitly by name instead of counting every column and then picking the
# first one by position, which silently depends on the column order.
# The count column keeps the name 'is_canceled' so the rename in the next
# cell still applies.
stay = (
    clean_data.groupby(['total_nights', 'hotel'])['is_canceled']
    .count()
    .reset_index()
)
stay.head()

In [None]:
stay=stay.rename(columns={'is_canceled':'Number of stays'})
stay.head()

In [None]:
# Grouped bar chart: x = total nights, bar height = number of stays,
# one bar colour per hotel (City Hotel first, then Resort Hotel).
plt.figure(figsize=(20, 8))
sns.barplot(x = "total_nights", y = "Number of stays" , hue="hotel",
            hue_order = ["City Hotel", "Resort Hotel"], data=stay)

## Bookings by market segment

In [None]:
clean_data['market_segment'].value_counts()

In [None]:
# Pie chart of non-canceled bookings per market segment.
segment_counts = clean_data['market_segment'].value_counts()
fig = px.pie(
    clean_data,
    values=segment_counts.values,
    names=segment_counts.index,
    title="Bookings per market segment",
)
# Rotate the starting angle and print "percent + label" on each slice.
fig.update_traces(rotation=-90, textinfo="percent+label")
fig.show()

##  price per night (ADR) and person based on booking and room

In [None]:
clean_data.head()

In [None]:
# Mean adr per market segment, one bar per reserved room type, with error
# bars showing the standard deviation (ci="sd").
# NOTE(review): recent seaborn releases deprecate `ci` (in favour of
# `errorbar`) and `errwidth` (in favour of `err_kws`) — confirm against the
# installed seaborn version before changing.
plt.figure(figsize=(20, 10))
sns.barplot(x="market_segment",
            y="adr",
            hue="reserved_room_type",
            data=clean_data,
            ci="sd",
            errwidth=1,
            capsize=0.1)

## How many bookings were cancelled?

In [None]:
cancel=data[data['is_canceled']==1]

In [None]:
# Total canceled bookings per hotel: is_canceled is 1 on every row of
# `cancel`, so summing it counts the rows.
is_resort = cancel["hotel"] == "Resort Hotel"
is_city = cancel["hotel"] == "City Hotel"
rh_cancelations = cancel.loc[is_resort, "is_canceled"].sum()
ch_cancelations = cancel.loc[is_city, "is_canceled"].sum()

In [None]:
rh_cancelations

In [None]:
ch_cancelations

In [None]:
### convert entire stats into percentage
# Use the counts computed above as the slice sizes and the hotel names as the
# slice labels. The previous version hard-coded the sizes and passed the
# counts as labels, so the chart could drift from the data and its legend
# showed numbers instead of hotel names.
px.pie(values=[rh_cancelations, ch_cancelations],
       names=["Resort Hotel", "City Hotel"])

## Which month have the highest number of cancellations?

In [None]:
data.head()

In [None]:
cancellation=data[data['is_canceled']==1]
cancellation.head()

In [None]:
cancellation['hotel'].unique()

In [None]:
# Count CANCELED bookings per arrival month and hotel. Group `cancellation`
# (the rows with is_canceled == 1), not `data`: grouping all bookings would
# report total bookings under the "no of cancellations" label used below.
cancel_month=cancellation.groupby(['arrival_date_month','hotel']).agg('count').reset_index()
# Pick the key columns and the is_canceled count by name rather than by
# position, so the result does not depend on the column order of the frame.
cancelled=cancel_month[['arrival_date_month','hotel','is_canceled']]

In [None]:
cancelled

In [None]:
cancelled=cancelled.rename(columns={'is_canceled':'no of cancellations'})
cancelled

In [None]:
final=sd.Sort_Dataframeby_Month(cancelled,'arrival_date_month')
final

In [None]:
# Grouped bar chart of the month-sorted per-hotel counts held in `final`:
# x = arrival month, bar height = the "no of cancellations" column,
# one bar colour per hotel (City Hotel first, then Resort Hotel).
plt.figure(figsize=(12, 8))
sns.barplot(x = "arrival_date_month", y = "no of cancellations" , hue="hotel",
            hue_order = ["City Hotel", "Resort Hotel"], data=final)

#### conclusion 
    For the City hotel the relative number of cancelations is around 40 % throughout the year.
    For the Resort hotel it is highest in the summer and lowest during the winter.