In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Import all the necessary packages for EDA

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory

# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the hotel-booking-demand CSV into the working DataFrame
bookings_csv = '/kaggle/input/hotel-booking-demand/hotel_bookings.csv'
df = pd.read_csv(filepath_or_buffer=bookings_csv)

## Getting to know your data

In [None]:
# Preview the first rows of the loaded bookings DataFrame
df.head()

In [None]:
# The frame is wide, so lift pandas' column display cap before previewing it again
pd.options.display.max_columns = None
df.head()

In [None]:
# Print the frame's structural summary (columns, dtypes, non-null counts)
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the frame
df.describe()

## Let's visualize our data!

### Count Plot

In [None]:
# Count plot: number of bookings per hotel type
sns.set(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=df, x='hotel', palette='Set1', ax=ax)
ax.set_title(label='Hotel', size=30, color='b')
plt.show()

About two-thirds of the bookings were made at the City Hotel.

### Box Plot

In [None]:
# Box plot of lead_time split by cancellation status, with one box per hotel type
sns.set(style='ticks')
fig, ax = plt.subplots(figsize=(15, 12))
sns.boxplot(x=df['is_canceled'], y=df['lead_time'], hue=df['hotel'], ax=ax)
ax.set_title('Lead_Time_Box_Plot', size=25)
plt.show()

From the box plot, we can conclude that bookings with a longer lead_time are more likely to be canceled, which makes sense: when a customer books a hotel 100 days (about 3 months) ahead, it is quite likely that he or she will change the itinerary and cancel the reservation. Besides, the typical lead time, for canceled and non-canceled bookings alike, does not differ much between the resort hotel and the city hotel.

Let's take a look at the arrival date week number. My hypothesis is that there will be more hotel reservations in July and August because it is summertime.

In [None]:
# How many distinct arrival weeks there are, then bookings per week from busiest to quietest
arrival_week = df['arrival_date_week_number']
print(arrival_week.nunique())
weekly_bookings = arrival_week.value_counts()
print(weekly_bookings.sort_values(ascending=False))

### Histogram

Let's use a histogram to visualize how the number of hotel reservations is distributed across arrival date week numbers.

In [None]:
sns.set(style='ticks')
plt.figure(figsize=(20, 12))
# Bin edges sit at half-integers so that each integer week number gets its own bar
week_bin_edges = np.arange(55) - 0.5
# Note: `ax` receives the tuple returned by plt.hist, not an Axes object
ax = plt.hist(x=df['arrival_date_week_number'], bins=week_bin_edges, facecolor='blue', alpha=0.5)
plt.title('arrival_date_week_number_count', size=30)
plt.xlabel('arrival_date_week_number', size=20)
plt.ylabel('count', size=20)
plt.xticks(range(1, 54, 10))

plt.show()

My hypothesis was right!
<br><br> Indeed, hotel reservations reach their peak around July and August.

Let's take a look at previous cancellations.
<br> If a customer has canceled before, he or she should be more likely to cancel again.

We take a look at the correlation between 'previous cancellations' and 'is cancelled' first.

In [None]:
# Correlation matrix of the cancellation flag and the previous-cancellation count
cancel_columns = ['is_canceled', 'previous_cancellations']
df[cancel_columns].corr()

The correlation is only 0.1101. Let's check the distribution of previous cancellations.

In [None]:
# Number of bookings for each previous-cancellation count, ordered by that count
prev_cancel_counts = df['previous_cancellations'].value_counts()
prev_cancel_counts.sort_index()

Let's take a look at the cancellation rate for customers who canceled more than 10 times before!

In [None]:
# Cancellation rate (in percent, 2 decimals) among bookings with > 10 previous cancellations
heavy_cancellers = df.loc[df['previous_cancellations'] > 10, 'is_canceled']
rate_pct = round(heavy_cancellers.mean() * 100, 2)
print('Cancellation rate for customers who have canceled more than 10 times before:', str(rate_pct) + '%')


85.56%!
<br>Let's dig deeper and look at the cancellation rate for each number of previous cancellations.

In [None]:
# Distinct previous-cancellation counts present in the data, in ascending order
a = sorted(df['previous_cancellations'].value_counts().index.to_list())
a

In [None]:
# Cancellation rate (rounded to 2 decimals) for each previous-cancellation count in `a`
b = [
    round(df.loc[df['previous_cancellations'] == times, 'is_canceled'].mean(), 2)
    for times in a
]
b

In [None]:

# Canceled vs. kept bookings among customers with exactly one previous cancellation
one_prior_cancel = df['previous_cancellations'] == 1
df.loc[one_prior_cancel, 'is_canceled'].value_counts()

In [None]:
# Share of bookings canceled among customers with exactly one previous cancellation
one_prior_cancel = df['previous_cancellations'] == 1
round(df.loc[one_prior_cancel, 'is_canceled'].mean(), 2)

To our surprise, among customers who had previously canceled exactly one hotel reservation, 94% canceled again. Notice that we have a fairly large sample for this group too (about 6,000 data points).
<br><br> Let's dig a little further!

Notice there is a column "previous_bookings_not_canceled" next to the column "previous_cancellations". A customer with zero previous cancellations may also have many previous bookings that were not canceled, so looking only at the number of previous cancellations does not capture all of the available information. We define a new variable, "previous cancellation rate", as previous cancellations divided by total previous reservations. Let's see whether the new feature makes more sense for our data.

In [None]:
# Derive previous_cancellation_rate: prior cancellations as a share of all prior bookings
prior_bookings_total = df['previous_cancellations'] + df['previous_bookings_not_canceled']
df['previous_cancellation_rate'] = df['previous_cancellations'] / prior_bookings_total

In [None]:
# Tally each distinct rate; dropna=False keeps missing values in the count as well
df['previous_cancellation_rate'].value_counts(dropna=False)

Let's divide "previous_cancellation_rate" into 10 equal length intervals and compare their mean of "is_canceled".

In [None]:
# View the bookings ordered by ascending previous_cancellation_rate
df.sort_values(by='previous_cancellation_rate', ascending=True)

In [None]:
# Bin edges 0.0, 0.1, ..., 1.0 give ten equal-width intervals. They are built in
# this cell so it does not depend on a variable defined in a cell further down
# (which would fail with a NameError on a fresh kernel run from top to bottom).
rate_bin_edges = [i / 10 for i in range(11)]
# NOTE: the column name keeps its original spelling ('pervious_...') so that any
# code relying on that name keeps working.
df['pervious_cancellation_rate_interval'] = pd.cut(
    x=df['previous_cancellation_rate'],
    bins=rate_bin_edges,
    include_lowest=True,  # make the first interval closed on the left so a rate of 0 is binned
)
df

In [None]:
# Bin edges from 0.0 to 1.0 in steps of one tenth
l = [tenth / 10 for tenth in range(11)]
l

In [None]:
# Integers 0..9, one per interval
l = [*range(10)]
# Pair each lower bound k/10 with its upper bound (k+1)/10
pcr = [[lower / 10, (lower + 1) / 10] for lower in l]

pcr


In [None]:
# List membership test: checks whether 0.4 equals one of the elements 0 or 2
# (it does not test whether 0.4 lies between them), so this evaluates to False
0.4 in [0,2]

In [None]:
# Summary statistics for bookings whose customer has exactly one previous cancellation
has_one_prior = df['previous_cancellations'] == 1
df.loc[has_one_prior].sort_values(by='is_canceled', ascending=False).describe()

In [None]:
# Summary statistics for every booking whose previous-cancellation count is not 1
not_one_prior = df['previous_cancellations'] != 1
df.loc[not_one_prior].sort_values(by='is_canceled', ascending=False).describe()

In [None]:
# Distinct previous-cancellation counts (a) and how many bookings have each (b),
# both in value_counts order
prev_cancel_counts = df['previous_cancellations'].value_counts()
a = prev_cancel_counts.index.to_list()
b = prev_cancel_counts.values
# 'hotel' is a text column, which DataFrame.corr cannot correlate: older pandas
# dropped it silently, pandas >= 2.0 raises. Restricting the selection to numeric
# columns gives the old result on every version.
df[['previous_cancellations', 'hotel', 'is_canceled']].select_dtypes(include='number').corr()

In [None]:
# Label each booking by whether its customer has any previous cancellation.
# The label goes into its own column: assigning through a boolean row mask
# (df[mask] = value) overwrites EVERY column of the matching rows, which would
# replace the whole dataset with these two strings and break all later cells.
df['previous_cancellation_status'] = np.where(
    df['previous_cancellations'] != 0,
    'Previously_cancelled',
    'Previously_never_cancelled',
)


In [None]:

sns.set_context('paper')
plt.figure(figsize=(20,6))

# a holds the previous-cancellation counts and b the number of bookings for each.
# Both are numeric, so orient='h' is passed explicitly: y (a) becomes the category
# axis and x (b) the bar length, instead of the booking counts being treated as
# categories.
bar_ax = sns.barplot(x=b, y=a, orient='h')
bar_ax.set_xlabel('Number of bookings')
bar_ax.set_ylabel('previous_cancellations')
bar_ax.set_title('Bookings by number of previous cancellations')

plt.show()

## Bivariate Analysis

In [None]:
# Pairwise correlations of the numeric (and boolean) columns only: text columns
# cannot be correlated, and pandas >= 2.0 raises on them instead of skipping them
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# total_of_special_requests has a high negative correlation with cancellation.
# My hypothesis is that when a customer has more special requests and they are met,
# the customer is more likely to stay with the hotel.

# x is the number of special requests and y is is_canceled, so the axis labels
# below are assigned to match (they were previously swapped).
sns.catplot(y='is_canceled',x='total_of_special_requests',kind='bar',data=df,height=8,aspect=2)
plt.xlabel('Num of Special Requests',size=10)
plt.ylabel('Cancelation',size=10)
plt.title('Special Requests',size=20)
plt.show()

In [None]:
# Scratch check: the integers 0 through 9 as a list
list(range(0,10))