   # 🏨 Hotel Bookings Dataset 🏨
***
   #### owner: Golovin Alexey

In [8]:
import pandas as pd
from datetime import datetime

In [39]:
# Path to the bookings CSV, relative to the notebook's working directory.
file = './2_bookings.csv'
# The file is read as semicolon-separated text in the windows-1251 encoding.
bookings = pd.read_csv(file, encoding='windows-1251', sep=';')
# Independent copy of the freshly loaded frame; copying avoids parsing the
# same CSV from disk a second time.
dfq = bookings.copy()

In [9]:
# Show the first two rows of the frame.
bookings.head(2)

Unnamed: 0,Hotel,Is Canceled,Lead Time,arrival full date,Arrival Date Year,Arrival Date Month,Arrival Date Week Number,Arrival Date Day of Month,Stays in Weekend nights,Stays in week nights,...,Adults,Children,Babies,Meal,Country,Reserved Room Type,Assigned room type,customer type,Reservation Status,Reservation status_date
0,Resort Hotel,0,342,2015-07-01,2015,July,27,1,0,0,...,2,0.0,0,BB,PRT,C,C,Transient,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015-07-01,2015,July,27,1,0,0,...,2,0.0,0,BB,PRT,C,C,Transient,Check-Out,2015-07-01


In [3]:
# Show the dtype of every column.
bookings.dtypes

Hotel                         object
Is Canceled                    int64
Lead Time                      int64
arrival full date             object
Arrival Date Year              int64
Arrival Date Month            object
Arrival Date Week Number       int64
Arrival Date Day of Month      int64
Stays in Weekend nights        int64
Stays in week nights           int64
stays total nights             int64
Adults                         int64
Children                     float64
Babies                         int64
Meal                          object
Country                       object
Reserved Room Type            object
Assigned room type            object
customer type                 object
Reservation Status            object
Reservation status_date       object
dtype: object

In [41]:
# Give all column names a similar style: lower case, underscores instead of spaces
bookings.columns = [column_name.lower().replace(' ', '_') for column_name in bookings.columns]

In [42]:
# Show the first three rows of the frame.
bookings.head(3)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_full_date,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,...,adults,children,babies,meal,country,reserved_room_type,assigned_room_type,customer_type,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015-07-01,2015,July,27,1,0,0,...,2,0.0,0,BB,PRT,C,C,Transient,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015-07-01,2015,July,27,1,0,0,...,2,0.0,0,BB,PRT,C,C,Transient,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015-07-01,2015,July,27,1,0,1,...,1,0.0,0,BB,GBR,A,C,Transient,Check-Out,2015-07-02


In [19]:
# Seven most frequent values of the `country` column.
# value_counts() already returns the counts in descending order, so no
# additional sort is needed before taking the head.
bookings['country'].value_counts().head(7)

PRT    48590
GBR    12129
FRA    10415
ESP     8568
DEU     7287
ITA     3766
IRL     3375
Name: country, dtype: int64

In [22]:
# Mean of stays_total_nights for each hotel type
bookings_by_hotel = bookings.groupby('hotel')
avg_nights = bookings_by_hotel['stays_total_nights'].mean()

# Report both hotel types, rounded to two decimals
print('Average nights for City Hotel:', round(avg_nights.loc['City Hotel'], 2))
print('Average nights for Resort Hotel:', round(avg_nights.loc['Resort Hotel'], 2))

Average nights for City Hotel: 2.98
Average nights for Resort Hotel: 4.32


Sometimes the type of room assigned to the customer (**assigned_room_type**) is different from the one originally booked (**reserved_room_type**). This can happen, for example, due to overbooking.

How many such observations are there in the dataset?


In [25]:
# Rows where the assigned room type differs from the reserved one
room_type_changed = bookings['assigned_room_type'].ne(bookings['reserved_room_type'])
df_diff_room_type = bookings.loc[room_type_changed]

# Number of such rows
count_diff_room_type = df_diff_room_type.shape[0]
print(count_diff_room_type)

14917


Now let's analyze the dates of planned arrival (**arrival_date_year**).

For which arrival month were the most bookings made in 2016? Did the most popular month change in 2017?

In [28]:
# Count rows per (arrival year, arrival month) pair and show the seven largest counts
(
    bookings
    .groupby('arrival_date_year')['arrival_date_month']
    .value_counts()
    .sort_values(ascending=False)
    .head(7)
)

arrival_date_year  arrival_date_month
2017               May                   6313
2016               October               6203
2017               April                 5661
                   June                  5647
2016               May                   5478
                   April                 5428
                   September             5394
Name: arrival_date_month, dtype: int64

Group the data by years, and then check for which month (arrival_date_month) the bookings for City Hotel were canceled most often in 2015, 2016, and 2017.

In [43]:
# For each arrival year, find the month with the most cancelled City Hotel bookings.
for year in [2015, 2016, 2017]:
    max_canceled_month = (
        bookings
        .query('hotel == "City Hotel" and is_canceled == 1 and arrival_date_year == @year')
        .groupby(['arrival_date_year', 'arrival_date_month'])
        .agg({'is_canceled': 'count'})
        .idxmax()
    )
    # idxmax() on the aggregated frame yields a Series keyed by column name
    # whose single value is the (year, month) index label. Use .iloc for the
    # positional lookup: passing an integer position to Series[...] on a
    # non-integer index is deprecated in recent pandas versions.
    print(f'The majority of bookings cancellations in {year} happend in {max_canceled_month.iloc[0][1]}.')

The majority of bookings cancellations in 2015 happend in September.
The majority of bookings cancellations in 2016 happend in October.
The majority of bookings cancellations in 2017 happend in May.


Look at the numerical characteristics of three columns: **adults, children, and babies**. Which one has the highest mean value?

In [49]:
# Column means of adults, children and babies, rounded to two decimals
guest_columns = ['adults', 'children', 'babies']
bookings[guest_columns].mean().round(2)

adults      1.86
children    0.10
babies      0.01
dtype: float64

Create a column named **"total_kids"** by summing the **"children"** and **"babies"** columns. For which type of hotel is the average value of this variable the highest?

**City hotel** - a hotel located in the city  
**Resort hotel** - a resort hotel  
Provide the largest average "total_kids" value as the answer, rounded to 2 decimal places.  

In [59]:
# total_kids is the row-wise sum of the children and babies columns
bookings['total_kids'] = bookings['children'].add(bookings['babies'])

# Mean total_kids per hotel type, largest first, rounded to two decimals
(
    bookings
    .groupby('hotel')['total_kids']
    .mean()
    .sort_values(ascending=False)
    .round(2)
)

hotel
Resort Hotel    0.14
City Hotel      0.10
Name: total_kids, dtype: float64

Not all bookings were successful (is_canceled), so we can calculate how many customers were lost in the process. In other words, let's calculate the metric called **Churn Rate**.

**Churn rate** is the percentage of customers who stopped using a service over a given period of time (for example, subscribers who unsubscribed from a site's push notifications). In other words, it is the ratio of the number of lost users to the total number of users, expressed as a percentage.

Create a column called "has_kids" which takes the value True if a customer indicated at least one child (total_kids) when booking, otherwise False. Then, check which group of users has a higher churn rate, i.e., the percentage of customers who canceled their booking.

In [75]:
# New boolean column: True where total_kids is greater than zero
bookings['has_kids'] = bookings['total_kids'].gt(0)

Calculate the churn rate for each group using `query` with conditions.

In [85]:
# Select cancelled rows (is_canceled == 1) for each has_kids group and
# count the is_canceled values within each selection
is_cancelled = bookings['is_canceled'].eq(1)
with_kids = bookings['has_kids']
canceled_has_kids = bookings.loc[is_cancelled & with_kids, 'is_canceled'].value_counts()
canceled_no_kids = bookings.loc[is_cancelled & ~with_kids, 'is_canceled'].value_counts()

In [86]:
# Number of cancelled rows in each has_kids group. The counts are taken
# directly from `bookings`, so this cell can be re-run safely and yields 0
# (rather than a KeyError) when a group has no cancellations.
canceled_no_kids = len(bookings.query('has_kids == False and is_canceled == 1'))
canceled_has_kids = len(bookings.query('has_kids == True and is_canceled == 1'))
# Total number of rows in each has_kids group.
total_has_kids = len(bookings.query('has_kids == True'))
total_no_kids = len(bookings.query('has_kids == False'))

# function to calc churn rate
def churn_rate(total, canceled):
    """Return the share of ``canceled`` in ``total`` as a percentage.

    Parameters
    ----------
    total
        Size of the whole group.
    canceled
        Number of members of the group that were canceled.

    Returns
    -------
    ``canceled / total`` multiplied by 100.
    """
    lost_share = canceled / total
    return lost_share * 100

In [92]:
# Churn rate (%) for the group without kids, rounded to two decimals
round(churn_rate(total=total_no_kids, canceled=canceled_no_kids), 2)

37.22

In [93]:
# Churn rate (%) for the group with kids, rounded to two decimals
round(churn_rate(total=total_has_kids, canceled=canceled_has_kids), 2)

34.92