# Introduction to Pandas project

In [1]:
import pandas as pd

In [4]:
# Load the semicolon-delimited bookings export and keep a 7-row preview.
path_to_file = 'bookings.csv'
bookings = pd.read_csv(path_to_file, delimiter=';')
bookings_head = bookings.iloc[:7]

In [226]:
# Dimensions of the loaded table as a (rows, columns) tuple.
bookings.shape

(119390, 23)

In [7]:
# Per-column dtypes of the loaded table.
bookings.dtypes

Hotel                         object
Is Canceled                    int64
Lead Time                      int64
arrival full date             object
Arrival Date Year              int64
Arrival Date Month            object
Arrival Date Week Number       int64
Arrival Date Day of Month      int64
Stays in Weekend nights        int64
Stays in week nights           int64
stays total nights             int64
Adults                         int64
Children                     float64
Babies                         int64
Meal                          object
Country                       object
Reserved Room Type            object
Assigned room type            object
customer type                 object
Reservation Status            object
Reservation status_date       object
dtype: object

### Create a function to normalize column names (lowercase, underscores instead of spaces)

In [15]:
def change_column_names(name):
    """Return ``name`` lower-cased, with every space replaced by an underscore.

    Each individual space becomes one underscore, so runs of spaces and
    leading/trailing spaces are preserved as runs of underscores.
    """
    return '_'.join(name.split(' ')).lower()

# Apply the normalizer to every column label; rename returns a new frame.
bookings = bookings.rename(change_column_names, axis='columns')

In [32]:
# Keep only the bookings whose is_canceled flag equals 0.
bookings_with_no_cancel = bookings.loc[bookings['is_canceled'].eq(0)]

In [37]:
# Top 5 countries by number of non-canceled bookings.
# Rows are counted per 'country' value, ordered from most to least frequent,
# and only the first five entries are displayed.
bookings_with_no_cancel.value_counts('country').sort_values(ascending=False).head(5)

country
PRT    21071
GBR     9676
FRA     8481
ESP     6391
DEU     6069
dtype: int64

### Mean number of nights stayed at City Hotel and Resort Hotel

In [40]:
# Average total nights per hotel type (one row per value of 'hotel').
city_hotel = bookings.groupby('hotel')[['stays_total_nights']].mean()

In [42]:
# Display the per-hotel averages rounded to two decimals.
city_hotel.round(2)

Unnamed: 0_level_0,stays_total_nights
hotel,Unnamed: 1_level_1
City Hotel,2.98
Resort Hotel,4.32


### Bookings where the assigned room type differs from the reserved one

In [68]:
# Rows where the guest was assigned a different room type than they reserved.
room_mismatch = bookings['reserved_room_type'].ne(bookings['assigned_room_type'])
result = bookings[room_mismatch]

In [69]:
# Number of bookings with a reserved/assigned room-type mismatch.
result.shape[0]

14917

### Months with the most non-canceled bookings in 2016 and 2017

In [230]:
# Non-canceled bookings with an arrival year of 2016.
year2016 = bookings.query('arrival_date_year == 2016 and is_canceled == 0')

In [231]:
# Number of non-canceled 2016 bookings per arrival month, largest first.
year2016.value_counts('arrival_date_month').sort_values(ascending=False)

arrival_date_month
October      3689
May          3563
September    3372
April        3367
March        3347
August       3238
June         3196
July         3073
November     2818
February     2554
December     2462
January      1691
dtype: int64

In [232]:
# Non-canceled bookings with an arrival year of 2017.
year2017 = bookings.query('arrival_date_year == 2017 and is_canceled == 0')

In [233]:
# Number of non-canceled 2017 bookings per arrival month, largest first.
year2017.value_counts('arrival_date_month').sort_values(ascending=False)

arrival_date_month
May         3551
July        3329
March       3298
June        3208
April       3198
August      3109
February    2818
January     2431
dtype: int64

### City Hotel cancellations by month in 2015, 2016 and 2017

In [234]:
# Canceled bookings of the City Hotel only.
is_city_hotel = bookings['hotel'] == 'City Hotel'
was_canceled = bookings['is_canceled'] == 1
canceled_books = bookings[is_city_hotel & was_canceled]

In [235]:
# For each arrival year, count the canceled bookings per arrival month.
canceled_books.groupby('arrival_date_year')['arrival_date_month'].value_counts()

arrival_date_year  arrival_date_month
2015               September             1543
                   October               1321
                   August                1232
                   July                   939
                   December               668
                   November               301
2016               October               1947
                   June                  1720
                   September             1567
                   April                 1539
                   May                   1436
                   November              1360
                   August                1247
                   March                 1108
                   December              1072
                   July                  1043
                   February               930
                   January                438
2017               May                   2217
                   April                 1926
                   June                  1

In [236]:
# Average number of adults, children and babies per booking, printed on one line.
mean_guests = [bookings[column].mean() for column in ('adults', 'children', 'babies')]
print(*mean_guests)

1.8564033838679956 0.10388990333874994 0.007948739425412514


In [237]:
# Total number of kids (children + babies) per booking.
# 'children' is float64 while 'adults' and 'babies' are int64, which suggests it
# may contain missing values (TODO confirm with bookings['children'].isna().sum()).
# A plain sum would turn such rows into NaN even when babies > 0, so a missing
# 'children' value is treated as 0 here.
bookings['total_kids'] = bookings['children'].fillna(0) + bookings['babies']

In [238]:
# Largest per-hotel average of total_kids, rounded to two decimals.
bookings.groupby('hotel')[['total_kids']].mean().round(2).max()

total_kids    0.14
dtype: float64

In [239]:
# Create the has_kids column with a placeholder value of 0 for every row.
bookings['has_kids'] = 0

In [240]:
# Per-row flag: True when the booking has at least one kid (total_kids > 0).
# Computed as one vectorized comparison instead of a Python loop over
# bookings['total_kids'][i], which was slow and only worked while the frame
# kept its default 0..n-1 index labels. Missing totals compare as False,
# exactly as they did in the loop.
vals = bookings['total_kids'].gt(0).tolist()

In [245]:
# Overwrite has_kids with the per-row True/False flags collected in vals.
bookings['has_kids'] = vals

### Cancellation (churn) rate for bookings with and without kids

In [242]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so any change made through total_table is also visible through bookings.
total_table = bookings

In [243]:
# Sizes of the two groups used as denominators for the cancellation rates.
# Summing the boolean column counts the True rows directly. The previous
# value_counts()[0] / [1] lookups used integer keys on a Series labelled
# False/True, which is ambiguous between label and position, depends on which
# group happens to be larger, and fails when one of the groups is empty.
with_kids = total_table['has_kids'].sum()
without_kids = len(total_table) - with_kids

In [244]:
# Share of canceled bookings (in percent) within each group:
# first the bookings without kids, then the bookings with kids.
canceled = total_table[total_table['is_canceled'] == 1]
for flag, group_size in ((False, without_kids), (True, with_kids)):
    print(len(canceled[canceled['has_kids'] == flag]) / group_size * 100)

37.221283323338604
34.92284612087441
