In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv('./hotel_bookings.csv')

In [3]:
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [4]:
data.shape

(119390, 32)

In [5]:
data.isnull().values.any()

True

In [6]:
data.isnull().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [7]:
percent_missing = data.isnull().sum() * 100 / len(data)
missing_value_df = pd.DataFrame({'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
print(missing_value_df)

                                percent_missing
hotel                                  0.000000
total_of_special_requests              0.000000
required_car_parking_spaces            0.000000
adr                                    0.000000
customer_type                          0.000000
days_in_waiting_list                   0.000000
deposit_type                           0.000000
booking_changes                        0.000000
assigned_room_type                     0.000000
reserved_room_type                     0.000000
previous_bookings_not_canceled         0.000000
previous_cancellations                 0.000000
is_repeated_guest                      0.000000
reservation_status                     0.000000
distribution_channel                   0.000000
reservation_status_date                0.000000
meal                                   0.000000
babies                                 0.000000
adults                                 0.000000
stays_in_week_nights                   0

In [8]:
#dropping rows with missing country or children rows
data = data[data['children'].notna()]
data = data[data['country'].notna()]

In [9]:
data.shape

(118898, 32)

In [10]:
#Transforming arrival_date_month to datetime type
data["arrival_date_month"]=pd.to_datetime(data['arrival_date_month'],format='%B').dt.month
#combine year and month and day in a datetime variable
data["arrival_date"]=pd.to_datetime({"year":data["arrival_date_year"].values,
                                     "month":data["arrival_date_month"].values,
                                     "day":data["arrival_date_day_of_month"].values})
#Droping the year and month and day columns
data=data.drop(columns=['arrival_date_year','arrival_date_month','arrival_date_day_of_month'])

data.shape
data.sample(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date
19742,Resort Hotel,0,112,53,0,3,2,0.0,0,Undefined,...,,,0,Transient-Party,224.67,0,0,Check-Out,2016-01-02,2016-12-16
16087,Resort Hotel,0,103,31,4,9,2,1.0,0,HB,...,,,0,Transient,297.5,1,1,Check-Out,2015-08-14,2015-08-29
29490,Resort Hotel,0,118,44,2,5,2,0.0,0,BB,...,,,0,Transient-Party,79.2,0,1,Check-Out,2016-10-30,2016-11-09
114945,City Hotel,0,1,26,0,2,2,0.0,0,BB,...,14.0,,0,Transient,184.0,1,2,Check-Out,2017-06-29,2017-07-04
97804,City Hotel,0,67,39,2,1,2,0.0,0,BB,...,9.0,,0,Transient,149.4,0,1,Check-Out,2016-09-21,2016-09-23
34130,Resort Hotel,0,110,11,1,0,2,0.0,0,BB,...,66.0,,0,Transient-Party,50.0,0,0,Check-Out,2017-03-14,2017-03-23
82109,City Hotel,0,4,8,0,3,1,0.0,0,BB,...,,40.0,0,Transient,65.0,0,1,Check-Out,2016-02-19,2015-12-30
13540,Resort Hotel,1,160,33,2,3,2,0.0,0,BB,...,240.0,,0,Transient,238.0,0,0,Canceled,2017-03-12,2017-08-19
113979,City Hotel,0,30,24,1,2,2,0.0,0,BB,...,14.0,,0,Transient,161.67,0,3,Check-Out,2017-06-15,2017-06-22
4187,Resort Hotel,1,50,9,1,2,2,0.0,0,BB,...,240.0,,0,Transient,76.0,0,0,Canceled,2016-02-02,2016-02-26


In [11]:
data.dtypes

hotel                                     object
is_canceled                                int64
lead_time                                  int64
arrival_date_week_number                   int64
stays_in_weekend_nights                    int64
stays_in_week_nights                       int64
adults                                     int64
children                                 float64
babies                                     int64
meal                                      object
country                                   object
market_segment                            object
distribution_channel                      object
is_repeated_guest                          int64
previous_cancellations                     int64
previous_bookings_not_canceled             int64
reserved_room_type                        object
assigned_room_type                        object
booking_changes                            int64
deposit_type                              object
agent               

In [13]:
#Transforming the reservation_status_date variable type to Datetime 
data["reservation_status_date"]=pd.to_datetime(data["reservation_status_date"], format = '%Y-%m-%d')

In [14]:
data.dtypes

hotel                                     object
is_canceled                                int64
lead_time                                  int64
arrival_date_week_number                   int64
stays_in_weekend_nights                    int64
stays_in_week_nights                       int64
adults                                     int64
children                                 float64
babies                                     int64
meal                                      object
country                                   object
market_segment                            object
distribution_channel                      object
is_repeated_guest                          int64
previous_cancellations                     int64
previous_bookings_not_canceled             int64
reserved_room_type                        object
assigned_room_type                        object
booking_changes                            int64
deposit_type                              object
agent               

In [19]:
data = data.drop(columns = ['agent','company'])

In [20]:
data.shape

(118898, 28)

In [22]:
#Vizualizing the sum of missing values in each variable
data.isnull().sum()

hotel                               0
is_canceled                         0
lead_time                           0
arrival_date_week_number            0
stays_in_weekend_nights             0
stays_in_week_nights                0
adults                              0
children                            0
babies                              0
meal                                0
country                             0
market_segment                      0
distribution_channel                0
is_repeated_guest                   0
previous_cancellations              0
previous_bookings_not_canceled      0
reserved_room_type                  0
assigned_room_type                  0
booking_changes                     0
deposit_type                        0
days_in_waiting_list                0
customer_type                       0
adr                                 0
required_car_parking_spaces         0
total_of_special_requests           0
reservation_status                  0
reservation_

In [23]:
for column in ['arrival_date']:
    data[column] =data[column].fillna(data[column].mean())

In [24]:
#Droping the duplicated values
data.drop_duplicates( inplace = True)