In [1]:
# Setup: import pandas, make wide frames display every column, then load the raw bookings CSV
import pandas as pd
RAW_BOOKINGS_CSV = './hotelbookings_raw.csv'
pd.set_option('display.max_columns', None)  # None removes the column display limit
hotel = pd.read_csv(RAW_BOOKINGS_CSV)

In [2]:
# Inspect the column labels exactly as they were read from the CSV (before any rename/drop)
hotel.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date'],
      dtype='object')

In [3]:
# Give the three guest-count columns a "num_" prefix; all other columns keep their names
guest_column_names = {
    'adults': 'num_adults',
    'children': 'num_children',
    'babies': 'num_babies',
}
hotel = hotel.rename(columns=guest_column_names)

In [4]:
# Remove three columns from the working frame.
# The result is reassigned (no inplace=True) and errors='ignore' is passed so that
# re-running this cell is a no-op instead of a KeyError once the columns are gone.
columns_to_drop = ['days_in_waiting_list', 'customer_type', 'company']
hotel = hotel.drop(columns=columns_to_drop, errors='ignore')

In [5]:
# Share of missing values per column, expressed as a percentage (before any filling)
missing_fraction = hotel.isna().mean()
missing_fraction * 100

hotel                              0.000000
is_canceled                        0.000000
lead_time                          0.000000
arrival_date_year                  0.000000
arrival_date_month                 0.000000
arrival_date_week_number           0.000000
arrival_date_day_of_month          0.000000
stays_in_weekend_nights            0.000000
stays_in_week_nights               0.000000
num_adults                         0.000000
num_children                       0.000000
num_babies                         0.000000
meal                               0.000000
country                            0.106855
market_segment                     0.000000
distribution_channel               0.000000
is_repeated_guest                  0.000000
previous_cancellations             0.000000
previous_bookings_not_canceled     0.000000
reserved_room_type                 0.000000
assigned_room_type                 0.000000
booking_changes                    0.000000
deposit_type                    

In [6]:
# Skewness of the 'agent' column, checked before choosing how to fill its missing values.
# NOTE(review): 'agent' looks like an identifier code rather than a measured quantity
# (values such as 240, 250, 304 appear in the displayed rows) - confirm that skewness
# is a meaningful statistic for it.
agent_values = hotel['agent']
agent_values.skew()

np.float64(-0.8129546575434005)

In [7]:
# Fill missing values: the median for 'agent', the most frequent value for 'country'.
# NOTE(review): if 'agent' is an identifier code, median imputation assigns those rows
# to whichever agent sits at the median - confirm this is intended.
replacement_by_column = {
    'agent': hotel['agent'].median(),
    'country': hotel['country'].mode()[0],  # mode() returns a Series; take its first entry
}
hotel = hotel.fillna(value=replacement_by_column)

In [8]:
# Inspect the dtype of every column before the conversions performed in the next cell
hotel.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
num_adults                          int64
num_children                        int64
num_babies                          int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [9]:
# Convert 'agent' to int, and 'is_canceled' / 'is_repeated_guest' to bool, in one pass
converted_dtypes = {
    'agent': int,
    'is_canceled': bool,
    'is_repeated_guest': bool,
}
hotel = hotel.astype(converted_dtypes)

In [10]:
# List every row that is an exact copy of another row.
# keep=False marks all members of a duplicate group, not only the later repeats.
is_duplicated_row = hotel.duplicated(keep=False)
hotel[is_duplicated_row]

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,num_adults,num_children,num_babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
4,Resort Hotel,False,14,2015,July,27,1,0,2,2,0,0,BB,GBR,Online TA,TA/TO,False,0,0,A,A,0,No Deposit,240,98.00,0,1,Check-Out,03-07-2015
5,Resort Hotel,False,14,2015,July,27,1,0,2,2,0,0,BB,GBR,Online TA,TA/TO,False,0,0,A,A,0,No Deposit,240,98.00,0,1,Check-Out,03-07-2015
21,Resort Hotel,False,72,2015,July,27,1,2,4,2,0,0,BB,PRT,Direct,Direct,False,0,0,A,A,1,No Deposit,250,84.67,0,1,Check-Out,07-07-2015
22,Resort Hotel,False,72,2015,July,27,1,2,4,2,0,0,BB,PRT,Direct,Direct,False,0,0,A,A,1,No Deposit,250,84.67,0,1,Check-Out,07-07-2015
39,Resort Hotel,False,70,2015,July,27,2,2,3,2,0,0,HB,ROU,Direct,Direct,False,0,0,E,E,0,No Deposit,250,137.00,0,1,Check-Out,07-07-2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12150,Resort Hotel,True,223,2017,June,24,15,2,7,2,0,0,HB,DEU,Online TA,TA/TO,False,0,0,E,E,0,No Deposit,240,170.00,0,0,Canceled,04-11-2016
12153,Resort Hotel,True,223,2017,June,24,15,2,7,2,0,0,HB,DEU,Online TA,TA/TO,False,0,0,E,E,0,No Deposit,240,170.00,0,0,Canceled,04-11-2016
12163,Resort Hotel,True,30,2017,June,24,16,1,2,2,0,0,BB,PRT,Direct,Direct,False,0,0,A,A,0,No Deposit,240,0.00,0,0,Canceled,22-05-2017
12164,Resort Hotel,True,30,2017,June,24,16,1,2,2,0,0,BB,PRT,Direct,Direct,False,0,0,A,A,0,No Deposit,240,0.00,0,0,Canceled,22-05-2017


In [11]:
# Discard repeated rows, keeping the first occurrence of each.
# The index is not reset, so surviving rows keep their original labels.
hotel = hotel.drop_duplicates(keep='first')

In [None]:
# Display the cleaned frame; pandas abbreviates the middle rows in the rendered output
hotel

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,num_adults,num_children,num_babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,False,342,2015,July,27,1,0,0,2,0,0,BB,PRT,Direct,Direct,False,0,0,C,C,3,No Deposit,240,0.0,0,0,Check-Out,01-07-2015
1,Resort Hotel,False,737,2015,July,27,1,0,0,2,0,0,BB,PRT,Direct,Direct,False,0,0,C,C,4,No Deposit,240,0.0,0,0,Check-Out,01-07-2015
2,Resort Hotel,False,7,2015,July,27,1,0,1,1,0,0,BB,GBR,Direct,Direct,False,0,0,A,C,0,No Deposit,240,75.0,0,0,Check-Out,02-07-2015
3,Resort Hotel,False,13,2015,July,27,1,0,1,1,0,0,BB,GBR,Corporate,Corporate,False,0,0,A,A,0,No Deposit,304,75.0,0,0,Check-Out,02-07-2015
4,Resort Hotel,False,14,2015,July,27,1,0,2,2,0,0,BB,GBR,Online TA,TA/TO,False,0,0,A,A,0,No Deposit,240,98.0,0,1,Check-Out,03-07-2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12159,Resort Hotel,True,253,2017,June,24,16,0,2,2,0,0,BB,GBR,Online TA,TA/TO,False,0,0,A,A,0,No Deposit,240,80.1,0,1,Canceled,08-10-2016
12160,Resort Hotel,True,269,2017,June,24,16,0,2,2,0,0,BB,GBR,Online TA,TA/TO,False,0,0,E,E,3,No Deposit,240,107.1,0,2,Canceled,09-12-2016
12161,Resort Hotel,True,158,2017,June,24,16,0,2,2,0,0,BB,ESP,Online TA,TA/TO,False,0,0,D,D,0,No Deposit,240,125.0,0,0,Canceled,11-01-2017
12162,Resort Hotel,True,343,2017,June,24,16,1,2,2,0,0,BB,PRT,Offline TA/TO,TA/TO,False,0,0,D,D,0,No Deposit,8,70.2,0,0,Canceled,21-10-2016
