# Machine Learning Project

## Necessary imports

In [116]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Data Preparation

In [117]:
data = pd.read_csv('../data.csv')
display(data.head())
print('Shape of data: ', data.shape)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


Shape of data:  (119390, 32)


In [118]:
# The following columns have nan values
data.isna().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [119]:
# We can fill NaN values for number of children with 0 (only 4 NaN values)
data['children'] = data['children'].fillna(0)

In [120]:
# Agent and Company have quite a lot of NaN values (13% and 94%)
print("No. of unique agents: ", data['agent'].nunique())
print("No. of unique companies: ", data['company'].nunique())
print("No. of unique countries: ", data['country'].nunique())
print("% of missing agent values: ", data['agent'].isna().sum()/data.shape[0]*100)
print("% of missing company values: ", data['company'].isna().sum()/data.shape[0]*100)

# country has very few NaN values (0.4%). Maybe we can remove those data points
print("% of missing country values: ", data['country'].isna().sum()/data.shape[0]*100)

No. of unique agents:  333
No. of unique companies:  352
No. of unique countries:  177
% of missing agent values:  13.686238378423655
% of missing company values:  94.30689337465449
% of missing country values:  0.40874445095904177


In [121]:
# Changing Agent column to boolean (whether booked through agent or not)
# 0 -> Not booked by agent
# 1 -> Booked by agent
# data['agent'] = data['agent'].notnull().astype('int')

In [122]:
# Changing Comapny column to boolean (whether booked through company or not)
# 0 -> Not booked by company
# 1 -> Booked by company
# data['company'] = data['company'].notnull().astype('int')
# display(data.head())

In [123]:
# Removing country column for now or we can use target encoding
data = data.drop(['country'], axis=1)

# Removing agent and company columns since market_segment gives same information
data = data.drop(['agent','company'], axis=1)

# Removing all columns relating to arrival date except week number and year for now
data = data.drop(['arrival_date_month','arrival_date_day_of_month'], axis=1)


In [127]:
display(data.head())
print('Shape of data: ', data.shape)
data.describe()

print("No. of unique assigned room types: ", data['assigned_room_type'].nunique())
print("No. of unique customer types: ", data['customer_type'].nunique())
print("No. of unique deposit types: ", data['deposit_type'].nunique())
print("No. of unique distribution channels: ", data['distribution_channel'].nunique())
print("No. of unique market segments: ", data['market_segment'].nunique())
print("No. of unique meal types: ", data['meal'].nunique())
print("No. of unique reservation status types: ", data['reservation_status'].nunique())
print("No. of unique reserved room types: ", data['reserved_room_type'].nunique())

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,27,0,0,2,0.0,0,...,C,3,No Deposit,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,27,0,0,2,0.0,0,...,C,4,No Deposit,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,27,0,1,1,0.0,0,...,C,0,No Deposit,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,27,0,1,1,0.0,0,...,A,0,No Deposit,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,27,0,2,2,0.0,0,...,A,0,No Deposit,0,Transient,98.0,0,1,Check-Out,2015-07-03


Shape of data:  (119390, 27)
No. of unique assigned room types:  12
No. of unique customer types:  4
No. of unique deposit types:  3
No. of unique distribution channels:  5
No. of unique market segments:  8
No. of unique meal types:  5
No. of unique reservation status types:  3
No. of unique reserved room types:  10


In [130]:
# We can combine reserved_room_type and assigned_room_type columns into one that has boolean values for whether reserved and assigned matched
# 0 -> Reserved and assigned room types matched
# 1 -> Reserved and assigned room types didn't match
data['reserved_assigned_match'] = np.where(data['reserved_room_type'] == data['assigned_room_type'], 0, 1)
data = data.drop(['reserved_room_type','reserved_room_type'], axis=1)

In [134]:
# We can change the values in the meal types column. 0, 0.33, 0.67, 1 since the meal types are incremental
data['meal'].replace(['Undefined', 'SC', 'BB', 'HB', 'FB'], [0, 0, 0.33, 0.67, 1], inplace=True)

0         0.33
1         0.33
2         0.33
3         0.33
4         0.33
          ... 
119385    0.33
119386    0.33
119387    0.33
119388    0.33
119389    0.67
Name: meal, Length: 119390, dtype: float64

In [135]:
# We can remove the distribution channels column as market_segment column gives us same information with more granularity
data = data.drop(['distribution_channel'], axis=1)

In [148]:
# Deposit Type, Customer Type and Market Segment will be one hot encoded


# Reservation Status can be ignored since it is being predicted     -> classification
# Reservation Status can be ignored since we will only be training  -> regression
# on those points where the booking has been cancelled
data = data.drop(['reservation_status'], axis=1)

In [149]:
display(data.head())

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date,reserved_assigned_match
0,Resort Hotel,0,342,2015,27,0,0,2,0.0,0,...,C,3,No Deposit,0,Transient,0.0,0,0,2015-07-01,0
1,Resort Hotel,0,737,2015,27,0,0,2,0.0,0,...,C,4,No Deposit,0,Transient,0.0,0,0,2015-07-01,0
2,Resort Hotel,0,7,2015,27,0,1,1,0.0,0,...,C,0,No Deposit,0,Transient,75.0,0,0,2015-07-02,1
3,Resort Hotel,0,13,2015,27,0,1,1,0.0,0,...,A,0,No Deposit,0,Transient,75.0,0,0,2015-07-02,0
4,Resort Hotel,0,14,2015,27,0,2,2,0.0,0,...,A,0,No Deposit,0,Transient,98.0,0,1,2015-07-03,0


In [146]:
for column in data:
    #corr = data['is_canceled'].corr(data[str(column)])
    #print(f"Correlation b/w is_canceled and {column}: {corr}")

hotel
is_canceled
lead_time
arrival_date_year
arrival_date_week_number
stays_in_weekend_nights
stays_in_week_nights
adults
children
babies
meal
market_segment
is_repeated_guest
previous_cancellations
previous_bookings_not_canceled
assigned_room_type
booking_changes
deposit_type
days_in_waiting_list
customer_type
adr
required_car_parking_spaces
total_of_special_requests
reservation_status
reservation_status_date
reserved_assigned_match
