# <div style="color:white;display:fill;border-radius:5px;background-color:#0E2031;letter-spacing:0.5px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Data Preparation</p></div> 

- **Data Preparation**
    - Libraries
    - Loading Data
    - Encoding
    - Normalization
    - Standardzation
    - Feature Selection

## <div style="color:white;display:fill;border-radius:5px;background-color:#2E6B8E;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Libraries</p></div>

In [1]:
# Basic Tools
import pandas as pd
import numpy as np
#from datetime import datetime as dt

# Visualization Tools
# from matplotlib import pyplot as plt
# %matplotlib inline
# import seaborn as sns

## <div style="color:white;display:fill;border-radius:5px;background-color:#2E6B8E;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Loading Data</p></div>

In [4]:
ROOT_DIR = '/home/alysson/projects/Hotel-Booking-Cancelations'
data = pd.read_csv(f'{ROOT_DIR}/data/data_processed/data_processed.csv')

In [17]:
data_prep = data.copy()
data_prep.shape

(87245, 37)

In [18]:
data_prep.isna().sum().sum()

1552

## <div style="color:white;display:fill;border-radius:5px;background-color:#2E6B8E;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Encoding</p></div>

In [19]:
## CATEGORICAL FEATURES
hotel_dict = {'City Hotel': 0,  'Resort Hotel':1}
data_prep['hotel'] = data_prep['hotel'].map(hotel_dict) 

meal_dict = {'SC': 0,  'HB': 1, 'BB': 2, 'FB':3}
data_prep['meal'] = data_prep['meal'].map(meal_dict)

continentes_dict = {'Unknow':-1,'Native': 0,'Europe': 1, 'Asia': 2, 'North America':3, 'South America':4, 'Oceania':5, 'Africa':6 }
data_prep['continentes'] = data_prep['continentes'].map(continentes_dict) 

market_segment_dict = {'Undefined':-1,'Online TA': 0,'Offline TA/TO': 1, 'Groups': 2, 'Corporate':3, 'Direct':4, 'Aviation':5, 'Complementary':6}
data_prep['market_segment'] = data_prep['market_segment'].map(market_segment_dict)

distribution_dict = {'Undefined':-1,'TA/TO': 0,'Direct': 1, 'Corporate': 2, 'GDS':3}
data_prep['distribution_channel'] = data_prep['distribution_channel'].map(distribution_dict) 

customer_type_dict = {'Transient': 0,'Transient-Party': 1, 'Contract': 2, 'Contract':3, 'Group':4}
data_prep['customer_type'] = data_prep['customer_type'].map(customer_type_dict)

In [20]:
## NUMERICAL FEATURES

data_prep['previous_cancellations'] = data_prep['previous_cancellations'].apply(lambda x: 2 if (x >= 2) else x)
data_prep['previous_bookings_not_canceled'] = data_prep['previous_bookings_not_canceled'].apply(lambda x: 2 if (x >= 2) else x)
data_prep['booking_changes'] = data_prep['booking_changes'].apply(lambda x: 2 if (x >= 2) else x)

n = 20
top_agents = data_prep['agent'].value_counts().nlargest(n).index
top_companies = data_prep['company'].value_counts().nlargest(n).index
data_prep['agent'] = np.where(data_prep['agent'].isin(top_agents), data_prep['agent'], -1)
data_prep['company'] = np.where(data_prep['company'].isin(top_companies), data_prep['company'], -1)

## <div style="color:white;display:fill;border-radius:5px;background-color:#2E6B8E;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Feature Selection</p></div>

There are some features that we will **not use under any circumstances** in our machine learning model:

**reservation_status:** Data leakage<br>
**reservation_status_date:** Data Leakage<br>
**deposit_type:** The dataset seems to contain several wrong values about the type of deposit and its relationship with cancellations<br>


The remaining features were selected based on various tests that are not described on the notebook, with a focus on performance and simplicity:

**adults, children, babies:** Replaced by "people"<br>
**stays_in_weekend_nights, stays_in_week_nights:** Replaced with days_stay<br>
**country:** Replaced by continentes 


In [21]:
data_prep.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date', 'people', 'kids',
       'days_stay', 'country_name', 'continentes'],
      dtype='object')

In [22]:
SELECTED_FEATURES = ['hotel',
                     #'is_canceled',
                     'lead_time',
                     'arrival_date_year',   
                     #'arrival_date_month',
                     'arrival_date_week_number',
                     #'arrival_date_day_of_month',
                     #'stays_in_weekend_nights',                    
                     #'stays_in_week_nights',
                     #'adults',
                     #'children',
                     #'babies',
                     'meal',
                     #'country',
                     'market_segment',
                     'distribution_channel',
                     'is_repeated_guest',
                     'previous_cancellations',
                     #'assigned_room_type',
                     'previous_bookings_not_canceled',
                     #'reserved_room_type',
                     'booking_changes',
                     #'deposit_type',
                     'agent',
                     'company',
                     #'days_in_waiting_list',
                     'customer_type',
                     'adr',
                     'required_car_parking_spaces',
                     #'reservation_status',
                     #'reservation_status_date',
                     'people',
                     #'kids',
                     'days_stay',
                     #'country_name',
                     'continentes']


TARGET = 'is_canceled'

DROP_FEATURES = [value for value in data if value != 'is_canceled' and value not in data[SELECTED_FEATURES]]

In [23]:
data_prep[SELECTED_FEATURES]

Unnamed: 0,hotel,lead_time,arrival_date_year,arrival_date_week_number,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,customer_type,adr,required_car_parking_spaces,people,days_stay,continentes
0,1,342,2015,27,2,4,1,0,0,0,2,0.0,0.0,0,0.00,0,2.0,0,0.0
1,1,737,2015,27,2,4,1,0,0,0,2,0.0,0.0,0,0.00,0,2.0,0,0.0
2,1,7,2015,27,2,4,1,0,0,0,0,0.0,0.0,0,75.00,0,1.0,1,1.0
3,1,13,2015,27,2,3,2,0,0,0,0,-1.0,0.0,0,75.00,0,1.0,1,1.0
4,1,14,2015,27,2,0,0,0,0,0,0,240.0,0.0,0,98.00,0,2.0,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87240,0,23,2017,35,2,1,0,0,0,0,0,-1.0,0.0,0,96.14,0,2.0,7,1.0
87241,0,102,2017,35,2,0,0,0,0,0,0,9.0,0.0,0,225.43,0,3.0,7,1.0
87242,0,34,2017,35,2,0,0,0,0,0,0,9.0,0.0,0,157.71,0,2.0,7,1.0
87243,0,109,2017,35,2,0,0,0,0,0,0,-1.0,0.0,0,104.40,0,2.0,7,1.0


## <div style="color:white;display:fill;border-radius:5px;background-color:#2E6B8E;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Normalization</p></div>

In [24]:
log_columns = data_prep[SELECTED_FEATURES].skew().sort_values(ascending=False)
log_columns = log_columns.loc[log_columns > 0.75]
print('Columns/Skew\n',log_columns)    

Columns/Skew
 adr                               11.007980
people                            10.515798
previous_cancellations             8.314032
company                            7.857856
previous_bookings_not_canceled     5.264496
is_repeated_guest                  4.793307
required_car_parking_spaces        3.489034
customer_type                      3.232569
days_stay                          2.946994
continentes                        2.454346
booking_changes                    2.215153
distribution_channel               2.062676
lead_time                          1.430968
market_segment                     1.305475
agent                              1.289132
dtype: float64


In [25]:
# for col in log_columns.index:
#     data_prep[col] = np.log1p(data_prep[col])

In [26]:
# rbs = RobustScaler()
# for col in data_prep[selected_features]:
#     data_prep[col] = rbs.fit_transform(data_prep[[col]]).squeeze()  

In [27]:
y = data_prep[TARGET]
X = data_prep[SELECTED_FEATURES]