In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

In [2]:
data = pd.read_csv("hotel_bookings.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [4]:
#we have already performed EDA on this
#in this dataset let us just focus on Data Preprocessing

In [5]:
data.isnull().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

### Handling Missing Value

There are multiple ways using which we can handle Missing Data, we have already seen dropna and fillna Pandas method. But here we shall deal Missing Data using Imputer Technique. In this technique, the missing data is filled up or imputed by a suitable substitute and there are multiple strategies behind it. 

- Replace with Mean
- Replace with Median
- Replace with Constant
- Replace with Most frequent

In [6]:
imputer = SimpleImputer(strategy="most_frequent")
X_imputer = imputer.fit_transform(data)

#### Note: We are not updating X_imputer to the main dataset, as this is an overview of how to use Simple Imputer to deal with empty values. On Day 11: We shall look into this in practice while discussing Linear Regression

In [7]:
X_imputer

array([['Resort Hotel', 0, 342, ..., 0, 'Check-Out', '2015-07-01'],
       ['Resort Hotel', 0, 737, ..., 0, 'Check-Out', '2015-07-01'],
       ['Resort Hotel', 0, 7, ..., 0, 'Check-Out', '2015-07-02'],
       ...,
       ['City Hotel', 0, 34, ..., 4, 'Check-Out', '2017-09-07'],
       ['City Hotel', 0, 109, ..., 0, 'Check-Out', '2017-09-07'],
       ['City Hotel', 0, 205, ..., 2, 'Check-Out', '2017-09-07']],
      dtype=object)

In [8]:
numeric = []
category = []
for col in data.columns:
    if data[col].dtype == "O":
        category.append(col)
    else:
        numeric.append(col)

In [9]:
data[category]

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
0,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
1,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
2,Resort Hotel,July,BB,GBR,Direct,Direct,A,C,No Deposit,Transient,Check-Out,2015-07-02
3,Resort Hotel,July,BB,GBR,Corporate,Corporate,A,A,No Deposit,Transient,Check-Out,2015-07-02
4,Resort Hotel,July,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,August,BB,BEL,Offline TA/TO,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-06
119386,City Hotel,August,BB,FRA,Online TA,TA/TO,E,E,No Deposit,Transient,Check-Out,2017-09-07
119387,City Hotel,August,BB,DEU,Online TA,TA/TO,D,D,No Deposit,Transient,Check-Out,2017-09-07
119388,City Hotel,August,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-07


[![scale](https://i.stack.imgur.com/Z7ATR.png)](https://stackoverflow.com/questions/40758562/can-anyone-explain-me-standardscaler#40767144)

### Scale Numeric Data Type

We basically scale the Numeric data types i.e., int and float in range of 0-1, using Scling Techniques. We commonly us MinMaxScaler() and StandardScaler to scale numeric features in dataset

In [10]:
#initialize the scale methods that is supported in Sklearn

numeric_feature_scale_std = StandardScaler()
numeric_feature_scale_std_minmax = MinMaxScaler()

### Encoding Category Data Types

Category data types are objects. And Machine models are preferred to be trained with nnumeric value. But we do have Encoding Techniques in Sklearn and let us initialize those methods

In [11]:
#for encoding category data types we basically use LabelEncoder and OneHotEncoder
category_feature_encode_le = LabelEncoder()
category_feature_encode_one = OneHotEncoder()

#### Before Scaling

In [12]:
data[numeric]

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
0,0,342,2015,27,1,0,0,2,0.0,0,0,0,0,3,,,0,0.00,0,0
1,0,737,2015,27,1,0,0,2,0.0,0,0,0,0,4,,,0,0.00,0,0
2,0,7,2015,27,1,0,1,1,0.0,0,0,0,0,0,,,0,75.00,0,0
3,0,13,2015,27,1,0,1,1,0.0,0,0,0,0,0,304.0,,0,75.00,0,0
4,0,14,2015,27,1,0,2,2,0.0,0,0,0,0,0,240.0,,0,98.00,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,0,23,2017,35,30,2,5,2,0.0,0,0,0,0,0,394.0,,0,96.14,0,0
119386,0,102,2017,35,31,2,5,3,0.0,0,0,0,0,0,9.0,,0,225.43,0,2
119387,0,34,2017,35,31,2,5,2,0.0,0,0,0,0,0,9.0,,0,157.71,0,4
119388,0,109,2017,35,31,2,5,2,0.0,0,0,0,0,0,89.0,,0,104.40,0,0


`Standard Scaler` helps to get standardized distribution, with a zero mean and standard deviation of one (unit variance). It standardizes features by subtracting the mean value from the feature and then dividing the result by feature standard deviation.

In [13]:
data[numeric] = numeric_feature_scale_std.fit_transform(data[numeric])

#### After Scaling

In [14]:
data.sample(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
57719,City Hotel,1.303712,1.525216,-0.221286,September,0.943385,1.617366,0.072502,-0.262174,0.247897,...,Non Refund,-0.547903,,-0.131924,Transient,-0.135174,-0.254873,-0.720694,Canceled,2016-02-09
84490,City Hotel,-0.76704,-0.973319,-0.221286,February,-1.335176,1.275712,-0.92889,-1.31024,-1.478447,...,No Deposit,-0.701368,,-0.131924,Transient,-2.015038,-0.254873,0.540666,Check-Out,2016-02-27
71954,City Hotel,1.303712,-0.842309,1.192195,July,0.134863,0.022977,0.072502,-1.31024,0.247897,...,No Deposit,-0.701368,,-0.131924,Transient,-0.07581,-0.254873,0.540666,Canceled,2017-07-10
90402,City Hotel,-0.76704,0.814022,-0.221286,June,-0.306148,-1.457527,0.072502,-0.262174,0.247897,...,No Deposit,-0.755533,,-0.131924,Transient-Party,0.596983,-0.254873,-0.720694,Check-Out,2016-06-06
62059,City Hotel,1.303712,-0.79552,-0.221286,December,1.89891,1.731251,1.073895,-0.262174,0.247897,...,No Deposit,-0.701368,,-0.131924,Transient,0.17154,-0.254873,-0.720694,Canceled,2016-12-13
76439,City Hotel,1.303712,2.376776,-1.634768,October,1.016887,-0.660332,0.072502,-0.786207,0.247897,...,Non Refund,-0.773587,,-0.131924,Transient,-0.78818,-0.254873,-0.720694,Canceled,2015-07-23
117184,City Hotel,-0.76704,0.009251,1.192195,July,0.208365,1.161827,0.072502,0.785891,0.247897,...,No Deposit,-0.701368,,-0.131924,Transient,1.052108,-0.254873,0.540666,Check-Out,2017-07-31
113550,City Hotel,-0.76704,-0.758089,1.192195,June,-0.37965,-1.457527,1.073895,0.785891,0.247897,...,No Deposit,-0.701368,,-0.131924,Transient,0.93338,-0.254873,-0.720694,Check-Out,2017-06-09
43159,City Hotel,-0.76704,-0.262125,-1.634768,September,0.796381,0.022977,-0.92889,-0.262174,0.247897,...,No Deposit,-0.773587,,-0.131924,Transient-Party,-0.78818,-0.254873,-0.720694,Check-Out,2015-09-18
70368,City Hotel,1.303712,0.561362,1.192195,June,-0.232646,-0.090908,1.073895,1.309924,0.247897,...,No Deposit,-0.701368,,-0.131924,Transient,0.643683,-0.254873,-0.720694,Canceled,2017-02-01


#### Before Encoding

In [15]:
data[category]

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
0,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
1,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
2,Resort Hotel,July,BB,GBR,Direct,Direct,A,C,No Deposit,Transient,Check-Out,2015-07-02
3,Resort Hotel,July,BB,GBR,Corporate,Corporate,A,A,No Deposit,Transient,Check-Out,2015-07-02
4,Resort Hotel,July,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,August,BB,BEL,Offline TA/TO,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-06
119386,City Hotel,August,BB,FRA,Online TA,TA/TO,E,E,No Deposit,Transient,Check-Out,2017-09-07
119387,City Hotel,August,BB,DEU,Online TA,TA/TO,D,D,No Deposit,Transient,Check-Out,2017-09-07
119388,City Hotel,August,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-07


`Label Encoding` refers to converting the labels into a numeric form so as to convert them into the machine-readable form. Machine learning algorithms can then decide in a better way how those labels must be operated. It is an important pre-processing step for the structured dataset in supervised learning.

Suppose your features have data such as: "Male, Female, Not Preferred To Say", Label Encoder will convert it into "0,1, 2"

In [16]:
for i in category:
    data[i] = category_feature_encode_le.fit_transform(data[i].astype(str))

#### After Encoding

In [17]:
data[category]

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
0,1,5,0,135,3,1,2,2,0,2,1,121
1,1,5,0,135,3,1,2,2,0,2,1,121
2,1,5,0,59,3,1,0,2,0,2,1,122
3,1,5,0,59,2,0,0,0,0,2,1,122
4,1,5,0,59,6,3,0,0,0,2,1,123
...,...,...,...,...,...,...,...,...,...,...,...,...
119385,0,1,0,15,5,3,0,0,0,2,1,919
119386,0,1,0,56,6,3,4,4,0,2,1,920
119387,0,1,0,43,6,3,3,3,0,2,1,920
119388,0,1,0,59,6,3,0,0,0,2,1,920


### Updated Dataset

In [18]:
data.sample(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
111319,0,-0.76704,0.374205,1.192195,8,-0.673658,-1.115872,1.073895,-0.786207,0.247897,...,0,-0.313191,,-0.131924,3,-0.234115,-0.254873,0.540666,1,799
12727,1,1.303712,0.636224,1.192195,5,0.061361,-0.204793,1.073895,1.309924,1.974242,...,0,-0.647204,,-0.131924,2,1.2209,-0.254873,1.802026,0,695
13280,1,1.303712,-0.149832,1.192195,1,0.355369,-1.001987,0.072502,0.261858,0.247897,...,0,1.383958,,-0.131924,2,3.169428,-0.254873,-0.720694,0,805
69266,0,1.303712,0.748518,1.192195,8,-0.37965,1.389597,1.073895,0.261858,0.247897,...,1,-0.611094,,-0.131924,2,0.557407,-0.254873,-0.720694,0,634
13777,1,1.303712,-0.056254,1.192195,1,0.575875,1.617366,-0.92889,0.261858,0.247897,...,0,1.383958,,-0.131924,2,2.14045,-0.254873,0.540666,0,893
47728,0,-0.76704,-0.458639,-0.221286,7,-1.261674,-1.685297,-0.92889,0.785891,0.247897,...,0,-0.701368,,-0.131924,2,-0.469988,-0.254873,0.540666,1,369
7498,1,1.303712,-0.439924,-0.221286,1,0.428871,-0.432562,0.072502,-0.262174,0.247897,...,0,1.383958,,-0.131924,2,2.358118,-0.254873,-0.720694,0,497
37351,1,-0.76704,0.392921,1.192195,6,-0.306148,-0.660332,1.073895,0.261858,0.247897,...,0,1.383958,,-0.131924,2,0.458467,-0.254873,3.063386,1,836
1680,1,-0.76704,-0.954603,-1.634768,11,0.722879,-0.774217,-0.92889,0.261858,0.247897,...,0,1.383958,,-0.131924,2,0.874015,3.821932,-0.720694,1,194
6761,1,1.303712,0.514573,-0.221286,6,-0.012141,1.617366,0.072502,0.261858,0.247897,...,0,1.383958,,-0.131924,2,0.572248,-0.254873,0.540666,0,359


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  int32  
 1   is_canceled                     119390 non-null  float64
 2   lead_time                       119390 non-null  float64
 3   arrival_date_year               119390 non-null  float64
 4   arrival_date_month              119390 non-null  int32  
 5   arrival_date_week_number        119390 non-null  float64
 6   arrival_date_day_of_month       119390 non-null  float64
 7   stays_in_weekend_nights         119390 non-null  float64
 8   stays_in_week_nights            119390 non-null  float64
 9   adults                          119390 non-null  float64
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  float64
 12  meal            

As you can see there is no object data type now

#### We shall use MinMax and One Hot Encoder in different examples.

In part two we shall see One Hot Encoder and MinMax along with Feature Selection and Data Splitting. 

### Summary:

Before we train and predict our Machine Learning Models, here are the few steps you need to follow:
- EDA

#### Data Preprocess

- Handle Missing Data
- Scaling Numeric Data Type
- Encoding Category Data Type