# **Library**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Interactive table

from google.colab import data_table

data_table.enable_dataframe_formatter()

## **Dataset**

Download it [here](https://www.kaggle.com/datasets/jessemostipak/hotel-booking-demand)

## Input Data

In [2]:
from vega_datasets import data

cars = data.cars()
cars

Unnamed: 0,Name,Miles_per_Gallon,Cylinders,Displacement,Horsepower,Weight_in_lbs,Acceleration,Year,Origin
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,1970-01-01,USA
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,1970-01-01,USA
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,1970-01-01,USA
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,1970-01-01,USA
4,ford torino,17.0,8,302.0,140.0,3449,10.5,1970-01-01,USA
...,...,...,...,...,...,...,...,...,...
401,ford mustang gl,27.0,4,140.0,86.0,2790,15.6,1982-01-01,USA
402,vw pickup,44.0,4,97.0,52.0,2130,24.6,1982-01-01,Europe
403,dodge rampage,32.0,4,135.0,84.0,2295,11.6,1982-01-01,USA
404,ford ranger,28.0,4,120.0,79.0,2625,18.6,1982-01-01,USA


In [3]:
dataset = pd.read_csv('hotel_bookings.csv')
dataset.head()

FileNotFoundError: ignored

In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

### **Checking missing data**

In [None]:
# Percentage of missing data each column
dataset.isna().mean().round(3).mul(100)

hotel                              0.0
is_canceled                        0.0
lead_time                          0.0
arrival_date_year                  0.0
arrival_date_month                 0.0
arrival_date_week_number           0.0
arrival_date_day_of_month          0.0
stays_in_weekend_nights            0.0
stays_in_week_nights               0.0
adults                             0.0
children                           0.0
babies                             0.0
meal                               0.0
country                            0.4
market_segment                     0.0
distribution_channel               0.0
is_repeated_guest                  0.0
previous_cancellations             0.0
previous_bookings_not_canceled     0.0
reserved_room_type                 0.0
assigned_room_type                 0.0
booking_changes                    0.0
deposit_type                       0.0
agent                             13.7
company                           94.3
days_in_waiting_list     

### **Adding value in missing data**

In [None]:
dataset['children'] = dataset['children'].fillna(dataset['children'].median())

In [None]:
dataset['children'].isna().sum()

0

### **Changing data type in children column**

In [None]:
dataset['children'] = dataset['children'].astype('int')
dataset['children'].head()

0    0
1    0
2    0
3    0
4    0
Name: children, dtype: int64

### **Checking and removed duplicate values**

In [None]:
dataset.duplicated().sum()

31994

In [None]:
dataset1 = dataset.drop_duplicates()
dataset1.duplicated().sum()

0

In [None]:
dataset1.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


## **Transforming Data**

In [None]:
dataset1['total_penghuni'] = dataset1['adults'] + dataset1['children'] + dataset1['babies']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset1['total_penghuni'] = dataset1['adults'] + dataset1['children'] + dataset1['babies']


In [None]:
dataset1['total_penghuni']

0         2
1         2
2         1
3         1
4         2
         ..
119385    2
119386    3
119387    2
119388    2
119389    2
Name: total_penghuni, Length: 87396, dtype: int64

### **Doing encode**

In [None]:
dataset1['is_resort'] = pd.get_dummies(dataset1['hotel'], drop_first = True)
dataset1['is_resort'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset1['is_resort'] = pd.get_dummies(dataset1['hotel'], drop_first = True)


0    1
1    1
2    1
3    1
4    1
Name: is_resort, dtype: uint8

### **Discretized** (Making new category)

In [None]:
def lead_time_category(x):

  if x <= 30:
    return '1 month'
  elif x <= 60:
    return '2 month'
  else:
    return '3 month'

dataset1['lead_time_category'] = dataset1['lead_time'].apply(lead_time_category)
dataset1['lead_time_category']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset1['lead_time_category'] = dataset1['lead_time'].apply(lead_time_category)


0         3 month
1         3 month
2         1 month
3         1 month
4         1 month
           ...   
119385    1 month
119386    3 month
119387    2 month
119388    3 month
119389    3 month
Name: lead_time_category, Length: 87396, dtype: object

### **Changing category**

In [None]:
dataset1['co_status'] = dataset1['reservation_status'].map({'Check-Out' : 'Success',
                                                           'Cancelled' : 'Not Success',
                                                           'No Show' : 'Not Success'})

dataset1['co_status'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset1['co_status'] = dataset1['reservation_status'].map({'Check-Out' : 'Success',


0    Success
1    Success
2    Success
3    Success
4    Success
Name: co_status, dtype: object

### **Scaling the data**

In [None]:
dataset1['adr_scaled'] = dataset1[['adr']].apply(lambda x: (x - x.max()) / (x.max() - x.min()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset1['adr_scaled'] = dataset1[['adr']].apply(lambda x: (x - x.max()) / (x.max() - x.min()))


### **Aggregating the data**

In [None]:
by_country = dataset1.groupby('country')['lead_time', 'total_penghuni', 'adr'].mean()
by_country

  by_country = dataset1.groupby('country')['lead_time', 'total_penghuni', 'adr'].mean()


Unnamed: 0_level_0,lead_time,total_penghuni,adr
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABW,126.000000,2.500000,128.340000
AGO,23.897661,1.836257,117.970029
AIA,0.000000,4.000000,265.000000
ALB,93.272727,1.909091,85.203636
AND,47.000000,2.714286,202.652857
...,...,...,...
VGB,109.000000,2.000000,100.500000
VNM,84.125000,2.125000,123.267500
ZAF,69.871795,2.000000,111.055385
ZMB,68.500000,3.000000,101.885000


### **Pivot**

In [None]:
pivot_lead_time = dataset1.pivot_table(index = 'reservation_status',
                                       columns = 'hotel',
                                       values = 'lead_time',
                                       aggfunc = 'mean')

pivot_lead_time.head()

hotel,City Hotel,Resort Hotel
reservation_status,Unnamed: 1_level_1,Unnamed: 2_level_1
Canceled,104.133325,115.775616
Check-Out,67.387999,73.999115
No-Show,50.759358,60.011278


### **Melt**

In [None]:
pivot_lead_time.melt()

Unnamed: 0,hotel,value
0,City Hotel,104.133325
1,City Hotel,67.387999
2,City Hotel,50.759358
3,Resort Hotel,115.775616
4,Resort Hotel,73.999115
5,Resort Hotel,60.011278
