<a href="https://colab.research.google.com/github/aflores/colab-notebooks/blob/master/hotel_booking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prerequisites

Install dependencies and define constants.

It is OK to ignore the following error message:

ERROR: pip's dependency resolver does not currently 

In [None]:
#
# ydata-profiling is a one-line Exploratory Data Analysis (EDA) tool (kinda like pandas describe() but on steroids)
# for now it is safe to disregard the 'visions' version error message
!pip install ydata-profiling
# Scikit  is an open source machine learning library that supports supervised and unsupervised learning. 
# It also provides various tools for model fitting, data preprocessing, model selection, model evaluation, 
# and many other utilities.
!pip install scikit-learn

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
# default size applied to every matplotlib figure created in this notebook
plt.rcParams.update({'figure.figsize': [12, 6]})

from ydata_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler


In [None]:
# Locations used by this notebook.
# NOTE(review): the relative './drive/...' prefix presumably expects Google Drive
# to be mounted in the working directory before this cell runs — confirm.
data_path = './drive/MyDrive/Colab Data/hotel-booking-data/'
bookings_file = data_path + "hotel-booking.csv"

# Load the data and do basic inspection

In [None]:
# Read the bookings CSV from disk into a pandas DataFrame named raw_bookings
raw_bookings = pd.read_csv(filepath_or_buffer=bookings_file)

In [None]:
# column information description: prints each column's name, non-null count
# and dtype, so columns with missing values stand out
raw_bookings.info()

In [None]:
# Summary statistics for the numeric columns, flipped so that each column
# of the data is shown as one row of the resulting table
numeric_summary = raw_bookings.describe()
numeric_summary.T

In [None]:
# Count the missing cells in every column
banner_rule = '-' * 40
print(f"{banner_rule}\nLook for nulls\nThis information is also in the Missing Values tab of the Profile Report\n{banner_rule}")
raw_bookings.isnull().sum()

# Feature Engineering

Based on the output from the report:

## Run a data profile report

In [None]:
# Build the profiling report object for the raw bookings.
# Be patient: the profiling might take a couple of minutes.
profile = ProfileReport(
    raw_bookings,
    title="Pandas Profiling Report",
)

In [None]:
# run and render: show the profile report inline in the notebook
profile.to_notebook_iframe()

## Columns of interest

In [None]:
# use this in case you need to reload the full dataFrame as you identify
# additional columns to add or drop
#
# raw_bookings = pd.read_csv(bookings_file)

In [None]:
# Columns carried forward from the raw data.
# 'is_canceled' is deliberately first: the train/test split cell further down
# takes column 0 as the label and every remaining column as a feature.
columns_to_include = [
    'is_canceled',
    'hotel',
    'lead_time',
    'arrival_date_year',
    'arrival_date_month',
    'arrival_date_day_of_month',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'reserved_room_type',
    'booking_changes',
    'deposit_type',
    'customer_type',
    'adr',
]

# Print the shapes before and after so the column reduction is visible
print(f'Raw file shape: {raw_bookings.shape}')
df_bookings = raw_bookings.loc[:, columns_to_include]
print(f'df_bookings {df_bookings.shape}')

## Drop outliers

In [None]:
# these are rows that look suspicious and you may want to eliminate
#
# raw_bookings[raw_bookings['adr'] > 500][['is_canceled','reservation_status','reservation_status_date']]
# adr outlier 5400 ?
#
# Keep only the bookings whose adr is below 600 (rows with a missing adr fail
# the comparison and are dropped as well).
# The explicit .copy() turns the filtered slice into an independent frame, so
# the column assignments in the following cells do not raise
# SettingWithCopyWarning.
df_bookings = df_bookings[df_bookings.adr < 600].copy()
print(f'Shape AFTER dropping {df_bookings.shape}')

# Missing Data and other transformations


In [None]:
# fill null values
#
# Missing 'children' values are treated as "no children" and the column is
# stored as integers.
df_bookings['children'] = df_bookings['children'].fillna(0).astype(int)

# 'agent' and 'company' are not among the columns copied into df_bookings, so
# they are read from raw_bookings, restricted to the rows that are still
# present in df_bookings (the two frames share index labels). A missing value
# becomes 0.
# Previously both columns were filled from 'children', which silently produced
# two duplicates of the children column.
# NOTE(review): agent/company look like identifier codes rather than
# quantities — confirm they should be fed to a model as plain integers.
rows_kept = df_bookings.index
df_bookings['agent'] = raw_bookings.loc[rows_kept, 'agent'].fillna(0).astype(int)
df_bookings['company'] = raw_bookings.loc[rows_kept, 'company'].fillna(0).astype(int)

In [None]:
# One-hot encodings: each column listed here is replaced by one indicator
# column per distinct value it contains
categorical_columns = [
    'hotel',
    'reserved_room_type',
    'deposit_type',
    'customer_type',
    'arrival_date_year',
    'arrival_date_month',
    'arrival_date_day_of_month',
]
df_bookings = pd.get_dummies(df_bookings, columns=categorical_columns)

In [94]:
# list the column names, non-null counts and dtypes of the prepared frame
df_bookings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119389 entries, 0 to 119389
Data columns (total 79 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   is_canceled                     119389 non-null  int64  
 1   lead_time                       119389 non-null  int64  
 2   stays_in_weekend_nights         119389 non-null  int64  
 3   stays_in_week_nights            119389 non-null  int64  
 4   adults                          119389 non-null  int64  
 5   children                        119389 non-null  int64  
 6   babies                          119389 non-null  int64  
 7   is_repeated_guest               119389 non-null  int64  
 8   previous_cancellations          119389 non-null  int64  
 9   previous_bookings_not_canceled  119389 non-null  int64  
 10  booking_changes                 119389 non-null  int64  
 11  adr                             119389 non-null  float64
 12  agent           

In [None]:
# Split the data
# Column 0 of df_bookings is the label; every remaining column is a feature.
y = df_bookings.iloc[:, 0].values
X = df_bookings.iloc[:, 1:].values

# Hold out 30% of the rows for testing; stratify on y and fix the seed so the
# split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

In [None]:
# NEXT Scaling data 


# Misc.

In [None]:
# Integer-encode the arrival month names.
# The names are read from raw_bookings: once the one-hot encoding cell has
# run, df_bookings no longer has an 'arrival_date_month' column, so indexing
# it there raises a KeyError.
# `preprocessing` is already imported in the import cell at the top of the
# notebook, so it is not imported again here.
# NOTE(review): LabelEncoder numbers the sorted class values, so the codes
# follow alphabetical order ('April' first), not calendar order — confirm
# that is acceptable for the intended use.
le = preprocessing.LabelEncoder()
le.fit_transform(raw_bookings['arrival_date_month'])

In [None]:
# Scratch queries kept for reference; all of them are commented out and only
# the head() preview on the last line actually runs.
#raw_bookings[raw_bookings.adults > 10][['is_canceled','hotel','adults']]
#sns.countplot(data=raw_bookings, x='hotel', hue='market_segment')
#sns.countplot(data=raw_bookings, x='is_repeated_guest', hue='is_canceled')
#raw_bookings[(raw_bookings.is_canceled == 1) & (raw_bookings.reservation_status != 'Canceled')]['reservation_status'].unique()
#raw_bookings[raw_bookings['agent'].isna()][['agent','company']]
raw_bookings.head()

In [None]:
%%script false --no-raise-error # do not process this cell
# The cell magic above passes this cell to the `false` command, so none of the
# Python below is executed while the magic line is in place.
#
# Count plot of 'Canceled' reservations whose status date contains '2015-12-',
# one bar per status date.
# Variant without the date filter:
#sns.countplot(data=raw_bookings[raw_bookings['reservation_status'] == 'Canceled'],x="reservation_status_date", hue="reservation_status")
status_is_canceled = raw_bookings['reservation_status'] == 'Canceled'
date_matches = raw_bookings['reservation_status_date'].str.contains('2015-12-')
sns.countplot(
    data=raw_bookings[status_is_canceled & date_matches],
    x="reservation_status_date",
    hue="reservation_status",
)

In [None]:
%%script false --no-raise-error
# The cell magic above passes this cell to the `false` command, so none of the
# Python below is executed while the magic line is in place.
#
# Count plot of 'Canceled' reservations whose status date contains '2015-12-1',
# one bar per status date.
canceled_rows = raw_bookings['reservation_status'] == 'Canceled'
date_rows = raw_bookings['reservation_status_date'].str.contains('2015-12-1')
sns.countplot(
    data=raw_bookings[canceled_rows & date_rows],
    x="reservation_status_date",
    hue="reservation_status",
)