# Machine Learning Modelling

## Hotel Booking - Price Prediction

### Import Libraries

In [None]:
# General
import numpy as np
import pandas as pd

# EDA
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "whitegrid" theme to every figure drawn after this cell.
sns.set_theme(style="whitegrid")
# Eight-colour HLS palette kept for later plots (not referenced in the cells shown here).
colour_palette = sns.color_palette("hls", 8)

# Regression 
import statsmodels.api as sm
import statsmodels.formula.api as smf                 # for ols and logit
# from statsmodels.multivariate.pca import PCA

# Machine Learning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler      # Remove Mean and scale to Unit Variance
from sklearn.preprocessing import PowerTransformer    # Log Transformation
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

### Pre-Processing

In [None]:
## Importing Data
# `pd.read_csv()` needs a path or buffer; calling it with no argument raises TypeError.
# NOTE(review): the file name below is an assumption - point DATA_PATH at the actual bookings CSV.
DATA_PATH = "hotel_bookings.csv"
data_hotel_bookings = pd.read_csv(DATA_PATH)

## Understanding Data I
# Summary statistics for the numeric columns.
data_hotel_bookings.describe()

In [None]:
## Understanding Data II
# Show the dtype pandas inferred for each column.
column_dtypes = data_hotel_bookings.dtypes
print(column_dtypes)

In [None]:
## Check Missing Data
# Count nulls per column once, then keep only the columns that actually have gaps.
missing_counts = data_hotel_bookings.isna().sum()
missing_counts[missing_counts > 0]

In [None]:
## Fill Missing Data
# Columns are rebuilt and assigned back explicitly. The previous chained form
# (`df.col.fillna(..., inplace=True)`) operates on a temporary object under pandas
# Copy-on-Write and may leave the frame unchanged, and writing strings into the
# numeric agent/company columns through `.loc` is a deprecated incompatible-dtype set.

# Presence flags are computed before anything is overwritten.
has_agent = data_hotel_bookings["agent"].notna()
has_company = data_hotel_bookings["company"].notna()

data_hotel_bookings = data_hotel_bookings.assign(
    # For Children: a missing count is treated as no children.
    children=data_hotel_bookings["children"].fillna(0),
    # For Country: keep the row, label the origin as unknown.
    country=data_hotel_bookings["country"].fillna("Unknown"),
    # For Agent: collapse the identifier into a two-level flag.
    agent=np.where(has_agent, "Agent", "No Agent"),
    # For Company: collapse the identifier into a two-level flag.
    company=np.where(has_company, "Corporate", "No Company"),
)

# Re-Confirm: any column still listed here has remaining gaps.
remaining_missing = data_hotel_bookings.isnull().sum()
remaining_missing[remaining_missing > 0]

In [None]:
## Cleaning Data
# Split the bookings into one frame per hotel type using boolean masks.
is_resort_hotel = data_hotel_bookings["hotel"].eq("Resort Hotel")
is_city_hotel = data_hotel_bookings["hotel"].eq("City Hotel")
data_hotel_bookings_resort = data_hotel_bookings[is_resort_hotel]
data_hotel_bookings_city = data_hotel_bookings[is_city_hotel]

### Feature Engineering

In [None]:
## LabelEncoder / OneHotEncoder


In [None]:
## Principal Component Analysis (PCA)


In [None]:
## Correlation Matrix
# numeric_only=True: the frame still holds string columns (e.g. hotel, country, and the
# agent/company flags), and pandas >= 2.0 raises on them instead of dropping them silently.
data_hotel_bookings.corr(numeric_only=True)

In [None]:
## Correlation Matrix Heatmap
# numeric_only=True keeps corr() from raising on the string columns (pandas >= 2.0).
sns.heatmap(data_hotel_bookings.corr(numeric_only=True), cmap='YlGnBu')
plt.title('Correlation Matrix of Numeric Booking Features')
plt.show()

Observations

High Positive Correlation:
- is_canceled & lead_time
- stays_in_weekend_nights & stays_in_week_nights
- is_repeated_guest & bookings_not_cancelled
- agent & company

Moderate Positive Correlation:
- arrival_date_year & company
- arrival_date_year & adr
- adults & company
- adults & adr
- children & adr
- total_of_special_requests & adr

Moderate Negative Correlation:
- is_canceled & total_of_special_requests
- is_canceled & required_car_parking_spaces
- is_repeated_guest & company
- arrival_date_week_number & arrival_date_year
- is_repeated_guest & adr

In [None]:
## Feature Importance
# "Importance" here is the absolute Pearson correlation of each numeric column with adr.
# numeric_only=True keeps corr() from raising on the string columns (pandas >= 2.0).
corr_adr = data_hotel_bookings.corr(numeric_only=True)["adr"]
corr_adr.abs().sort_values(ascending = False)

Observations

Top 5 Features by Absolute Correlation with `adr`
1. children
2. adults
3. arrival_date_year
4. total_of_special_requests
5. is_repeated_guest

In [None]:
## Selecting Top 5 Features for Model Building
# Built as one chain so the result is a fresh frame: no in-place drop on a slice
# (SettingWithCopyWarning), no per-row Python loop, and the dropna() result is kept
# rather than discarded.
data_hotel_bookings_feature = (
    data_hotel_bookings[['adr', 'children', 'adults', 'arrival_date_year', 'total_of_special_requests', 'is_repeated_guest']]
    .loc[lambda frame: frame['adr'] > 0]   # keep only bookings with a positive rate
    .dropna()                              # drop rows with any remaining missing value
)
data_hotel_bookings_feature.head()

### Model Selection

In [None]:
## Evaluation Function (with different Metrics)
def evaluation(y_test, y_pred, average="binary"):
    """Print classification metrics for a set of predictions and plot the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_test``.
    average : str, default "binary"
        Averaging mode forwarded to ``precision_score``, ``recall_score`` and
        ``f1_score``. The default matches scikit-learn's own default and only
        supports two-class labels; pass e.g. "macro" or "weighted" for
        multi-class labels.

    Returns
    -------
    None
        Results are printed and the confusion matrix is drawn with matplotlib.

    Notes
    -----
    These are classification metrics; scikit-learn rejects continuous targets
    for them. Specificity, TN / (FP + TN), is not reported.
    """
    # Proportion of correct classifications: (TP + TN) / (TP + TN + FP + FN)
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    # Proportion of actual Yes out of predicted Yes: TP / (TP + FP)
    print("Precision: ", precision_score(y_test, y_pred, average=average))
    # Proportion of predicted Yes out of actual Yes: TP / (TP + FN)
    print("Sensitivity/Recall: ", recall_score(y_test, y_pred, average=average))
    # Harmonic mean of precision and sensitivity: (2 * P * R) / (P + R)
    print("F1 Score: ", f1_score(y_test, y_pred, average=average))
    print("Classification report: \n", classification_report(y_test, y_pred))
    ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred)).plot()
    plt.title('Confusion Matrix For Given Prediction')
    plt.show()

In [None]:
## Splitting Data
# Target is the adr column; every other selected column is a predictor.
x = data_hotel_bookings_feature.drop('adr', axis='columns')
y = data_hotel_bookings_feature['adr']
# 70/30 split with a fixed seed so the partition is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)
(x_train.shape, x_test.shape)

In [None]:
## Model 1: Naive Bayes (using GNB Technique)
# Assumes each parameter (Feature) has an independent capacity of predicting output variable
# NOTE(review): y_train is the `adr` column taken in the split cell. scikit-learn
# classifiers reject continuous targets, so confirm `adr` is discretised into classes
# (or switch this model to a regressor with regression metrics).

gnb = GaussianNB()
gnb_model = gnb.fit(x_train, y_train)
gnb_y_pred = gnb_model.predict(x_test)
# `ConfusionMatrix` is never imported in this notebook (NameError) - presumably a
# third-party visualizer. The notebook's own `evaluation` helper is used instead to
# report held-out metrics and draw the confusion matrix.
evaluation(y_test, gnb_y_pred)

In [None]:
## Model 2: Random Forest Classifier
# Combines output of multiple decision trees to reach a single result
# NOTE(review): y_train is the `adr` column taken in the split cell. scikit-learn
# classifiers reject continuous targets, so confirm `adr` is discretised into classes
# (or switch this model to a regressor with regression metrics).
rf = RandomForestClassifier(random_state=0)   # seeded like the train/test split, for repeatable results
rf_model = rf.fit(x_train, y_train)
rf_y_pred = rf_model.predict(x_test)
# Score the held-out predictions: evaluating on the training rows the forest has
# already seen overstates performance, and it left rf_y_pred unused.
evaluation(y_test, rf_y_pred)