In [44]:
import pandas as pd
import os

import sys
sys.path.append(os.path.join('..','src'))

# remove warnings
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

from feature_engineering import HotelBookingFeatures
from data_preprocessing import DataPreprocessor
from utils import DataLoader

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

In [37]:
data_folder = os.path.join('..','data')
file_to_open = os.path.join(data_folder,"hotel_bookings.csv")
# load the data using the DataLoader class
data_loader = DataLoader()
hotel_bookings = data_loader.load_data(file_to_open)
preprocess  = DataPreprocessor()
hotel_bookings = preprocess.fit_transform(hotel_bookings)
hotel_booking_features = HotelBookingFeatures()
hotel_bookings = hotel_booking_features.fit_transform(hotel_bookings)

Converting date columns to datetime...


100%|██████████| 3/3 [00:00<00:00, 63.75it/s]


Date columns converted to datetime.

Dropping rows with missing values...


100%|██████████| 1/1 [00:00<00:00,  8.86it/s]


5 rows dropped from 150000.

Dropping duplicate rows...


100%|██████████| 1/1 [00:00<00:00, 10.12it/s]


33 rows dropped from 149995.

Dropping rows with negative adr...


100%|██████████| 1/1 [00:00<00:00, 43.38it/s]


67 rows dropped from 149962.

Encoding categorical variables...


100%|██████████| 10/10 [00:00<00:00, 149.32it/s]


Categorical variables encoded.

hotel_name encoding:
Original values: {0: 'Algarve Retreat', 1: 'Braga City Hotel', 2: 'Duro Valley Resort', 3: 'Lisbon City Hotel', 4: 'Porto City Hotel'}
meal encoding:
Original values: {0: 'BB', 1: 'HB', 2: 'SC', 3: 'Undefined', 4: 'FB'}
source_country encoding:
Original values: {0: 'PRT', 1: 'FRA', 2: 'JPN', 3: 'DEU', 4: 'GBR', 5: 'ESP', 6: 'POL', 7: 'BRA', 8: 'AUT', 9: 'FIN', 10: 'NLD', 11: 'CPV', 12: 'ITA', 13: 'CHN', 14: 'GRC', 15: 'ARG', 16: 'IRL', 17: 'RUS', 18: 'CHE', 19: 'BEL', 20: 'USA', 21: 'THA', 22: 'LTU', 23: 'TWN', 24: 'BHR', 25: 'CN', 26: 'SAU', 27: 'AGO', 28: 'NOR', 29: 'LUX', 30: 'EST', 31: 'ROU', 32: 'SWE', 33: 'MKD', 34: 'ISR', 35: 'ZAF', 36: 'COL', 37: 'MEX', 38: 'OMN', 39: 'GIB', 40: 'MNE', 41: 'AND', 42: 'AUS', 43: 'DNK', 44: 'IRN', 45: 'CZE', 46: 'KOR', 47: 'KEN', 48: 'MYS', 49: 'SVN', 50: 'SMR', 51: 'UKR', 52: 'BGR', 53: 'SRB', 54: 'TUN', 55: 'TUR', 56: 'LVA', 57: 'HRV', 58: 'HUN', 59: 'IDN', 60: 'DZA', 61: 'BLR', 62: 'ATA', 63

In [38]:
columns_to_keep = ['hotel_name', 'guest_type',
       'customer_type', 'company', 'adr', 'adults', 'children',
       'babies', 'meal', 'market_segment','distribution_channel',
       'assigned_room_type','is_weekend_stay', 'num_days_stayed',
       'booking_lead_time', 'arrival_dayofweek', 'arrival_month',
       'arrival_weekofyear']

In [39]:
X = hotel_bookings[columns_to_keep]
y = hotel_bookings['is_canceled']


In [41]:
y[y==1].shape[0]/y.shape[0] * 100

33.98445578571667

In [46]:
import numpy as np
# define models to test:
base_models = [("DT_model", DecisionTreeClassifier(random_state=42)),
                ("RF_model", RandomForestClassifier(random_state=42,n_jobs=-1)),
                #("LR_model", LogisticRegression(random_state=42,n_jobs=-1)),
                ("XGB_model", XGBClassifier(random_state=42, n_jobs=-1))]

# split data into 'kfolds' parts for cross validation,
# use shuffle to ensure random distribution of data:
kfolds = 5 # 4 = 80% train, 20% validation
split = KFold(n_splits=kfolds, shuffle=True, random_state=42)

# do a grid search for each model:
for name, model in base_models:
    # define hyperparameters to search for each model:
    if name == "DT_model":
        param_grid = {"max_depth": [None, 5, 10],
                      "min_samples_leaf": [1, 5, 10],
                      "max_features": ["auto", "sqrt", "log2"]}
    elif name == "RF_model":
        param_grid = {"n_estimators": [100, 200, 300],
                      "max_depth": [None, 5, 10],
                      "min_samples_leaf": [1, 5],
                      "max_features": ["auto", "sqrt"]}
    elif name == "LR_model":
        param_grid = {"C": [0.1, 1, 10, 100],
                      "solver": ["liblinear"]}
    elif name == "XGB_model":
        param_grid = {"n_estimators": [100, 200, 300],
                      "max_depth": [None, 5, 10],
                      "learning_rate": [0.1, 0.01, 0.001]}
    
    # do the grid search:
    grid_search = GridSearchCV(model, 
                               param_grid=param_grid,
                               cv=split,
                               scoring="accuracy",
                               n_jobs=-1,
                               verbose=1)
    grid_search.fit(X, y)
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Best score for {name}: {grid_search.best_score_}")
    print(f"Best estimator for {name}: {grid_search.best_estimator_}")
    print("")
# Preprocessing, fitting, making predictions and scoring for every model:
for name, model in base_models:
    
    # get cross validation score for each model:
    cv_results = cross_val_score(model, 
                                 X, y, 
                                 cv=split,
                                 scoring="accuracy",
                                 n_jobs=-1)
    # output:
    min_score = round(min(cv_results), 3)
    max_score = round(max(cv_results), 3)
    mean_score = round(np.mean(cv_results), 3)
    std_dev = round(np.std(cv_results), 3)
    print(f"{name} cross validation accuracy score: {mean_score} +/- {std_dev} (std) min: {min_score}, max: {max_score}")
    


Fitting 5 folds for each of 27 candidates, totalling 135 fits


Best parameters for DT_model: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 10}
Best score for DT_model: 0.7082224223623202
Best estimator for DT_model: DecisionTreeClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=10,
                       random_state=42)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
