# Preprocessing

In [1]:
import pandas as pd

data = pd.read_csv('../data/hotel_bookings.csv')
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [2]:
missing_data_count = data.isnull().sum()
missing_data_percentage = round((missing_data_count / len(data)) * 100, 1)

missing_data_stats = pd.DataFrame({
    'Missing data (count)': missing_data_count,
    'Missing data (%)': missing_data_percentage
})

missing_data_stats.sort_values(by='Missing data (%)', ascending=False).head(5)

Unnamed: 0,Missing data (count),Missing data (%)
company,112593,94.3
agent,16340,13.7
country,488,0.4
hotel,0,0.0
previous_cancellations,0,0.0


In [3]:
data_preprocessed = data.copy()

# change type of children into int
data_preprocessed['children'] = data['children'].fillna(-1).astype(int)
data_preprocessed['children'] = [None if x == -1 else x for x in data_preprocessed['children']]

month_dict = {'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6, 'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12}
data_preprocessed['arrival_date_month'] = data_preprocessed['arrival_date_month'].map(month_dict)

data_preprocessed.drop(['agent', 'company'], axis=1, inplace = True)

data_preprocessed = pd.get_dummies(data_preprocessed, columns=['hotel'], drop_first=True)
data_preprocessed = pd.get_dummies(data_preprocessed, columns=['meal'], drop_first=True)
data_preprocessed = pd.get_dummies(data_preprocessed, columns=['country'], drop_first=True)
data_preprocessed = pd.get_dummies(data_preprocessed, columns=['market_segment'], drop_first=True)
data_preprocessed = pd.get_dummies(data_preprocessed, columns=['distribution_channel'], drop_first=True)

data_preprocessed['room_attribution'] = (data_preprocessed['reserved_room_type'] == data_preprocessed['assigned_room_type'])
data_preprocessed.drop(['reserved_room_type', 'assigned_room_type'], axis=1, inplace=True)

data_preprocessed = pd.get_dummies(data_preprocessed, columns=['deposit_type'], drop_first=True)
data_preprocessed = pd.get_dummies(data_preprocessed, columns=['customer_type'], drop_first=True)
data_preprocessed = pd.get_dummies(data_preprocessed, columns=['reservation_status'], drop_first=True)

data_preprocessed.drop(['reservation_status_date'], axis=1, inplace=True)
data_preprocessed.head()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,...,distribution_channel_TA/TO,distribution_channel_Undefined,room_attribution,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,reservation_status_Check-Out,reservation_status_No-Show
0,0,342,2015,7,27,1,0,0,2,0.0,...,False,False,True,False,False,False,True,False,True,False
1,0,737,2015,7,27,1,0,0,2,0.0,...,False,False,True,False,False,False,True,False,True,False
2,0,7,2015,7,27,1,0,1,1,0.0,...,False,False,False,False,False,False,True,False,True,False
3,0,13,2015,7,27,1,0,1,1,0.0,...,False,False,True,False,False,False,True,False,True,False
4,0,14,2015,7,27,1,0,2,2,0.0,...,True,False,True,False,False,False,True,False,True,False


In [4]:
correlation = data_preprocessed.drop('is_canceled', axis=1).corrwith(data_preprocessed['is_canceled']).apply(abs)
correlation = correlation.sort_values(ascending=False)
print(correlation)

reservation_status_Check-Out    1.000000
deposit_type_Non Refund         0.481457
country_PRT                     0.336122
lead_time                       0.293123
room_attribution                0.247770
                                  ...   
country_ETH                     0.000385
country_LIE                     0.000385
country_TMP                     0.000385
country_KWT                     0.000110
country_SVK                     0.000057
Length: 218, dtype: float64


In [5]:
data_preprocessed.drop(['reservation_status_Check-Out'], axis=1, inplace=True)

In [6]:
data_preprocessed.shape

(119390, 218)

In [7]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import warnings
warnings.filterwarnings('ignore')

imputer = IterativeImputer(n_nearest_features=10, random_state=42)
imputer.fit(data_preprocessed)
data_imputed = imputer.transform(data_preprocessed)
data_imputed = pd.DataFrame(data_imputed, columns=data_preprocessed.columns)

In [8]:
correlation = data_preprocessed.drop('is_canceled', axis=1).corrwith(data_preprocessed['is_canceled']).apply(abs)
correlation = correlation.sort_values(ascending=False)
print(correlation)

deposit_type_Non Refund      0.481457
country_PRT                  0.336122
lead_time                    0.293123
room_attribution             0.247770
total_of_special_requests    0.234658
                               ...   
country_ETH                  0.000385
country_LIE                  0.000385
country_TMP                  0.000385
country_KWT                  0.000110
country_SVK                  0.000057
Length: 217, dtype: float64


In [9]:
threshold = 0.1
columns_to_drop = correlation[correlation < threshold].index

In [10]:
data_filtered = data_imputed.drop(columns=columns_to_drop)

print(f'Columns dropped: {columns_to_drop}\n')
print(f'Elimated columns: {len(columns_to_drop)}')

Columns dropped: Index(['is_repeated_guest', 'market_segment_Corporate', 'country_ESP',
       'adults', 'previous_bookings_not_canceled', 'days_in_waiting_list',
       'country_NLD', 'country_BEL', 'adr', 'country_IRL',
       ...
       'country_BRA', 'country_BLR', 'country_CIV', 'country_KEN',
       'country_MNE', 'country_ETH', 'country_LIE', 'country_TMP',
       'country_KWT', 'country_SVK'],
      dtype='object', length=198)

Elimated columns: 198


In [11]:
data_filtered.shape

(119390, 20)

In [12]:
from utils import delete_multicollinearity

data_without_multicol = delete_multicollinearity(data_filtered, 'is_canceled', 10)

Dropped column distribution_channel_TA/TO with VIF: 11.6 (2s)


In [13]:
columns_to_keep = list(data_without_multicol.columns) + ['is_canceled']
data_ready = data_imputed[columns_to_keep]

In [14]:
data_ready.shape

(119390, 19)

In [15]:
from collections import Counter
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split

data_ready = data_ready.sample(frac=1, random_state=42)

X, y = data_ready.drop('is_canceled', axis=1), data_ready['is_canceled']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


ada = ADASYN(random_state=42)
X_train_res, y_train_res = ada.fit_resample(X_train, y_train)

print('Original train dataset shape {}'.format(Counter(y_train)))
print('Resampled train dataset shape {}'.format(Counter(y_train_res)))
print('Test dataset shape {}'.format(Counter(y_test)))

Original train dataset shape Counter({0.0: 5950, 1.0: 3601})
Resampled train dataset shape Counter({0.0: 5950, 1.0: 5846})
Test dataset shape Counter({0.0: 1478, 1.0: 910})


# Training

In [18]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

nb_col = X_train.shape[1]

dict_models = [
    {
        'name_clf' : 'Random Forest',
        'model': RandomForestClassifier(random_state=42),
        'scalers' : {
            'scaler': [None, MinMaxScaler(), StandardScaler()],
        },
        'grid' : {
            'model__n_estimators': [100, 200, 300],
            'model__max_depth': list(range(1, nb_col + 1)),
        },    
    },
    
    {
        'name_clf' : 'SVM',
        'model': SVC(random_state=42),
        'scalers' : {
            'scaler': [None, MinMaxScaler(), StandardScaler()],
        },
        'grid' : {
            'model__C': [10**k for k in range(-4, 5)],
            'model__kernel': ['linear', 'rbf', 'sigmoid'],
        },    
    },
]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from time import time
from utils import model_evaluation_clf

import warnings
warnings.filterwarnings("ignore")

results = pd.DataFrame(columns=["Model", "CPU time", "Accuracy", "Precision", "Recall", "f1-score", "AUC"])
models = {}
nb_res = 0

for i, dict_clf in enumerate(dict_models):
    model_name = dict_clf['name_clf']
    print(f'Training {model_name}...')

    model = dict_clf['model']

    steps = [
        ('scaler', None),
        ('model', model),
    ]

    pipeline = Pipeline(steps)

    param_grid = {
        **dict_clf['scalers'],
        **dict_clf['grid']
    }

    clf = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=-1)

    start_time = time()
    clf.fit(X_train_res, y_train_res)
    end_time = time()
    
    print(f"Best params {model_name}: \n{clf.best_params_}")

    eval = model_evaluation_clf(clf, X_test, y_test)
    
    models[model_name] = clf

    results.loc[nb_res] = [model_name, round(end_time - start_time, 1), eval['accuracy'], eval['precision'], eval['recall'], eval['f1'], eval['roc_auc']]
    nb_res += 1

    print(f"CPU Time: {round(end_time - start_time, 1)}s")
    print()

In [None]:
print(results.to_string(index=False))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

fig, axs = plt.subplots(1, 3, figsize=(20, 5))

for i, model_name in enumerate(models.keys()):
    model = models[model_name]
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', ax=axs[i//3, i%3])
    axs[i//3, i%3].set_title(model_name)


plt.show()