# Project 2 - Part II: Classification Task

### Notebook 1: Hard & Soft Voting Classifier

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score

### Load data

In [2]:
# read the revised hotel bookings CSV (path is relative to the working directory)
csv_path = r'revised_hotel_df.csv'
hotel_df = pd.read_csv(csv_path)

# (rows, columns) of the loaded frame -- last expression, shown as the cell output
hotel_df.shape

(115459, 20)

In [3]:
# display the column names of the loaded frame
hotel_df.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_month',
       'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'meal',
       'country', 'distribution_channel', 'is_repeated_guest',
       'previous_cancellations', 'previous_bookings_not_canceled',
       'assigned_room_type', 'booking_changes', 'deposit_type',
       'days_in_waiting_list', 'customer_type', 'adr', 'under_18'],
      dtype='object')

### Evaluation Metric Decision

From Project 1, we decided that the chosen evaluation metric is recall. The goal is to produce a model with a __high recall__ rate.

### Data Preparation

In [4]:
    # One-hot encode every categorical column in a single pass.
    # Keys are the source columns, values are the prefixes used for the generated
    # dummy columns; the dict order fixes the order in which the dummy groups are appended.
dummy_prefixes = {
    'hotel': 'hotel',
    'arrival_date_month': 'month',
    'meal': 'meal',
    'country': 'country',
    'distribution_channel': 'distr',
    'assigned_room_type': 'room',
    'deposit_type': 'deposit',
    'customer_type': 'cust',
}
hotel_df = pd.get_dummies(hotel_df, columns=list(dummy_prefixes), prefix=dummy_prefixes)

Column rearrangement

In [5]:
# Move two columns to fixed positions (in place): each column is popped out of the
# frame and re-inserted at the given index, in this order.
# NOTE(review): presumably this keeps the numeric columns contiguous for the
# position-based scaling done later -- verify against the scaling cell.
for position, column in ((5, 'under_18'), (11, 'is_repeated_guest')):
    moved = hotel_df.pop(column)
    hotel_df.insert(position, column, moved)

The Hard Voting Classifier requires a lot of computing power. Due to limited resources, we are forced to reduce the sample size to 10,000 rows. As before, we check that the proportion of cancelled reservations in this sample (~37.6%) is about the same as in the original dataset (~37%).

In [6]:
# independent (deep) copy of the encoded frame
# NOTE(review): the sampling cell that follows reassigns hotel_df_sample, so this
# copy appears to be unused -- confirm and remove.
hotel_df_sample = hotel_df.copy(deep=True)

In [7]:
# draw a reproducible random subsample of 10,000 rows, then renumber the index from 0
sampled_rows = hotel_df.sample(n=10000, random_state=8860)
hotel_df_sample = sampled_rows.reset_index(drop=True)

# class counts of the target in the subsample -- last expression, shown as the cell output
hotel_df_sample['is_canceled'].value_counts()

0    6240
1    3760
Name: is_canceled, dtype: int64

In [8]:
# target: the cancellation flag; features: every remaining column
y = hotel_df_sample['is_canceled']
X = hotel_df_sample.drop(columns=['is_canceled'])

Train-test split & Scaling

In [9]:
# 80/20 train/test split, stratified on the target so both parts keep the same
# class proportions; random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [10]:
    # Standardize the numeric features to zero mean / unit variance.
    # Note: StandardScaler is NOT robust to outliers (its mean and standard deviation
    # are both affected by them); it is used here to put the numeric features on a
    # comparable scale for the models that are fitted on X_train below.
scaler = StandardScaler()

    # Only the first 10 feature columns are scaled (the numerical ones); the
    # one-hot encoded columns that follow are left untouched. The labels are resolved
    # once so train and test are guaranteed to use the same columns.
numeric_cols = list(X_train.columns[:10])

    # Work on explicit copies: the frames returned by train_test_split are derived
    # from X, and writing into them directly can trigger SettingWithCopy behaviour.
X_train = X_train.copy()
X_test = X_test.copy()

    # Fit the scaler on the training data only, then apply the same transform to
    # the test data. Assigning by column label replaces the columns outright, so
    # integer columns become float instead of having floats written into an
    # integer block through .iloc.
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

## Machine Learning Algorithms

### Model 1: Logistic Regression

In [11]:
    # logistic regression with the best hyperparameter found in Project 1 (C=100);
    # random_state is fixed for reproducibility
logreg_params = {'C': 100, 'random_state': 123}
logreg = LogisticRegression(**logreg_params)

    # train on the training split; the fitted estimator is the cell output
logreg.fit(X_train, y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=123, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

### Model 2: Kernelized SVM (poly)

In [12]:
    # polynomial-kernel SVM with the best hyperparameters found in Project 1;
    # probability=True is needed so the soft Voting Classifier can be run with this model
svm_params = dict(kernel='poly', degree=3, C=0.001, gamma=1,
                  probability=True, random_state=123)

    # build and fit on the training split in one step (fit returns the estimator,
    # which is what gets bound to SVM2)
SVM2 = SVC(**svm_params).fit(X_train, y_train)

### Model 3: Decision Trees

In [13]:
    # decision tree with the best hyperparameters found in Project 1
dtree = DecisionTreeClassifier(
    max_depth=5,
    max_leaf_nodes=9,
    min_samples_split=2,
    random_state=123,
)

    # fit on the training split; the fitted estimator is the cell output
dtree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=9,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

Now that we've fit all three models on the training set, we can move on to the ensemble (voting) classifiers.

### Hard Voting Classifier

In [14]:
# (name, estimator) pairs for the three base models combined by the ensemble
hard_voting_estimators = [
    ('logreg', logreg),
    ('SVM, kernel: poly', SVM2),
    ('Dtree', dtree),
]

# hard-voting ensemble; n_jobs=-1 asks scikit-learn to use all available cores
hard_voting_clf = VotingClassifier(
    estimators=hard_voting_estimators,
    voting='hard',
    n_jobs=-1,
)

# fit on the training split; the fitted ensemble is the cell output
hard_voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('logreg',
                              LogisticRegression(C=100, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=123,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('SVM, kernel: poly',
                              SVC(C=0.001, cache_size=200, class_weight=None,
                                  coef0=0.0, de...
                                  shrinking=True, tol=0.001, verbose=False)),
                             (

### Soft Voting Classifier

In [15]:
# (name, estimator) pairs for the three base models combined by the ensemble
soft_voting_estimators = [
    ('logreg', logreg),
    ('SVM, kernel: poly', SVM2),
    ('Dtree', dtree),
]

# soft-voting ensemble; n_jobs=-1 asks scikit-learn to use all available cores
soft_voting_clf = VotingClassifier(
    estimators=soft_voting_estimators,
    voting='soft',
    n_jobs=-1,
)

# fit on the training split; the fitted ensemble is the cell output
soft_voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('logreg',
                              LogisticRegression(C=100, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=123,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('SVM, kernel: poly',
                              SVC(C=0.001, cache_size=200, class_weight=None,
                                  coef0=0.0, de...
                                  shrinking=True, tol=0.001, verbose=False)),
                             (

#### All Accuracy Scores

In [16]:
# Test-set accuracy of every model.
# All five estimators were already fitted on X_train in their own cells, so they
# are only used for prediction here; refitting them (in particular the SVC and
# the two ensembles) would repeat the most expensive work in the notebook.
for clf in (logreg, SVM2, dtree, hard_voting_clf, soft_voting_clf):
    # Both ensembles share the class name "VotingClassifier"; append the voting
    # mode so the hard and soft rows can be told apart in the printed output.
    label = clf.__class__.__name__
    voting_mode = getattr(clf, 'voting', None)
    if voting_mode is not None:
        label = '{} ({})'.format(label, voting_mode)

    y_pred = clf.predict(X_test)
    print(label, accuracy_score(y_test, y_pred))

LogisticRegression 0.789
SVC 0.801
DecisionTreeClassifier 0.7565
VotingClassifier 0.8015
VotingClassifier 0.7975


#### All Recall Scores

In [17]:
# Test-set recall (the chosen evaluation metric) of every model.
# All five estimators were already fitted on X_train in their own cells, so they
# are only used for prediction here; refitting them (in particular the SVC and
# the two ensembles) would repeat the most expensive work in the notebook.
for clf in (logreg, SVM2, dtree, hard_voting_clf, soft_voting_clf):
    # Both ensembles share the class name "VotingClassifier"; append the voting
    # mode so the hard and soft rows can be told apart in the printed output.
    label = clf.__class__.__name__
    voting_mode = getattr(clf, 'voting', None)
    if voting_mode is not None:
        label = '{} ({})'.format(label, voting_mode)

    y_pred = clf.predict(X_test)
    print(label, recall_score(y_test, y_pred))

LogisticRegression 0.5585106382978723
SVC 0.6023936170212766
DecisionTreeClassifier 0.586436170212766
VotingClassifier 0.574468085106383
VotingClassifier 0.535904255319149


### Model Summaries & Best Model Selection

|   | Model Name | Recall Score | 
| - | ---------- | ----------- | 
| 1 | Logistic Regression  |  0.5585 |
| 2 | SVM, kernel='poly'  | 0.6024 | 
| 3 | Decision Tree  | 0.5864 | 
| 4 | Hard Voting Classifier  | 0.5745 |
| 5 | Soft Voting Classifier  | 0.5359 |

Amongst the recall scores, SVM with poly kernel gives the highest performance. Neither of the Voting Classifiers beats the best individual model: the Hard Voting Classifier only outperforms Logistic Regression, while the Soft Voting Classifier has the lowest recall of all five models.