# Project 2 - Part II: Classification Task

### Notebook 4: PCA & Reapplying Project 1 Models

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

from sklearn.decomposition import PCA
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

### Load data

In [2]:
hotel_df = pd.read_csv(r'revised_hotel_df.csv')
hotel_df.shape

(115459, 20)

In [3]:
hotel_df.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_month',
       'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'meal',
       'country', 'distribution_channel', 'is_repeated_guest',
       'previous_cancellations', 'previous_bookings_not_canceled',
       'assigned_room_type', 'booking_changes', 'deposit_type',
       'days_in_waiting_list', 'customer_type', 'adr', 'under_18'],
      dtype='object')

### Evaluation Metric Decision

From Project 1, we decided that the chosen evaluation metric is recall. The goal is the produce a model with a __high recall__ rate.

### Data Preparation

In [4]:
    # one hot encode the categorical variables
hotel_df = pd.get_dummies(hotel_df, columns = ['hotel'], prefix='hotel')
hotel_df = pd.get_dummies(hotel_df, columns = ['arrival_date_month'], prefix='month')
hotel_df = pd.get_dummies(hotel_df, columns = ['meal'], prefix='meal')
hotel_df = pd.get_dummies(hotel_df, columns = ['country'], prefix='country')
hotel_df = pd.get_dummies(hotel_df, columns = ['distribution_channel'], prefix='distr')
hotel_df = pd.get_dummies(hotel_df, columns = ['assigned_room_type'], prefix='room')
hotel_df = pd.get_dummies(hotel_df, columns = ['deposit_type'], prefix='deposit')
hotel_df = pd.get_dummies(hotel_df, columns = ['customer_type'], prefix='cust')
#hotel_df.info()

Column rearrangement

In [5]:
hotel_df.insert(5, 'under_18', hotel_df.pop('under_18'))
hotel_df.insert(11, 'is_repeated_guest', hotel_df.pop('is_repeated_guest'))
#hotel_knn.info()

Forced to take smaller sample of data, due to computing constraints.

In [6]:
#hotel_df_sample_10k = hotel_df.sample(n=10000, random_state=321).reset_index(drop=True)

#X = hotel_df_sample_10k.drop('is_canceled', axis=1)
#y = hotel_df_sample_10k['is_canceled']
#y.shape

X = hotel_df.drop('is_canceled', axis=1)
y = hotel_df['is_canceled']

Train-test split.

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 0, test_size = 0.2)

Scale the data.

In [8]:
    # Standard Scaler is usually preferred b/c helps you account for outliers & keeps dispersion
scaler = StandardScaler()

    # fit_transform for train data set, but just the numerical columns, not one-hot encoded columns
X_train.iloc[ : , 0:10] = scaler.fit_transform(X_train.iloc[ : , 0:10])
X_test.iloc[ : , 0:10] = scaler.transform(X_test.iloc[ : , 0:10])

# ready data set at this point: hotel_df

## PCA Dimensionality Reduction

We want a PCA that helps us define 95% of the variance in our original data set.

In [9]:
pca = PCA(n_components=0.95)

    # fit & transform this PCA onto the original training data set
X_train_reduced = pca.fit_transform(X_train)

    # just transform onto the original test data set
X_test_reduced = pca.transform(X_test)
#X_train_reduced

## Machine Learning Algorithms

### Model 1: KNN Classification

In [10]:
    # Not doing grid search due to computing constraints
    # Manually performed KNN for n_neighbors = 3,5,7,9,11
    # found that 7 gives good recall score, while also keeping difference between train & test scores low
np.random.seed(0)
knn = KNeighborsClassifier(n_neighbors=7)

In [11]:
   # fit to train model
knn.fit(X_train_reduced, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

In [12]:
y_pred_knn = knn.predict(X_test_reduced)

print('KNN after PCA, recall score: {:.4f}'.format(recall_score(y_test, y_pred_knn)))
#print('KNN after PCA, accuracy score: {:.4f}'.format(accuracy_score(y_test, y_pred_knn)))

KNN after PCA, recall score: 0.6739


In [13]:
print('KNN after PCA, train score: {:.4f}'.format(knn.score(X_train_reduced, y_train)))
print('KNN after PCA, test score: {:.4f}'.format(knn.score(X_test_reduced, y_test)))

KNN after PCA, train score: 0.8517
KNN after PCA, test score: 0.8043


KNN after PCA gives among the highest but not the highest, recall scores we've generated so far.

### Model 2: Logistic Regression

In [14]:
    # instantiate 
logreg = LogisticRegression(random_state = 321, C = 100)

    # fit to training set
logreg.fit(X_train_reduced, y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=321, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
y_pred_lr = logreg.predict(X_test_reduced)

print("Logistic Regression after PCA, Recall score: {:.4f}".format(recall_score(y_test, y_pred_lr)))
#print("Logistic Regression after PCA, Accuracy score: {:.4f}".format(accuracy_score(y_test, y_pred_lr)))

Logistic Regression after PCA, Recall score: 0.5250


### Model 3: Linear SVM

In [16]:
    # instantiate
linSVM = LinearSVC(random_state = 0, penalty='l2', C=100, dual=False)

    # fit to train model
linSVM.fit(X_train_reduced, y_train)

LinearSVC(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)

##### Linear SVM Model Evaluation

In [17]:
   # creating y_pred for use with model evaluation calculations
y_pred_linsvm = linSVM.predict(X_test_reduced)
confusion_matrix(y_test, y_pred_linsvm)

array([[13554,   884],
       [ 4331,  4323]], dtype=int64)

In [18]:
print("Linear SVM after PCA, Recall score: {:.4f}".format(recall_score(y_test, y_pred_linsvm)))
#print("Linear SVM after PCA, Accuracy score: {:.4f}".format(accuracy_score(y_test, y_pred_linsvm)))

Linear SVM after PCA, Recall score: 0.4995


### Model 4: Decision Trees

In [19]:
    # instantiate
dtree = DecisionTreeClassifier(random_state=0, max_depth=5, max_leaf_nodes=9, min_samples_split=2)

    # fit to train model
dtree.fit(X_train_reduced, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=9,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [20]:
   # creating y_pred for use with model evaluation calculations
y_pred_dtree = dtree.predict(X_test_reduced)

In [21]:
print("Decision Tree after PCA, Recall score: {:.4f}".format(recall_score(y_test, y_pred_dtree)))
#print("Decision Tree after PCA, Accuracy score: {:.4f}".format(accuracy_score(y_test, y_pred_dtree)))

Decision Tree after PCA, Recall score: 0.2922


### Model 5: Kernelized SVM (rbf, poly, linear)

Forced to take smaller sample of data, due to computing constraints.

In [22]:
hotel_df_sample = hotel_df.sample(n=10000, random_state=321).reset_index(drop=True)

In [23]:
X = hotel_df_sample.drop('is_canceled', axis=1)
y = hotel_df_sample['is_canceled']
#y.shape

Train-test split.

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 0, test_size = 0.2)

Scale the data.

In [25]:
    # Standard Scaler is usually preferred b/c helps you account for outliers & keeps dispersion
scaler = StandardScaler()

    # fit_transform for train data set, but just the numerical columns, not one-hot encoded columns
X_train.iloc[ : , 0:10] = scaler.fit_transform(X_train.iloc[ : , 0:10])
X_test.iloc[ : , 0:10] = scaler.transform(X_test.iloc[ : , 0:10])

# ready data set at this point: hotel_df

### Dimensionality Reduction for 10000 sample data set

In [26]:
pca = PCA(n_components=0.95)

X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)
#X_train_reduced

#### 5a: SVM (kernel = 'rbf')

In [27]:
    # instantiate
svm_rbf = SVC(kernel='rbf', gamma=0.1, C=1)

    # fit to train model
svm_rbf.fit(X_train_reduced, y_train)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [28]:
y_pred_svm1 = svm_rbf.predict(X_test_reduced)

print("SVM (rbf kernel) after PCA, Recall score: {:.4f}".format(recall_score(y_test, y_pred_svm1)))
#print("SVM (rbf kernel) after PCA, Accuracy score: {:.4f}".format(accuracy_score(y_test, y_pred_svm1)))

SVM (rbf kernel) after PCA, Recall score: 0.5106


#### 5b: SVM (kernel = 'poly')

In [29]:
    # instantiate
svm_poly = SVC(kernel='poly', gamma=1, C=0.001, degree=3)

    # fit to train model
svm_poly.fit(X_train_reduced, y_train)

SVC(C=0.001, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [30]:
y_pred_svm2 = svm_poly.predict(X_test_reduced)

print("SVM (poly kernel) after PCA, Recall score: {:.4f}".format(recall_score(y_test, y_pred_svm2)))
#print("SVM (poly kernel) after PCA, Accuracy score: {:.4f}".format(accuracy_score(y_test, y_pred_svm2)))

SVM (poly kernel) after PCA, Recall score: 0.5053


#### 5c: SVM (kernel = 'linear')

In [31]:
    # instantiate
svm_lin = SVC(kernel='linear', gamma=0.001, C=1000)

    # fit to train model
svm_lin.fit(X_train_reduced, y_train)

SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [32]:
y_pred_svm3 = svm_rbf.predict(X_test_reduced)
confusion_matrix(y_test, y_pred_svm3)

print("SVM (linear kernel) after PCA, Recall score: {:.4f}".format(recall_score(y_test, y_pred_svm3)))
#print("SVM (linear kernel) after PCA, Accuracy score: {:.4f}".format(accuracy_score(y_test, y_pred_svm3)))

SVM (linear kernel) after PCA, Recall score: 0.5106


### Recall Score Summaries

|   | Model Name | Recall Score | Bagging Recall Score | Pasting Score | Adaboosting Recall Score | PCA Recall Score |
| - | ---------- | -----------  | -------------------- | ------------- | ------------------------ | ---------------- |
| 1 |   KNN      |      0.5319 |    NA   |    NA   |  NA   | 0.6739 |
| 2 | Logistic Regression    | 0.5362 | 0.5161 | 0.5158 | 0.5178 | 0.5250 |
| 3 | Linear SVM  | 0.5174 |  NA   |    NA   |  NA   | 0.4995 |
| 4 | Decision Tree  | 0.5767 | 0.4011 | 0.4018 | 0.6826 | 0.2922 |
| 5a |  SVM, kernel='rbf'   | 0.4691 |  NA   |    NA   |  NA   | 0.5106 |
| 5b |  SVM, kernel='poly'  | 0.5556 |  NA   |    NA   |  NA   | 0.5053 |
| 5c |  SVM, kernel='linear' | 0.5185 |  NA   |    NA   |  NA   | 0.5106 |
| 6 | Hard Voting Classifier  | 0.5745 |  NA   |    NA   |  NA   |  NA  |
| 7 | Soft Voting Classifier  | 0.5359 |  NA   |    NA   |  NA   |  NA  |
| 8 | Gradient Boosting  | 0.7061 |  NA   |    NA   |  NA   |  NA  |

Surprisingly, Decision Tree after PCA performs far worse than all the other models. In addition, all the SVM models also perform slightly worse after PCA.

Overall, PCA does not lead to better recall scores.

The best model, still is the Gradient Boosting Classifier.