In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

In [2]:
df = pd.read_csv('train.csv', index_col=0)
df.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


## 5. Pre-processing
<br>
<div style="text-align: justify">
    As part of data mining, data pre-processing techniques are required to transform the raw data into a more useful and efficient format. Therefore, we will be leveraging on techniques like label encoding on our data before we proceed with training our model. <br><br>
    Considering that our data is relatively high dimensional and that 6 variables are highly correlated based on the correlation results derived from EDA, principal component analysis (PCA) will be used. However, after comparing the performances between models with PCA and without PCA, it is noted that models with PCA generally do not improve the performance. Hence, we will further extend the models without PCA along with GridSearchCV to fine-tune the hyperparameters of the models.
</div>

### 5.1 Feature Engineering - Label Encoding
<br>
<div style="text-align: justify">
    Even though some machine learning algorithms can work with categorical variables directly, it is recommended to convert them into numerical variables. Thus, label encoding will be used. <br><br>
    In this case, all of the variables that are categorical such as Gender, Customer Type, Type of Travel, as well as Class will be converted to numerical for both of the training and testing.
</div>

In [3]:
def transform_class(val):
        if val ==  'Eco':
            return 0
        elif val == 'Eco Plus':
            return 1
        else:
            return 2

In [4]:
for column in df.columns:
    if column == 'Class':
        df['Class'] = df['Class'].apply(transform_class)
    elif df[column].dtype == type(object):
        #Create the label encoder
        le = preprocessing.LabelEncoder()
        #Convert the non numeric data to numeric
        df[column] = le.fit_transform(df[column])

In [5]:
df.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,1,0,13,1,1,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,0
1,5047,1,1,25,0,2,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,0
2,110028,0,0,26,0,2,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,1
3,24026,0,0,25,0,2,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,0
4,119299,1,0,61,0,2,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,1


In [6]:
df_test = pd.read_csv('test.csv', index_col=0)
df_test.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,3,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,3,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,2,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,0,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,4,...,2,2,2,2,4,2,4,0,20.0,satisfied


In [7]:
for column in df_test.columns:
    if column == 'Class':
        df_test['Class'] = df_test['Class'].apply(transform_class)
    elif df_test[column].dtype == type(object):
        #Create the label encoder
        le = preprocessing.LabelEncoder()
        #Convert the non numeric data to numeric
        df_test[column] = le.fit_transform(df_test[column])

In [8]:
df_test.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,19556,0,0,52,0,0,160,5,4,3,...,5,5,5,5,2,5,5,50,44.0,1
1,90035,0,0,36,0,2,2863,1,1,3,...,4,4,4,4,3,4,5,0,0.0,1
2,12360,1,1,20,0,0,192,2,0,2,...,2,4,1,3,2,2,2,0,0.0,0
3,77959,1,0,44,0,2,3377,0,0,0,...,1,1,1,1,3,1,4,0,6.0,1
4,36875,0,0,49,0,0,1182,2,3,4,...,2,2,2,2,4,2,4,0,20.0,1


### 5.2 Defining Training and Testing Set Data

<br>
<div style="text-align: justify">
    Since both training and testing dataset are provided, we do not need to use train_test_split to segregate our data. <br><br>
    In this case, "satisfaction" is the target/dependent (Y) variable while the rest of the variables are independent (X) variables which will be used to determine the Y variable for both training and testing dataset.
</div>

In [9]:
X_train = df[['Gender','Age','Type of Travel', 'Class','Inflight wifi service', 
              'Departure/Arrival time convenient',
              'Ease of Online booking','Food and drink', 
              'Online boarding','Seat comfort', 
              'Inflight entertainment', 
              'On-board service','Leg room service', 
              'Baggage handling', 'Checkin service',
              'Inflight service', 'Cleanliness']]

#target variable
y_train = df["satisfaction"]


X_test = df_test[['Gender','Age','Type of Travel', 'Class','Inflight wifi service', 
              'Departure/Arrival time convenient',
              'Ease of Online booking','Food and drink', 
              'Online boarding','Seat comfort', 
              'Inflight entertainment', 
              'On-board service','Leg room service', 
              'Baggage handling', 'Checkin service',
              'Inflight service', 'Cleanliness']]


y_test = df_test["satisfaction"]

## 6. Model Selection & Training


<div style="text-align: justify">
Several machine learning classification algorithms will be studied in predicting airlines' passengers satisfication. For all of the models, GridSearchCV will be used to fine-tune the hyperparameters with the aim to improve the performance of the models. <br>

The following are the models and ensemble methods (bagging and boosting) that will be studied: <br>
<b> 1. Classification Model </b> - kNN, Decision Tree, Naive Bayes, Support Vector Machine (SVM) with Linear, SVM with RBF, Logistic Regression, Logistic Regression with Stacking Classifier <br>
<b> 2. Bagging </b> - Random Forest and Extremely Randomized Trees <br>
<b> 3. Boosting </b> - AdaBoost, XGBoost, GradientBoost, CatBoost and LighGBM
</div>

### 6.1 kNN w/o GridSearchCV

<br>
<div style="text-align: justify">
    kNN is an algorithm that relies on finding similar records in the training dataset, with the assumption that similar records exist in close proximity. The distance between records will be computed first using measures such as Euclidean distance. By using K value, we will then look for records in our training data that are similar and assign the record to a class based on the majority votes of the neighbours (Shmueli, Bruce, Gedeck, & Patel, 2019).
</div>


In [10]:
from sklearn.neighbors import KNeighborsClassifier

model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_train, y_train)

KNeighborsClassifier()

In [11]:
y_pred = model_knn.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[13855   718]
 [ 1497  9906]]


In [12]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  13855
False Positive:  718
False Negative:  1497
True Positive:  9906


In [13]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9147289805974745
Precision: 0.9324171686746988
Recall: 0.8687187582215207
F-measure: 0.8994415944068643


### 6.2 KNN with GridSearchCV

Tuned Parameters: {'n_neighbors': 7}

In [14]:
from sklearn.model_selection import GridSearchCV   

param_grid = {
    "n_neighbors" :[6, 7, 8, 9]
}

In [15]:
model_knn1 = KNeighborsClassifier()
grid = GridSearchCV(model_knn1, param_grid, n_jobs=-1, cv=10)                  
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': [6, 7, 8, 9]})

In [16]:
print('Best parameters found by grid search are:', grid.best_params_)

Best parameters found by grid search are: {'n_neighbors': 7}


In [17]:
y_pred = grid.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[13882   691]
 [ 1504  9899]]


In [18]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  13882
False Positive:  691
False Negative:  1504
True Positive:  9899


In [19]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9154989220819217
Precision: 0.9347497639282342
Recall: 0.8681048846794703
F-measure: 0.9001955167553313


#### Key Takeaways
<br>
<div style="text-align: justify">
    Tuned parameter produced similar accuracy (91.5%) and f-measure (90%), thus suggesting that GridSearchCV was not effective in producing better results for kNN model.
</div>

### 6.3 Naive Bayes w/o GridSearchCV
<br>
<div style="text-align: justify">
    Naive Bayes is a probabilistic algorithm that is typically used for classification problems with an assumption that features are independent and equal. After inferring probability distribution that was generated by the labelled data for each label, bayes rules will be applied to estimate the probabilities for the unlabeled data. It is useful for very large datasets and is known to outperform even highly sophisticated classification methods (Ray, 2017).
</div>

In [24]:
from sklearn.naive_bayes import MultinomialNB

model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)

MultinomialNB()

In [25]:
y_pred = model_nb.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[10667  3906]
 [ 2104  9299]]


In [26]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  10667
False Positive:  3906
False Negative:  2104
True Positive:  9299


In [27]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.7686325839236218
Precision: 0.7042029534267323
Recall: 0.815487152503727
F-measure: 0.7557704811443434


### 6.4 Naive Bayes with GridSearchCV

Tuned Parameters: {'alpha': 14.0}

In [28]:
param_grid = {'alpha': [14.0, 14.2, 14.5]}

model_nb1 = MultinomialNB()
grid = GridSearchCV(model_nb1, param_grid, n_jobs=-1, cv=10)

grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=MultinomialNB(), n_jobs=-1,
             param_grid={'alpha': [14.0, 14.2, 14.5]})

In [29]:
print('Best parameters found by grid search are:', grid.best_params_)

Best parameters found by grid search are: {'alpha': 14.0}


In [30]:
y_pred = grid.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[10668  3905]
 [ 2105  9298]]


In [31]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  10668
False Positive:  3905
False Negative:  2105
True Positive:  9298


In [32]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.7686325839236218
Precision: 0.7042338862379762
Recall: 0.8153994562834341
F-measure: 0.7557506299276598


#### Key Takeaways
<br>
<div style="text-align: justify">
    Despite tuning 'alpha' parameter with several different values in finding the optimal results, Naives Bayes with GridSearchCV has similar accuracy (76.9%) and f-measure (75.6%) with model without GridSearchCV. This could be due to the number of cross validation used which resulted in slightly lesser correctly classified records.
</div>

### 6.5 Decision Tree w/o GridSearchCV
<br>
<div style="text-align: justify">
    Decision tree classifies data items by posing a series of questions about the features associated with the items. Each question is contained in a node, and every internal node points to one child node for each possible answer. Decision trees are sometimes more interpretable than other classifiers such as SVM as they combine simple questions about the data in an understandable way (Kingsford and Salzberg, 2009). 
</div>

In [33]:
from sklearn.tree import DecisionTreeClassifier

model_dt = DecisionTreeClassifier(random_state=42) 
model_dt.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [34]:
y_pred = model_dt.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[13721   852]
 [  764 10639]]


In [35]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  13721
False Positive:  852
False Negative:  764
True Positive:  10639


In [36]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9377887280566677
Precision: 0.9258550169698024
Recall: 0.9330000876962203
F-measure: 0.9294138202149035


### 6.6 Decision Tree with GridSearchCV

Tuned Parameters: {'criterion': 'gini', 'max_depth': 15}

In [50]:
param_grid = {'max_depth': [10, 15, 20, 25, 40, 50, 60,80],
              'criterion': ['gini','entropy']}

model_dt1 = DecisionTreeClassifier(random_state=42)
grid = GridSearchCV(model_dt1, param_grid, n_jobs=-1, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=42),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [10, 15, 20, 25, 40, 50, 60, 80]})

In [51]:
print('Best parameters found by grid search are:', grid.best_params_)

Best parameters found by grid search are: {'criterion': 'gini', 'max_depth': 15}


In [52]:
y_pred = grid.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[14077   496]
 [  826 10577]]


In [53]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  14077
False Positive:  496
False Negative:  826
True Positive:  10577


In [54]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9491068678780412
Precision: 0.9552063578072789
Recall: 0.9275629220380601
F-measure: 0.9411817049297028


#### Key Takeaways
<br>
<div style="text-align: justify">
    Given entropy and gini, gini is the preferred criterion and it produces a higher accuracy (94.9%) and f-measure (95.5%) as compared to decision tree without GridSearchCV.
</div>

### 6.7 RandomForest w/o GridSearchCV
<br>
<div style="text-align: justify">
Unlike decision trees which are prone to overfitting, random forest is effective in providing useful information on the importance of each variable, thus determining those that have the greatest impact on the dependent variable. The training data will be divided into different sets of data and for each set, a decision tree will be constructed. Since each  tree is trained in parallel, random forests are efficient on large data sets (Cui, 2018).
</div>

In [46]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [47]:
y_pred = model_rf.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[14194   379]
 [  710 10693]]


In [48]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  14194
False Positive:  379
False Negative:  710
True Positive:  10693


In [49]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9580766861718509
Precision: 0.9657695086705202
Recall: 0.9377356835920372
F-measure: 0.9515461624026696


### 6.8 RandomForest with GridSearchCV

Tuned Parameters: {'max_depth': 35, 'min_samples_split': 5, 'n_estimators': 250}

In [55]:
param_grid = {'n_estimators': [150, 250, 350],
              'max_depth' : [35, 45, 55],
              'min_samples_split' : [5, 15, 20]}

model_rf1 = RandomForestClassifier(random_state=42)
grid = GridSearchCV(model_rf1, param_grid, n_jobs=-1, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [35, 45, 55],
                         'min_samples_split': [5, 15, 20],
                         'n_estimators': [150, 250, 350]})

In [56]:
print('Best parameters found by grid search are:', grid.best_params_)

Best parameters found by grid search are: {'max_depth': 35, 'min_samples_split': 5, 'n_estimators': 250}


In [57]:
y_pred = grid.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[14187   386]
 [  702 10701]]


In [58]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  14187
False Positive:  386
False Negative:  702
True Positive:  10701


In [59]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9581151832460733
Precision: 0.9651844502570578
Recall: 0.9384372533543804
F-measure: 0.951622943530458


In [60]:
# Predict X_train
X_train_pred = grid.predict(X_train)

# Accuracy against y_train
train_accuracy = accuracy_score(y_train, X_train_pred)

print("Train Error: ", 1 - train_accuracy)
print("Test Error: ", 1 - accuracy)

Train Error:  0.008883199876809411
Test Error:  0.041884816753926746


#### Key Takeaways
<br>
<div style="text-align: justify">
    By increasing the number of trees in the forest, from 100 (default) to 350, it ensures that more sub-samples/observations of the dataset will be taken into account which thus helps to improve the predictive accuracy. Moreover, with larger depth of each tree, more splits are involved and hence, able to capture more information about the data. However, having more trees can be computationally intensive and might overfit.
</div>

### 6.9 Extremely Randomnized Tree w/o GridSearchCV
<br>
<div style="text-align: justify">
This tree-based ensemble method consists of randomizing strongly both attribute and cut-point choice when spliting a tree node. One of the strengths of  this technique is its computational efficiency (Geurts, Ernst and Wehenkel, 2006).
</div>

In [61]:
from sklearn.ensemble import ExtraTreesClassifier

model_ert = ExtraTreesClassifier(random_state=42)
model_ert.fit(X_train, y_train)

ExtraTreesClassifier(random_state=42)

In [62]:
y_pred = model_ert.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[14148   425]
 [  722 10681]]


In [63]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  14148
False Positive:  425
False Negative:  722
True Positive:  10681


In [64]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9558438558669541
Precision: 0.9617323969025752
Recall: 0.9366833289485224
F-measure: 0.9490426051801503


### 6.10 Extremely Randomnized Tree with GridSearchCV

Tuned Parameters: {'max_depth': 20, 'n_estimators': 210}

In [66]:
param_grid = {"n_estimators": [130, 140, 150, 160,210],
              "max_depth": [10, 20, 22, 24]}
              #"min_samples_split": [5, 10, 15, 20],
              #"min_samples_leaf": [1, 2, 5, 20]

model_ert1 = ExtraTreesClassifier(random_state=42)
grid = GridSearchCV(model_ert1, param_grid, n_jobs=-1, cv=10)                  
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=ExtraTreesClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [10, 20, 22, 24],
                         'n_estimators': [130, 140, 150, 160, 210]})

In [67]:
print('Best parameters found by grid search are:', grid.best_params_)

Best parameters found by grid search are: {'max_depth': 20, 'n_estimators': 210}


In [68]:
y_pred = grid.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[14179   394]
 [  710 10693]]


In [69]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  14179
False Positive:  394
False Negative:  710
True Positive:  10693


In [70]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9574992300585156
Precision: 0.9644628844592766
Recall: 0.9377356835920372
F-measure: 0.9509115162294353


In [71]:
# Predict X_train
X_train_pred = grid.predict(X_train)

# Accuracy against y_train
train_accuracy = accuracy_score(y_train, X_train_pred)

print("Train Error: ", 1 - train_accuracy)
print("Test Error: ", 1 - accuracy)

Train Error:  0.009768632583923575
Test Error:  0.04250076994148444


#### Key Takeaways
<br>
<div style="text-align: justify">
    After fitting the decision tree with a range of values of 10, 20, 22 and 24 for maximum depth parameter, it is noted that max_depth of 20 would provide the optimal result. While increasing the n_estimators parameter, it is necessary to ensure that it does not reduce the performance of the model. Hence, the optimal n_estimators from a range of values is found to be 210.
</div>

### 6.11 AdaBoost w/o GridSearchCV
<br>
<div style="text-align: justify">
AdaBoost is one of the most popular boosting algorithms in customer churn prediction (Clemente, Giner-Bosch and San Matías, n.d.). Observations which are misclassified will be assigned higher weights to increase its chances of being correctly classified. The use of multiple weak models, where decision boundaries results are combined, will gradually improve the accuracy of the models (Rahman, Irfan, Raza, Ghori, Yaqoob & Awais, 2020).
</div>

In [72]:
from sklearn.ensemble import AdaBoostClassifier

model_ada = AdaBoostClassifier(random_state=42)
model_ada.fit(X_train, y_train)

AdaBoostClassifier(random_state=42)

In [73]:
y_pred = model_ada.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[13532  1041]
 [ 1198 10205]]


In [74]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  13532
False Positive:  1041
False Negative:  1198
True Positive:  10205


In [75]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9138050508161379
Precision: 0.907433754223724
Recall: 0.8949399280890994
F-measure: 0.9011435383460639


### 6.12 AdaBoost with GridSearchCV

Tuned Parameters: {'n_estimators': 280}

In [76]:
param_grid = {'n_estimators': [30, 40, 50, 80, 90, 100, 130, 180, 280, 450, 500]}

model_ada1 = AdaBoostClassifier(random_state=42) 
grid = GridSearchCV(model_ada1, param_grid, n_jobs=-1, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=AdaBoostClassifier(random_state=42), n_jobs=-1,
             param_grid={'n_estimators': [30, 40, 50, 80, 90, 100, 130, 180,
                                          280, 450, 500]})

In [77]:
print('Best parameters found by grid search are:', grid.best_params_)

Best parameters found by grid search are: {'n_estimators': 280}


In [78]:
y_pred = grid.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[13520  1053]
 [ 1143 10260]]


In [79]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  13520
False Positive:  1053
False Negative:  1143
True Positive:  10260


In [80]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9154604250076994
Precision: 0.9069212410501193
Recall: 0.8997632202052092
F-measure: 0.9033280507131537


#### Key Takeaways
<br>
<div style="text-align: justify">
    After fitting adaboost model with a wide range of values for n_estimators parameter, it is noted that n_estimators of 280 would provide the optimal result. As compared to the other adaboost parameters, n_estimators is an important parameter that can aid in improving the performance of the model by changing the number of base models. This explains why only n_estimators parameter is being included.
</div>

### 6.13 XGBoost w/o GridSearchCV
<br>
<div style="text-align: justify">
XGBoost is highly scalable, quick to execute and typically outperforms other algorithms (Hirko, 2019). It uses a histogram based algorithm to filter the observations used for splitting and leverages multiple machine learning algorithms to improve the accuracy of predictions. In comparison to LightGBM and CatBoost, the computation time for XGBoost is longer and hence higher complexity (Rahman, Irfan, Raza, Ghori, Yaqoob & Awais, 2020). 
</div>

In [83]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier(random_state=42)
model_xgb.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [84]:
y_pred = model_xgb.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[14191   382]
 [  717 10686]]


In [85]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  14191
False Positive:  382
False Negative:  717
True Positive:  10686


In [86]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9576917154296274
Precision: 0.9654860860137333
Recall: 0.9371218100499868
F-measure: 0.9510925192470294


### 6.14 XGBoost with GridSearchCV

Tuned Parameters: {'gamma': 5, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 150}

In [87]:
param_grid = {'n_estimators': [100, 150],
              'max_depth': [5,10,15],
              'gamma': [0,1,5],
              'learning_rate': [0.1]}

model_xgb1 = xgb.XGBClassifier(random_state=42)
grid = GridSearchCV(model_xgb1, param_grid, n_jobs=-1, cv=10)
grid.fit(X_train, y_train)





GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=42,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, 

In [88]:
print('Best parameters found by grid search are:', grid.best_params_)

Best parameters found by grid search are: {'gamma': 5, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 150}


In [89]:
y_pred = grid.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[14246   327]
 [  692 10711]]


In [90]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  14246
False Positive:  327
False Negative:  692
True Positive:  10711


In [91]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9607714813674161
Precision: 0.9703750679470918
Recall: 0.9393142155573094
F-measure: 0.9545920413528808


In [92]:
# Predict X_train
X_train_pred = grid.predict(X_train)

# Accuracy against y_train
train_accuracy = accuracy_score(y_train, X_train_pred)

print("Train Error: ", 1 - train_accuracy)
print("Test Error: ", 1 - accuracy)

Train Error:  0.028738065906991017
Test Error:  0.03922851863258392


#### Key Takeaways
<br>
<div style="text-align: justify">
    The optimal value gamma found is 5. By increasing the value (default is 0), the algorithm will be more conservative and it reduces the likelihood of overfitting. By fine-tuning the hyperparameters, the accuracy of the model has increased from 95.8% to 96.1% as well as f-measure from 95.1% to 95.5%.
</div>

### 6.15 GradientBoost w/o GridSearchCV
<br>
<div style="text-align: justify">
Gradient boosting re-defines boosting as a numerical optimisation problem where its objective is to minimise the loss function of the model by adding weak learners using gradient descent. It is a stage-wise additive model that generates learners during the learning process whereby trees are added one at a time and existing trees are not changed (Mujtaba, 2020).
</div>

In [93]:
from sklearn.ensemble import GradientBoostingClassifier

model_gb = GradientBoostingClassifier(random_state=42)
model_gb.fit(X_train, y_train)

GradientBoostingClassifier(random_state=42)

In [94]:
y_pred = model_gb.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[13845   728]
 [  993 10410]]


In [95]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  13845
False Positive:  728
False Negative:  993
True Positive:  10410


In [96]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.93374653526332
Precision: 0.9346381756150117
Recall: 0.912917653249145
F-measure: 0.923650237345282


### 6.16 GradientBoost with GridSearchCV

Tuned Parameters: {'max_depth': 6, 'n_estimators': 500}

In [97]:
param_grid = {'n_estimators': [450,500], 
              'max_depth':[6,8,10]}

model_gb1 = GradientBoostingClassifier(random_state=42)
grid = GridSearchCV(model_gb1, param_grid, n_jobs=-1, cv=10)                  
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=GradientBoostingClassifier(random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [6, 8, 10], 'n_estimators': [450, 500]})

In [98]:
print('Best parameters found by grid search are:', grid.best_params_)

Best parameters found by grid search are: {'max_depth': 6, 'n_estimators': 500}


In [99]:
y_pred = grid.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[14164   409]
 [  672 10731]]


In [100]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  14164
False Positive:  409
False Negative:  672
True Positive:  10731


In [101]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9583846627656298
Precision: 0.9632854578096948
Recall: 0.9410681399631676
F-measure: 0.9520471986869539


#### Key Takeaways
<br>
<div style="text-align: justify">
    The parameters of GradientBoost with GridSearchCV are predefined as 'n_estimators': [450,500] and 'max_depth': [6,8,10]. After fine-tuning the hyperparameters, it is noted that a n_estimators of 500 and max_depth of 6 would provide the optimal results without decreasing the performance of the model. <br><br>
    Moreover, upon trying out other values of n_estimators (more than 500) and max_depth (more than 10), these parameters did not perform well as compared to the optimal parameters mentioned previously.
</div>

### 6.17 CatBoost w/o GridSearchCV
<br>
<div style="text-align: justify">
CatBoost is an algorithm that uses gradient boosting on decision trees. Instead of converting each categorical value into binary values, CatBoost applies an efficient encoding method that leads to an improvement in quality of the models by reducing overfitting (Eitle and Buxmann, 2019). It is ideally suited for applications with large amount of categorical data. In comparison to other machine learning methods, CatBoost does not require extensive data training, is robust as it does not need extensive hyper-parameter tuning and reduce the chances of overfitting (Ray, 2017).
</div>

In [102]:
from catboost import CatBoostClassifier

model_cb = CatBoostClassifier(random_state=42)
model_cb.fit(X_train, y_train)

Learning rate set to 0.074823
0:	learn: 0.5634325	total: 112ms	remaining: 1m 52s
1:	learn: 0.4966481	total: 143ms	remaining: 1m 11s
2:	learn: 0.4294736	total: 160ms	remaining: 53.1s
3:	learn: 0.3893393	total: 178ms	remaining: 44.4s
4:	learn: 0.3606726	total: 198ms	remaining: 39.3s
5:	learn: 0.3356531	total: 214ms	remaining: 35.5s
6:	learn: 0.3142600	total: 254ms	remaining: 36s
7:	learn: 0.2970195	total: 273ms	remaining: 33.9s
8:	learn: 0.2825469	total: 292ms	remaining: 32.1s
9:	learn: 0.2720296	total: 308ms	remaining: 30.5s
10:	learn: 0.2625067	total: 324ms	remaining: 29.1s
11:	learn: 0.2501596	total: 341ms	remaining: 28.1s
12:	learn: 0.2385033	total: 360ms	remaining: 27.4s
13:	learn: 0.2315154	total: 379ms	remaining: 26.7s
14:	learn: 0.2246179	total: 396ms	remaining: 26s
15:	learn: 0.2197732	total: 415ms	remaining: 25.5s
16:	learn: 0.2150821	total: 434ms	remaining: 25.1s
17:	learn: 0.2088822	total: 452ms	remaining: 24.7s
18:	learn: 0.2061740	total: 470ms	remaining: 24.3s
19:	learn: 0.

160:	learn: 0.1147543	total: 5.42s	remaining: 28.2s
161:	learn: 0.1144618	total: 5.47s	remaining: 28.3s
162:	learn: 0.1143068	total: 5.53s	remaining: 28.4s
163:	learn: 0.1142045	total: 5.58s	remaining: 28.4s
164:	learn: 0.1139196	total: 5.63s	remaining: 28.5s
165:	learn: 0.1138155	total: 5.67s	remaining: 28.5s
166:	learn: 0.1136544	total: 5.71s	remaining: 28.5s
167:	learn: 0.1134474	total: 5.75s	remaining: 28.5s
168:	learn: 0.1132482	total: 5.8s	remaining: 28.5s
169:	learn: 0.1131022	total: 5.85s	remaining: 28.6s
170:	learn: 0.1129164	total: 5.9s	remaining: 28.6s
171:	learn: 0.1126949	total: 5.96s	remaining: 28.7s
172:	learn: 0.1124791	total: 6.01s	remaining: 28.7s
173:	learn: 0.1122998	total: 6.05s	remaining: 28.7s
174:	learn: 0.1121457	total: 6.09s	remaining: 28.7s
175:	learn: 0.1120658	total: 6.13s	remaining: 28.7s
176:	learn: 0.1119511	total: 6.18s	remaining: 28.7s
177:	learn: 0.1118430	total: 6.23s	remaining: 28.8s
178:	learn: 0.1117604	total: 6.27s	remaining: 28.8s
179:	learn: 0.

321:	learn: 0.0979274	total: 12.3s	remaining: 25.9s
322:	learn: 0.0978841	total: 12.3s	remaining: 25.9s
323:	learn: 0.0978220	total: 12.4s	remaining: 25.8s
324:	learn: 0.0977527	total: 12.4s	remaining: 25.8s
325:	learn: 0.0976800	total: 12.5s	remaining: 25.8s
326:	learn: 0.0976145	total: 12.5s	remaining: 25.7s
327:	learn: 0.0975794	total: 12.5s	remaining: 25.7s
328:	learn: 0.0975219	total: 12.6s	remaining: 25.7s
329:	learn: 0.0974513	total: 12.6s	remaining: 25.6s
330:	learn: 0.0973899	total: 12.7s	remaining: 25.6s
331:	learn: 0.0973446	total: 12.7s	remaining: 25.6s
332:	learn: 0.0973012	total: 12.8s	remaining: 25.6s
333:	learn: 0.0972542	total: 12.8s	remaining: 25.5s
334:	learn: 0.0972241	total: 12.8s	remaining: 25.5s
335:	learn: 0.0971789	total: 12.9s	remaining: 25.5s
336:	learn: 0.0971399	total: 13s	remaining: 25.5s
337:	learn: 0.0970803	total: 13s	remaining: 25.5s
338:	learn: 0.0970032	total: 13s	remaining: 25.4s
339:	learn: 0.0969381	total: 13.1s	remaining: 25.4s
340:	learn: 0.0968

481:	learn: 0.0899655	total: 18.9s	remaining: 20.3s
482:	learn: 0.0899410	total: 19s	remaining: 20.3s
483:	learn: 0.0898918	total: 19s	remaining: 20.3s
484:	learn: 0.0898664	total: 19.1s	remaining: 20.2s
485:	learn: 0.0898450	total: 19.1s	remaining: 20.2s
486:	learn: 0.0897905	total: 19.1s	remaining: 20.2s
487:	learn: 0.0897528	total: 19.2s	remaining: 20.1s
488:	learn: 0.0896966	total: 19.2s	remaining: 20.1s
489:	learn: 0.0896632	total: 19.3s	remaining: 20.1s
490:	learn: 0.0896522	total: 19.3s	remaining: 20s
491:	learn: 0.0896110	total: 19.4s	remaining: 20s
492:	learn: 0.0895768	total: 19.4s	remaining: 20s
493:	learn: 0.0895424	total: 19.5s	remaining: 19.9s
494:	learn: 0.0894958	total: 19.5s	remaining: 19.9s
495:	learn: 0.0894662	total: 19.5s	remaining: 19.8s
496:	learn: 0.0894340	total: 19.6s	remaining: 19.8s
497:	learn: 0.0894111	total: 19.6s	remaining: 19.8s
498:	learn: 0.0893722	total: 19.7s	remaining: 19.7s
499:	learn: 0.0893348	total: 19.7s	remaining: 19.7s
500:	learn: 0.0892725	

641:	learn: 0.0846282	total: 25.4s	remaining: 14.2s
642:	learn: 0.0845991	total: 25.5s	remaining: 14.1s
643:	learn: 0.0845836	total: 25.5s	remaining: 14.1s
644:	learn: 0.0845427	total: 25.5s	remaining: 14.1s
645:	learn: 0.0844557	total: 25.6s	remaining: 14s
646:	learn: 0.0844181	total: 25.6s	remaining: 14s
647:	learn: 0.0843958	total: 25.6s	remaining: 13.9s
648:	learn: 0.0843443	total: 25.7s	remaining: 13.9s
649:	learn: 0.0843226	total: 25.7s	remaining: 13.8s
650:	learn: 0.0842878	total: 25.7s	remaining: 13.8s
651:	learn: 0.0842638	total: 25.8s	remaining: 13.8s
652:	learn: 0.0842389	total: 25.8s	remaining: 13.7s
653:	learn: 0.0841769	total: 25.9s	remaining: 13.7s
654:	learn: 0.0841393	total: 25.9s	remaining: 13.6s
655:	learn: 0.0840943	total: 25.9s	remaining: 13.6s
656:	learn: 0.0840282	total: 26s	remaining: 13.6s
657:	learn: 0.0839895	total: 26s	remaining: 13.5s
658:	learn: 0.0839546	total: 26s	remaining: 13.5s
659:	learn: 0.0839284	total: 26.1s	remaining: 13.4s
660:	learn: 0.0838892	

804:	learn: 0.0798621	total: 32.2s	remaining: 7.79s
805:	learn: 0.0798229	total: 32.2s	remaining: 7.75s
806:	learn: 0.0798042	total: 32.3s	remaining: 7.71s
807:	learn: 0.0797723	total: 32.3s	remaining: 7.67s
808:	learn: 0.0797299	total: 32.4s	remaining: 7.64s
809:	learn: 0.0797025	total: 32.4s	remaining: 7.6s
810:	learn: 0.0796753	total: 32.4s	remaining: 7.56s
811:	learn: 0.0796417	total: 32.5s	remaining: 7.52s
812:	learn: 0.0796157	total: 32.5s	remaining: 7.48s
813:	learn: 0.0795788	total: 32.6s	remaining: 7.44s
814:	learn: 0.0795624	total: 32.6s	remaining: 7.4s
815:	learn: 0.0795052	total: 32.6s	remaining: 7.36s
816:	learn: 0.0794800	total: 32.7s	remaining: 7.32s
817:	learn: 0.0794507	total: 32.7s	remaining: 7.28s
818:	learn: 0.0794355	total: 32.8s	remaining: 7.24s
819:	learn: 0.0794167	total: 32.8s	remaining: 7.2s
820:	learn: 0.0793974	total: 32.8s	remaining: 7.16s
821:	learn: 0.0793727	total: 32.9s	remaining: 7.12s
822:	learn: 0.0793519	total: 32.9s	remaining: 7.08s
823:	learn: 0.0

963:	learn: 0.0761284	total: 38.7s	remaining: 1.45s
964:	learn: 0.0761108	total: 38.8s	remaining: 1.41s
965:	learn: 0.0760988	total: 38.8s	remaining: 1.36s
966:	learn: 0.0760906	total: 38.8s	remaining: 1.32s
967:	learn: 0.0760669	total: 38.9s	remaining: 1.28s
968:	learn: 0.0760517	total: 38.9s	remaining: 1.24s
969:	learn: 0.0760336	total: 38.9s	remaining: 1.2s
970:	learn: 0.0760178	total: 39s	remaining: 1.16s
971:	learn: 0.0760049	total: 39s	remaining: 1.12s
972:	learn: 0.0759904	total: 39s	remaining: 1.08s
973:	learn: 0.0759603	total: 39.1s	remaining: 1.04s
974:	learn: 0.0759329	total: 39.1s	remaining: 1s
975:	learn: 0.0758923	total: 39.1s	remaining: 963ms
976:	learn: 0.0758660	total: 39.2s	remaining: 922ms
977:	learn: 0.0758484	total: 39.2s	remaining: 882ms
978:	learn: 0.0758406	total: 39.2s	remaining: 842ms
979:	learn: 0.0758178	total: 39.3s	remaining: 802ms
980:	learn: 0.0757900	total: 39.3s	remaining: 762ms
981:	learn: 0.0757738	total: 39.4s	remaining: 721ms
982:	learn: 0.0757519	

<catboost.core.CatBoostClassifier at 0x27d10707f40>

In [103]:
y_pred = model_cb.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[14230   343]
 [  692 10711]]


In [104]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  14230
False Positive:  343
False Negative:  692
True Positive:  10711


In [105]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9601555281798583
Precision: 0.9689705084132441
Recall: 0.9393142155573094
F-measure: 0.9539119205592911


### 6.18 CatBoost with GridSearchCV

Tuned Parameters: {'depth': 8, 'learning_rate': 0.1, 'n_estimators': 350}

In [106]:
param_grid = {'n_estimators': [250, 350],
              'depth': [5, 8, 15],
              'learning_rate': [0.05, 0.1, 0.5]}

model_cb1 = CatBoostClassifier(random_state=42)
grid = GridSearchCV(model_cb1, param_grid, n_jobs=-1, cv=10)
grid.fit(X_train, y_train)

0:	learn: 0.5177863	total: 37.5ms	remaining: 13.1s
1:	learn: 0.4338036	total: 74.3ms	remaining: 12.9s
2:	learn: 0.3538858	total: 105ms	remaining: 12.2s
3:	learn: 0.3027540	total: 136ms	remaining: 11.8s
4:	learn: 0.2770231	total: 171ms	remaining: 11.8s
5:	learn: 0.2588304	total: 206ms	remaining: 11.8s
6:	learn: 0.2369871	total: 239ms	remaining: 11.7s
7:	learn: 0.2248739	total: 284ms	remaining: 12.1s
8:	learn: 0.2159939	total: 334ms	remaining: 12.7s
9:	learn: 0.2057484	total: 384ms	remaining: 13.1s
10:	learn: 0.1961886	total: 446ms	remaining: 13.7s
11:	learn: 0.1899643	total: 500ms	remaining: 14.1s
12:	learn: 0.1821279	total: 552ms	remaining: 14.3s
13:	learn: 0.1752288	total: 622ms	remaining: 14.9s
14:	learn: 0.1699629	total: 670ms	remaining: 15s
15:	learn: 0.1674179	total: 724ms	remaining: 15.1s
16:	learn: 0.1647210	total: 769ms	remaining: 15.1s
17:	learn: 0.1618256	total: 826ms	remaining: 15.2s
18:	learn: 0.1583037	total: 885ms	remaining: 15.4s
19:	learn: 0.1560197	total: 940ms	remaini

162:	learn: 0.0886382	total: 9s	remaining: 10.3s
163:	learn: 0.0885766	total: 9.05s	remaining: 10.3s
164:	learn: 0.0884158	total: 9.11s	remaining: 10.2s
165:	learn: 0.0883070	total: 9.16s	remaining: 10.2s
166:	learn: 0.0881546	total: 9.21s	remaining: 10.1s
167:	learn: 0.0879760	total: 9.27s	remaining: 10s
168:	learn: 0.0878666	total: 9.32s	remaining: 9.98s
169:	learn: 0.0877300	total: 9.36s	remaining: 9.91s
170:	learn: 0.0876713	total: 9.41s	remaining: 9.85s
171:	learn: 0.0876156	total: 9.46s	remaining: 9.79s
172:	learn: 0.0875548	total: 9.51s	remaining: 9.73s
173:	learn: 0.0873743	total: 9.57s	remaining: 9.68s
174:	learn: 0.0871200	total: 9.62s	remaining: 9.62s
175:	learn: 0.0869670	total: 9.68s	remaining: 9.57s
176:	learn: 0.0868692	total: 9.73s	remaining: 9.51s
177:	learn: 0.0867244	total: 9.78s	remaining: 9.45s
178:	learn: 0.0866320	total: 9.82s	remaining: 9.38s
179:	learn: 0.0865086	total: 9.88s	remaining: 9.33s
180:	learn: 0.0863788	total: 9.93s	remaining: 9.27s
181:	learn: 0.086

322:	learn: 0.0716162	total: 17.3s	remaining: 1.45s
323:	learn: 0.0715614	total: 17.4s	remaining: 1.39s
324:	learn: 0.0714787	total: 17.4s	remaining: 1.34s
325:	learn: 0.0713759	total: 17.5s	remaining: 1.28s
326:	learn: 0.0712618	total: 17.5s	remaining: 1.23s
327:	learn: 0.0712119	total: 17.6s	remaining: 1.18s
328:	learn: 0.0711135	total: 17.6s	remaining: 1.12s
329:	learn: 0.0710181	total: 17.7s	remaining: 1.07s
330:	learn: 0.0709609	total: 17.7s	remaining: 1.01s
331:	learn: 0.0709153	total: 17.8s	remaining: 962ms
332:	learn: 0.0708547	total: 17.8s	remaining: 909ms
333:	learn: 0.0707435	total: 17.8s	remaining: 855ms
334:	learn: 0.0706453	total: 17.9s	remaining: 801ms
335:	learn: 0.0705601	total: 17.9s	remaining: 748ms
336:	learn: 0.0704677	total: 18s	remaining: 694ms
337:	learn: 0.0704324	total: 18s	remaining: 640ms
338:	learn: 0.0703670	total: 18.1s	remaining: 587ms
339:	learn: 0.0703096	total: 18.1s	remaining: 533ms
340:	learn: 0.0702229	total: 18.2s	remaining: 480ms
341:	learn: 0.07

GridSearchCV(cv=10,
             estimator=<catboost.core.CatBoostClassifier object at 0x0000027D106F1D00>,
             n_jobs=-1,
             param_grid={'depth': [5, 8, 15], 'learning_rate': [0.05, 0.1, 0.5],
                         'n_estimators': [250, 350]})

In [107]:
print('Best parameters found by grid search are:', grid.best_params_)

Best parameters found by grid search are: {'depth': 8, 'learning_rate': 0.1, 'n_estimators': 350}


In [108]:
y_pred = grid.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[14225   348]
 [  677 10726]]


In [109]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  14225
False Positive:  348
False Negative:  677
True Positive:  10726


In [110]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9605404989220819
Precision: 0.9685750406357233
Recall: 0.940629658861703
F-measure: 0.954397828891756


In [111]:
# Predict X_train
X_train_pred = grid.predict(X_train)

# Accuracy against y_train
train_accuracy = accuracy_score(y_train, X_train_pred)

print("Train Error: ", 1 - train_accuracy)
print("Test Error: ", 1 - accuracy)

Train Error:  0.02694795195565136
Test Error:  0.03945950107791807


#### Key Takeaways
<br>
<div style="text-align: justify">
    With a ‘n_estimators’ of 350 and ‘depth’ of 8, it allows the model to learn the data better as more data is being captured. Moreover, for depth of tree, values in the range from 6 to 10 are recommended. In order to get the best possible quality, learning rate has been tuned to 0.1 which resulted in a lesser number of iterations required for training. This further helps to speed up the training and rate of convergence.
</div>

### 6.19 LightGBM w/o GridSearchCV
<br>
<div style="text-align: justify">
LightGBM is a fast and high performance gradient boosting framework based on decision tree algorithm. It adopts a histogram training algorithm and is capable of performing on large datasets with a reduction in training time (Khandelwal, 2017). This technique will choose the leaf with maximum delta loss to grow and when growing the same leaf (Mandot, 2017). 
</div>

In [112]:
import lightgbm as lgb
from lightgbm import LGBMModel, LGBMClassifier

model_lgb = LGBMClassifier(random_state=42) 
model_lgb.fit(X_train, y_train)

LGBMClassifier(random_state=42)

In [113]:
y_pred = model_lgb.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[14188   385]
 [  743 10660]]


In [114]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  14188
False Positive:  385
False Negative:  743
True Positive:  10660


In [115]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.956575300277179
Precision: 0.965142598460842
Recall: 0.9348417083223713
F-measure: 0.9497505345687812


### 6.20 LightGBM with GridSearchCV

Tuned Parameters: {'feature_fraction': 0.8, 'max_depth': 20, 'num_iterations': 300, 'num_leaves': 55}

In [116]:
param_grid = {'max_depth': [20, 30, 40], 
              'num_leaves': [35, 45, 55], 
              'num_iterations': [100, 200, 300],  
              'feature_fraction' : [0.6, 0.7, 0.8]}

model_lgb1 = LGBMClassifier(random_state=42)
grid = GridSearchCV(model_lgb1, param_grid, n_jobs=-1, cv=10)                  
grid.fit(X_train, y_train)





GridSearchCV(cv=10, estimator=LGBMClassifier(random_state=42), n_jobs=-1,
             param_grid={'feature_fraction': [0.6, 0.7, 0.8],
                         'max_depth': [20, 30, 40],
                         'num_iterations': [100, 200, 300],
                         'num_leaves': [35, 45, 55]})

In [117]:
print('Best parameters found by grid search are:', grid.best_params_)

Best parameters found by grid search are: {'feature_fraction': 0.8, 'max_depth': 20, 'num_iterations': 300, 'num_leaves': 55}


In [118]:
y_pred = grid.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[14216   357]
 [  681 10722]]


In [119]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  14216
False Positive:  357
False Negative:  681
True Positive:  10722


In [120]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9600400369571912
Precision: 0.967776875169239
Recall: 0.9402788739805314
F-measure: 0.9538297304510275


In [121]:
# Predict X_train
X_train_pred = grid.predict(X_train)

# Accuracy against y_train
train_accuracy = accuracy_score(y_train, X_train_pred)

print("Train Error: ", 1 - train_accuracy)
print("Test Error: ", 1 - accuracy)

Train Error:  0.024445642131198042
Test Error:  0.0399599630428088


#### Key Takeaways
<br>
<div style="text-align: justify">
    While it is understood that it is better to fine-tune the max_depth to a larger value, it is important to take note that this might lead to a possibility of overfitting. The reason why "num_leaves" was chosen as one of the parameters to fine-tune is because it aids in controlling the complexity of the model.
</div>

### 6.21 SVM with Linear Kernel
<br>
<div style="text-align: justify">
The objective of SVM is to find a hyperplane in an N-dimensional space that distinctly classifies the data points. SVM works better with two dimensional space which are linearly separable and for non-linear data, Kernel Trick such as Radial Basis Function (RBF) is used. 
</div>

In [122]:
from sklearn import svm

model_svmlinear = svm.SVC(kernel='linear', random_state=42)
model_svmlinear.fit(X_train, y_train)

SVC(kernel='linear', random_state=42)

In [123]:
y_pred = model_svmlinear.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[13037  1536]
 [ 2154  9249]]


In [124]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  13037
False Positive:  1536
False Negative:  2154
True Positive:  9249


In [125]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.857945796119495
Precision: 0.8575799721835883
Recall: 0.8111023414890818
F-measure: 0.8336938885884262


#### Key Takeaways
<br>
<div style="text-align: justify">
    After comparing SVM with linear kernel and SVM with radial basis fucntion (RBF) kernel, it is noted that SVM with RBF kernal performed much better with an accuracy of 88.5% and f-measure of 86.5%, as shown in the next section. <br><br>
    This could be due to the nature of the dataset where it is not linearly separable and since the SVM with linear kernel is not able to find a hyperplane which maximises the margin, it is not able to give the optimal performance (Raschka, n.d.). On the other hand, the RBF kernel which is non-linear, creates non-linear combinations of the features and hence, shows better performance for our dataset. Thus, we will only be tuning the SVM model based on the RBF kernel.
</div>

### 6.22 SVM with RBF w/o GridSearchCV

In [126]:
from sklearn import svm

model_svmrbf = svm.SVC(kernel='rbf', random_state=42)
model_svmrbf.fit(X_train, y_train)

SVC(random_state=42)

In [127]:
y_pred = model_svmrbf.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[13451  1122]
 [ 1866  9537]]


In [128]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  13451
False Positive:  1122
False Negative:  1866
True Positive:  9537


In [129]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.884970742223591
Precision: 0.8947368421052632
Recall: 0.8363588529334386
F-measure: 0.8645635028555888


### 6.23 SVM with RBF with GridSearchCV

Tuned Parameters: {'C': 1, 'gamma': 0.1}

In [130]:
param_grid = {'gamma':[0.1,1,'scale'],
              'C':[0.1,1]}

model_svmrbf1 = svm.SVC(kernel='rbf', random_state=42)
grid = GridSearchCV(model_svmrbf1, param_grid, n_jobs=-1, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=SVC(random_state=42), n_jobs=-1,
             param_grid={'C': [0.1, 1], 'gamma': [0.1, 1, 'scale']})

In [131]:
print('Best parameters found by grid search are:', grid.best_params_)

Best parameters found by grid search are: {'C': 1, 'gamma': 0.1}


In [132]:
y_pred = grid.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[14047   526]
 [  917 10486]]


In [133]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  14047
False Positive:  526
False Negative:  917
True Positive:  10486


In [134]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9444487218971358
Precision: 0.9522339266254994
Recall: 0.9195825659914058
F-measure: 0.9356234664287307


#### Key Takeaways
<br>
<div style="text-align: justify">
    Common parameters for tuning include C and Gamma with common values such as 0.1, 1, 10, 100 for both. Large values for C and Gamma will lead to overfitting, hence the value 100 is eliminated. Due to the limited computing power, value 10 for both parameters will not be included. The accuracy achieved through the GridsearchCV is lower by 0.001% compared to the base model without parameter tuning and this could be due to the number of cross validation used which led to different optimal values found.
</div>

### 6.24 Logistic Regression w/o GridSearchCV
<br>
<div style="text-align: justify">
Logistic regression is an increasingly popular statistical technique used to model and predict categorical outcome variables. Not only is it easy to apply, the conditional probabilities are determined through the training process, thus making training very efficient and valuable in the applications of customer relationship management. It is particularly well suited and frequently employed to assess and analyze customer behavior (Cui, 2018).
</div>

In [135]:
from sklearn.linear_model import LogisticRegression

model_lg = LogisticRegression(random_state=42)
model_lg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=42)

In [136]:
y_pred = model_lg.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[12840  1733]
 [ 2037  9366]]


In [137]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  12840
False Positive:  1733
False Negative:  2037
True Positive:  9366


In [138]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.8548660301817061
Precision: 0.8438598071898369
Recall: 0.8213627992633518
F-measure: 0.8324593369478269


### 6.25 Logistic Regression with GridSearchCV 

Tuned Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}

In [139]:
param_grid = {'penalty' : ['l1', 'l2', 'elasticnet'],
              'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag','saga'],
              'C': [0.001, 0.1, 1, 10]}

model_lg1 = LogisticRegression(random_state=42)
grid = GridSearchCV(model_lg1, param_grid, n_jobs=-1, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=LogisticRegression(random_state=42), n_jobs=-1,
             param_grid={'C': [0.001, 0.1, 1, 10],
                         'penalty': ['l1', 'l2', 'elasticnet'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']})

In [140]:
print('Best parameters found by grid search are:', grid.best_params_)

Best parameters found by grid search are: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}


In [141]:
y_pred = grid.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[12838  1735]
 [ 2036  9367]]


In [142]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  12838
False Positive:  1735
False Negative:  2036
True Positive:  9367


In [143]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.8548275331074838
Precision: 0.8437218519185732
Recall: 0.8214504954836447
F-measure: 0.8324372361697402


#### Key Takeaways
<br>
<div style="text-align: justify">
    With 10-fold cross-validation for GridSearchCV, the dataset will be shuffled randomly and split into 10 groups, which resulted in smaller datasets. Hence, ‘liblinear’ is being used as it is a good choice for smaller datasets. Moreover, ‘liblinear’ handles ‘l1’ penalty as well, which leads to a sparser solution. However, the accuracy of logistic regression with GridSearchCV is found to be relatively close to the one without GridSearchCV. This could be due to the number of cross validation used which led to different optimal values found.
</div>

### 6.26 Logistic Regression with StackingClassifer w/o GridSearchCV
<br>
<div style="text-align: justify">
Stacking classifier is an ensemble-learning meta-classifier for stacking. The classifier combines multiple classification models together using a meta-classifier in making the final prediction (Ceballos, 2019). This classifier serves to improve the performance model by leveraging on the capabilities of the different models as compared to using a single model. 
</div>

In [144]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('dt', DecisionTreeClassifier(max_depth=15, random_state=42)),
              ('gb', GradientBoostingClassifier(n_estimators = 500, max_depth = 6, random_state=42))]

model_sc = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
model_sc.fit(X_train, y_train)

StackingClassifier(estimators=[('rf',
                                RandomForestClassifier(n_estimators=10,
                                                       random_state=42)),
                               ('dt',
                                DecisionTreeClassifier(max_depth=15,
                                                       random_state=42)),
                               ('gb',
                                GradientBoostingClassifier(max_depth=6,
                                                           n_estimators=500,
                                                           random_state=42))],
                   final_estimator=LogisticRegression())

In [145]:
y_pred = model_sc.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[14143   430]
 [  659 10744]]


In [146]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  14143
False Positive:  430
False Negative:  659
True Positive:  10744


In [147]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9580766861718509
Precision: 0.9615178091999284
Recall: 0.9422081908269754
F-measure: 0.9517650706471188


### 6.27 Logistic Regression with StackingClassifer with GridSearchCV

Tuned Parameters: {'rf__n_estimators': 200}

In [148]:
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('dt', DecisionTreeClassifier(max_depth=15, random_state=42)),
              ('gb', GradientBoostingClassifier(n_estimators = 500, max_depth = 6, random_state=42))]

model_sc1 = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

param_grid = {'rf__n_estimators': [20,200]}
grid = GridSearchCV(model_sc1, param_grid, n_jobs=-1, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=StackingClassifier(estimators=[('rf',
                                                       RandomForestClassifier(n_estimators=10,
                                                                              random_state=42)),
                                                      ('dt',
                                                       DecisionTreeClassifier(max_depth=15,
                                                                              random_state=42)),
                                                      ('gb',
                                                       GradientBoostingClassifier(max_depth=6,
                                                                                  n_estimators=500,
                                                                                  random_state=42))],
                                          final_estimator=LogisticRegression()),
             n_jobs=-1, param_grid={'r

In [149]:
print('Best parameters found by grid search are:', grid.best_params_)

Best parameters found by grid search are: {'rf__n_estimators': 200}


In [150]:
y_pred = grid.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[14152   421]
 [  653 10750]]


In [151]:
tn = conf_matrix[0][0]
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
tp = conf_matrix[1][1]

print("True Negative: ", tn)
print("False Positive: ", fp)
print("False Negative: ", fn)
print("True Positive: ", tp)

True Negative:  14152
False Positive:  421
False Negative:  653
True Positive:  10750


In [152]:
accuracy = accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
fmeasure = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", fmeasure)

Accuracy: 0.9586541422851863
Precision: 0.9623131322173485
Recall: 0.9427343681487328
F-measure: 0.952423141667405


In [153]:
# Predict X_train
X_train_pred = grid.predict(X_train)

# Accuracy against y_train
train_accuracy = accuracy_score(y_train, X_train_pred)

print("Train Error: ", 1 - train_accuracy)
print("Test Error: ", 1 - accuracy)

Train Error:  0.0036860948567908336
Test Error:  0.041345857714813694


#### Key Takeaways
<br>
<div style="text-align: justify">
    Parameters for GridSearchCV had been predefined as 'n_estimators': [20,200] and 200 is found to be the best parameter. Moreover, commonly used data mining techniques such as random forest, decision tree and gradient boosting classfier are being included as well where hyperparameters of each classifier will be fine-tuned to improve the performance of the model.
</div>

## 6. Model Evaluation (Occam's Law)
<br>
<div style="text-align: justify">
    According to Occam's Law (Law of Briefness), given two models of similar generation errors, one should prefer the simpler model. Therefore, we will be evaluating our top five models based on the training and testing error.
</div>

<table>
  <tr>
    <th>Models</th>
    <th>Training Error</th>
    <th>Testing Error</th>
  </tr>
  <tr>
    <td>Random Forest</td>
    <td>0.0089</td>
    <td>0.0419</td>
  </tr>
  <tr>
    <td>Logistic Regression with Stacking Classifier</td>
    <td>0.0037</td>
    <td>0.0413</td>
  </tr>
  <tr>
    <td>CatBoost</td>
    <td>0.0269</td>
    <td>0.0395</td>
  </tr>
  <tr>
    <td>XGBoost</td>
    <td>0.0287</td>
    <td>0.0392</td>
  </tr>
  <tr>
    <td>LightGBM</td>
    <td>0.0244</td>
    <td>0.0400</td>
  </tr>
</table>


#### Key Takeaways
<br>
<div style="text-align: justify">
    Based on the table shown above, it is concluded that the XGBoost model performs best on our dataset based on the minimal difference of 0.011 in error which indicates that it is an appropriate model. However, CatBoost with a difference of 0.0126 also performed equally well as XGBoost since the model works well with categorical variables. Thus, both models will be taken into consideration for our future works.
</div>

## 7. Future Works
<br>
<div style="text-align: justify">
    <b> 1. Feature Importance </b> - it will be used to identify the top 10 variables to better predict the passenger satisfaction. This will be performed on XGBoost and CatBoost models with parameters tuning using GridSearchCV. From there, we will evaluate the model again to see if there is any improvement. <br><br>
    <b> 2. Address Overfitting </b> - as both models Random Forest and Logistic Regression with Stacking Classifier showed signs of overfitting, further parameters tuning will be performed such as decreasing the maximum depth of the tree, increasing the number of estimators, decreasing the number of maximum features and then conducting further evaluation. <br><br>
    <b> 3. Neural Network </b> - can look into using neural network as it has the ability to model customer satisfaction with its capability to handle any non-linear functions and multicollinearity within the input variable. <br><br>
    <b> 4. RandomizedSearchCV </b> - can look into using RandomizedSearchCV to fine-tune the hyperparameters instead of GridSearchCV. This is so because in GridSearchCV approach, every combination of hyperparameter values is tried which can be very inefficient and computationally intensive, considering that our dataset is relatively large. While it is possible that RandomizedSearchCV will not find as accurate of a result as GridSearchCV, it is still able to pick the best result more often than not and in a fraction of the time it takes GridSearchCV would have taken.
    
</div>

## 8. Conclusion
<br>
<div style="text-align: justify">
    In this study, exploratory data analysis (EDA), preprocessing of data such as feature encoding have been performed before leveraging on various classification models to find the best suited model in predicting airline's passengers satisfaction. Moreover, parameters of each models are being fine-tuned with the help of GridSearchCV to improve the performance of models. <br><br>
    By utiliing training and testing error to evaluation the top 5 models, it is concluded that XGBoost model performs best on our dataset followed by CatBoost. With the use of predictive analytics, it can help airline compabies to elevate customer loyalty by offering enhanced and personalized customer experience. This aims to ensure customer retention which results in driving business growth and profitability.
    
</div>