# Ensemble Learning

## Simple Ensemble Learning
### Logistic Regression, KNN, Random Forest 활용한 Ensemble 모델

In [1]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import pandas as pd
import numpy as np

In [2]:
# Load the sample data set. The file has no header row (columns are numbered
# 0..15) and marks missing values with '?', which are parsed as NaN.
credData = pd.read_csv(
    'data/ensemble_data_sample.csv',
    sep=',',
    header=None,
    na_values='?',
)
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [3]:
# Recode the class label stored in column 15: '+' becomes 1 and '-' becomes 0.
credData.loc[credData[15] == '+', 15] = 1
credData.loc[credData[15] == '-', 15] = 0

# Element-wise assignment into a column that was read as strings may leave it
# with object dtype (this depends on the pandas version). Cast explicitly so the
# label is a genuine integer column; the cast also fails loudly if any value
# other than '+' / '-' was present. Re-running the cell is harmless.
credData[15] = credData[15].astype(int)
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [4]:
# Count the missing (NaN) entries in every column of the raw frame
credData.isna().sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

In [5]:
# Report the (rows, columns) dimensions of the data as loaded
print('Shape of raw data set', tuple(credData.shape))

Shape of raw data set (690, 16)


In [6]:
# Show the dtype pandas inferred for each column of the raw frame
print(
    'Data types of data set',
    credData.dtypes,
    sep=' ',
)

Data types of data set 0      object
1     float64
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13    float64
14      int64
15      int64
dtype: object


In [7]:
# Keep only complete rows: any row containing at least one NaN is discarded.
# The result is a new frame; credData itself is left unchanged.
newcred = credData.dropna(axis='index', how='any')
newcred.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [8]:
# Dimensions of the frame after the rows with missing values were dropped
newcred.shape

(653, 16)

In [9]:
# Sanity check: every per-column NaN count should now be zero
newcred.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

In [10]:
# Print the shape of the cleaned frame (rows with missing values removed).
# The label says "cleaned" because newcred is the output of dropna(), not the
# raw data that was read from disk.
print('Shape of cleaned data set', newcred.shape)

Shape of raw data set (653, 16)


In [11]:
# Show the dtype of each column of the frame without missing values
print(
    'Data types of data set',
    newcred.dtypes,
    sep=' ',
)

Data types of data set 0      object
1     float64
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13    float64
14      int64
15      int64
dtype: object


In [12]:
# Select the categorical columns and expand each one into indicator (dummy)
# columns named "<column>_<category>".
credCat = pd.get_dummies(
    newcred.loc[:, [0, 3, 4, 5, 6, 8, 9, 11, 12]]
)
credCat.head()

Unnamed: 0,0_a,0_b,3_l,3_u,3_y,4_g,4_gg,4_p,5_aa,5_c,...,6_z,8_f,8_t,9_f,9_t,11_f,11_t,12_g,12_p,12_s
0,0,1,0,1,0,1,0,0,0,0,...,0,0,1,0,1,1,0,1,0,0
1,1,0,0,1,0,1,0,0,0,0,...,0,0,1,0,1,1,0,1,0,0
2,1,0,0,1,0,1,0,0,0,0,...,0,0,1,1,0,1,0,1,0,0
3,0,1,0,1,0,1,0,0,0,0,...,0,0,1,0,1,0,1,1,0,0
4,0,1,0,1,0,1,0,0,0,0,...,0,0,1,1,0,1,0,0,0,1


In [13]:
# Select the numerical columns; they are used as-is, without encoding
credNum = newcred.loc[:, [1, 2, 7, 10, 13, 14]]
credNum.head()

Unnamed: 0,1,2,7,10,13,14
0,30.83,0.0,1.25,1,202.0,0
1,58.67,4.46,3.04,6,43.0,560
2,24.5,0.5,1.5,0,280.0,824
3,27.83,1.54,3.75,5,100.0,3
4,20.17,5.625,1.71,0,120.0,0


In [14]:
# Build the feature matrix X by placing the dummy-encoded categorical columns
# side by side with the numerical columns.
X = pd.concat([credCat, credNum], axis=1)

# credCat carries string column names ('0_a', ...) while credNum keeps the
# integer names from the header-less CSV (1, 2, 7, ...). Recent scikit-learn
# releases raise a TypeError when a DataFrame with mixed str/int column names is
# passed to an estimator or transformer, so make every column name a string.
X.columns = X.columns.astype(str)
X.shape

(653, 46)

In [15]:
# The target vector is the recoded class label held in column 15
y = newcred.loc[:, 15]
y.shape

(653,)

In [16]:
# Transform the features with the min-max scaler and wrap the resulting array
# in a DataFrame (which gets default integer column labels).
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/test split performed further down, so test rows influence the scaling.
minmaxScaler = preprocessing.MinMaxScaler()
X_trans = pd.DataFrame(minmaxScaler.fit(X).transform(X))

In [17]:
# Preview the first five rows of the scaled feature matrix
X_trans.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.271111,0.0,0.04386,0.014925,0.101,0.0
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.713016,0.159286,0.106667,0.089552,0.0215,0.0056
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.170635,0.017857,0.052632,0.0,0.14,0.00824
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.223492,0.055,0.131579,0.074627,0.05,3e-05
4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.101905,0.200893,0.06,0.0,0.06,0.0


In [18]:
# Hold out 30% of the rows as the test set; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_trans,
    y,
    test_size=0.3,
    random_state=123,
)

In [19]:
## Averaging Ensemble Model
# Defining the three base models that every ensemble below is built from

model1 = LogisticRegression(random_state=123)
model2 = KNeighborsClassifier(n_neighbors=5)
# Seed the random forest too: without random_state its bootstrap samples and
# feature sub-sampling differ on every run, so the predictions and all metrics
# derived from them would not be reproducible.
model3 = RandomForestClassifier(n_estimators=500, random_state=123)

In [20]:
# Train the logistic regression and predict class labels for the test rows
# (fit() returns the fitted estimator, so the calls can be chained)
pred1 = model1.fit(X_train, y_train).predict(X_test)



In [21]:
# Train the k-nearest-neighbours model and predict class labels for the test rows
pred2 = model2.fit(X_train, y_train).predict(X_test)

In [22]:
# Train the random forest and predict class labels for the test rows
pred3 = model3.fit(X_train, y_train).predict(X_test)

In [23]:
# Average the three base-model predictions element-wise. pred1..pred3 hold
# predicted class labels (0 or 1), so each averaged value is the share of
# models that voted for class 1.
ensemblepred = sum((pred1, pred2, pred3)) / 3
ensemblepred

array([0.        , 0.        , 1.        , 1.        , 0.        ,
       1.        , 1.        , 1.        , 0.66666667, 1.        ,
       0.        , 0.33333333, 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 1.        , 0.        ,
       0.        , 0.        , 0.33333333, 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       1.        , 0.        , 0.66666667, 0.        , 0.        ,
       1.        , 0.66666667, 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 1.        , 0.        , 1.        , 0.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       0.33333333, 1.        , 0.        , 0.        , 1.        ,
       1.        , 0.        , 1.        , 1.        , 1.        ,
       0.        , 0.        , 0.66666667, 0.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.     

In [24]:
# Round the averaged votes to the nearest class label (0. or 1.)
ensemblepred = np.round(ensemblepred)
ensemblepred

array([0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0.,
       0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 1.,
       1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 0., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0.,
       1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1.,
       0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1.,
       1., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1.,
       0., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0.,
       1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0.,
       0., 0., 0., 1., 0., 0., 1., 0., 1.])

In [25]:
# Show the class ordering each fitted base model uses, one line per model
for fitted_model in (model1, model2, model3):
    print(fitted_model.classes_)

[0 1]
[0 1]
[0 1]


In [26]:
# Confusion matrix of the true test labels against the averaged-ensemble labels
confusionMatrix = confusion_matrix(y_true=y_test, y_pred=ensemblepred)
print(confusionMatrix)

[[96 11]
 [ 8 81]]


In [27]:
# Per-class precision, recall and F1 for the averaged ensemble
print(
    classification_report(y_true=y_test, y_pred=ensemblepred)
)

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       107
           1       0.88      0.91      0.90        89

    accuracy                           0.90       196
   macro avg       0.90      0.90      0.90       196
weighted avg       0.90      0.90      0.90       196



## Weighted Averaging
### Logistic Regression, KNN, Random Forest 활용한 Weighted Averaging Ensemble 모델

In [28]:
# Weighted combination of the base-model predictions: model1's prediction
# counts for 0.6, model2's and model3's for 0.2 each.
ensemblepred = 0.6 * pred1 + 0.2 * pred2 + 0.2 * pred3
ensemblepred

array([0. , 0. , 1. , 1. , 0. , 1. , 1. , 1. , 0.4, 1. , 0. , 0.6, 0. ,
       0. , 0. , 0. , 0. , 1. , 1. , 0. , 0. , 0. , 0.6, 1. , 0. , 0. ,
       0. , 0. , 0. , 1. , 1. , 0. , 0.8, 0. , 0. , 1. , 0.8, 0. , 1. ,
       0. , 0. , 0. , 0. , 0. , 1. , 0. , 1. , 0. , 1. , 0. , 1. , 1. ,
       1. , 1. , 1. , 0.2, 1. , 0. , 0. , 1. , 1. , 0. , 1. , 1. , 1. ,
       0. , 0. , 0.8, 0. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0. ,
       1. , 1. , 0. , 1. , 1. , 1. , 0. , 1. , 0. , 0.6, 0. , 0.8, 0. ,
       0. , 1. , 0. , 0. , 0. , 1. , 0. , 1. , 0. , 0. , 1. , 0. , 1. ,
       0. , 0. , 1. , 0. , 0. , 0. , 1. , 0. , 0.2, 0. , 0. , 0. , 1. ,
       0. , 1. , 0.8, 0.6, 0. , 1. , 0. , 0. , 1. , 0. , 0.8, 1. , 0. ,
       0. , 0.8, 0. , 0. , 1. , 1. , 0. , 1. , 1. , 0.8, 0.8, 0. , 1. ,
       0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. , 1. , 0.8, 0. ,
       1. , 1. , 1. , 0. , 0.8, 0. , 0.2, 0. , 1. , 1. , 1. , 0. , 0. ,
       0. , 1. , 0. , 1. , 0. , 0. , 0. , 1. , 1. , 0.8, 1. , 1.

In [29]:
# Round the weighted scores to the nearest class label (0. or 1.)
ensemblepred = np.round(ensemblepred)
ensemblepred

array([0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0.,
       0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 1.,
       1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 0., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0.,
       1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1.,
       0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1.,
       0., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0.,
       1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0.,
       0., 0., 0., 1., 0., 0., 1., 0., 1.])

In [30]:
# Show the class ordering each fitted base model uses, one line per model
for fitted_model in (model1, model2, model3):
    print(fitted_model.classes_)

[0 1]
[0 1]
[0 1]


In [31]:
# Confusion matrix of the true test labels against the weighted-ensemble labels
confusionMatrix = confusion_matrix(y_true=y_test, y_pred=ensemblepred)
print(confusionMatrix)

[[93 14]
 [ 8 81]]


In [32]:
# Per-class precision, recall and F1 for the weighted ensemble
print(
    classification_report(y_true=y_test, y_pred=ensemblepred)
)

              precision    recall  f1-score   support

           0       0.92      0.87      0.89       107
           1       0.85      0.91      0.88        89

    accuracy                           0.89       196
   macro avg       0.89      0.89      0.89       196
weighted avg       0.89      0.89      0.89       196



## Max Voting
### Logistic Regression, KNN, Random Forest 활용한 Max Voting Ensemble 모델

In [33]:
from sklearn.ensemble import VotingClassifier

In [34]:
# Ensemble model built with VotingClassifier: the three base models are
# combined under the 'hard' voting strategy.
model = VotingClassifier(
    estimators=[('lr', model1), ('knn', model2), ('rf', model3)],
    voting='hard',
)

In [35]:
# Fit the voting ensemble on the training split; the fitted estimator is the
# cell's displayed value.
model.fit(X=X_train, y=y_train)



VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=123,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('knn',
                              KNeighborsClassifier(algorithm='auto',
                                                   leaf_size=30,
                                                   metric='minkowski',
                                                  

In [36]:
# Accuracy of the voting ensemble on the held-out test split
model.score(X=X_test, y=y_test)

0.9081632653061225

In [37]:
# Class labels predicted by the voting ensemble for the test rows
preds = model.predict(X=X_test)

In [38]:
# Confusion matrix of the true test labels against the voting-ensemble labels
print(
    confusion_matrix(y_true=y_test, y_pred=preds)
)

[[96 11]
 [ 7 82]]


In [39]:
# Per-class precision, recall and F1 for the voting ensemble
print(
    classification_report(y_true=y_test, y_pred=preds)
)

              precision    recall  f1-score   support

           0       0.93      0.90      0.91       107
           1       0.88      0.92      0.90        89

    accuracy                           0.91       196
   macro avg       0.91      0.91      0.91       196
weighted avg       0.91      0.91      0.91       196

