<a href="https://colab.research.google.com/github/bodadaniel/Ensemble_methods/blob/main/Ensemble_methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import packages

In [2]:
pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/100.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-23.7.0-py3-none-any.whl (17 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.7.0 scikit-optimize-0.9.0


In [31]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from skopt import BayesSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from collections import OrderedDict
import itertools
from mlxtend.classifier import StackingClassifier as MLStackingClassifier
from sklearn.ensemble import StackingClassifier as SLStackingClassifier
from sklearn.linear_model import LogisticRegression


#Load data

In [4]:
data = load_breast_cancer()

In [5]:
df = pd.DataFrame(data.data, columns = data.feature_names)
df['target'] = data.target
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


#Data properties

In [58]:
df.shape

(569, 31)

In [None]:
(df.isna().sum() == 0).all()

True

In [None]:
df.target.value_counts()

1    357
0    212
Name: target, dtype: int64

In [None]:
df.dtypes

mean radius                float64
mean texture               float64
mean perimeter             float64
mean area                  float64
mean smoothness            float64
mean compactness           float64
mean concavity             float64
mean concave points        float64
mean symmetry              float64
mean fractal dimension     float64
radius error               float64
texture error              float64
perimeter error            float64
area error                 float64
smoothness error           float64
compactness error          float64
concavity error            float64
concave points error       float64
symmetry error             float64
fractal dimension error    float64
worst radius               float64
worst texture              float64
worst perimeter            float64
worst area                 float64
worst smoothness           float64
worst compactness          float64
worst concavity            float64
worst concave points       float64
worst symmetry      

In [None]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mean radius,569.0,14.127292,3.524049,6.981,11.7,13.37,15.78,28.11
mean texture,569.0,19.289649,4.301036,9.71,16.17,18.84,21.8,39.28
mean perimeter,569.0,91.969033,24.298981,43.79,75.17,86.24,104.1,188.5
mean area,569.0,654.889104,351.914129,143.5,420.3,551.1,782.7,2501.0
mean smoothness,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634
mean compactness,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454
mean concavity,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268
mean concave points,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012
mean symmetry,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304
mean fractal dimension,569.0,0.062798,0.00706,0.04996,0.0577,0.06154,0.06612,0.09744


In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = ['target']),
                                                    df.target,
                                                    test_size=0.2,
                                                    random_state=0)

#Estimators

Random Forest classifier

In [34]:
rd_hpars = {
    'n_estimators' : np.arange(100, 701, 200),
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'max_depth' : [None] + list(np.arange(5, 21, 5)),
    'max_features' : ['sqrt', 'log2', None],
    'min_samples_split' : [2] + list(np.arange(5, 51, 10)),
    'min_samples_leaf' : [1] + list(np.arange(5, 11, 5)),
    'bootstrap' : [True, False]
}

rd_hpars

{'n_estimators': array([100, 300, 500, 700]),
 'criterion': ['gini', 'entropy', 'log_loss'],
 'max_depth': [None, 5, 10, 15, 20],
 'max_features': ['sqrt', 'log2', None],
 'min_samples_split': [2, 5, 15, 25, 35, 45],
 'min_samples_leaf': [1, 5, 10],
 'bootstrap': [True, False]}

In [37]:
rd_grid = BayesSearchCV(estimator = RandomForestClassifier(random_state = 123),
                        search_spaces = rd_hpars,
                        n_iter = 50,
                        scoring = 'accuracy',
                        n_jobs = -1,
                        refit = True,
                        cv = 5,
                        verbose = 0,
                        random_state = 123)

rd_grid.fit(X_train, y_train)

rd_grid.score(X_test, y_test)

0.9824561403508771

In [None]:
rf_cv_results_df = pd.DataFrame(rd_grid.cv_results_)
rf_cv_results_df[rf_cv_results_df['rank_test_score'] <= 5].T

Unnamed: 0,23,26,40,47,48
mean_fit_time,1.549641,1.818277,2.240258,0.858688,0.999964
std_fit_time,0.200491,0.14888,0.038442,0.119866,0.035932
mean_score_time,0.056395,0.060551,0.085581,0.031841,0.046606
std_score_time,0.012454,0.018257,0.016764,0.005954,0.014141
param_bootstrap,False,False,True,False,True
param_criterion,entropy,log_loss,entropy,log_loss,log_loss
param_max_depth,15,,20,10,
param_max_features,sqrt,sqrt,log2,log2,log2
param_min_samples_leaf,1,1,1,1,1
param_min_samples_split,5,5,2,2,2


In [None]:
rd_grid.best_params_

OrderedDict([('bootstrap', False),
             ('criterion', 'log_loss'),
             ('max_depth', 10),
             ('max_features', 'log2'),
             ('min_samples_leaf', 1),
             ('min_samples_split', 2),
             ('n_estimators', 300)])

In [36]:
rf_clf = RandomForestClassifier(**dict(OrderedDict([('bootstrap', False),
             ('criterion', 'log_loss'),
             ('max_depth', 10),
             ('max_features', 'log2'),
             ('min_samples_leaf', 1),
             ('min_samples_split', 2),
             ('n_estimators', 300)])))

rf_clf.fit(X_train, y_train)

rf_clf.score(X_test, y_test)

0.9824561403508771

Gradient boosting classifier

In [None]:
gb_hpars = {
    'n_estimators' : np.arange(100, 701, 200),
    'learning_rate' : [0.001, 0.01, 0.1],
    'loss' : ['log_loss', 'exponential'],
    'criterion' : ['friedman_mse', 'squared_error'],
    'max_depth' : [None] + list(np.arange(5, 21, 5)),
    'max_features' : ['sqrt', 'log2', None],
    'min_samples_split' : [2] + list(np.arange(5, 51, 10)),
    'min_samples_leaf' : [1] + list(np.arange(5, 11, 5))
}

gb_hpars

{'n_estimators': array([100, 300, 500, 700]),
 'learning_rate': [0.001, 0.01, 0.1],
 'loss': ['log_loss', 'exponential'],
 'criterion': ['friedman_mse', 'squared_error'],
 'max_depth': [None, 5, 10, 15, 20],
 'max_features': ['sqrt', 'log2', None],
 'min_samples_split': [2, 5, 15, 25, 35, 45],
 'min_samples_leaf': [1, 5, 10]}

In [None]:
gb_grid = BayesSearchCV(estimator = GradientBoostingClassifier(random_state = 123),
                        search_spaces = gb_hpars,
                        n_iter = 50,
                        scoring = 'accuracy',
                        n_jobs = -1,
                        refit = True,
                        cv = 5,
                        verbose = 0,
                        random_state = 123)

gb_grid.fit(X_train, y_train)

gb_grid.score(X_test, y_test)

0.9824561403508771

In [None]:
gb_cv_results_df = pd.DataFrame(gb_grid.cv_results_)
gb_cv_results_df[gb_cv_results_df['rank_test_score'] <= 5].T

Unnamed: 0,9,17,19,28,29,34
mean_fit_time,1.868153,0.761194,1.088125,0.387955,1.180921,1.036737
std_fit_time,0.498372,0.143413,0.150031,0.066251,0.197748,0.176683
mean_score_time,0.007254,0.005371,0.005322,0.004676,0.006557,0.00668
std_score_time,0.003396,0.000986,0.00077,0.000623,0.001128,0.001935
param_criterion,friedman_mse,friedman_mse,friedman_mse,friedman_mse,friedman_mse,friedman_mse
param_learning_rate,0.1,0.1,0.1,0.1,0.1,0.1
param_loss,exponential,exponential,log_loss,exponential,log_loss,log_loss
param_max_depth,15,5,5,15,15,15
param_max_features,log2,log2,log2,log2,log2,log2
param_min_samples_leaf,5,10,5,5,10,5


In [None]:
gb_grid.best_params_

OrderedDict([('criterion', 'friedman_mse'),
             ('learning_rate', 0.1),
             ('loss', 'exponential'),
             ('max_depth', 15),
             ('max_features', 'log2'),
             ('min_samples_leaf', 5),
             ('min_samples_split', 2),
             ('n_estimators', 300)])

In [8]:
gb_clf = GradientBoostingClassifier(**dict(OrderedDict([('criterion', 'friedman_mse'),
             ('learning_rate', 0.1),
             ('loss', 'exponential'),
             ('max_depth', 15),
             ('max_features', 'log2'),
             ('min_samples_leaf', 5),
             ('min_samples_split', 2),
             ('n_estimators', 300)])))

gb_clf.fit(X_train, y_train)

gb_clf.score(X_test, y_test)

0.9824561403508771

SVM

In [None]:
svm_hpars = {
    'svc__C' : [1, 2, 3, 4, 5, 7],
    'svc__kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
    'svc__degree' : np.arange(2, 7, 1),
    'svc__gamma' : ['scale', 'auto'],
}

svm_hpars

{'svc__C': [1, 2, 3, 4, 5, 7],
 'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
 'svc__degree': array([2, 3, 4, 5, 6]),
 'svc__gamma': ['scale', 'auto']}

In [None]:
svm_grid = BayesSearchCV(estimator = Pipeline(steps=[("standardscaler", StandardScaler()), ("svc", SVC())]),
                        search_spaces = svm_hpars,
                        n_iter = 50,
                        scoring = 'accuracy',
                        n_jobs = -1,
                        refit = True,
                        cv = 5,
                        verbose = 0,
                        random_state = 123)

svm_grid.fit(X_train, y_train)

svm_grid.score(X_test, y_test)

0.9912280701754386

In [None]:
svm_cv_results_df = pd.DataFrame(svm_grid.cv_results_)
svm_cv_results_df[svm_cv_results_df['rank_test_score'] <= 5].T

Unnamed: 0,4,10,13,16,20,21,22,31,33,34,35,42,43,45,48
mean_fit_time,0.034412,0.016118,0.020571,0.02151,0.01912,0.030581,0.01941,0.023393,0.026104,0.015075,0.017751,0.021642,0.017418,0.016604,0.016641
std_fit_time,0.008668,0.005598,0.007484,0.006709,0.00696,0.010213,0.007091,0.008689,0.007274,0.004691,0.006545,0.003516,0.005497,0.006396,0.007187
mean_score_time,0.013316,0.006943,0.008283,0.009371,0.006003,0.012159,0.00708,0.00544,0.007889,0.006659,0.00689,0.009677,0.007342,0.005271,0.006382
std_score_time,0.008042,0.002726,0.0033,0.003688,0.002138,0.005248,0.003355,0.001131,0.004805,0.00382,0.002678,0.002517,0.003851,0.00155,0.002514
param_svc__C,3,3,2,2,3,3,2,2,3,3,2,2,2,2,2
param_svc__degree,4,4,5,5,5,6,2,3,3,2,6,4,2,4,3
param_svc__gamma,auto,scale,scale,auto,auto,scale,scale,auto,scale,auto,scale,auto,auto,scale,scale
param_svc__kernel,rbf,rbf,rbf,rbf,rbf,rbf,rbf,rbf,rbf,rbf,rbf,rbf,rbf,rbf,rbf
params,"{'svc__C': 3, 'svc__degree': 4, 'svc__gamma': ...","{'svc__C': 3, 'svc__degree': 4, 'svc__gamma': ...","{'svc__C': 2, 'svc__degree': 5, 'svc__gamma': ...","{'svc__C': 2, 'svc__degree': 5, 'svc__gamma': ...","{'svc__C': 3, 'svc__degree': 5, 'svc__gamma': ...","{'svc__C': 3, 'svc__degree': 6, 'svc__gamma': ...","{'svc__C': 2, 'svc__degree': 2, 'svc__gamma': ...","{'svc__C': 2, 'svc__degree': 3, 'svc__gamma': ...","{'svc__C': 3, 'svc__degree': 3, 'svc__gamma': ...","{'svc__C': 3, 'svc__degree': 2, 'svc__gamma': ...","{'svc__C': 2, 'svc__degree': 6, 'svc__gamma': ...","{'svc__C': 2, 'svc__degree': 4, 'svc__gamma': ...","{'svc__C': 2, 'svc__degree': 2, 'svc__gamma': ...","{'svc__C': 2, 'svc__degree': 4, 'svc__gamma': ...","{'svc__C': 2, 'svc__degree': 3, 'svc__gamma': ..."
split0_test_score,0.978022,0.978022,0.978022,0.978022,0.978022,0.978022,0.978022,0.978022,0.978022,0.978022,0.978022,0.978022,0.978022,0.978022,0.978022


In [None]:
svm_grid.best_params_

OrderedDict([('svc__C', 3),
             ('svc__degree', 4),
             ('svc__gamma', 'auto'),
             ('svc__kernel', 'rbf')])

In [9]:
sv_clf = Pipeline(steps=[("standardscaler", StandardScaler()), ("svc", SVC(**dict(OrderedDict([('C', 3),
             ('degree', 4),
             ('gamma', 'auto'),
             ('kernel', 'rbf'),
             ('probability', True) ]))))])

sv_clf.fit(X_train, y_train)

sv_clf.score(X_test, y_test)

0.9912280701754386

#Voting

In [27]:
#weights based on scoring
scores = [f.score(X_test, y_test) for f in (rf_clf, gb_clf, sv_clf)]
score_weights = scores / sum(scores)
score_weights

array([0.33035714, 0.33333333, 0.33630952])

In [30]:
weights = pd.Series(score_weights).map({e: i+1 for i, e in enumerate(sorted(set(score_weights)))}).values
weights

array([1, 2, 3])

In [120]:
#majority voting with weights
clf_voting = VotingClassifier(
    estimators=[
       ('rf', rf_clf),
       ('gb', gb_clf),
       ('sv', sv_clf)],
    voting='hard',
    weights = weights)

clf_voting.fit(X_train, y_train)
clf_voting.score(X_test, y_test)

0.9912280701754386

In [124]:
#majority voting without weights
clf_voting = VotingClassifier(
    estimators=[
       ('rf', rf_clf),
       ('gb', gb_clf),
       ('sv', sv_clf)],
    voting='hard')

clf_voting.fit(X_train, y_train)
clf_voting.score(X_test, y_test)

0.9824561403508771

In [122]:
#averaging with weights
clf_voting = VotingClassifier(
    estimators=[
       ('rf', rf_clf),
       ('gb', gb_clf),
       ('sv', sv_clf)],
    voting='soft',
    weights = weights)

clf_voting.fit(X_train, y_train)
clf_voting.score(X_test, y_test)

0.9912280701754386

In [123]:
#averaging without weights
clf_voting = VotingClassifier(
    estimators=[
       ('rf', rf_clf),
       ('gb', gb_clf),
       ('sv', sv_clf)],
    voting='soft')

clf_voting.fit(X_train, y_train)
clf_voting.score(X_test, y_test)

0.9824561403508771

In [166]:
#weights for fine-tuning
all = list(itertools.product([1, 2],[1, 2], [1, 2]))

weights = {'weights' : [x for x in all if not (np.array(x) == max(x)).all() == True ] + [(1, 1, 1)]}
weights

{'weights': [(1, 1, 2),
  (1, 2, 1),
  (1, 2, 2),
  (2, 1, 1),
  (2, 1, 2),
  (2, 2, 1),
  (1, 1, 1)]}

In [167]:
#grid search for fine-tuning for majority voting
vot_grid = GridSearchCV(estimator = VotingClassifier(
                                    estimators=[
                                      ('rf', rf_clf),
                                      ('gb', gb_clf),
                                      ('sv', sv_clf)],
                                    voting='hard'),
                        param_grid = weights,
                        scoring = 'accuracy',
                        n_jobs = -1,
                        refit = True,
                        cv = 5,
                        verbose = 0)

vot_grid.fit(X_train, y_train)

vot_grid.score(X_test, y_test)

0.9912280701754386

In [168]:
#same result as before
vot_grid.best_params_

{'weights': (1, 1, 2)}

In [169]:
#grid search for fine-tuning for average voting
vot_grid = GridSearchCV(estimator = VotingClassifier(
                                    estimators=[
                                      ('rf', rf_clf),
                                      ('gb', gb_clf),
                                      ('sv', sv_clf)],
                                    voting='soft'),
                        param_grid = weights,
                        scoring = 'accuracy',
                        n_jobs = -1,
                        refit = True,
                        cv = 5,
                        verbose = 0)

vot_grid.fit(X_train, y_train)

vot_grid.score(X_test, y_test)

0.9912280701754386

In [170]:
#same result as before
vot_grid.best_params_

{'weights': (1, 1, 2)}

#Stacking

MLExtend

In [200]:
st_clf = MLStackingClassifier(classifiers = [rf_clf, gb_clf, sv_clf],
                             meta_classifier = LogisticRegression())

In [201]:
st_hpars = {'meta_classifier__C': [0.09, 0.1, 0.11]}
st_hpars

{'meta_classifier__C': [0.09, 0.1, 0.11]}

In [202]:
st_grid = GridSearchCV(estimator = st_clf,
                      param_grid = st_hpars,
                      scoring = 'accuracy',
                      n_jobs = -1,
                      refit = True,
                      cv = 5,
                      verbose = 0)

st_grid.fit(X_train, y_train)

st_grid.score(X_test, y_test)

0.9824561403508771

In [203]:
st_grid.best_params_

{'meta_classifier__C': 0.09}

Scikit Learn

In [208]:
st_clf = SLStackingClassifier(estimators = [('rf', rf_clf), ('gb', gb_clf), ('sv', sv_clf)],
                              final_estimator = LogisticRegression())

In [212]:
st_hpars = {'final_estimator__C': [0.08, 0.09, 0.1]}
st_hpars

{'final_estimator__C': [0.08, 0.09, 0.1]}

In [213]:
st_grid = GridSearchCV(estimator = st_clf,
                      param_grid = st_hpars,
                      scoring = 'accuracy',
                      n_jobs = -1,
                      refit = True,
                      cv = 5,
                      verbose = 0)

st_grid.fit(X_train, y_train)

st_grid.score(X_test, y_test)

0.9736842105263158

In [214]:
st_grid.best_params_

{'final_estimator__C': 0.09}