In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np;
import pandas as pd
import sklearn.metrics as metrics

## Classification using Ensemble Methods

We'll go over code for some ensemble methods in `scikit-learn`. These are defined in the module `sklearn.ensemble`. 

**See:** https://scikit-learn.org/stable/modules/ensemble.html

### Read in a data set

We'll use the `wdbc` breast cancer data set. In this dataset, `B` is 1 and `M` is 0. 

In [4]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

bc=datasets.load_breast_cancer()

#iris = datasets.load_iris()

X = bc.data
y = bc.target



In [6]:
wdbc_data = '../data-sets/wdbc.csv'
df = pd.read_csv(wdbc_data, header=0)
data = df.iloc[:].values
X = data[:, 1:-1]
y = data[:, -1]

### Scaling the data

We'll scale the data using a standard scaler and then split the sets into training and testing sets. 

In [7]:
from sklearn.model_selection import train_test_split
from  sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
XScaled= scaler.fit_transform(X)

X_train, X_test, y_train, y_test =  train_test_split(XScaled, y, test_size=0.3, shuffle=True)

print("Training set size:", len(y_train))
print("Testing set size:", len(y_test))
print("Input attributes:",X.shape[1])


Training set size: 398
Testing set size: 171
Input attributes: 30


## Bagging

#### Classification

Bagging estimators exist for classification and for regression. The estimator for classfication is called `BaggingClassifier`. Note the parameters that can be set: 

*  `base_estimator`
*  `n_estimators`
*  `max_samples`
*  `max_features`

**See:** https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html

Below, we'll create a descision tree classifier and compare it to bagged decision trees. 

In [8]:
from sklearn.ensemble import BaggingClassifier
from sklearn import tree

treeLearner = tree.DecisionTreeClassifier()

bagging = BaggingClassifier(base_estimator=tree.DecisionTreeClassifier(), 
                            n_estimators=25, 
                            max_samples=1.0, 
                            max_features=1.0)


for clf in [treeLearner,bagging]:
    clf.fit(X_train, y_train)                         
    predicted= np.array(clf.predict(X_test))
    cm = metrics.confusion_matrix(y_test, predicted,labels=[1,0])
    print()
    print(clf)
    print("","B","M",sep="\t")
    print("B",cm[0,0],cm[0,1],sep="\t" )
    print("M",cm[1,0],cm[1,1],sep="\t" )
    print('-'*20)
    print("accuracy:", metrics.accuracy_score(predicted,y_test))   


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
	B	M
B	101	10
M	6	54
--------------------
accuracy: 0.9064327485380117

BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        

#### Regression

For regression, `BaggingRegressor` can be used. We'll try it with a California housing data set. 

**See:** https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html

In [9]:
from sklearn.ensemble import BaggingRegressor
from sklearn import tree
import scipy.stats

treeLearner = tree.DecisionTreeRegressor()

bagging = BaggingRegressor(base_estimator=tree.DecisionTreeRegressor(), 
                            n_estimators=25, 
                            max_samples=1.0, 
                            max_features=1.0)

scaler = StandardScaler()

cali= datasets.fetch_california_housing()
XCaliScaled= scaler.fit_transform(cali.data)

X_cali_train, X_cali_test, y_cali_train, y_cali_test =  train_test_split(XCaliScaled, cali.target, test_size=0.3, shuffle=True)

print("Training set size:", len(y_cali_train))
print("Testing set size:", len(y_cali_test))
print("Input attributes:",XCaliScaled.shape[1])

print('-'*20)

for clf in [treeLearner,bagging]:
    clf.fit(X_cali_train, y_cali_train)                         
    predicted= np.array(clf.predict(X_cali_test))
    print(f"{clf.__class__}\t MSE:{metrics.mean_squared_error(y_cali_test,predicted)}")

print('-'*20)
    
    
for i in range(10):
    print(clf.predict([X_cali_test[i]])[0], y_cali_test[i])

Training set size: 14448
Testing set size: 6192
Input attributes: 8
--------------------
<class 'sklearn.tree._classes.DecisionTreeRegressor'>	 MSE:0.5347956993773441
<class 'sklearn.ensemble._bagging.BaggingRegressor'>	 MSE:0.26757225464244205
--------------------
2.0232000000000006 2.443
1.7665599999999997 1.729
1.10364 0.862
0.75464 0.611
0.85328 0.868
2.3042800000000003 3.257
2.14224 2.295
2.39656 3.229
0.76824 0.643
2.44276 1.975


In [12]:
scipy.stats.describe(cali.target)

DescribeResult(nobs=20640, minmax=(0.14999, 5.00001), mean=2.068558169089147, variance=1.3316148163035277, skewness=0.9776922140978419, kurtosis=0.3275001388119616)

## Random Forests

**See:** https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

#### Classification

In [13]:
from sklearn.ensemble import RandomForestClassifier    
from sklearn import tree

treeLearner = tree.DecisionTreeClassifier()

randomF = RandomForestClassifier(n_estimators=50, 
                            criterion='gini', # or entropy
                            max_depth=None, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            max_features='auto', 
                            bootstrap=True)

for clf in [treeLearner,randomF]:
    clf.fit(X_train, y_train)                         
    predicted= np.array(clf.predict(X_test))
    cm = metrics.confusion_matrix(y_test, predicted,labels=[1,0])
    print(f"\n{clf.__class__}")
    print("","B","M",sep="\t")
    print("B",cm[0,0],cm[0,1],sep="\t" )
    print("M",cm[1,0],cm[1,1],sep="\t" )
    print('-'*20)
    print("accuracy:", metrics.accuracy_score(predicted,y_test))   


<class 'sklearn.tree._classes.DecisionTreeClassifier'>
	B	M
B	101	10
M	8	52
--------------------
accuracy: 0.8947368421052632

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
	B	M
B	106	5
M	5	55
--------------------
accuracy: 0.9415204678362573


#### Regression

**See:** https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
import scipy.stats

treeLearner = tree.DecisionTreeRegressor()

bagging = RandomForestRegressor(n_estimators=50, 
                            criterion='mse', # mae
                            max_depth=None, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            max_features='auto', 
                            bootstrap=True)

scaler = StandardScaler()
XCaliScaled= scaler.fit_transform(cali.data)

X_cali_train, X_cali_test, y_cali_train, y_cali_test =  train_test_split(XCaliScaled, cali.target, test_size=0.3, shuffle=True)

print("Training set size:", len(y_cali_train))
print("Testing set size:", len(y_cali_test))
print("Input attributes:",XCaliScaled.shape[1])
print(scipy.stats.describe(cali.target))

print('-'*20)


for clf in [treeLearner,bagging]:
    clf.fit(X_cali_train, y_cali_train)                         
    predicted= np.array(clf.predict(X_cali_test))
    print("MSE:",metrics.mean_squared_error(y_cali_test,predicted))

print('-'*20)
    
for i in range(10):
    print(clf.predict([X_cali_test[i]])[0], y_cali_test[i])

Training set size: 14448
Testing set size: 6192
Input attributes: 8
DescribeResult(nobs=20640, minmax=(0.14999, 5.00001), mean=2.068558169089147, variance=1.3316148163035277, skewness=0.9776922140978419, kurtosis=0.3275001388119616)
--------------------
MSE: 0.5622960422606792
MSE: 0.2576759036837204
--------------------
3.0213203999999996 3.31
0.88696 0.95
1.889720000000001 4.5
1.7393600000000005 1.708
1.7983199999999997 2.215
1.9486199999999996 2.308
2.7991601999999993 4.029
2.3649199999999997 2.151
1.8054400000000006 1.648
1.1465000000000003 1.264


## Extremely Randomized Trees

Here, random candidate splits are generated, and the best amoung these is chosen. 

**See:** https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html

In [15]:
from sklearn.ensemble import ExtraTreesClassifier    
from sklearn import tree

treeLearner = tree.DecisionTreeClassifier()

et = ExtraTreesClassifier(n_estimators=50, 
                            criterion='gini', 
                            max_depth=None, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            max_features='auto', 
                            bootstrap=True)

for clf in [treeLearner,et]:
    clf.fit(X_train, y_train)                         
    predicted= np.array(clf.predict(X_test))
    cm = metrics.confusion_matrix(y_test, predicted,labels=[1,0])
    print("","B","M",sep="\t")
    print("B",cm[0,0],cm[0,1],sep="\t" )
    print("M",cm[1,0],cm[1,1],sep="\t" )
    print('-'*20)
    print("accuracy:", metrics.accuracy_score(predicted,y_test))   

	B	M
B	101	10
M	6	54
--------------------
accuracy: 0.9064327485380117
	B	M
B	108	3
M	4	56
--------------------
accuracy: 0.9590643274853801


## AdaBoost

#### Classification

**See:** https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

Regarding the algorithm used: 

* "If `SAMME.R` then use the SAMME.R real boosting algorithm. base_estimator must support calculation of class probabilities." 
* "If `SAMME` then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations."

In [16]:
from sklearn.ensemble import AdaBoostClassifier  
from sklearn import tree

treeLearner = tree.DecisionTreeClassifier()

ad = AdaBoostClassifier( base_estimator =tree.DecisionTreeClassifier(max_depth=5),
                           n_estimators=25, 
                           learning_rate=1.0, 
                           algorithm='SAMME.R')

for clf in [treeLearner,ad]:
    clf.fit(X_train, y_train)                         
    predicted= np.array(clf.predict(X_test))
    cm = metrics.confusion_matrix(y_test, predicted,labels=[1,0])
    print("","B","M",sep="\t")
    print("B",cm[0,0],cm[0,1],sep="\t" )
    print("M",cm[1,0],cm[1,1],sep="\t" )
    print('-'*20)
    print("accuracy:", metrics.accuracy_score(predicted,y_test))   

	B	M
B	101	10
M	5	55
--------------------
accuracy: 0.9122807017543859
	B	M
B	105	6
M	6	54
--------------------
accuracy: 0.9298245614035088


In [17]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn import tree
import scipy.stats

treeLearner = tree.DecisionTreeRegressor()

bagging = AdaBoostRegressor( base_estimator =tree.DecisionTreeRegressor(),
                           n_estimators=25, 
                           learning_rate=1.0, 
                           loss='square')

scaler = StandardScaler()
XCaliScaled= scaler.fit_transform(cali.data)

X_cali_train, X_cali_test, y_cali_train, y_cali_test =  train_test_split(XCaliScaled, cali.target, test_size=0.3, shuffle=True)

print("Training set size:", len(y_cali_train))
print("Testing set size:", len(y_cali_test))
print("Input attributes:",XCaliScaled.shape[1])
print(scipy.stats.describe(cali.target))

print('-'*20)


for clf in [treeLearner,bagging]:
    clf.fit(X_cali_train, y_cali_train)                         
    predicted= np.array(clf.predict(X_cali_test))
    print("MSE:",metrics.mean_squared_error(y_cali_test,predicted))

print('-'*20)
    
for i in range(10):
    print(clf.predict([X_cali_test[i]]), y_cali_test[i])

Training set size: 14448
Testing set size: 6192
Input attributes: 8
DescribeResult(nobs=20640, minmax=(0.14999, 5.00001), mean=2.068558169089147, variance=1.3316148163035277, skewness=0.9776922140978419, kurtosis=0.3275001388119616)
--------------------
MSE: 0.5500306827816238
MSE: 0.24607775772125112
--------------------
[1.219] 1.25
[2.693] 3.124
[3.647] 5.00001
[0.946] 0.838
[0.913] 1.408
[1.164] 1.306
[1.581] 1.498
[0.9] 1.333
[0.863] 0.691
[4.847] 4.373


## Voting

Votiting classifier and regressor classes are also defined in sklearn. 

* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html

In [18]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

clf1 = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                          random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = tree.DecisionTreeClassifier()

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('dt', clf3)], voting='hard')

for clf in [clf1, clf2, clf3, eclf]:
    clf.fit(X_train, y_train)                         
    predicted= np.array(clf.predict(X_test))
    cm = metrics.confusion_matrix(y_test, predicted,labels=[1,0])
    print('-'*80)
    print(clf.__class__)
    print("","B","M",sep="\t")
    print("B",cm[0,0],cm[0,1],sep="\t" )
    print("M",cm[1,0],cm[1,1],sep="\t" )
    print('-'*20)
    print("accuracy:", metrics.accuracy_score(predicted,y_test))   


--------------------------------------------------------------------------------
<class 'sklearn.linear_model._logistic.LogisticRegression'>
	B	M
B	110	1
M	3	57
--------------------
accuracy: 0.9766081871345029
--------------------------------------------------------------------------------
<class 'sklearn.ensemble._forest.RandomForestClassifier'>
	B	M
B	104	7
M	6	54
--------------------
accuracy: 0.9239766081871345
--------------------------------------------------------------------------------
<class 'sklearn.tree._classes.DecisionTreeClassifier'>
	B	M
B	101	10
M	7	53
--------------------
accuracy: 0.9005847953216374
--------------------------------------------------------------------------------
<class 'sklearn.ensemble._voting.VotingClassifier'>
	B	M
B	104	7
M	5	55
--------------------
accuracy: 0.9298245614035088
