## Ensemble Learning

Random Forest and ensemble-learning project using the moons and Iris datasets.

In [30]:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import numpy as np

#### Voting Classifiers

In [17]:
# Generate a noisy two-class "moons" dataset (seeded for reproducibility)
X1, y1 = make_moons(n_samples=1000, noise=0.4, random_state=42)

# Hold out 20% of the samples as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X1, y1, test_size=0.2, random_state=42
)

# Sanity check: report the shape of every split
print("Dimensions of training data:")
for label, split in (("X_train:", X_train), ("y_train:", y_train)):
    print(label, split.shape)
print("\nDimensions of testing data:")
for label, split in (("X_test:", X_test), ("y_test:", y_test)):
    print(label, split.shape)


Dimensions of training data:
X_train: (800, 2)
y_train: (800,)

Dimensions of testing data:
X_test: (200, 2)
y_test: (200,)


In [18]:
# Three different kinds of base learner for the ensemble.
# The random forest samples rows and features at random, so it is seeded
# explicitly; otherwise the accuracies reported below change on every
# Restart & Run All.
lr_clf = LogisticRegression()
rf_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC()

# Hard voting: every fitted estimator predicts a class and the majority wins.
voting_clf = VotingClassifier(
    estimators=[('lr', lr_clf), ('rf', rf_clf), ('svc', svm_clf)],
    voting='hard'
)

voting_clf.fit(X_train, y_train)

In [19]:
from sklearn.metrics import accuracy_score

# Train each individual model and the voting ensemble on the same split,
# then print its accuracy on the held-out test set.
models = (lr_clf, rf_clf, svm_clf, voting_clf)
for clf in models:
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.82
RandomForestClassifier 0.855
SVC 0.855
VotingClassifier 0.87


As can be seen above, the voting classifier is slightly more accurate than each of the individual classifiers.

#### Bagging and Pasting classifiers

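Bagging and pasting train the same algorithm on different random subsets of the training set: bagging samples with replacement, pasting samples without replacement.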

In [20]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [21]:
# Bagging: 500 decision trees, each trained on 100 instances drawn with
# replacement (bootstrap=True). oob_score=True also evaluates the ensemble on
# the instances each tree did not see during training.
# For pasting (sampling without replacement) set bootstrap=False and remove
# oob_score=True, since out-of-bag scoring requires bootstrap sampling.
bagging_options = dict(
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=24,
    oob_score=True,
)
bag_clf = BaggingClassifier(DecisionTreeClassifier(), **bagging_options)

# Fit, then report test-set accuracy followed by the out-of-bag score
y_pred_bag = bag_clf.fit(X_train, y_train).predict(X_test)
print('Accuracy: ',accuracy_score(y_test, y_pred_bag))
print(bag_clf.oob_score_)
# Observed test accuracy: bagging = 0.85, pasting = 0.855

Accuracy:  0.85
0.8525


In [22]:
# Out-of-bag class probabilities: one row per training instance, one column per class.
# NOTE: this needs bag_clf to be the model fitted with oob_score=True. The name
# bag_clf is rebound further down to a model created without oob_score, so this
# cell must run before that one.
bag_clf.oob_decision_function_

array([[0.04298643, 0.95701357],
       [0.89838337, 0.10161663],
       [0.99548533, 0.00451467],
       ...,
       [0.90702948, 0.09297052],
       [0.53056769, 0.46943231],
       [0.06966292, 0.93033708]])

In [23]:
# Random forest: 500 trees, each capped at 16 leaf nodes, trained on all cores.
# random_state is fixed because the forest samples rows and candidate features
# at random; without a seed the printed accuracy changes from run to run.
rf_clf = RandomForestClassifier(
    n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42
)

rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_rf))

0.85


In [24]:
# Bagging ensemble of randomized decision trees (random split thresholds,
# at most 16 leaves), each trained on a full-size bootstrap sample.
# NOTE(review): presumably intended as a bagging counterpart to the random
# forest cell; confirm.
# random_state seeds both the bootstrap sampling and the random splitters so
# the printed accuracy is reproducible.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter='random', max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1,
    random_state=42
)

bag_clf.fit(X_train, y_train)
y_pred_bag_clf = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_bag_clf))

0.86


In [25]:
from sklearn.ensemble import ExtraTreesClassifier

# Extremely randomized trees: 500 trees, each capped at 16 leaf nodes.
# The split selection is random, so the model is seeded to keep the reported
# accuracy stable across runs.
extra_clf = ExtraTreesClassifier(
    n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42
)
extra_clf.fit(X_train, y_train)
y_pred_extra_trees = extra_clf.predict(X_test)

print("Accuracy score for ExtraTreesClassifier:", accuracy_score(y_test, y_pred_extra_trees))

Accuracy score for ExtraTreesClassifier: 0.855


##### Comparing different classifiers using the Iris dataset

In [26]:
from sklearn.datasets import load_iris

# NOTE: despite its name, `df` holds the object returned by load_iris(), which
# is indexed by key below ('data', 'target', 'feature_names').
df = load_iris()

# NOTE: this rebinds X_train/X_test/y_train/y_test, replacing the moons splits
# used by the cells above; re-running those cells afterwards would use Iris data.
X_train, X_test, y_train, y_test = train_test_split(
    df['data'],
    df['target'],
    test_size=0.4,
    random_state=42,
)

# Seed the forest so the accuracy and the feature importances are reproducible.
rf_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", accuracy)

# Show the importance of each feature as estimated by the fitted forest
for name, score in zip(df['feature_names'], rf_clf.feature_importances_):
    print(name, score)

Accuracy Score: 0.9833333333333333
sepal length (cm) 0.10055125029547227
sepal width (cm) 0.037725538074469794
petal length (cm) 0.4007830258696043
petal width (cm) 0.4609401857604536


In [27]:
# AdaBoost classifier: 200 boosted decision stumps (max_depth=1) with a
# learning rate of 0.5.
from sklearn.ensemble import AdaBoostClassifier

# The `algorithm` argument is deliberately left at the library default:
# 'SAMME.R' was the default in the scikit-learn versions that supported it,
# and newer releases deprecate and then remove that option, so passing it
# explicitly breaks the cell there.
# random_state makes the boosted ensemble reproducible.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200, learning_rate=0.5, random_state=42
)

ada_clf.fit(X_train, y_train)
y_pred_ada = ada_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_ada))



0.9666666666666667


In [None]:
# Gradient boosting regression.
# NOTE(review): this cell has no execution count, so it appears not to have been
# run. In the visible portion the regressor is only constructed with default
# hyperparameters (no fit/predict yet), and mean_squared_error is imported but
# not yet used.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

gbrt = GradientBoostingRegressor()