# Ensemble Machine Learning Algorithms in Python with scikit-learn


- **Bagging.** Building multiple models (typically of the same type) from different subsamples of the training dataset.
- **Boosting.** Building multiple models (typically of the same type), each of which learns to fix the prediction errors of the prior model in the chain.
- **Voting.** Building multiple models (typically of differing types) and combining their predictions with simple statistics (such as the mean).


[Source](https://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/)

In [1]:
# Bagged Decision Trees for Classification
import pandas
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier


In [2]:
import warnings
warnings.filterwarnings('ignore')

Bagging Algorithms

Bagging performs best with algorithms that have high variance. Decision trees, often constructed without pruning, are a popular example.

In [3]:
# Load the Pima Indians diabetes data. The CSV has no header row, so the
# column names are supplied explicitly.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
# Convert the DataFrame to a NumPy array: the first eight columns are the
# features, the ninth ('class') is the target.
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
seed = 7
# KFold does not shuffle by default, so the folds are already deterministic.
# Passing random_state together with shuffle=False has no effect and raises
# ValueError on recent scikit-learn releases, so it is omitted here.
kfold = model_selection.KFold(n_splits=10)
cart = DecisionTreeClassifier()
num_trees = 100
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn, but it is the first
# positional parameter under both names.
model = BaggingClassifier(cart, n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("results =", results)
print("results.mean() - ", results.mean())
type(results)



results = [0.67532468 0.81818182 0.75324675 0.63636364 0.81818182 0.81818182
 0.85714286 0.85714286 0.69736842 0.77631579]
results.mean() -  0.770745044429255


numpy.ndarray

Random Forest. This is also a bagging method: it bags decision trees, and each split additionally considers only a random subset of the features.

In [4]:
# Random forest: 100 trees, each split chosen from 3 randomly selected features.
num_trees = 100
max_features = 3
# KFold does not shuffle by default, so the folds are already deterministic.
# random_state is omitted because it has no effect without shuffle=True and
# raises ValueError on recent scikit-learn releases.
kfold = model_selection.KFold(n_splits=10)
# Seed the forest so the cross-validation score is reproducible across runs.
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("results.mean() - ",results.mean())

# stopped here

results.mean() -  0.7551606288448395


Extra Trees

In [5]:
from sklearn.ensemble import ExtraTreesClassifier
# Extra Trees: 100 randomized trees, each split chosen from 7 candidate features.
num_trees = 100
max_features = 7
# KFold does not shuffle by default, so the folds are already deterministic.
# random_state is omitted because it has no effect without shuffle=True and
# raises ValueError on recent scikit-learn releases.
kfold = model_selection.KFold(n_splits=10)
# Seed the ensemble so the cross-validation score is reproducible across runs.
model = ExtraTreesClassifier(n_estimators=num_trees, max_features=max_features, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("results.mean() - ",results.mean())

results.mean() -  0.7655844155844156


Boosting Algorithms

AdaBoost

In [6]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 30 sequentially fitted estimators (the library's default base learner).
num_trees = 30
# KFold does not shuffle by default, so the folds are already deterministic.
# random_state is omitted because it has no effect without shuffle=True and
# raises ValueError on recent scikit-learn releases.
kfold = model_selection.KFold(n_splits=10)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("results.mean() - ",results.mean())

results.mean() -  0.760457963089542


Stochastic Gradient Boosting

In [7]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 100 boosting stages.
# NOTE: `subsample` is left at its default of 1.0, so every stage is fitted on
# the full training fold; pass subsample < 1.0 to get the stochastic variant.
num_trees = 100
# KFold does not shuffle by default, so the folds are already deterministic.
# random_state is omitted because it has no effect without shuffle=True and
# raises ValueError on recent scikit-learn releases.
kfold = model_selection.KFold(n_splits=10)
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("results.mean() - ",results.mean())

results.mean() -  0.7681989063568012


Voting Ensemble

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import cross_val_score

In [9]:
# KFold does not shuffle by default, so the folds are already deterministic.
# random_state is omitted because it has no effect without shuffle=True and
# raises ValueError on recent scikit-learn releases.
kfold = model_selection.KFold(n_splits=10)
# Create the sub-models: three different model families, so their errors are
# less likely to coincide.
estimators = []
model1 = LogisticRegression(max_iter=1000)
estimators.append(('logistic', model1))
# Seed the tree so the ensemble's cross-validation score is reproducible.
model2 = DecisionTreeClassifier(random_state=seed)
estimators.append(('cart', model2))
model3 = SVC()
estimators.append(('svm', model3))
# Create the ensemble model; VotingClassifier is left at its default voting mode.
ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X, Y, cv=kfold)
print("results.mean() - ",results.mean())

results.mean() -  0.7603896103896104
