## Custom Ensemble Machine Learning Algorithms

In [4]:
## Create synthetic data points
from sklearn.datasets import make_classification
# Generate a reproducible synthetic classification dataset:
# 1000 samples with 20 features, of which 15 are informative and 5 are
# redundant. The fixed random_state makes the generated data repeatable.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=1,
)

In [7]:
# Inspect the generated feature matrix (NumPy abbreviates the display of large arrays).
X

array([[  2.47475454,   0.40165523,   1.68081787, ...,  -6.59044146,
         -2.21290585,  -3.139579  ],
       [  0.84802507,   2.81841945,  -2.76008732, ...,   3.00844461,
          0.78661954,  -1.27681551],
       [ -1.90041246,  -0.56901823,  -1.76220236, ...,   3.37336417,
         -2.28613707,   1.90344983],
       ...,
       [  0.7673844 ,  -2.91920559,   2.80851577, ...,   4.42591832,
          0.46321196,  -3.30523346],
       [  2.05510667,  -0.99009741,   0.73577291, ...,   3.05100898,
         -1.40715279,  -0.51579331],
       [-10.96847792,  -2.39810735,  -0.96700953, ..., -11.16298557,
          1.16646392,   0.60835176]])

In [5]:
from collections import Counter
# Tally how many samples carry each class label, to check class balance.
counter = Counter(y)

In [6]:
# Show the per-class sample counts.
counter

Counter({0: 501, 1: 499})

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

# Base learners for the voting ensemble. Each estimator is wrapped in a
# single-step Pipeline whose only step is named 'm'.
# NOTE(review): the tree and forest are created without a random_state, so
# fitted models (and any scores derived from them) may differ between runs.
decision_tree = Pipeline([('m', DecisionTreeClassifier())])
randomforest = Pipeline([('m', RandomForestClassifier())])
svc = Pipeline([('m', SVC())])

# (name, estimator) pairs in the form VotingClassifier expects.
models = [
    ('decision', decision_tree),
    ('randomforest', randomforest),
    ('svc', svc),
]

# Hard voting: the ensemble predicts the class label chosen by the majority
# of the base models.
ensemble = VotingClassifier(estimators=models, voting='hard')


In [9]:
# Show the list of (name, pipeline) pairs passed to the ensemble.
models

[('decision', Pipeline(steps=[('m', DecisionTreeClassifier())])),
 ('randomforest', Pipeline(steps=[('m', RandomForestClassifier())])),
 ('svc', Pipeline(steps=[('m', SVC())]))]

In [10]:
# Show the configured (not yet fitted) voting ensemble.
ensemble

VotingClassifier(estimators=[('decision',
                              Pipeline(steps=[('m',
                                               DecisionTreeClassifier())])),
                             ('randomforest',
                              Pipeline(steps=[('m',
                                               RandomForestClassifier())])),
                             ('svc', Pipeline(steps=[('m', SVC())]))])

In [11]:
# Evaluation protocol: 10-fold stratified cross-validation repeated 3 times,
# with a fixed seed so the fold assignments are the same on every run.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# One accuracy score per fold/repeat; n_jobs=-1 runs the fits in parallel
# on all available processors.
n_scores = cross_val_score(
    ensemble,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

In [12]:
# Show the individual accuracy scores returned by cross_val_score.
n_scores

array([0.94, 0.96, 0.94, 0.92, 0.9 , 0.96, 0.94, 0.93, 0.95, 0.94, 0.96,
       0.9 , 0.91, 0.95, 0.94, 0.94, 0.95, 0.95, 0.96, 0.96, 0.94, 0.97,
       0.96, 0.96, 0.91, 0.89, 0.97, 0.94, 0.94, 0.89])

In [14]:
# Mean accuracy across all cross-validation scores.
n_scores.mean()

0.9390000000000002