In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [96]:
import numpy as np

In [2]:
from sklearn.datasets import make_moons

In [3]:
X, y = make_moons(n_samples=1000, noise=0.4)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)

In [6]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)], 
    voting='soft')

In [7]:
from sklearn.metrics import accuracy_score

In [8]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.81
RandomForestClassifier 0.835
SVC 0.87
VotingClassifier 0.845


  if diff:


In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [10]:
bag_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=500,
                           max_samples=100, bootstrap=True, n_jobs=-1)

In [11]:
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [12]:
accuracy_score(y_test, y_pred)

0.865

In [13]:
bag_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=500,
                           max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True)

In [14]:
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=100, n_estimators=500, n_jobs=-1, oob_score=True,
         random_state=None, verbose=0, warm_start=False)

In [15]:
bag_clf.oob_score_

0.84875

In [16]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.865

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)

In [19]:
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred_rf)

0.865

In [20]:
from sklearn.ensemble import ExtraTreesClassifier

In [21]:
etr_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)

In [22]:
etr_clf.fit(X_train, y_train)
y_pred_rf = etr_clf.predict(X_test)
accuracy_score(y_test, y_pred_rf)

0.855

In [23]:
from sklearn.ensemble import AdaBoostClassifier

In [39]:
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), 
                             n_estimators=200, algorithm="SAMME.R", 
                             learning_rate=0.05)

In [40]:
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.05, n_estimators=200, random_state=None)

In [41]:
accuracy_score(y_test, ada_clf.predict(X_test))

0.875

In [42]:
from sklearn.ensemble import GradientBoostingClassifier

In [91]:
gbdt_clf = GradientBoostingClassifier(max_depth=3, n_estimators=200, 
                                      learning_rate=0.05)

In [92]:
gbdt_clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [93]:
accuracy_score(y_test, gbdt_clf.predict(X_test))

0.885

In [94]:
accuracies = [accuracy_score(y_test, y_pred) for y_pred in gbdt_clf.staged_predict(X_test)]

In [97]:
bst_num_estimators = np.argmax(accuracies)

In [99]:
gbdt_clf_best = GradientBoostingClassifier(max_depth=2, 
                                           n_estimators=bst_num_estimators,
                                           learning_rate=0.05)

In [100]:
gbdt_clf_best.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=2,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=113,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [101]:
accuracy_score(y_test, gbdt_clf_best.predict(X_test))

0.885