In [2]:
from sklearn import datasets

# Load the iris dataset and keep only the feature columns at index 1 and 2,
# so every model below is trained on a two-feature problem.
iris = datasets.load_iris()
X = iris.data[:, 1:3]
y = iris.target

In [3]:
try:
    # scikit-learn < 0.20 ships the cross-validation helpers in this module.
    from sklearn import cross_validation
except ImportError:
    # The module was removed in scikit-learn 0.20. `model_selection` provides
    # `cross_val_score` with the same call signature used in this notebook, so
    # it is bound to the old name to keep the later cells working unchanged.
    from sklearn import model_selection as cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Seed numpy's global RNG before the cross-validation runs below.
np.random.seed(123)

clf1 = LogisticRegression()
clf2 = RandomForestClassifier()
clf3 = GaussianNB()

print('5-fold cross validation:\n')

# Score each base classifier on its own to get a baseline for the ensemble.
for clf, label in zip([clf1, clf2, clf3], ['Logistic Regression', 'Random Forest', 'naive Bayes']):

    scores = cross_validation.cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

5-fold cross validation:

Accuracy: 0.90 (+/- 0.05) [Logistic Regression]
Accuracy: 0.92 (+/- 0.05) [Random Forest]
Accuracy: 0.91 (+/- 0.04) [naive Bayes]


# Create custom Ensembler

scikit-learn already provides an equivalent estimator: `sklearn.ensemble.VotingClassifier(estimators, voting='hard', weights=None, n_jobs=1, flatten_transform=None)`.



In [4]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
import numpy as np
import operator

class EnsembleClassifier(BaseEstimator, ClassifierMixin):
    """
    Ensemble classifier for scikit-learn estimators.

    Parameters
    ----------

    clfs : `iterable`
      A list of scikit-learn classifier objects.
    weights : `list` (default: `None`)
      If `None` (or empty), the majority rule voting will be applied to the predicted class labels.
      If a list of weights (`float` or `int`, one per classifier) is provided, the weighted average of the
      raw probabilities (via `predict_proba`) will be used to determine the most confident class label.

    Attributes
    ----------

    classes_ : numpy array, shape = [n_classes]
      Sorted unique class labels passed to `fit`.
    probas_ : list of numpy arrays
      Output of `predict_proba` of every classifier, stored by the most recent
      call to this ensemble's `predict_proba`.

    """
    def __init__(self, clfs, weights=None):
        self.clfs = clfs
        self.weights = weights

    def _has_weights(self):
        """Return True when a non-empty `weights` sequence was supplied."""
        # `len()` instead of plain truthiness so numpy arrays are accepted too.
        return self.weights is not None and len(self.weights) > 0

    def _class_labels(self):
        """Return the label array used to turn winning indices into class labels."""
        labels = getattr(self, 'classes_', None)
        if labels is None:
            # `fit` was never called on the ensemble itself (the classifiers were
            # fitted beforehand); use the labels recorded by the first classifier.
            labels = self.clfs[0].classes_
        return np.asarray(labels)

    def fit(self, X, y):
        """
        Fit the scikit-learn estimators.

        Parameters
        ----------

        X : numpy array, shape = [n_samples, n_features]
            Training data
        y : list or numpy array, shape = [n_samples]
            Class labels

        Returns
        ----------

        self : object
            The fitted ensemble, so calls can be chained.

        """
        # Sorted unique labels; used by `predict` to map indices back to labels.
        self.classes_ = np.unique(y)
        for clf in self.clfs:
            clf.fit(X, y)
        return self

    def predict(self, X):
        """
        Parameters
        ----------

        X : numpy array, shape = [n_samples, n_features]

        Returns
        ----------

        maj : numpy array, shape = [n_samples]
            Predicted class labels: the class with the highest weighted average
            probability if `weights` is given, otherwise the majority vote of
            the classifiers' predicted labels (ties go to the smallest label).

        """
        labels = self._class_labels()

        if self._has_weights():
            avg = self.predict_proba(X)
            # Column index of the highest averaged probability per sample.
            # scikit-learn classifiers order `predict_proba` columns like
            # their `classes_`, i.e. like `labels`.
            winners = np.argmax(avg, axis=1)
        else:
            # votes[i, j] is the label classifier i predicts for sample j.
            votes = np.asarray([clf.predict(X) for clf in self.clfs])
            # Encode labels as positions in `labels` so `bincount` also works
            # for string, negative or non-contiguous class labels.
            encoded = np.searchsorted(labels, votes)
            winners = np.asarray(
                [np.argmax(np.bincount(column, minlength=len(labels)))
                 for column in encoded.T],
                dtype=int)

        # Translate winning indices back into the original class labels.
        maj = labels[winners]
        return maj

    def predict_proba(self, X):

        """
        Parameters
        ----------

        X : numpy array, shape = [n_samples, n_features]

        Returns
        ----------

        avg : numpy array, shape = [n_samples, n_probabilities]
            Weighted average probability for each class per sample
            (plain mean when no weights were given).

        """
        self.probas_ = [clf.predict_proba(X) for clf in self.clfs]
        # An empty weights sequence is treated like `None` (unweighted mean).
        weights = self.weights if self._has_weights() else None
        avg = np.average(self.probas_, axis=0, weights=weights)

        return avg

In [5]:
# Re-seed so this comparison starts from the same RNG state as the baseline run.
np.random.seed(123)
eclf = EnsembleClassifier(clfs=[clf1, clf2, clf3], weights=[1, 1, 1])

# The three base classifiers followed by the equally weighted ensemble.
labelled_models = [
    (clf1, 'Logistic Regression'),
    (clf2, 'Random Forest'),
    (clf3, 'naive Bayes'),
    (eclf, 'Ensemble'),
]

for clf, label in labelled_models:
    scores = cross_validation.cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))



Accuracy: 0.90 (+/- 0.05) [Logistic Regression]
Accuracy: 0.92 (+/- 0.05) [Random Forest]
Accuracy: 0.91 (+/- 0.04) [naive Bayes]
Accuracy: 0.95 (+/- 0.03) [Ensemble]


# Tuning the weights for the classifiers

In [11]:
import pandas as pd

np.random.seed(123)

df = pd.DataFrame(columns=('w1', 'w2', 'w3', 'mean', 'std'))

# All (w1, w2, w3) combinations with each weight in 1..3, enumerated in the
# same order as three nested loops. Combinations where all three weights are
# equal are skipped.
weight_grid = [
    (w1, w2, w3)
    for w1 in range(1, 4)
    for w2 in range(1, 4)
    for w3 in range(1, 4)
    if len({w1, w2, w3}) > 1
]

# One 5-fold cross-validation run per weight combination; each run adds a row.
for i, (w1, w2, w3) in enumerate(weight_grid):
    eclf = EnsembleClassifier(clfs=[clf1, clf2, clf3], weights=[w1, w2, w3])
    scores = cross_validation.cross_val_score(
        estimator=eclf,
        X=X,
        y=y,
        cv=5,
        scoring='accuracy',
        n_jobs=1,
    )
    df.loc[i] = [w1, w2, w3, scores.mean(), scores.std()]

# Best mean accuracy first; ties are ordered by standard deviation (descending).
df.sort_values(['mean', 'std'], ascending=False)

Unnamed: 0,w1,w2,w3,mean,std
2,1.0,2.0,1.0,0.953333,0.033993
17,3.0,1.0,2.0,0.953333,0.033993
16,3.0,1.0,1.0,0.946667,0.045216
20,3.0,2.0,2.0,0.946667,0.045216
1,1.0,1.0,3.0,0.946667,0.04
6,1.0,3.0,2.0,0.946667,0.033993
7,1.0,3.0,3.0,0.946667,0.033993
11,2.0,2.0,1.0,0.946667,0.033993
13,2.0,3.0,1.0,0.946667,0.033993
14,2.0,3.0,2.0,0.946667,0.033993


In [12]:
from sklearn import * 



In [22]:
# Load the credit-default data and build a binary target:
# rows where the target column equals 2 become 1, everything else 0.
target = "default"
df = pd.read_csv("/data/credit-default.csv")
y = np.where(df[target] == 2, 1, 0)

# Features are all remaining columns, one-hot encoded
# (first level of each categorical dropped).
X = df.drop(target, axis=1)
X = pd.get_dummies(X, drop_first=True)

# Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1)

# Four base classifiers combined with equal weights by the custom ensemble.
base_classifiers = [
    linear_model.LogisticRegression(),
    tree.DecisionTreeClassifier(),
    svm.SVC(probability=True),
    neighbors.KNeighborsClassifier(n_neighbors=10),
]
steps = [
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("model", EnsembleClassifier(base_classifiers, [1, 1, 1, 1])),
]
est = pipeline.Pipeline(steps)
est.fit(X_train, y_train)

y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)

# Report hold-out metrics only.
test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
test_precision = metrics.precision_score(y_test, y_test_pred)
test_recall = metrics.recall_score(y_test, y_test_pred)
print("Accuracy: %f\nPrecision: %f\nRecall: %f" % (
    test_accuracy, test_precision, test_recall))

Accuracy: 0.753333
Precision: 0.620000
Recall: 0.360465


# Using out of the box Ensemblers

### Boosted Models

In [26]:
# Load the credit-default data and build a binary target:
# rows where the target column equals 2 become 1, everything else 0.
df = pd.read_csv("/data/credit-default.csv")
target = "default"
X = df.copy()
del X[target]
y = np.where(df[target] == 2, 1, 0)

# One-hot encode the features and hold out 30% of the rows for testing.
X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, 
                                        test_size = 0.3, random_state = 1)

est = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    # The base estimator is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # later removed, whereas the first positional slot works on both old and
    # new releases.
    ("model", ensemble.AdaBoostClassifier(LogisticRegression()
                                          , n_estimators=50, random_state=1))
])
est.fit(X_train, y_train)

y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)

# Report hold-out metrics only.
print("Accuracy: %f\nPrecision: %f\nRecall: %f" % (
    metrics.accuracy_score(y_test, y_test_pred),
    metrics.precision_score(y_test, y_test_pred),
    metrics.recall_score(y_test, y_test_pred)
))

Accuracy: 0.756667
Precision: 0.600000
Recall: 0.453488
