In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data", header=None)

X = df.iloc[:,0:56]
y = df.iloc[:,57]

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [2]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

clf = LogisticRegression()
tree = DecisionTreeClassifier()
svm = svm.SVC(probability=True)

lr = Pipeline([['sc', StandardScaler()],
                  ['clf', clf]])

clf_labels = ['Logistic Regression', 'Decision Tree', 'Naive Bayesian']

def cv(all_clf, clf_labels):
    for clf, label in zip(all_clf, clf_labels):
        scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='roc_auc')
        print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
        
cv([lr,tree,svm], clf_labels)

ROC AUC: 0.97 (+/- 0.01) [Logistic Regression]
ROC AUC: 0.90 (+/- 0.02) [Decision Tree]
ROC AUC: 0.97 (+/- 0.01) [Naive Bayesian]


### http://scikit-learn.org/stable/developers/contributing.html#rolling-your-own-estimator

In [3]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator

class MajorityVoteClassifier(BaseEstimator, 
                             ClassifierMixin):
    def __init__(self, classifiers={}, vote='classlabel', weights=None):

        self.classifiers = classifiers
        self.named_classifiers = {key: value for key, value
                                  in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel'"
                             "; got (vote=%r)"
                             % self.vote)

        if self.weights and len(self.weights) != len(self.classifiers):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d classifiers'
                             % (len(self.weights), len(self.classifiers)))

        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y))
            self.classifiers_.append(fitted_clf)
        return self

    def predict(self, X):
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'classlabel' vote

            #  Collect results from clf.predict calls
            predictions = np.asarray([clf.predict(X)
                                      for clf in self.classifiers_]).T

            maj_vote = np.apply_along_axis(
                                      lambda x:
                                      np.argmax(np.bincount(x,
                                                weights=self.weights)),
                                      axis=1,
                                      arr=predictions)
        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote

    def predict_proba(self, X):
        probas = np.asarray([clf.predict_proba(X)
                             for clf in self.classifiers_])
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba

In [4]:
mv_clf = MajorityVoteClassifier(classifiers=[lr, tree, svm])

clf_labels += ['Majority Voting']
all_clf = [lr, tree, svm, mv_clf]

cv(all_clf, clf_labels)

ROC AUC: 0.97 (+/- 0.01) [Logistic Regression]
ROC AUC: 0.90 (+/- 0.02) [Decision Tree]
ROC AUC: 0.97 (+/- 0.01) [Naive Bayesian]
ROC AUC: 0.98 (+/- 0.01) [Majority Voting]


In [None]:
mv_clf = MajorityVoteClassifier(classifiers=[lr, tree, svm], vote='probability')
cv(all_clf, clf_labels)

In [None]:
from sklearn.neighbors import KNeighborsClassifier 
knn = KNeighborsClassifier(n_neighbors=1,
                            p=2,
                            metric='minkowski')
clf_labels[2] = 'Nearest Neighbors'
all_clf[2] = knn
cv(all_clf, clf_labels)