In [7]:
import numpy as np

# Weighted majority vote in miniature: bincount sums the weights per class
# label (class 0 collects 0.2 + 0.2, class 1 collects 0.6) and argmax picks
# the label with the largest total.
weighted_votes = np.bincount([0, 0, 1], weights=[0.2, 0.2, 0.6])
np.argmax(weighted_votes)

1

In [8]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [9]:
# Load Iris and keep only the rows from index 50 onward, restricted to
# feature columns 1 and 2.
# NOTE(review): in the bundled dataset these rows should be the two
# non-setosa classes, which makes this a binary problem -- confirm.
iris = datasets.load_iris()
X = iris.data[50:, [1, 2]]
y = iris.target[50:]

# Re-encode the remaining class labels as consecutive integers starting at 0.
le = LabelEncoder()
y = le.fit_transform(y)

# Stratified 50/50 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=1,
    stratify=y,
)

In [17]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Three base learners that are later combined into the voting ensemble.
clf1 = LogisticRegression(penalty='l2', solver='liblinear',
                          C=0.001,
                          random_state=1)

clf2 = DecisionTreeClassifier(max_depth=1,
                              criterion='entropy',
                              random_state=0)

clf3 = KNeighborsClassifier(metric='euclidean')

# Logistic regression and KNN are wrapped with a StandardScaler; the decision
# tree is used on the raw features. Steps are given as (name, estimator)
# tuples, the form documented for Pipeline.
pipe1 = Pipeline([('sc', StandardScaler()),
                  ('clf', clf1)])

pipe2 = Pipeline([('sc', StandardScaler()),
                  ('clf', clf3)])

# Display labels, in the same order as the estimators evaluated below.
clf_labels = ['Logistic regression', 'Decision tree', 'KNN']

print('10-fold cross validation:\n')
for clf, label in zip([pipe1, clf2, pipe2], clf_labels):
    # The metric is named explicitly so the "ACC" label below cannot drift
    # away from what is actually being measured.
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='accuracy')
    # Report mean and standard deviation across the 10 folds.
    print("ACC: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))

10-fold cross validation:

ACC: 0.86 (+/- 0.17) [Logistic regression]
ACC: 0.89 (+/- 0.16) [Decision tree]
ACC: 0.85 (+/- 0.13) [KNN]


### Majority voting

In [11]:
import ensemble

In [19]:
# Majority Rule (hard) Voting
# MajorityVoteClassifier comes from the local `ensemble` module (not shown
# in this notebook); it combines the three base classifiers defined earlier.
mv_clf = ensemble.MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe2])

# Add the ensemble's label only once: without the guard, every re-run of this
# cell would append another 'Majority voting' entry to clf_labels.
if 'Majority voting' not in clf_labels:
    clf_labels.append('Majority voting')
all_clf = [pipe1, clf2, pipe2, mv_clf]

# 10-fold cross-validated ROC AUC for each base classifier and the ensemble.
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='roc_auc')
    # Report mean and standard deviation across the 10 folds.
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))

ROC AUC: 0.87 (+/- 0.17) [Logistic regression]
ROC AUC: 0.89 (+/- 0.16) [Decision tree]
ROC AUC: 0.94 (+/- 0.10) [KNN]
ROC AUC: 0.97 (+/- 0.07) [Majority voting]


### Voting classifier with scikit-learn

In [20]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import VotingClassifier

In [39]:
# Breast-cancer data with a 70/30 hold-out split (fixed seed).
# NOTE: this rebinds X, y and the train/test names that held the iris data
# in the cells above.
X, y = datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Base learners for the ensemble.
lr = LogisticRegression(solver='liblinear', random_state=42)
dt = DecisionTreeClassifier(random_state=42)
knn = KNN(metric='euclidean')

# Logistic regression and KNN are wrapped with a StandardScaler; the tree is
# used on the raw features. Steps are (name, estimator) tuples, matching the
# documented Pipeline form and the VotingClassifier estimators below.
pipe_lr = Pipeline([('sc', StandardScaler()),
                    ('lr', lr)])

pipe_knn = Pipeline([('sc', StandardScaler()),
                     ('knn', knn)])

# Hard voting: the ensemble is configured with voting='hard' over the three
# base learners.
vc = VotingClassifier(estimators=[('lr', pipe_lr), ('dt', dt), ('knn', pipe_knn)], voting='hard')

# List of (classifier_name, classifier) tuples to evaluate, ensemble last.
classifiers = [('Logistic Regression', pipe_lr),
               ('Classification Tree', dt),
               ('KNN', pipe_knn),
               ('Voting Classifier', vc)]

# Fit each model on the training split and report its test-set accuracy.
for clf_name, clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('{:s} : {:.3f}'.format(clf_name, accuracy_score(y_test, y_pred)))

Logistic Regression : 0.982
Classification Tree : 0.942
KNN : 0.959
Voting Classifier : 0.994


In [35]:
# Show the list of (name, estimator) pairs assembled for the evaluation loop.
classifiers

[('Logistic Regression', Pipeline(memory=None,
           steps=[('sc',
                   StandardScaler(copy=True, with_mean=True, with_std=True)),
                  ['lr',
                   LogisticRegression(C=1.0, class_weight=None, dual=False,
                                      fit_intercept=True, intercept_scaling=1,
                                      l1_ratio=None, max_iter=100,
                                      multi_class='warn', n_jobs=None,
                                      penalty='l2', random_state=42,
                                      solver='liblinear', tol=0.0001, verbose=0,
                                      warm_start=False)]],
           verbose=False)),
 ('Classification Tree',
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                         max_features=None, max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_s

Modifique o notebook para incluir um classificador KNN no ensemble.