In [1]:
import pickle
import os.path

import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import ExtraTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
def open_mnist_or_download_if_missing():
    
    if os.path.exists("data/mnist/mnist.pickle"):
        print("Using local mnist")
        with open("data/mnist/mnist.pickle", mode="rb") as fp:
            mnist = pickle.load(fp)
    else:
        print("Downloading mnist")
        mnist = fetch_openml("mnist_784", version=1)
        with open("data/mnist/mnist.pickle", mode="wb") as fp:
            pickle.dump(mnist, fp)
    mnist.target = mnist.target.astype(np.uint8)
    return mnist

In [3]:
random_state = 65

mnist = open_mnist_or_download_if_missing()

train_idx, test_idx = next(
    ShuffleSplit(n_splits=1, train_size=60000, random_state=random_state).split(mnist.data, mnist.target))
train_idx, cv_idx = next(
    ShuffleSplit(n_splits=1, train_size=50000, random_state=random_state).split(mnist.data.iloc[train_idx],
                                                                                mnist.target.iloc[train_idx]))

X_train = mnist.data.iloc[train_idx]
y_train = mnist.target.iloc[train_idx]

X_cv = mnist.data.iloc[cv_idx]
y_cv = mnist.target.iloc[cv_idx]

X_test = mnist.data.iloc[test_idx]
y_test = mnist.target.iloc[test_idx]

Using local mnist


In [4]:
linear_svc_clf = LinearSVC(max_iter=100, random_state=random_state)
rf_clf = RandomForestClassifier(random_state=random_state)
lr_clf = LogisticRegression(random_state=random_state)
extra_tree_clf = ExtraTreeClassifier(random_state=random_state)
classifiers = [linear_svc_clf, rf_clf, lr_clf, extra_tree_clf]

for clf in classifiers:
    print(f"Training {clf}")
    clf.fit(X_train, y_train)

print(f"Classifiers:{classifiers}")
print(f"Scores: {[clf.score(X_cv, y_cv) for clf in classifiers]}")

Training LinearSVC(max_iter=100, random_state=65)




Training RandomForestClassifier(random_state=65)
Training LogisticRegression(random_state=65)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training ExtraTreeClassifier(random_state=65)
Classifiers:[LinearSVC(max_iter=100, random_state=65), RandomForestClassifier(random_state=65), LogisticRegression(random_state=65), ExtraTreeClassifier(random_state=65)]
Scores: [0.8575, 0.9672, 0.9214, 0.8135]


In [5]:
named_estimators = [("random_forest", rf_clf), ("logistic_regression", lr_clf),
                    ("extra_tree", extra_tree_clf)]
voting_clf = VotingClassifier(named_estimators)
voting_clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VotingClassifier(estimators=[('random_forest',
                              RandomForestClassifier(random_state=65)),
                             ('logistic_regression',
                              LogisticRegression(random_state=65)),
                             ('extra_tree',
                              ExtraTreeClassifier(random_state=65))])

In [6]:
print(f"hard voting classifier score: {voting_clf.score(X_cv, y_cv)}")
print(voting_clf.estimators_)
print(f"Scores: {[clf.score(X_cv, y_cv) for clf in voting_clf.estimators_]}")

hard voting classifier score: 0.9491
[RandomForestClassifier(random_state=65), LogisticRegression(random_state=65), ExtraTreeClassifier(random_state=65)]
Scores: [0.9672, 0.9214, 0.8135]


In [7]:
voting_clf.voting = "soft"
print(f"soft voting classifier score: {voting_clf.score(X_cv, y_cv)}")

soft voting classifier score: 0.931


In [8]:
voting_clf.voting = "hard"
print(f"hard voting classifier test score: {voting_clf.score(X_test, y_test)}")

hard voting classifier test score: 0.9784


In [9]:
print(f"Scores: {[clf.score(X_test, y_test) for clf in voting_clf.estimators_]}")

Scores: [0.9867, 0.9321, 0.9217]


In [10]:
y_cv_pred = np.empty((len(X_cv), len(classifiers)))
y_cv_pred.shape

(10000, 4)

In [17]:
for idx, clf in enumerate(classifiers):
    y_cv_pred[:, idx] = clf.predict(X_cv)

46340    7
5153     3
10982    1
25666    1
32055    1
        ..
3525     5
14322    8
17478    4
57856    3
7        3
Name: class, Length: 10000, dtype: uint8

In [20]:
blender = RandomForestClassifier(random_state=65)
blender.fit(y_cv_pred, y_cv)

RandomForestClassifier(random_state=65)

In [21]:
y_test_pred = np.empty((len(X_test), len(classifiers)))
y_test_pred.shape

(10000, 4)

In [23]:
for idx, clf in enumerate(classifiers):
    y_test_pred[:, idx] = clf.predict(X_test)

In [25]:
blender.score(y_test_pred, y_test)

0.9864