In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier

In [5]:
from sklearn.datasets import fetch_openml

# Pull version 1 of the 'mnist_784' dataset from OpenML, then list the fields
# of the returned container as the cell's output.
mnist = fetch_openml(name='mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [8]:
# Feature matrix and label vector, taken straight from the loaded dataset.
X, Y = mnist.data, mnist.target

In [9]:
# Show the dimensions of the features first, then of the labels.
for dataset_part in (X, Y):
    print(dataset_part.shape)

(70000, 784)
(70000,)


In [21]:
# Hold out 10000 samples as the test set; the remainder (train + validation)
# is split again in the next step.
X_combine, X_test, y_combine, y_test = train_test_split(
    X,
    Y,
    test_size=10000,
    random_state=42,
)

# Cell output: shapes of the two feature partitions.
X_combine.shape, X_test.shape

((60000, 784), (10000, 784))

In [22]:
# Carve a 10000-sample validation set out of the non-test data; what is left
# becomes the training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_combine,
    y_combine,
    test_size=10000,
    random_state=2,
)

# Cell output: shapes of the training and validation feature partitions.
X_train.shape, X_val.shape

((50000, 784), (10000, 784))

In [25]:
# Report the number of rows in each partition (training, testing, validation).
for caption, partition in (
    ('Size of Training set: ', X_train),
    ('Size of Testing set: ', X_test),
    ('Size of Validation set: ', X_val),
):
    print(caption, partition.shape[0])

Size of Training set:  50000
Size of Testing set:  10000
Size of Validation set:  10000


In [31]:
# Base classifiers: each is trained on its own and later handed to the voting
# ensemble. The tree-based models are seeded so that a fresh
# "Restart & Run All" reproduces the same scores (the data splits are already
# seeded; the models previously were not).

# NOTE(review): the recorded fit output shows the solver stopping at its
# iteration limit even with max_iter=1000. The warning itself suggests scaling
# the features; that is left unchanged here.
lr_clf = LogisticRegression(max_iter=1000)

# NOTE(review): max_depth=1 allows a single split only; the recorded accuracy
# for this model is 0.2054. Confirm that such a weak learner is intentional
# before it gets an equal vote in the hard-voting ensemble.
dt_clf = DecisionTreeClassifier(max_depth=1, random_state=42)

rf_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC()

In [32]:
# Train every base model on the same training split, in the same order as
# before: logistic regression, decision tree, random forest, then SVM.
for base_model in (lr_clf, dt_clf, rf_clf):
    base_model.fit(X_train, y_train)

# Kept as the cell's final bare expression so the notebook still echoes the
# fitted SVC exactly as it did previously.
svm_clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
# Predict the held-out test set with each fitted base model. The generator is
# consumed left to right, so the models run in the same order as before.
y_pred1, y_pred2, y_pred3, y_pred4 = (
    fitted_model.predict(X_test)
    for fitted_model in (lr_clf, dt_clf, rf_clf, svm_clf)
)

In [37]:
# Print one line per base model: its class name followed by its accuracy on
# the test labels.
scored_models = zip(
    (lr_clf, dt_clf, rf_clf, svm_clf),
    (y_pred1, y_pred2, y_pred3, y_pred4),
)
for fitted_model, test_pred in scored_models:
    print(type(fitted_model).__name__, accuracy_score(y_test, test_pred))

LogisticRegression 0.9122
DecisionTreeClassifier 0.2054
RandomForestClassifier 0.9639
SVC 0.9759


In [40]:
# (name, estimator) pairs that take part in the vote.
ensemble_members = [
    ('lr', lr_clf),
    ('dt', dt_clf),
    ('rf', rf_clf),
    ('svm', svm_clf),
]

# Hard voting: the ensemble combines the members' predicted class labels.
voting_classifier = VotingClassifier(estimators=ensemble_members, voting='hard')

In [41]:
# Train the ensemble on the training split. The logistic-regression
# convergence warning shows up again in this cell's recorded output, so the
# member models are being trained again here.
voting_classifier.fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
# Accuracy of the hard-voting ensemble on the validation split.
voting_val_pred = voting_classifier.predict(X_val)
print("Voting Classifier accuracy:", accuracy_score(y_val, voting_val_pred))

# The individual classifiers in this notebook are scored on X_test / y_test,
# whereas the figure above comes from the validation split. Scores taken on
# different splits are not directly comparable, so also report the ensemble
# on the test split to allow a like-for-like comparison with the base models.
voting_test_pred = voting_classifier.predict(X_test)
print("Voting Classifier test accuracy:", accuracy_score(y_test, voting_test_pred))

Voting Classifier accuracy: 0.9691
