In [1]:
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Fetch the 'MNIST Original' dataset through scikit-learn's mldata loader.
# NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and removed in
# 0.22. On newer versions the replacement is fetch_openml('mnist_784', version=1),
# which presumably returns string labels and a different row order, so every
# downstream output would change — verify before migrating.
mnist = fetch_mldata('MNIST Original')

In [3]:
# Inspect the returned container: it exposes COL_NAMES, DESCR, data and target.
mnist

{'COL_NAMES': ['label', 'data'],
 'DESCR': 'mldata.org dataset: mnist-original',
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'target': array([0., 0., 0., ..., 9., 9., 9.])}

In [4]:
# Feature matrix (one row of pixel values per image) and the label vector.
X = mnist['data']
y = mnist['target']

In [5]:
# (n_samples, n_features)
X.shape

(70000, 784)

In [6]:
# One label per sample, so the length must match X.shape[0].
y.shape

(70000,)

#### There are 70,000 images and each image has 784 features. Each image is 28x28 pixels.

In [7]:
# Distinct class labels present in the target vector (y is mnist['target']).
np.unique(y)

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

### Split the data into a training set, a validation set and a test set (50,000 samples for training, 10,000 for validation and 10,000 for testing)

In [8]:
# Two-stage split: first hold out 10,000 samples for testing, then carve
# 10,000 validation samples out of the remainder (leaving 50,000 for training).
# Both stages share the same hold-out size and seed.
split_kwargs = dict(test_size=10000, random_state=11)
X_train_and_val, X_test, y_train_and_val, y_test = train_test_split(X, y, **split_kwargs)
X_train, X_val, y_train, y_val = train_test_split(X_train_and_val, y_train_and_val, **split_kwargs)

In [9]:
# Sanity check: feature-matrix shapes for train / validation / test, in that order.
tuple(part.shape for part in (X_train, X_val, X_test))

((50000, 784), (10000, 784), (10000, 784))

In [10]:
# Sanity check: label-vector shapes for train / validation / test, in that order.
tuple(part.shape for part in (y_train, y_val, y_test))

((50000,), (10000,), (10000,))

### Train the classifiers: RandomForest, ExtraTrees and a linear SVM

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC

In [12]:
# Three base learners, all seeded with the same random_state so runs are repeatable.
# The two tree ensembles use 10 trees each.
rand_forest_clf = RandomForestClassifier(random_state=11, n_estimators=10)
ext_trees_clf = ExtraTreesClassifier(random_state=11, n_estimators=10)
# NOTE(review): LinearSVC is fit on the raw uint8 features with its default settings
# (max_iter=1000 per the printed repr); scaling the inputs may be worth trying — TODO confirm.
svc_clf = LinearSVC(random_state=11)

In [13]:
# Order matters: the per-estimator scores computed below are listed in this order
# (random forest, extra trees, linear SVM).
estimators = [rand_forest_clf, ext_trees_clf, svc_clf]

In [14]:
# Fit every base learner on the training split, announcing each one first
# so progress is visible while the slower models train.
for clf in estimators:
    print("Training --> ", clf)
    clf.fit(X_train, y_train)

Training -->  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=11, verbose=0, warm_start=False)
Training -->  ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=11, verbose=0, warm_start=False)
Training -->  LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ov

In [15]:
# Validation accuracy of each base learner, in the same order as `estimators`.
[clf.score(X_val, y_val) for clf in estimators]

[0.9457, 0.9471, 0.8749]

#### Why does the linear SVM score so much lower (0.8749) than the two tree ensembles (~0.946) on the validation set?

In [16]:
# (name, estimator) pairs in the form VotingClassifier expects.
vot_estimators = [
    ("rand_forest_clf", rand_forest_clf),
    ("ext_trees_clf", ext_trees_clf),
    ("svc_clf", svc_clf),
]

In [17]:
from sklearn.ensemble import VotingClassifier

In [18]:
# Majority-vote ensemble over the three base learners. 'hard' is the default
# voting mode (see the printed repr); it is spelled out here because the mode
# is switched to 'soft' further down.
vot_clf = VotingClassifier(estimators=vot_estimators, voting='hard')

In [19]:
# Train the ensemble on the training split; the fitted sub-estimators are then
# available as vot_clf.estimators_.
vot_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('rand_forest_clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
     ... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=11, tol=0.0001,
     verbose=0))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [20]:
# Ensemble accuracy on the validation split (hard voting at this point).
vot_clf.score(X_val, y_val)

  if diff:


0.9514

In [21]:
# Validation accuracy of each sub-estimator as fitted inside the ensemble.
# NOTE(review): presumably VotingClassifier fits its sub-estimators on label-encoded
# targets; scoring them against the raw y_val lines up here only because the MNIST
# labels 0-9 equal their encoded indices — confirm before reusing with other labels.
[fitted.score(X_val, y_val) for fitted in vot_clf.estimators_]

[0.9457, 0.9471, 0.8749]

In [22]:
# The (name, estimator) pairs that were passed to the constructor.
vot_clf.estimators

[('rand_forest_clf',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
              oob_score=False, random_state=11, verbose=0, warm_start=False)),
 ('ext_trees_clf',
  ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
             oob_score=False, random_state=11, verbose=0, warm_start=False)),
 ('svc_clf', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
       intercept_scaling=1, loss='squared_hi

In [23]:
# The fitted estimators (trailing underscore): a plain list without the names.
vot_clf.estimators_

[RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
             oob_score=False, random_state=11, verbose=0, warm_start=False),
 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=11, verbose=0, warm_start=False),
 LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
      intercept_scaling=1, loss='squared_hinge', max_iter=1000,
      multi_class='ovr', penalty='l2', random_

#### To switch the voting to soft and score again, first remove the SVM from the list of trained estimators; otherwise scoring raises an error, because `LinearSVC` doesn't support `predict_proba`

In [27]:
# Keep only the fitted sub-estimators that can produce class probabilities:
# soft voting relies on predict_proba, which LinearSVC does not provide.
# Filtering by capability instead of `del vot_clf.estimators_[2]` makes this
# cell idempotent (re-running it is a no-op rather than an IndexError) and
# independent of the position of the SVM in the list.
vot_clf.estimators_ = [
    fitted for fitted in vot_clf.estimators_ if hasattr(fitted, "predict_proba")
]

In [28]:
# Switch the already-fitted ensemble to soft voting.
# The trailing semicolon suppresses the estimator repr that set_params returns.
vot_clf.set_params(voting="soft");

In [29]:
# Validation accuracy again, now with soft voting over the remaining estimators.
vot_clf.score(X_val, y_val)

  if diff:


0.9604

### Validation accuracy improved from 0.9514 (hard voting, three estimators) to 0.9604 (soft voting, SVM removed)!

In [30]:
# Final check of the soft-voting ensemble on the held-out test split.
vot_clf.score(X_test, y_test)

  if diff:


0.962

In [31]:
# Test accuracy of each sub-estimator still in the ensemble, for comparison
# with the ensemble score.
[fitted.score(X_test, y_test) for fitted in vot_clf.estimators_]

[0.9473, 0.9494]