### DSTA

#### Non-binary Classification: the MNIST784 dataset

A simple notebook for testing a classifier against the MNIST 784 dataset.

A solution is also available from the Scikit-learn web site.

In [None]:
import time
import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state


In [None]:
FILE = 'mnist_784'

# Ask for plain NumPy arrays. Recent scikit-learn releases return a pandas
# DataFrame for this dataset by default, but the following cells index the
# data positionally (X[0]) and call ndarray.reshape on a single sample.
mnist = fetch_openml(FILE, version=1, as_frame=False)

# Show which fields the returned Bunch provides (data, target, ...)
mnist.keys()

In [None]:
# Feature matrix (one row per image) and the corresponding labels
X = mnist['data']
y = mnist['target']

# Displayed as the cell output: (number of samples, number of features)
X.shape

#### Example images

In [None]:
import matplotlib as mpi
import matplotlib.pyplot as plt

# Pick the first sample *by position*. On a pandas DataFrame, X[0] is a
# column lookup (and raises KeyError here), so use .iloc when it is available
# and fall back to plain indexing for NumPy arrays.
some_digit = X.iloc[0].to_numpy() if hasattr(X, "iloc") else X[0]

# Each sample is a flat vector of 784 pixel values; restore the 28x28 grid
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap="binary")
plt.axis("off")
plt.show()

#### Split the data-set 

* Prepare the dataset by dividing it into train and test set.
  
* The first 60000 samples are used for training and the rest for testing.

In [None]:
# Positional split: rows before index 60000 form the training set,
# everything from index 60000 onwards forms the test set.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

#### Classification

* Scale $X$ to have zero mean and unit variance [required by regressor]
* Fit the model.
* Find the score.

In [None]:
t0 = time.time()

# Seed for the stochastic 'sag' solver so that repeated runs give the same model
RANDOM_STATE = 0

# scale data to have zero mean and unit variance [required by regressor].
# The scaler is fitted on the training set only and then applied to the test set.
# NOTE: X_train / X_test are overwritten with their scaled versions, so the
# cells below operate on scaled data.
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# apply logistic regressor with 'sag' solver, C is the inverse regularization strength.
# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn releases, and with the 'sag' solver the default already fits a
# multinomial model on multi-class targets.
clf = LogisticRegression(C=1e5,
                         penalty='l2', solver='sag', tol=0.1,
                         random_state=RANDOM_STATE)
# fit data
clf.fit(X_train, y_train)

# percentage of weights that are exactly zero
sparsity = np.mean(clf.coef_ == 0) * 100

# compute accuracy on the held-out test set
score = clf.score(X_test, y_test)

# display run time (covers scaling, fitting and scoring)
run_time = time.time() - t0

print('Example run in %.3f s' % run_time)

print("Sparsity with L2 penalty: %.2f%%" % sparsity)
print("Test score with L2 penalty: %.4f" % score)

#### Classification report

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Labels predicted by the fitted classifier for the test samples
y_pred = clf.predict(X_test)

# Text report comparing the true test labels with the predictions
print(classification_report(y_test, y_pred))

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# plot weights vs the pixel position
coef = clf.coef_.copy()

plt.figure(figsize=(10, 5))

scale = np.abs(coef).max()

for i in range(10):
    l2_plot = plt.subplot(2, 5, i + 1)
    l2_plot.imshow(coef[i].reshape(28, 28), interpolation='nearest',
                   cmap=plt.cm.Greys, vmin=-scale, vmax=scale)
    l2_plot.set_xticks(())
    l2_plot.set_yticks(())
    l2_plot.set_xlabel('Class %i' % i)
plt.suptitle('classification weights vector $w_j$ for digit class $j$')

plt.show()