# Flatten layer, mnist data

Simple classification example

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
from tensorflow import keras
from keras import layers
from keras.losses import SparseCategoricalCrossentropy

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split


In [None]:
from sklearn.datasets import fetch_openml
# Download MNIST from OpenML. as_frame=True is requested explicitly because
# the following cells call DataFrame-only methods such as .to_numpy() on
# mnist['data']; relying on the library default is version-dependent.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)

In [None]:
# Shape of the pixel table exactly as it was loaded.
mnist['data'].shape
# Shape after viewing the same pixels as a stack of 28 x 28 images;
# -1 lets NumPy infer the number of images.
np.reshape(mnist['data'].to_numpy(), (-1, 28, 28)).shape

We create a training/test split, and then further split the training set into training and validation sets. We scale the features by dividing by 255, since the pixel values range from 0 to 255.

In [None]:
# Scale the pixel values by 255 before splitting.
scaled_pixels = mnist['data'] / 255
digit_labels = mnist['target']

# First split: hold out a test set, stratified on the labels.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    scaled_pixels, digit_labels, stratify=digit_labels
)
# Second split: carve a validation set out of the remaining training data,
# again stratified on the labels.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, stratify=y_train_full
)

In [None]:
# Display one training image as a quick visual check of the data.
plt.figure()
# -1 lets NumPy infer the number of images instead of hard-coding the
# training-set size, so this keeps working if the split proportions change.
plt.imshow(X_train.to_numpy().reshape(-1, 28, 28)[2])
plt.colorbar()
plt.grid(False)
plt.show()

## MLP models tensorflow/keras

#### Use Sequential API.

#### Tensorflow mnist data
The TensorFlow version is stored as 28 x 28 images. The sklearn and keras versions contain the same data, just formatted differently, so we can use either and reshape as necessary. For variety, we will load the keras data and reshape.

In [None]:
# Load the MNIST data bundled with keras; the result is unpacked into
# (images, labels) pairs for the training and test sets in the next cell.
mnist_keras = keras.datasets.mnist.load_data()


In [None]:
# Unpack the keras tuple into training and test images/labels.
(X_train_full, y_train_full), (X_test, y_test) = mnist_keras

# Scale both image arrays by 255.0.
X_train_full, X_test = X_train_full / 255.0, X_test / 255.0

# Hold out a validation set from the training data, stratified on the labels.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    stratify=y_train_full,
)

In [None]:
# Inspect the dimensions of the training array produced by the split above.
X_train.shape

With keras we have much more flexibility, but we must also set the output layer and its activation correctly so that they are consistent with the loss function.

`SparseCategoricalCrossentropy` is just the multiclass negative log-likelihood (the multiclass version of the logistic regression loss) for integer-encoded labels. If we one-hot encode the labels (we don't need to here), we should use `CategoricalCrossentropy` instead. `from_logits` should be set to `True` when there is no softmax activation on the output layer; our model ends with a softmax layer, so we leave it at its default. This is all in the keras documentation.

We first define the model and then we compile it. Then we fit. There are many more available parameters for `compile`. You should check them out. 

In [None]:
# Fully connected classifier: flatten each 28 x 28 image, pass it through four
# ReLU hidden layers, then a 10-unit layer followed by a softmax layer.
hidden_units = (50, 100, 100, 100)
kmlp = keras.models.Sequential(
    [
        layers.Flatten(input_shape=(28, 28)),
        *(layers.Dense(units, activation="relu") for units in hidden_units),
        layers.Dense(10),
        layers.Softmax(),
    ]
)

# Plain SGD with a fixed learning rate.
sgd = keras.optimizers.legacy.SGD(learning_rate=0.01, name='SGD')

# The model already ends in a softmax layer, so the loss is constructed with
# its default arguments (from_logits is not passed).
kmlp.compile(
    optimizer=sgd,
    loss=SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

In [None]:
# Print the summary of the compiled model.
kmlp.summary()

In [None]:
# Train for 10 epochs, reporting validation metrics after each epoch.
kmlp.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    verbose=True,
)

# Evaluate on the held-out test set; the first entry of the result is the
# loss and the second is the accuracy metric requested at compile time.
loss_and_metrics = kmlp.evaluate(X_test, y_test, batch_size=128)
test_loss, test_accuracy = loss_and_metrics[:2]
print("Accuracy: {:.4f}".format(test_accuracy))
print("Cross entropy: {:.4f}".format(test_loss))


In [None]:
# Show two decimal places when printing arrays.
np.set_printoptions(precision=2)

# Model outputs for every test image (one row per image); print the first three.
classes = kmlp.predict(X_test, batch_size=128)
print(classes[:3])

In [None]:
# Turn each row of model outputs into a hard label by taking the index of the
# largest entry.
hard_preds = classes.argmax(axis=1)
print(hard_preds[:3])

#### Locate and visualize incorrect predictions. Adapted from Deitel and Deitel (you may have used their book in a Python programming class)

In [None]:
# Hard class predictions for every test image.
preds = np.argmax(kmlp.predict(X_test), axis=1)

# Boolean mask of misclassified test images, computed once and reused below.
is_wrong = preds != y_test

# Test error rate (fraction of misclassified images).
print(np.mean(is_wrong))

# Misclassified images, their predicted labels, and their true labels.
# The predictions are taken from `preds` instead of running the model a second
# time on badX: this guarantees they match the mask and avoids calling
# predict on an empty array when nothing is misclassified.
badX = X_test[is_wrong, :, :]
preds_badX = preds[is_wrong]
y_badX = y_test[is_wrong]


In [None]:
# 4 x 6 grid of axes with tick marks removed, one panel per misclassified image.
fig, axes = plt.subplots(
    nrows=4,
    ncols=6,
    figsize=(16, 10),
    subplot_kw={'xticks': (), 'yticks': ()},
)

# zip stops at the shortest input, so at most 24 images are drawn.
for ax, item, bp, y in zip(axes.flat, badX, preds_badX, y_badX):
    ax.imshow(item.reshape(28, 28))
    ax.set_title(f'pred: {bp}, true: {y}')





### Remarks:

- We've done no model tuning; the tolerance and maximum number of iterations were not optimally set.
- We've used dense layers for our model. In fact, convolutional networks usually work best with image data.