In [31]:
# Retrieve Data

from keras.datasets import mnist

# load_data() hands back a (train, test) pair; each half is an (images, labels) tuple.
train_split, test_split = mnist.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split

In [32]:
# Network Architecture

from keras import models
from keras import layers

# Two Dense layers stacked in sequence:
#   1) a 512-unit ReLU layer that receives the flattened 28*28 input vectors
#   2) a 10-way softmax layer (returns an array of 10 probability scores, one for each digit)

network = models.Sequential([
    layers.Dense(512, activation='relu', input_shape=(28*28,)),
    layers.Dense(10, activation='softmax'),
])



In [33]:
# Compilation step

# Compiling the network takes 3 components:
# 1) Loss function: how the network will measure its performance on the training data
# 2) Optimizer: the mechanism through which the network will update itself based on the
#    data it sees and its loss function
# 3) Metrics for training and testing: in this case, we will only measure accuracy

compile_options = {
    'optimizer': 'rmsprop',
    'loss': 'categorical_crossentropy',
    'metrics': ['accuracy'],
}
network.compile(**compile_options)

In [34]:
# Preparing the image data
#
# Preprocess the data by reshaping it into the shape the network expects and
# scaling it so all values are in the [0, 1] interval:
# array of shape (60000, 28, 28), values [0, 255] --> float32 array of shape (60000, 28*28), values [0, 1]

def flatten_and_scale(images):
    """Flatten every image to a 28*28 vector and rescale its values by 1/255.

    The sample count is read from the array itself instead of being
    hard-coded, so the same helper serves the training set, the test set,
    or any subset of either.

    Args:
        images: array whose first axis indexes samples and whose remaining
            axes hold 28*28 values per sample.

    Returns:
        A float32 array of shape (number of samples, 28*28).
    """
    flat = images.reshape((images.shape[0], 28*28))
    return flat.astype('float32') / 255


# NOTE: these assignments overwrite the raw arrays, so re-running this cell
# without reloading the data would divide by 255 a second time.
train_images = flatten_and_scale(train_images)
test_images = flatten_and_scale(test_images)

In [35]:
# Preparing labels: one-hot encode the digit labels

from keras.utils import to_categorical

# num_classes is given explicitly so the encoding width always matches the
# 10-unit output layer. Without it, to_categorical infers the width from the
# largest label present, which would silently produce a narrower array if the
# highest digit were missing from a split.
# NOTE: the labels are overwritten in place; reload the data before re-running this cell.
train_labels = to_categorical(train_labels, num_classes=10)
test_labels = to_categorical(test_labels, num_classes=10)

In [36]:
# Fit the network to the training data with Keras' "fit" method:
# 5 epochs, batches of 128 samples

network.fit(
    x=train_images,
    y=train_labels,
    batch_size=128,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x12c5cb278>

In [37]:
# Score the trained network on the test set with Keras' "evaluate" method

evaluation = network.evaluate(x=test_images, y=test_labels)
test_loss, test_acc = evaluation

print('Test Accuracy: ', test_acc)

Test Accuracy:  0.9793


In [38]:
# Data Representations for Neural Networks: 

#     1) Scalars: 0-D tensors, only one number

#     2) Vectors: 1-D tensors
    
#     3) Matrices: 2-D tensors
    
#     4) 3D Tensors/higher dimensions

# Key Attributes: 
#     1) number of axes: rank
#     2) shape: tuple of integers that describes how many dimensions the tensor has along each axis
#     3) Data type

In [39]:
# Gradient-based optimization

# output = relu(dot(W, input) + b)
# Where W and b are tensors that are attributes of the layer (weights)


# Training Loop:
#     1) Draw a batch of training samples x and corresponding targets y
#     2) Run the network on x to obtain predictions y_pred
#     3) Compute the loss of the network on the batch, a measure of the mismatch between y_pred and y
#     4) Update all the weights of the network to slightly reduce the loss on this batch

# If you continue the training loop, you'll eventually end up with a network that has a very low loss
# on its training data

# Mini-batch stochastic gradient descent:
#     1) Draw a batch of training samples x and corresponding targets y
#     2) Run the network on x to obtain predictions y_pred
#     3) Compute the loss of the network on the batch, a measure of the mismatch between y_pred and y
#     4) Compute the gradient of the loss with regard to the network's parameters (a backward pass)
#     5) Move the parameters a little in the opposite direction from the gradient, reducing the loss
#        on the batch a bit.

