## MNIST Basic Approach (Softmax)

In [101]:
import tensorflow.compat.v1 as tf
import numpy as np
tf.disable_v2_behavior()


In [136]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data(path='mnist.npz')

In [137]:
# normalize and reshape features
x_train = (x_train/255).reshape(x_train.shape[0], 784)
x_test = (x_test/255).reshape(x_test.shape[0], 784)

# one-hot encoding of labels
y_train = tf.keras.utils.to_categorical(y_train, num_classes=10, dtype=int)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=10, dtype=int)

(10000, 10)

In [129]:
# Placeholders
x = tf.placeholder(tf.float32, shape=[None, 784])
y_true = tf.placeholder(tf.float32, shape=[None, 10])

In [130]:
# Variables
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

In [131]:
# Graph Operations
y = tf.matmul(x, W) + b

In [132]:
# Loss Function
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=y))

In [133]:
# Optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.5)
train = optimizer.minimize(cross_entropy)

In [140]:
# Session
init = tf.global_variables_initializer()

with tf.Session() as session:
    session.run(init)
    
    batch_size = 100
    max_index = x_train.shape[0]
    for step in range(1000):
        start = np.random.randint(0, max_index - batch_size)
        end = start + batch_size
        
        session.run(train, feed_dict={x: x_train[start:end], y_true: y_train[start:end]})
    
    # Evaluate
    correct_pred = tf.equal(tf.argmax(y, 1), tf.argmax(y_true, 1))
    acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    
    print(session.run(acc, feed_dict={x: x_test, y_true: y_test}))

0.9176


## Convolutional Neural Network
- Just like the simple perceptron, CNNs also have their origins in biological research
- Hubel and Wisel studied the structure of the visual cortex in mammals, winning a Nobel Prize in 1981
- Their research revealed that neurons in the visual cortex had a small local receptive field
- This idea then inspired an ANN architecture that would become CNN
- Famously implemented in the 1998 paper by Yann LeCun et al
- The LeNet-5 architecture was first used to classify the MNIST data set

#### Concepts
- Tensors: N-dimensional arrays
    - scalar -> 3
    - vector -> [1,2]
    - matrix -> [[1,2],[3,4],...,[8,9]]
    - tensor -> [[[1,2],[3,4]],...,[[8,9],[10,11]]]
    
- Densely Connected Layer: each neuron is connected to every neuron in the next layer
- Convolutional Layer: each unit is connected to a smaller number of nearby units in the next layer
    - MNIST dataset is 28x28 pixels, but most images are at least 256x256 or greater - or a total of <56K
    - This leads to too many parameters, unscalable to new images
    - Convolutions also have a major advantage for image processing, where pixels nearby to each other are much more correlated to each other for image detection
    - Each CNN layer looks at an increasingly larger part of the image
    - Having units only connected to nearby units also aids in *invariance*
    - CNN also helps with regularization, limiting the search of weights to the size of the convolution
    - Convolution:
        - Filters and filter size
            - Commonly visualized with grids, where we pass the filters (grid of weights) through the input, and compute the multiplication of the weights in the filter and the input. Then sum the results to get a final output
        - Stride
            - We move the filter grid over by the amount of the stride - e.g. 1 or 2 pixels
- Pooling Layer: subsample the input image, which reduces the memory use and computer load as well as reducing the number of parameters
    - Create an N x N pool of pixels and evaluate the maximum value - only that value makes it to the next layer (representative value)
    - Move over by the value of the stride and repeat the process
    - This end up removing a lot of information. Even a small pooling "kernel" of 2x2 with a stride of 2 will remove 75% of the input data
- Dropout: can be thought of as a form of regularization to help prevent overfitting.
    - During training, units are randomly dropped, along with their connections
    - This helps prevent units from "co-adapting" too much

In [141]:
# Helper functions

def init_weights(shape):
    init_random_dist = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(init_random_dist)

def init_bias(shape):
    init_bias_vals = tf.constant(0.1, shape=shape)
    return tf.Variable(init_bias_vals)

def conv2d(x,W):
    # x --> [batch,height,width,Channels]
    # W --> [filter height, filter width, Channels In, Channels Out]
    
    return tf.nn.conv2d(x, W, strides=[1,1,1,1], padding="SAME")

def max_pooling_2by2(x):
    # x --> [batch,height,width,Channels]
    # Pooling along height and width only; that's why [1,2,2,1]
    pool_along_height_width = [1,2,2,1]
    return tf.nn.max_pool(x, ksize=pool_along_height_width, strides=pool_along_height_width, padding="SAME")

In [142]:
# Convolutional Layer
def convolutional_layer(input_x, shape):
    W = init_weights(shape)
    b = init_bias([shape[3]])
    
    return tf.nn.relu(conv2d(input_x,W) + b)

In [143]:
# Fully connected layer
def normal_full_layer(input_layer, size):
    input_size = int(input_layer.get_shape()[1])
    W = init_weights([input_size, size])
    b = init_bias([size])
    
    return tf.matmul(input_layer, W) + b

In [144]:
# Placeholders
x = tf.placeholder(tf.float32, shape=[None, 784])
y_true = tf.placeholder(tf.float32, shape=[None, 10])

In [146]:
# Layers
x_image = tf.reshape(x, [-1,28,28,1]) # putting flatten image back into normal shape 28x28

convo_1 = convolutional_layer(x_image, shape=[5,5,1,32])
convo_1_pooling = max_pooling_2by2(convo_1)

convo_2 = convolutional_layer(convo_1_pooling, shape=[5,5,32,64])
convo_2_pooling = max_pooling_2by2(convo_2)

convo_2_flat = tf.reshape(convo_2_pooling, [-1,7*7*64])
full_layer_one = tf.nn.relu(normal_full_layer(convo_2_flat, 1024))

In [147]:
# Dropout
hold_prob = tf.placeholder(tf.float32)
full_one_dropout = tf.nn.dropout(full_layer_one, keep_prob=hold_prob)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [148]:
y_pred = normal_full_layer(full_one_dropout, 10)

In [149]:
# Loss function
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred))

In [150]:
# Optimizer
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
train = optimizer.minimize(cross_entropy)

In [152]:
init = tf.global_variables_initializer()

steps = 1000
with tf.Session() as session:
    session.run(init)
    
    for i in range(steps+1):
        batch_size = 50
        max_index = x_train.shape[0]
        
        start = np.random.randint(0, max_index - batch_size)
        end = start + batch_size

        session.run(train, feed_dict={x: x_train[start:end], y_true: y_train[start:end], hold_prob: 0.5})
        
        if i%10 == 0:
            matches = tf.equal(tf.argmax(y_pred,1), tf.argmax(y_true,1))
            acc = tf.reduce_mean(tf.cast(matches, tf.float32))
            print(f"ON STEP: {i}")
            print(f"Accuracy: {session.run(acc, feed_dict={x: x_test, y_true: y_test, hold_prob: 1.0})}")

ON STEP: 0
Accuracy: 0.13359999656677246
ON STEP: 10
Accuracy: 0.555899977684021
ON STEP: 20
Accuracy: 0.7773000001907349
ON STEP: 30
Accuracy: 0.8687000274658203
ON STEP: 40
Accuracy: 0.8931000232696533
ON STEP: 50
Accuracy: 0.9153000116348267
ON STEP: 60
Accuracy: 0.9211999773979187
ON STEP: 70
Accuracy: 0.930899977684021
ON STEP: 80
Accuracy: 0.9384999871253967
ON STEP: 90
Accuracy: 0.9444000124931335
ON STEP: 100
Accuracy: 0.9474999904632568
ON STEP: 110
Accuracy: 0.9520000219345093
ON STEP: 120
Accuracy: 0.954800009727478
ON STEP: 130
Accuracy: 0.9526000022888184
ON STEP: 140
Accuracy: 0.9599999785423279
ON STEP: 150
Accuracy: 0.9567000269889832
ON STEP: 160
Accuracy: 0.9509999752044678
ON STEP: 170
Accuracy: 0.9595000147819519
ON STEP: 180
Accuracy: 0.9578999876976013
ON STEP: 190
Accuracy: 0.9563999772071838
ON STEP: 200
Accuracy: 0.961899995803833
ON STEP: 210
Accuracy: 0.96670001745224
ON STEP: 220
Accuracy: 0.9664999842643738
ON STEP: 230
Accuracy: 0.9675999879837036
ON STEP:

[]