In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# The Forward And Backward Pass

In [3]:
# Load the MNIST digit images and their integer labels through Keras.
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

# Flatten every image into one feature vector. Passing -1 lets NumPy infer
# height * width from the data rather than hard-coding a height * height
# product, which is only valid for square images.
x_train = x_train.reshape((x_train.shape[0], -1))
x_test = x_test.reshape((x_test.shape[0], -1))
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, y_train.min(), y_train.max())

# n: number of training samples, m: features per sample,
# c: number of classes (assumes the labels run from 0 to y_train.max()).
# c is cast to a Python int so later arithmetic on it is not carried out in
# the labels' NumPy integer dtype.
n, m = x_train.shape
c = int(y_train.max()) + 1
print(n, m, c)

(60000, 28, 28) (60000,) (10000, 28, 28) (10000,)
(60000, 784) (60000,) (10000, 784) (10000,) 0 9
60000 784 10


In [4]:
def normalize(x, m, s):
    """Standardize ``x`` by subtracting ``m`` and dividing the result by ``s``."""
    shifted = x - m
    return shifted / s

In [5]:
# Mean and standard deviation over all raw training pixels; these are the
# values passed to normalize() for both the training and the test split.
train_mean = x_train.mean()
train_std = x_train.std()
train_mean, train_std

(33.318421449829934, 78.56748998339798)

In [6]:
# Standardize both splits with the *training* statistics, so the test split is
# transformed with the same shift and scale as the training split.
# Note: x_train / x_test are overwritten here, so re-running this cell
# normalizes the already-normalized arrays again.
x_train = normalize(x_train, train_mean, train_std)
x_test  = normalize(x_test,  train_mean, train_std)

In [7]:
# Sanity check: after standardization the training data should have mean ~0
# and std ~1. The values are only displayed, not assigned, so train_mean and
# train_std keep the raw-data statistics that were used for normalization.
x_train.mean(), x_train.std()

(-3.064638490070051e-17, 0.9999999999999998)

# Foundations Version - Basic Architecture

In [8]:
# Number of hidden units (the output width of the first layer).
nh = 50

In [9]:
# Seed NumPy's global RNG so the weights drawn below (and in later cells, when
# the notebook is run top to bottom) are reproducible across kernel restarts.
np.random.seed(42)

# Simple init: weights drawn from a standard normal, biases set to zero.
# w1 has shape (m, nh) and w2 has shape (nh, 1).
w1 = np.random.randn(m, nh)
b1 = np.zeros(nh)
w2 = np.random.randn(nh, 1)
b2 = np.zeros(1)

In [10]:
# Sanity check: the mean and std of w1 should be around 0. and 1. respectively.
w1.mean(), w1.std()

(0.000419959375982562, 1.0018807144190218)

In [11]:
def lin(x, w, b):
    """Linear layer: ``np.dot`` of ``x`` and ``w``, plus the bias ``b``."""
    projected = np.dot(x, w)
    return projected + b

# Scratch check of np.dot: a (2, 2) matrix times a (2, 3) matrix gives (2, 3).
a = np.array([[1.,2,3.],[4,5,6]])
b = np.array([[2,5], [6,7.]])
# Column vector of shape (2, 1). It is stored as col_vec rather than c so that
# it does not overwrite the global c (number of classes) computed when the
# data was loaded.
col_vec = np.array([5., 8.])[..., None]
np.dot(b, a)

In [12]:
# Output of the first linear layer on the test set (no activation applied).
t = lin(x_test, w1, b1)

In [13]:
# The mean and std of the input to the second layer are shown below, and they
# are not good: we want them to be ~0. and ~1. respectively.
t.mean(), t.std()

(1.9860241802211678, 29.25473786224826)

In [14]:
# Fix the scale by dividing the standard-normal weights by sqrt(fan_in)
# (m for w1, nh for w2). This is a simplified Kaiming/He-style init: the
# factor of 2 used for ReLU is not included here. Biases stay at zero.
w1 = np.random.randn(m, nh) / np.sqrt(m)
b1 = np.zeros(nh)
w2 = np.random.randn(nh, 1) / np.sqrt(nh)
b2 = np.zeros(1)

In [15]:
# Now the input to the second layer (the output of the first linear layer)
# is OK: its mean is ~0. and its std is ~1.
t = lin(x_test, w1, b1)
t.mean(), t.std()

(-0.06095717097152738, 1.0602446127397316)

In [23]:
def relu(x):
    """Rectified linear unit: clamp every value below zero up to ``0.``."""
    return np.clip(x, a_min=0., a_max=None)

In [25]:
# After the ReLU the mean and std are no longer ~0. and ~1.: every negative
# value of the linear output has been clamped to zero.
t = relu(lin(x_test, w1, b1))
t.mean(), t.std()

(0.39201003176162474, 0.6063833774349231)

In [98]:
# Let's try Kaiming/He init for ReLU: scale the standard-normal weights by
# sqrt(2 / fan_in) (m for w1, nh for w2). Biases stay at zero.
w1 = np.random.randn(m, nh) * np.sqrt(2/m)
b1 = np.zeros(nh)
w2 = np.random.randn(nh, 1) * np.sqrt(2/nh)
b2 = np.zeros(1)

In [99]:
# Check the new weight scale: the mean should be ~0. and the std close to
# sqrt(2 / m).
w1.mean(), w1.std()

(-0.0004121727552985243, 0.05033071378698265)

In [100]:
# Better: in the recorded run the std after the ReLU moves closer to 1.
# (about 0.61 -> 0.77). The mean stays above 0. because ReLU outputs are
# never negative.
t = relu(lin(x_test, w1, b1))
t.mean(), t.std()

(0.5072675518797128, 0.7694559665135039)

Reference for the Keras layer API used below: https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer?version=stable

In [159]:
# The same experiment with TensorFlow: a single Dense layer with ReLU
# activation and He-normal kernel initialization, fed the float32 test inputs.
linear_layer = tf.keras.layers.Dense(
    units=nh,
    activation='relu',
    kernel_initializer='he_normal',
)
y = linear_layer(x_test.astype(np.float32))
tf.math.reduce_mean(y), tf.math.reduce_std(y)

(<tf.Tensor: id=4349, shape=(), dtype=float32, numpy=0.5823935>,
 <tf.Tensor: id=4356, shape=(), dtype=float32, numpy=0.8481663>)

In [52]:
# Shapes of the test inputs and of the hand-written first-layer weights.
x_test.shape, w1.shape

((10000, 784), (784, 50))

In [63]:
# Shape of the Dense layer output computed above.
y.shape

TensorShape([10000, 50])

In [165]:
# Build the layers once, outside of model(), so that repeated calls reuse the
# same weights instead of creating (and randomly re-initializing) new layers
# on every call.
_lin1 = tf.keras.layers.Dense(nh, activation='relu', kernel_initializer='he_normal')
# NOTE(review): the output layer also applies ReLU, so predictions can never
# be negative - confirm this is intended.
_lin2 = tf.keras.layers.Dense(1, activation='relu', kernel_initializer='he_normal')


def model(xb):
    """Run the forward pass of the two-layer network.

    Args:
        xb: Batch of inputs accepted by a Keras ``Dense`` layer
            (presumably a float32 tensor of shape (batch, m) - verify
            against callers).

    Returns:
        The output of the second ``Dense`` layer applied to the output of
        the first one.
    """
    return _lin2(_lin1(xb))

In [166]:
# Convert the test inputs to a float32 TensorFlow constant once, up front.
# NOTE(review): despite the name, nothing here pins the tensor to a GPU -
# confirm the actual device placement.
x_test_gpu = tf.constant(x_test, dtype=tf.float32)

In [169]:
# Time model() on the full test tensor, 10 loops per timing run.
%timeit -n 10 _=model(x_test_gpu)

14.8 ms ± 92.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Loss Function

In [170]:
# Check the shape of the model output on the test tensor.
model(x_test_gpu).shape

TensorShape([10000, 1])