In [20]:
import tensorflow as tf
import numpy as np

In [2]:
# to avoid a cuBLAS error, per https://stackoverflow.com/questions/43990046/tensorflow-blas-gemm-launch-failed
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)

# Very basic TensorFlow to do automatic differentiation

In [3]:
x = tf.Variable(0.)
with tf.GradientTape() as tape:
    y = 2 * x + 3
grad_of_y_wrt_x = tape.gradient(y, x)
grad_of_y_wrt_x

<tf.Tensor: shape=(), dtype=float32, numpy=2.0>

Ok, so we showed above that the derivative of 2x+3 is 2. Cool. This is a single variable/single dimension.

And I should remember explicitly that gradient(y, x), pronounced 'gradient of y with respect to x', truly means 'how does y change when x changes?'

And with two dimensions, so where x is a matrix, or more generally, a tensor. 

In [4]:
x = tf.Variable(tf.random.uniform((2,2)))  # a 2x2 matrix, aka a 2x2 tensor
with tf.GradientTape() as tape:
    y = 2 * x + 3
tape.gradient(y, x)

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[2., 2.],
       [2., 2.]], dtype=float32)>

And with a list of variables that matches here a single layer neural network.

In [5]:
tf.zeros((2,))

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0., 0.], dtype=float32)>

In [6]:
W = tf.Variable(tf.random.uniform((2, 2)))
b = tf.Variable(tf.zeros((2,)))
x = tf.random.uniform((2, 2))
with tf.GradientTape() as tape:
    y = tf.matmul(W, x) + b
tape.gradient(y, [W, b]) # how does y change when W and b change?

[<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
 array([[1.008519 , 1.0719895],
        [1.008519 , 1.0719895]], dtype=float32)>,
 <tf.Tensor: shape=(2,), dtype=float32, numpy=array([2., 2.], dtype=float32)>]

The return value above is a list two tensors, with the same shape as W and b respectively. So the more specific answer to the question of 'how does y change when W and b change' I think shows how... y changes when W changes (which is a 2x2 matrix) and how y changes when b changes, which is a vector of length 2?

# Implementing a very simple network in straight TensorFlow

In [7]:
class NaiveDense:
    
    def __init__(self, input_size, output_size, activation):
        self.activation = activation
        
        w_shape = (input_size, output_size)
        w_initial_value = tf.random.uniform(w_shape, minval=0, maxval=1e-1)
        self.W = tf.Variable(w_initial_value)
        
        b_shape = (output_size,)
        b_initial_value = tf.zeros(b_shape)
        self.b = tf.Variable(b_initial_value)
        
    def __call__(self, inputs):
        return self.activation(tf.matmul(inputs, self.W) + self.b)
    
    @property
    def weights(self):
        return [self.W, self.b]

In [8]:
class NaiveSequential:
    
    def __init__(self, layers):
        self.layers = layers
        
    def __call__(self, inputs):
        x = inputs
        for layer in self.layers:
            x = layer(x)
        return x
    
    @property
    def weights(self):
        weights = []
        for layer in self.layers:
            weights += layer.weights
        return weights

In [9]:
model = NaiveSequential([
    NaiveDense(input_size=28 * 28, output_size=512, activation=tf.nn.relu),
    NaiveDense(input_size=512, output_size=10, activation=tf.nn.softmax)
])
assert len(model.weights) == 4

In [10]:
model.weights

[<tf.Variable 'Variable:0' shape=(784, 512) dtype=float32, numpy=
 array([[0.04411194, 0.08256219, 0.04511452, ..., 0.09753089, 0.09141455,
         0.09512258],
        [0.0099643 , 0.04921366, 0.01289421, ..., 0.02002212, 0.04500791,
         0.06642369],
        [0.0232447 , 0.07011235, 0.0965726 , ..., 0.01693965, 0.021614  ,
         0.06633174],
        ...,
        [0.07201973, 0.05671605, 0.05204531, ..., 0.0245415 , 0.0136045 ,
         0.06236341],
        [0.09544007, 0.08046494, 0.08366068, ..., 0.06056479, 0.07528609,
         0.00934405],
        [0.03553861, 0.0967003 , 0.09796096, ..., 0.01411401, 0.07238515,
         0.03471782]], dtype=float32)>,
 <tf.Variable 'Variable:0' shape=(512,) dtype=float32, numpy=
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0

In [11]:
class BatchGenerator:
    
    def __init__(self, images, labels, batch_size=128):
        self.index = 0
        self.images = images
        self.labels = labels
        self.batch_size = batch_size
        
    def next(self):
        images = self.images[self.index : self.index + self.batch_size]
        labels = self.labels[self.index : self.index + self.batch_size]
        self.index += self.batch_size
        return images, labels

In [12]:
def one_training_step(model, images_batch, labels_batch):
    with tf.GradientTape() as tape:
        predictions = model(images_batch)
        per_sample_losses = tf.keras.losses.sparse_categorical_crossentropy(labels_batch, predictions)
        average_loss = tf.reduce_mean(per_sample_losses)
    
    gradients = tape.gradient(average_loss, model.weights) # how does the average loss change when the weights change?
    update_weights_naive(gradients, model.weights)
    return average_loss

In [18]:
learning_rate = 1e-3

def update_weights_naive(gradients, weights):
    for g, w in zip(gradients, weights):
        w.assign_sub(w * learning_rate) # assign_sub is the TF equivalent of -=
        
from tensorflow.keras import optimizers
optimizer = optimizers.SGD(learning_rate=1e-3)
def update_weights_via_Keras(gradients, weights):
    optimizer.apply_gradients(zip(gradients, weights))

In [14]:
# implement an epoch of training - i.e., use all of the data, in batches 
def fit(model, images, labels, epochs, batch_size=128):
    for epoch_counter in range(epochs):
        print(f'Epoch {epoch_counter}')
        batch_generator = BatchGenerator(images, labels)
        for batch_counter in range(len(images) // batch_size):
            images_batch, labels_batch = batch_generator.next()
            loss = one_training_step(model, images_batch, labels_batch)
            if batch_counter % 100 == 0:
                print(f'Loss at batch {batch_counter} is {loss:.2f}')

In [15]:
from tensorflow.keras.datasets import mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images.shape, train_labels.shape, test_images.shape, test_labels.shape

((60000, 28, 28), (60000,), (10000, 28, 28), (10000,))

In [16]:
train_images = train_images.reshape((60000, 28 * 28)).astype('float32') / 255
test_images = test_images.reshape((10000, 28 * 28)).astype('float32') / 255
train_images.shape, train_labels.shape, test_images.shape, test_labels.shape

((60000, 784), (60000,), (10000, 784), (10000,))

In [17]:
fit(model, train_images, train_labels, epochs=5, batch_size=128)

Epoch 0
Loss at batch 0 is 4.21
Loss at batch 100 is 3.94
Loss at batch 200 is 3.57
Loss at batch 300 is 2.93
Loss at batch 400 is 2.98
Epoch 1
Loss at batch 0 is 2.67
Loss at batch 100 is 2.66
Loss at batch 200 is 2.56
Loss at batch 300 is 2.40
Loss at batch 400 is 2.42
Epoch 2
Loss at batch 0 is 2.35
Loss at batch 100 is 2.38
Loss at batch 200 is 2.35
Loss at batch 300 is 2.31
Loss at batch 400 is 2.32
Epoch 3
Loss at batch 0 is 2.30
Loss at batch 100 is 2.32
Loss at batch 200 is 2.31
Loss at batch 300 is 2.30
Loss at batch 400 is 2.31
Epoch 4
Loss at batch 0 is 2.30
Loss at batch 100 is 2.31
Loss at batch 200 is 2.31
Loss at batch 300 is 2.30
Loss at batch 400 is 2.30


In [22]:
predictions = model(test_images).numpy() # the numpy method converts a TF tensor to a NumPy tensor
predicted_labels = np.argmax(predictions, axis=1)
matches = predicted_labels == test_labels
print(f'accuracy: {matches.average():.2f}')

AttributeError: 'numpy.ndarray' object has no attribute 'average'

In [26]:
predictions

array([[0.10005613, 0.10147651, 0.09842804, ..., 0.09844318, 0.10246617,
        0.09711426],
       [0.10010671, 0.10241948, 0.0975792 , ..., 0.09812434, 0.10418177,
        0.0950273 ],
       [0.09993846, 0.10084959, 0.09906089, ..., 0.09930016, 0.10134929,
        0.0984856 ],
       ...,
       [0.10013156, 0.102461  , 0.09761871, ..., 0.09790655, 0.10410839,
        0.09529602],
       [0.10015665, 0.10201796, 0.09761112, ..., 0.09826908, 0.10389639,
        0.09563259],
       [0.1001974 , 0.10311829, 0.09654474, ..., 0.09715367, 0.10607777,
        0.09331694]], dtype=float32)

In [29]:
np.argmax(predictions, axis=1)[100:125]

array([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8], dtype=int64)

In [37]:
predictions[5]

array([0.100006  , 0.10127877, 0.09874418, 0.10133034, 0.10022171,
       0.0994174 , 0.1002842 , 0.09899999, 0.10188216, 0.09783527],
      dtype=float32)

In [40]:
len(predicted_labels == 8)

10000

So, it doesn't actually work... 