This is a demonstration of the "raw" TensorFlow 2.0 (eager) API.  
We get to see manual parameter creation, manual graph building, and manual gradient updates.  
After seeing how TF works, we can start throwing on helpers, wrappers, and managers for convenience and performance (like running operations on GPU).

Reference: https://www.tensorflow.org/tutorials/quickstart/advanced

In [None]:
import tensorflow as tf
import numpy as np
# Show which TensorFlow build is installed; this notebook targets the TF 2.x eager API.
print("TensorFlow version:", tf.__version__)

# Fetch the data

In [None]:
# Download the MNIST dataset as NumPy arrays.

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Flatten every image into a single row vector and rescale pixels by 1/255.
x_train = x_train.reshape(x_train.shape[0], -1) / 255
x_test = x_test.reshape(x_test.shape[0], -1) / 255

# Store the labels as explicit 64-bit integers. `np.int_` has a
# platform-dependent width (it can be 32-bit), while the loss code stacks the
# labels with tf.int64 row indices and TensorFlow does not mix integer widths.
y_train = y_train.astype(np.int64)
y_test = y_test.astype(np.int64)

In [None]:
# Report the shape and element type of every array in both splits.

for _pos, _split in enumerate(((x_train, y_train), (x_test, y_test))):
  if _pos:
    print()  # blank line between the train and test summaries
  for _arr in _split:
    print(_arr.shape, ', ', _arr.dtype)

In [None]:
# Look at an example.
# Each row of x_train is a flattened image; reshape it back to 28x28 to view it.
# NOTE: pixels were divided by 255 when the data was loaded, so the printed
# values are floats rather than the raw 0-255 integers.

np.set_printoptions(linewidth=1000)  # raise the line-width limit so array rows are not wrapped
print(x_train[0].reshape(28, 28))

In [None]:
# Show the first six training images, each preceded by its label.

import matplotlib.pyplot as plt
import numpy as np

for label, image in zip(y_train[:6], x_train[:6]):
  print(label)
  plt.imshow(image.reshape(28, 28))  # rows are flattened; restore the 28x28 grid
  plt.show()
  print('')

# Define the model

In [None]:
# Layer sizes: flattened input -> hidden layer -> output scores.
INPUT_DIM = x_train.shape[-1]
HIDDEN_DIM = 256
OUTPUT_DIM = 10

# Every parameter element is drawn i.i.d. from a zero-mean normal distribution
# whose standard deviation is SCALE.
SCALE = 1e-2
W1 = tf.Variable(np.random.normal(scale=SCALE, size=(INPUT_DIM, HIDDEN_DIM)))
W2 = tf.Variable(np.random.normal(scale=SCALE, size=(HIDDEN_DIM, OUTPUT_DIM)))
B1 = tf.Variable(np.random.normal(scale=SCALE, size=(1, HIDDEN_DIM)))
B2 = tf.Variable(np.random.normal(scale=SCALE, size=(1, OUTPUT_DIM)))

# All trainable variables, in the order the training step walks them.
parameters = [W1, W2, B1, B2]

@tf.function
def model_fn(x):
  """Run the two-layer network and return its raw output scores (logits).

  x has shape (batch_size, INPUT_DIM); the result has shape
  (batch_size, OUTPUT_DIM). The (1, dim) biases broadcast over the batch.
  """
  hidden = tf.nn.relu(tf.linalg.matmul(x, W1) + B1)
  logits = tf.linalg.matmul(hidden, W2) + B2
  return logits

In [None]:
# Smoke-test the model on a small slice of the training images.

sample_batch = x_train[:23]
out = model_fn(sample_batch)
print(out.shape)

In [None]:
# Inspect the first layer's weight matrix and bias row.

for param in (W1, B1):
  print(param)

In [None]:
# Define the loss function

@tf.function
def select_indices(x, indices):
  """Pick one element per row: return [x[i, indices[i]] for each row i].

  Args:
    x: 2-D tensor to select from.
    indices: 1-D integer tensor or array holding one column index per row.

  Returns:
    A 1-D tensor whose i-th entry is x[i, indices[i]].
  """
  # See https://stackoverflow.com/a/48491902/15601980

  # tf.stack needs both index columns to share a dtype, so accept any integer
  # label type by normalising to int64.
  indices = tf.cast(indices, tf.int64)
  # Read the row count dynamically; the static `indices.shape[0]` is None when
  # the batch dimension is not known at trace time.
  num_rows = tf.shape(indices, out_type=tf.int64)[0]
  row_indices = tf.range(num_rows, dtype=tf.int64)
  full_indices = tf.stack([row_indices, indices], axis=1)  # (row, column) pairs
  return tf.gather_nd(x, full_indices)

# Maximum likelihood loss (negative log probability of the data)
@tf.function
def mle_loss(logits, labels):
  """Return the mean negative log-probability the model gives the true labels.

  This equals the cross entropy loss against one-hot targets, see
  https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html#cross-entropy
  """
  # Normalise each row of scores into log-probabilities.
  log_probs = tf.nn.log_softmax(logits, axis=1)
  # Keep only the log-probability of the correct class in every row.
  label_log_probs = select_indices(log_probs, labels)
  return tf.math.negative(tf.reduce_mean(label_log_probs))

***Cross entropy loss***

See https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html#cross-entropy.

Let $\vec{q} = (q_1, \dots, q_n)$ be a vector of predicted probabilities,  
and let $\vec{p} = (p_1, \dots, p_n)$ be a vector of target probabilities.

The [cross entropy](https://en.wikipedia.org/wiki/Cross_entropy) of the two distributions is

$$
H(\vec{p}, \vec{q}) = -\sum_{i=1}^n p_i \log q_i
$$

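Holding $\vec{p}$ fixed, $H(\vec{p}, \vec{q})$ is minimized when $\vec{q} = \vec{p}$ (Gibbs' inequality), so lowering the cross entropy pulls the predicted distribution toward the target.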

In our case, the labels provide a one-hot target distribution.  
Let $t$ be the target label for some input image.
A one-hot distribution puts all probability on $t$, i.e.

$$\vec{\mathbb{1}}[t] = (0,\dots,0,1,0,\dots,0)$$

where $\mathbb{1}[t]_t = 1$.
Let $\vec{Y}$ be the raw model outputs and $\vec{q}$ be the model probabilities, i.e. 

$$
\vec{q} = \text{softmax}(\vec{Y}) = \frac{1}{\sum_{i=1}^n \exp(Y_i)}\Big(\exp(Y_1),\dots,\exp(Y_n)\Big)
$$

and let $\vec{p} = \vec{\mathbb{1}}[t]$.


Then the cross entropy loss (the negative log-probability of the correct label, which we minimize) is


$$\begin{aligned}
L &= H(\vec{p}, \vec{q}) \\
&= H\left(\vec{\mathbb{1}}[t],\ \text{softmax}(\vec{Y})\right) \\
&= -\sum_{i=1}^n \mathbb{1}[t]_i \log\left( \text{softmax}(\vec{Y})_i \right)\\
&= -\log\left( \text{softmax}(\vec{Y})_t \right) \\
&= \log\left(\sum_{i=1}^n \exp(Y_i)\right) - Y_t
\end{aligned}$$



In [None]:
@tf.function
def accuracy(logits, target):
  """Return the fraction of rows whose highest-scoring class equals the target.

  Args:
    logits: 2-D tensor of scores, one row per example.
    target: 1-D integer tensor or array of true class indices.

  Returns:
    A scalar float64 tensor (NaN for an empty batch).
  """
  predicted = tf.math.argmax(logits, axis=1)
  # tf.math.equal requires matching dtypes; argmax yields int64, so bring the
  # labels to the same type instead of failing on int32/uint8 inputs.
  target = tf.cast(target, predicted.dtype)
  corrects = tf.math.equal(predicted, target)
  # Averaging the 0/1 matches avoids dividing by the static batch size, which
  # is None when the batch dimension is not known at trace time.
  return tf.reduce_mean(tf.cast(corrects, tf.float64))

# Training loop

In [None]:
@tf.function
def train_step(images, labels, lr):
  """Apply one gradient-descent update to `parameters` and return the loss.

  The loss is the MLE loss on the batch plus `regularizer_weight` times the
  sum of the 1-norms of all parameters.
  """
  with tf.GradientTape() as tape:
    logits = model_fn(images)
    data_loss = mle_loss(logits, labels)
    l1_penalty = sum(tf.norm(p, 1) for p in parameters)  # regularization
    loss = data_loss + regularizer_weight * l1_penalty
  grads = tape.gradient(loss, parameters)

  # Plain gradient descent: step each variable against its gradient, in place.
  for param, grad in zip(parameters, grads):
    param.assign_sub(lr * grad)

  return loss

In [None]:
# Hyperparameters
batch_size = 100           # examples per gradient step
learning_rate = 1e-2       # step size passed to train_step
num_epochs = 1000          # full passes over the training set
regularizer_weight = 1e-2  # weight of the L1 penalty read by train_step

num_train = x_train.shape[0]

# The number of passes is governed by num_epochs (not by batch_size).
for epoch in range(num_epochs):
  idx = np.random.permutation(num_train)  # random ordering of the training set

  ## training step
  for i in range(0, num_train, batch_size):
    batch_idx = idx[i:i+batch_size]
    x = x_train[batch_idx]
    y = y_train[batch_idx]

    loss_ = train_step(x, y, learning_rate).numpy()

    # Log the loss once every 100 mini-batches.
    if i % (batch_size * 100) == 0:
      print('Step:', i//batch_size, '; Loss:', loss_)

  # Evaluate on the full training and test sets after every epoch.
  train_logits = model_fn(x_train)  # logits for the training accuracy
  test_logits = model_fn(x_test)  # logits for the test accuracy
  print('')
  print('Epoch: %d | Train Accuracy: %.2f | Test Accuracy: %.2f' % (epoch, accuracy(train_logits, y_train), accuracy(test_logits, y_test)))
  print('')