# Best Practices and Common Pitfalls

In [None]:
#%tensorflow_version 2.x
import tensorflow as tf

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Input, Dense

from tensorflow.keras import utils
from matplotlib import pyplot as plt
import numpy as np

## Identifying overfitting

-   Underfitting: model does not capture statistics of training data
    -   high training error
-   Overfitting: model memorizes idiosyncrasies of training data
    -   low training error
    -   model does not generalize
-   Monitor performance on held-out **validation set**

In [None]:
from utils import mnist_imgs, mnist_lbls, mnist_convnet

# `validation_split=0.1` makes Keras hold back a tenth of the training data;
# loss and metrics on that held-out part are reported next to the training
# values, which is what we watch to detect overfitting.
model = mnist_convnet()
model.fit(
    mnist_imgs,
    mnist_lbls,
    batch_size=64,
    epochs=5,
    validation_split=0.1,
)

How to prevent overfitting:

-   early stopping: stop training once validation accuracy decreases
-   regularization: enforce additional restrictions on the network
-   use more data, e.g., via data augmentation

In [None]:
# Early stopping: end training as soon as the validation loss stops improving.
cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',  # quantity being watched: loss on the held-out validation split
    patience=0,  # no grace period: stop after the first epoch without improvement
    min_delta=0.001,  # changes smaller than 0.001 do not count as an improvement
)

model.fit(
    mnist_imgs,
    mnist_lbls,
    batch_size=64,
    epochs=5,
    validation_split=0.1,
    callbacks=[cb],
)

## Appropriate Loss Functions

-   The loss function (or objective) can have a drastic influence on
    the learning process.
-   Many loss functions expect a specific input format.
-   With wrong loss, code may compile without warning, but produce
    nonsensical results.

In [None]:
# Wrap the MNIST arrays in a tf.data pipeline yielding (image, label) pairs.
dataset = tf.data.Dataset.from_tensor_slices((mnist_imgs, mnist_lbls))

# The order of the transformations matters:
#   1. shuffle individual examples (shuffling after `batch` would only permute
#      whole batches whose contents never change),
#   2. group them into batches of 32,
#   3. repeat the pipeline three times (repeating after `shuffle` keeps the
#      passes separate and reshuffles for each one),
#   4. prefetch so that input preparation overlaps with training.
batched_dataset = (
    dataset
    .shuffle(buffer_size=100000)
    .batch(32)
    .repeat(3)
    .prefetch(buffer_size=10000)
)

def build_model(input_shape=(28, 28, 1), num_classes=10):
    """
    Create and compile a small convolutional image classifier.

    Two 3x3 convolutions (64 filters, ReLU) are followed by a flatten step
    and a softmax layer with one unit per class.  The model is compiled with
    Adam (learning rate 1e-3), the ``categorical_crossentropy`` loss and the
    ``categorical_accuracy`` metric.

    :param input_shape: shape of a single input image (default 28 x 28 x 1)
    :param num_classes: number of units in the softmax output layer
    :return: the compiled model
    """
    image = Input(shape=input_shape, name='inputs')

    # NOTE: the layer names say "dense" although both layers are convolutions.
    features = image
    for layer_name in ('dense_1', 'dense_2'):
        features = Conv2D(filters=64, kernel_size=(3, 3), activation='relu',
                          name=layer_name)(features)

    flat = Flatten()(features)
    class_probs = Dense(num_classes, activation='softmax', name='predictions')(flat)

    classifier = Model(inputs=image, outputs=class_probs)
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy'],
    )
    return classifier
  
# Train directly on the tf.data pipeline; batch size and the number of passes
# over the data are already fixed by its `.batch()` / `.repeat()` calls.
model = build_model()
model.fit(x=batched_dataset)

So what’s the problem? We forgot to transform the labels into categorical
(one-hot) labels:

In [None]:
from tensorflow.keras import utils
# One-hot encode the integer class labels (10 classes).  The `dataset` created
# earlier was built from the integer labels, so it has to be re-created from
# the converted labels before training again.
mnist_lbls = utils.to_categorical(mnist_lbls, num_classes=10)

Even with integer labels the computation can be carried out, but the
results don’t make sense at all. Alternatively, we can keep the integer
labels and change the loss function of the model:

In [None]:
# The "sparse" loss and metric take integer class labels directly, so no
# one-hot encoding is needed.
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)

## Training vs. testing

-   Some layers change behavior between training and testing
-   Example: `Dropout`
    -   drops activations and rescales inputs during training
    -   passes input through during testing
-   Example: `BatchNormalization`
    -   uses mean and variance of current batch during training
    -   uses learned mean and variance during testing

### Example 1: Training on Fashion-MNIST

-   Convolutional network trained on Fashion-MNIST, using Keras

In [None]:
# Restore the previously trained network from disk.
net = tf.keras.models.load_model("../models/model_fmnist.h5")

# Load Fashion-MNIST, then give every image an explicit channel axis
# (28 x 28 x 1) and divide the pixel values by 255.
(train_imgs, train_lbls), (test_imgs, test_lbls) = tf.keras.datasets.fashion_mnist.load_data()
train_imgs, test_imgs = (
    imgs.reshape(-1, 28, 28, 1) / 255.0 for imgs in (train_imgs, test_imgs)
)

In [None]:
# Print an overview of the loaded network's layers.
net.summary()

-   When using Keras functions like `.fit()`, `.evaluate()` and
    `.predict()`, the model sets the correct mode automatically

In [None]:
# `.evaluate()` puts the model into inference mode on its own.
net.evaluate(test_imgs, test_lbls)

In [None]:
# Call the model directly with training=True: mode-dependent layers (e.g.
# Dropout / BatchNormalization, if the network contains any) behave as during
# training.  The result is the fraction of test images whose arg-max
# prediction matches the label.
(test_lbls == tf.argmax(net(test_imgs, training=True), axis=1)).numpy().mean()

In [None]:
# Same accuracy computation with training=False: mode-dependent layers use
# their inference behavior.
(test_lbls == tf.argmax(net(test_imgs, training=False), axis=1)).numpy().mean()

### Example 2: A custom layer

**Task**: Implement a fully connected layer that performs dropout on its
weight matrix during training. Ensure that the dropout is only applied
during training.

Subclass
[`tf.keras.layers.Layer`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer).

In [None]:
from tensorflow.keras.layers import Layer
from tensorflow.python.keras.utils import tf_utils

class DropWeight(Layer):
    """Fully connected layer that applies dropout to its weight matrix.

    Computes ``activation(x @ w + b)``.  During training, entries of ``w``
    are dropped with probability ``rate`` via ``tf.nn.dropout``; at
    inference time the full weight matrix is used.

    :param units: number of output units
    :param rate: dropout rate applied to the weight matrix while training
    :param activation: callable applied to the affine output
    :param kwargs: forwarded to ``tf.keras.layers.Layer``
    """

    def __init__(self, units, rate, activation=tf.nn.relu, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.rate = rate
        self.activation = activation

    def build(self, input_shape):
        """Create the weight matrix ``w`` and bias ``b`` for the given input size."""
        self.w = self.add_weight(shape=(input_shape[-1], self.units),
            initializer='random_normal', trainable=True)
        self.b = self.add_weight(shape=(self.units,),
            initializer='random_normal', trainable=True)

    def call(self, x, training=None):
        """Apply the layer to ``x``; weights are dropped only when ``training`` is truthy."""
        # A truthiness test treats None, False and other falsy flags (0,
        # np.bool_(False), ...) as inference.  The previous identity check
        # `training is False` sent such values into the dropout branch.
        if training:
            kernel = tf.nn.dropout(self.w, self.rate)
        else:
            kernel = self.w
        return self.activation(tf.matmul(x, kernel) + self.b)

Let’s try out this layer:

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Flatten, Dense

# Flatten the 28 x 28 x 1 images, pass them through two DropWeight layers
# (1024 units, rate 0.2 each) and classify with a 10-way softmax.
net = Sequential(
    [Input(shape=(28, 28, 1)), Flatten()]
    + [DropWeight(1024, rate=0.2) for _ in range(2)]
    + [Dense(10, activation="softmax")]
)

In [None]:
# Integer labels are used as-is, hence the sparse cross-entropy loss.
net.compile(
    optimizer='Adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
net.fit(train_imgs, train_lbls, epochs=10, batch_size=64)

# Pitfalls in Reinforcement Learning

## Numerical Instabilities

In Policy Gradient Methods we optimize
$\mathbb{E}_\pi[\log \pi(a|s)\, R]$, which can lead to numerical
instabilities (the logarithm diverges as $\pi(a|s) \to 0$). We can avoid
some of them by introducing a rectified log:

In [None]:
def rectified_log(x, eps=1e-6):
    """Return the element-wise natural log of ``x``, clamped from below at ``eps``.

    Values smaller than ``eps`` are replaced by ``eps`` before taking the
    logarithm, so the result never falls below ``log(eps)`` and never
    becomes ``-inf`` or ``nan`` for zero or negative inputs.

    :param x: tensor (or value convertible to one) of probabilities
    :param eps: lower bound applied to ``x`` before the logarithm
    :return: tensor holding ``log(max(eps, x))``
    """
    return tf.math.log(tf.maximum(eps, x))

In Q-Learning we use the Q-values to define the policy by computing the
softmax as

$$p(a|s) = \frac{\exp(Q(s,a))}{\sum_{a'} \exp(Q(s,a'))}$$

which can lead to numerical problems if the Q values are large. We can
avoid this by subtracting the largest Q value before applying $\exp$:

$$p(a|s) = \frac{\exp\left(Q(s,a) - \max_{b} Q(s,b)\right)}{\sum_{a'} \exp\left(Q(s,a') - \max_{b} Q(s,b)\right)}$$

## Convergence Issues

### Initialization

Try to initialize the weights of the last layer very close to zero,
resulting in a policy with high entropy, i.e. uniformly distributed,
which helps with exploration.

### Always Try Multiple Seeds

Initialization is very important and can doom your agent right from the
beginning. We can try to avoid this by trying several random seeds, e.g.
by running multiple instances of the agent in parallel.

### MaxEnt Reinforcement Learning

We can enforce exploration by introducing a bonus for a high entropy
policy:

$$\pi^* = \arg\max_{\pi} \mathbb{E}_{\pi}\left[\sum_{t=0}^T \Big( r(a_t, s_t) + \beta H(\pi(\cdot|s_t)) \Big)\right] = \arg\max_{\pi} \mathbb{E}_{\pi}\left[\sum_{t=0}^T \Big( r(a_t, s_t) - \beta \log \pi(a_t|s_t) \Big)\right]$$

which can also be formulated as penalizing the divergence from a (fixed)
policy $\pi_0$:

$$\pi^* = \arg\max_{\pi} \mathbb{E}_{\pi}\left[\sum_{t=0}^T r(a_t, s_t) - \beta DKL(\pi||\pi_0)\right],$$

where $DKL$ is the Kullback-Leibler divergence between two probability
distributions $p$ and $q$, defined as
$$DKL(p||q) = \sum_x p(x) \log \frac{p(x)}{q(x)}$$

### $\epsilon$ -greedy Exploration

Instead of always picking an action according to the current policy, we
can encourage exploration by sampling a uniformly random action with
probability $\epsilon$:

In [None]:
import numpy as np
def epsilon_greedy(q_values, epsilon, num_actions, beta):
    """Sample an action using epsilon-greedy exploration over a softmax policy.

    With probability ``epsilon`` an action is drawn uniformly at random.
    Otherwise it is drawn from the softmax distribution with probabilities
    proportional to ``exp(beta * q_values)``.

    :param q_values: array-like with one Q-value per action
    :param epsilon: exploration probability in [0, 1]
    :param num_actions: number of available actions (length of ``q_values``)
    :param beta: inverse temperature of the softmax
    :return: tuple ``(a, p)`` with the sampled action index and the
        distribution it was sampled from
    """
    # uniform() samples from [0, 1), so the strict comparison guarantees that
    # epsilon == 0 never explores and epsilon == 1 always does.
    if np.random.uniform() < epsilon:
        p = np.ones(shape=num_actions) / num_actions
        a = np.random.randint(low=0, high=num_actions)
        return a, p

    # Shift the scaled values by their maximum before exponentiating: every
    # exponent is then <= 0, which rules out overflow for any sign of beta
    # without changing the resulting distribution.
    logits = beta * np.asarray(q_values)
    exp_logits = np.exp(logits - np.max(logits))
    p = exp_logits / np.sum(exp_logits)
    a = np.random.choice(a=num_actions, p=p)
    return a, p

# Demo with three actions.  With epsilon = 0 the random-exploration branch is
# (practically) never taken, so the action comes from the softmax over the
# Q-values with beta = 0.1.
q_values = np.array([1.0, 1.2, 1.5])
epsilon = 0.0

a, p = epsilon_greedy(q_values=q_values, epsilon=epsilon, num_actions=3, beta=0.10)
print("Sampled Action:", a)
print("Probability Distribution:", p)

### Policy Diagnostic

-   Early drop in policy entropy usually means no learning
-   Low entropy means that the agent is focusing on a few/a single
    action
    -   No exploration
-   Compute $DKL\left[\pi_{old}(·|s)||\pi_{new}(·|s)\right]$
    -   KL spike means drastic loss of performance
    -   No learning progress might mean steps are too large

### Value Function Diagnostic

Several papers propose to use the “Huber Loss” instead of Mean Squared
Error, which is less sensitive to outliers:

$$L_\delta(y, f(x)) = \begin{cases}
 \frac{1}{2}(y - f(x))^2                   & \textrm{for } |y - f(x)| \le \delta, \\
 \delta\, |y - f(x)| - \frac{1}{2}\delta^2 & \textrm{otherwise.}
\end{cases}$$

This function is quadratic for small values of the residual
$a = y - f(x)$, and linear for large values, with equal values and slopes
of the different sections at the two points where $|a| = \delta$.

In Actor-Critic Models we fit a Value function to learn the cumulative
reward for a certain state $s$ denoted as $V(s)$. To see if the network
is learning anything one should monitor the explained variance
$F(\theta)$ of your model $\theta$:

$$F(\theta) = 1 - \frac{\mathbb{Var}(y-f(x))}{\mathbb{Var}(y)},$$

where $y$ is the empirical return (i.e. the reward obtained) and $f(x)$
is the return predicted by the Value function.

### Sample Complexity

Usually Deep Reinforcement Learning just takes a lot of time. Model
Based RL methods can reduce sample complexity, but that only refers to
the number of samples drawn from the environment. Training time is often
the same or at least comparable to Model Free methods.

## Conclusions

-   Even with small implementation errors, networks often still train.
-   Run as many tests and diagnostics as possible.
-   Do not rely on a single metric.