In [2]:
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.cm
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data

  return f(*args, **kwds)


In [11]:
# Start a tensorflow session
session = tf.InteractiveSession()

# Set the random seeds to enable reproducible code.
# NumPy's seed alone is not enough: TensorFlow's own random ops (e.g. the
# default weight initialisers of the layers built below) draw from the
# graph-level seed, so it has to be fixed as well.
np.random.seed(0)
tf.set_random_seed(0)
mnist = input_data.read_data_sets('MNIST_data')

# Fetch 10000 test examples once and keep them around for `evaluate`.
# The images are reshaped to (N, 28, 28, 1); `evaluate` reshapes them again
# to whatever layout the model's input placeholder expects.
batch = mnist.test.next_batch(10000)
test_images = batch[0].reshape([-1, 28, 28, 1])
test_labels = batch[1]

def evaluate(result_tensor, data_placeholder):
    """Evaluate a classifier on the stored test set.

    Feeds the module-level ``test_images`` through the graph and compares
    the predicted labels with ``test_labels``.

    Parameters
    ----------
    result_tensor : `tf.Tensor`, shape (None,)
        The tensorflow tensor containing the result of the classification.
    data_placeholder : `tf.Tensor`, shape (None, 28, 28, 1) or (None, 784)
        The tensorflow tensor containing the input to the classification operator.

    Returns
    -------
    accuracy : float
        Fraction of test examples (between 0 and 1) whose predicted label
        equals the true label.
    """
    # Match the layout of the placeholder: keep the batch axis free (-1) and
    # copy the remaining static dimensions. `as_list()` yields plain Python
    # ints, so NumPy does not have to coerce TensorFlow `Dimension` objects.
    target_shape = [-1] + data_placeholder.shape[1:].as_list()
    feed_images = np.reshape(test_images, target_shape)

    # Uses the default (interactive) session.
    result = result_tensor.eval(
        feed_dict={data_placeholder: feed_images})

    return np.mean(result == test_labels)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


## Exercise 1: Residual networks

Residual networks were introduced in *Deep Residual Learning for Image Recognition*, He et al., 2015 ([arXiv](https://arxiv.org/abs/1512.03385)). In residual networks, instead of using the Multi-Layer Perceptron (MLP) structure

$$
x_{n+1} = \rho(W_nx_n + b_n)
$$

We use a residual structure

$$
x_{n+1} = x_n + W_n^{(2)}\rho(W_n^{(1)}x_n + b_n)
$$

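In TensorFlow, such a residual unit could be written as follows (note that `n2` must equal the number of features in `x`, otherwise the sum is not defined)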

```
tmp = tf.contrib.layers.fully_connected(x, n1)
x = x + tf.contrib.layers.fully_connected(tmp, n2, 
                                          activation_fn=None)
```

### Tasks

* Implement a residual version of the MLP. Does this allow training a deeper network?
* Implement a residual convolutional network
* Using a small `n1` is called a bottleneck. How does the choice of `n1` affect the performance of the network?

In [21]:
# Hyper-parameters of the fully connected network.
resnet = True   # True: add skip connections; False: plain MLP
depth = 5       # number of hidden layers
width = 128     # number of units in every hidden layer


with tf.name_scope('res_netlogistic_regression'):
    inp = tf.placeholder(shape=(None, 784), dtype=tf.float32, name="input")
    # The first layer maps the 784 pixels to `width` features so that every
    # following layer has matching input and output sizes (required for the
    # residual sum). The default activation function is ReLU.
    x = tf.contrib.layers.fully_connected(inp, num_outputs=width)
    for _ in range(depth - 1):
        if resnet:
            x = x + tf.contrib.layers.fully_connected(x, num_outputs=width)
        else:
            # Plain MLP layer of the same width, so both variants are comparable.
            x = tf.contrib.layers.fully_connected(x, num_outputs=width)

    logits = tf.contrib.layers.fully_connected(x,
                                               num_outputs=10,
                                               activation_fn=None)
    pred = tf.argmax(logits, axis=1)

with tf.name_scope('optimizer'):
    labels = tf.placeholder(shape=(None,), dtype=tf.int32)

    one_hot_labels = tf.one_hot(labels, depth=10)

    loss = tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_labels,
                                                   logits=logits)
    optimizer = tf.train.AdamOptimizer().minimize(loss)

# Initialize all TF variables
session.run(tf.global_variables_initializer())

for i in range(10000):
    inp_, labels_ = mnist.train.next_batch(128)
    session.run(optimizer,
                feed_dict={labels: labels_, inp: inp_})

    # Report the test accuracy every 1000 steps. Reuse the `pred` op instead
    # of building a new argmax node in the graph on every evaluation.
    if i % 1000 == 0:
        print("{:.1f}%, ".format(evaluate(pred, inp) * 100), end="")

11.3%, 97.0%, 97.6%, 97.6%, 97.3%, 97.6%, 97.5%, 98.1%, 97.6%, 97.8%, 

In [26]:
# Hyper-parameters of the convolutional network.
resnet = True   # True: add skip connections; False: plain strided CNN
depth = 2       # number of convolutional layers
width = 16      # number of channels in every convolutional layer


with tf.name_scope('resnet_cnn'):
    inp = tf.placeholder(shape=(None, 784), dtype=tf.float32, name="input")
    images = tf.reshape(inp, [-1, 28, 28, 1])

    # The first convolution lifts the single input channel to `width`
    # channels; stride 1 keeps the spatial size so residual sums line up.
    x = tf.contrib.layers.conv2d(images, num_outputs=width, kernel_size=3, stride=1)

    for _ in range(depth - 1):
        if resnet:
            x = x + tf.contrib.layers.conv2d(x, num_outputs=width, kernel_size=3, stride=1)
        else:
            # Chain on the previous layer's output `x`; feeding `images` here
            # would throw away every layer except the last one.
            x = tf.contrib.layers.conv2d(x, num_outputs=width, kernel_size=3, stride=2)

    x = tf.contrib.layers.flatten(x)
    x = tf.contrib.layers.fully_connected(x, 128)
    logits = tf.contrib.layers.fully_connected(x,
                                               num_outputs=10,
                                               activation_fn=None)
    pred = tf.argmax(logits, axis=1)

with tf.name_scope('optimizer'):
    labels = tf.placeholder(shape=(None,), dtype=tf.int32)

    one_hot_labels = tf.one_hot(labels, depth=10)

    loss = tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_labels,
                                                   logits=logits)
    optimizer = tf.train.AdamOptimizer().minimize(loss)

# Initialize all TF variables
session.run(tf.global_variables_initializer())

for i in range(10000):
    inp_, labels_ = mnist.train.next_batch(128)
    session.run(optimizer,
                feed_dict={labels: labels_, inp: inp_})

    # Report the test accuracy every 1000 steps. Reuse the `pred` op instead
    # of building a new argmax node in the graph on every evaluation.
    if i % 1000 == 0:
        print("{:.1f}%, ".format(evaluate(pred, inp) * 100), end="")

37.5%, 98.1%, 98.6%, 98.9%, 98.5%, 98.7%, 98.8%, 98.8%, 98.7%, 98.9%, 

In [22]:
with tf.name_scope('try'):
    # Flat 784-pixel input, viewed as a batch of single-channel 28x28 images.
    inp = tf.placeholder(dtype=tf.float32, shape=(None, 784), name="input")
    images = tf.reshape(inp, shape=[-1, 28, 28, 1])

    # A single 3x3 convolution with `width` output channels (`width` comes
    # from an earlier cell). With stride 1 the spatial size stays 28x28;
    # a larger stride would shrink the image instead.
    x = tf.contrib.layers.conv2d(inputs=images,
                                 num_outputs=width,
                                 kernel_size=3,
                                 stride=1)


TensorShape([Dimension(None), Dimension(28), Dimension(28), Dimension(128)])