# Differentiable Spatial to Numerical Transform
An example of the usage of the DSNT layer, as taken from the paper "Numerical Coordinate Regression with Convolutional Neural Networks"

In [None]:
# Imports
import tensorflow as tf
import cv2
import numpy as np
import sonnet as snt

# Import for use of the transform layer and loss function
import dsnt

## Build some dummy data
Circles of random colour, size and position on a black background

In [None]:
# Dataset configuration: square images of side `img_size`, split into
# training and testing subsets according to `train_percent`.
img_size = 150
image_count = 200
train_percent = 0.75
train_image_count = int(train_percent * image_count)
test_image_count = image_count - train_image_count

# Generate `image_count` images, each containing one filled circle on a black
# background, together with the circle centre normalised to [-1, 1).
images = []
targets = []
for _ in range(image_count):
    img = np.zeros((img_size, img_size, 3))
    # NOTE: cv2.circle takes its centre as (x, y), so `row` is used as the
    # horizontal position and `col` as the vertical one. The targets below
    # are therefore stored in (x, y) order.
    row, col = np.random.randint(0, img_size), np.random.randint(0, img_size)
    # np.random.randint excludes its upper bound: radius is 8..14 and each
    # colour channel is 0..254.
    radius = np.random.randint(8, 15)
    b, g, r = np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255)
    # A thickness of -1 draws a filled circle.
    cv2.circle(img, (row, col), radius, (b, g, r), -1)
    images.append(img)
    # Rescale pixel positions from [0, img_size) to [-1, 1).
    norm_row = (row / img_size - 0.5) * 2
    norm_col = (col / img_size - 0.5) * 2
    targets.append([norm_row, norm_col])

images = np.array(images)
targets = np.array(targets)

# The first `train_image_count` samples are used for training, the rest for
# testing. No shuffling is needed as every sample is drawn independently.
train_images = images[:train_image_count]
test_images = images[train_image_count:]
train_targets = targets[:train_image_count]
test_targets = targets[train_image_count:]

print('''
{} images total
training: {}
testing : {}'''.format(image_count, train_image_count, test_image_count))

## A simple model
A handful of convolutional layers, each time downsampling by a factor of 2.
The network finishes with a kernel-size 1 convolution, producing a single channel heat-map.
I'm an advocate of [Deepmind's Sonnet](https://github.com/deepmind/sonnet), so the convolution operations are written using this. It's quite obvious what the equivalent Tensorflow operations would be.

In [None]:
def inference(inputs):
    """Build the convolutional network and apply the DSNT layer to its output.

    Six blocks of 3x3 convolution -> ReLU -> 2x2 max-pool (stride 2) are
    followed by a 1x1 convolution producing a single-channel map, which is
    passed to `dsnt.dsnt`.

    Args:
        inputs: Batch of images as a 4-D tensor. The training cell feeds a
            float32 tensor of shape [batch, img_size, img_size, 3].

    Returns:
        The pair returned by `dsnt.dsnt`, in the same order. Callers unpack
        it as `heatmaps, predictions = inference(...)`, i.e. the normalised
        heatmap first and the predicted coordinates second.
    """
    # (module name, output channels, dilation rate) for each 3x3 conv block.
    conv_specs = [
        # NOTE(review): 166 breaks the 32/64/128/256 doubling pattern of the
        # following layers - possibly a typo for 16; confirm before changing.
        ('conv1', 166, 1),
        ('conv2', 32, 2),
        ('conv3', 64, 4),
        ('conv4', 128, 8),
        ('conv5', 256, 16),
        # conv6 relied on Sonnet's default rate, which is 1.
        ('conv6', 256, 1),
    ]

    for name, channels, rate in conv_specs:
        inputs = snt.Conv2D(output_channels=channels,
                            kernel_shape=3,
                            rate=rate,
                            padding='SAME',
                            name=name)(inputs)
        inputs = tf.nn.relu(inputs)
        # Halve the spatial resolution (rounded up because of 'SAME' padding).
        inputs = tf.nn.max_pool(inputs, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')

    # Collapse the features to a single-channel map; no ReLU here, as the
    # DSNT layer applies its own rectification.
    inputs = snt.Conv2D(output_channels=1,
                        kernel_shape=1,
                        padding='SAME',
                        name='conv7')(inputs)

    # The locals were previously named in the opposite order to what the
    # tuple actually contains; the returned order itself is unchanged.
    norm_heatmap, coords = dsnt.dsnt(inputs)
    return norm_heatmap, coords

# Differentiable Spatial to Numerical Transform
A Tensorflow implementation of the DSNT layer, as taken from the paper "Numerical Coordinate Regression with Convolutional Neural Networks".

### Provided Files:

- `dsnt.py` - The layer implementation and its supporting functions
- `DSNT_sample.ipynb` - A notebook demonstrating the usage of the DSNT layer.



### Example usage:
Begin by importing the module:
```
import dsnt
```

The layer can be inserted at the end of a stack of convolutional layers, where the final tensor shape is `[batch, height, width, 1]`.
The function's input tensor will be rectified, then passed through the transform. `dsnt.dsnt` returns the rectified (normalised) heatmaps and the predicted coordinates tensor of shape `[batch, 2]`, where each row holds `(x, y)`:
```
norm_heatmaps, coords = dsnt.dsnt(my_tensor)
```
There are different rectification methods available, which can be provided as an additional argument, e.g: `dsnt.normalise_heatmap(my_tensor, 'relu')`


The loss function must be composed of two components. Mean-Squared-Error for the coordinate regression, and Jensen-Shannon Divergence for regularization.
```
# Coordinate regression loss
loss_1 = tf.losses.mean_squared_error(targets, predictions)
# Regularization loss - in this example the targets are in range [-1, 1],
# but need to be in range [0, 1] for the regularization loss
loss_2, target_gauss = dsnt.js_reg_loss(heatmaps, (targets + 1) / 2)

loss = loss_1 + loss_2
```


## Training
A very simple training loop with no mini-batching.

In [None]:
# Start from an empty graph so that re-running this cell does not accumulate
# duplicate ops and variables.
tf.reset_default_graph()

# Placeholders for a batch of RGB images and their 2-D coordinate targets.
input_x = tf.placeholder(tf.float32, shape=[None, img_size, img_size, 3])
input_y = tf.placeholder(tf.float32, shape=[None, 2])

heatmaps, predictions = inference(input_x)
# Coordinate regression loss
loss_1 = tf.losses.mean_squared_error(input_y, predictions)
# Regularization loss - input_y is in the range [-1, 1] and is rescaled to
# the range [0, 1] for this loss
loss_2, target_gauss = dsnt.js_reg_loss(heatmaps, (input_y + 1) / 2)
loss = loss_1 + loss_2

# NOTE: despite its name, `optimizer` holds the training op returned by
# `minimize`, not the optimizer object itself.
optimizer = tf.train.AdamOptimizer(learning_rate=6e-5).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Train one sample at a time (batch size 1) for a fixed number of epochs.
    for epoch in range(10):
        for curr_img, curr_target in zip(train_images, train_targets):
            _, loss_val = sess.run(
                [optimizer, loss],
                {
                    input_x: [curr_img],
                    input_y: [curr_target]
                }
            )

    def evaluate_accuracy(images, targets):
        '''
        Score the model on the given images and targets.

        Evaluates the coordinate regression loss (`loss_1`, mean squared
        error) one sample at a time and returns `1 - mean(loss_1)`. This is
        an error-based score rather than a true classification accuracy: it
        equals 1 for perfect predictions and can be negative when the mean
        squared error exceeds 1.
        '''
        total_loss = 0
        image_count = images.shape[0]
        for curr_img, curr_target in zip(images, targets):
            loss_val = sess.run(loss_1, {
                input_x: [curr_img],
                input_y: [curr_target]
            })
            total_loss += loss_val
        return 1 - total_loss / image_count

    print("Training accuracy: {:.3f}%".format(100 * evaluate_accuracy(train_images, train_targets)))
    print("Testing accuracy : {:.3f}%".format(100 * evaluate_accuracy(test_images, test_targets)))