# Novel Convolutional Attention Mechanism

In this notebook, we present a novel attention mechanism that is core in our current research. This mechanism allows for hierarchical attention to various regious of the data, and can learn invariance to feature position, scale, and pose.

In [1]:
import numpy as np
import tensorflow as tf

In [2]:
input_images = np.random.uniform(size=[32, 57, 57, 3])
image_labels = np.random.uniform(size=[32, 1000])
g = tf.Graph()
sess = tf.Session(graph=g)

In [3]:
with g.as_default():

    with tf.variable_scope("conv_one") as scope:

        conv_one_filters = tf.get_variable(
            "conv/filters", 
            shape=[3, 3, 3, 64], 
            dtype=tf.float32, 
            initializer=tf.contrib.layers.xavier_initializer())
        conv_one_biases = tf.get_variable(
            "conv/biases", 
            shape=[1, 1, 1, 64], 
            dtype=tf.float32, 
            initializer=tf.zeros_initializer())

        attend_one_filters_one = tf.get_variable(
            "attend/filters_one", 
            shape=[3, 3, 64, 16], 
            dtype=tf.float32, 
            initializer=tf.contrib.layers.xavier_initializer())
        attend_one_biases_one = tf.get_variable(
            "attend/biases_one", 
            shape=[1, 1, 1, 16], 
            dtype=tf.float32, 
            initializer=tf.zeros_initializer())

        attend_one_filters_two = tf.get_variable(
            "attend/filters_two", 
            shape=[3, 3, 64, 16], 
            dtype=tf.float32, 
            initializer=tf.contrib.layers.xavier_initializer())
        attend_one_biases_two = tf.get_variable(
            "attend/biases_two", 
            shape=[1, 1, 1, 16], 
            dtype=tf.float32, 
            initializer=tf.zeros_initializer())

    with tf.variable_scope("conv_two") as scope:

        conv_two_filters = tf.get_variable(
            "conv/filters", 
            shape=[3, 3, 3, 64], 
            dtype=tf.float32, 
            initializer=tf.contrib.layers.xavier_initializer())
        conv_two_biases = tf.get_variable(
            "conv/biases", 
            shape=[1, 1, 1, 64], 
            dtype=tf.float32, 
            initializer=tf.zeros_initializer())

        attend_two_filters_one = tf.get_variable(
            "attend/filters_one", 
            shape=[3, 3, 64, 1], 
            dtype=tf.float32, 
            initializer=tf.contrib.layers.xavier_initializer())
        attend_two_biases_one = tf.get_variable(
            "attend/biases_one", 
            shape=[1, 1, 1, 1], 
            dtype=tf.float32, 
            initializer=tf.zeros_initializer())

        attend_two_filters_two = tf.get_variable(
            "attend/filters_two", 
            shape=[3, 3, 64, 1], 
            dtype=tf.float32, 
            initializer=tf.contrib.layers.xavier_initializer())
        attend_two_biases_two = tf.get_variable(
            "attend/biases_two", 
            shape=[1, 1, 1, 1], 
            dtype=tf.float32, 
            initializer=tf.zeros_initializer())

    with tf.variable_scope("dense") as scope:

        dense_weights = tf.get_variable(
            "dense/weights", 
            shape=[64, 1000], 
            dtype=tf.float32, 
            initializer=tf.contrib.layers.xavier_initializer())
        dense_biases = tf.get_variable(
            "dense/biases", 
            shape=[1000], 
            dtype=tf.float32, 
            initializer=tf.zeros_initializer())

    image_feed = tf.placeholder(
        tf.float32, 
        name="image_feed", 
        shape=[None, 57, 57, 3])
    label_feed = tf.placeholder(
        tf.int32, 
        name="label_feed", 
        shape=[None, 1000])

In [4]:
with g.as_default():
    
    layer_one = tf.nn.relu(
        tf.nn.conv2d(
            image_feed,
            conv_one_filters,
            [1, 1, 1, 1],
            "SAME") + conv_one_biases)
    
    attend_one_one = tf.nn.softmax(
        tf.nn.conv2d(
            layer_one,
            attend_one_filters_one,
            [1, 1, 1, 1],
            "SAME") + attend_one_biases_one)
    
    attend_one_two = tf.nn.softmax(
        tf.nn.conv2d(
            layer_one,
            attend_one_filters_two,
            [1, 1, 1, 1],
            "SAME") + attend_one_biases_two)
    
    full_attended_one = tf.reduce_sum(
        tf.expand_dims(tf.expand_dims(layer_one, axis=3), axis=4) * 
        tf.expand_dims(tf.expand_dims(attend_one_one, axis=3), axis=5) *
        tf.expand_dims(tf.expand_dims(attend_one_two, axis=4), axis=5),
        axis=[1, 2])
    
    layer_two = tf.nn.relu(
        tf.nn.conv2d(
            image_feed,
            conv_two_filters,
            [1, 1, 1, 1],
            "SAME") + conv_two_biases)
    
    attend_two_one = tf.nn.softmax(
        tf.nn.conv2d(
            layer_two,
            attend_two_filters_one,
            [1, 1, 1, 1],
            "SAME") + attend_two_biases_one)
    
    attend_two_two = tf.nn.softmax(
        tf.nn.conv2d(
            layer_two,
            attend_two_filters_two,
            [1, 1, 1, 1],
            "SAME") + attend_two_biases_two)
    
    full_attended_two = tf.reduce_sum(
        tf.expand_dims(tf.expand_dims(layer_two, axis=3), axis=4) * 
        tf.expand_dims(tf.expand_dims(attend_two_one, axis=3), axis=5) *
        tf.expand_dims(tf.expand_dims(attend_two_two, axis=4), axis=5),
        axis=[1, 2])
    
    logits = tf.tensordot(
            tf.reduce_sum(full_attended_two, axis=[1, 2]),
            dense_weights,
            1) + dense_biases
    prediction = tf.nn.softmax(logits)
    loss = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits, 
            labels=label_feed))
    gradient = tf.train.GradientDescentOptimizer(
        0.001).minimize(loss)
    init_op = tf.global_variables_initializer()
    
g.finalize()

In [8]:
sess.run(init_op)
for _i in range(20):
    p, l, _g = sess.run(
        [prediction, loss, gradient], 
        feed_dict={"image_feed:0": input_images, 
                   "label_feed:0": image_labels})
    print(
        "Prediction: %d" % np.argmax(p[0]), 
        "Actual: %d" % np.argmax(image_labels[0]),
        "Loss %.2f" % l)

Prediction: 434 Actual: 630 Loss 0.00
Prediction: 617 Actual: 630 Loss 0.00
Prediction: 136 Actual: 630 Loss 0.00
Prediction: 505 Actual: 630 Loss 0.00
Prediction: 730 Actual: 630 Loss 0.00
Prediction: 653 Actual: 630 Loss 0.00
Prediction: 809 Actual: 630 Loss 0.00
Prediction: 809 Actual: 630 Loss 0.00
Prediction: 809 Actual: 630 Loss 0.00
Prediction: 809 Actual: 630 Loss 0.00
Prediction: 809 Actual: 630 Loss 0.00
Prediction: 809 Actual: 630 Loss 0.00
Prediction: 809 Actual: 630 Loss 0.00
Prediction: 809 Actual: 630 Loss 0.00
Prediction: 809 Actual: 630 Loss 0.00
Prediction: 809 Actual: 630 Loss 0.00
Prediction: 809 Actual: 630 Loss 0.00
Prediction: 809 Actual: 630 Loss 0.00
Prediction: 809 Actual: 630 Loss 0.00
Prediction: 809 Actual: 630 Loss 0.00
