# Simple CNN
Goal is to see how far we can get with a ReLU-based CNN

In [1]:
import os
import tensorflow as tf
import cv2
import numpy as np
from enum import Enum
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
class CameraLabel(Enum):
    HTC_1_M7 = 0
    LG_Nexus_5x = 1
    Motorola_Droid_Maxx = 2
    Motorola_Nexus_6 = 3
    Motorola_X = 4
    Samsung_Galaxy_Note3 = 5
    Samsung_Galaxy_S4 = 6
    Sony_NEX_7 = 7
    iPhone_4s = 8
    iPhone_6 = 9

## Load Data

In [3]:
def load(path='data-224'):
    data = []
    labels = []
    # Read data from every type of generated data
    for data_path in os.listdir(path):
        full_data_path = os.path.join(path, data_path)
        # Read data from every camera type
        for label in CameraLabel:
            camera_path = os.path.join(full_data_path, label.name.replace('_', '-'))
            print("Loading images for ", camera_path)
            for filename in os.listdir(camera_path):
                data += [cv2.imread(os.path.join(camera_path, filename))]
                labels += [CameraLabel[label.name].value]
    return np.asarray(data), labels

In [4]:
images, labels = load()

Loading images for  data-224/train_resize_5/HTC-1-M7
Loading images for  data-224/train_resize_5/LG-Nexus-5x
Loading images for  data-224/train_resize_5/Motorola-Droid-Maxx
Loading images for  data-224/train_resize_5/Motorola-Nexus-6
Loading images for  data-224/train_resize_5/Motorola-X
Loading images for  data-224/train_resize_5/Samsung-Galaxy-Note3
Loading images for  data-224/train_resize_5/Samsung-Galaxy-S4
Loading images for  data-224/train_resize_5/Sony-NEX-7
Loading images for  data-224/train_resize_5/iPhone-4s
Loading images for  data-224/train_resize_5/iPhone-6
Loading images for  data-224/train_gamma_8/HTC-1-M7
Loading images for  data-224/train_gamma_8/LG-Nexus-5x
Loading images for  data-224/train_gamma_8/Motorola-Droid-Maxx
Loading images for  data-224/train_gamma_8/Motorola-Nexus-6
Loading images for  data-224/train_gamma_8/Motorola-X
Loading images for  data-224/train_gamma_8/Samsung-Galaxy-Note3
Loading images for  data-224/train_gamma_8/Samsung-Galaxy-S4
Loading image

In [5]:
""" Good ol' tandem shuffle. """
def shuffle(a, b, c=None):
    assert len(a) == len(b)
    state = np.random.get_state()
    np.random.shuffle(a)
    np.random.set_state(state)
    np.random.shuffle(b)
    if c is not None:
        assert len(c) == len(b)
        np.random.set_state(state)
        np.random.shuffle(c)

In [6]:
# Convert labels to one-hot (using tf for fun):
one_hot_graph = tf.Graph()
one_hot_session = tf.Session(graph=one_hot_graph)
with one_hot_graph.as_default():
    label_input = tf.placeholder(tf.int32)
    one_hot = tf.one_hot(label_input, 10)
label_one_hot = np.asarray(one_hot.eval(feed_dict={label_input: labels}, session=one_hot_session))

In [7]:
validation_ratio = int(len(images) * 0.8)
shuffle(images, label_one_hot)  # Shuffle before dividing into training/validation sets

train_data, train_labels = images[:validation_ratio], label_one_hot[:validation_ratio]
valid_data, valid_labels = images[validation_ratio:], label_one_hot[validation_ratio:]
training_samples = len(train_data)
print(train_data.shape, train_labels.shape)
print(valid_data.shape, valid_labels.shape)

(19800, 224, 224, 3) (19800, 10)
(4950, 224, 224, 3) (4950, 10)


# Network

In [8]:
####################################
# Some convenience functions for creating layers
def std(inputs):
    return 1 / math.sqrt(inputs)
def conv2d(x, w, strides=[1,1,1,1], padding='SAME'):
    return tf.nn.conv2d(x, w, strides=strides, padding=padding)

def max_pool(x, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME'):
    return tf.nn.max_pool(x, ksize=ksize, strides=strides, padding=padding)
####################################
# calculating validation accuracy 
def total_correct(correct_prediction, valid_data, batch_size):    
    correct_prediction.eval(feed_dict={X: batch[0], y_: batch[1], keep_prob: 1.0})

## Network Configuration

In [9]:
shape_in = train_data.shape[1:]
print(shape_in)
conv1_filter_shape = [5, 5, 3]
conv2_filter_shape = [5, 5]

# Training Config
batches = 1000
batch_size = 100

# Logging
show_train_accuracy = 25
show_test_accuracy = 50

# Network Config
conv1_features = 32
conv2_features = 32

(224, 224, 3)


## Simple Network
This is basically the same network I used when exploring expression recognition and speech commands. Training for a couple thousand batches will get it to around 98%/65% training/validation accuracy

In [11]:
# 1. Define the network
network_graph = tf.Graph();
network_session = tf.Session(graph=network_graph)
with network_graph.as_default():
    
    # Input images and labels
    X = tf.placeholder(tf.float32, [None, shape_in[0], shape_in[1], shape_in[2]])
    y_ = tf.placeholder(tf.float32, [None, 10])
    keep_prob = tf.placeholder(tf.float32)
    # Conv Layer 1
    w_conv1 = tf.Variable(tf.truncated_normal([conv1_filter_shape[0], conv1_filter_shape[1], conv1_filter_shape[2], conv1_features], stddev=std(shape_in[0] * shape_in[1] * shape_in[2])))
    b_conv1 = tf.Variable(tf.constant(0.1, shape=[conv1_features]))
    h_conv1 = tf.nn.relu(conv2d(tf.reshape(X, [-1, shape_in[0], shape_in[1], shape_in[2]]), w_conv1) + b_conv1)
    h_pool1 = max_pool(h_conv1)
    # Conv Layer 2
    w_conv2 = tf.Variable(tf.truncated_normal([conv2_filter_shape[0], conv2_filter_shape[1], conv1_features, conv2_features], stddev=std(conv2_filter_shape[0] * conv2_filter_shape[1] * conv2_features)))
    b_conv2 = tf.Variable(tf.constant(0.1, shape=[conv2_features]))
    h_conv2 = tf.nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)

    h_conv2_shape = h_conv2.get_shape().as_list()
    hc2shape_product = h_conv2_shape[1] * h_conv2_shape[2] *h_conv2_shape[3]
    h_conv2_flat = tf.reshape(h_conv2, [-1, hc2shape_product])
    print(h_conv2_shape)
    
    # FC Layer 1
    w_fc1 = tf.Variable(tf.truncated_normal([hc2shape_product, 128], stddev=std(hc2shape_product)))
    b_fc1 = tf.Variable(tf.constant(0.1, shape=[128]))
    h_fc1 = tf.nn.relu(tf.matmul(h_conv2_flat, w_fc1) + b_fc1)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    # FC Layer 2
    w_fc2 = tf.Variable(tf.truncated_normal([128, 128], stddev=std(128*128)))
    b_fc2 = tf.Variable(tf.constant(0.1, shape=[128]))
    h_fc2 = tf.nn.relu(tf.matmul(h_fc1_drop, w_fc2) + b_fc2)
    h_fc2_drop = tf.nn.dropout(h_fc2, keep_prob)
    # FC Layer 3 (Output Layer)
    w_fc3 = tf.Variable(tf.truncated_normal([128, 10], stddev=std(128*10)))
    b_fc3 = tf.Variable(tf.constant(0.1, shape=[10]))
    y_conv = tf.matmul(h_fc2_drop, w_fc3) + b_fc3

    softmax = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv)
    cross_entropy = tf.reduce_mean(softmax)
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    saver = tf.train.Saver()
# 2. Train the Network
with network_session as sess:
    if os.listdir('tf_checkpoints'):
        print('Checkpoint found. Loading...')
        saver.restore(sess, 'tf_checkpoints/model.ckpt')
    else:    
        sess.run(tf.global_variables_initializer())
    
    index = 0
    for i in range(batches):
        
        # Shuffle the data
        if (index + 2 * batch_size) > training_samples :  # Just ignore the partial batch, whateverrrr
            print('Shuffling training data')
            shuffle(train_data, train_labels)
            index = 0
        else:
            index += batch_size
        batch = (train_data[index:index+batch_size], train_labels[index:index+batch_size])
        train_step.run(feed_dict={X: batch[0], y_: batch[1], keep_prob: 0.5})
        # 2.A Test the Network
        if i % show_train_accuracy == 0:
            train_accuracy = accuracy.eval(feed_dict={
              X: batch[0], y_: batch[1], keep_prob: 1.0})
            print('step %d, training accuracy %g' % (i, train_accuracy))
        if i % show_test_accuracy == 0:
            print('validation accuracy %g' % accuracy.eval(feed_dict={
                  X: valid_data[:batch_size], y_: valid_labels[:batch_size], keep_prob: 1.0}))
            
    # Show Testing and Validation Accuracy at the end
    # 2.B Validate the Network
    print('validation accuracy %g' % accuracy.eval(feed_dict={
        X: valid_data[:batch_size], y_: valid_labels[:batch_size], keep_prob: 1.0}))
    
    validation_result = correct_prediction.eval(feed_dict={X: valid_data[:batch_size], y_: valid_labels[:batch_size], keep_prob: 1.0})
    validation_output = y_conv.eval(feed_dict={X: valid_data[:batch_size], y_: valid_labels[:batch_size], keep_prob: 1.0})
    print('Saving checkpoint')
    saver.save(sess, 'tf_checkpoints/model.ckpt')

[None, 112, 112, 32]
step 0, training accuracy 0.1
validation accuracy 0.07
step 25, training accuracy 0.12
step 50, training accuracy 0.06
validation accuracy 0.15
step 75, training accuracy 0.13
step 100, training accuracy 0.15
validation accuracy 0.16
step 125, training accuracy 0.11
step 150, training accuracy 0.12
validation accuracy 0.1
step 175, training accuracy 0.09
Shuffling training data
step 200, training accuracy 0.15
validation accuracy 0.15
step 225, training accuracy 0.14
step 250, training accuracy 0.12
validation accuracy 0.17
step 275, training accuracy 0.22
step 300, training accuracy 0.27
validation accuracy 0.26
step 325, training accuracy 0.13
step 350, training accuracy 0.2
validation accuracy 0.23
step 375, training accuracy 0.23
Shuffling training data
step 400, training accuracy 0.24
validation accuracy 0.29
step 425, training accuracy 0.31
step 450, training accuracy 0.36
validation accuracy 0.36
step 475, training accuracy 0.22
step 500, training accuracy 0

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


## Going Deeper
Next up, let's move on and make the network not only deeper, but residual as well.

Let's make some mods!

## Helper Function
First off, let's make a convenient way of generating multiple convolution layers

In [9]:
def conv_block(input_tensor, filter_shape, out_features):
    input_shape = input_tensor.get_shape().as_list()
    w_conv = tf.Variable(tf.truncated_normal([filter_shape[0], filter_shape[1], input_shape[-1], out_features], stddev=std(filter_shape[0] * filter_shape[1] * out_features)))
    b_conv = tf.Variable(tf.constant(0.1, shape=[out_features]))
    h_conv = tf.nn.relu(conv2d(input_tensor, w_conv) + b_conv)
    return h_conv


In [11]:
shape_in = train_data.shape[1:]
print(shape_in)
conv1_filter_shape = [7, 7]
conv2_filter_shape = [3, 3]
conv3_filter_shape = [3, 3]
conv4_filter_shape = [3, 3]
conv5_filter_shape = [3, 3]
# Training Config
batches = 100
batch_size = 50

# Logging
show_train_accuracy = 25
show_test_accuracy = 50

# Network Config
conv1_features = 64
conv2_features = 64
conv3_features = 128
conv4_features = 256
conv5_features = 512
conv2_layers = 6  # Conv1 is always 1 layer in our case
conv3_layers = 8
conv4_layers = 12
conv5_layers = 6

(224, 224, 3)


In [12]:
def calculate_accuracy(tensor, data, labels, batch_size=100):
    results = []
    total = 0.0
    for start_index in range(0, batch_size, batch_size): # len(data), batch_size):  # just doing samples of 100 because it takes forever
        end_index = min(start_index + 100, len(data))
        result = tensor.eval(feed_dict={
                  X: data[start_index:end_index], y_: labels[start_index:end_index], keep_prob: 1.0})
        # scale if smaller than batch size
        results += [result * (end_index - start_index) / batch_size]
    return np.average(results)

## Not Residual Yet
Here's a basic, deep network without the residual part of a ResNet. It...struggles to get anywhere.

In [14]:

# 1. Define the network
network_graph = tf.Graph();
network_session = tf.Session(graph=network_graph)
with network_graph.as_default():
    
    # Input images and labels
    X = tf.placeholder(tf.float32, [None, shape_in[0], shape_in[1], shape_in[2]])
    y_ = tf.placeholder(tf.float32, [None, 10])
    keep_prob = tf.placeholder(tf.float32)
    # Conv Layer 1
    h_conv1 = conv_block(tf.reshape(X, [-1, shape_in[0], shape_in[1], shape_in[2]]), conv1_filter_shape, conv1_features)
    h_pool1 = max_pool(h_conv1)
    # Conv Layer 2
    print('Pool Shape: ', h_pool1.get_shape().as_list())
    h_conv2 = h_pool1  # A little hacky, but necessary to start the next block
    for i in range(conv2_layers):        
        h_conv2 = conv_block(h_conv2, conv2_filter_shape, conv2_features)
    h_pool2 = max_pool(h_conv2)
    # Conv Layer 3
    h_conv3 = h_pool2  # Again, a little hacky
    for i in range(conv3_layers):
        h_conv3 = conv_block(h_conv3, conv3_filter_shape, conv3_features)
    h_pool3 = max_pool(h_conv3)
    # Conv Layer 4
    h_conv4 = h_pool3  # Again, a little hacky
    for i in range(conv4_layers):
        h_conv4 = conv_block(h_conv4, conv4_filter_shape, conv4_features)
    h_pool4 = max_pool(h_conv4)
    h_conv5 = h_pool4  # Again, a little hacky
    for i in range(conv5_layers):
        h_conv5 = conv_block(h_conv5, conv5_filter_shape, conv5_features)
    
    h_conv5_shape = h_conv5.get_shape().as_list()
    hc5shape_product = h_conv5_shape[1] * h_conv5_shape[2] * h_conv5_shape[3]
    h_conv5_flat = tf.reshape(h_conv5, [-1, hc5shape_product])
    print('Final pool shape: ', h_conv5_shape)
    
    # FC Layer 1
    #w_fc1 = tf.Variable(tf.truncated_normal([hp2shape_product, 1024], stddev=std(hp2shape_product)))
    w_fc1 = tf.Variable(tf.truncated_normal([hc5shape_product, 256], stddev=std(hc5shape_product)))
    b_fc1 = tf.Variable(tf.constant(0.1, shape=[256]))
    h_fc1 = tf.nn.relu(tf.matmul(h_conv5_flat, w_fc1) + b_fc1)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    # FC Layer 2
    w_fc2 = tf.Variable(tf.truncated_normal([256, 256], stddev=std(256*256)))
    b_fc2 = tf.Variable(tf.constant(0.1, shape=[256]))
    h_fc2 = tf.nn.relu(tf.matmul(h_fc1_drop, w_fc2) + b_fc2)
    h_fc2_drop = tf.nn.dropout(h_fc2, keep_prob)
    # FC Layer 3 (Output Layer)
    w_fc3 = tf.Variable(tf.truncated_normal([256, 10], stddev=std(256*10)))
    b_fc3 = tf.Variable(tf.constant(0.1, shape=[10]))
    y_conv = tf.matmul(h_fc2_drop, w_fc3) + b_fc3

    softmax = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv)
    cross_entropy = tf.reduce_mean(softmax)
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    saver = tf.train.Saver()
# 2. Train the Network
with network_session as sess:
    if os.listdir('tf_checkpoints'):
        print('Checkpoint found. Loading...')
        saver.restore(sess, 'tf_checkpoints/model.ckpt')
    else:    
        sess.run(tf.global_variables_initializer())
    
    index = 0
    for i in range(batches):
        
        # Shuffle the data
        if (index + 2 * batch_size) > training_samples :  # Just ignore the partial batch, whateverrrr
            print('Shuffling training data')
            shuffle(train_data, train_labels)
            index = 0
        else:
            index += batch_size
        batch = (train_data[index:index+batch_size], train_labels[index:index+batch_size])
        
        train_step.run(feed_dict={X: batch[0], y_: batch[1], keep_prob: 0.5})
        # 2.A Test the Network
        if i % show_train_accuracy == 0:
            #train_accuracy = calculate_accuracy(accuracy, train_data, train_labels, batch_size)
            
            print('step %d, training accuracy %g' % (i, accuracy.eval(feed_dict={X: batch[0], y_: batch[1], keep_prob: 1.0})))
        if i % show_test_accuracy == 0:
            #print('validation accuracy %g' % calculate_accuracy(accuracy, valid_data, valid_labels, batch_size))
            print('validation accuracy %g' % accuracy.eval(feed_dict={X: valid_data[:batch_size], y_: valid_labels[:batch_size], keep_prob: 1.0}))
            
    # Show Testing and Validation Accuracy at the end
    # 2.B Validate the Network
    #print('validation accuracy %g' % calculate_accuracy(accuracy, valid_data, valid_labels, batch_size))
    accuracy.eval(feed_dict={X: valid_data[:batch_size], y_: valid_labels[:batch_size], keep_prob: 1.0})
    
    validation_result = correct_prediction.eval(feed_dict={X: valid_data[:batch_size], y_: valid_labels[:batch_size], keep_prob: 1.0})
    validation_output = y_conv.eval(feed_dict={X: valid_data[:batch_size], y_: valid_labels[:batch_size], keep_prob: 1.0})
    print('Saving checkpoint')
    saver.save(sess, 'tf_checkpoints/model.ckpt')

Pool Shape:  [None, 112, 112, 64]
Final pool shape:  [None, 14, 14, 512]
Checkpoint found. Loading...
INFO:tensorflow:Restoring parameters from tf_checkpoints/model.ckpt
step 0, training accuracy 0.06
validation accuracy 0.14
step 25, training accuracy 0.16
step 50, training accuracy 0.08
validation accuracy 0.12
step 75, training accuracy 0.14
Saving checkpoint


## Getting Residual
To build the ResNet graph, we need to divide it up into chunks (in a more appropriate way than above). First, let's break down each part of the ResNet model (to match up with Table 1 of the [original paper](https://arxiv.org/pdf/1512.03385.pdf):  
Input:  
* 224x224x3  

### First Conv Layer
This one is a bit of an exception compared to the conv layers following it.  
* Conv - 7x7, stride 2, 64 feature output  
* BatchNorm & Scale - Need to set weight decay for BatchNorm, then scale (normalize) back up to 255
* ReLU activation  
Pool:  
* Max Pool - Use a 3x3 kernel with a stride of 2  
Output:  
* 56x56x64 (note the stride=2 for the convolution)  

### Bottleneck Blocks  
Each block of the ResNet structure consists of a convolution, batch normalization, and scaling, which is repeated 3, 4, or 6 times. Additionally, the first part of each block includes a projection of the output from the previous layer (every other part of the block will simply take the output of the previous part of the block). So, each block will look like this:

#### Projection
This is the easy part. For the first activation of a block, we'll need to project the previous block (or pool) to have enough features. Simply perform a 1x1 convolution with 4 times the number of input features followed by the typical batch normalization and scaling.

#### Bottleneck Operation
Perform a set of 3 convolutions (each followed by batch norm and scaling and ReLU, except for the final one). The convolution sizes should be:  
* 1x1xinput_features  
* 3x3xinput_features  
* 1x1x4(input_features)  

Once the output of the final convolution (and BN & scaling) is calculated, perform an **elementwise addition** between this output and the one from the projection. Then, **run the result through the ReLU activation**.  

**For every bottleneck operation beyond the first** (that is, before we have another convolution with a stride of 2 and multiply the features by 4 again), perform another bottleneck operation and simply (elementwise-)add the output of the previous bottleneck operation to it before performing the activation.

In [None]:
####################################
# Some convenience functions for creating layers

def conv2d(x, w, stride=1, padding='SAME'):
    return tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=padding)

def max_pool(x, ksize=2, stride=2, padding='SAME'):
    return tf.nn.max_pool(x, ksize=[1, ksize, ksize, 1], strides=[1, stride, stride, 1], padding=padding)

# ResNet functions
def conv_batch_scale():
    conv2d
    tf.contrib.layers.batch_norm()
def bottleneck_block(tensor_in):
    
####################################