# MNIST CNN Multi GPU Example
Adapted from [TensorFlow: Deep MNIST for Experts](https://www.tensorflow.org/tutorials/mnist/pros/index.html).
Multi-GPU code is inspired by [TensorFlow: CNN - Training a Model using Multiple GPU Cards](https://github.com/tensorflow/tensorflow/blob/r0.9/tensorflow/models/image/cifar10/cifar10_multi_gpu_train.py)

In [None]:
import os
import sys

# add path to libraries for ipython
sys.path.append(os.path.expanduser("~/libs"))

import time
import numpy as npvariable_scope
import tensorflow as tf
import tensortools as tt

In [None]:
# --- Device layout ---------------------------------------------------------
# Number of GPU towers built by the training cell.
NUM_GPUS = 2
# Device string handed to every tt.network layer via `device=`.
MEMORY_DEVICE = '/cpu:0'

# --- Training schedule -----------------------------------------------------
# Examples per GPU; one training step consumes BATCH_SIZE * NUM_GPUS examples.
BATCH_SIZE = 128
# Number of training steps run by the training loop.
MAX_STEPS = 2000

# --- Optimisation / regularisation ------------------------------------------
# Step size given to the Adam optimizer.
LEARNING_RATE = 1e-4
# Scale passed to the L2 weight regularizer of each layer.
REG = 5e-4
# Value fed as `keep_prob` to the dropout layer during training.
DROPOUT = 0.5
# Decay used for the exponential moving average of the trainable variables.
MOVING_AVERAGE_DECAY = 0.9999

In [None]:
# Load the MNIST data set through the TensorFlow tutorial helper.
# 'MNIST_data' is the directory argument given to the helper, and
# one_hot=True requests one-hot encoded labels (the label placeholder used
# later has 10 columns).
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

In [None]:
# Dedicated graph; the model and training ops are built inside
# `with g.as_default():` further down.
g = tf.Graph()

In [None]:
def inference(x, keep_prob, scope):
    """Build the forward pass of the CNN and return its softmax output.

    Layer sequence: Conv1 (ReLU) -> max-pool -> Conv2 (leaky ReLU, 0.2)
    -> max-pool -> flatten -> FC with 1024 units (ReLU) -> dropout
    -> Output with 10 units -> softmax.

    Every tt.network layer is created with `device=MEMORY_DEVICE` and an
    L2 regularizer scaled by `REG`.

    Args:
        x: Input tensor; it is reshaped to [-1, 28, 28, 1] before the first
            convolution.
        keep_prob: Keep probability passed to `tf.nn.dropout`.
        scope: Not used inside this function; accepted because callers pass
            the tower scope.
    Returns:
        The softmax of the 10-unit output layer.
    """
    images = tf.reshape(x, [-1, 28, 28, 1])

    # First convolution + pooling stage.
    # NOTE(review): the positional arguments (64, 5, 5, 1, 1) are presumably
    # filter count, kernel height/width and strides -- confirm against
    # tensortools.
    conv1_out = tt.network.conv2d(
        "Conv1", images, 64, 5, 5, 1, 1,
        weight_init=0.01,
        bias_init=0.1,
        regularizer=tf.contrib.layers.l2_regularizer(REG),
        activation=tf.nn.relu,
        device=MEMORY_DEVICE)
    pool1_out = tt.network.max_pool2d(conv1_out, 2, 2, 2, 2)

    # Second convolution stage; no activation is passed to the layer itself,
    # a leaky ReLU is applied afterwards instead.
    conv2_out = tt.network.conv2d(
        "Conv2", pool1_out, 128, 5, 5, 1, 1,
        weight_init=0.01,
        bias_init=0.1,
        regularizer=tf.contrib.layers.l2_regularizer(REG),
        device=MEMORY_DEVICE)
    conv2_act = tt.network.lrelu(conv2_out, 0.2)
    pool2_out = tt.network.max_pool2d(conv2_act, 2, 2, 2, 2)

    # Fully connected stage on the flattened feature maps.
    flat_features = tf.contrib.layers.flatten(pool2_out)
    hidden = tt.network.fc(
        "FC", flat_features, 1024,
        weight_init=tf.contrib.layers.xavier_initializer(),
        bias_init=0.1,
        regularizer=tf.contrib.layers.l2_regularizer(REG),
        activation=tf.nn.relu,
        device=MEMORY_DEVICE)
    hidden_dropped = tf.nn.dropout(hidden, keep_prob)

    # Output layer followed by softmax.
    output_layer = tt.network.fc(
        "Output", hidden_dropped, 10,
        weight_init=tf.contrib.layers.xavier_initializer(),
        regularizer=tf.contrib.layers.l2_regularizer(REG),
        bias_init=0.1,
        device=MEMORY_DEVICE)
    return tf.nn.softmax(output_layer)

In [None]:
def loss(y, y_, scope=None):
    """Build the total loss and the accuracy for a batch of predictions.

    Args:
        y: Predicted class probabilities (the softmax output of the model),
            one row per example.
        y_: Target labels with the same layout as `y`; the code takes the
            argmax over axis 1, so one-hot rows are expected.
        scope: Not used inside this function; kept for callers that pass the
            tower scope.
    Returns:
        A `(total_loss, accuracy)` tuple: the mean cross-entropy plus all
        losses found in the REGULARIZATION_LOSSES collection, and the
        fraction of rows whose argmax matches the target.
    """
    with tf.name_scope("Loss"):
        # `y` comes out of a softmax and can underflow to exactly 0. Without
        # clipping, log(0) = -inf and 0 * -inf = NaN would poison the loss
        # and every gradient derived from it.
        log_probs = tf.log(tf.clip_by_value(y, 1e-10, 1.0))
        cross_entropy = tf.reduce_mean(
            -tf.reduce_sum(y_ * log_probs, reduction_indices=[1]))

        # Remarks: Filtering by scope might improve the visualization in
        #          TensorBoard. But this might require manual
        #          matmul()-weight decay, because the contrib-regularizers
        #          are only evaluated once.
        reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)

        # tf.add_n() rejects an empty list, so only add the regularization
        # term when at least one regularizer has been registered.
        total_loss = cross_entropy
        if reg_losses:
            total_loss = cross_entropy + tf.add_n(reg_losses)

    with tf.name_scope("Accuracy"):
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    return total_loss, accuracy

In [None]:
def tower_loss(x, y_, keep_prob, scope):
    """Build the model for a single tower and return its loss and accuracy.

    Args:
        x: Input tensor for this tower; forwarded to inference().
        y_: Target labels for this tower; forwarded to loss().
        keep_prob: Dropout keep probability; forwarded to inference().
        scope: Unique prefix string identifying the tower, e.g. 'tower_0'.
            It is handed on to inference() and loss().
    Returns:
        The `(total_loss, accuracy)` pair produced by loss() for this tower.
    """
    # Forward pass; inference() returns the softmax output of the network.
    predictions = inference(x, keep_prob, scope)

    # Loss (including regularization terms) and accuracy for this tower.
    tower_total_loss, tower_accuracy = loss(predictions, y_, scope)
    return tower_total_loss, tower_accuracy

In [None]:
with g.as_default():
    # Placeholders receive the combined batch for ALL towers
    # (BATCH_SIZE * NUM_GPUS rows); each tower slices out its own part below.
    x = tf.placeholder(tf.float32, [None, 784], "X")
    y_ = tf.placeholder(tf.float32, [None, 10], "Y_")
    keep_prob = tf.placeholder(tf.float32, name="KeepProb")

    # Step counter, incremented once per apply_gradients() call, i.e. once
    # per combined batch of BATCH_SIZE * NUM_GPUS examples.
    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(0), trainable=False)

    with tf.name_scope('Train'):
        opt = tf.train.AdamOptimizer(LEARNING_RATE)

        # Per-tower gradients and accuracies, collected for averaging.
        tower_grads = []
        tower_accuracies = []
        # range() instead of xrange(): works on Python 2 and 3 and matches
        # the training loop below.
        for i in range(NUM_GPUS):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % ('tower', i)) as scope:
                    # Rows [lo, hi) of the fed batch belong to tower i.
                    lo = i * BATCH_SIZE
                    hi = (i + 1) * BATCH_SIZE

                    # Build the MNIST model for this tower. The variables
                    # are shared across all towers.
                    t_loss, t_accuracy = tower_loss(x[lo:hi, :],
                                                    y_[lo:hi, :],
                                                    keep_prob, scope)

                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()

                    # Calculate the gradients for the batch of data on this
                    # tower.
                    grads = opt.compute_gradients(t_loss)

                    # Keep track of gradients and accuracy across towers.
                    tower_grads.append(grads)
                    tower_accuracies.append(t_accuracy)

        # We must calculate the mean of each gradient.
        # This is also the synchronization point across all towers.
        grads = tt.training.average_gradients(tower_grads)

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())

        # Group all updates into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

    with tf.name_scope('Accuracy'):
        # Mean accuracy over all towers. Every tower sees exactly BATCH_SIZE
        # rows, so the plain mean equals the accuracy over the whole fed
        # batch. Previously only the last tower's accuracy was reported.
        accuracy = tf.add_n(tower_accuracies) / float(NUM_GPUS)

    # start/init the session
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options,
                                                       log_device_placement=True))
    sess.run(tf.initialize_all_variables())

    start = time.time()
    for step in range(MAX_STEPS):
        # each gpu gets a fraction of the batch
        batch = mnist.train.next_batch(BATCH_SIZE * NUM_GPUS)
        if step % 100 == 0:
            train_accuracy = accuracy.eval(feed_dict={
                                           x: batch[0],
                                           y_: batch[1],
                                           keep_prob: 1.0})
            print("step %d, training accuracy %g" % (step, train_accuracy))

        train_op.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: DROPOUT})

    duration = time.time() - start

    # Evaluate on the test set in chunks of exactly BATCH_SIZE * NUM_GPUS
    # rows: the towers only ever read that many rows of a feed, so feeding
    # more would silently ignore the surplus. A trailing partial chunk is
    # dropped so that every tower receives a full slice.
    eval_size = BATCH_SIZE * NUM_GPUS
    num_test = mnist.test.images.shape[0]
    test_accuracies = [
        accuracy.eval(feed_dict={
            x: mnist.test.images[offset:offset + eval_size],
            y_: mnist.test.labels[offset:offset + eval_size],
            keep_prob: 1.0})
        for offset in range(0, num_test - eval_size + 1, eval_size)]
    if test_accuracies:
        print("test accuracy %g" % (sum(test_accuracies) / len(test_accuracies)))
    else:
        print("test accuracy n/a (test set smaller than one multi-GPU batch)")

    print("Duration: {} sec total | {} sec/batch | {} sec/example ".format(duration,
                                                                           duration / MAX_STEPS,
                                                                           duration / (MAX_STEPS * BATCH_SIZE * NUM_GPUS)))

# Results

Tested on DeepThought's TitanX GPUs, idle at 0% utilization and with all memory free (0 / 12206 MiB used).

### CPU only

**@ BATCH_SIZE = 128**

#### Duration

- 712.075469971 sec total
- 0.356037734985 sec/batch
- 0.00278154480457 sec/example 


#### Memory

- gpu0: 130MiB
- gpu1: 108MiB

### 1 GPU (before using multi-GPU code):

**@ BATCH_SIZE = 128**

#### Duration

- 37.0640978813 sec total
- 0.0185320489407 sec/batch
- 0.000144781632349 sec/example 

#### Memory

- gpu0: 4285MiB
- gpu1: 108MiB

### 1 GPU (store all variables on CPU)

**@ BATCH_SIZE = 128**

#### Duration

- 65.0734059811 sec total
- 0.0325367029905 sec/batch
- 0.000254192992114 sec/example

#### Memory

- gpu0: 4285MiB
- gpu1: 108MiB

### 2 GPUs (Conv1@gpu:0, Conv2@gpu:1 + colocate_gradients_with_ops=True)

**@ BATCH_SIZE = 128**

#### Duration

- 40.3634259701 sec total
- 0.020181712985 sec/batch
- 0.000157669632696 sec/example 

#### Memory

- gpu0: 2236 MiB
- gpu1: 2795 MiB

### 2 GPUs (Conv1@gpu:0, Conv2@gpu:1 + colocate_gradients_with_ops=False)

**@ BATCH_SIZE = 128**

#### Duration

- 46.6568570137 sec total
- 0.0233284285069 sec/batch
- 0.00018225334771 sec/example 

#### Memory

- gpu0: 2236 MiB
- gpu1: 2795 MiB

### 2 GPUs (Model on GPUs, Gradient avg. on CPU)

**@ BATCH_SIZE = 128 (effectively: 256)**

#### Duration

- 86.920060873 sec total
- 0.0434600304365 sec/batch
- 0.000169765743893 sec/example 


- 75.5817620754 sec total
- 0.0377908810377 sec/batch
- 0.000147620629054 sec/example

#### Memory

- gpu0: 1214 MiB
- gpu1: 1212 MiB

**@ BATCH_SIZE = 256 (effectively: 512)**

#### Duration

- 109.699135065 sec total
- 0.0548495675325 sec/batch
- 0.000107128061587 sec/example 

#### Memory

- gpu0: 2238 MiB
- gpu1: 2236 MiB

**@ BATCH_SIZE = 512 (effectively: 1024)**

#### Duration

- 168.136204004 sec total
- 0.0840681020021 sec/batch
- 0.0000820977558615 sec/example

#### Memory

- gpu0: 4324 MiB
- gpu1: 4348 MiB

### 3 GPUs (Model on GPUs, Gradient avg. on CPU)

**@ BATCH_SIZE = 256 (effectively: 768)**

#### Duration

- 117.819902897 sec total
- 0.0589099514484 sec/batch
- 0.0000767056659485 sec/example

#### Memory

- gpu0: 2236 MiB
- gpu1: 2236 MiB
- gpu2: 2274 MiB

**@ BATCH_SIZE = 256 (effectively: 768)**
*without storing variables on CPU!*

#### Duration

- 91.0233440399 sec total
- 0.04551167202 sec/batch
- 0.0000592599896093 sec/example

#### Memory

- gpu0: 2238 MiB
- gpu1: 2236 MiB
- gpu2: 2236 MiB

**@ BATCH_SIZE = 768 (effectively: 2304)**

#### Duration

- 230.154171944 sec total
- 0.115077085972 sec/batch
- 0.0000499466518975 sec/example

#### Memory

- gpu0: 4348 MiB
- gpu1: 4348 MiB
- gpu2: 4352 MiB