
**Multi-GPU Training Example.**

Train a convolutional neural network on multiple GPUs with TensorFlow.


In [1]:
from __future__ import print_function

import numpy as np
import tensorflow as tf
import time

#import mnist data
from tensorflow.examples.tutorials.mnist import input_data
# Load MNIST from /tmp/data/ (fetched there if missing) with one-hot labels.
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)


W0830 18:14:55.921128 140006326888320 deprecation.py:323] From <ipython-input-1-1f8f65d97aa7>:9: read_data_sets (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
W0830 18:14:55.922621 140006326888320 deprecation.py:323] From /usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:260: maybe_download (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Please write your own downloading logic.
W0830 18:14:55.923914 140006326888320 deprecation.py:323] From /usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:262: extract_images (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instruction

Extracting /tmp/data/train-images-idx3-ubyte.gz


W0830 18:14:56.223263 140006326888320 deprecation.py:323] From /usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:267: extract_labels (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
W0830 18:14:56.227765 140006326888320 deprecation.py:323] From /usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:110: dense_to_one_hot (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.one_hot on tensors.
W0830 18:14:56.285517 140006326888320 deprecation.py:323] From /usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:290: __init__ (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a f

Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [0]:
# Hyperparameters
# Google Colab only provides GPU:0, so a single tower is used here.
num_gpus = 1
num_steps = 200          # number of training iterations
learning_rate = 0.001    # learning rate handed to the Adam optimizer
batch_size = 1024        # examples fed to each GPU tower per step
display_step = 10        # report loss/accuracy every `display_step` steps

# Network parameters
num_input = 784          # flattened MNIST image (reshaped to 28x28 in conv_net)
num_classes = 10         # size of the one-hot label vector / output layer
# Passed as `rate` to tf.layers.dropout in conv_net.
# NOTE(review): `rate` is the fraction of units that is dropped, so 0.75 drops
# 75% of the activations -- confirm this is intended rather than a keep-prob.
dropout = 0.75

In [0]:
# build a convolutional neural network
def conv_net(x, n_classes, dropout, reuse, is_training):
  """Build the 'ConvNet' graph and return its output tensor.

  Args:
    x: input batch; reshaped inside to [-1, 28, 28, 1].
    n_classes: number of units of the final dense (output) layer.
    dropout: value forwarded as `rate` to both tf.layers.dropout calls.
    reuse: forwarded to tf.variable_scope so several calls share weights.
    is_training: Python truth value; forwarded as `training` to the dropout
      layers and used to pick the returned tensor.

  Returns:
    The raw logits when is_training is truthy, otherwise softmax(logits).
    (The training loss applies softmax itself, hence raw logits there.)
  """
  with tf.variable_scope('ConvNet', reuse=reuse):
    # MNIST samples arrive as flat 784-vectors; restore the picture layout
    # [batch, height, width, channel].
    images=tf.reshape(x, shape=[-1, 28,28,1])

    # 64 filters, kernel size 5, ReLU; then 2x2 max-pooling with stride 2
    conv1=tf.layers.conv2d(images,64,5,activation=tf.nn.relu)
    pool1=tf.layers.max_pooling2d(conv1,2,2)

    # two stacked convolutions (256 then 512 filters, kernel size 5, ReLU)
    # followed by another 2x2 max-pooling with stride 2
    conv2=tf.layers.conv2d(pool1,256, 5, activation=tf.nn.relu)
    conv3=tf.layers.conv2d(conv2,512, 5, activation=tf.nn.relu)
    pool2=tf.layers.max_pooling2d(conv3,2,2)

    # collapse the feature maps to one vector per example for the dense layers
    flat=tf.contrib.layers.flatten(pool2)

    # dense layer of 2048 units + dropout (dropout is a no-op when
    # is_training is False)
    fc1=tf.layers.dense(flat,2048)
    fc1=tf.layers.dropout(fc1,rate=dropout, training=is_training)

    # dense layer of 1024 units + dropout
    fc2=tf.layers.dense(fc1,1024)
    fc2=tf.layers.dropout(fc2,rate=dropout, training=is_training)

    # output layer: one logit per class
    logits=tf.layers.dense(fc2, n_classes)

    # softmax_cross_entropy_with_logits already applies softmax, so only the
    # inference network gets an explicit softmax
    if is_training:
      return logits
    return tf.nn.softmax(logits)

In [0]:
#build function to average gradients
def average_gradients(tower_grads):
  """Average per-variable gradients across towers.

  Args:
    tower_grads: list with one entry per tower; each entry is a list of
      (gradient, variable) pairs in the same variable order.

  Returns:
    A list of (averaged_gradient, variable) pairs, one per variable. A
    variable whose gradient is None on every tower is returned as
    (None, variable) instead of raising; None entries are excluded from the
    average. An empty `tower_grads` yields an empty list.
  """
  average_grads=[]
  for grads_and_vars in zip(*tower_grads):
    # each grads_and_vars looks like:
    # ((grad0_gpu0, var0_gpu0), ..... (grad0_gpuN, var0_gpuN))

    # variables are shared across towers, so the first tower's reference to
    # the variable is as good as any other
    v=grads_and_vars[0][1]

    # add a leading 'tower' dimension to every available gradient; None
    # gradients are skipped because they cannot be stacked
    grads=[tf.expand_dims(g,0) for g,_ in grads_and_vars if g is not None]

    if grads:
      # average over the 'tower' dimension
      grad=tf.reduce_mean(tf.concat(grads,0),0)
    else:
      grad=None

    average_grads.append((grad,v))

  return average_grads

In [0]:
# By default all variables would be placed on gpu:0, so a custom device
# function is needed to assign all variables to cpu:0 instead.
# Note: if the GPUs are peered, gpu:0 can be the faster option.
PS_OPS=['Variable', 'VariableV2', 'AutoReloadVariable']

def assign_to_device(device, ps_device='/cpu:0'):
  """Return a device function usable with tf.device.

  Args:
    device: device string returned for every op that is not a variable op.
    ps_device: device string returned for ops whose type is listed in PS_OPS.
      Accepted with or without the leading '/' ('cpu:0' or '/cpu:0').

  Returns:
    A function mapping an op (or a NodeDef) to a device string.
  """
  # Normalise to exactly one leading slash; unconditionally prepending '/'
  # turned the default '/cpu:0' into the malformed '//cpu:0'.
  if ps_device.startswith('/'):
    ps_device_spec=ps_device
  else:
    ps_device_spec='/'+ps_device

  def _assign(op):
    node_def=op if isinstance(op, tf.NodeDef) else op.node_def
    if node_def.op in PS_OPS:
      return ps_device_spec
    return device
  return _assign


In [6]:
# Build one tower per GPU, average the tower gradients, then train and test.
# All ops are placed on the CPU by default; each tower overrides that with
# its own GPU while variables stay on the CPU (see assign_to_device).
with tf.device('/cpu:0'):
  tower_grads=[]
  reuse_vars=False

  # graph input
  X=tf.placeholder(tf.float32, [None, num_input])
  Y=tf.placeholder(tf.float32, [None, num_classes])

  # a single optimizer computes every tower's gradients and applies the
  # averaged result, so it is created once outside the tower loop
  optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate)

  # loop over all GPUs and construct their own computation graph
  for i in range(num_gpus):
    with tf.device(assign_to_device('/gpu:{}'.format(i), ps_device='/cpu:0')):

      # split the data between gpus: tower i gets the i-th batch_size slice
      _x=X[i*batch_size:(i+1)*batch_size]
      _y=Y[i*batch_size:(i+1)*batch_size]

      # dropout behaves differently at training and prediction time, so two
      # distinct computation graphs sharing the same weights are created

      # graph for training (the first tower creates the variables)
      logits_train=conv_net(_x, num_classes, dropout, reuse=reuse_vars,
                           is_training=True)

      # graph for testing that reuses the same weights
      logits_test=conv_net(_x,num_classes, dropout, reuse=True,
                          is_training=False)

      # loss uses the train logits so that dropout takes effect
      loss_op=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits_train, labels=_y))
      grads=optimizer.compute_gradients(loss_op)

      # only the first gpu computes accuracy
      if i==0:
        # evaluate with the test logits so that dropout is disabled
        correct_pred=tf.equal(tf.argmax(logits_test,1), tf.argmax(_y,1))
        accuracy=tf.reduce_mean(tf.cast(correct_pred, tf.float32))

      reuse_vars=True
      tower_grads.append(grads)

  # Everything below must run once, AFTER all towers have been built: it was
  # previously indented into the tower loop, which only worked for num_gpus=1.
  tower_grads=average_gradients(tower_grads)
  train_op=optimizer.apply_gradients(tower_grads)

  # initialise the variables
  init=tf.global_variables_initializer()

  # launch the graph
  with tf.Session() as sess:
    sess.run(init)

    # keep training until the maximum number of iterations is reached
    for step in range(1, num_steps+1):
      # get one batch_size slice for each gpu
      batch_x, batch_y = mnist.train.next_batch(batch_size*num_gpus)

      # run optimization op (backprop) and time it
      ts=time.time()
      sess.run(train_op, feed_dict={X:batch_x, Y:batch_y})
      te=time.time()-ts

      if step%display_step==0 or step==1:
        # batch loss (last tower) and accuracy (first tower)
        loss, acc=sess.run([loss_op, accuracy], feed_dict={X:batch_x,
                                                          Y:batch_y})

        print("Step",step,"Loss", loss, "Accuracy",acc, "Examples/sec", int(len(batch_x)/te))

    print("Optimization finished")

    # Accuracy over the whole MNIST test set, evaluated in batch_size chunks.
    # Each chunk's accuracy is weighted by its size because the last chunk is
    # usually smaller; a plain mean of chunk accuracies would over-weight it.
    test_images=mnist.test.images
    test_labels=mnist.test.labels
    n_correct=0.0
    for start in range(0, len(test_images), batch_size):
      chunk_x=test_images[start:start+batch_size]
      chunk_y=test_labels[start:start+batch_size]
      n_correct+=sess.run(accuracy, feed_dict={X:chunk_x, Y:chunk_y})*len(chunk_x)
    print("Testing Accuracy", n_correct/len(test_images))
    

W0830 18:14:56.686281 140006326888320 deprecation.py:323] From <ipython-input-3-f0997ef65467>:10: conv2d (from tensorflow.python.layers.convolutional) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.
W0830 18:14:56.694341 140006326888320 deprecation.py:506] From /usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling __init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0830 18:14:57.041497 140006326888320 deprecation.py:323] From <ipython-input-3-f0997ef65467>:12: max_pooling2d (from tensorflow.python.layers.pooling) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.MaxPooling2D instead.
W0830 18:14:57.207391 140006326888320 deprecation.py:323] From /usr/local/l

Step 1 Loss 2.294281 Accuracy 0.17773438 Examples/sec 197
Step 10 Loss 0.7026453 Accuracy 0.8046875 Examples/sec 3719
Step 20 Loss 0.30778912 Accuracy 0.92578125 Examples/sec 3719
Step 30 Loss 0.18712993 Accuracy 0.95703125 Examples/sec 3636
Step 40 Loss 0.070492655 Accuracy 0.9794922 Examples/sec 3689
Step 50 Loss 0.09600625 Accuracy 0.9794922 Examples/sec 3695
Step 60 Loss 0.031848885 Accuracy 0.9941406 Examples/sec 3690
Step 70 Loss 0.06033961 Accuracy 0.984375 Examples/sec 3675
Step 80 Loss 0.04421864 Accuracy 0.9873047 Examples/sec 3682
Step 90 Loss 0.04632093 Accuracy 0.9863281 Examples/sec 3520
Step 100 Loss 0.0378188 Accuracy 0.99121094 Examples/sec 3597
Step 110 Loss 0.037822086 Accuracy 0.9921875 Examples/sec 3687
Step 120 Loss 0.03808236 Accuracy 0.9941406 Examples/sec 3693
Step 130 Loss 0.020776927 Accuracy 0.9941406 Examples/sec 3575
Step 140 Loss 0.022629235 Accuracy 0.9951172 Examples/sec 3593
Step 150 Loss 0.026579147 Accuracy 0.99121094 Examples/sec 3643
Step 160 Loss 