# Setup:

Select GPUs before starting this ipython server. As an example, to use /gpu:4 and /gpu:6 on the server, use the following command:

*$ export CUDA_VISIBLE_DEVICES=4,6*

If this variable is not set, or if multiple devices are selected, TensorFlow will allocate memory on **all** visible devices but will run only on /gpu:0.

In [1]:
# Force matplotlib to use inline rendering
%matplotlib inline

import os
import sys

# add path to libraries for ipython
sys.path.append(os.path.expanduser("~/libs"))

import time
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensortools as tt

from model.conv_deconv_model import ConvDeconvModel
from model.conv_lstm_model import ConvLSTMModel
from model.conv_lstmconv2d_model import ConvLSTMConv2DModel
from model.lstm_encoder_decoder import LSTMDecoderEncoderModel
from model.lstmconv2d_encoder_decoder import LSTMConv2DDecoderEncoderModel
from model.conv_lstmconv2d_encoder_decoder import ConvLSTMConv2DDecoderEncoderModel

In [4]:
# ---------------------------------------------------------------------------
# Global configuration
# ---------------------------------------------------------------------------

# Sequence layout: frames fed to the model / frames used as prediction targets
INPUT_SEQ_LENGTH = 10
OUTPUT_SEQ_LENGTH = 10

# Frame geometry
FRAME_HEIGHT = 64
FRAME_WIDTH = 64
FRAME_CHANNELS = 1

# Hardware: one model tower is built per GPU
NUM_GPUS = 2
BATCH_SIZE = 16  # per GPU!
GPU_MEMORY_FRACTION = 1.0
GPU_ALLOW_GROWTH = True

# Optimisation / learning-rate schedule
INITIAL_LEARNING_RATE = 0.001
LEARNING_RATE_DECAY_FACTOR = 0.5
FIXED_NUM_STEPS_PER_DECAY = 15000
NUM_EPOCHS_PER_DECAY = 75.0 # used if FIXED_NUM_STEPS_PER_DECAY is None
MOVING_AVERAGE_DECAY = 0.9999
MAX_STEPS = 50000

# Checkpoints and logs: set RESTORE_MODEL to a checkpoint path to resume,
# or leave it as None to start from freshly initialised variables.
TRAIN_DIR = 'train_moving_mnist'
RESTORE_MODEL = None

# Input Data

In [5]:
# One dataset batch carries the samples for all GPU towers, and every sample is
# a sequence holding the input frames followed by the target frames.
total_batch_size = BATCH_SIZE * NUM_GPUS
total_seq_length = INPUT_SEQ_LENGTH + OUTPUT_SEQ_LENGTH

dataset_train = tt.datasets.moving_mnist.MovingMNISTTrainDataset(
    total_batch_size, total_seq_length)
dataset_valid = tt.datasets.moving_mnist.MovingMNISTValidDataset(
    total_batch_size, total_seq_length)
# The test split is currently not used:
# dataset_test = tt.datasets.moving_mnist.MovingMNISTTestDataset(
#     total_batch_size, total_seq_length)

File mnist.h5 has already been downloaded.
File mnist.h5 has already been downloaded.


In [6]:
# For manual verification of used parameters.
# The value is derived exactly as train() derives its `decay_steps`: a fixed
# step count wins if configured, otherwise it is the number of whole batches
# per epoch multiplied by the number of epochs per decay.
if FIXED_NUM_STEPS_PER_DECAY is not None:
    decay_after_steps = FIXED_NUM_STEPS_PER_DECAY
else:
    decay_after_steps = (dataset_train.dataset_size // (BATCH_SIZE * NUM_GPUS)) * NUM_EPOCHS_PER_DECAY
print("Learning rate decay every {} steps".format(decay_after_steps))

if RESTORE_MODEL is not None:
    print("Restoring model file {}.".format(RESTORE_MODEL))

Learning rate decay every 15000 steps


# Training

In [7]:
def tower_loss(model):
    """Return the total loss of one tower, coupled to its loss-summary update.

    Args:
        model: model object built for this tower; its `total_loss` and `loss`
            attributes are read.
    Returns:
        The tower's total loss, passed through `tf.identity` under a control
        dependency on the op returned by `tt.board.loss_summary`, so that
        evaluating the result also runs that op.
    """
    tower_total = model.total_loss
    tower_raw = model.loss

    # Losses to be tracked: the two model losses plus everything currently in
    # the graph's regularization-loss collection.
    tracked = [tower_total, tower_raw] + tf.get_collection(
        tf.GraphKeys.REGULARIZATION_LOSSES)
    summary_update_op = tt.board.loss_summary(tracked, decay=0.9)

    # Force the summary update to run whenever the returned loss is evaluated.
    with tf.control_dependencies([summary_update_op]):
        return tf.identity(tower_total)

In [8]:
def train(inputs, targets, global_step):
    """Build the multi-GPU training graph for the sequence model.

    The fed batch is split along its first axis into NUM_GPUS consecutive
    slices of BATCH_SIZE samples. One model tower is built per GPU, all towers
    share their variables, and the per-tower gradients are averaged before a
    single Adam update is applied.

    Args:
        inputs: 5-D tensor with the input frame sequences; its first axis must
            hold BATCH_SIZE * NUM_GPUS samples.
        targets: 5-D tensor with the target frame sequences, split the same way
            as `inputs`.
        global_step: step-counter variable; it drives the learning-rate decay
            and the moving averages and is incremented by the gradient update.

    Returns:
        Tuple `(train_op, total_loss, summaries)`:
        train_op: op grouping the gradient update and the moving-average
            update of all trainable variables.
        total_loss: mean of the per-tower losses.
        summaries: list of summary ops (those of the final tower plus loss,
            learning-rate, gradient and variable summaries).
    """
    # Variables that affect learning rate. Floor division reproduces Python 2
    # integer division on any interpreter (dataset_size is assumed to be an int).
    num_batches_per_epoch = dataset_train.dataset_size // (BATCH_SIZE * NUM_GPUS)
    decay_steps = num_batches_per_epoch * NUM_EPOCHS_PER_DECAY

    if FIXED_NUM_STEPS_PER_DECAY is not None:
        decay_steps = FIXED_NUM_STEPS_PER_DECAY

    # Decay the learning rate exponentially based on the number of steps.
    # Its summary is registered once, further below, together with the other
    # summaries that are returned to the caller.
    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)

    # Compute gradients
    opt = tf.train.AdamOptimizer(lr)
    # Calculate the gradients for each model tower.
    tower_grads = []
    tower_losses = []
    for i in range(NUM_GPUS):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % ('tower', i)) as scope:
                # Slice of the global batch that belongs to this tower.
                this_inputs = inputs[i*BATCH_SIZE:(i+1)*BATCH_SIZE, :, :, :, :]
                this_targets = targets[i*BATCH_SIZE:(i+1)*BATCH_SIZE, :, :, :, :]

                # Build inference Graph. This function constructs
                # the entire model but shares the variables across all towers.
                model = ConvLSTMConv2DDecoderEncoderModel(this_inputs, this_targets)

                # Calculate the loss for one tower of the model.
                this_loss = tower_loss(model)
                tower_losses.append(this_loss)

                # Reuse variables for the next tower.
                tf.get_variable_scope().reuse_variables()

                # Retain the summaries from the final tower (the list is
                # overwritten on every iteration, so only the last one survives).
                summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

                # Calculate the gradients for the batch of data on this tower.
                grads = opt.compute_gradients(this_loss)

                # Keep track of the gradients across all towers.
                tower_grads.append(grads)

    # We must calculate the mean of each gradient.
    # This is also the synchronization point across all towers.
    grads = tt.training.average_gradients(tower_grads)

    total_loss = tf.reduce_mean(tower_losses)

    summaries.append(tf.scalar_summary('mean_total_loss', total_loss))

    # Add a summary to track the learning rate.
    summaries.append(tf.scalar_summary('learning_rate', lr))

    # Add histograms for gradients
    summaries.extend(tt.board.gradients_histogram_summary(grads))

    # Apply the gradients to adjust the shared variables.
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    summaries.extend(tt.board.variables_histogram_summary())

    # Track the moving averages of all trainable variables
    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())

    # Group all updates into a single train op.
    train_op = tf.group(apply_gradient_op, variables_averages_op, name="train_op")

    return train_op, total_loss, summaries

In [9]:
# NOTE: disabled validation-graph builder. It is kept only as a string literal,
# so this cell defines nothing; evaluating it merely echoes the text.
# The code inside is out of sync with the rest of this notebook: it calls
# tower_loss() with two arguments although tower_loss(model) takes one, and it
# refers to `model.inference` and `LAMBDA`, neither of which is defined in the
# cells shown here. Validation is performed in the main training loop instead.
'''def validate(inputs, targets):
    """Train sequence model.
    predictions_for_input:
        Either the same as predictions or None
    """
    # Calculate the gradients for each model tower.
    tower_grads = []
    tower_losses = []
    for i in xrange(NUM_GPUS):
        with tf.device('/gpu:%d' % i, ):
            with tf.name_scope('%s_%d' % ('tower', i)) as scope:
                
                this_inputs = inputs[i*BATCH_SIZE:(i+1)*BATCH_SIZE, :, :, :, :]
                this_targets = targets[i*BATCH_SIZE:(i+1)*BATCH_SIZE, :, :, :, :]
                
                # Reuse variables for the next tower.
                tf.get_variable_scope().reuse_variables()
                
                # Build inference Graph.This function constructs 
                # the entire model but shares the variables across all towers.
                predictions = model.inference(this_inputs,
                                              None,
                                              FRAME_CHANNELS, 
                                              OUTPUT_SEQ_LENGTH, 
                                              LAMBDA)
                
                # Calculate the loss for one tower of the model.
                this_loss = tower_loss(predictions, this_targets)
                tower_losses.append(this_loss)
    
    total_loss = tf.reduce_mean(tower_losses)              
    return total_loss'''

'def validate(inputs, targets):\n    """Train sequence model.\n    predictions_for_input:\n        Either the same as predictions or None\n    """\n    # Calculate the gradients for each model tower.\n    tower_grads = []\n    tower_losses = []\n    for i in xrange(NUM_GPUS):\n        with tf.device(\'/gpu:%d\' % i, ):\n            with tf.name_scope(\'%s_%d\' % (\'tower\', i)) as scope:\n                \n                this_inputs = inputs[i*BATCH_SIZE:(i+1)*BATCH_SIZE, :, :, :, :]\n                this_targets = targets[i*BATCH_SIZE:(i+1)*BATCH_SIZE, :, :, :, :]\n                \n                # Reuse variables for the next tower.\n                tf.get_variable_scope().reuse_variables()\n                \n                # Build inference Graph.This function constructs \n                # the entire model but shares the variables across all towers.\n                predictions = model.inference(this_inputs,\n                                              None,\n                

# TensorFlow Session (Main)

In [None]:
# Build the training graph, then run the training loop with periodic console
# logging (every 10 steps), TensorBoard summaries (every 100 steps), validation
# (at step 100 and every 1000 steps) and checkpoints (every 1000 steps).
with tf.Graph().as_default(), tf.device('/cpu:0'):
    global_step = tf.Variable(0, trainable=False)

    x = tf.placeholder(tf.float32, [None, INPUT_SEQ_LENGTH, FRAME_HEIGHT, FRAME_WIDTH, FRAME_CHANNELS], "X")
    y_ = tf.placeholder(tf.float32, [None, OUTPUT_SEQ_LENGTH, FRAME_HEIGHT, FRAME_WIDTH, FRAME_CHANNELS], "Y_")

    # train the model
    train_op, total_loss, summaries = train(x, y_, global_step)
    # total_valid_loss = validate(x, y_)

    # Create a saver and merge all summaries
    saver = tf.train.Saver(tf.all_variables())
    summary_op = tf.merge_summary(summaries)

    # Create a session for running operations in the Graph
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=GPU_MEMORY_FRACTION,
        allow_growth=GPU_ALLOW_GROWTH)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        # Initialize the variables (like the epoch counter)
        if RESTORE_MODEL is None:
            sess.run(tf.initialize_all_variables())
        else:
            saver.restore(sess, RESTORE_MODEL)

        # Visualize graph
        tt.visualization.show_graph(sess.graph_def)

        # Start input enqueue threads
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        summary_writer = tf.train.SummaryWriter(TRAIN_DIR, sess.graph)

        dataset_train.reset()

        try:
            # `step` is a local counter starting at 1; it is used for logging,
            # summaries and checkpoint names (not the `global_step` variable).
            step = 0
            while not coord.should_stop():
                step += 1

                if (step > MAX_STEPS):
                    break

                start_time = time.time()

                # Each sample holds the input frames followed by the target frames.
                batch = dataset_train.get_batch()
                batch_x = batch[:,0:INPUT_SEQ_LENGTH,:,:,:]
                batch_y = batch[:,INPUT_SEQ_LENGTH:INPUT_SEQ_LENGTH+OUTPUT_SEQ_LENGTH,:,:,:]

                _, loss_value = sess.run([train_op, total_loss],
                                         feed_dict={x: batch_x, y_: batch_y})
                duration = time.time() - start_time

                assert not np.isnan(loss_value), 'Model diverged with cost = NaN'

                if step % 10 == 0:
                    num_examples_per_step = BATCH_SIZE * NUM_GPUS
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print (format_str % (datetime.now().time(), step, loss_value,
                                         examples_per_sec, sec_per_batch))

                if step % 100 == 0:
                    # Re-evaluates the graph on the batch that was just trained on.
                    summary_str = sess.run(summary_op, feed_dict={x: batch_x, y_: batch_y})
                    summary_writer.add_summary(summary_str, step)
                    summary_writer.flush()

                # validation
                if step % 1000 == 0 or step == 100:
                    dataset_valid.reset()
                    num_batches = dataset_valid.dataset_size // dataset_valid.batch_size

                    valid_loss_sum = 0
                    print("Validation...")
                    for b in range(num_batches):
                        valid_batch = dataset_valid.get_batch()
                        valid_x = valid_batch[:,0:INPUT_SEQ_LENGTH,:,:,:]
                        valid_y = valid_batch[:,INPUT_SEQ_LENGTH:INPUT_SEQ_LENGTH+OUTPUT_SEQ_LENGTH,:,:,:]

                        # Only the loss is evaluated (no train_op), so no weights change.
                        # NOTE(review): total_loss carries a control dependency on the
                        # loss-summary op created in tower_loss(), so that op also runs
                        # on validation data -- confirm this is intended.
                        valid_loss = sess.run(total_loss, feed_dict={x: valid_x, y_: valid_y})
                        print("{}/{} Valid Loss= {:.6f}".format(b, num_batches, valid_loss))
                        valid_loss_sum += valid_loss

                    avg_valid_loss = valid_loss_sum / num_batches
                    print("@{}: Minibatch Avg. Valid Loss= {:.6f}".format(step, avg_valid_loss))

                    # Write the Python-side average as a Summary protobuf directly.
                    # Creating a tf.scalar_summary op here would add a new node to
                    # the graph on every validation round.
                    valid_loss_summary = tf.Summary(value=[
                        tf.Summary.Value(tag='avg_valid_loss',
                                         simple_value=float(avg_valid_loss))])
                    summary_writer.add_summary(valid_loss_summary, step)
                    summary_writer.flush()

                # Save the model checkpoint periodically.
                if step % 1000 == 0:
                    checkpoint_path = os.path.join(TRAIN_DIR, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)

        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            # When done, ask the threads to stop
            coord.request_stop()
            # Flush pending events and release the event file, also on errors.
            summary_writer.close()

        # Wait for threads to finish
        coord.join(threads)

22:15:36.491813: step 10, loss = 164.11 (49.5 examples/sec; 0.646 sec/batch)
22:15:43.012509: step 20, loss = 159.95 (50.3 examples/sec; 0.636 sec/batch)
22:15:49.447037: step 30, loss = 159.48 (45.9 examples/sec; 0.697 sec/batch)
22:15:55.887391: step 40, loss = 151.16 (47.2 examples/sec; 0.678 sec/batch)
22:16:02.314551: step 50, loss = 136.85 (52.5 examples/sec; 0.610 sec/batch)
22:16:08.779615: step 60, loss = 136.16 (46.4 examples/sec; 0.689 sec/batch)
22:16:15.488395: step 70, loss = 148.32 (47.7 examples/sec; 0.671 sec/batch)
22:16:22.160451: step 80, loss = 139.96 (45.6 examples/sec; 0.702 sec/batch)
22:16:28.947253: step 90, loss = 144.60 (47.4 examples/sec; 0.675 sec/batch)
22:16:35.729863: step 100, loss = 134.33 (47.2 examples/sec; 0.678 sec/batch)
Validation...
0/312 Valid Loss= 136.627808
1/312 Valid Loss= 141.820343
2/312 Valid Loss= 136.569611
3/312 Valid Loss= 134.664078
4/312 Valid Loss= 142.531448
5/312 Valid Loss= 133.176300
6/312 Valid Loss= 141.020874
7/312 Valid 