# Setup:

Select a GPU before starting this ipython server. As an example, to use /gpu:4 on the server, use the following command:

*$ export CUDA_VISIBLE_DEVICES=4*

If this variable is not set, or if multiple devices are selected, TensorFlow will allocate memory on **all** visible devices but will run only on /gpu:0.

In [1]:
# Force matplotlib to use inline rendering
%matplotlib inline

import os
import sys

# add path to libraries for ipython
sys.path.append(os.path.expanduser("~/libs"))

import time
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensortools as tt

import model.conv_deconv_model as model
# import model.conv_lstm_model as model
# import model.conv_lstmconv2d_model as model

In [2]:
# Global configuration shared by all cells below.

# --- Input geometry --------------------------------------------------------
# Number of frames per input sequence.
INPUT_SEQ_LENGTH = 5
FRAME_CHANNELS = 1

# Frame size is derived from a 320x240 base resolution times a scale factor.
FRAME_SCALE_FACTOR = 1.0
FRAME_HEIGHT = int(FRAME_SCALE_FACTOR * 240)
FRAME_WIDTH = int(FRAME_SCALE_FACTOR * 320)

# --- Optimisation ----------------------------------------------------------
BATCH_SIZE = 8
INITIAL_LEARNING_RATE = 0.001
# The learning rate is multiplied by this factor every NUM_EPOCHS_PER_DECAY epochs.
LEARNING_RATE_DECAY_FACTOR = 0.2
NUM_EPOCHS_PER_DECAY = 75.0
# Decay of the exponential moving average kept over the trainable variables.
MOVING_AVERAGE_DECAY = 0.9999

# Passed to model.inference(); presumably the regularization weight -- verify
# against the model module.
LAMBDA = 5e-4

# --- Run control -----------------------------------------------------------
# Directory for summaries and checkpoints.
TRAIN_DIR = 'train'
# Hard upper bound on the number of training steps.
MAX_STEPS = 30000

# Reminder: Uses the GPU that is selected by CUDA_VISIBLE_DEVICES!
GPU_MEMORY_FRACTION = 1.0

# Input Data

In [3]:
# Build the UCF11 training input pipeline.
# The frame geometry is taken from the global config instead of a hard-coded
# (240, 320, 1), so that changing FRAME_SCALE_FACTOR or FRAME_CHANNELS is
# reflected in the data that is actually loaded.
# NOTE(review): assumes image_size is ordered (height, width, channels), as
# suggested by the original literal (240, 320, 1) -- confirm against tensortools.
ucf11_train = tt.datasets.ucf11.UCF11TrainDataset(
    BATCH_SIZE,
    INPUT_SEQ_LENGTH,
    image_size=(FRAME_HEIGHT, FRAME_WIDTH, FRAME_CHANNELS))

# For manual verification of used parameters
print("Frame size: {}x{}".format(FRAME_WIDTH, FRAME_HEIGHT))
print("Learning rate decay every {} steps".format(NUM_EPOCHS_PER_DECAY * ucf11_train.dataset_size // BATCH_SIZE))

File UCF11_updated_mpg.rar has already been downloaded.
File UCF11_updated_mpg.rar has already been extracted.
Found 1542 serialized frame sequences. Skipping serialization.
Frame size: 320x240
Learning rate decay every 14456.0 steps


# Training

In [4]:
def train(total_cost, cost, global_step):
    """Train sequence model.

    Create an Adam optimizer with a staircase exponentially decaying learning
    rate, apply it to `total_cost`, and maintain an exponential moving average
    of all trainable variables. Also registers summaries for the learning
    rate, the losses, the variables and the gradients.

    Reads the module-level `ucf11_train`, `BATCH_SIZE`, `NUM_EPOCHS_PER_DECAY`,
    `INITIAL_LEARNING_RATE`, `LEARNING_RATE_DECAY_FACTOR` and
    `MOVING_AVERAGE_DECAY`.

    Args:
        total_cost: Total loss from loss function including regularization terms.
        cost: Raw loss from loss function without regularization terms.
        global_step: Integer Variable counting the number of training steps
                     processed.
    Returns:
        train_op: op for training. Running it applies the gradients and
                  updates the moving averages.
    """
    # Variables that affect learning rate.
    # Force true division: under Python 2, int / int truncates, which would
    # make decay_steps smaller than the value reported to the user
    # (NUM_EPOCHS_PER_DECAY * dataset_size // BATCH_SIZE).
    num_batches_per_epoch = float(ucf11_train.dataset_size) / BATCH_SIZE
    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps
    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)
    tf.scalar_summary('learning_rate', lr)

    # Generate moving averages of all losses and associated summaries
    cost_averages_op = tt.board.loss_summary([total_cost, cost] +
                                             tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

    # Compute gradients (only after the loss averages have been updated)
    with tf.control_dependencies([cost_averages_op]):
        opt = tf.train.AdamOptimizer(lr)
        grads = opt.compute_gradients(total_cost)

    # Apply gradients; this also increments global_step
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    tt.board.variables_histogram_summary()

    # Add histograms for gradients
    tt.board.gradients_histogram_summary(grads)

    # Track the moving averages of all trainable variables
    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())

    # Group the parameter update and the moving-average update under one op
    with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
        train_op = tf.no_op(name='train')

    return train_op

# TensorFlow Session (Main)

In [5]:
# Main training cell: builds the graph, trains for at most MAX_STEPS steps,
# periodically writes summaries and checkpoints to TRAIN_DIR, and finally dumps
# one batch of inputs, targets and predictions as PNG files into 'out/'.
with tf.Graph().as_default():
    # Step counter; incremented by the optimizer inside train()
    global_step = tf.Variable(0, trainable=False)

    # get images batch from dataset
    seq_batch, prediction_batch = ucf11_train.get_batch()

    # build graph and compute predictions from the inference model
    model_output = model.inference(seq_batch,
                                   FRAME_CHANNELS,
                                   INPUT_SEQ_LENGTH,
                                   LAMBDA)

    # calculate loss
    cost_with_reg, cost = model.loss(model_output, prediction_batch)

    # train the model
    train_op = train(cost_with_reg, cost, global_step)

    # Create a saver and merge all summaries
    saver = tf.train.Saver(tf.all_variables())
    summary_op = tf.merge_all_summaries()

    # Create the graph, etc.
    init_op = tf.initialize_all_variables()

    # Create a session for running operations in the Graph
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=GPU_MEMORY_FRACTION,
        allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        # Initialize the variables (like the epoch counter)
        sess.run(init_op)

        # Visualize graph
        tt.visualization.show_graph(sess.graph_def)

        # Start input enqueue threads
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        summary_writer = tf.train.SummaryWriter(TRAIN_DIR, sess.graph)

        try:
            step = 0
            while not coord.should_stop():
                step += 1

                if (step > MAX_STEPS):
                    break

                start_time = time.time()

                _, cost_with_reg_value, cost_value = sess.run([train_op, cost_with_reg, cost])
                duration = time.time() - start_time

                assert not np.isnan(cost_with_reg_value), 'Model diverged with cost = NaN'

                if step % 10 == 0:
                    num_examples_per_step = BATCH_SIZE
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = ('%s: step %d, loss = %.2f, lost w/o reg = %.2f (%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print (format_str % (datetime.now().time(), step, cost_with_reg_value, cost_value,
                                         examples_per_sec, sec_per_batch))

                if step % 100 == 0:
                    # NOTE: this is a separate run, so the summaries are
                    # computed on a freshly dequeued batch.
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)
                    summary_writer.flush()

                # Save the model checkpoint periodically and after the last step.
                # The loop runs steps 1..MAX_STEPS, so the final step is
                # `step == MAX_STEPS` (not `step + 1`).
                if step % 1000 == 0 or step == MAX_STEPS:
                    checkpoint_path = os.path.join(TRAIN_DIR, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)

        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            # When done, ask the threads to stop
            coord.request_stop()
            # Flush pending events and release the event file, also on
            # interruption.
            summary_writer.close()

        # Wait for threads to finish
        coord.join(threads)

        # NOTE(review): this fetch happens after the queue runners were asked
        # to stop; it presumably relies on examples still buffered in the
        # input queue -- confirm, or move it before request_stop().
        sequences, targets, predictions = sess.run([seq_batch, prediction_batch, model_output])

        # Make sure the output directory for the image dumps exists.
        if not os.path.isdir('out'):
            os.makedirs('out')

        # The `* 127.5 + 127.5` below rescales values before writing;
        # presumably the frames are normalized to [-1, 1] -- verify against
        # the dataset.

        # print predictions of a batch
        for idx in range(BATCH_SIZE):
            tt.utils.image.write(
                'out/{}-pred.png'.format(idx),
                predictions[idx] * 127.5 + 127.5)

        # Input frames are stacked along the channel axis: frame s occupies
        # channels [s * FRAME_CHANNELS, (s + 1) * FRAME_CHANNELS).
        for idx in range(BATCH_SIZE):
            for s in range(INPUT_SEQ_LENGTH):
                frames = sequences[:, :, :, (s * FRAME_CHANNELS):((s + 1) * FRAME_CHANNELS)]
                tt.utils.image.write(
                    'out/{}-seq{}.png'.format(idx, s),
                    frames[idx] * 127.5 + 127.5)

        for idx in range(BATCH_SIZE):
            tt.utils.image.write(
                'out/{}-target.png'.format(idx),
                targets[idx] * 127.5 + 127.5)


A
1542
Filling queue with 231 examples...
B
(8, 240, 320, 5)
C
D
E
F
G
START SESS
VIS


QUEUE INIT
QUEUE STARTED
@step: 1
@step: 2
@step: 3
@step: 4
@step: 5
@step: 6
@step: 7
@step: 8
@step: 9
@step: 10
14:11:02.417502: step 10, loss = 481.91, lost w/o reg = 481.77 (56.3 examples/sec; 0.142 sec/batch)
@step: 11
@step: 12
@step: 13
@step: 14
@step: 15
@step: 16
@step: 17
@step: 18
@step: 19
@step: 20
14:11:04.875458: step 20, loss = 462.83, lost w/o reg = 462.69 (42.9 examples/sec; 0.186 sec/batch)
@step: 21
@step: 22
@step: 23
@step: 24
@step: 25
@step: 26
@step: 27
@step: 28
@step: 29
@step: 30
14:11:06.969463: step 30, loss = 330.51, lost w/o reg = 330.36 (29.5 examples/sec; 0.271 sec/batch)
@step: 31
@step: 32
@step: 33
@step: 34
@step: 35
@step: 36
@step: 37
@step: 38
@step: 39
@step: 40
14:11:09.155833: step 40, loss = 525.83, lost w/o reg = 525.68 (32.2 examples/sec; 0.248 sec/batch)
@step: 41
@step: 42
@step: 43
@step: 44
@step: 45
@step: 46
@step: 47
@step: 48
@step: 49
@step: 50
14:11:11.034197: step 50, loss = 337.13, lost w/o reg = 336.99 (27.6 examples/sec; 0

KeyboardInterrupt: 