# Deep Learning Pipeline for Meme captioning

In [1]:
%matplotlib inline
import numpy as np
import tensorflow as tf
import csv
import os, codecs
from utils.inception_v3 import InceptionV3

Using TensorFlow backend.


In [2]:
# Display the installed TensorFlow version so readers know which API this notebook targets.
tf.__version__

'0.12.1'

## Keras Inception definition
See [`inception_v3.py`](utils/inception_v3.py) for more info on the specification.
The last two layers are dropped due to implementation issues.

In [3]:
# Build the Inception v3 network with ImageNet weights. The constructor from
# utils/inception_v3.py returns a pair; only its first element is kept.
model, _ = InceptionV3(include_top=True, weights='imagenet')

# Remove the last two entries from the model's layer list.
# NOTE(review): popping from `model.layers` may not change the tensor the model
# actually outputs -- confirm how `get_data` reads features from `model`.
model.layers.pop()
model.layers.pop()

# Disabled: dump the CNN graph for TensorBoard.
#with tf.variable_scope("inception_v3") as scope:
 #   writer = tf.summary.FileWriter('./cnn', cnn_graph)
#writer.close()

<keras.layers.core.Flatten at 0x7eff59d6ed30>

In [4]:
import keras

A data sample will be extracted from [`meme_characters/`](meme_characters/). This data must
be crawled beforehand!

The `stats` function will compute the statistics from the existing dataset in
[`meme_characters/`](meme_characters/).

In [6]:
from utils.meme_stats import stats, sizeof_fmt

# Root folder of the crawled meme dataset used by the following cells.
global_dir = "10kmc/part-0-to-10000/"

# Optional dataset statistics (disabled; un-comment to recompute them):
# ncaptions, nmeme_characters, nwords, nchars, total_size = stats(global_dir, False)
# print('total number of captions', ncaptions)
# print('total number of meme characters', nmeme_characters)
# print('total number of words:', nwords)
# print('total number of characters:', nchars)
# print('total size:', sizeof_fmt(total_size))

Now we get a proportion of the data and we encode it using a CNN and sequence embeddings.

In [7]:
from utils.data_utils import get_data

# Load a quarter of the dataset (quantity=0.25). `get_data` receives the dataset
# folder and the CNN `model`, and returns the encoded samples plus the vocabulary.
# NOTE(review): presumably each row is an [image features, caption sequence]
# pair -- confirm in utils/data_utils.py.
images_captions, voca = get_data(global_dir, model, quantity=0.25)
print(np.shape(images_captions))

total_size: 358410
ub: 89602.5
1
2
vocabulary length: 3405
total sequences: 2587
total chars: 3405
Vectorization...
(2587, 2)


In [None]:
from keras.layers import LSTM, Input
from keras.models import Model

# Toy stateful LSTM: fixed batches of 32 sequences, 10 timesteps, 1 feature each.
# The input tensor is called `lstm_input` so Python's built-in `input` is not shadowed.
lstm_input = Input(batch_shape=(32, 10, 1))
lstm_layer = LSTM(10, stateful=True)(lstm_input)

# NOTE: this rebinds `model`, which until this cell referred to the Inception v3
# network; re-running earlier cells afterwards would pick up the LSTM model instead.
model = Model(lstm_input, lstm_layer)
model.compile(optimizer="adam", loss="mse")

In [None]:
import keras.backend as K

# Random starting values for the two LSTM state tensors, each of shape (32, 10).
state_shape = (32, 10)
hidden_states = K.variable(value=np.random.normal(size=state_shape))
cell_states = K.variable(value=np.random.normal(size=state_shape))

# Replace both state slots of the second layer of `model` with the new variables.
recurrent_layer = model.layers[1]
recurrent_layer.states[0] = hidden_states
recurrent_layer.states[1] = cell_states

### Interactive session creation

In [None]:
# Start from an empty default graph, then open an interactive session that
# logs the device every op is placed on.
tf.reset_default_graph()
session_config = tf.ConfigProto(log_device_placement=True)
sess = tf.InteractiveSession(config=session_config)

## TRÈS IMPORTANT !

Images and captions are stored in an array called __`images_captions`__, which was built above by `get_data`.

In [None]:
def batch_with_dynamic_pad(images_and_captions,
                           batch_size,
                           queue_capacity,
                           add_summaries=True):
  """Batches input images and captions.

  Each caption is split into an input sequence (the caption without its last
  token) and a target sequence (the caption without its first token), so the
  target at every position is the token that follows the input token. Images
  are flattened to 1-D. Input and target sequences are batched and padded up
  to the maximum length of sequences in the batch. A mask is created to
  distinguish real words from padding words.

  Example:
    Actual captions in the batch ('-' denotes padded character):
      [
        [ 1 2 3 4 5 ],
        [ 1 2 3 4 - ],
        [ 1 2 3 - - ],
      ]
    input_seqs:
      [
        [ 1 2 3 4 ],
        [ 1 2 3 - ],
        [ 1 2 - - ],
      ]
    target_seqs:
      [
        [ 2 3 4 5 ],
        [ 2 3 4 - ],
        [ 2 3 - - ],
      ]
    mask:
      [
        [ 1 1 1 1 ],
        [ 1 1 1 0 ],
        [ 1 1 0 0 ],
      ]

  Args:
    images_and_captions: An iterable of pairs [image, caption], where image is
      a NumPy array or Tensor of any shape (it is flattened to 1-D) and caption
      is a 1-D array or Tensor of any length. Each pair will be processed and
      added to the queue in a separate thread.
    batch_size: Batch size.
    queue_capacity: Queue capacity.
    add_summaries: If true, add caption length summaries.

  Returns:
    images: A Tensor of shape [batch_size, flattened_image_size].
    input_seqs: A Tensor of shape [batch_size, padded_length] with the dtype
      of the captions.
    target_seqs: A Tensor of shape [batch_size, padded_length] with the dtype
      of the captions.
    mask: An int32 0/1 Tensor of shape [batch_size, padded_length].
  """
  enqueue_list = []
  for image, caption in images_and_captions:
    caption_length = tf.shape(caption)[0]
    # Inputs and targets are both one token shorter than the caption.
    input_length = tf.expand_dims(tf.subtract(caption_length, 1), 0)

    input_seq = tf.slice(caption, [0], input_length)
    target_seq = tf.slice(caption, [1], input_length)
    indicator = tf.ones(input_length, dtype=tf.int32)
    # tf.reshape accepts NumPy arrays as well as Tensors, whereas `.flatten()`
    # only exists on NumPy arrays.
    img = tf.reshape(image, [-1])
    enqueue_list.append([img, input_seq, target_seq, indicator])

  # dynamic_pad=True pads every component up to the longest one in the batch;
  # the padded positions of `indicator` become the zeros of the mask.
  images, input_seqs, target_seqs, mask = tf.train.batch_join(
      enqueue_list,
      batch_size=batch_size,
      capacity=queue_capacity,
      dynamic_pad=True,
      name="batch_and_pad")
  # Print the static shape; printing `tf.shape(images)` would only show the
  # graph op, not the dimensions.
  print(images.get_shape())

  if add_summaries:
    # The mask covers caption_length - 1 positions, hence the +1.
    lengths = tf.add(tf.reduce_sum(mask, 1), 1)
    tf.summary.scalar("caption_length/batch_min", tf.reduce_min(lengths))
    tf.summary.scalar("caption_length/batch_max", tf.reduce_max(lengths))
    tf.summary.scalar("caption_length/batch_mean", tf.reduce_mean(lengths))

  return images, input_seqs, target_seqs, mask

## (LSTM) Model Parameter Definitions

Variables are defined, and embeddings are built up.

In [None]:
# Width of every token embedding vector.
embedding_size = 100

# All variables are drawn uniformly from [-initializer_scale, initializer_scale],
# matching the "Show and Tell" paper.
initializer_scale = 0.08
initializer = tf.random_uniform_initializer(minval=-initializer_scale,
                                            maxval=initializer_scale)

# Queue the first 200 samples and read them back in padded batches of 100.
images, input_seqs, target_seqs, input_mask = batch_with_dynamic_pad(
    images_captions[:200],
    batch_size=100,
    queue_capacity=200)

# Embedding table with one row per vocabulary entry; created and looked up on the CPU.
with tf.variable_scope("seq_embedding"), tf.device("/cpu:0"):
    embedding_shape = [len(voca), embedding_size]
    embedding_map = tf.get_variable(name="map",
                                    shape=embedding_shape,
                                    initializer=initializer)
    seq_embeddings = tf.nn.embedding_lookup(embedding_map, input_seqs)

In [None]:
# Show the statically known shape of the image batch. `tf.shape(images)` only
# builds a graph op, so printing it shows the op rather than the dimensions.
print(images.get_shape())

## Map inception output into embedding space.

In [None]:
# Linear projection (no activation, no bias) of the flattened image features
# into a 100-dimensional space.
with tf.variable_scope("image_embedding") as scope:
    image_embeddings = tf.contrib.layers.fully_connected(
        scope=scope,
        inputs=images,
        num_outputs=100,
        weights_initializer=initializer,
        biases_initializer=None,
        activation_fn=None)

A [`LSTMStateTuple`](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/LSTMStateTuple) will be needed to store all the (_batched_) embeddings obtained from the ConvNet, in order to initialize our RecurrentNet.

In [None]:
from tensorflow.contrib.rnn import LSTMStateTuple

## LSTM Specification

A (_training_) LSTM net is specified from a [`BasicLSTMCell`](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/BasicLSTMCell) and an `LSTMStateTuple` that holds its initial state.

In [None]:
# Single-layer LSTM. Its state size (100) must equal the width of
# `image_embeddings`, because the embeddings are used as the initial state.
lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=100, state_is_tuple=True)

# Use the current batch's image embeddings directly as the initial LSTM state.
# Wrapping them in a `tf.Variable` would evaluate the embeddings only once, at
# variable-initialisation time: later caption batches would then be paired with
# stale image features, and no gradient would reach the image-embedding layer.
state = image_embeddings
initial_state = LSTMStateTuple(state, state)

# Sketch of the inference-mode graph (disabled):
# with tf.variable_scope("lstm", initializer=initializer) as lstm_scope:
# if mode == "inference":
#     # In inference mode, use concatenated states for convenient feeding and
#     # fetching.
#     tf.concat(axis=1, values=initial_state, name="initial_state")

#     # Placeholder for feeding a batch of concatenated states.
#     state_feed = tf.placeholder(dtype=tf.float32,
#                                 shape=[None, sum(lstm_cell.state_size)],
#                                 name="state_feed")
#     c, h = tf.split(value=state_feed, num_or_size_splits=2, axis=1)
#     state_tuple = LSTMStateTuple(c, h)

#     # Run a single LSTM step.
#     lstm_outputs, state_tuple = lstm_cell(inputs=tf.squeeze(seq_embeddings, axis=[1]),
#                                           state=state_tuple)

#     # Concatenate the resulting state.
#     tf.concat(axis=1, values=state_tuple, name="state")
# else:
# Run the batch of sequence embeddings through the LSTM.
with tf.variable_scope("lstm", initializer=initializer) as lstm_scope:
    # Number of real (non-padding) tokens in each sequence of the batch.
    sequence_length = tf.reduce_sum(input_mask, 1)
    lstm_outputs, _ = tf.nn.dynamic_rnn(cell=lstm_cell,
                                        inputs=seq_embeddings,
                                        sequence_length=sequence_length,
                                        initial_state=initial_state,
                                        dtype=tf.float32)

In [None]:
# Merge the batch and time axes so that every timestep becomes one row.
lstm_outputs = tf.reshape(lstm_outputs, [-1, lstm_cell.output_size])

# Linear layer producing one score per vocabulary entry for each row.
with tf.variable_scope("logits") as logits_scope:
    logits = tf.contrib.layers.fully_connected(
        scope=logits_scope,
        inputs=lstm_outputs,
        num_outputs=len(voca),
        weights_initializer=initializer,
        activation_fn=None)

# (An inference-mode graph would apply tf.nn.softmax(logits, name="softmax") here instead.)
# Flatten targets and mask so they line up with the rows of `logits`.
targets = tf.reshape(target_seqs, [-1])
weights = tf.to_float(tf.reshape(input_mask, [-1]))

# Per-position cross entropy, averaged over the unmasked positions only.
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                        labels=targets)
masked_loss_sum = tf.reduce_sum(tf.multiply(losses, weights))
mask_total = tf.reduce_sum(weights)
batch_loss = tf.div(masked_loss_sum, mask_total, name="batch_loss")
tf.losses.add_loss(batch_loss)
total_loss = tf.losses.get_total_loss()

# Scalar summaries for both losses, plus a histogram for every trainable variable.
tf.summary.scalar("losses/batch_loss", batch_loss)
tf.summary.scalar("losses/total_loss", total_loss)
for var in tf.trainable_variables():
    tf.summary.histogram("parameters/" + var.op.name, var)

# Aliases kept for evaluation.
target_cross_entropy_losses = losses
target_cross_entropy_loss_weights = weights

In [None]:
# Sets up the function to restore inception variables from checkpoint.
# if mode != "inference":
# Restore inception variables only.
# inception_variables = tf.get_collection(
#     tf.GraphKeys.GLOBAL_VARIABLES, scope="InceptionV3")
# saver = tf.train.Saver(inception_variables)

# def restore_fn(sess):
#     tf.logging.info("Restoring Inception variables from checkpoint file %s",
#                     inception_checkpoint_file)
#     saver.restore(sess, inception_checkpoint_file)

# init_fn = restore_fn
# else:
#     init_fn = None

In [None]:
# Sets up the global step Tensor: a non-trainable counter starting at 0 that is
# registered in both the GLOBAL_STEP and GLOBAL_VARIABLES collections.
global_step = tf.Variable(
    initial_value=0,
    name="global_step",
    trainable=False,
    collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

## Training

In [None]:
# Make sure the training directory exists, creating it on first use.
train_dir = 'train/'
train_dir_missing = not tf.gfile.IsDirectory(train_dir)
if train_dir_missing:
    tf.logging.info("Creating training directory: %s", train_dir)
    tf.gfile.MakeDirs(train_dir)

In [None]:
# Training hyper-parameters, grouped by purpose.

# --- Optimisation ---
optimizer = "SGD"       # optimizer used to train the model
clip_gradients = 5.0    # if not None, gradients are clipped to this value
batch_size = 32         # batch size

# --- Learning-rate schedule ---
initial_learning_rate = 2.0        # learning rate for the initial phase of training
learning_rate_decay_factor = 0.5
num_epochs_per_decay = 8.0
num_examples_per_epoch = 586363    # number of examples per epoch of training data

# --- Inception fine-tuning ---
train_inception = False                  # whether to train the Inception submodel variables
train_inception_learning_rate = 0.0005   # learning rate when fine tuning Inception v3

# --- Checkpointing ---
max_checkpoints_to_keep = 5    # how many model checkpoints to keep

In [None]:
# Set up the learning rate.
# `learning_rate_decay_fn` stays None unless a decay schedule is configured.
learning_rate_decay_fn = None
if train_inception:
    # Fine-tuning the Inception submodel uses a fixed learning rate, no decay.
    learning_rate = tf.constant(train_inception_learning_rate)
else:
    learning_rate = tf.constant(initial_learning_rate)
    if learning_rate_decay_factor > 0:
        num_batches_per_epoch = num_examples_per_epoch / batch_size
        decay_steps = int(num_batches_per_epoch * num_epochs_per_decay)

        # Defined inside this branch because it relies on `decay_steps`, which
        # only exists when a positive decay factor is configured.
        def _learning_rate_decay_fn(learning_rate, global_step):
            """Return `learning_rate` decayed in staircase fashion every `decay_steps` steps."""
            return tf.train.exponential_decay(
                learning_rate,
                global_step,
                decay_steps=decay_steps,
                decay_rate=learning_rate_decay_factor,
                staircase=True)

        learning_rate_decay_fn = _learning_rate_decay_fn

In [None]:
# Build the training op: it optimizes `total_loss` with the configured
# optimizer, gradient clipping and learning-rate decay function.
train_op = tf.contrib.layers.optimize_loss(
    total_loss,
    global_step,
    learning_rate,
    optimizer,
    clip_gradients=clip_gradients,
    learning_rate_decay_fn=learning_rate_decay_fn)

In [None]:
# Saver for writing and restoring model checkpoints; it retains at most
# `max_checkpoints_to_keep` of them.
saver = tf.train.Saver(
    max_to_keep=max_checkpoints_to_keep)

In [None]:
# Run training.
log_every_n_steps = 1
number_of_steps = 1000000
# slim's training loop manages its own session and runs the variable
# initializer itself, so no explicit `tf.global_variables_initializer()` call
# is needed. Creating one after `train` returns would also try to add an op to
# a graph that the training loop has already finalized.
tf.contrib.slim.learning.train(
    train_op,
    train_dir,
    log_every_n_steps=log_every_n_steps,
    graph=sess.graph,
    global_step=global_step,
    number_of_steps=number_of_steps,
    saver=saver)

In [None]:
# Close the interactive session `sess` now that the notebook is done with it.
sess.close()