In [1]:
from keras import Model
from keras.applications import VGG16
from keras.callbacks import ModelCheckpoint, Callback, ReduceLROnPlateau
from keras.layers import Input, Dense, LSTM, Embedding, GRU, Flatten, Dropout, BatchNormalization, RepeatVector, concatenate
from keras.optimizers import RMSprop, Adam
from keras.preprocessing.sequence import pad_sequences
from keras.backend.tensorflow_backend import set_session
from nltk.translate.bleu_score import corpus_bleu
import matplotlib.pyplot as plt
import os
from PIL import Image

import coco_parse
import flickr8k_parse
import numpy as np
import tensorflow as tf
import text_processing
import time
import utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Decoder

### Captions encoding

Before building decoder, it is necessary to encode captions into one-hot vectors which further would be used in embedding layer.

### COCO dataset

In [2]:
# captions_path = 'D:/coco/annotations/'
# images_path = 'D:/coco/images/'

# # parse JSON file with captions to get paths to images with captions
# val_filenames_with_captions = coco_parse.get_image_filename_with_caption(captions_path, images_path, 
#                                                                      train=False)
# val_filenames_with_all_captions = coco_parse.get_image_with_all_captions(val_filenames_with_captions)

# train_filenames_with_captions = coco_parse.get_image_filename_with_caption(captions_path, images_path, 
#                                                                      train=True)
# train_filenames_with_all_captions = coco_parse.get_image_with_all_captions(train_filenames_with_captions)

# ### Extract captions
# train_captions = coco_parse.make_list_of_captions(train_filenames_with_all_captions)
# val_captions = coco_parse.make_list_of_captions(val_filenames_with_all_captions)

### Flickr8k dataset

In [3]:
images_path = 'D:/Flickr8k/images/'
annotations_path = 'D:/Flickr8k/annotations/'
captions_file = 'D:/Flickr8k/annotations/Flickr8k.token.txt'
train_txt_path = 'D:/Flickr8k/annotations/Flickr_8k.trainImages.txt'
dev_txt_path = 'D:/Flickr8k/annotations/Flickr_8k.devImages.txt'
test_txt_path = 'D:/Flickr8k/annotations/Flickr_8k.testImages.txt'

filenames_with_all_captions = flickr8k_parse.generate_filenames_with_all_captions(captions_file, images_path)

train_filenames_with_all_captions = flickr8k_parse.generate_set(train_txt_path, filenames_with_all_captions, images_path)
val_filenames_with_all_captions = flickr8k_parse.generate_set(dev_txt_path, filenames_with_all_captions, images_path)
test_filenames_with_all_captions = flickr8k_parse.generate_set(test_txt_path, filenames_with_all_captions, images_path)

train_captions = flickr8k_parse.make_list_of_captions(train_filenames_with_all_captions)
val_captions = flickr8k_parse.make_list_of_captions(val_filenames_with_all_captions)


In [7]:
### Preprocess captions
text_processing.preprocess_captions(val_captions)
text_processing.preprocess_captions(train_captions)

In [10]:
### Add markers of captions' starts and ends
text_processing.add_start_and_end_to_captions(train_captions)
text_processing.add_start_and_end_to_captions(val_captions)

In [13]:
### Create vocabulary from the training captions
train_vocab = text_processing.Vocabulary()
for caption_list in train_captions:
    for caption in caption_list:
        tmp_caption_list = caption.split()
        for word in tmp_caption_list:
            train_vocab.add_word(word)

In [14]:
train_vocab.save_vocabulary()

In [15]:
### Create transformed captions list - substitute words by their IDs from vocabulary
def tokenise_captions(set_captions, vocabulary):
    captions_tokens = [] 
    for captions in set_captions:
        tmp_captions_for_img = []
        for caption in captions:
            caption_words = caption.split()
            tmp = []
            for word in caption_words:
                if word in vocabulary.word_to_id:
                    tmp.append(vocabulary.get_id_by_word(word))
                else:
                    tmp.append(0)
            tmp_captions_for_img.append(tmp)
        captions_tokens.append(tmp_captions_for_img)
    return captions_tokens

train_captions_tokens = tokenise_captions(train_captions, train_vocab)
val_captions_tokens = tokenise_captions(val_captions, train_vocab)

In [16]:
train_captions_tokens[0]

[[1, 2, 3, 4, 5, 6, 7, 2, 8, 4, 9, 10, 11, 12],
 [1, 3, 4, 13, 14, 4, 15, 11, 12],
 [1, 16, 17, 18, 19, 20, 21, 10, 22, 23, 12],
 [1, 16, 17, 24, 25, 9, 10, 11, 12],
 [1, 16, 17, 6, 15, 2, 26, 27, 28, 29, 30, 12]]

In [17]:
val_captions_tokens[0]

[[1,
  10,
  50,
  1325,
  622,
  94,
  72,
  2,
  678,
  5,
  824,
  2294,
  67,
  10,
  23,
  42,
  265,
  50,
  12],
 [1, 16, 185, 24, 72, 2, 678, 9, 2, 3708, 12],
 [1, 16, 111, 24, 72, 2, 296, 678, 12],
 [1, 16, 280, 246, 9, 77, 1150, 40, 72, 2, 678, 12],
 [1, 16, 129, 246, 72, 2, 678, 852, 21, 2, 689, 12]]

In [18]:
train_captions[0]

['<sos> a black dog is running after a white dog in the snow <eos>',
 '<sos> black dog chasing brown dog through snow <eos>',
 '<sos> two dogs chase each other across the snowy ground <eos>',
 '<sos> two dogs play together in the snow <eos>',
 '<sos> two dogs running through a low lying body of water <eos>']

### Batch generation

In [19]:
def batch_one_hot_encode(batch, number_of_words):
    """ 
    Applies one-hot encoding to the input batch
    
    """
    batch_size = batch.shape[0]
    sentence_size = batch.shape[1]
    
    one_hot_batch = np.zeros((batch_size, sentence_size, number_of_words))
    
    for i in range(batch_size):
        for j in range(sentence_size):
            one_hot_batch[i, j, batch[i, j]] = 1
    return one_hot_batch

In [20]:
def generate_batch(transfer_values, captions_tokens, number_of_words, gru=True, max_length_lstm=40, batch_size=32):
    """
    Generate a batch of input-output data pairs:
        input_data = {
            transfer_values,
            input_tokens
        }
        
        output_data = {
            output_tokens
        }
        
     Parameters:
        -----------
        transfer_values: np.array
            Encoded images features
            
        captions: list
            list with all the captions
        
        
        batch_size: int
            The number of examples in a batch
        -----------
    """
    while True:
        number_of_images = len(captions_tokens)
        
        indices = np.random.randint(0, len(transfer_values), size=batch_size)

        captions_batch = []
        ### Randomly select one caption for each example index
        for ind in indices:
            num_captions = len(captions_tokens[ind])
            selected_caption = captions_tokens[ind][np.random.randint(0, num_captions - 1)]
            captions_batch.append(selected_caption)

        if not gru:
            captions_batch_padded = pad_sequences(captions_batch, 
                                              maxlen=max_length_lstm + 1, 
                                              padding='post', 
                                              value=0)
        else:
            ### Find the largest caption length and pad the remaining to be the same size
            max_caption_size = max([len(cap) for cap in captions_batch])
            captions_batch_padded = pad_sequences(captions_batch, 
                                              maxlen=max_caption_size, 
                                              padding='post', 
                                              value=0)
        ### Input tokens are the initial ones starting from index 1
        ### Output tokens are the initial ones shifted to the right
        input_tokens = captions_batch_padded[:, :-1]
        output_tokens = captions_batch_padded[:, 1:]

        output_tokens = batch_one_hot_encode(output_tokens, number_of_words)

        input_transfer_values = transfer_values[indices]

        input_data = {
            'encoder_input': input_transfer_values,
            'decoder_input': input_tokens
        }

        output_data = {
            'decoder_output': output_tokens
        }

        yield (input_data, output_data)

In [21]:
transfer_values = np.load('./cnn_features/vgg16_flickr8k_train.npy')
val_transfer_values = np.load('./cnn_features/vgg16_flickr8k_val.npy')

### Decoder NN

### GRU

In [38]:
batch_size = 64
steps_per_epoch = int(len(train_captions) / batch_size)
initial_state_size = 512
embedding_out_size = 512
number_of_layers = 3
batch_norm = False
dropout = False
gru = True
max_len = 40
path_checkpoint = utils.generate_weights_path(gru, 'flickr8k', number_of_layers, batch_size, batch_norm, dropout)
model_path = utils.generate_model_path(gru, number_of_layers, batch_norm, dropout)

In [39]:
path_checkpoint

'./weights/VGG_True_flickr8k_3l_64b.hdf5'

In [40]:
model_path

'./models/VGG16_GRU_3l.json'

In [91]:
### Encoder input part
encoder_input = Input(shape=(4096,), name='encoder_input')
encoder_reduction = Dense(initial_state_size, activation='relu', name='encoder_reduction')
if batch_norm:
    bn1 = BatchNormalization()
### For LSTM
if not gru:
    repeat = RepeatVector(max_len)
### Decoder input and embedding
if gru:
    decoder_input = Input(shape=(None,), name='decoder_input')
else:
    decoder_input = Input(shape=(40,), name='decoder_input')
embedding = Embedding(input_dim=train_vocab.number_of_words, output_dim=embedding_out_size, name='embedding')
if dropout:
    drop1 = Dropout(0.5)
### GRU1
if gru:
    gru1 = GRU(initial_state_size, name='GRU1', return_sequences=True)
else:
    lstm1 = LSTM(initial_state_size, name='LSTM1', return_sequences=True)
if batch_norm:
    bn2 = BatchNormalization()
### GRU2    
if number_of_layers >= 2:
    if gru:
        gru2 = GRU(initial_state_size, name='GRU2', return_sequences=True)
    else:
        lstm2 = LSTM(initial_state_size, name='LSTM2', return_sequences=True)
    if batch_norm:
        bn3 = BatchNormalization()
### GRU3        
if number_of_layers == 3:
    if gru:
        gru3 = GRU(initial_state_size, name='GRU3', return_sequences=True)
    else:
        lstm3 = LSTM(initial_state_size, name='LSTM3', return_sequences=True)
    if batch_norm:
        bn4 = BatchNormalization()

decoder_dense = Dense(train_vocab.number_of_words, activation='softmax', name='decoder_output')

def connect_transfer_values_gru(transfer_values):
    
    initial_state = encoder_reduction(transfer_values)
    if batch_norm:
        initial_state = bn1(initial_state)

    X = decoder_input
    X = embedding(X)
    if dropout:
        X = drop1(X)
    
    X = gru1(X, initial_state=initial_state)
    if batch_norm:
        X = bn2(X)
    if number_of_layers >= 2:
        X = gru2(X, initial_state=initial_state)
        if batch_norm:
            X = bn3(X)
    if number_of_layers == 3:
        X = gru3(X, initial_state=initial_state)
        if batch_norm:
            X = bn4(X)

    decoder_output = decoder_dense(X)
    
    return decoder_output

def connect_transfer_values_lstm(transfer_values):
    initial_state = encoder_reduction(transfer_values)
    if batch_norm:
        initial_state = bn1(initial_state)
    initial_state = repeat(initial_state)
    
    X = decoder_input
    X = embedding(X)
    if dropout:
        X = drop1(X)
        
    X = concatenate([initial_state, X])
    
    X = lstm1(X)
    if batch_norm:
        X = bn2(X)
    if number_of_layers >= 2:
        X = lstm2(X)
        if batch_norm:
            X = bn3(X)
    if number_of_layers == 3:
        X = lstm3(X)
        if batch_norm:
            X = bn4(X)
    
    decoder_output = decoder_dense(X)
    
    return decoder_output

In [92]:
optimizer = RMSprop(lr=1e-3)

In [93]:
if gru:
    decoder_output = connect_transfer_values_gru(transfer_values=encoder_input)
else:
    decoder_output = connect_transfer_values_lstm(transfer_values=encoder_input)

In [94]:
decoder_model = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_output])

In [95]:
decoder_model.compile(optimizer=optimizer,
                      loss='categorical_crossentropy')

In [96]:
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
encoder_input (InputLayer)      (None, 4096)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 512)    3774976     decoder_input[0][0]              
__________________________________________________________________________________________________
encoder_reduction (Dense)       (None, 512)          2097664     encoder_input[0][0]              
__________________________________________________________________________________________________
GRU1 (GRU)

In [97]:
model_json = decoder_model.to_json()
try:
    os.mkdir('./models')
except:
    print('The folder already exists')
with open(model_path, "w") as json_file:
    json_file.write(model_json)

  '. They will not be included '
  '. They will not be included '
  '. They will not be included '


The folder already exists


In [49]:
generator = generate_batch(transfer_values, train_captions_tokens, number_of_words=train_vocab.number_of_words, batch_size=batch_size, gru=False)
val_generator = generate_batch(val_transfer_values, val_captions_tokens, number_of_words=train_vocab.number_of_words, batch_size=batch_size, gru=False)

In [50]:
" ".join([x for x in [train_vocab.get_word_by_id(word) for x in next(generator)[0]['decoder_input'] for word in x if word != 0]])

'<sos> a young boy and girl hug and smile <eos> <sos> a person is surfing a big wave <eos> <sos> a biker flies through the air near sand dunes <eos> <sos> a man catches a huge wave on his surfboard <eos> <sos> several people are looking out from a building that is under construction <eos> <sos> a woman wearing a blue shirt and high heels stands on the sidewalk next to a man <eos> <sos> a woman smokes a cigarette and looks at a magazine in the middle of a snowy road <eos> <sos> a group of people wearing costumes jump as they walk down the street <eos> <sos> children smiling with facepaintings <eos> <sos> two boys one naked one in a skirt run on the sand <eos> <sos> a child in green winter clothes is holding his or her hand up while two other children look at him or her <eos> <sos> the man in the striped shirt and the man in the hat are posing for the camera <eos> <sos> a man sits watching a waterfall <eos> <sos> a boy wears plastic toy teeth and green plastic toy glasses which are much 

### Checkpoints

During the training process, it is a good idea to save the weights periodically.

In [51]:
try:
    os.mkdir('./decoders/')
except:
    print('The folder already exists')

checkpoints = ModelCheckpoint(path_checkpoint, verbose=1, save_weights_only=True, save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                              patience=2, verbose=1, min_lr=0.00001)

The folder already exists


In [52]:

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

In [53]:
# try:
#     decoder_model.load_weights(path_checkpoint)
# except:
#     print("Error while loading weights")

In [54]:
start = time.time()
history = decoder_model.fit_generator(generator=generator,
                            steps_per_epoch=steps_per_epoch,
                            epochs=20,
                            callbacks=[checkpoints, reduce_lr],
                            validation_data=val_generator,
                            validation_steps=5)
time_train = time.time() - start

RuntimeError: You must compile your model before using it.

In [35]:
print("Time for training: {} seconds".format(time_train))

Time for training: 2987.9345643520355 seconds


In [36]:
his

NameError: name 'his' is not defined