In [1]:
from keras import Model
from keras.applications import VGG16
from keras.callbacks import ModelCheckpoint, Callback, ReduceLROnPlateau
from keras.layers import Input, Dense, LSTM, Embedding, GRU, Flatten, Dropout, BatchNormalization, RepeatVector, concatenate
from keras.optimizers import RMSprop, Adam
from keras.preprocessing.sequence import pad_sequences
from keras.backend.tensorflow_backend import set_session
from nltk.translate.bleu_score import corpus_bleu
import matplotlib.pyplot as plt
from PIL import Image

import coco_parse
import flickr8k_parse
import numpy as np
import tensorflow as tf
import text_processing
import time

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Decoder

### Captions encoding

Before building decoder, it is necessary to encode captions into one-hot vectors which further would be used in embedding layer.

### COCO dataset

In [2]:
# captions_path = 'D:/coco/annotations/'
# images_path = 'D:/coco/images/'

# # parse JSON file with captions to get paths to images with captions
# val_filenames_with_captions = coco_parse.get_image_filename_with_caption(captions_path, images_path, 
#                                                                      train=False)
# val_filenames_with_all_captions = coco_parse.get_image_with_all_captions(val_filenames_with_captions)

# train_filenames_with_captions = coco_parse.get_image_filename_with_caption(captions_path, images_path, 
#                                                                      train=True)
# train_filenames_with_all_captions = coco_parse.get_image_with_all_captions(train_filenames_with_captions)

# ### Extract captions
# train_captions = coco_parse.make_list_of_captions(train_filenames_with_all_captions)
# val_captions = coco_parse.make_list_of_captions(val_filenames_with_all_captions)

### Flickr8k dataset

In [3]:
images_path = 'D:/Flickr8k/images/'
annotations_path = 'D:/Flickr8k/annotations/'
captions_file = 'D:/Flickr8k/annotations/Flickr8k.token.txt'
train_txt_path = 'D:/Flickr8k/annotations/Flickr_8k.trainImages.txt'
dev_txt_path = 'D:/Flickr8k/annotations/Flickr_8k.devImages.txt'
test_txt_path = 'D:/Flickr8k/annotations/Flickr_8k.testImages.txt'

filenames_with_all_captions = flickr8k_parse.generate_filenames_with_all_captions(captions_file, images_path)

train_filenames_with_all_captions = flickr8k_parse.generate_set(train_txt_path, filenames_with_all_captions, images_path)
val_filenames_with_all_captions = flickr8k_parse.generate_set(dev_txt_path, filenames_with_all_captions, images_path)
test_filenames_with_all_captions = flickr8k_parse.generate_set(test_txt_path, filenames_with_all_captions, images_path)

train_captions = flickr8k_parse.make_list_of_captions(train_filenames_with_all_captions)
val_captions = flickr8k_parse.make_list_of_captions(val_filenames_with_all_captions)


In [4]:
train_captions[0]

['A black dog is running after a white dog in the snow .',
 'Black dog chasing brown dog through snow',
 'Two dogs chase each other across the snowy ground .',
 'Two dogs play together in the snow .',
 'Two dogs running through a low lying body of water .']

In [5]:
len(train_captions)

6000

In [6]:
val_captions[0]

['the boy laying face down on a skateboard is being pushed along the ground by another boy .',
 'Two girls play on a skateboard in a courtyard .',
 'Two people play on a long skateboard .',
 'Two small children in red shirts playing on a skateboard .',
 'two young children on a skateboard going across a sidewalk']

In [7]:
### Preprocess captions
text_processing.preprocess_captions(val_captions)
text_processing.preprocess_captions(train_captions)

In [8]:
val_captions[0]

['the boy laying face down on a skateboard is being pushed along the ground by another boy ',
 'two girls play on a skateboard in a courtyard ',
 'two people play on a long skateboard ',
 'two small children in red shirts playing on a skateboard ',
 'two young children on a skateboard going across a sidewalk']

In [9]:
train_captions[0]

['a black dog is running after a white dog in the snow ',
 'black dog chasing brown dog through snow',
 'two dogs chase each other across the snowy ground ',
 'two dogs play together in the snow ',
 'two dogs running through a low lying body of water ']

In [10]:
### Add markers of captions' starts and ends
text_processing.add_start_and_end_to_captions(train_captions)
text_processing.add_start_and_end_to_captions(val_captions)

In [11]:
train_captions[0]

['<sos> a black dog is running after a white dog in the snow <eos>',
 '<sos> black dog chasing brown dog through snow <eos>',
 '<sos> two dogs chase each other across the snowy ground <eos>',
 '<sos> two dogs play together in the snow <eos>',
 '<sos> two dogs running through a low lying body of water <eos>']

In [12]:
val_captions[0]

['<sos> the boy laying face down on a skateboard is being pushed along the ground by another boy <eos>',
 '<sos> two girls play on a skateboard in a courtyard <eos>',
 '<sos> two people play on a long skateboard <eos>',
 '<sos> two small children in red shirts playing on a skateboard <eos>',
 '<sos> two young children on a skateboard going across a sidewalk <eos>']

In [13]:
### Create vocabulary from the training captions
train_vocab = text_processing.Vocabulary()
for caption_list in train_captions:
    for caption in caption_list:
        tmp_caption_list = caption.split()
        for word in tmp_caption_list:
            train_vocab.add_word(word)

In [14]:
train_vocab.save_vocabulary()

In [15]:
### Create transformed captions list - substitute words by their IDs from vocabulary
def tokenise_captions(set_captions, vocabulary):
    captions_tokens = [] 
    for captions in set_captions:
        tmp_captions_for_img = []
        for caption in captions:
            caption_words = caption.split()
            tmp = []
            for word in caption_words:
                if word in vocabulary.word_to_id:
                    tmp.append(vocabulary.get_id_by_word(word))
                else:
                    tmp.append(0)
            tmp_captions_for_img.append(tmp)
        captions_tokens.append(tmp_captions_for_img)
    return captions_tokens

train_captions_tokens = tokenise_captions(train_captions, train_vocab)
val_captions_tokens = tokenise_captions(val_captions, train_vocab)

In [16]:
train_captions_tokens[0]

[[1, 2, 3, 4, 5, 6, 7, 2, 8, 4, 9, 10, 11, 12],
 [1, 3, 4, 13, 14, 4, 15, 11, 12],
 [1, 16, 17, 18, 19, 20, 21, 10, 22, 23, 12],
 [1, 16, 17, 24, 25, 9, 10, 11, 12],
 [1, 16, 17, 6, 15, 2, 26, 27, 28, 29, 30, 12]]

In [17]:
val_captions_tokens[0]

[[1,
  10,
  50,
  1325,
  622,
  94,
  72,
  2,
  678,
  5,
  824,
  2294,
  67,
  10,
  23,
  42,
  265,
  50,
  12],
 [1, 16, 185, 24, 72, 2, 678, 9, 2, 3708, 12],
 [1, 16, 111, 24, 72, 2, 296, 678, 12],
 [1, 16, 280, 246, 9, 77, 1150, 40, 72, 2, 678, 12],
 [1, 16, 129, 246, 72, 2, 678, 852, 21, 2, 689, 12]]

In [18]:
train_captions[0]

['<sos> a black dog is running after a white dog in the snow <eos>',
 '<sos> black dog chasing brown dog through snow <eos>',
 '<sos> two dogs chase each other across the snowy ground <eos>',
 '<sos> two dogs play together in the snow <eos>',
 '<sos> two dogs running through a low lying body of water <eos>']

### Batch generation

In [19]:
def batch_one_hot_encode(batch, number_of_words):
    """ 
    Applies one-hot encoding to the input batch
    
    """
    batch_size = batch.shape[0]
    sentence_size = batch.shape[1]
    
    one_hot_batch = np.zeros((batch_size, sentence_size, number_of_words))
    
    for i in range(batch_size):
        for j in range(sentence_size):
            one_hot_batch[i, j, batch[i, j]] = 1
    return one_hot_batch

In [20]:
def generate_batch(transfer_values, captions_tokens, number_of_words, gru=True, max_length_lstm=40, batch_size=32):
    """
    Generate a batch of input-output data pairs:
        input_data = {
            transfer_values,
            input_tokens
        }
        
        output_data = {
            output_tokens
        }
        
     Parameters:
        -----------
        transfer_values: np.array
            Encoded images features
            
        captions: list
            list with all the captions
        
        
        batch_size: int
            The number of examples in a batch
        -----------
    """
    while True:
        number_of_images = len(captions_tokens)
        
        indices = np.random.randint(0, len(transfer_values), size=batch_size)

        captions_batch = []
        ### Randomly select one caption for each example index
        for ind in indices:
            num_captions = len(captions_tokens[ind])
            selected_caption = captions_tokens[ind][np.random.randint(0, num_captions - 1)]
            captions_batch.append(selected_caption)

        if not gru:
            captions_batch_padded = pad_sequences(captions_batch, 
                                              maxlen=max_length_lstm + 1, 
                                              padding='post', 
                                              value=0)
        else:
            ### Find the largest caption length and pad the remaining to be the same size
            max_caption_size = max([len(cap) for cap in captions_batch])
            captions_batch_padded = pad_sequences(captions_batch, 
                                              maxlen=max_caption_size, 
                                              padding='post', 
                                              value=0)
        ### Input tokens are the initial ones starting from index 1
        ### Output tokens are the initial ones shifted to the right
        input_tokens = captions_batch_padded[:, :-1]
        output_tokens = captions_batch_padded[:, 1:]

        output_tokens = batch_one_hot_encode(output_tokens, number_of_words)

        input_transfer_values = transfer_values[indices]

        input_data = {
            'encoder_input': input_transfer_values,
            'decoder_input': input_tokens
        }

        output_data = {
            'decoder_output': output_tokens
        }

        yield (input_data, output_data)

In [21]:
transfer_values = np.load('./cnn_features/vgg16_flickr8k_train.npy')
val_transfer_values = np.load('./cnn_features/vgg16_flickr8k_val.npy')

### Decoder NN

### GRU

In [22]:
batch_size = 64
steps_per_epoch = int(len(train_captions) / batch_size)
initial_state_size = 512
embedding_out_size = 512
number_of_gru = 2
batch_norm = False
dropout = False
gru = True
max_len = 40
path_checkpoint = './decoders/VGG16_GRU_flickr8k_2l_64b.hdf5'

In [23]:
### Encoder input part
encoder_input = Input(shape=(4096,), name='encoder_input')
encoder_reduction = Dense(initial_state_size, activation='relu', name='encoder_reduction')
if batch_norm:
    bn1 = BatchNormalization()
### For LSTM
if not gru:
    repeat = RepeatVector(max_len)
### Decoder input and embedding
if gru:
    decoder_input = Input(shape=(None,), name='decoder_input')
else:
    decoder_input = Input(shape=(40,), name='decoder_input')
embedding = Embedding(input_dim=train_vocab.number_of_words, output_dim=embedding_out_size, name='embedding')
if dropout:
    drop1 = Dropout(0.5)
### GRU1
if gru:
    gru1 = GRU(initial_state_size, name='GRU1', return_sequences=True)
else:
    lstm1 = LSTM(initial_state_size, name='LSTM1', return_sequences=True)
if batch_norm:
    bn2 = BatchNormalization()
### GRU2    
if number_of_gru >= 2:
    if gru:
        gru2 = GRU(initial_state_size, name='GRU2', return_sequences=True)
    else:
        lstm2 = LSTM(initial_state_size, name='LSTM2', return_sequences=True)
    if batch_norm:
        bn3 = BatchNormalization()
### GRU3        
if number_of_gru == 3:
    if gru:
        gru3 = GRU(initial_state_size, name='GRU3', return_sequences=True)
    else:
        lstm3 = LSTM(initial_state_size, name='LSTM3', return_sequences=True)
    if batch_norm:
        bn4 = BatchNormalization()

decoder_dense = Dense(train_vocab.number_of_words, activation='softmax', name='decoder_output')

def connect_transfer_values_gru(transfer_values):
    
    initial_state = encoder_reduction(transfer_values)
    if batch_norm:
        initial_state = bn1(initial_state)

    X = decoder_input
    X = embedding(X)
    if dropout:
        X = drop1(X)
    
    X = gru1(X, initial_state=initial_state)
    if batch_norm:
        X = bn2(X)
    if number_of_gru >= 2:
        X = gru2(X, initial_state=initial_state)
        if batch_norm:
            X = bn3(X)
    if number_of_gru == 3:
        X = gru3(X, initial_state=initial_state)
        if batch_norm:
            X = bn4(X)

    decoder_output = decoder_dense(X)
    
    return decoder_output

def connect_transfer_values_lstm(transfer_values):
    initial_state = encoder_reduction(transfer_values)
    if batch_norm:
        initial_state = bn1(initial_state)
    initial_state = repeat(initial_state)
    
    X = decoder_input
    X = embedding(X)
    if dropout:
        X = drop1(X)
        
    X = concatenate([initial_state, X])
    
    X = lstm1(X)
    if batch_norm:
        X = bn2(X)
    if number_of_gru >= 2:
        X = lstm2(X)
        if batch_norm:
            X = bn3(X)
    if number_of_gru == 3:
        X = lstm3(X)
        if batch_norm:
            X = bn4(X)
    
    decoder_output = decoder_dense(X)
    
    return decoder_output

In [24]:
optimizer = RMSprop(lr=1e-3)

Instructions for updating:
Colocations handled automatically by placer.


In [25]:
if gru:
    decoder_output = connect_transfer_values_gru(transfer_values=encoder_input)
else:
    decoder_output = connect_transfer_values_lstm(transfer_values=encoder_input)

In [26]:
decoder_model = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_output])

In [27]:
decoder_model.compile(optimizer=optimizer,
                      loss='categorical_crossentropy')

In [28]:
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
encoder_input (InputLayer)      (None, 4096)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 512)    3774976     decoder_input[0][0]              
__________________________________________________________________________________________________
encoder_reduction (Dense)       (None, 512)          2097664     encoder_input[0][0]              
__________________________________________________________________________________________________
GRU1 (GRU)

In [29]:
generator = generate_batch(transfer_values, train_captions_tokens, number_of_words=train_vocab.number_of_words, batch_size=batch_size, gru=False)
val_generator = generate_batch(val_transfer_values, val_captions_tokens, number_of_words=train_vocab.number_of_words, batch_size=batch_size, gru=False)

In [30]:
" ".join([x for x in [train_vocab.get_word_by_id(word) for x in next(generator)[0]['decoder_input'] for word in x if word != 0]])

'<sos> the two boys are wearing ice skates and hockey equipment <eos> <sos> a man looks at the city <eos> <sos> people camp with the mountains in the background <eos> <sos> basketball players try to block a ball from going into the goal <eos> <sos> a young girl in a blue and white cheerleading costume holds her right arm up while her other hand is on her hip <eos> <sos> a young boy makes a splash in the water near the rocks <eos> <sos> a little girl in a pink and white flowered dress and blue sweater swinging <eos> <sos> a lone swinger on a swing ride at the fair <eos> <sos> a boy holds a green apple in his mouth <eos> <sos> a boy jumps <eos> <sos> a wrestler is ready to jump on another wrestler outside the ring <eos> <sos> a baby hangs off an adult s back while laughing <eos> <sos> a woman wearing a headscarf is near many tulips <eos> <sos> a female with glasses a brown shirt and a backpack <eos> <sos> two brown dogs with blue collars are running in the grass <eos> <sos> four small do

### Checkpoints

During the training process, it is a good idea to save the weights periodically.

In [31]:
try:
    os.mkdir('./decoders/')
except:
    print('The folder already exists')

checkpoints = ModelCheckpoint(path_checkpoint, verbose=1, save_weights_only=True, save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                              patience=2, verbose=1, min_lr=0.00001)

The folder already exists


In [32]:

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

In [33]:
# try:
#     decoder_model.load_weights(path_checkpoint)
# except:
#     print("Error while loading weights")

In [34]:
start = time.time()
history = decoder_model.fit_generator(generator=generator,
                            steps_per_epoch=steps_per_epoch,
                            epochs=20,
                            callbacks=[checkpoints, reduce_lr],
                            validation_data=val_generator,
                            validation_steps=5)
time_train = time.time() - start

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/20

Epoch 00001: val_loss improved from inf to 1.44233, saving model to ./decoders/VGG16_GRU_flickr8k_2l_64b.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 1.44233 to 1.20751, saving model to ./decoders/VGG16_GRU_flickr8k_2l_64b.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 1.20751 to 1.07715, saving model to ./decoders/VGG16_GRU_flickr8k_2l_64b.hdf5
Epoch 4/20



Epoch 00004: val_loss did not improve from 1.07715
Epoch 5/20

Epoch 00005: val_loss improved from 1.07715 to 1.04040, saving model to ./decoders/VGG16_GRU_flickr8k_2l_64b.hdf5
Epoch 6/20

Epoch 00006: val_loss did not improve from 1.04040
Epoch 7/20



Epoch 00007: val_loss did not improve from 1.04040

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 8/20

Epoch 00008: val_loss did not improve from 1.04040
Epoch 9/20

Epoch 00009: val_loss improved from 1.04040 to 1.03916, saving model to ./decoders/VGG16_GRU_flickr8k_2l_64b.hdf5
Epoch 10/20



Epoch 00010: val_loss did not improve from 1.03916
Epoch 11/20

Epoch 00011: val_loss did not improve from 1.03916

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 12/20

Epoch 00012: val_loss did not improve from 1.03916
Epoch 13/20



Epoch 00013: val_loss improved from 1.03916 to 1.02189, saving model to ./decoders/VGG16_GRU_flickr8k_2l_64b.hdf5
Epoch 14/20

Epoch 00014: val_loss improved from 1.02189 to 1.00262, saving model to ./decoders/VGG16_GRU_flickr8k_2l_64b.hdf5
Epoch 15/20

Epoch 00015: val_loss did not improve from 1.00262
Epoch 16/20



Epoch 00016: val_loss did not improve from 1.00262

Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 17/20

Epoch 00017: val_loss improved from 1.00262 to 0.99978, saving model to ./decoders/VGG16_GRU_flickr8k_2l_64b.hdf5
Epoch 18/20

Epoch 00018: val_loss did not improve from 0.99978
Epoch 19/20



Epoch 00019: val_loss did not improve from 0.99978

Epoch 00019: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 20/20

Epoch 00020: val_loss did not improve from 0.99978


In [35]:
print("Time for training: {} seconds".format(time_train))

Time for training: 2514.081176996231 seconds


In [36]:
his

NameError: name 'his' is not defined