In [14]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from keras import Model
from keras.applications import VGG16
from keras.callbacks import ModelCheckpoint, Callback
from keras.layers import Input, Dense, LSTM, Embedding, GRU, Flatten
from keras.optimizers import RMSprop
from keras.preprocessing.sequence import pad_sequences
from PIL import Image

import coco_parse
import text_processing

# Decoder

### Captions encoding

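Before building the decoder, the captions must be encoded numerically: every word is replaced by its integer ID from the vocabulary (these IDs are what the embedding layer consumes), and the target words are additionally one-hot encoded to match the decoder's softmax output.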

In [15]:
# Locations of the COCO annotation files and of the images on the local disk.
captions_path = 'D:/coco/annotations/'
images_path = 'D:/coco/images/'

# Validation split: parse the captions JSON to pair image paths with captions,
# then collect all captions that belong to the same image.
val_filenames_with_captions = coco_parse.get_image_filename_with_caption(
    captions_path,
    images_path,
    train=False,
)
val_filenames_with_all_captions = coco_parse.get_image_with_all_captions(
    val_filenames_with_captions
)

# Training split: the same two steps with train=True.
train_filenames_with_captions = coco_parse.get_image_filename_with_caption(
    captions_path,
    images_path,
    train=True,
)
train_filenames_with_all_captions = coco_parse.get_image_with_all_captions(
    train_filenames_with_captions
)

### Extract captions
# Each entry of the resulting lists is itself a list of captions for one image.
train_captions = coco_parse.make_list_of_captions(train_filenames_with_all_captions)
val_captions = coco_parse.make_list_of_captions(val_filenames_with_all_captions)

In [16]:
train_captions[0]

['A bicycle replica with a clock as the front wheel.',
 'The bike has a clock as a tire.',
 'A black metal bicycle with a clock inside the front wheel.',
 'A bicycle figurine in which the front wheel is replaced with a clock\n',
 'A clock with the appearance of the wheel of a bicycle ']

In [17]:
val_captions[0]

['A black Honda motorcycle parked in front of a garage.',
 'A Honda motorcycle parked in a grass driveway',
 'A black Honda motorcycle with a dark burgundy seat.',
 'Ma motorcycle parked on the gravel in front of a garage',
 'A motorcycle with its brake extended standing outside']

In [18]:
### Preprocess captions
# Both calls change the caption lists in place; their return values are not used.
# NOTE(review): judging by the samples displayed in the next cells, punctuation and
# trailing newlines end up replaced by spaces - confirm in text_processing.
text_processing.preprocess_captions(val_captions)
text_processing.preprocess_captions(train_captions)

In [19]:
val_captions[0]

['A black Honda motorcycle parked in front of a garage ',
 'A Honda motorcycle parked in a grass driveway',
 'A black Honda motorcycle with a dark burgundy seat ',
 'Ma motorcycle parked on the gravel in front of a garage',
 'A motorcycle with its brake extended standing outside']

In [20]:
train_captions[0]

['A bicycle replica with a clock as the front wheel ',
 'The bike has a clock as a tire ',
 'A black metal bicycle with a clock inside the front wheel ',
 'A bicycle figurine in which the front wheel is replaced with a clock ',
 'A clock with the appearance of the wheel of a bicycle ']

In [21]:
### Add markers of captions' starts and ends
# Both calls change the caption lists in place. The samples displayed in the next
# cells show each caption lower-cased and wrapped as '<sos> ... <eos>'.
# NOTE(review): re-running this cell presumably wraps the captions a second time -
# verify in text_processing before executing it twice on the same kernel.
text_processing.add_start_and_end_to_captions(train_captions)
text_processing.add_start_and_end_to_captions(val_captions)

In [22]:
train_captions[0]

['<sos> a bicycle replica with a clock as the front wheel <eos>',
 '<sos> the bike has a clock as a tire <eos>',
 '<sos> a black metal bicycle with a clock inside the front wheel <eos>',
 '<sos> a bicycle figurine in which the front wheel is replaced with a clock <eos>',
 '<sos> a clock with the appearance of the wheel of a bicycle <eos>']

In [23]:
val_captions[0]

['<sos> a black honda motorcycle parked in front of a garage <eos>',
 '<sos> a honda motorcycle parked in a grass driveway <eos>',
 '<sos> a black honda motorcycle with a dark burgundy seat <eos>',
 '<sos> ma motorcycle parked on the gravel in front of a garage <eos>',
 '<sos> a motorcycle with its brake extended standing outside <eos>']

In [24]:
### Create vocabulary from the training captions
# Every whitespace-separated token of every training caption is registered,
# including the '<sos>' / '<eos>' markers added earlier.
train_vocab = text_processing.Vocabulary()
for image_captions in train_captions:
    for sentence in image_captions:
        for token in sentence.split():
            train_vocab.add_word(token)

In [25]:
# Persist the vocabulary built from the training captions.
# NOTE(review): the destination is chosen inside text_processing.Vocabulary - confirm where it is written.
train_vocab.save_vocabulary()

In [26]:
### Create transformed captions list - substitute words by their IDs from vocabulary
# Same nesting as train_captions: images -> captions of that image -> token IDs.
train_captions_tokens = [
    [
        [train_vocab.get_id_by_word(token) for token in sentence.split()]
        for sentence in image_captions
    ]
    for image_captions in train_captions
]

In [27]:
train_captions_tokens[0]

[[1, 2, 3, 4, 5, 2, 6, 7, 8, 9, 10, 11],
 [1, 8, 12, 13, 2, 6, 7, 2, 14, 11],
 [1, 2, 15, 16, 3, 5, 2, 6, 17, 8, 9, 10, 11],
 [1, 2, 3, 18, 19, 20, 8, 9, 10, 21, 22, 5, 2, 6, 11],
 [1, 2, 6, 5, 8, 23, 24, 8, 10, 24, 2, 3, 11]]

In [28]:
train_captions[0]

['<sos> a bicycle replica with a clock as the front wheel <eos>',
 '<sos> the bike has a clock as a tire <eos>',
 '<sos> a black metal bicycle with a clock inside the front wheel <eos>',
 '<sos> a bicycle figurine in which the front wheel is replaced with a clock <eos>',
 '<sos> a clock with the appearance of the wheel of a bicycle <eos>']

### Batch generation

In [29]:
def batch_one_hot_encode(batch, number_of_words):
    """
    Applies one-hot encoding to the input batch

    Parameters:
        -----------
        batch: array-like of int, shape (batch_size, sentence_size)
            Token IDs. Every ID is used as an index along the last axis of
            the result, so it must be smaller than number_of_words.

        number_of_words: int
            Length of the one-hot axis

        -----------
    Returns:
        np.array of shape (batch_size, sentence_size, number_of_words) filled
        with zeros except for a single 1 per (example, position) pair
    """
    # Accept plain nested lists as well as arrays
    batch = np.asarray(batch)
    batch_size = batch.shape[0]
    sentence_size = batch.shape[1]

    one_hot_batch = np.zeros((batch_size, sentence_size, number_of_words))

    # Vectorised equivalent of looping over every (i, j) and setting
    # one_hot_batch[i, j, batch[i, j]] = 1: the row and column index grids
    # broadcast against `batch`, so all positions are written in one assignment
    example_idx = np.arange(batch_size)[:, np.newaxis]
    position_idx = np.arange(sentence_size)[np.newaxis, :]
    one_hot_batch[example_idx, position_idx, batch] = 1
    return one_hot_batch

In [63]:
def generate_batch(transfer_values, captions_tokens, number_of_words, batch_size=32):
    """
    Endlessly generate batches of input-output data pairs:
        input_data = {
            'encoder_input': transfer_values of the selected images,
            'decoder_input': padded token IDs without the last position
        }

        output_data = {
            'decoder_output': one-hot encoded token IDs without the first position
        }

     Parameters:
        -----------
        transfer_values: np.array
            Encoded images features, indexed by image along the first axis

        captions_tokens: list
            For every image, a list of its captions; every caption is a list
            of token IDs

        number_of_words: int
            Vocabulary size, used as the length of the one-hot axis

        batch_size: int
            The number of examples in a batch
        -----------

     Yields:
        (input_data, output_data) tuples as described above

     Raises:
        ValueError if captions_tokens is empty
    """
    number_of_images = len(captions_tokens)
    if number_of_images == 0:
        raise ValueError('captions_tokens must contain at least one image')

    start = 0
    while True:
        ### Consecutive image indices; the modulo wraps around at the end of the
        ### data so the generator can be consumed for any number of steps
        indices = np.arange(start, start + batch_size) % number_of_images
        start = (start + batch_size) % number_of_images

        captions_batch = []
        ### Randomly select one caption for each example index
        for ind in indices:
            image_captions = captions_tokens[ind]
            selected_caption = image_captions[np.random.randint(0, len(image_captions))]
            captions_batch.append(selected_caption)

        ### Find the largest caption length and pad the remaining to be the same size
        max_caption_size = max(len(cap) for cap in captions_batch)

        captions_batch_padded = pad_sequences(captions_batch,
                                              maxlen=max_caption_size,
                                              padding='post',
                                              value=0)

        ### Input tokens are all positions except the last one
        ### Output tokens are the same sequences shifted one step to the left,
        ### so the target at position t is the token at position t + 1
        input_tokens = captions_batch_padded[:, :-1]
        output_tokens = captions_batch_padded[:, 1:]

        output_tokens = batch_one_hot_encode(output_tokens, number_of_words)

        input_transfer_values = transfer_values[indices]

        input_data = {
            'encoder_input': input_transfer_values,
            'decoder_input': input_tokens
        }

        output_data = {
            'decoder_output': output_tokens
        }

        yield (input_data, output_data)

In [64]:
# The image features have to be in memory before the generator is created;
# loading them here keeps this cell runnable on a fresh kernel (Restart & Run All).
transfer_values = np.load('./cnn_features/vgg16_train.npy')

# Generator of 16-example batches used below to inspect one batch.
generator = generate_batch(transfer_values, train_captions_tokens, number_of_words=train_vocab.number_of_words, batch_size=16)

In [65]:
# Pull one batch from the generator and decode its decoder inputs back to words,
# skipping the padding ID 0, to check the batches by eye.
" ".join(
    train_vocab.get_word_by_id(token_id)
    for caption_row in next(generator)[0]['decoder_input']
    for token_id in caption_row
    if token_id != 0
)

'<sos> a bicycle figurine in which the front wheel is replaced with a clock <eos> <sos> a blue boat themed bathroom with a life preserver on the wall <eos> <sos> city street with parked cars and a bench <eos> <sos> there is a gol plane taking off in a partly cloudy sky <eos> <sos> a toilet in a circular matte glass stall <eos> <sos> a picture of a modern looking kitchen area <eos> <sos> a messy bathroom countertop perched atop black cabinetry <eos> <sos> an open box contains an unknown purple object <eos> <sos> an old green car parked on the side of the street <eos> <sos> this is a kitchen with dishes and a silver sink <eos> <sos> a group of motorcycle riders driving past buildings <eos> <sos> a cat drinking water from a toilet in a bathroom <eos> <sos> a man in a wheelchair and another sitting on a bench that is overlooking the water <sos> a man sits next to his horse in an alley in costume <eos> <sos> a series of motorbikes parked in a row on a street <eos> <sos> a small kitten is si

### Decoder NN

In [66]:
# Size of the reduced image representation; also the number of units of every GRU,
# because that representation is passed to the GRUs as their initial state.
initial_state_size = 512
# Dimensionality of the word embeddings.
embedding_out_size = 512

# Image branch: a 4096-long feature vector per image is squeezed to initial_state_size.
# NOTE(review): 4096 presumably matches the feature vectors stored in transfer_values - confirm.
encoder_input = Input(shape=(4096,), name='encoder_input')
encoder_reduction = Dense(
    initial_state_size,
    activation='tanh',
    name='encoder_reduction',
)

# Text branch: variable-length sequences of token IDs are mapped to embeddings.
decoder_input = Input(shape=(None,), name='decoder_input')
embedding = Embedding(
    input_dim=train_vocab.number_of_words,
    output_dim=embedding_out_size,
    name='embedding',
)

# Three recurrent layers, each returning its full output sequence.
gru1 = GRU(initial_state_size, name='GRU1', return_sequences=True)
gru2 = GRU(initial_state_size, name='GRU2', return_sequences=True)
gru3 = GRU(initial_state_size, name='GRU3', return_sequences=True)

flatten = Flatten()

# Softmax over the vocabulary for every position of the sequence.
decoder_dense = Dense(
    train_vocab.number_of_words,
    activation='softmax',
    name='decoder_output',
)

In [67]:
def connect_transfer_values(transfer_values):
    """
    Wire the decoder layers defined at notebook level into one graph.

    Parameters:
        -----------
        transfer_values: tensor
            Image features; they are passed through encoder_reduction and the
            result is used as the initial state of every GRU layer
        -----------

    Returns:
        The output of decoder_dense applied to the last GRU's output sequence
    """
    initial_state = encoder_reduction(transfer_values)

    # The token IDs always come from the notebook-level decoder_input
    sequence = embedding(decoder_input)

    # All three GRUs are seeded with the same reduced image representation
    for recurrent_layer in (gru1, gru2, gru3):
        sequence = recurrent_layer(sequence, initial_state=initial_state)

    return decoder_dense(sequence)

In [68]:
# Load the precomputed image features from disk.
# NOTE(review): the file name suggests VGG16 features of the training images,
# ordered like train_captions_tokens - confirm against the script that wrote it.
transfer_values = np.load('./cnn_features/vgg16_train.npy')

In [69]:
# RMSprop with a learning rate of 1e-3.
# NOTE(review): newer Keras releases name this argument `learning_rate` - confirm
# against the installed version before upgrading.
optimizer = RMSprop(lr=1e-3)

Instructions for updating:
Colocations handled automatically by placer.


In [70]:
# Build the decoder's output tensor, feeding the symbolic encoder_input as the image features.
decoder_output = connect_transfer_values(transfer_values=encoder_input)

In [71]:
# Two-input model: image features and token IDs in, per-position word probabilities out.
decoder_model = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_output])

In [72]:
# categorical_crossentropy matches the one-hot encoded targets that generate_batch
# yields under the 'decoder_output' key.
decoder_model.compile(optimizer=optimizer,
                      loss='categorical_crossentropy')

In [73]:
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
encoder_input (InputLayer)      (None, 4096)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 512)    13748224    decoder_input[0][0]              
__________________________________________________________________________________________________
encoder_reduction (Dense)       (None, 512)          2097664     encoder_input[0][0]              
__________________________________________________________________________________________________
GRU1 (GRU)

In [76]:
# Re-create the generator so that training starts from the first batch again
# (an earlier cell pulled a batch from the previous generator for inspection).
generator = generate_batch(transfer_values, train_captions_tokens, number_of_words=train_vocab.number_of_words, batch_size=16)

In [77]:
# " ".join([x for x in [train_vocab.get_word_by_id(word) for x in next(generator)[0]['decoder_input'] for word in x if word != 0]])

### Checkpoints

During the training process, it is a good idea to save the weights periodically.

In [78]:
class LossHistory(Callback):
    """Keras callback that records the loss reported at the end of every batch.

    The values are collected in `self.losses`, which is reset whenever
    training begins.
    """

    def on_train_begin(self, logs=None):
        # Start with an empty history for this training run
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        # `logs` may be missing; None is then recorded for the batch, exactly
        # as when the dict has no 'loss' entry
        logs = logs or {}
        self.losses.append(logs.get('loss'))

class TimeHistory(Callback):
    """Keras callback that records how long every epoch takes.

    The durations, in seconds as measured with time.time(), are collected in
    `self.times`, which is reset whenever training begins. Requires the
    standard-library `time` module to be imported in the notebook.
    """

    def on_train_begin(self, logs=None):
        # Start with an empty list of durations for this training run
        self.times = []

    def on_epoch_begin(self, epoch, logs=None):
        # Remember when the epoch started
        self.epoch_time_start = time.time()

    def on_epoch_end(self, epoch, logs=None):
        # Elapsed time since the matching on_epoch_begin call
        self.times.append(time.time() - self.epoch_time_start)

In [79]:
# Make sure the checkpoint directory exists. Only the "already exists" case is
# tolerated; any other failure (missing parent, permissions, ...) is raised
# instead of being reported as an existing folder.
try:
    os.mkdir('./decoders/')
except FileExistsError:
    print('The folder already exists')
path_checkpoint = './decoders/VGG16_GRU.hdf5'
# Write the weights (not the full model) to path_checkpoint during training;
# the training log shows one save per epoch.
checkpoints = ModelCheckpoint(path_checkpoint, verbose=1, save_weights_only=True)
# Collects the per-batch training loss.
loss_history = LossHistory()

The folder already exists


In [80]:
from keras.backend.tensorflow_backend import set_session

# Let TensorFlow allocate GPU memory on demand instead of reserving it all up
# front, and make Keras use a session created with that configuration.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
set_session(tf.Session(config=config))

In [31]:
# Best-effort warm start from an earlier checkpoint: a failure (for example no
# checkpoint file yet) is reported and training then starts from the current
# weights. Only ordinary exceptions are handled, so interrupting the kernel
# still works, and the reason for the failure is shown.
try:
    decoder_model.load_weights(path_checkpoint)
except Exception as error:
    print("Error while loading weights")
    print(repr(error))

In [81]:
# Train for 5 epochs of 256 generator batches each; the weights are checkpointed
# by `checkpoints` and the per-batch loss is collected by `loss_history`.
history = decoder_model.fit_generator(
    generator=generator,
    steps_per_epoch=256,
    epochs=5,
    callbacks=[checkpoints, loss_history],
)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/5

Epoch 00001: saving model to ./decoders/VGG16_GRU.hdf5
Epoch 2/5

Epoch 00002: saving model to ./decoders/VGG16_GRU.hdf5
Epoch 3/5

Epoch 00003: saving model to ./decoders/VGG16_GRU.hdf5
Epoch 4/5

Epoch 00004: saving model to ./decoders/VGG16_GRU.hdf5
Epoch 5/5

Epoch 00005: saving model to ./decoders/VGG16_GRU.hdf5
