In [1]:
from configs.default import _C as config
from configs.default import update_config

from datasets import flickr8k_parse

from keras import backend as K
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import RMSprop
from keras.backend.tensorflow_backend import set_session
from models import batch_generator, decoder

import json
import numpy as np
import os
import pandas as pd
import path_generation
import tensorflow as tf
import text_processing
import time

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
from tensorflow.python.client import device_lib

device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 12006001025085623202, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 1462032793
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 6801201856544217791
 physical_device_desc: "device: 0, name: GeForce 840M, pci bus id: 0000:07:00.0, compute capability: 5.0"]

In [3]:
config_file = "./configs/attn.yaml"
update_config(config, config_file)

# Decoder

### Captions encoding

Before building decoder, it is necessary to encode captions into one-hot vectors which further would be used in embedding layer.

### COCO dataset

In [4]:
if config.DATASET == 'Coco':
    if config.ATTENTION:
        features_file_train = "vgg16_coco_train_attn.npy"
        features_file_val = "vgg16_coco_val_attn.npy"
    else:
        features_file_train = "vgg16_coco_train.npy"
        features_file_val = "vgg16_coco_val.npy"
    
    
    val_filenames_with_captions = coco_parse.get_image_filename_with_caption(config.PATH.ANNOTATIONS_PATH, 
                                                                             config.PATH.IMG_PATH, 
                                                                             train=False)

    val_filenames_with_all_captions = coco_parse.get_image_with_all_captions(val_filenames_with_captions)

    train_filenames_with_captions = coco_parse.get_image_filename_with_caption(config.PATH.ANNOTATIONS_PATH, 
                                                                               config.PATH.IMG_PATH,
                                                                               train=True)
    train_filenames_with_all_captions = coco_parse.get_image_with_all_captions(train_filenames_with_captions)

    ### Extract captions
    train_captions = coco_parse.make_list_of_captions(train_filenames_with_all_captions)
    val_captions = coco_parse.make_list_of_captions(val_filenames_with_all_captions)

### Flickr8k dataset

In [5]:
if config.DATASET == 'Flickr8k':
    if config.ATTENTION:
        features_file_train = "vgg16_flickr8k_train_attn.npy"
        features_file_val = "vgg16_flickr8k_val_attn.npy"
    else:
        features_file_train = "vgg16_flickr8k_train.npy"
        features_file_val = "vgg16_flickr8k_val.npy"

    captions_file = os.path.join(config.PATH.ANNOTATIONS_PATH, "Flickr8k.token.txt")
    train_txt_path = os.path.join(config.PATH.ANNOTATIONS_PATH, "Flickr_8k.trainImages.txt")
    dev_txt_path = os.path.join(config.PATH.ANNOTATIONS_PATH, "Flickr_8k.devImages.txt")
    test_txt_path = os.path.join(config.PATH.ANNOTATIONS_PATH, "Flickr_8k.testImages.txt")    
        
    filenames_with_all_captions = flickr8k_parse.generate_filenames_with_all_captions(captions_file, 
                                                                                      config.PATH.IMG_PATH)
    train_filenames_with_all_captions = flickr8k_parse.generate_set(train_txt_path, 
                                                                    filenames_with_all_captions,
                                                                    config.PATH.IMG_PATH)
    val_filenames_with_all_captions = flickr8k_parse.generate_set(dev_txt_path, 
                                                                  filenames_with_all_captions, 
                                                                  config.PATH.IMG_PATH)
    test_filenames_with_all_captions = flickr8k_parse.generate_set(test_txt_path, 
                                                                   filenames_with_all_captions, 
                                                                   config.PATH.IMG_PATH)

    train_captions = flickr8k_parse.make_list_of_captions(train_filenames_with_all_captions)
    val_captions = flickr8k_parse.make_list_of_captions(val_filenames_with_all_captions)

In [6]:
### Preprocess captions
text_processing.preprocess_captions(val_captions)
text_processing.preprocess_captions(train_captions)

In [7]:
### Add markers of captions' starts and ends
text_processing.add_start_and_end_to_captions(train_captions)
text_processing.add_start_and_end_to_captions(val_captions)

In [8]:
### Create vocabulary from the training captions
train_vocab = text_processing.Vocabulary()
for caption_list in train_captions:
    for caption in caption_list:
        tmp_caption_list = caption.split()
        for word in tmp_caption_list:
            train_vocab.add_word(word)

In [9]:
if not os.path.exists(config.PATH.VOCABULARY_PATH):
    os.mkdir(config.PATH.VOCABULARY_PATH)
train_vocab.save_vocabulary(config.VOCABULARY.WORD_TO_ID, config.VOCABULARY.ID_TO_WORD, config.VOCABULARY.COUNT)

In [10]:
train_captions_tokens = text_processing.tokenise_captions(train_captions, train_vocab)
val_captions_tokens = text_processing.tokenise_captions(val_captions, train_vocab)

In [11]:
train_captions_tokens[0]

[[1, 2, 3, 4, 5, 6, 7, 2, 8, 4, 9, 10, 11, 12],
 [1, 3, 4, 13, 14, 4, 15, 11, 12],
 [1, 16, 17, 18, 19, 20, 21, 10, 22, 23, 12],
 [1, 16, 17, 24, 25, 9, 10, 11, 12],
 [1, 16, 17, 6, 15, 2, 26, 27, 28, 29, 30, 12]]

In [12]:
train_captions[0]

['<sos> a black dog is running after a white dog in the snow <eos>',
 '<sos> black dog chasing brown dog through snow <eos>',
 '<sos> two dogs chase each other across the snowy ground <eos>',
 '<sos> two dogs play together in the snow <eos>',
 '<sos> two dogs running through a low lying body of water <eos>']

### Decoder NN

### GRU

In [13]:
path_gen = path_generation.PathGenerator(config.DECODER.GRU, 
                                         config.DATASET, 
                                         config.DECODER.NUM_RNN_LAYERS, 
                                         config.DECODER.BATCH_SIZE, 
                                         config.DECODER.BATCH_NORM, 
                                         config.DECODER.DROPOUT, 
                                         config.ATTENTION, 
                                         config.DECODER.ATTN_TYPE)

path_checkpoint = path_gen.get_weights_path()
model_path = path_gen.get_model_path()
callbacks_path = path_gen.get_callbacks_path()

In [14]:
features_file_train_path = os.path.join(config.PATH.FEATURES_PATH, features_file_train)
features_file_val_path = os.path.join(config.PATH.FEATURES_PATH, features_file_val)

transfer_values = np.load(features_file_train_path)
val_transfer_values = np.load(features_file_val_path)

In [15]:
if config.ATTENTION:
    print(transfer_values.shape)
    transfer_values = transfer_values.reshape(len(train_filenames_with_all_captions), transfer_values.shape[1] ** 2, -1)
    val_transfer_values = val_transfer_values.reshape(len(val_filenames_with_all_captions), val_transfer_values.shape[1] ** 2, -1)
    print(transfer_values.shape)

(6000, 14, 14, 512)
(6000, 196, 512)


In [16]:
decoder_model = decoder.Decoder(config.DECODER.INITIAL_STATE_SIZE,
                                config.DECODER.EMBEDDING_OUT_SIZE,
                                config.DECODER.NUM_RNN_LAYERS,
                                config.DECODER.GRU,
                                config.DECODER.BATCH_NORM,
                                config.DECODER.DROPOUT,
                                config.ATTENTION,
                                config.DECODER.ATTN_TYPE,
                                transfer_values,
                                train_vocab)
decoder_model = decoder_model.build_model()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [17]:
if config.DECODER.GRU:
    generator = batch_generator.generate_batch(transfer_values, 
                                               train_captions_tokens, 
                                               number_of_words=train_vocab.number_of_words, 
                                               batch_size=config.DECODER.BATCH_SIZE)
    val_generator = batch_generator.generate_batch(val_transfer_values, 
                                                   val_captions_tokens, 
                                                   number_of_words=train_vocab.number_of_words, 
                                                   batch_size=config.DECODER.BATCH_SIZE)
else:
    generator = batch_generator.generate_batch(transfer_values, 
                                               train_captions_tokens, 
                                               number_of_words=train_vocab.number_of_words, 
                                               batch_size=config.DECODER.BATCH_SIZE, 
                                               gru=config.DECODER.GRU)
    
    val_generator = batch_generator.generate_batch(val_transfer_values, 
                                                   val_captions_tokens, 
                                                   number_of_words=train_vocab.number_of_words, 
                                                   batch_size=config.DECODER.BATCH_SIZE, 
                                                   gru=config.DECODER.GRU)              

In [18]:
if config.DECODER.OPTIMIZER:
    optimizer = RMSprop(lr=config.DECODER.LR, decay=config.DECODER.DECAY)

In [19]:
decoder_model.compile(optimizer=optimizer,
                      loss=config.DECODER.LOSS)

In [20]:
model_json = decoder_model.to_json()
try:
    os.mkdir(config.PATH.MODELS_ARCHITECTURE_PATH)
except:
    print('The folder already exists')
with open(model_path, "w") as json_file:
    json.dump(json.loads(model_json), json_file, indent=4)

  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '


  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '


The folder already exists


### Checkpoints

During the training process, it is a good idea to save the weights periodically.

In [21]:
try:
    os.mkdir(configs.WEIGHTS_PATH)
except:
    print('The folder already exists')

checkpoints = ModelCheckpoint(path_checkpoint, 
                              verbose=config.DECODER.VERBOSE, 
                              save_weights_only=True, 
                              save_best_only=config.DECODER.SAVE_BEST)

reduce_lr = ReduceLROnPlateau(monitor=config.DECODER.MONITOR, 
                              factor=config.DECODER.FACTOR,
                              patience=config.DECODER.PATIENCE, 
                              verbose=config.DECODER.VERBOSE, 
                              min_lr=config.DECODER.MIN_LR)

The folder already exists


In [22]:
tf_configuration = tf.ConfigProto()
tf_configuration.gpu_options.allow_growth = True
set_session(tf.Session(config=tf_configuration))
start = time.time()
callbacks = decoder_model.fit_generator(generator=generator,
                                        steps_per_epoch=int(len(train_filenames_with_all_captions) / config.DECODER.BATCH_SIZE),
                                        epochs=config.DECODER.EPOCHS,
                                        callbacks=[checkpoints, reduce_lr],
                                        validation_data=val_generator,
                                        validation_steps=config.DECODER.VAL_STEPS)
time_train = time.time() - start

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/20

Epoch 00001: val_loss improved from inf to 8.16355, saving model to ./model_files/weights/VGG16_LSTM_Flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 2/20



Epoch 00002: val_loss improved from 8.16355 to 5.97949, saving model to ./model_files/weights/VGG16_LSTM_Flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 3/20



Epoch 00003: val_loss improved from 5.97949 to 1.52982, saving model to ./model_files/weights/VGG16_LSTM_Flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 4/20



Epoch 00004: val_loss did not improve from 1.52982
Epoch 5/20



Epoch 00005: val_loss improved from 1.52982 to 1.50758, saving model to ./model_files/weights/VGG16_LSTM_Flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 6/20



Epoch 00006: val_loss improved from 1.50758 to 1.38787, saving model to ./model_files/weights/VGG16_LSTM_Flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 7/20



Epoch 00007: val_loss did not improve from 1.38787
Epoch 8/20



Epoch 00008: val_loss did not improve from 1.38787

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 9/20



Epoch 00009: val_loss did not improve from 1.38787
Epoch 10/20



Epoch 00010: val_loss improved from 1.38787 to 1.36652, saving model to ./model_files/weights/VGG16_LSTM_Flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 11/20



Epoch 00011: val_loss did not improve from 1.36652
Epoch 12/20



Epoch 00012: val_loss improved from 1.36652 to 1.36232, saving model to ./model_files/weights/VGG16_LSTM_Flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 13/20



Epoch 00013: val_loss did not improve from 1.36232
Epoch 14/20



Epoch 00014: val_loss did not improve from 1.36232

Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 15/20



Epoch 00015: val_loss improved from 1.36232 to 1.30204, saving model to ./model_files/weights/VGG16_LSTM_Flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 16/20



Epoch 00016: val_loss did not improve from 1.30204
Epoch 17/20



Epoch 00017: val_loss did not improve from 1.30204

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 18/20



Epoch 00018: val_loss did not improve from 1.30204
Epoch 19/20



Epoch 00019: val_loss improved from 1.30204 to 1.27371, saving model to ./model_files/weights/VGG16_LSTM_Flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 20/20



Epoch 00020: val_loss did not improve from 1.27371


In [23]:
print("Time for training: {} seconds".format(time_train))

Time for training: 6747.911556720734 seconds


In [24]:
if not os.path.exists(config.PATH.CALLBACKS_PATH):
    os.mkdir(config.PATH.CALLBACKS_PATH)   
callback_df = pd.DataFrame(callbacks.history)
callback_df.to_csv(callbacks_path, index=None)