In [1]:
from datasets import flickr8k_parse
from keras import backend as K
from keras import Model
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Input, Dense, LSTM, add, Embedding, GRU, Dropout, Multiply, Dot, Lambda, BatchNormalization, \
    RepeatVector, concatenate
from keras.optimizers import RMSprop
from keras.backend.tensorflow_backend import set_session
from models import batch_generator, decoder

import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import path_generation
import tensorflow as tf
import text_processing
import time

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from tensorflow.python.client import device_lib

# Show the compute devices TensorFlow can see; the cell output is the list
# returned by the call (used here to check that a GPU is available).
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 2498995129509002572, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 1462032793
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 10982203293942572571
 physical_device_desc: "device: 0, name: GeForce 840M, pci bus id: 0000:07:00.0, compute capability: 5.0"]

# Decoder

### Captions encoding

Before building the decoder, the captions have to be encoded numerically: every word is mapped to an integer token id, and these ids are later consumed by the embedding layer.

### COCO dataset

In [3]:
# captions_path = 'D:/coco/annotations/'
# images_path = 'D:/coco/images/'

# # parse JSON file with captions to get paths to images with captions
# val_filenames_with_captions = coco_parse.get_image_filename_with_caption(captions_path, images_path, 
#                                                                      train=False)
# val_filenames_with_all_captions = coco_parse.get_image_with_all_captions(val_filenames_with_captions)

# train_filenames_with_captions = coco_parse.get_image_filename_with_caption(captions_path, images_path, 
#                                                                      train=True)
# train_filenames_with_all_captions = coco_parse.get_image_with_all_captions(train_filenames_with_captions)

# ### Extract captions
# train_captions = coco_parse.make_list_of_captions(train_filenames_with_all_captions)
# val_captions = coco_parse.make_list_of_captions(val_filenames_with_all_captions)

### Flickr8k dataset

In [4]:
# Root folder of the Flickr8k dataset. Set the FLICKR8K_DIR environment
# variable to point somewhere else instead of editing the notebook; the default
# reproduces the original hard-coded location exactly.
dataset_root = os.environ.get('FLICKR8K_DIR', 'D:/Flickr8k').rstrip('/\\')

# Paths are assembled with plain '/' (and a trailing '/' on folders) so that,
# with the default root, every string is identical to the previous literals
# handed to flickr8k_parse.
images_path = dataset_root + '/images/'
annotations_path = dataset_root + '/annotations/'
captions_file = annotations_path + 'Flickr8k.token.txt'
train_txt_path = annotations_path + 'Flickr_8k.trainImages.txt'
dev_txt_path = annotations_path + 'Flickr_8k.devImages.txt'
test_txt_path = annotations_path + 'Flickr_8k.testImages.txt'

# Parse the caption file once for the whole dataset ...
filenames_with_all_captions = flickr8k_parse.generate_filenames_with_all_captions(captions_file, images_path)

# ... then derive the train / validation / test subsets from the split files.
train_filenames_with_all_captions = flickr8k_parse.generate_set(train_txt_path, filenames_with_all_captions, images_path)
val_filenames_with_all_captions = flickr8k_parse.generate_set(dev_txt_path, filenames_with_all_captions, images_path)
test_filenames_with_all_captions = flickr8k_parse.generate_set(test_txt_path, filenames_with_all_captions, images_path)

# Only the captions of the train and validation subsets are needed to train
# the decoder.
train_captions = flickr8k_parse.make_list_of_captions(train_filenames_with_all_captions)
val_captions = flickr8k_parse.make_list_of_captions(val_filenames_with_all_captions)

In [5]:
### Preprocess captions
# The return values are discarded, so the helper presumably normalises the
# caption lists in place -- TODO confirm in text_processing.
text_processing.preprocess_captions(val_captions)
text_processing.preprocess_captions(train_captions)

In [6]:
### Add start-of-sentence and end-of-sentence markers to every caption
# The lists are updated in place: train_captions shows the '<sos>' / '<eos>'
# markers afterwards without ever being reassigned.
text_processing.add_start_and_end_to_captions(train_captions)
text_processing.add_start_and_end_to_captions(val_captions)

In [7]:
### Build the vocabulary from the training captions only
train_vocab = text_processing.Vocabulary()
# train_captions holds one list of captions per entry; every caption is split
# on whitespace and each resulting word is registered with the vocabulary.
for image_captions in train_captions:
    for sentence in image_captions:
        for token in sentence.split():
            train_vocab.add_word(token)

In [8]:
# Create the output folder if needed. makedirs(exist_ok=True) replaces the
# exists()/mkdir() pair: it has no check-then-create race and fails loudly when
# './vocabulary' exists but is not a directory.
os.makedirs('./vocabulary', exist_ok=True)
# Only bare file names are passed; presumably save_vocabulary stores them
# under ./vocabulary -- TODO confirm in text_processing.Vocabulary.
train_vocab.save_vocabulary('word_to_id.pickle', 'id_to_word.pickle', 'word_counter.pickle')

In [9]:
# Convert the captions to token sequences. Both sets are tokenised with the
# vocabulary built from the training captions.
train_captions_tokens = text_processing.tokenise_captions(train_captions, train_vocab)
val_captions_tokens = text_processing.tokenise_captions(val_captions, train_vocab)

In [10]:
# Sanity check: token sequences of the first training entry.
train_captions_tokens[0]

[[1, 2, 3, 4, 5, 6, 7, 2, 8, 4, 9, 10, 11, 12],
 [1, 3, 4, 13, 14, 4, 15, 11, 12],
 [1, 16, 17, 18, 19, 20, 21, 10, 22, 23, 12],
 [1, 16, 17, 24, 25, 9, 10, 11, 12],
 [1, 16, 17, 6, 15, 2, 26, 27, 28, 29, 30, 12]]

In [11]:
# The matching captions in text form, for comparison with the tokens.
train_captions[0]

['<sos> a black dog is running after a white dog in the snow <eos>',
 '<sos> black dog chasing brown dog through snow <eos>',
 '<sos> two dogs chase each other across the snowy ground <eos>',
 '<sos> two dogs play together in the snow <eos>',
 '<sos> two dogs running through a low lying body of water <eos>']

### Decoder NN

### GRU

In [12]:
# ---- Data / schedule -------------------------------------------------------
dataset = 'flickr8k'
batch_size = 32
epochs = 30
# Number of full batches that fit into the training entries
# (len(train_captions) counts caption lists, not individual captions).
steps_per_epoch = len(train_captions) // batch_size
max_len = 30

# ---- Decoder architecture --------------------------------------------------
initial_state_size = 512
embedding_out_size = 512
number_of_layers = 2
gru = False
batch_norm = True
dropout = True

# ---- Attention -------------------------------------------------------------
attn = True
attn_type = 'bahdanau'

# ---- Output locations, derived from the configuration above ----------------
path_gen = path_generation.PathGenerator(gru, dataset, number_of_layers, batch_size, batch_norm, dropout, attn, attn_type)
path_checkpoint = path_gen.get_weights_path()
model_path = path_gen.get_model_path()
callbacks_path = path_gen.get_callbacks_path()

In [13]:
# Show where the training history CSV will be written for this configuration.
print(callbacks_path)

./model_files/callbacks/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.csv


In [14]:
# Choose the pre-computed CNN feature files: the attention model uses the
# '*_attn' files, the plain model uses the files without that suffix.
train_features_file, val_features_file = (
    ('./cnn_features/vgg16_flickr8k_train_attn.npy',
     './cnn_features/vgg16_flickr8k_val_attn.npy')
    if attn else
    ('./cnn_features/vgg16_flickr8k_train.npy',
     './cnn_features/vgg16_flickr8k_val.npy')
)
transfer_values = np.load(train_features_file)
val_transfer_values = np.load(val_features_file)

In [15]:
if attn:
    print(transfer_values.shape)
    # Collapse the spatial grid into one axis: (N, H, W, C) -> (N, H*W, C).
    # The sample count and channel depth are read from the arrays themselves
    # instead of being hard-coded (6000 / 1000 / 512), so the cell keeps
    # working with another dataset split or feature extractor.
    transfer_values = transfer_values.reshape(
        transfer_values.shape[0], -1, transfer_values.shape[-1])
    val_transfer_values = val_transfer_values.reshape(
        val_transfer_values.shape[0], -1, val_transfer_values.shape[-1])
    print(transfer_values.shape)

(6000, 14, 14, 512)
(6000, 196, 512)


In [16]:
# Configure the decoder and build the model in one expression, so that
# `decoder_model` only ever refers to the built model.
decoder_model = decoder.Decoder(
    initial_state_size,
    embedding_out_size,
    number_of_layers,
    gru,
    batch_norm,
    dropout,
    attn,
    attn_type,
    transfer_values,
    train_vocab,
).build_model()

Initial features shape (?, 196, 512)
Instructions for updating:
Colocations handled automatically by placer.
word-embedding (?, 30, 512)
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Initial states
s initial (?, 512)
c initial (?, 512)
------------------------
LSTM iteration 0
------------------------
Attention
img features (?, 196, 512)
prev state (?, 512)
a_dense (?, 196, 512)
s_dense (?, 1, 512)
summary (?, 196, 512)
first_dense (?, 196, 512)
weights (?, 196, 1)
context (?, 1, 512)
------------------------
context (?, 1, 512)
current word vector (?, 1, 512)
lstm input: context-word concat (?, 1, 1024)
hidden state (?, 512)
------------------------
LSTM iteration 1
------------------------
Attention
img features (?, 196, 512)
prev state (?, 512)
a_dense (?, 196, 512)
s_dense (?, 1, 512)
summary (?, 196, 512)
first_dense (?, 196, 512)
weights (?, 196, 1)
context (?, 1, 512)
------------------------
context (?, 1, 512

In [17]:
# Keyword arguments shared by the training and the validation generator.
generator_kwargs = {
    'number_of_words': train_vocab.number_of_words,
    'batch_size': batch_size,
}
# The GRU variant relies on generate_batch's default for `gru`; the LSTM
# variant has to switch the flag off explicitly.
if not gru:
    generator_kwargs['gru'] = False

generator = batch_generator.generate_batch(transfer_values, train_captions_tokens, **generator_kwargs)
val_generator = batch_generator.generate_batch(val_transfer_values, val_captions_tokens, **generator_kwargs)

In [18]:
# RMSprop with an initial learning rate of 1e-3 and a small decay; the rate is
# additionally halved on validation plateaus by the ReduceLROnPlateau callback.
optimizer = RMSprop(lr=1e-3, decay=1e-8)

In [19]:
# Compile the decoder with the optimizer defined above and categorical
# cross-entropy as the loss.
decoder_model.compile(optimizer=optimizer,
                      loss='categorical_crossentropy')

In [20]:
# Print the layer-by-layer overview of the compiled decoder (long output).
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 196, 512)     0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 512)          0           encoder_input[0][0]              
__________________________________________________________________________________________________
dense_5 (Dense)                 (None, 512)          262656      lambda_1[0][0]                   
__________________________________________________________________________________________________
batch_normalization_5 (BatchNor (None, 512)          2048        dense_5[0][0]                    
__________________________________________________________________________________________________
lambda_3 (

__________________________________________________________________________________________________
batch_normalization_6 (BatchNor (None, 512)          2048        dense_6[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 512), (None, 3147776     concatenate_1[0][0]              
                                                                 batch_normalization_5[0][0]      
                                                                 batch_normalization_6[0][0]      
                                                                 concatenate_2[0][0]              
                                                                 lstm_1[0][0]                     
                                                                 lstm_1[0][2]                     
                                                                 concatenate_3[0][0]              
          

                                                                 encoder_input[0][0]              
__________________________________________________________________________________________________
multiply_5 (Multiply)           (None, 1, 512)       0           dot_5[0][0]                      
                                                                 dense_3[4][0]                    
__________________________________________________________________________________________________
lambda_17 (Lambda)              (None, 1, 512)       0           dropout_1[0][0]                  
__________________________________________________________________________________________________
concatenate_5 (Concatenate)     (None, 1, 1024)      0           multiply_5[0][0]                 
                                                                 lambda_17[0][0]                  
__________________________________________________________________________________________________
lambda_18 

__________________________________________________________________________________________________
lambda_41 (Lambda)              (None, 1, 512)       0           dropout_1[0][0]                  
__________________________________________________________________________________________________
concatenate_13 (Concatenate)    (None, 1, 1024)      0           multiply_13[0][0]                
                                                                 lambda_41[0][0]                  
__________________________________________________________________________________________________
lambda_42 (Lambda)              (None, 1, 512)       0           lstm_1[12][0]                    
__________________________________________________________________________________________________
add_14 (Add)                    (None, 196, 512)     0           dense_2[13][0]                   
                                                                 dense_1[13][0]                   
__________

                                                                 lambda_65[0][0]                  
__________________________________________________________________________________________________
lambda_66 (Lambda)              (None, 1, 512)       0           lstm_1[20][0]                    
__________________________________________________________________________________________________
add_22 (Add)                    (None, 196, 512)     0           dense_2[21][0]                   
                                                                 dense_1[21][0]                   
__________________________________________________________________________________________________
lambda_67 (Lambda)              (None, 196, 512)     0           add_22[0][0]                     
__________________________________________________________________________________________________
weights_21 (Lambda)             (None, 196, 1)       0           dense_4[21][0]                   
__________

add_30 (Add)                    (None, 196, 512)     0           dense_2[29][0]                   
                                                                 dense_1[29][0]                   
__________________________________________________________________________________________________
lambda_91 (Lambda)              (None, 196, 512)     0           add_30[0][0]                     
__________________________________________________________________________________________________
weights_29 (Lambda)             (None, 196, 1)       0           dense_4[29][0]                   
__________________________________________________________________________________________________
dot_30 (Dot)                    (None, 1, 512)       0           weights_29[0][0]                 
                                                                 encoder_input[0][0]              
__________________________________________________________________________________________________
multiply_3

In [21]:
# Serialise the architecture (no weights) so the model can be rebuilt later.
model_json = decoder_model.to_json()

# Best-effort creation of the legacy './models' folder. Only "already exists"
# is reported as such; other failures are reported with their real cause
# instead of being hidden by a bare `except:`.
try:
    os.mkdir('./models')
except FileExistsError:
    print('The folder already exists')
except OSError as error:
    print('Could not create ./models: {}'.format(error))

# The file is written to `model_path`, so make sure that folder exists too.
os.makedirs(os.path.dirname(model_path) or '.', exist_ok=True)

# Round-trip through json.loads / json.dump to store the architecture
# pretty-printed (indent=4) rather than as a single line.
with open(model_path, "w") as json_file:
    json.dump(json.loads(model_json), json_file, indent=4)

  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '


  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '
  '. They will not be included '


The folder already exists


### Checkpoints

During the training process, it is a good idea to save the weights periodically.

In [22]:
# Best-effort creation of the legacy './weights/' folder. Only "already exists"
# is reported as such; other failures are reported with their real cause
# instead of being hidden by a bare `except:`.
try:
    os.mkdir('./weights/')
except FileExistsError:
    print('The folder already exists')
except OSError as error:
    print('Could not create ./weights/: {}'.format(error))

# The checkpoints are written to `path_checkpoint`; make sure its folder exists
# so that the first save at the end of epoch 1 cannot fail on a missing path.
os.makedirs(os.path.dirname(path_checkpoint) or '.', exist_ok=True)

# Save the weights after every epoch (save_best_only=False), always to the
# same file.
checkpoints = ModelCheckpoint(path_checkpoint, verbose=1, save_weights_only=True, save_best_only=False)
# Halve the learning rate when val_loss has not improved for 2 epochs,
# never going below 1e-7.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                              patience=2, verbose=1, min_lr=0.0000001)

The folder already exists


In [23]:
# Let TensorFlow grow its GPU memory usage on demand and register the session
# with Keras.
# NOTE(review): the session is configured after the model has been built and
# compiled; consider moving this set-up before the model is created -- confirm.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))
# Wall-clock timing of the whole training run.
start = time.time()
# `callbacks` receives the object returned by fit_generator; its `.history`
# attribute is read in later cells. Validation uses only 5 batches per epoch,
# and that val_loss is what ReduceLROnPlateau monitors.
callbacks = decoder_model.fit_generator(generator=generator,
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            callbacks=[checkpoints, reduce_lr],
                            validation_data=val_generator,
                            validation_steps=5)
time_train = time.time() - start

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/30

Epoch 00001: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 2/30

Epoch 00002: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 3/30



Epoch 00003: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 4/30

Epoch 00004: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 5/30



Epoch 00005: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 6/30

Epoch 00006: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 7/30



Epoch 00007: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 8/30

Epoch 00008: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 9/30



Epoch 00009: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 10/30

Epoch 00010: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 11/30



Epoch 00011: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 12/30

Epoch 00012: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 13/30



Epoch 00013: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5

Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 14/30

Epoch 00014: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 15/30



Epoch 00015: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5

Epoch 00015: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 16/30

Epoch 00016: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 17/30



Epoch 00017: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5

Epoch 00017: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 18/30

Epoch 00018: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 19/30



Epoch 00019: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5

Epoch 00019: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.
Epoch 20/30

Epoch 00020: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 21/30



Epoch 00021: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5

Epoch 00021: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.
Epoch 22/30

Epoch 00022: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 23/30



Epoch 00023: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5

Epoch 00023: ReduceLROnPlateau reducing learning rate to 3.906250185536919e-06.
Epoch 24/30

Epoch 00024: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 25/30



Epoch 00025: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5

Epoch 00025: ReduceLROnPlateau reducing learning rate to 1.9531250927684596e-06.
Epoch 26/30

Epoch 00026: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 27/30



Epoch 00027: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5

Epoch 00027: ReduceLROnPlateau reducing learning rate to 9.765625463842298e-07.
Epoch 28/30

Epoch 00028: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5
Epoch 29/30



Epoch 00029: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5

Epoch 00029: ReduceLROnPlateau reducing learning rate to 4.882812731921149e-07.
Epoch 30/30

Epoch 00030: saving model to ./model_files/weights/VGG16_LSTM_flickr8k_2l_32b_bn_dr_attn_bahdanau.hdf5


In [24]:
# Report the wall-clock duration measured around fit_generator.
print("Time for training: {} seconds".format(time_train))

Time for training: 9415.719113588333 seconds


In [25]:
# Keep creating the legacy './callbacks' folder, without the
# check-then-create race of exists()/mkdir().
os.makedirs('./callbacks', exist_ok=True)
# The history CSV is written to `callbacks_path`, so its folder must exist too.
os.makedirs(os.path.dirname(callbacks_path) or '.', exist_ok=True)
# Names of the metrics recorded during training.
columns = callbacks.history.keys()

In [26]:
# One row per epoch, one column per recorded metric.
callback_df = pd.DataFrame(callbacks.history)
# index=False is the documented way to omit the row index; the previous
# index=None only worked because None happens to be falsy.
callback_df.to_csv(callbacks_path, index=False)