In [1]:
from datasets import coco_parse
from datasets import flickr8k_parse
from keras import Model 
from keras.applications import VGG16
from tensorflow.python.client import device_lib

import numpy as np
import os
import time
import utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Image encoding

Currently, an image encoder is built using VGG16 architecture pre-trained on imagenet database.

The features were obtained from the "fc2" layer - last fully-connected layer before the predictions layer.

Generated features in numpy arrays for both training and validation datasets were saved to .npy files.

### COCO dataset

In [2]:
# captions_path = 'D:/coco/annotations/'
# images_path = 'D:/coco/images/'

# # parse JSON file with captions to get paths to images with captions

# val_filenames_with_captions = coco_parse.get_image_filename_with_caption(captions_path, images_path, 
#                                                                      train=False)

# val_filenames_with_all_captions = coco_parse.get_image_with_all_captions(val_filenames_with_captions)

# train_filenames_with_captions = coco_parse.get_image_filename_with_caption(captions_path, images_path, 
#                                                                      train=True)
# train_filenames_with_all_captions = coco_parse.get_image_with_all_captions(train_filenames_with_captions)

### Flickr8k dataset

In [3]:
images_path = 'D:/Flickr8k/images/'
annotations_path = 'D:/Flickr8k/annotations/'
captions_file = 'D:/Flickr8k/annotations/Flickr8k.token.txt'
train_txt_path = 'D:/Flickr8k/annotations/Flickr_8k.trainImages.txt'
dev_txt_path = 'D:/Flickr8k/annotations/Flickr_8k.devImages.txt'
test_txt_path = 'D:/Flickr8k/annotations/Flickr_8k.testImages.txt'

filenames_with_all_captions = flickr8k_parse.generate_filenames_with_all_captions(captions_file, images_path)

train_filenames_with_all_captions = flickr8k_parse.generate_set(train_txt_path, filenames_with_all_captions, images_path)
val_filenames_with_all_captions = flickr8k_parse.generate_set(dev_txt_path, filenames_with_all_captions, images_path)
test_filenames_with_all_captions = flickr8k_parse.generate_set(test_txt_path, filenames_with_all_captions, images_path)

In [4]:
print('Number of images in validation dataset: {}'.format(len(val_filenames_with_all_captions)))
print('Number of images in training dataset: {}'.format(len(train_filenames_with_all_captions)))

Number of images in validation dataset: 1000
Number of images in training dataset: 6000


In [5]:
attn = True

In [6]:
### Create instance of the VGG16 model pretrained on imagenet
VGG_model = VGG16(include_top=True, weights='imagenet')
### alter initial VGG model by eliminating of the last layer which produces preditions. 
### The output of the second fully-connected layer will be used as the last layer of the encoder part
if attn:
    transfer_layer = VGG_model.get_layer('block5_conv3')
else:
    transfer_layer = VGG_model.get_layer('fc2')
VGG_transfer_model = Model(inputs=VGG_model.input, outputs=transfer_layer.output)
### get the size of the images used for VGG16 to resize COCO images
input_layer = VGG_model.get_layer('input_1')
VGG_image_size = input_layer.input_shape[1:3]

Instructions for updating:
Colocations handled automatically by placer.


In [7]:
VGG_transfer_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [8]:
def use_pretrained_model_for_images(filenames_with_all_captions, transfer_model, img_size ,batch_size=64):
    """
    Uses the pretrained model without prediction layer to encode the images into the set of the features.
    
    Parameters:
    -----------
    filenames with all captions : list
        List of dictionaries containing images with the corresponding captions
    
    transfer_model: keras.Model
        Model which is used to process images
        
    img_size: tuple
        Size of images required by the model
        
    batch_size: int
        Size of the batch for CNN
    -----------
    """
    
    ### get the number of images in the dataset
    num_images = len(filenames_with_all_captions)
    ### calculate the number of iterations 
    iter_num = int(num_images / batch_size)
    ### variable to print the progress each 5% of the dataset 
    five_perc = int(iter_num * 0.05)
    iter_count = 0
    cur_progress = 0
    
    ### get the paths to all images without captions
    image_paths = list(filenames_with_all_captions.keys())
    ### list for the final result
    transfer_values = []
    
    ### start and end index for each batch
    first_i = 0
    last_i = batch_size
    
    ### loop through the images
    while first_i < num_images:
        iter_count += 1
        
        ### progress print
        if iter_count == five_perc:
            cur_progress += 5
            iter_count = 0
            print(str(cur_progress) + "% of images processed")
        
        ### to make sure that last batch is not beyond the number of the images
        if last_i > num_images:
            last_i = num_images
        
        ### initialize the list for the batch
        image_batch = []
        
        ### loop to form batches
        for image in image_paths[first_i:last_i]:
            ### preprocess image
            image = utils.image_preprocessing(image, img_size)
            ### append image to batch list
            image_batch.append(image)
        
        ### run the model to encode the features
        preds = transfer_model.predict(np.array(image_batch))
        
        ### append predictions from the batch to the final list
        for pred in preds:
            transfer_values.append(pred)
        
        ### update first and last indices in the batch
        first_i += batch_size
        last_i += batch_size
        
    return np.array(transfer_values)

In [9]:
def save_features(np_arr, folder, filename):
    """
    Saves encoded features into the .npy file.
    
    Parameters:
    -----------
    np_arr : numpy.ndarray
        The array which should be saved
        
    folder: str
        Path to the destination folder 
    
    model: str
        Name of the used CNN model
    
    train: boolean
        Flag for training dataset 
        set True if training and False if validaton
    -----------
    """
    
    ### form the full path for the file
    full_path = os.path.join(folder, filename)
    
    ### create the folder if it does not exist
    try: 
        os.mkdir(folder)
    except:
        pass
    ### save file 
    np.save(full_path, np_arr)
    
    print("Array was saved to {}".format(full_path))

In [10]:
### encode features for validation images
start = time.time()
val_transfer_values = use_pretrained_model_for_images(val_filenames_with_all_captions, VGG_transfer_model, VGG_image_size, batch_size=16)
time_val = time.time() - start

5% of images processed
10% of images processed
15% of images processed
20% of images processed
25% of images processed
30% of images processed
35% of images processed
40% of images processed
45% of images processed
50% of images processed
55% of images processed
60% of images processed
65% of images processed
70% of images processed
75% of images processed
80% of images processed
85% of images processed
90% of images processed
95% of images processed
100% of images processed
105% of images processed


In [11]:
print('Validation dataset encoding took {:.1f} minutes'.format(time_val / 60))

Validation dataset encoding took 1.9 minutes


In [12]:
### save features for validation images
save_features(val_transfer_values, './cnn_features/', 'vgg16_flickr8k_val_attn.npy')

Array was saved to ./cnn_features/vgg16_flickr8k_val_attn.npy


In [13]:
a = np.load('./cnn_features/vgg16_flickr8k_val_attn.npy')
(a == val_transfer_values).all()

True

In [14]:
### encode features for training images
start = time.time()
train_transfer_values = use_pretrained_model_for_images(train_filenames_with_all_captions, VGG_transfer_model, VGG_image_size, batch_size=16)
time_train = time.time() - start

5% of images processed
10% of images processed
15% of images processed
20% of images processed
25% of images processed
30% of images processed
35% of images processed
40% of images processed
45% of images processed
50% of images processed
55% of images processed
60% of images processed
65% of images processed
70% of images processed
75% of images processed
80% of images processed
85% of images processed
90% of images processed
95% of images processed
100% of images processed


In [15]:
print('Training dataset encoding took {:.1f} minutes'.format(time_train / 60))

Training dataset encoding took 9.2 minutes


In [16]:
save_features(train_transfer_values, './cnn_features/', 'vgg16_flickr8k_train_attn.npy')

Array was saved to ./cnn_features/vgg16_flickr8k_train_attn.npy


In [17]:
a = np.load('./cnn_features/vgg16_flickr8k_train_attn.npy')
(a == train_transfer_values).all()

True

In [18]:
### encode features for training images
start = time.time()
test_transfer_values = use_pretrained_model_for_images(test_filenames_with_all_captions, VGG_transfer_model, VGG_image_size, batch_size=16)
time_train = time.time() - start

5% of images processed
10% of images processed
15% of images processed
20% of images processed
25% of images processed
30% of images processed
35% of images processed
40% of images processed
45% of images processed
50% of images processed
55% of images processed
60% of images processed
65% of images processed
70% of images processed
75% of images processed
80% of images processed
85% of images processed
90% of images processed
95% of images processed
100% of images processed
105% of images processed


In [19]:
save_features(test_transfer_values, './cnn_features/', 'vgg16_flickr8k_test_attn.npy')

Array was saved to ./cnn_features/vgg16_flickr8k_test_attn.npy
