In [1]:
from PIL import Image
import numpy as np
import coco_parse
from matplotlib.pyplot import imshow
import os
import time
import re
import pickle
%matplotlib inline

In [2]:
from keras import Model
from keras.layers import Input, Dense, LSTM, Embedding
from keras.applications import VGG16
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.image import grayscale_to_rgb

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Image encoding

Currently, the image encoder is built on the VGG16 architecture pre-trained on the ImageNet dataset.

The features are taken from the "fc2" layer, the last fully-connected layer before the prediction layer.

The generated features for both the training and validation datasets were saved as NumPy arrays in .npy files.

In [3]:
def image_preprocessing(image_path, new_size):
    """
    Reads the image and applies preprocessing including:
    - converting the image to 3-channel RGB (grayscale, palette, RGBA and
      CMYK inputs are all mapped to RGB)
    - resizing to the new_size
    - rescaling pixel values to [0, 1]
    
    Parameters:
    -----------
    image_path : str
        full path to the image
    new_size: tuple
        size of the output image, passed to PIL as (width, height)
    -----------
    
    Returns:
    -----------
    numpy.ndarray
        float array with 3 channels on the last axis
    -----------
    """
    ### the context manager closes the underlying file as soon as the pixels are read
    with Image.open(image_path) as source_image:
        ### always produce exactly 3 channels: checking only for a 2-D array
        ### lets 4-channel (RGBA/CMYK) images through and misreads palette images
        rgb_image = source_image.convert('RGB')
        resized_image = rgb_image.resize(new_size, Image.LANCZOS)
    ### 8-bit channels -> floats in [0, 1]
    return np.divide(np.array(resized_image), 255)

In [4]:
### NOTE(review): absolute, machine-specific locations of the COCO annotations and images;
### adjust them before running the notebook on another machine
captions_path = 'D:/coco/annotations/'
images_path = 'D:/coco/images/'

### parse the JSON file with captions to get the paths to the images with their captions
### (train=False selects the split used below as the validation dataset)
val_filenames_with_captions = coco_parse.get_image_filename_with_caption(captions_path, images_path, 
                                                                     train=False)
### NOTE(review): presumably groups all captions of one image under the image path — confirm in coco_parse
val_filenames_with_all_captions = coco_parse.get_image_with_all_captions(val_filenames_with_captions)

### same two steps for the training split
train_filenames_with_captions = coco_parse.get_image_filename_with_caption(captions_path, images_path, 
                                                                     train=True)
train_filenames_with_all_captions = coco_parse.get_image_with_all_captions(train_filenames_with_captions)

In [5]:
print('Number of images in validation dataset: {}'.format(len(val_filenames_with_all_captions)))
print('Number of images in training dataset: {}'.format(len(train_filenames_with_all_captions)))

Number of images in validation dataset: 5000
Number of images in training dataset: 118285


In [9]:
### Create instance of the VGG16 model pretrained on imagenet
VGG_model = VGG16(include_top=True, weights='imagenet')

In [10]:
VGG_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [11]:
### Alter the initial VGG model by dropping its last layer, which produces the predictions.
### The output of the second fully-connected layer ('fc2') becomes the output of the encoder part
transfer_layer = VGG_model.get_layer('fc2')
### new model sharing the VGG input, but ending at the 'fc2' output
VGG_transfer_model = Model(inputs=VGG_model.input, outputs=transfer_layer.output)

In [12]:
### get the size of the images used for VGG16 to resize COCO images
### The input layer is taken by position instead of by its auto-generated name:
### Keras numbers layer names per session ('input_1', 'input_2', ...), so a lookup
### by 'input_1' fails once the model cell has been executed more than once
input_layer = VGG_model.layers[0]
### spatial size is read from the model input shape (batch, height, width, channels)
VGG_image_size = VGG_model.input_shape[1:3]

In [13]:
def use_pretrained_model_for_images(filenames_with_all_captions, transfer_model, img_size, batch_size=64):
    """
    Uses the pretrained model without prediction layer to encode the images into the set of the features.
    
    Parameters:
    -----------
    filenames_with_all_captions : dict
        Dictionary whose keys are the paths to the images
        (only the keys are used here)
    
    transfer_model: keras.Model
        Model which is used to process images
        
    img_size: tuple
        Size of images required by the model
        
    batch_size: int
        Size of the batch for CNN, must be at least 1
    -----------
    
    Returns:
    -----------
    numpy.ndarray
        Encoded features, one row per image, in the order of the dictionary keys
    -----------
    
    Raises:
    -----------
    ValueError
        If batch_size is smaller than 1
    -----------
    """
    ### a non-positive batch size would never advance through the dataset
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got {}".format(batch_size))
    
    ### get the paths to all images without captions
    image_paths = list(filenames_with_all_captions.keys())
    num_images = len(image_paths)
    
    ### list for the final result
    transfer_values = []
    ### last progress value (multiple of 5) that was already printed
    reported_progress = 0
    
    ### loop through the images batch by batch
    for first_i in range(0, num_images, batch_size):
        ### to make sure that last batch is not beyond the number of the images
        last_i = min(first_i + batch_size, num_images)
        
        ### preprocess every image of the batch
        image_batch = [image_preprocessing(path, img_size) for path in image_paths[first_i:last_i]]
        
        ### run the model to encode the features and keep one entry per image
        transfer_values.extend(transfer_model.predict(np.array(image_batch)))
        
        ### progress print: derived from the number of processed images, so it
        ### never exceeds 100% and also works for datasets with few batches
        percent_done = 100 * last_i // num_images
        reached_progress = percent_done - percent_done % 5
        if reached_progress > reported_progress:
            reported_progress = reached_progress
            print(str(reported_progress) + "% of images processed")
        
    return np.array(transfer_values)

In [14]:
def save_features(np_arr, folder, model, train=True):
    """
    Saves encoded features into the .npy file.
    
    The file is named '<model>_train.npy' or '<model>_val.npy' and is written
    into `folder`, which is created (including missing parent folders) if needed.
    
    Parameters:
    -----------
    np_arr : numpy.ndarray
        The array which should be saved
        
    folder: str
        Path to the destination folder 
    
    model: str
        Name of the used CNN model
    
    train: boolean
        Flag for training dataset 
        set True if training and False if validation
    -----------
    """
    
    ### form the final filename
    suffix = '_train.npy' if train else '_val.npy'
    
    ### form the full path for the file
    full_path = os.path.join(folder, model + suffix)
    
    ### create the folder if it does not exist; unlike a swallowed os.mkdir error
    ### this also creates nested folders and still reports real failures
    ### (e.g. missing permissions). An empty folder means the current directory.
    if folder:
        os.makedirs(folder, exist_ok=True)
    
    ### save file 
    np.save(full_path, np_arr)
    
    print("Array was saved to {}".format(full_path))

In [41]:
### encode features for validation images
start = time.time()
val_transfer_values = use_pretrained_model_for_images(val_filenames_with_all_captions, VGG_transfer_model, VGG_image_size, batch_size=16)
time_val = time.time() - start

5% of images processed
10% of images processed
15% of images processed
20% of images processed
25% of images processed
30% of images processed
35% of images processed
40% of images processed
45% of images processed
50% of images processed
55% of images processed
60% of images processed
65% of images processed
70% of images processed
75% of images processed
80% of images processed
85% of images processed
90% of images processed
95% of images processed
100% of images processed


In [42]:
print('Validation dataset encoding took {:.1f} minutes'.format(time_val / 60))

Validation dataset encoding took 7.9 minutes


In [43]:
### save features for validation images
save_features(val_transfer_values, './cnn_features/', 'vgg16', train=False)

Array was saved to ./cnn_features/vgg16_val.npy


In [44]:
a = np.load('./cnn_features/vgg16_val.npy')
(a == val_transfer_values).all()

True

In [31]:
### encode features for training images
start = time.time()
train_transfer_values = use_pretrained_model_for_images(train_filenames_with_all_captions, VGG_transfer_model, VGG_image_size, batch_size=16)
time_train = time.time() - start

5% of images processed
10% of images processed
15% of images processed
20% of images processed
25% of images processed
30% of images processed
35% of images processed
40% of images processed
45% of images processed
50% of images processed
55% of images processed
60% of images processed
65% of images processed
70% of images processed
75% of images processed
80% of images processed
85% of images processed
90% of images processed
95% of images processed
100% of images processed


In [35]:
print('Training dataset encoding took {:.1f} minutes'.format(time_train / 60))

Training dataset encoding took 189.6 minutes


In [39]:
save_features(train_transfer_values, './cnn_features/', 'vgg16', train=True)

Array was saved to ./cnn_features/vgg16_train.npy


In [40]:
a = np.load('./cnn_features/vgg16_train.npy')
(a == train_transfer_values).all()

True

# Decoder

### Captions encoding

Before building the decoder, the captions have to be encoded as sequences of integer token ids, which are later fed to the embedding layer.

In [6]:
def preprocess_captions(all_captions):
    """
    Replaces every run of non-word characters (punctuation, whitespace, line
    breaks) in each caption by a single whitespace.
    
    The captions are modified in place and nothing is returned. Leading and
    trailing whitespace produced by the replacement is kept.
    
    Parameters:
    -----------
    all_captions: list
        List of lists with all the captions
    -----------
    """
    ### raw string: '\W' inside a plain literal is an invalid escape sequence,
    ### which Python already warns about; compiled once for all captions
    non_word_run = re.compile(r'\W+')
    for captions_list in all_captions:
        for i, caption in enumerate(captions_list):
            captions_list[i] = non_word_run.sub(' ', caption)

In [7]:
def add_start_and_end_to_captions(all_captions, start_str = '<SOS>', end_str = '<EOS>'):
    """
    Adds start and end of caption markers
    
    Each caption becomes '<start> caption <end>', with every run of whitespace
    collapsed to a single space and the whole string (markers included)
    converted to lower case. The captions are modified in place.
    
    Parameters:
    -----------
    all_captions: list
        List of lists with all the captions
        
    start_str: str
        Start of caption marker
        
    end_str: str
        End of caption marker
    -----------
    """
    for captions in all_captions:
        for i, caption in enumerate(captions):
            marked_caption = '{} {} {}'.format(start_str, caption, end_str)
            ### split/join collapses whitespace runs of any length; a single
            ### replace('  ', ' ') leaves double spaces behind for longer runs
            captions[i] = ' '.join(marked_caption.split()).lower()

In [8]:
### Extract captions
train_captions = coco_parse.make_list_of_captions(train_filenames_with_all_captions)
val_captions = coco_parse.make_list_of_captions(val_filenames_with_all_captions)

In [9]:
train_captions[0]

['A bicycle replica with a clock as the front wheel.',
 'The bike has a clock as a tire.',
 'A black metal bicycle with a clock inside the front wheel.',
 'A bicycle figurine in which the front wheel is replaced with a clock\n',
 'A clock with the appearance of the wheel of a bicycle ']

In [10]:
val_captions[0]

['A black Honda motorcycle parked in front of a garage.',
 'A Honda motorcycle parked in a grass driveway',
 'A black Honda motorcycle with a dark burgundy seat.',
 'Ma motorcycle parked on the gravel in front of a garage',
 'A motorcycle with its brake extended standing outside']

In [11]:
### Preprocess captions
preprocess_captions(val_captions)
preprocess_captions(train_captions)

In [12]:
val_captions[0]

['A black Honda motorcycle parked in front of a garage ',
 'A Honda motorcycle parked in a grass driveway',
 'A black Honda motorcycle with a dark burgundy seat ',
 'Ma motorcycle parked on the gravel in front of a garage',
 'A motorcycle with its brake extended standing outside']

In [13]:
train_captions[0]

['A bicycle replica with a clock as the front wheel ',
 'The bike has a clock as a tire ',
 'A black metal bicycle with a clock inside the front wheel ',
 'A bicycle figurine in which the front wheel is replaced with a clock ',
 'A clock with the appearance of the wheel of a bicycle ']

In [14]:
### Add markers of captions' starts and ends
add_start_and_end_to_captions(train_captions)
add_start_and_end_to_captions(val_captions)

In [15]:
train_captions[0]

['<sos> a bicycle replica with a clock as the front wheel <eos>',
 '<sos> the bike has a clock as a tire <eos>',
 '<sos> a black metal bicycle with a clock inside the front wheel <eos>',
 '<sos> a bicycle figurine in which the front wheel is replaced with a clock <eos>',
 '<sos> a clock with the appearance of the wheel of a bicycle <eos>']

In [16]:
val_captions[0]

['<sos> a black honda motorcycle parked in front of a garage <eos>',
 '<sos> a honda motorcycle parked in a grass driveway <eos>',
 '<sos> a black honda motorcycle with a dark burgundy seat <eos>',
 '<sos> ma motorcycle parked on the gravel in front of a garage <eos>',
 '<sos> a motorcycle with its brake extended standing outside <eos>']

In [17]:
class Vocabulary(object):
    """
    The class is used to form a vocabulary (bag-of-words)
    
    Words get consecutive integer ids in the order in which they are added.
    
    Attributes:
    -----------
    number_of_words
        current number of words in the class instance
        (also the id given to the next new word)
        
    word_to_id
        dictionary mapping words (tokens) to their ids
        
    id_to_word
        dictionary mapping ids to the corresponding words
    -----------
    """
    
    def __init__(self):
        self.number_of_words = 0
        self.word_to_id = dict()
        self.id_to_word = dict()
    
    def add_word(self, word):
        """
        Adds a word in the vocabulary; a word which is already known is ignored
        
        Parameters:
        -----------
        word : str
            The word to add
        -----------
        """
        if word not in self.word_to_id:
            self.word_to_id[word] = self.number_of_words
            self.id_to_word[self.number_of_words] = word
            self.number_of_words += 1
    
    def get_id_by_word(self, word):
        """
        Returns id for an input word
        
        Raises KeyError if the word is not in the vocabulary
        
        Parameters:
        -----------
        word : str
            The word for which id is needed
        -----------
        """
        return self.word_to_id[word]
    
    def get_word_by_id(self, idx):
        """
        Returns a word for an input id
        
        Raises KeyError if the id is not in the vocabulary
        
        Parameters:
        -----------
        idx : int
            The id for which word is needed
        -----------
        """
        return self.id_to_word[idx]
    
    def save_vocabulary(self, filename_word_to_id='word_to_id.pickle', filename_id_to_word='id_to_word.pickle',
                        folder='./vocabulary/'):
        """
        Saves vocabulary dictionaries to pickle files
        
        Parameters:
        -----------
        filename_word_to_id : str
            The filename for word_to_id dictionary
        
        filename_id_to_word : str
            The filename for id_to_word dictionary
        
        folder : str
            The destination folder, created if it does not exist
        -----------
        """
        ### exist_ok keeps an existing folder but still reports real failures,
        ### which a bare "except: pass" around os.mkdir would hide
        os.makedirs(folder, exist_ok=True)
        path_word_to_id = os.path.join(folder, filename_word_to_id)
        path_id_to_word = os.path.join(folder, filename_id_to_word)
        
        with open(path_word_to_id, 'wb') as writer:
            pickle.dump(self.word_to_id, writer)
            
        with open(path_id_to_word, 'wb') as writer:
            pickle.dump(self.id_to_word, writer)
            
    def load_vocabulary(self, path_word_to_id='./vocabulary/word_to_id.pickle', path_id_to_word='./vocabulary/id_to_word.pickle'):
        """
        Loads vocabulary dictionaries from pickle files
        
        Parameters:
        -----------
        path_word_to_id : str
            The path to file with word_to_id dictionary
        
        path_id_to_word : str
            The path to file with id_to_word dictionary
        -----------
        """
        ### NOTE: pickle can execute arbitrary code while loading;
        ### only load vocabulary files which come from a trusted source
        with open(path_word_to_id, 'rb') as reader:
            self.word_to_id = pickle.load(reader)
            
        with open(path_id_to_word, 'rb') as reader:
            self.id_to_word = pickle.load(reader)
        
        ### restore the counter, otherwise words added after loading would
        ### get ids starting from 0 again and overwrite existing entries
        self.number_of_words = len(self.word_to_id)

In [18]:
### Create vocabulary from the training captions
train_vocab = Vocabulary()
for caption_list in train_captions:
    for caption in caption_list:
        ### tokens are separated by whitespace; every distinct token becomes a vocabulary entry
        for word in caption.split():
            train_vocab.add_word(word)

In [19]:
train_vocab.save_vocabulary()

In [20]:
### Build the tokenized captions: same nesting as train_captions
### (images -> captions -> words), with every word replaced by its vocabulary ID
train_captions_tokens = []
for captions in train_captions:
    train_captions_tokens.append(
        [[train_vocab.get_id_by_word(word) for word in caption.split()]
         for caption in captions]
    )

In [21]:
train_captions_tokens[0]

[[0, 1, 2, 3, 4, 1, 5, 6, 7, 8, 9, 10],
 [0, 7, 11, 12, 1, 5, 6, 1, 13, 10],
 [0, 1, 14, 15, 2, 4, 1, 5, 16, 7, 8, 9, 10],
 [0, 1, 2, 17, 18, 19, 7, 8, 9, 20, 21, 4, 1, 5, 10],
 [0, 1, 5, 4, 7, 22, 23, 7, 9, 23, 1, 2, 10]]

In [22]:
train_captions[0]

['<sos> a bicycle replica with a clock as the front wheel <eos>',
 '<sos> the bike has a clock as a tire <eos>',
 '<sos> a black metal bicycle with a clock inside the front wheel <eos>',
 '<sos> a bicycle figurine in which the front wheel is replaced with a clock <eos>',
 '<sos> a clock with the appearance of the wheel of a bicycle <eos>']

### Batch generation

In [30]:
def generate_batch(transfer_values, captions_tokens, eos_token = 10, batch_size=32):
    """
    Generate a random batch of input-output data pairs:
        input_data = {
            transfer_values,
            input_tokens
        }
        
        output_data = {
            output_tokens
        }
        
    Images are sampled with replacement and one of the captions of each
    sampled image is picked at random.
        
     Parameters:
        -----------
        transfer_values: np.array
            Encoded images features, indexed in the same order as captions_tokens
            
        captions_tokens: list
            For every image, the list of its captions, each caption being
            a list of token ids
        
        eos_token: int
            Token id used to pad shorter captions at the end; the default 10
            is the id of '<eos>' in the vocabulary built in this notebook
        
        batch_size: int
            The number of examples in a batch
        -----------
        
     Returns:
        -----------
        (input_data, output_data): tuple of dict
            The dictionaries described above
        -----------
    """
    
    ### the number of images is taken from the argument itself; reading a
    ### notebook-level variable here limits sampling to whatever it holds
    number_of_images = len(captions_tokens)
    
    indices = np.random.randint(0, number_of_images, size=batch_size)
    
    captions_batch = []
    ### Randomly select one caption for each example index
    for ind in indices:
        image_captions = captions_tokens[ind]
        captions_batch.append(image_captions[np.random.randint(0, len(image_captions))])
    
    ### Find the largest caption length and pad the remaining to be the same size
    max_caption_size = max(len(cap) for cap in captions_batch)
    
    captions_batch_padded = pad_sequences(captions_batch, 
                                          maxlen=max_caption_size, 
                                          padding='post', 
                                          value=eos_token)
    
    ### Input tokens are the padded captions without their last position
    ### Output tokens are the same captions shifted one position to the left,
    ### so output_tokens[:, t] is the token which follows input_tokens[:, t]
    input_tokens = captions_batch_padded[:, :-1]
    output_tokens = captions_batch_padded[:, 1:]
    input_transfer_values = transfer_values[indices]
    
    input_data = {
        'transfer_values': input_transfer_values,
        'input_tokens': input_tokens
    }
    
    output_data = {
        'output_tokens': output_tokens
    }
    
    return input_data, output_data

In [28]:
transfer_values = np.load('./cnn_features/vgg16_train.npy')

In [32]:
inp, out = generate_batch(transfer_values, train_captions_tokens)

In [34]:
inp['transfer_values'].shape

(32, 4096)

In [35]:
inp['input_tokens'].shape

(32, 18)

In [37]:
out['output_tokens'].shape

(32, 18)

In [38]:
print(inp['input_tokens'][0])
print(out['output_tokens'][0])

[ 0  1 69 23 56 51 18  1 70 58 57 10 10 10 10 10 10 10]
[ 1 69 23 56 51 18  1 70 58 57 10 10 10 10 10 10 10 10]


In [42]:
print(" ".join([train_vocab.get_word_by_id(x) for x in inp['input_tokens'][0]]))
print(" ".join([train_vocab.get_word_by_id(x) for x in out['output_tokens'][0]]))

<sos> a couple of cars parked in a busy street sidewalk <eos> <eos> <eos> <eos> <eos> <eos> <eos>
a couple of cars parked in a busy street sidewalk <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos>
