In [1]:
from PIL import Image
import numpy as np
import coco_parse
from matplotlib.pyplot import imshow
import os
%matplotlib inline

In [2]:
from keras import Model
from keras.layers import Input, Dense, LSTM, Embedding
from keras.applications import VGG16
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.image import grayscale_to_rgb

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Locations of the COCO annotations and images. The original machine-specific
# locations stay as defaults, but each can be overridden through an environment
# variable so the notebook is not tied to a single workstation.
captions_path = os.environ.get('COCO_CAPTIONS_PATH', 'D:/coco/annotations/')
images_path = os.environ.get('COCO_IMAGES_PATH', 'D:/coco/images/')

# parse JSON file with captions to get paths to images with captions
# (train=False presumably selects the validation split -- confirm in coco_parse)
filenames_with_captions = coco_parse.get_image_filename_with_caption(captions_path, images_path,
                                                                     train=False)
# regroup the parsed result with coco_parse.get_image_with_all_captions; the result
# is later used as a mapping whose keys are image paths
filenames_with_all_captions = coco_parse.get_image_with_all_captions(filenames_with_captions)

In [4]:
### sanity check: number of entries in the parsed mapping
len(filenames_with_all_captions)

5000

In [5]:
def image_preprocessing(image_path, new_size):
    """
    Read an image from disk and prepare it as network input.

    Preprocessing steps:
    - conversion to 3-channel RGB (grayscale, palette, RGBA, CMYK, ... inputs
      all end up with exactly three channels)
    - resizing to new_size with the LANCZOS filter
    - rescaling pixel values to [0, 1]

    Args:
        image_path: path of the image file to read.
        new_size: target size passed to PIL's resize, i.e. (width, height).

    Returns:
        Float numpy array of shape (new_size[1], new_size[0], 3).
    """
    # NOTE(review): values are scaled to [0, 1]; this is not the same transform as
    # keras.applications.vgg16.preprocess_input -- confirm this is intended.

    # The context manager closes the underlying file even if decoding fails.
    with Image.open(image_path) as source:
        # Convert before resizing so palette images are interpolated on real
        # colours and every mode yields exactly three channels.
        resized = source.convert('RGB').resize(new_size, Image.LANCZOS)
        pixels = np.array(resized)
    return np.divide(pixels, 255)

In [6]:
### Create instance of the VGG16 model pretrained on imagenet.
### include_top=True keeps the fully-connected layers, which is needed because the
### 'fc2' layer is looked up on this model further below.
VGG_model = VGG16(include_top=True, weights='imagenet')

In [7]:
### print the layer-by-layer structure of the loaded model
VGG_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [8]:
### Alter the initial VGG model by dropping the last layer, which produces the predictions.
### The output of the second fully-connected layer ('fc2') is used as the output of the encoder part.
transfer_layer = VGG_model.get_layer('fc2')
VGG_transfer_model = Model(inputs=VGG_model.input, outputs=transfer_layer.output)

In [9]:
### get the size of the images used for VGG16 to resize COCO images.
### The shape is read from the model itself rather than by looking the input layer up
### under its auto-generated name ('input_1'): that name changes (input_2, ...) whenever
### the model is built again in the same session, which made this cell fail on re-runs.
input_layer = VGG_model.layers[0]
### model input shape is (batch, height, width, channels); keep (height, width)
VGG_image_size = VGG_model.input_shape[1:3]

In [10]:
def use_pretrained_model_for_images(filenames_with_all_captions, transfer_model, img_size, batch_size=64):
    """
    Function uses the pretrained model without prediction layer to encode the images into the set
    of the features.

    Args:
        filenames_with_all_captions: mapping whose keys are image paths; only the
            keys are used, in iteration order.
        transfer_model: object exposing predict(batch), called once per batch.
        img_size: size handed to image_preprocessing for every image.
        batch_size: number of images per predict call; must be >= 1.

    Returns:
        numpy array with one row of features per image, in the order of the keys.
        An empty array is returned when there are no images.

    Raises:
        ValueError: if batch_size is smaller than 1 (a non-positive step would
            otherwise never advance through the images).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got {}".format(batch_size))

    ### get the paths to all images without captions
    image_paths = list(filenames_with_all_captions.keys())
    num_images = len(image_paths)

    ### progress is reported in steps of 5% of the images actually processed
    progress_step = 5
    reported_progress = 0

    ### list for the final result
    transfer_values = []

    ### loop through the images batch by batch; slicing clips the last batch
    for first_i in range(0, num_images, batch_size):
        batch_paths = image_paths[first_i:first_i + batch_size]

        ### preprocess every image of the batch
        image_batch = [image_preprocessing(path, img_size) for path in batch_paths]

        ### run the model to encode the features
        preds = transfer_model.predict(np.array(image_batch))

        ### append predictions from the batch to the final list
        transfer_values.extend(preds)

        ### progress print: derived from the number of finished images, so it is
        ### emitted for any dataset size and never exceeds 100%
        processed = first_i + len(batch_paths)
        percent = processed * 100 // num_images
        milestone = percent - percent % progress_step
        if milestone > reported_progress:
            reported_progress = milestone
            print(str(reported_progress) + "% of images processed")

    return np.array(transfer_values)

In [14]:
def save_features(np_arr, folder, model, train=True):
    """
    Function for saving encoded features into the .npy file.

    Args:
        np_arr: array to store.
        folder: target directory; created (including missing parents) if needed.
        model: model name used as the filename prefix.
        train: if True the file is named '<model>_train.npy', otherwise
            '<model>_val.npy'.

    Raises:
        OSError: if the folder cannot be created or the file cannot be written.
    """

    ### form the final filename
    suffix = '_train.npy' if train else '_val.npy'
    filename = model + suffix

    ### form the full path for the file
    full_path = os.path.join(folder, filename)

    ### create the folder if it does not exist; an already existing folder is fine,
    ### while real failures (e.g. missing permissions) are no longer hidden.
    ### An empty folder means "current directory", so there is nothing to create.
    if folder:
        os.makedirs(folder, exist_ok=True)

    ### save file
    np.save(full_path, np_arr)

    print("Array was saved to {}".format(full_path))

In [12]:
### run every validation image through the truncated VGG16 to obtain its feature vector
val_transfer_values = use_pretrained_model_for_images(
    filenames_with_all_captions=filenames_with_all_captions,
    transfer_model=VGG_transfer_model,
    img_size=VGG_image_size,
    batch_size=16,
)

5% of images processed
10% of images processed
15% of images processed
20% of images processed
25% of images processed
30% of images processed
35% of images processed
40% of images processed
45% of images processed
50% of images processed
55% of images processed
60% of images processed
65% of images processed
70% of images processed
75% of images processed
80% of images processed
85% of images processed
90% of images processed
95% of images processed
100% of images processed


In [15]:
### persist the validation features to disk as a .npy file
save_features(
    np_arr=val_transfer_values,
    folder='./cnn_features/',
    model='vgg16',
    train=False,
)

Array was saved to ./cnn_features/vgg16_val.npy
