In [1]:
import numpy as np 
import pandas as pd 
import os 
import pickle
import matplotlib.pyplot as plt
from PIL import Image
import skimage
from keras.preprocessing import image
from keras.applications import VGG16
from keras.models import Model
from keras import backend
import json
from pycocotools.coco import COCO
import skimage.io as io
import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (8.0, 10.0)

Using TensorFlow backend.


In [2]:
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.applications import VGG16
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [5]:
def load_data(filename):
    """Load an image-id -> captions table from ``./Dataset/<filename>.json``.

    The file is parsed with ``pandas.read_json``; every column label is used
    as an image id and the column itself as that image's captions.

    Args:
        filename: Base name of the JSON file inside ``./Dataset`` (without
            the ``.json`` suffix).

    Returns:
        A tuple ``(image_id_list, caption_list)``: the DataFrame's column
        index, and a list with one column (pandas Series) per image id in
        the same order.
    """
    data = pd.read_json('./Dataset/'+filename+'.json')
    image_id_list = data.columns
    caption_list = [data[image_id] for image_id in image_id_list]

    # An empty file has no first entry to measure; report 0 rather than
    # failing with IndexError.
    captions_per_image = len(caption_list[0]) if caption_list else 0
    print("Data Size - ", len(image_id_list), "\nCaptions Per Image - ", captions_per_image)

    return image_id_list, caption_list

In [6]:
image_list_train, caption_list_train = load_data('train_data')

Data Size -  118287 
Captions Per Image -  5


In [39]:
image_list_val, caption_list_val = load_data('val_data')

Data Size -  5000 
Captions Per Image -  5


In [40]:
# Full VGG16 including its dense "top" layers, initialised with the
# 'imagenet' weights; later cells read a dense layer from this model.
vgg16_model = VGG16(include_top = True, weights = 'imagenet')
vgg16_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [41]:
# Handle on the layer named 'fc2'; its output is used as the image feature vector.
dense_output = vgg16_model.get_layer('fc2')
dense_output

<keras.layers.core.Dense at 0x7f2d5e2b5978>

In [42]:
# Truncated model: same input as VGG16, but stops at the 'fc2' layer output.
custom_vgg16 = Model(inputs = vgg16_model.input, outputs = dense_output.output)

In [43]:
def load_image(path, size = None):
    """Load an image file as a three-channel array with values divided by 255.

    Args:
        path: Path of the image file, opened with PIL.
        size: Optional target size passed to ``skimage.transform.resize``
            (e.g. ``(height, width)``). When ``None`` the image keeps its
            original resolution.

    Returns:
        A numpy array with three colour channels in the last axis.
    """
    # Local import: a bare ``import skimage`` does not guarantee that the
    # ``transform`` submodule has been loaded.
    import skimage.transform

    # PIL opens files lazily and keeps the handle open; the context manager
    # releases it once ``convert`` has read the pixels. Converting to RGB
    # gives grayscale, palette and RGBA inputs three channels on both the
    # resized and the unresized path.
    with Image.open(path) as handle:
        img = handle.convert('RGB')

    if size is not None:
        img = image.img_to_array(img)
        img = skimage.transform.resize(img, size)
    img = np.array(img)
    img = img / 255.0

    return img

In [44]:
def show_image(idx, train, image_id_list, caption_list):
    """Print the captions of one image, then display the image itself.

    Args:
        idx: Position of the image in ``image_id_list`` / ``caption_list``.
        train: Selects the image directory under the current working
            directory: ``Dataset/val_2017`` when truthy, ``Dataset/Test_Image``
            otherwise.
        image_id_list: Sequence of image ids; the file opened is ``<id>.jpg``.
        caption_list: Sequence of caption collections, parallel to
            ``image_id_list``.
    """
    cwd = os.getcwd()

    # NOTE(review): the "train" branch points at val_2017, and the file name
    # is not zero-padded the way process_images builds it -- confirm both
    # against the on-disk layout.
    if train:
        dir_p = cwd+'/Dataset/val_2017/'
    else:
        dir_p = cwd+'/Dataset/Test_Image/'

    # The lookup is identical for both modes, so it lives outside the branch
    # and always goes through the ``image_id_list`` parameter.
    filename = image_id_list[idx]
    captions = caption_list[idx]
    path = os.path.join(dir_p, str(filename)+'.jpg')

    for caption in captions:
        print(caption)

    img = load_image(path)
    plt.imshow(img)
    plt.show()

In [73]:
def process_images(data_dir, image_id_list, batch_size):
    """Run the listed images through ``custom_vgg16`` and collect the outputs.

    Images are read with ``load_image`` from ``data_dir`` using file names of
    the form ``<id zero-padded to 12 digits>.jpg`` and pushed through the
    model in batches.

    Args:
        data_dir: Directory containing the image files.
        image_id_list: Sliceable sequence of image ids.
        batch_size: Number of images sent to the model per ``predict`` call.

    Returns:
        A float16 numpy array of shape ``(len(image_id_list), dense_size)``
        where ``dense_size`` is taken from ``dense_output``; row ``k`` holds
        the model output for ``image_id_list[k]``.
    """
    print(data_dir)

    num_of_images = len(image_id_list)
    image_size = backend.int_shape(vgg16_model.input)[1:3]
    dense_output_size = backend.int_shape(dense_output.output)[1]

    # One reusable input buffer; only its first ``current_batch_size`` rows
    # are fed to the model on the (possibly shorter) last batch.
    image_shape = (batch_size,) + image_size + (3,)
    print(image_shape)
    image_batch = np.zeros(shape = image_shape, dtype = np.float16)

    dense_val_shape = (num_of_images, dense_output_size)
    dense_values = np.zeros(shape = dense_val_shape, dtype = np.float16)

    # NOTE(review): load_image only divides pixels by 255; the ImageNet VGG16
    # weights are normally paired with Keras' ``preprocess_input`` -- confirm
    # this scaling is intended before regenerating caches.
    for start_index in range(0, num_of_images, batch_size):
        print(start_index, end = "\r")
        end_index = min(start_index + batch_size, num_of_images)
        current_batch_size = end_index - start_index

        for i, image_id in enumerate(image_id_list[start_index:end_index]):
            # zfill pads to 12 digits and leaves longer ids untouched.
            file_name = str(image_id).zfill(12) + '.jpg'
            path = os.path.join(data_dir, file_name)
            image_batch[i] = load_image(path, size = image_size)

        dense_values_batch = custom_vgg16.predict(image_batch[0:current_batch_size])

        dense_values[start_index:end_index] = dense_values_batch[0:current_batch_size]

    return dense_values

In [8]:
def process_train():
    """Return the dense-layer outputs for the training images, with caching.

    The result of ``process_images`` over ``Dataset/train_2017`` is pickled to
    ``Cache/dense_values_train.pkl`` (both relative to the current working
    directory) and read back from there on later calls.

    Returns:
        Whatever ``process_images`` returned, either freshly computed or
        loaded from the cache file.
    """
    pwd = os.getcwd()
    train_path = pwd + '/Dataset/train_2017/'
    cache_path = pwd + '/Cache/dense_values_train.pkl'

    if os.path.exists(cache_path):
        # NOTE: unpickling can execute code; only load cache files that this
        # notebook wrote itself.
        with open(cache_path, 'rb') as file:
            return pickle.load(file)

    # Create the cache folder *before* the expensive pass so the result is
    # not lost to a missing directory when it is written.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    obj = process_images(train_path, image_list_train, 32)
    with open(cache_path, 'wb') as file:
        pickle.dump(obj, file)

    return obj

In [9]:
def process_val():
    """Return the dense-layer outputs for the validation images, with caching.

    The result of ``process_images`` over ``Dataset/val_2017`` is pickled to
    ``Cache/dense_values_val.pkl`` (both relative to the current working
    directory) and read back from there on later calls.

    Returns:
        Whatever ``process_images`` returned, either freshly computed or
        loaded from the cache file.
    """
    pwd = os.getcwd()
    val_path = pwd + '/Dataset/val_2017/'
    cache_path = pwd + '/Cache/dense_values_val.pkl'

    if os.path.exists(cache_path):
        # NOTE: unpickling can execute code; only load cache files that this
        # notebook wrote itself.
        with open(cache_path, 'rb') as file:
            return pickle.load(file)

    # Create the cache folder *before* the expensive pass so the result is
    # not lost to a missing directory when it is written.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    obj = process_images(val_path, image_list_val, 32)
    with open(cache_path, 'wb') as file:
        pickle.dump(obj, file)

    return obj

In [77]:
dense_values_val = process_val()
print("Shape - ", dense_values_val.shape)

/home/kshitij/Desktop/Sem2/SMAI/Project/captioning-images/Dataset/val_2017/
(32, 224, 224, 3)
Shape -  (5000, 4096)


In [3]:
# Read the cached dense-layer features straight from the pickle file.
# NOTE(review): the file opened here is the *validation* cache
# (dense_values_val.pkl) although the result is bound to
# transfer_values_train -- confirm this is intentional.
pwd = os.getcwd()
cache_path = pwd + '/Cache/dense_values_val.pkl'
transfer_values_train = None
with open(cache_path, 'rb') as file:
    transfer_values_train = pickle.load(file)

In [4]:
def load(train=True):
    """Read a COCO-style captions file and group the captions per image.

    Args:
        train: When truthy read ``Dataset/captions_train2017.json``,
            otherwise ``Dataset/captions_val2017.json`` (paths relative to
            the current working directory).

    Returns:
        Three parallel tuples ``(ids, filenames, captions)`` ordered by
        ascending image id; ``captions[k]`` is the list of caption strings
        for ``ids[k]`` in the order they appear in the file.
    """
    path = 'Dataset/captions_train2017.json' if train else 'Dataset/captions_val2017.json'

    with open(path, "r", encoding="utf-8") as file:
        data_raw = json.load(file)

    # One record per image id, created up front so every annotation below
    # has a list to append to.
    records = {
        entry['id']: {'filename': entry['file_name'], 'captions': []}
        for entry in data_raw['images']
    }

    for annotation in data_raw['annotations']:
        records[annotation['image_id']]['captions'].append(annotation['caption'])

    records_list = [
        (image_id, records[image_id]['filename'], records[image_id]['captions'])
        for image_id in sorted(records)
    ]

    # Transpose the list of triples into three parallel tuples.
    ids, filenames, captions = zip(*records_list)

    return ids, filenames, captions

In [5]:
# NOTE(review): train=False makes load() read captions_val2017.json, so the
# *_train names below hold validation data -- confirm this is intended.
_, filenames_train, captions_train = load(train=False)

In [6]:
# Marker words wrapped around every caption by mark_captions; the
# surrounding space keeps each marker a separate word.
mark_start = 'ssss '
mark_end = ' eeee'

In [7]:
def mark_captions(captions_listlist):
    """Wrap every caption in the ``mark_start`` / ``mark_end`` markers.

    Args:
        captions_listlist: Iterable of caption lists (one list per image).

    Returns:
        A new list of lists with the same nesting, where each caption has
        ``mark_start`` prepended and ``mark_end`` appended.
    """
    captions_marked = []
    for captions_list in captions_listlist:
        marked = []
        for caption in captions_list:
            marked.append(mark_start + caption + mark_end)
        captions_marked.append(marked)

    return captions_marked

In [8]:
captions_train_marked = mark_captions(captions_train)
captions_train_marked[0]

['ssss A woman stands in the dining area at the table. eeee',
 'ssss A room with chairs, a table, and a woman in it. eeee',
 'ssss A woman standing in a kitchen by a window eeee',
 'ssss A person standing at a table in a room. eeee',
 'ssss A living area with a television and a table eeee']

In [9]:
captions_train[0]

['A woman stands in the dining area at the table.',
 'A room with chairs, a table, and a woman in it.',
 'A woman standing in a kitchen by a window',
 'A person standing at a table in a room.',
 'A living area with a television and a table']

In [10]:
def flatten(captions_listlist):
    """Concatenate a list of caption lists into one flat list.

    Args:
        captions_listlist: Iterable of caption lists.

    Returns:
        A single list with every caption, in the original order.
    """
    flat = []
    for group in captions_listlist:
        flat.extend(group)

    return flat

In [11]:
captions_train_flat = flatten(captions_train_marked)

In [12]:
num_words = 10000

In [13]:
class TokenizerWrap(Tokenizer):
    """``Tokenizer`` that is fitted on construction and adds reverse lookups."""

    def __init__(self, texts, num_words=None):
        """Fit the tokenizer on ``texts`` and build the index -> word table.

        Args:
            texts: Texts passed to ``fit_on_texts``.
            num_words: Forwarded to the ``Tokenizer`` constructor.
        """
        super().__init__(num_words=num_words)
        self.fit_on_texts(texts)

        # Inverse of ``word_index``: integer token -> word.
        self.index_to_word = {index: word for word, index in self.word_index.items()}

    def token_to_word(self, token):
        """Return the word for ``token``; token 0 maps to a single space."""
        if token == 0:
            return " "
        return self.index_to_word[token]

    def tokens_to_string(self, tokens):
        """Join the words of all non-zero ``tokens`` with single spaces."""
        return " ".join(self.index_to_word[token] for token in tokens if token != 0)

    def captions_to_tokens(self, captions_listlist):
        """Convert each list of captions with ``texts_to_sequences``.

        Returns a list with one ``texts_to_sequences`` result per input list.
        """
        token_lists = []
        for captions_list in captions_listlist:
            token_lists.append(self.texts_to_sequences(captions_list))

        return token_lists

In [14]:
tokenizer = TokenizerWrap(texts=captions_train_flat, num_words=num_words)

In [15]:
token_start = tokenizer.word_index[mark_start.strip()]
token_start

2

In [16]:
token_end = tokenizer.word_index[mark_end.strip()]
token_end

3

In [17]:
tokens_train = tokenizer.captions_to_tokens(captions_train_marked)

In [18]:
tokens_train[0]

[[2, 1, 22, 171, 7, 6, 444, 106, 18, 6, 24, 3],
 [2, 1, 45, 8, 295, 1, 24, 9, 1, 22, 7, 27, 3],
 [2, 1, 22, 16, 7, 1, 68, 50, 1, 132, 3],
 [2, 1, 30, 16, 18, 1, 24, 7, 1, 45, 3],
 [2, 1, 119, 106, 8, 1, 288, 9, 1, 24, 3]]

In [19]:
captions_train_marked[0]

['ssss A woman stands in the dining area at the table. eeee',
 'ssss A room with chairs, a table, and a woman in it. eeee',
 'ssss A woman standing in a kitchen by a window eeee',
 'ssss A person standing at a table in a room. eeee',
 'ssss A living area with a television and a table eeee']

In [20]:
def get_random_caption_tokens(idx):
    """Pick one random tokenised caption for each given image index.

    Args:
        idx: Iterable of indices into the module-level ``tokens_train``.

    Returns:
        A list with one token sequence per entry of ``idx``, each chosen
        with ``np.random.choice`` among that image's captions.
    """
    def pick_one(image_index):
        # Draw a position first, then look the caption up.
        position = np.random.choice(len(tokens_train[image_index]))
        return tokens_train[image_index][position]

    return [pick_one(image_index) for image_index in idx]

In [21]:
# Exclusive upper bound for the random image indices drawn in batch_generator.
# NOTE(review): hard-coded; presumably must equal the number of rows in
# transfer_values_train -- verify when switching datasets.
num_images_train = 5000

def batch_generator(batch_size):
    """Endlessly yield random training batches for the decoder.

    Each batch draws ``batch_size`` random image indices below
    ``num_images_train``, takes the matching rows of ``transfer_values_train``
    and one random tokenised caption per image, and pads the captions with
    trailing zeros to the longest caption in the batch.

    Args:
        batch_size: Number of image/caption pairs per batch.

    Yields:
        ``(x_data, y_data)`` dicts: ``x_data`` has ``'decoder_input'`` (the
        padded tokens without the last column) and
        ``'transfer_values_input'``; ``y_data`` has ``'decoder_output'`` (the
        padded tokens without the first column).
    """
    while True:
        picked = np.random.randint(num_images_train, size=batch_size)

        image_features = transfer_values_train[picked]
        caption_tokens = get_random_caption_tokens(picked)

        # Pad to the longest caption in this batch only.
        longest = max(len(sequence) for sequence in caption_tokens)
        padded = pad_sequences(caption_tokens, maxlen=longest, padding='post', truncating='post')

        # The target is the input shifted one step to the left.
        yield (
            {
                'decoder_input': padded[:, 0:-1],
                'transfer_values_input': image_features,
            },
            {
                'decoder_output': padded[:, 1:],
            },
        )

In [22]:
batch_size = 512

In [23]:
generator = batch_generator(batch_size=batch_size)

In [24]:
batch = next(generator)
batch_x = batch[0]
batch_y = batch[1]

In [25]:
batch_x['transfer_values_input'][0]

array([0.    , 0.    , 1.356 , ..., 0.    , 0.    , 0.7905], dtype=float16)

In [26]:
batch_x['decoder_input'][0]

array([   2,   73,   83,   19,  342,    7,   56,  814,    9,   19, 1004,
         72,    1, 1027,    3,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [27]:
batch_y['decoder_output'][0]

array([  73,   83,   19,  342,    7,   56,  814,    9,   19, 1004,   72,
          1, 1027,    3,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [28]:
num_captions_train = [len(captions) for captions in captions_train]

In [29]:
total_num_captions_train = np.sum(num_captions_train)

In [30]:
steps_per_epoch = int(total_num_captions_train / batch_size)
steps_per_epoch

48

In [31]:
state_size = 512

embedding_size = 128

In [32]:
# Length of the image feature vector fed into the decoder.
transfer_values_size = 4096

transfer_values_input = Input(shape=(transfer_values_size,), name='transfer_values_input')

# Maps the image features to a vector of size state_size; connect_decoder
# uses the result as the initial state of every GRU layer.
decoder_transfer_map = Dense(state_size, activation='tanh', name='decoder_transfer_map')

# Token sequences of arbitrary length (shape=(None,)).
decoder_input = Input(shape=(None, ), name='decoder_input')

decoder_embedding = Embedding(input_dim=num_words, output_dim=embedding_size, name='decoder_embedding')

# All three GRUs return the full sequence so they can be stacked.
decoder_gru1 = GRU(state_size, name='decoder_gru1', return_sequences=True)
decoder_gru2 = GRU(state_size, name='decoder_gru2', return_sequences=True)
decoder_gru3 = GRU(state_size, name='decoder_gru3', return_sequences=True)

# Linear activation: this layer outputs one raw score per vocabulary word.
# Its name matches the 'decoder_output' key produced by batch_generator.
decoder_dense = Dense(num_words, activation='linear', name='decoder_output')

In [33]:
def connect_decoder(transfer_values):
    """Wire the decoder layers together and return the output tensor.

    Args:
        transfer_values: Tensor passed through ``decoder_transfer_map`` to
            produce the initial state shared by all three GRU layers.

    Returns:
        The output of ``decoder_dense`` applied to the stacked GRU output
        computed from ``decoder_input``.
    """
    initial_state = decoder_transfer_map(transfer_values)

    net = decoder_embedding(decoder_input)

    # Every GRU layer is seeded with the same image-derived state.
    for gru_layer in (decoder_gru1, decoder_gru2, decoder_gru3):
        net = gru_layer(net, initial_state=initial_state)

    return decoder_dense(net)

In [34]:
decoder_output = connect_decoder(transfer_values=transfer_values_input)

Instructions for updating:
Colocations handled automatically by placer.


In [35]:
decoder_model = Model(inputs=[transfer_values_input, decoder_input], outputs=[decoder_output])

In [36]:
def sparse_cross_entropy(y_true, y_pred):
    """Mean sparse softmax cross-entropy between integer labels and logits.

    Args:
        y_true: Passed as ``labels`` to
            ``tf.nn.sparse_softmax_cross_entropy_with_logits``.
        y_pred: Passed as ``logits`` to the same function.

    Returns:
        ``tf.reduce_mean`` of the cross-entropy values.
    """
    per_token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred)

    return tf.reduce_mean(per_token_loss)

In [38]:
# NOTE(review): sparse_cross_entropy (defined in an earlier cell) relies on
# this `tf` name, so the import should live in the top import cell to
# survive Restart & Run All.
import tensorflow as tf

optimizer = RMSprop(lr=1e-3)

# Integer target tensor of shape (batch, time) handed to compile() via
# target_tensors.
# NOTE(review): tf.placeholder looks like the TensorFlow 1.x graph API --
# confirm the pinned TensorFlow version.
decoder_target = tf.placeholder(dtype='int32', shape=(None, None))

decoder_model.compile(optimizer=optimizer, loss=sparse_cross_entropy, target_tensors=[decoder_target])

In [39]:
decoder_model.fit_generator(generator=generator, steps_per_epoch=steps_per_epoch, epochs=20)

Epoch 1/20

KeyboardInterrupt: 