## Assignment 3.2. Image Caption Generation

## Task 2.1: Encoder Decoder Model

Build an image caption generator model, as described in Vinyals, Oriol, et al. "Show and tell: A neural image caption generator." Proceedings of the IEEE conference on computer vision and pattern recognition. 2015. The model shall consist of:

- Image encoder (image feature extractor)
- Caption generator (RNN-based)



In [0]:
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib notebook

import string
import os
from PIL import Image
import glob
from pickle import dump, load
from time import time
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, GRU, Embedding, TimeDistributed, Dense, RepeatVector, Add, Lambda,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import add, dot
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras import Input, layers
from keras import optimizers
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

import keras.backend as K

Using TensorFlow backend.


In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
abs_path = '/content/drive/My Drive/TUE -EIT (me)/Recommender Systems/Assignment 3 - Sequential networks/'
files_path = abs_path + 'Assignment_3_Notebooks_submission/pickel_files/'

### Upload Data


In [0]:
import _pickle as cPickle

def read_pickle(data_path, file_name):

    f = open(os.path.join(data_path, file_name), 'rb')
    read_file = cPickle.load(f)
    f.close()

    return read_file
  

def read_npy(data_path, file_name):

    f = open(os.path.join(data_path, file_name), 'rb')
    read_file = np.load(f)
    f.close()

    return read_file

# Import Images Pickel Files
train_features = read_pickle(files_path, 'encoded_train_images.pkl')
dev_features = read_pickle(files_path, 'encoded_dev_images.pkl')
test_features = read_pickle(files_path, 'encoded_test_images.pkl')
 
# Import Caption Files (descriptions)
train_descriptions = read_pickle(files_path, 'train_descriptions.pkl')
dev_descriptions = read_pickle(files_path, 'dev_descriptions.pkl')
test_descriptions = read_pickle(files_path, 'test_descriptions.pkl')

# Import Wordtoix dictionary
wordtoix = read_pickle(files_path, 'wordtoix.pkl')

# Import Embedded_matrix
embedding_matrix = read_npy(files_path, 'embedding_matrix.npy')

## Model

In [0]:
K.clear_session()

In [0]:
# model variables
rnn_dim = 300
vocab_size = 1652
max_length = 34

### Image Encoder

In [0]:
# image input
image_in = Input(shape=(2048,), name='image_inputs')

In [0]:
# image encoder
fe1 = Dropout(0.5, name='dropout_img_feats')(image_in)
image_dense = Dense(rnn_dim, activation='relu', name = 'dense_img_feats')
fe2 = image_dense(fe1) # reduce the dimension with FC projection

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


### Caption Generator

In [0]:
# caption input
cap_in = Input(shape=(None,),name='caption_inputs') 

# caption embedding representation (word-based embedding)
embed_cap = Embedding(vocab_size, rnn_dim, weights = [embedding_matrix], trainable = False)
embed_cap_out = embed_cap(cap_in)
drop_cap = Dropout(0.5)
se2 = drop_cap(embed_cap_out)


## encoder
lstm_encoder = LSTM(rnn_dim,  name='lstm_encoder')
# lstm_encoder = GRU(rnn_dim, return_state=True, name='lstm_encoder')
se3 = lstm_encoder(se2)


# state input for each decoder time step
s0 = Input(shape=(rnn_dim,), name='s0') # with a dimension of (, rnn_dim)
s = [s0]


# layers initialization
# LSTM/GRU decoder as caption generator
decoder = GRU(rnn_dim, return_state=True)
decoder_dense = Dense(rnn_dim, activation='relu')


# Prediction layer with softmax activation
pred_layer = Dense(vocab_size, activation='softmax')

In [0]:
probs = []

# process the training per time step (following the max length of captions)
for t in range(max_length):
    
    ### YOUR CODE HERE
    caption_dec = Lambda(lambda x: x[:,t,:], name='dec_embedding-%s'%t)(se2)
    caption_dec = Reshape((1, rnn_dim))(caption_dec)
    image_dec = Reshape((1, rnn_dim))(fe2)
    
    ### ... HOW DO YOU REPRESENT JOINT-REPRESENTATION OF IMAGE-CAPTION AS DECODER INPUT? 
    context_concat = dot([caption_dec, image_dec], axes=1, normalize=False)
    #context_concat = concatenate([x_dec, enc_out_reshape],axis=-1)

    
    ### ... HOW DO YOU INITIALIZE THE RNN-BASED DECODER STATE IN TIME STEP=0? 
    
    if t == 0:
      s = se3
        
    ### ... WHAT IS THE INPUT OF THE DECODER? 
    
    
    s, _ = decoder(context_concat, initial_state = s)
    
    # softmax probability output
    prob = pred_layer(s)
    
    probs.append(prob)
    s = [s]

### The model shall be constructed based on the following inputs

In [0]:
# Construct the model
#model = Model(inputs=[image_in, cap_in], outputs=probs)
model = Model(inputs=[image_in, cap_in, s0], outputs=probs)
# Compile & run training
adam = optimizers.Adam(lr=0.001)
model.compile(optimizer=adam, loss='categorical_crossentropy')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
caption_inputs (InputLayer)     (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    495600      caption_inputs[0][0]             
__________________________________________________________________________________________________
image_inputs (InputLayer)       (None, 2048)         0                                            
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, None, 300)    0           embedding_1[0][0]                
__________________________________________________________________________________________________
dropout_im

### Training

a) Following MT2 example


In [0]:
import tensorflow as tf
from keras.utils import to_categorical

class Dataiterator():

    def __init__(self, feature_in, caption_in, vocab_size=vocab_size, seq_length=max_length, decoder_dim=rnn_dim, batch_size=32):
        
        self.feature_in = feature_in
        self.caption_in = caption_in
#         self.y_out = y_out
        self.states = np.zeros((len(feature_in), decoder_dim))
        self.y_out = np.zeros((len(feature_in), vocab_size))
        self.num_data = len(feature_in) 
        self.vocab_size = vocab_size
        self.batch_size = batch_size 
        self.seq_length = seq_length
        self.reset() # initial: shuffling examples and set index to 0
        
    
    def onehotencoding(self, data):
      
            
        return to_categorical(data, num_classes=self.vocab_size, dtype='int32')
    
    def __iter__(self): # iterates data
        
        return self


    def reset(self): # initials
        
        self.idx = 0
        self.order = np.random.permutation(self.num_data) # shuffling examples by providing randomized ids 
        
    def __next__(self): # return model inputs - outputs per batch
        
        X_ids = [] # hold ids per batch 

        while len(X_ids) < self.batch_size:

            X_id = self.order[self.idx] # copy random id from initial shuffling
            X_ids.append(X_id)
            #print(X_ids)

            self.idx += 1 # 
            if self.idx >= self.num_data: # exception if all examples of data have been seen (iterated)
                self.reset()
                raise StopIteration()
    
        batch_feature_in = self.feature_in[np.array(X_ids)] # X values (encoder input) per batch FEATURES
        batch_caption_in = self.caption_in[np.array(X_ids)] # y_in values (decoder input) per batch
        batch_y_out = self.y_out[np.array(X_ids)]
        batch_states = self.states[np.array(X_ids)] # state values (decoder state input) per batch
        batch_y = self.onehotencoding(batch_y_out)
        
      
        
        return batch_feature_in, batch_caption_in, batch_states, list(batch_y.swapaxes(0,1))

    # return all data examples 
    def all(self):
      
        y = self.onehotencoding(self.y_out)
        
        return self.feature_in, self.caption_in, self.states, list(y.swapaxes(0,1))


In [0]:


batch_size = 32

train_steps_epoch = len(train_features)/batch_size
print("train_steps_epoch : %s" %train_steps_epoch)
batch_train_iter = Dataiterator(train_features, train_descriptions)

dev_steps_epoch = len(dev_features)/batch_size
print("dev_steps_epoch : %s" %dev_steps_epoch)
batch_dev_iter = Dataiterator(dev_features, dev_descriptions)

In [0]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

def train_generator(model, batch_train_iter, batch_val_iter):
    
    earlystop_callbacks = [EarlyStopping(monitor='val_loss', patience=10),
                     ModelCheckpoint(filepath=os.path.join('./','{epoch:02d}-{loss:.2f}.check'), \
                                     monitor='val_loss', save_best_only=False, \
                                     save_weights_only=True)
                     ]
    
    def train_gen():
        while True:
            train_batches = [[[X, y_in, state], y_out] for X, y_in, \
                             state, y_out in batch_train_iter]
            for train_batch in train_batches:
                yield train_batch
                
    def val_gen():
        while True:
            val_batches = [[[X, y_in, state], y_out] for X, y_in, \
                           state, y_out in batch_val_iter]
            for val_batch in val_batches:
                yield val_batch
                
    history = model.fit_generator(train_gen(), validation_data=val_gen(), \
                                  validation_steps=dev_steps_epoch, steps_per_epoch=train_steps_epoch, \
                                  epochs = 20, callbacks = earlystop_callbacks)
      

In [0]:
# train_generator(model, batch_train_iter, batch_dev_iter)

Instructions for updating:
Use tf.cast instead.
Epoch 1/20


Trying this approachwe ran our of memory and the kernel crashes.

b)  Following the approach of Image-Caption generation example

In [0]:
# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch, rnn_dim):
  
    X1, X2, y, hidden = list(), list(), list(), list()
    n=0
    # loop for ever over images
    while 1:
        for key, desc_list in descriptions.items():
            n+=1
            # retrieve the photo feature
            photo = photos[key+'.jpg']
            for desc in desc_list:
                # encode the sequence
                seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
                # split one sequence into multiple X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pair
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # hidden state S
                    hidden_state = np.zeros(rnn_dim)
                    # store
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)
                    hidden.append(hidden_state)
            #print('Shape X1 {}, Shape X2 {}, Shape Hidden {}, Shape Y {}'.format(array(X1).shape, array(X2).shape, array(hidden).shape, array(y).shape))
            # yield the batch data
            if n==num_photos_per_batch:
                
                yield [[array(X1), array(X2), array(hidden)], array(y)]
               
                X1, X2, y, hidden = list(), list(), list(), list()
                n=0
                
    return self.X1, self.X2, self.y, self.hidden

In [0]:
epochs = 10
number_pics_per_batch = 3
train_steps = len(train_descriptions)//number_pics_per_batch
dev_steps = len(dev_descriptions)//number_pics_per_batch

In [0]:
for i in range(epochs):
    train_generator = data_generator(train_descriptions, train_features, wordtoix, max_length, number_pics_per_batch, rnn_dim)
    dev_generator = data_generator(dev_descriptions, dev_features, wordtoix, max_length, number_pics_per_batch, rnn_dim)
    model.fit_generator(train_generator, epochs=1, steps_per_epoch=train_steps, verbose=1, callbacks=None, validation_data=dev_generator, validation_steps=dev_steps)

Epoch 1/1


ValueError: ignored

With this approach we dont manage'to have the correct inputs to the model. 

Since we couldn't train the model, we couldn't continue with the rest of this question.

## Task 2.2: Decoder Model

Based on the completed encoder-decoder, build a decoder model for generating captions using two approaches:
- Greedy search
- Beam search

In [0]:
### YOUR CODE HERE

### Greedy search

In [0]:
### YOUR CODE HERE

### Beam search

In [0]:
### YOUR CODE HERE