In [1]:
import json
import numpy as np
import scipy.io as sio
import argparse

import lessdummy1 as utilities
import cocoIDToFeatures as cocoImageUtils

tfile = '../features/coco_vgg_IDMap.txt'

args = {}
args['answer_vector_file']='answer_feature_list.json'
args['glove_file']='../glove/glove.6B.300d.txt'

In [2]:
print "Reading GloVE and VGG raw files"

glove_word_vec_file = args['glove_file']
word_vec_dict = utilities.readGloveData(glove_word_vec_file)

imageDict = cocoImageUtils.generateDictionary(tfile)
feats = sio.loadmat('./../features/coco/vgg_feats.mat')['feats']

print "Reading the data and creating features"

answer_vector_file = open(args['answer_vector_file'], 'r')
answerFeatureVector = json.loads(answer_vector_file.read())

answer_vector_file.close()

Reading GloVE and VGG raw files
Reading the data and creating features


In [3]:
import sys
sys.path.insert(0, './../VQA/PythonHelperTools')
from vqaTools.vqa import VQA

dataDir = './../VQA'
taskType = 'MultipleChoice'
dataType = 'mscoco' # 'mscoco' for real and 'abstract_v002' for abstract
dataSubType = 'train2014'
annFile = '%s/Annotations/%s_%s_annotations.json' % (dataDir, dataType, dataSubType)
quesFile = '%s/Questions/%s_%s_%s_questions.json' % (dataDir, taskType, dataType, dataSubType)
imgDir = '%s/Images/%s/%s/' % (dataDir, dataType, dataSubType)
vqaTrain = VQA(annFile, quesFile)
dummyano = vqaTrain.dataset['annotations']
answerFeatures = utilities.createAnswerFeatures(dummyano)

vqaVal = VQA(annFile, quesFile)

loading VQA annotations and questions into memory...
0:00:18.065623
creating index...
index created!
loading VQA annotations and questions into memory...
0:00:23.249883
creating index...
index created!


In [49]:
dataset = []

numQuestions = 1000
q = 0
for quesID, annotation in vqaVal.qa.iteritems():
    q += 1
    if q == numQuestions:
        break
        
    question = vqaVal.qqa[quesID]
    question_text = question['question'].strip().replace('?', ' ?').split()
    imgID = annotation['image_id']
    ansString = annotation['multiple_choice_answer']
    
    dataset.append({'question': question_text, 'answer': ansString, 'image': imgID})

In [50]:
from collections import Counter

c = Counter([len(x['question']) for x in dataset])
maxlen = max(c.keys())
print c
print "Max Question Length = ", maxlen

Counter({6: 237, 7: 206, 8: 177, 5: 133, 9: 98, 10: 54, 11: 30, 4: 27, 12: 17, 13: 9, 15: 4, 14: 3, 16: 2, 17: 2})
Max Question Length =  17


In [51]:
maxlen = 23
nb_train = len(dataset)
nb_timestep = maxlen + 1 # For Image Vector
word_vec_dim = len(word_vec_dict['hi'])
image_dim = 4096

### Building the LSTM Model###

**Create the LSTM model**

In [66]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Reshape, Merge, RepeatVector
from keras.layers.recurrent import LSTM, GRU

inner_layer_size = 512
output_size = 1000
input_size = word_vec_dim

model = Sequential()

imageModel = Sequential()
imageModel.add(Dense(input_size, input_shape=(4096,)))
imageModel.add(Dropout(0.2))
imageModel.add(RepeatVector(maxlen))

questionModel = Sequential()
questionModel.add(Reshape(input_shape=(maxlen, input_size,), dims=(maxlen, input_size,)))

model.add(Merge([imageModel, questionModel], mode='concat', concat_axis=2))

model.add(GRU(inner_layer_size, return_sequences=False))
model.add(Dropout(0.2))

model.add(Dense(output_size, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
                    
model.add(Dense(output_size, init='uniform', activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [57]:
# from IPython.display import SVG
# from keras.utils.visualize_util import to_graph

# SVG(to_graph(model).create(prog='dot', format='svg'))

**Generating X_train and Y_train**

In [62]:
X_train = np.zeros(shape=(nb_train, maxlen, input_size))
Image_train = np.zeros(shape=(nb_train, 4096))
Y_train = np.zeros(shape=(nb_train, len(answerFeatureVector)) , dtype='bool')

idx = 0
for item in dataset:
    q = item['question']
    padding = maxlen - len(q)
    for i in xrange(padding):
        X_train[idx, i, :] = np.zeros(input_size)
        
    for word in q:
        X_train[idx, padding, :] = utilities.getWordVector(word, word_vec_dict)
    Y_train[idx, :] = utilities.getAnswerVector(item['answer'], answerFeatureVector)
    
    Image_train[idx, :] = np.asarray(feats[:, imageDict[item['image']]])
    
    idx += 1

In [67]:
model.fit([Image_train, X_train], Y_train, nb_epoch=5, validation_split=0.1, show_accuracy=True, verbose=1)

Train on 899 samples, validate on 100 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2e1627810>

In [64]:
print model.to_json()

{"layers": [{"layers": [{"layers": [{"b_constraint": null, "name": "Dense", "activity_regularizer": null, "W_constraint": null, "input_shape": [4096], "init": "glorot_uniform", "activation": "linear", "input_dim": null, "b_regularizer": null, "W_regularizer": null, "output_dim": 300}, {"p": 0.2, "name": "Dropout"}, {"name": "RepeatVector", "n": 23}], "name": "Sequential"}, {"layers": [{"dims": [23, 300], "name": "Reshape", "input_shape": [23, 300]}], "name": "Sequential"}], "mode": "concat", "dot_axes": -1, "name": "Merge", "concat_axis": 2}, {"name": "LSTM", "inner_activation": "hard_sigmoid", "go_backwards": false, "output_dim": 512, "stateful": false, "init": "glorot_uniform", "inner_init": "orthogonal", "input_dim": 600, "return_sequences": false, "activation": "tanh", "forget_bias_init": "one", "input_length": null}, {"p": 0.2, "name": "Dropout"}, {"b_constraint": null, "name": "Dense", "activity_regularizer": null, "W_constraint": null, "init": "uniform", "activation": "tanh", "i