###### Importing needed libraries from keras

In [1]:
import sys
!{sys.executable} -m pip install pydot 



In [2]:
# Multilayer Perceptron
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling2D
from keras.layers.merge import concatenate
from keras.layers.recurrent import LSTM
from keras.layers import Reshape
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import TimeDistributed
from keras.layers import Activation
import tensorflow as tf
import pydot

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Constants

In [3]:
Na = 205
Nq = 80
question_lstm_hidden_size = 3

# Inputs
### We need to get the following embeddings as input to the model - 
(1) Image Embedding 

(2) Answer Embedding

(3) Question Embedding

### (1) Image Embedding

###### Get the image embedding by using VGG16 Pre-Trained Model

In [4]:
import getEm
all_img_emb = getEm.get_train_img_embedding()

(14, 14, 512)


### (2) Answer Embedding

###### Imports for Getting Answers

In [5]:
import sys
!{sys.executable} -m pip install gensim
from vqaTools.vqa import VQA
import random
import skimage.io as io
import matplotlib.pyplot as plt
import os
import shutil
import math
import sys
import json
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import preprocess as pre
import json



###### Initializing and loading annotations

In [6]:
dataDir = '../../'
versionType = 'v2_'  # this should be '' when using VQA v2.0 dataset
taskType = 'OpenEnded'  # 'OpenEnded' only for v2.0. 'OpenEnded' or 'MultipleChoice' for v1.0
dataType = 'mscoco'  # 'mscoco' only for v1.0. 'mscoco' for real and 'abstract_v002' for abstract for v1.0.
dataSubType = 'train2014'	
annFile = '%s/%s%s_%s_annotations.json' % (dataDir, versionType, dataType, dataSubType)
quesFile = '%s/%s%s_%s_%s_questions.json' % (dataDir, versionType, taskType, dataType, dataSubType)
imgDir = '%s/Images/%s/%s/' % (dataDir, dataType, dataSubType)

# initialize VQA api for QA annotations
vqa = VQA(annFile, quesFile)

loading VQA annotations and questions into memory...
0:01:54.027877
creating index...
index created!


###### Get Reduced Annotation list

In [7]:
# Get all the annotations Ids
annIds = vqa.getImgIds()

# Select only first 1000 annotations
annotations = annIds[:1001]

# Get the list of all unique Image Ids from these 1000 annotations
imageIds = []
for ann in annotations:
    imageIds.append(ann['image_id'])
imageIds = set(imageIds)

# Select only 100 images from this unique list
imageDataSet = list(imageIds)[:101]

# Get all the questions related to these 100 images
questions = vqa.getQuesIds(imageDataSet)

# Making a map of the {'imageID':([List of questions], annotation)}
qs = []
for ann in annotations:
    if ann['question_id'] in questions:
        qs.append({'image_id': ann['image_id'], 'question': vqa.getQuestions(ann), 'answer': ann['multiple_choice_answer']})


###### Get Answer Embedding from reduced annotations

In [8]:
# Making a list which will contain all the answers with confidence of yes/maybe
answer_set = []
for data_entry in qs:
    answer_set.append(data_entry['answer'])

answer_set = set(answer_set)

ans_embedding = []
for ans in answer_set:
    token = ans.split(" ")
    ans_embedding.append(token)

print(len(ans_embedding))

embedding = Word2Vec(sentences = ans_embedding, size=512, window=1, min_count = 1, workers=4, sg=0)
words = list(embedding.wv.vocab)
final_ans_embedding=[]
for ans in answer_set:
    token = ans.split(" ")
    if len(token) > 1:
        index = math.ceil(len(token)/2)
        ans = token[index]
    final_ans_embedding.append({ans:embedding[ans]})
print("Final Emb:", len(final_ans_embedding))

205
Final Emb: 205




### (3) Question Embedding 

###### Get Question List from the Reduced Annotations

In [9]:
import vqaLib as lib
import nltk
# nltk.download('punkt')


question_set = []
for data_entry in qs:
    question_set.append(data_entry['question'])
# print(question_set)

dataset_train = lib.prepro_question(question_set)
#print(dataset_train)
dataset_train, vocab = lib.build_vocab_question(dataset_train)
itow = {i+1:w for i,w in enumerate(vocab)} # a 1-indexed vocab translation table
wtoi = {w:i+1 for i,w in enumerate(vocab)} # inverse table
ques_train, ques_length_train, question_id_train = lib.encode_question(dataset_train, wtoi)

question_list = []

for entry in dataset_train:
    question_list.append(entry['final_question'])

number of words in vocab would be 693


###### Making the Question Embedding Map

In [10]:
embed = Word2Vec(sentences = question_list, size=512, window=3, min_count = 1, workers=4, sg=0)
words = list(embedding.wv.vocab)

question_embedding_map = {}
question_embedding_list = []
#print(len(question_embedding_list[0]))
for entry in vocab:
    if entry!='?' and entry!='UNK':
        question_embedding_map[entry] = embed[entry]

print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print(len(question_embedding_map))

question_embedding_keys = list(question_embedding_map.keys())
# print(question_embedding_keys)
# print(question_embedding_keys[0])
# print(question_embedding_map[question_embedding_keys[0]])

question_embedding_list = question_embedding_map.values()
question_embedding_list = (list(question_embedding_list))
#print(len(question_embedding_list[0]))

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
692


  if __name__ == '__main__':


###### Getting question embeddings for each word in the question and storing it separately in list: [ [que_word1] [que_word2] ... [que_wordn], [ ][ ],  ]. Also image-id corresponding to each question is stored

In [11]:
list_ques_embedding_wordWise = []
list_imageId_mapped_to_ques = []
list_ans_mapped_to_ques = []
list_questions = []

for data_entry in qs:
    question = data_entry['question'][0]
    image_id = data_entry['image_id']
    
    list_questions.append(question)
    list_ans_mapped_to_ques.append(data_entry['answer'])
    list_imageId_mapped_to_ques.append(image_id)
    
    question_embedding_wordWise = []
    for word in question.split(' '):
        
        word = lib.tokenize(word)
        
        if(len(word) > 1 and len(word[-1]) <= 1):
            word = word[0]
           
        word = ''.join(word)
        
        word = word.lower()
        if(word[-1] in ('?', ',')):
            continue
           
        question_embedding_wordWise.append(question_embedding_map[word]) #Here
    list_ques_embedding_wordWise.append(question_embedding_wordWise)

print(len(list_questions))
print(len(list_ques_embedding_wordWise))
print(len(list_imageId_mapped_to_ques))
print(len(list_ans_mapped_to_ques))

548
548
548
548


In [12]:
import numpy as np
from keras.layers.merge import concatenate
from keras.layers import Merge

# create the model
embedding_vector_length = len(question_embedding_list)
question_tensor = tf.convert_to_tensor(np.array(question_embedding_list), dtype=tf.float32)
subLstmModel1 = Sequential()
subLstmModel1.add(LSTM(256, input_length=Nq, input_dim=len(question_embedding_list[0]), 
                      return_sequences=True))
subLstmModel1.summary()

subLstmModel2 = Sequential()
subLstmModel2.add(Conv1D(256, 1, activation='relu', input_shape=(Nq, 256)))
subLstmModel2.add(LSTM(256, input_length=Nq, input_dim=len(question_embedding_list[0]), 
                      return_sequences=True))                 
subLstmModel2.summary()

questionModel = Sequential()
questionModel.add(Merge([subLstmModel1, subLstmModel2], mode='concat'))
questionModel.summary()

  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 80, 256)           787456    
Total params: 787,456
Trainable params: 787,456
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


  app.launch_new_instance()
  app.launch_new_instance()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 80, 256)           65792     
_________________________________________________________________
lstm_2 (LSTM)                (None, 80, 256)           525312    
Total params: 591,104
Trainable params: 591,104
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_1 (Merge)              (None, 80, 512)           0         
Total params: 1,378,560
Trainable params: 1,378,560
Non-trainable params: 0
_________________________________________________________________




# Potentials

### We need to calculate  - 
(1) Unary Potentials

(2) Pairwise Potentials

### (1) Unary Potentials

###### Question Unary Potential

In [13]:
ques = Input(shape=(Nq,512))
conv1 = Conv1D(512, kernel_size=1, activation='tanh')(ques)
Qoutput = Conv1D(1, kernel_size=1)(conv1)
thetaQ = Model(inputs=ques, outputs=Qoutput)
# summarize layers
print(thetaQ.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 80, 512)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 80, 512)           262656    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 80, 1)             513       
Total params: 263,169
Trainable params: 263,169
Non-trainable params: 0
_________________________________________________________________
None


###### Answer Unary Potential

In [14]:
ans = Input(shape=(Na,512))
conv2 = Conv1D(512, kernel_size=1, activation='tanh')(ans)
Aoutput = Conv1D(1, kernel_size=1)(conv2)
thetaA = Model(inputs=ans, outputs=Aoutput)
# summarize layers
print(thetaA.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 205, 512)          0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 205, 512)          262656    
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 205, 1)            513       
Total params: 263,169
Trainable params: 263,169
Non-trainable params: 0
_________________________________________________________________
None


###### Image Unary Potential

In [15]:
img = Input(shape=(196,512))
conv3 = Conv1D(512, kernel_size=1, activation='tanh')(img)
Voutput = Conv1D(1, kernel_size=1)(conv3)
thetaV = Model(inputs=img, outputs=Voutput)
# summarize layers
print(thetaV.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 196, 512)          0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 196, 512)          262656    
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 196, 1)            513       
Total params: 263,169
Trainable params: 263,169
Non-trainable params: 0
_________________________________________________________________
None


### (2) Pairwise Potentials

In [16]:
import keras.backend as K
from keras.layers import Lambda
from keras.models import Model

###### Image - Question Potential

In [17]:
img = Input(shape=(196,512))
convImg = Conv1D(512, kernel_size=1, activation=None)(img)

ques = Input(shape=(15,512))
convQues = Conv1D(512, kernel_size=1, activation=None)(ques)
batch_size = 1

thetaVQ = K.dot(convImg[batch_size,:,:], K.transpose(convQues[batch_size,:,:]))
thetaVQ = K.reshape(thetaVQ, (batch_size, 196, 15, 1))

convQuesImage = Conv2D(1, kernel_size=(196, 1), activation='tanh')(thetaVQ)
print(convQuesImage.get_shape())

convImageQues = Conv2D(1, kernel_size=(1, 15), activation='tanh')(thetaVQ)
print(convImageQues.get_shape())

(1, 1, 15, 1)
(1, 196, 1, 1)


###### Question - Answer Potential

In [18]:
ans = Input(shape=(Na,512))
convAns = Conv1D(512, kernel_size=1, activation=None)(ans)

ques = Input(shape=(15,512))
convQues = Conv1D(512, kernel_size=1, activation=None)(ques)
batch_size = 1

thetaAQ = K.dot(convAns[batch_size,:,:], K.transpose(convQues[batch_size,:,:]))
thetaAQ = K.reshape(thetaAQ, (batch_size, Na, 15, 1))

convAnsQues = Conv2D(1, kernel_size=(Na, 1), activation='tanh')(thetaAQ)
print(convAnsQues.get_shape())

convQuesAns = Conv2D(1, kernel_size=(1, 15), activation='tanh')(thetaAQ)
print(convQuesAns.get_shape())

(1, 1, 15, 1)
(1, 205, 1, 1)


###### Answer - Image Potential

In [19]:
ans = Input(shape=(Na,512))
convAns = Conv1D(512, kernel_size=1, activation=None)(ans)

img = Input(shape=(196,512))
convImg = Conv1D(512, kernel_size=1, activation=None)(img)
batch_size = 1

thetaAV = K.dot(convAns[batch_size,:,:], K.transpose(convImg[batch_size,:,:]))
thetaAV = K.reshape(thetaAV, (batch_size, Na, 196, 1))

convAnsImg = Conv2D(1, kernel_size=(Na, 1), activation='tanh')(thetaAV)
print(convAnsImg.get_shape())

convImgAns = Conv2D(1, kernel_size=(1, 196), activation='tanh')(thetaAV)
print(convImgAns.get_shape())

(1, 1, 196, 1)
(1, 205, 1, 1)


In [20]:
convQuesImage = K.reshape(convQuesImage, (15,))
print(convQuesImage.shape)

convImageQues = K.reshape(convImageQues, (196,))
print(convImageQues.shape)

convAnsQues = K.reshape(convAnsQues, (15,))
print(convAnsQues.shape)

convQuesAns = K.reshape(convQuesAns, (Na,))
print(convQuesAns.shape)

convAnsImg = K.reshape(convAnsImg, (196,))
print(convAnsImg.shape)

convImgAns = K.reshape(convImgAns, (Na,))
print(convImgAns.shape)

########### Reshaping Unary Potentials #######################
Aoutput = K.reshape(Aoutput, (Na,))
print(Aoutput.shape)

Voutput = K.reshape(Voutput, (196,))
print(Voutput.shape)

Qoutput = K.reshape(Qoutput, (15,))
print(Qoutput.shape)

(15,)
(196,)
(15,)
(205,)
(196,)
(205,)
(205,)
(196,)
(15,)


In [21]:
from keras.layers.merge import Concatenate
from keras.layers import merge
from keras.backend import stack

conc_A = stack([convQuesAns, convImgAns, Aoutput], axis=0)
conc_Q = stack([convAnsQues, convQuesImage, Qoutput], axis=0)
conc_V = stack([convAnsImg, convImageQues, Voutput], axis=0)


# conc_A = tf.concat(0,[convQuesAns, convImgAns, Aoutput])

# conc_A = merge([convQuesAns, convImgAns, Aoutput], mode='concat', concat_axis=1)
print(conc_A.shape)
print(conc_Q.shape)
print(conc_V.shape)

(3, 205)
(3, 15)
(3, 196)


In [22]:
from keras.layers import Dense

conc_A = K.transpose(conc_A)
conc_V = K.transpose(conc_V)
conc_Q = K.transpose(conc_Q)

print(conc_A.shape)
print(conc_Q.shape)
print(conc_V.shape)

(205, 3)
(15, 3)
(196, 3)


##### Dense layers

In [23]:
# ANSWER 
print(conc_A.shape)
test = Input(shape=(Na,3))
denseA = Dense(Na, activation='relu')(test)
outputA = Dense(1, activation='softmax')(denseA)
dense_ans = Model(inputs=test, outputs=outputA)
print(dense_ans.summary())

(205, 3)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 205, 3)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 205, 205)          820       
_________________________________________________________________
dense_2 (Dense)              (None, 205, 1)            206       
Total params: 1,026
Trainable params: 1,026
Non-trainable params: 0
_________________________________________________________________
None


In [24]:
# QUESTION
print(conc_Q.shape)
test = Input(shape=(15,3))
denseA = Dense(Nq, activation='relu')(test)
outputQ = Dense(1, activation='softmax')(denseA)
dense_que = Model(inputs=test, outputs=outputQ)
print(dense_que.summary())

(15, 3)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, 15, 3)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 15, 80)            320       
_________________________________________________________________
dense_4 (Dense)              (None, 15, 1)             81        
Total params: 401
Trainable params: 401
Non-trainable params: 0
_________________________________________________________________
None


In [25]:
# Image
print(conc_V.shape)
test = Input(shape=(196,3))
denseA = Dense(Na, activation='relu')(test)   # Img_count = hardcoded here
outputV = Dense(1, activation='softmax')(denseA)
dense_vis = Model(inputs=test, outputs=outputV)
print(dense_vis.summary())

(196, 3)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        (None, 196, 3)            0         
_________________________________________________________________
dense_5 (Dense)              (None, 196, 205)          820       
_________________________________________________________________
dense_6 (Dense)              (None, 196, 1)            206       
Total params: 1,026
Trainable params: 1,026
Non-trainable params: 0
_________________________________________________________________
None


### CALCULATING ATTENTION

In [26]:
# WHAT TO DO : multiply (op_dense) * (embedding)

# FIRST STEP : GET EMBEDDING                                                        STATUS : 
#1 answe embedding => each of 205 is actually have d dimensions-- DICTIONARY!    == BUG! -> DONE!
#2 quest embedding => using same PLACEHOLDER that was used earlier               == DONE
#3 Image embedding => search for reshaped from getEm()                           == DONE


#===  ANSWER EMBEDDING MATRIX ====
ans_emb_mat = []
for d in final_ans_embedding:
    for k in d:
        npArr = d[k]
        ans_emb_mat.append (npArr)
        
ans_emb_mat = np.array(ans_emb_mat)
print ("ans emb mat shape : ", str (ans_emb_mat.shape))


#===  QUESTION EMBEDDING MATRIX ====
que_emb_mat = []
for word in question.split(' '):
    word = lib.tokenize(word)
    if(len(word) > 1 and len(word[-1]) <= 1):
        word = word[0]
    word = ''.join(word)
    word = word.lower()
    if(word[-1] in ('?', ',')):
        continue
    que_emb_mat.append(question_embedding_map[word]) #Here
    
for i in range(15 - len(que_emb_mat)):
    que_emb_mat.append(np.zeros(512))

que_emb_mat = np.array(que_emb_mat)
print ("que emb mat shape : ", str (que_emb_mat.shape))
    

#===  IMAGE EMBEDDING MATRIX ====
emb_name = 'COCO_train2014_000000000009.txt'
threeD = all_img_emb[emb_name]
img_emb_mat = []
for i in range (14):
    for j in range (14):
        img_emb_mat.append (threeD[i][j])
    
img_emb_mat = np.array(img_emb_mat)

print ("img emb mat shape : ", str (img_emb_mat.shape))


ans emb mat shape :  (205, 512)
que emb mat shape :  (15, 512)
img emb mat shape :  (196, 512)


In [27]:
# converting np_to_tensor
tf_VembMat = tf.convert_to_tensor(img_emb_mat, np.float32)
tf_QembMat = tf.convert_to_tensor(que_emb_mat, np.float32)
tf_AembMat = tf.convert_to_tensor(ans_emb_mat, np.float32)

tf_outputV = tf.convert_to_tensor(outputV, np.float32)
tf_outputQ = tf.convert_to_tensor(outputQ, np.float32)
tf_outputA = tf.convert_to_tensor(outputA, np.float32)

# FINDING ATTENTION : MAT_MUL (Pv * V), (Pa * A), (Pq * Q)
attentionV = K.dot( K.transpose(tf_outputV[batch_size, :,:]) , tf_VembMat[:,:])
attentionQ = K.dot( K.transpose(tf_outputQ[batch_size, :,:]) , tf_QembMat[:,:])
attentionA = K.dot( K.transpose(tf_outputA[batch_size, :,:]) , tf_AembMat[:,:])

print (type(attentionA))
print (attentionA[0].shape)

<class 'tensorflow.python.framework.ops.Tensor'>
(512,)


### Decision Making

In [28]:
# using HADAMARD on all three.. element wise multiplication to get single [1xD]
attQA = tf.multiply (attentionA, attentionQ)
attQV = tf.multiply (attentionV, attentionQ)
predictedTensor = tf.multiply (attQV, attQA)
print (predictedTensor)

Tensor("Mul_2:0", shape=(1, 512), dtype=float32)


#### cosine similarity of output of MCB 

In [29]:
predictedArray = np.random.uniform(-1, 1, size=512)
from sklearn.metrics.pairwise import cosine_similarity

cos_simi = cosine_similarity(predictedArray.reshape(1, -1), ans_emb_mat )
ind = np.unravel_index(cos_simi.argmax(), cos_simi.shape)
#print ("max prob : "+ str(cos_simi[ind]))
print ("best matched ans # : " + str(ind[1]) )
#print (ans_emb_mat[ind[1]])

best matched ans # : 103


In [34]:
print("img id    : " + str (list_imageId_mapped_to_ques[ind[1]]))
print ("Question  : " + str (list_questions[ind[1]]))
print("Answer    : " + str (list_ans_mapped_to_ques[ind[1]]))


img id    : 262172
Question  : Where are the two blue coolers?
Answer    : left
