In [1]:
import numpy as np
import pandas as pd
import math
import json
import re
from keras.models import load_model
from keras.layers import Dense, Activation, Dropout, UpSampling2D, RepeatVector,TimeDistributed
from keras.layers import Conv2D, MaxPooling2D, Flatten,BatchNormalization,concatenate,LSTM
from keras.layers import Input, Lambda,Reshape,Multiply, add, Add
from keras.models import Model
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
from keras.models import model_from_json
from gensim.models import KeyedVectors
import keras.backend as K
from numpy import prod
from nltk.corpus import wordnet as wn
from keras import regularizers

Using TensorFlow backend.


In [2]:
def load_imageData():
    """Read the pre-computed image features from ``./data/img_features.json``.

    Returns:
        The object decoded from the JSON file. Other cells index it by image
        id (e.g. ``feat['image1']``), so it is presumably a dict mapping image
        ids to nested feature lists -- confirm against the data file.
    """
    with open('./data/img_features.json', 'r') as features_file:
        return json.load(features_file)
# Assumption: the image id is the last word of each question (the token just before "?").
# TODO (later, in the encoder-decoder): remove all tags, 'the' and ',' (in answers) from the start; keep '_' words.
# Image ids that do not start with 'i' are corrected.
def load_QA(filename):
    """Parse a question/answer text file into token lists and image ids.

    Every line ending in "?" is treated as a question, every other line as an
    answer. The image id is taken from the second-to-last space-separated
    token of a question line.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(questList, ansList, imageIds)``:
        * questList -- one token list per question, with the trailing
          "... image<N> ?" part dropped and a single "?" appended;
        * ansList   -- one token list per answer, starting with "<start>";
          answers are split on "," and "_" is treated like ",";
        * imageIds  -- one image id per question, in file order.

    Raises:
        ValueError: if a malformed image id contains no digit to recover from.
    """
    questList,ansList,imageIds = [],[],[]
    # Context manager so the file handle is released even if parsing raises.
    with open(filename) as data:
        for line in data:
            line = line.replace("?\n","?")
            line = line.replace("."," . ")
            line = line.replace(", ",",")
            line = line.replace("_",",")
            #line = line.replace("the","")
            if not line.endswith("?"):
                # Answer line.
                line = line.replace("\n","")
                line = "<start>"+","+line
                ansList.append(line.split(","))
                continue

            # Question line: the image id sits just before the final "?".
            imgId = line.split(" ")[-2]
            if not imgId.startswith("i"):
                print("incorrect image ID" + line)
                # Rebuild the id from its first digit onwards.
                firstDigit = re.search(r"\d", imgId)
                if firstDigit is None:
                    raise ValueError("no image number found in question: " + line)
                imgId = "image"+imgId[firstDigit.start():]
            imageIds.append(imgId)

            # NOTE(review): only well-formed "image<N>" tokens are stripped
            # here; a malformed id stays in the line, so the [0:-3] slice
            # below then drops different words than for well-formed
            # questions. Confirm whether that is intended.
            line = re.sub(r"image\d+", "", line)
            line = line.replace("  "," ")
            temp = line.split(" ")[0:-3]
            temp.append("?")
            #questList.append(line.split(" ")[0:-3])
            questList.append(temp)
    return (questList,ansList,imageIds)

def create_vocab_df(vocab):
    """Build a lookup table for a vocabulary.

    Reads the module-level word-vector object ``wv``, which must already be
    loaded and must contain every word of ``vocab``.

    Args:
        vocab: iterable of words.

    Returns:
        A DataFrame with three columns: "vocab" (the word), "index" (its row
        position) and "wordVec" (``wv[word]``).
    """
    vocab_df = pd.DataFrame(list(vocab))
    vocab_df.columns = ["vocab"]
    vocab_df["index"] = vocab_df.index
    vocab_df["wordVec"] = [wv[word] for word in vocab_df["vocab"]]
    return vocab_df

def max_len(data):
    """Return the length of the longest sequence in ``data`` (0 when empty).

    Each element is measured along its first axis after conversion with
    ``np.asarray``.
    """
    lengths = [np.asarray(seq).shape[0] for seq in data]
    return max([0] + lengths)

def save_model(model,model_name):
    """Persist a model as ``<model_name>.json`` plus ``<model_name>.h5``.

    Args:
        model: object exposing ``to_json()`` and ``save_weights(path)``.
        model_name: path prefix (without extension) for both output files.
    """
    architecture = model.to_json()
    with open(model_name+".json", "w") as architecture_file:
        architecture_file.write(architecture)
    weights_path = model_name+".h5"
    model.save_weights(weights_path)
    
def load_model(savedModel_filename):
    """Rebuild a model saved as ``<prefix>.json`` plus ``<prefix>.h5``.

    Note: this definition shadows the ``load_model`` imported from
    ``keras.models`` at the top of the notebook; after this cell runs, the
    name refers to this helper.

    Args:
        savedModel_filename: path prefix (without extension) shared by the
            architecture JSON file and the weights file.

    Returns:
        The model created by ``model_from_json`` with its weights loaded.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(savedModel_filename+".json", 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights(savedModel_filename+".h5")
    return loaded_model

def load_wordEmbeddings(wv_filename):
    """Load saved gensim ``KeyedVectors`` and normalise them.

    Args:
        wv_filename: path of the saved KeyedVectors file.

    Returns:
        The loaded KeyedVectors object after ``init_sims(replace=True)``.
    """
    vectors = KeyedVectors.load(wv_filename,mmap='r')
    # NOTE(review): replace=True overwrites the stored vectors; confirm this
    # is compatible with mmap='r' if the vectors really are memory-mapped.
    vectors.init_sims(replace=True)
    return vectors

def get_vocab(dataList):
    """Collect the set of distinct tokens in ``dataList`` plus "<Pad>".

    Args:
        dataList: iterable of token sequences.

    Returns:
        A set with every token that occurs, always including "<Pad>".
    """
    vocab = {token for tokens in dataList for token in tokens}
    vocab.add("<Pad>")
    return vocab

In [98]:
# Load the pre-trained encoder-decoder model (architecture JSON + weights file).
loaded_model = load_model('./saved_models/modelTFiDFWeights-2')
# Load tokenised questions, answers and image ids for the train and test files.
questData, ansData, imageIds = load_QA("./data/Text-Q&A-train.txt")
testValQuestData, testValAnsData, valImageIds = load_QA("./data/Text-Q&A-test.txt")
# Create a dataframe storing the embedding vocabulary, its reference index and
# its word vectors. `wv` must be assigned first: create_vocab_df reads it as a global.
wv = load_wordEmbeddings("./saved_models/word2vec-train1.kv")
vocab_df = create_vocab_df(wv.vocab.keys())
# Max question and answer length, used for padding. Computed from the training
# data only, so test sequences may be longer than these values.
maxQuestLen, maxAnsLen = max_len(questData), max_len(ansData)
# Initialize variables.
batchSize = 50
epoch = 50
vecSize = wv.vector_size
vocabSize = vocab_df.shape[0]
# Import the image features.
feat = load_imageData()
# Author's note: the loaded model uses the old wv => the start tag is <start>, vocab has 1002 words.

In [7]:
# Answer vocabulary as a sorted list (presumably so that the word -> index
# mapping is the same on every run), plus its lookup dataframe.
ansVocab = sorted(get_vocab(ansData))
ansVocabdf = create_vocab_df(ansVocab)

In [99]:
# Visual QA model architecture
# Image input: its shape is taken from one sample of the pre-computed features
# (all images are presumed to share the shape of feat['image1'] -- TODO confirm).
imgInput = Input(shape=(np.array(feat['image1']).shape), dtype='float32', name='images')
# Question input: reuse the input tensor of the pre-trained encoder-decoder.
input_quest = loaded_model.get_input_at(0)
#img = Conv2D(512,(2,2),padding = 'valid',activation = 'relu')(imgInput)
# Image branch: pooling + batch norm, then four conv layers ending in 128 channels.
img = MaxPooling2D(pool_size=(2,2),padding='same')(imgInput)
img = BatchNormalization()(img)
img = Conv2D(256,(1,1),padding = 'same',activation = 'relu')(img)
img = Conv2D(256,(2,2),padding = 'valid',activation = 'relu')(img)
img = Conv2D(256,(2,2),padding ='same',activation = 'relu')(img)
img = Conv2D(128,(1,1),padding ='same',activation = 'relu')(img)
#img = Conv2D(256,(2,2),activation = 'relu',padding ='valid')(img)
#img = Conv2D(128,(1,1),activation = 'relu',padding ='valid')(img) # batchsize,6,6,128
#img = Conv2D(64,(1,1),activation = 'relu',padding ='same')(img)
#img = Conv2D(1,(1,1),padding ='same')(img)
# Question summary taken from the pre-trained model's "bidirectional_1" layer.
summaryQuest = loaded_model.get_layer("bidirectional_1").output #batchSize,128 (per the Reshape to 1280 below)
# Split the feature map into four 3x3 spatial quadrants
# (assumes img is at least 6x6 spatially -- TODO confirm).
group0_a = Lambda(lambda x: x[:,0:3,0:3,:], output_shape=(3,3,128))(img) # batchSize,3,3,128
group1_a = Lambda(lambda x: x[:,0:3,3:6,:], output_shape=(3,3,128))(img)
group2_a = Lambda(lambda x: x[:,3:6,0:3,:], output_shape=(3,3,128))(img)
group3_a = Lambda(lambda x: x[:,3:6,3:6,:], output_shape=(3,3,128))(img)

# For every quadrant: flatten (3*3*128 = 1152), append the question summary,
# and reshape to a single time step of width 1280.
group0_a = Flatten()(group0_a) # batchSize,1152
concat0 = concatenate([group0_a,summaryQuest], axis = 1) #batchSize,(1152+128)
concat01 = Reshape(target_shape=(1,1280))(concat0)  #batchSize,1,(1152+128)
group1_a = Flatten()(group1_a)
concat1 = concatenate([group1_a,summaryQuest], axis = 1)
concat11 = Reshape(target_shape=(1,1280))(concat1)
group2_a = Flatten()(group2_a)
concat2 = concatenate([group2_a,summaryQuest], axis = 1)
concat21 = Reshape(target_shape=(1,1280))(concat2)
group3_a = Flatten()(group3_a)
concat3 = concatenate([group3_a,summaryQuest], axis = 1)
concat31 = Reshape(target_shape=(1,1280))(concat3)

#answers = Input(shape=(maxAnsLen,vecSize), dtype='float32', name='answers') #batchsize,maxAnsLen,100

# Stack the four quadrant vectors along the time axis.
inputConcat = concatenate([concat01,concat11,concat21,concat31], axis = 1) # batchSize,4,1280
#inputConcat = concatenate([inputConcat,concat21], axis = 1) # batchsize,3,2380
#inputConcat = concatenate([inputConcat,concat31], axis = 1) # batchsize,4,2380
inputConcat = BatchNormalization()(inputConcat)
# Only hidden_state is used below; lstm_out and cell_state are not referenced again in this cell.
lstm_out, hidden_state, cell_state = LSTM(512,return_sequences = True,return_state=True)(inputConcat)  # hidden_state: batchSize,512
 
# Repeat the final hidden state once per answer position to drive the decoder.
inputFinal = BatchNormalization()(hidden_state)
inputFinal = RepeatVector(maxAnsLen)(inputFinal)                 #output_shape: (batchSize,maxAnsLen,512)
#inputFinal = concatenate([inputFinal,answers], axis = 2)  #input length 128+100, shape = batchSize,16,228

# Decoder LSTM; only the per-step outputs (lstm_out2) are used below.
lstm_out2, hidden_state2, cell_state2 = LSTM(128,return_sequences = True,return_state=True)(inputFinal)
# kernel_regularizer=regularizers.l2(0.001)
# Softmax over the answer vocabulary at every answer position.
output3 = (TimeDistributed(Dense(len(ansVocab), activation="softmax")))(lstm_out2)

In [100]:
# Assemble the end-to-end VQA model: image features and question tokens in,
# a softmax over the answer vocabulary per answer position out.
# `inputs=` is the Keras 2 keyword; the Keras 1 spelling `input=` is only
# accepted through a legacy shim that emits a warning.
modelQA = Model(inputs=[imgInput,input_quest], outputs=output3)
modelQA.compile(loss="categorical_crossentropy", optimizer='adam',metrics=['categorical_accuracy'])

  """Entry point for launching an IPython kernel.


In [None]:
# Print the layer-by-layer summary of the assembled VQA model.
modelQA.summary()

In [11]:
def get_encInp(batchData,maxLen):
    """Turn tokenised questions into a padded array of word vectors.

    Reads the module-level ``vocab_df`` (word -> vector table), ``wv`` (used
    for the in-vocabulary test) and ``vecSize``.

    Args:
        batchData: list of token lists.
        maxLen: number of time steps in the output. Longer token lists are
            truncated to ``maxLen`` instead of raising IndexError (test
            questions can be longer than the training maximum).

    Returns:
        Float array of shape (len(batchData), maxLen, vecSize). Unused
        positions hold the "<Pad>" vector; words missing from ``wv.vocab``
        are encoded with the "<Unk>" vector.
    """
    # Build the word -> vector lookup once instead of scanning the dataframe
    # for every token. setdefault keeps the first row for a repeated word,
    # matching the previous ".loc[...][0]" behaviour.
    wordToVec = {}
    for vocabWord, vec in zip(vocab_df["vocab"], vocab_df["wordVec"]):
        wordToVec.setdefault(vocabWord, vec)

    encInp = np.ones((len(batchData),maxLen,vecSize))*wordToVec["<Pad>"]
    for i,wordList in enumerate(batchData):
        for w,word in enumerate(wordList[:maxLen]):
            if word in wv.vocab:
                encInp[i][w] = wordToVec[word]
            else:
                encInp[i][w] = wordToVec["<Unk>"]
    return encInp
            
def get_imgInp(batchData,batchImageIds):
    """Assemble and scale the image-feature batch.

    Reads the module-level ``feat`` mapping (image id -> features).

    Args:
        batchData: the batch's samples; only its length is used.
        batchImageIds: image id of every sample, in batch order.

    Returns:
        float32 array of shape (len(batchData),) + feature shape, divided by
        the largest value in the batch. An empty or all-zero batch is
        returned unscaled instead of dividing by zero.
    """
    # Convert the reference sample once (the original converted it four times).
    featShape = tuple(np.array(feat['image1']).shape)
    imgInp = np.zeros((len(batchData),) + featShape)
    for i,imgId in enumerate(batchImageIds):
        imgInp[i] = feat[imgId]
    # Scale by the batch maximum; note this makes the scaling batch-dependent.
    maxV = float(np.max(imgInp)) if imgInp.size else 0.0
    imgInp = imgInp.astype("float32")
    if maxV != 0.0:
        imgInp = imgInp/maxV
    return imgInp

def get_oneHotTarget(batchData,vocabSize,vocab_df):
    """One-hot encode answers as training targets.

    Reads the module-level ``maxAnsLen`` for the number of time steps.

    Args:
        batchData: list of answer token lists; the first token of each
            (the start tag) is skipped.
        vocabSize: size of the one-hot dimension.
        vocab_df: dataframe with "vocab" and "index" columns; it must contain
            "<Pad>" and is assumed to list each word once.

    Returns:
        Array of shape (len(batchData), maxAnsLen, vocabSize). Every position
        is "<Pad>" unless it holds a known answer word. Tokens beyond
        ``maxAnsLen`` positions are dropped instead of raising IndexError.

    Raises:
        KeyError: if ``vocab_df`` has no "<Pad>" entry.
    """
    # One dictionary instead of rebuilding list(vocab_df["vocab"]) per token.
    wordToIx = dict(zip(vocab_df["vocab"], vocab_df["index"]))
    padIx = wordToIx["<Pad>"]

    target = np.zeros((len(batchData),maxAnsLen,vocabSize))
    target[:,:,padIx] = 1
    for i,questList in enumerate(batchData):
        for w,quest in enumerate(questList[1:maxAnsLen+1]):
            ix = wordToIx.get(quest)
            if ix is not None:
                target[i,w,padIx] = 0
                target[i,w,ix] = 1
    return target

def calculate_class_weights(trainTarget):
    """Compute 'balanced' presence/absence weights for every vocabulary word.

    For each word, a sample is labelled 1 if the word is set at any position
    of that sample's target and 0 otherwise; scikit-learn's 'balanced'
    heuristic is then applied to those labels.

    Args:
        trainTarget: array of shape (samples, positions, vocab) of one-hot
            targets.

    Returns:
        Array of shape (vocab, 2). When a word is absent from (or present in)
        every sample, only one class exists and its single weight fills both
        columns.
    """
    from sklearn.utils.class_weight import compute_class_weight
    number_dim = np.shape(trainTarget)[2]
    weights = np.empty([number_dim, 2])
    # (samples, vocab): does the word occur anywhere in the sample?
    present = (trainTarget == 1).any(axis=1)
    for nd in range(number_dim):
        y_true = present[:, nd].astype(int)
        # Keyword arguments: recent scikit-learn releases reject the
        # positional form, while the names are the same in older releases.
        weights[nd] = compute_class_weight('balanced', classes=np.unique(y_true), y=y_true)
    return weights

In [None]:
#training
batchSize = 100
# Validation slice; currently unused because validation_data is commented out below.
valBatchData = testValQuestData[0:50]  # 50 samples for validation 
valBatchDataOp = testValAnsData[0:50]
valBatchImgId = valImageIds[0:50]
valEncInp = get_encInp(valBatchData,maxQuestLen)
valTarget = get_oneHotTarget(valBatchDataOp,len(ansVocab),ansVocabdf)
valImg = get_imgInp(valBatchData,valBatchImgId)
# len() instead of np.asarray(questData).shape[0]: the questions are ragged
# lists, which recent NumPy refuses to convert, and the conversion was
# repeated on every iteration.
numTrain = len(questData)
for e in range(100):
    print("training epoch "+str(e)) 
    # Only full batches are used; a trailing partial batch is skipped.
    for startIx in range(0, numTrain - batchSize + 1, batchSize):
        # prepare batch data
        batchData = questData[startIx:startIx+batchSize] 
        batchDataOp = ansData[startIx:startIx+batchSize]
        batchImgId = imageIds[startIx:startIx+batchSize]
        #encoder Inputs
        trainEncInp = get_encInp(batchData,maxQuestLen)
        #image input
        trainImg = get_imgInp(batchData,batchImgId)
        #answer targets
        trainTarget = get_oneHotTarget(batchDataOp,len(ansVocab),ansVocabdf)
        #class weights
        class_weights = calculate_class_weights(trainTarget)
        # training the model.
        # NOTE(review): Keras documents class_weight as a dict mapping class
        # index to weight; confirm that this (vocab, 2) array is honoured.
        modelQA.fit([trainImg,trainEncInp], 
                    trainTarget,epochs=1,shuffle=True,batch_size=batchSize,class_weight=class_weights)
#validation_data=([valImg,valEncInp],valTarget)

In [None]:
# Results on test data: predict the first 100 test questions, then report
# precision/recall, the WUPS score and the decoded answers.
# NOTE: multiClass_target, multiClass_pred, metric_check, get_ActandPred_list,
# calculate_WUP and print_predictions are defined in cells further down the
# notebook; those cells must be executed before this one.
batchData = testValQuestData[0:100]
batchDataOp = testValAnsData[0:100]
batchImgId = valImageIds[0:100]
EncInp = get_encInp(batchData,maxQuestLen)
# Despite its name, trainImg holds the test images here.
trainImg = get_imgInp(batchData,batchImgId)
modelPred = modelQA.predict([trainImg,EncInp])
# Bag-of-words indicator matrices for the actual and the predicted answers.
target = multiClass_target(ansVocab,ansVocabdf,batchDataOp)
PredTarget = multiClass_pred(ansVocab,ansVocabdf,modelPred)
precision,recall = metric_check(PredTarget,target)
actualAnsList,PredAnsList = get_ActandPred_list(modelPred,ansVocabdf,batchDataOp)
calculate_WUP(actualAnsList,PredAnsList)
print("Precision:"+str(precision))
print("Recall"+str(recall))
print_predictions(modelPred,batchData,batchDataOp)

In [103]:
# Results on train data: same evaluation as the test cell, on training
# samples 30..49.
# NOTE: the metric helpers used here are defined in cells further down the
# notebook; those cells must be executed before this one.
batchData = questData[30:50]
batchDataOp = ansData[30:50]
batchImgId = imageIds[30:50]
EncInp = get_encInp(batchData,maxQuestLen)
trainImg = get_imgInp(batchData,batchImgId)
modelPred = modelQA.predict([trainImg,EncInp])
# Bag-of-words indicator matrices for the actual and the predicted answers.
target = multiClass_target(ansVocab,ansVocabdf,batchDataOp)
PredTarget = multiClass_pred(ansVocab,ansVocabdf,modelPred)
precision,recall = metric_check(PredTarget,target)
actualAnsList,PredAnsList = get_ActandPred_list(modelPred,ansVocabdf,batchDataOp)
calculate_WUP(actualAnsList,PredAnsList)
print("Precision:"+str(precision))
print("Recall"+str(recall))
print_predictions(modelPred,batchData,batchDataOp)

KeyboardInterrupt: 

In [None]:
# Train to overfit: fit repeatedly on the first 100 training samples.
batchData = questData[0:100] 
batchDataOp = ansData[0:100]
batchImgId = imageIds[0:100]
# question (encoder) input
trainEncInp = get_encInp(batchData,maxQuestLen)
#answers inputs
#trainAnswerIp= get_encInp(batchDataOp,maxAnsLen)
#image input
trainImg = get_imgInp(batchData,batchImgId)
#answer targets
trainTarget = get_oneHotTarget(batchDataOp,len(ansVocab),ansVocabdf)
class_weights = calculate_class_weights(trainTarget)
# Training the model: 80 passes over the same 100 samples, one fit() call per
# pass so the pass counter is printed.
for e in range(80):
    print(str(e))
    modelQA.fit([trainImg,trainEncInp],trainTarget,epochs=1,
                shuffle=True,batch_size=50,class_weight=class_weights)

In [None]:
# # print predictions
# batchData = questData[0:10]
# batchDataOp = ansData[0:10]
# batchImgId = imageIds[0:10]
# EncInp = get_encInp(batchData,maxQuestLen)
# trainImg = get_imgInp(batchData,batchImgId)
# #testAns = np.ones((np.asarray(batchDataOp).shape[0],maxAnsLen,vecSize))*wv["<Pad>"]
# #testAns[:,0,:] = wv["<start>"]
# modelPred = modelQA.predict([trainImg,EncInp])
# for i in range(len(batchData)):
#     prediction = []
#     print("Original Question:"+str(batchData[i]))
#     print("Original Answer:"+str(batchDataOp[i]))  
#     for p in np.argmax(modelPred[i],1):
#         prediction.append(np.asarray(ansVocabdf["vocab"].loc[ansVocabdf["index"]==p])[0])
#     print("Decoded Answer:"+str(prediction)) 

In [82]:
# testing model with actual result analysis 
#testing on train 
    
def get_ActandPred_list(modelPred,vocab_df,batchDataOp):
    """Decode predictions into word lists and pair them with the true answers.

    Args:
        modelPred: array indexed as [sample][position][vocab] holding the
            model's scores.
        vocab_df: dataframe with "vocab" and "index" columns used for
            decoding. The function previously ignored this argument and read
            the global ``ansVocabdf``.
        batchDataOp: list of true answer token lists, each starting with the
            start tag.

    Returns:
        ``(actualAnsList, PredAnsList)``: the true answers without their
        first token, and the arg-max decoded predictions with "<Pad>"
        positions removed.
    """
    # Build the lookups once instead of filtering the dataframe per position.
    ixToWord = {}
    for ix, word in zip(vocab_df["index"], vocab_df["vocab"]):
        ixToWord.setdefault(ix, word)
    padIxs = set(vocab_df["index"].loc[vocab_df["vocab"]=="<Pad>"])

    actualAnsList = [y[1:] for y in batchDataOp]
    PredAnsList = []
    for i in range(len(batchDataOp)):
        bestIxs = np.argmax(modelPred[i],1)
        PredAnsList.append([ixToWord[p] for p in bestIxs if p not in padIxs])
    return actualAnsList,PredAnsList
        
    
def print_predictions(modelPred,batchData,batchDataOp):
    """Print every question, its true answer and the decoded prediction.

    Decoding uses the module-level ``ansVocabdf``; "<Pad>" positions are kept
    in the printed prediction.

    Args:
        modelPred: array indexed as [sample][position][vocab].
        batchData: list of question token lists.
        batchDataOp: list of true answer token lists.
    """
    for sampleIx, question in enumerate(batchData):
        print("Original Question:"+str(question))
        print("Original Answer:"+str(batchDataOp[sampleIx]))
        decoded = [
            np.asarray(ansVocabdf["vocab"].loc[ansVocabdf["index"]==tokenIx])[0]
            for tokenIx in np.argmax(modelPred[sampleIx],1)
        ]
        print("Decoded Answer:"+str(decoded))
        
def metric_check(prediction_target,actual):
    """Compute precision and recall over 0/1 indicator arrays.

    Args:
        prediction_target: array of 0/1 predicted indicators.
        actual: array of 0/1 true indicators, same shape.

    Returns:
        ``(precision, recall)`` as floats. A ratio whose denominator is zero
        (nothing predicted, or nothing to find) is reported as 0.0 instead of
        NaN.
    """
    # A cell counts as a true positive when both arrays hold 1 there.
    truePos = float(np.sum((actual + prediction_target)==2))
    numActual = float(np.sum(actual==1))
    numPredicted = float(np.sum(prediction_target==1))
    recall = truePos/numActual if numActual else 0.0
    precision = truePos/numPredicted if numPredicted else 0.0
    return precision, recall 

In [66]:
def multiClass_target(ansVocab,vocab_df,batchDataOp):
    """Build a bag-of-words indicator matrix for the true answers.

    Args:
        ansVocab: answer vocabulary; only its length is used.
        vocab_df: dataframe with "vocab" and "index" columns.
        batchDataOp: list of answer token lists.

    Returns:
        Array of shape (len(batchDataOp), len(ansVocab)) with 1 for every
        vocabulary word occurring in the answer; the "<start>" column is
        always 0. Tokens missing from ``vocab_df`` are ignored.
    """
    words = vocab_df["vocab"]
    indices = vocab_df["index"]
    answerTarget = np.zeros((len(batchDataOp),len(ansVocab)))
    for row, answer in enumerate(batchDataOp):
        hitIxs = indices[words.isin(list(answer))].values
        answerTarget[row, hitIxs] = 1
    startIxs = indices[words == "<start>"].values
    answerTarget[:, startIxs] = 0
    return answerTarget

def multiClass_pred(ansVocab,vocab_df,modelPred):
    """Build a bag-of-words indicator matrix for the predicted answers.

    Args:
        ansVocab: answer vocabulary; only its length is used.
        vocab_df: dataframe with "vocab" and "index" columns.
        modelPred: array indexed as [sample][position][vocab].

    Returns:
        Array of shape (len(modelPred), len(ansVocab)) with 1 for every word
        that is the arg-max at some position; the "<Pad>" column is always 0.
    """
    predTarget = np.zeros((len(modelPred),len(ansVocab)))
    for row, samplePred in enumerate(modelPred):
        predTarget[row, np.argmax(samplePred,1)] = 1
    padIxs = vocab_df["index"].loc[vocab_df["vocab"]=="<Pad>"].values
    predTarget[:, padIxs] = 0
    return predTarget

In [67]:
#WUPS measure, requires input as list of actual and predicted 
def wup_measure(a,b,similarity_threshold=0.9):
    r"""
    Return a Wu-Palmer similarity score between two answer words.

    More specifically, it computes:
        max_{x \in interp(a)} max_{y \in interp(b)} wup(x,y)
    where interp is an 'interpretation field': here, the WordNet noun
    synsets of the word.

    Args:
        a, b: the two words to compare.
        similarity_threshold: if the best synset similarity is below this
            value, the score is down-weighted by a factor of 0.1.

    Returns:
        1.0 for identical words, 0 when either word is empty or has no noun
        synset, otherwise the (possibly down-weighted) best similarity.
    """
    def get_semantic_field(a):
        # Noun synsets of the word, with a neutral weight.
        weight = 1.0
        semantic_field = wn.synsets(a,pos=wn.NOUN)
        return (semantic_field,weight)


    def get_stem_word(a):
        r"""
        Sometimes an answer has the form word\d+:wordid.
        The reference implementation returns the word and down-weights it;
        this version returns the input unchanged with weight 1.0.
        """
        weight = 1.0
        return (a,weight)


    global_weight=1.0

    (a,global_weight_a)=get_stem_word(a)
    (b,global_weight_b)=get_stem_word(b)
    global_weight = min(global_weight_a,global_weight_b)

    if a==b:
        # they are the same
        return 1.0*global_weight

    if a==[] or b==[]:
        return 0


    interp_a,weight_a = get_semantic_field(a) 
    interp_b,weight_b = get_semantic_field(b)

    if interp_a == [] or interp_b == []:
        return 0

    # we take the most optimistic interpretation
    global_max=0.0
    for x in interp_a:
        for y in interp_b:
            local_score=x.wup_similarity(y)
            # wup_similarity can return None when no similarity is available;
            # comparing None with a float raises TypeError on Python 3.
            if local_score is not None and local_score > global_max:
                global_max=local_score

    # we need to use the semantic fields and therefore we downweight
    # unless the score is high which indicates both are synonyms
    if global_max < similarity_threshold:
        interp_weight = 0.1
    else:
        interp_weight = 1.0

    final_score=global_max*weight_a*weight_b*interp_weight*global_weight
    return final_score 

def fuzzy_set_membership_measure(x,A,m):
    """Return the best score of ``x`` against any member of ``A``.

    Args:
        x: the element to score.
        A: list of candidates; an empty list gives 0.
        m: function ``m(x, a)`` scoring ``x`` against one candidate.
    """
    if A == []:
        return 0
    return max(m(x, candidate) for candidate in A)

def score_it(A,T,m):
    """Score two answer sets against each other.

    Args:
        A, T: the two lists of answers.
        m: function ``m(element, other_list)`` giving the membership of an
            element in the other list.

    Returns:
        1 when both lists are empty; otherwise the smaller of the product of
        the memberships of A's elements in T and the product of the
        memberships of T's elements in A.
    """
    if A==[] and T==[]:
        return 1

    def membership_product(elements, other):
        # Product of the memberships of every element in the other list.
        product = 1
        for element in elements:
            product = product*m(element, other)
        return product

    score_left = membership_product(A, T)
    score_right = membership_product(T, A)
    return min(score_left,score_right) 

def our_element_membership(x, y):
    """WUPS similarity between two single answers, with threshold 0.9."""
    return wup_measure(x, y, 0.9)


def our_set_membership(x, A):
    """Best WUPS similarity between answer ``x`` and any answer in ``A``."""
    return fuzzy_set_membership_measure(x, A, our_element_membership)


def calculate_WUP(actual_list,pred_list):
    """Print (and return) the mean WUPS score over paired answers.

    Args:
        actual_list: list of true answers, each a list of words.
        pred_list: list of predicted answers, each a list of words, in the
            same order as ``actual_list``.

    Returns:
        The mean score as a float in [0, 1], or None when there is nothing to
        score. The previous ``len(actual_list) > 1`` guard skipped a
        single-sample list and then divided by zero.
    """
    score_list = [score_it(a,p,our_set_membership)
                  for a,p in zip(actual_list,pred_list)]
    if not score_list:
        print('final score is undefined: no answers to score')
        return None
    final_score=float(sum(score_list))/float(len(score_list))
    print('final score is %2.2f%%' % (final_score * 100.0))
    return final_score

In [None]:
# # results on whole dataset
# PredAnsList=[]
# actualAnsList =[]
# totalPrecision =0.0
# totalRecall = 0.0
# for itr in range(int(math.ceil(np.asarray(questData).shape[0]/batchSize))):
#     if (np.asarray(questData).shape[0] - (itr*batchSize) >= batchSize):
#         startIx = (itr*batchSize)%(np.asarray(questData).shape[0])
#         #startIx = 0
#     else:
#         break
#     # prepare batch data
#     batchData = questData[startIx:startIx+batchSize] 
#     batchDataOp = ansData[startIx:startIx+batchSize]
#     batchImgId = imageIds[startIx:startIx+batchSize]
#     #encoder Inputs
#     trainEncInp = get_encInp(batchData,maxQuestLen)
#     #image input
#     trainImg = get_imgInp(batchData,batchImgId)
#     #answer targets
#     #trainTarget = get_oneHotTarget(batchDataOp,len(ansVocab),ansVocabdf)
#     modelPred = modelQA.predict([trainImg,trainEncInp])
#     target = multiClass_target(ansVocab,ansVocabdf,batchDataOp)
#     PredTarget = multiClass_pred(ansVocab,ansVocabdf,modelPred)
#     precision,recall = metric_check(PredTarget,target)
#     totalPrecision += precision
#     totalRecall +=recall
#     actualAnsList1,PredAnsList1 = get_ActandPred_list(modelPred,ansVocabdf,batchDataOp)
#     actualAnsList.extend(actualAnsList1)
#     PredAnsList.extend(PredAnsList1)
# calculate_WUP(actualAnsList,PredAnsList)
# print("Total Precision:"+ str(np.asarray(totalPrecision).astype("float32")/(itr+1) ))
# print("Total Recall:"+ str(np.asarray(totalRecall).astype("float32")/(itr+1) ))

In [84]:
# Persist the trained VQA model as imgTextSimple1.json (architecture) and imgTextSimple1.h5 (weights).
save_model(modelQA,"imgTextSimple1")

In [105]:
# Reload the saved VQA model with the notebook's own load_model helper (JSON + weights).
loaded_model_vaq = load_model("imgTextSimple1")

In [106]:
# Re-run the evaluation with the reloaded model.
# NOTE: trainImg, EncInp, batchData and batchDataOp are not defined in this
# cell; they are whatever an earlier evaluation cell left in the kernel.
modelPred = loaded_model_vaq.predict([trainImg,EncInp])
target = multiClass_target(ansVocab,ansVocabdf,batchDataOp)
PredTarget = multiClass_pred(ansVocab,ansVocabdf,modelPred)
precision,recall = metric_check(PredTarget,target)
actualAnsList,PredAnsList = get_ActandPred_list(modelPred,ansVocabdf,batchDataOp)
print("Precision:"+str(precision))
print("Recall"+str(recall))
calculate_WUP(actualAnsList,PredAnsList)
print_predictions(modelPred,batchData,batchDataOp)

Precision:0.1
Recall0.08
final score is 10.25%
Original Question:['what', 'is', 'the', 'largest', 'white', 'object', 'on', 'the', 'left', 'side', 'of', 'the', 'picture', '?']
Original Answer:['<start>', 'printer']
Decoded Answer:['shelves', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>']
Original Question:['what', 'is', 'between', 'the', 'paper', 'holder', 'and', 'tape', 'dispenser', 'below', 'the', 'white', 'paper', 'rack', '?']
Original Answer:['<start>', 'hole', 'puncher']
Decoded Answer:['plant', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>']
Original Question:['what', 'is', 'the', 'blue', 'object', 'in', 'the', 'black', 'pen', 'stand', '?']
Original Answer:['<start>', 'scissor']
Decoded Answer:['door', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>', '<Pad>']
Original Question:['what', 'color', 'is', 'the', 'fax', 'machine', 'below', 'the'

In [None]:
#add test on complete data
#save model and results 
#experiment with regularization 