## Evaluate results 

### Load the test data

In [2]:
import pickle
from collections import deque
import numpy as np

# Pickled test split; each record unpacks into (question, image features, answer).
file_name = './data/processed_data/questions-visual_features-test.txt'

# NOTE(review): pickle.load can execute arbitrary code -- only point this at
# files produced by this project's own preprocessing.
with open(file_name, 'rb') as pickle_file:
    test_deque = pickle.load(pickle_file)

# Transpose the records into three parallel tuples, one per field.
test_questions, test_images, test_answers = zip(*test_deque)

### CONVERT FROM TUPLE TO ARRAY
# test_images stays a tuple; its array form is stored as test_visual_features.
test_questions, test_visual_features, test_answers = (
    np.array(field) for field in (test_questions, test_images, test_answers)
)

### Load the model

In [4]:
import numpy as np

# Load the pre-computed embedding matrix from disk; it is used as the constant
# initializer of the frozen Embedding layer when the encoder is built.
# (Presumably 300-d GloVe vectors, judging by the file name -- confirm.)
embedding_matrix=np.load('./data/embedding/glove_300d_embedding.npy')

In [7]:
from keras import optimizers
from keras.models import load_model
from numpy import array
from keras.models import Sequential
from keras.models import Model
from keras.layers import LSTM, Bidirectional
from keras.layers import Input
from keras.layers import Embedding
from keras.initializers import Constant
from keras.layers import Flatten
from keras.layers import Dense, Conv2D, MaxPooling2D
from keras.layers import Multiply, Dropout

# Rebuild the network architecture in code so that previously saved weights
# can be loaded into it (load_weights needs a matching graph).

# Size of the Embedding input vocabulary and of the final softmax layer.
MAX_WORDS=3000
# Length of the question input sequence fed to the encoder.
MAX_LEN=25
# Width of each embedding vector (second argument of the Embedding layer).
EMBED_DIM=300


# encoder input model
encoder_inputs = Input(shape=(MAX_LEN,))
# Embedding initialised from the loaded matrix and frozen (trainable=False).
encoder1 = Embedding(MAX_WORDS,
                     EMBED_DIM,
                     embeddings_initializer=Constant(embedding_matrix),
                     trainable=False)(encoder_inputs)
# Frozen bidirectional LSTM; with Keras' default merge mode the output is
# presumably 2 * 512 = 1024 wide -- confirm.
encoder2  = Bidirectional(LSTM(512,activation='relu', trainable=False))(encoder1)
# ENCODER MODEL
# The encoder is wrapped in its own Model because its weights were saved as a
# standalone model and are loaded separately here.
encoder_model = Model(inputs=encoder_inputs,outputs=encoder2, name='Encoder')
encoder_model.load_weights('./data/models/bidirectionnal_lstm_encoder2.h5')

## IMAGES
# Visual branch: consumes 14x14x512 feature maps.
cnn_input=Input(shape=(14,14,512), name='CNN-Input')

x = Conv2D(256, (3, 3),
                  activation='relu',
                  padding='same')(cnn_input)

x = Conv2D(128, (3, 3),
                  activation='relu',
                  padding='valid')(x)

x = MaxPooling2D((2, 2), strides=(2, 2))(x)
flatten=Flatten()(x)

# NOTE(review): input_dim=4096 does not match the flattened size I calculate
# for this stack (14x14 -> 'valid' 3x3 conv -> 12x12 -> 2x2 pool -> 6x6, times
# 128 channels = 4608). The argument appears to be unused when the layer is
# called on a tensor -- confirm and consider dropping it.
cnn_output=Dense(1024, input_dim=4096, activation='tanh')(flatten)
cnn=Model(inputs=cnn_input,outputs=cnn_output)

# Fuse the two branches by element-wise multiplication; both outputs must
# therefore have the same width (1024 for the CNN branch).
multiplied = Multiply()([cnn.output, encoder_model.output])
dropout_1 = Dropout(0.5)(multiplied)
fully_connected=Dense(1000, activation='tanh')(dropout_1)
dropout_2 = Dropout(0.5)(fully_connected)
# Final classifier: one softmax unit per index in [0, MAX_WORDS).
# The name fully_connected is rebound from the hidden layer to the output layer.
fully_connected=Dense(MAX_WORDS, activation='softmax')(dropout_2)

# Input order is [visual features, questions]; predict() calls must match it.
question_answering=Model(inputs=[cnn.input,encoder_model.input], outputs=fully_connected)

question_answering.compile(optimizer='adam', loss='categorical_crossentropy',
    metrics=['accuracy'])




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.



In [9]:
# Restore the trained weights into the freshly built question-answering graph.
# NOTE(review): this presumably also overwrites the encoder weights loaded
# when the encoder was built, since those layers are part of this model --
# confirm.
question_answering.load_weights('./data/models/question_answering_model2.h5')

In [10]:
# Make predictions on the test set. Inputs follow the model's declared
# order: visual features first, then the questions.
predictions = question_answering.predict([test_visual_features, test_questions])
# Reduce each row of scores to the index holding the highest probability.
predictions_argmax = predictions.argmax(axis=1)

In [12]:
import pickle

# Restore the pickled tokenizer; its word_index is what the decoding step
# inverts to turn class indices back into words.
# NOTE(review): unpickling can run arbitrary code -- only load trusted files.
with open('./data/tokenizer/tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [13]:
import numpy as np

# Invert the tokenizer vocabulary so it maps index -> word.
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}
# Give index 0 an explicit placeholder (presumably the reserved/padding
# index that word_index never assigns -- confirm).
reverse_word_map[0] = 'not in vocab'

# Decode predicted class indices into answer words.
predicted_answers = [reverse_word_map[index] for index in predictions_argmax]
# np.squeeze removes singleton axes before iterating
# (assumes test_answers is (N, 1) or (N,) -- TODO confirm).
real_answers = [reverse_word_map[index] for index in np.squeeze(test_answers)]

#### Accuracy

In [14]:
### accuracy on test set
# Exact-match accuracy: count the positions where the decoded prediction
# equals the decoded ground truth, then divide by the number of predictions.
correct = sum(
    1
    for i, predicted in enumerate(predicted_answers)
    if predicted == real_answers[i]
)

accuracy = correct / len(predicted_answers)
print('accuracy :' + str(accuracy))

accuracy :0.21871138570167697


#### WUPS

In [15]:
import WUPS
from WUPS import wup_measure

In [54]:
# Remove every apostrophe from the answer strings and keep the cleaned
# answers as NumPy arrays.
predicted_answers = np.array([answer.replace("'", '') for answer in predicted_answers])
real_answers = np.array([answer.replace("'", '') for answer in real_answers])

#### wup score @0.0

In [60]:
# Sum wup_measure over every (prediction, ground truth) pair, evaluated
# with similarity_threshold=0, then average over the number of predictions.
total_wup = sum(
    wup_measure(predicted, real_answers[i], similarity_threshold=0)
    for i, predicted in enumerate(predicted_answers)
)

wup_score = total_wup / len(predicted_answers)

In [61]:
wup_score

0.7625313402551096

#### wup score @0.9

In [62]:
# Sum wup_measure over every (prediction, ground truth) pair, evaluated
# with similarity_threshold=0.9, then average over the number of predictions.
total_wup = sum(
    wup_measure(predicted, real_answers[i], similarity_threshold=0.9)
    for i, predicted in enumerate(predicted_answers)
)

wup_score = total_wup / len(predicted_answers)

In [63]:
wup_score

0.3031790144457981