# Introductory code

In [None]:
# Explicitly print the variables: useful for debugging.
# With "all", IPython displays the value of every top-level expression in a cell, not only the last one.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Import most relevant libraries

import os
import tensorflow as tf
import numpy as np

# Import library for handling json files

import json

# Import library for handling dataframes

import pandas as pd

# Import library for handling images

from PIL import Image

# Use one fixed seed for both NumPy and TensorFlow random operations, so that experiments are reproducible

SEED = 1234
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Make Google Drive available inside the Colab session

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Run this to unzip the folder for the current session

!unzip '/content/drive/My Drive/ANNDL_Homeworks3/Experiment_10/anndl-2020-vqa.zip'

In [None]:
# Inspect the dataset: list the session root and the content of the unzipped dataset folder

!ls '/content'
!ls '/content/VQA_Dataset'

drive  sample_data  VQA_Dataset
Images	test_questions.json  train_questions_annotations.json


In [None]:
# Locations used throughout the notebook:
#   env         - Drive folder which contains the zip file and the notebook
#   dataset_dir - folder of the unzipped dataset
#   cwd         - current working directory ('/content')

env = '/content/drive/My Drive/ANNDL_Homeworks3/Experiment_10'
dataset_dir = os.path.join('/content', 'VQA_Dataset')
cwd = os.getcwd()

# Tokenize the words

In [None]:
# Read the training annotations. The .json file holds a dictionary with:
#   key:   id of the question
#   value: infos about the question

json_train_q_a_dir = os.path.join(dataset_dir, 'train_questions_annotations.json')
with open(json_train_q_a_dir) as json_file:
    labels = json.load(json_file)

# Turn the dictionary into a dataframe (one row per question id) and name its three columns

df = pd.DataFrame.from_dict(labels, orient='index')
df.columns = ['question', 'image_id', 'answer']

# Rewrite the two-word answer "monkey bars" as the single word "monkeybars"

monkey_bars_rows = df['answer'] == 'monkey bars'
df.loc[monkey_bars_rows, 'answer'] = 'monkeybars'

In [None]:
# Parameters used in our attempts

# Data: number of samples kept from the dataframe and word cap given to the tokenizers
num_data = 28000
MAX_NUM_WORDS = 100000

# Images: height and width every image is resized to
img_h, img_w = 64, 64

# Text: size of the word embedding vectors
EMBEDDING_SIZE = 64

# Import functions we use to convert words to integers

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Keep only the first num_data samples
df = df.head(num_data)

# Raw texts to be tokenized
question_list = df['question'].tolist()
answer_list = df['answer'].tolist()

# ---- Questions ----
# Tokenizer with an explicit out-of-vocabulary token
question_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token='<UNK>')
question_tokenizer.fit_on_texts(question_list)
question_tokenized = question_tokenizer.texts_to_sequences(question_list)

# Word -> integer dictionary and vocabulary size (number of words + 2)
question_wtoi = question_tokenizer.word_index
vocabulary_question_size = len(question_wtoi) + 2

# Pad every question on the left up to the length of the longest one
max_question_length = max(map(len, question_tokenized))
question_encoder_inputs = pad_sequences(question_tokenized, maxlen=max_question_length, padding='pre')

# ---- Answers ----
answer_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
answer_tokenizer.fit_on_texts(answer_list)
answer_tokenized = answer_tokenizer.texts_to_sequences(answer_list)

# Word -> integer dictionary and vocabulary size (number of words + 1)
answer_wtoi = answer_tokenizer.word_index
vocabulary_answer_size = len(answer_wtoi) + 1

# Pad every answer on the right up to the length of the longest one
max_answer_length = max(map(len, answer_tokenized))
answer_encoder_inputs = pad_sequences(answer_tokenized, maxlen=max_answer_length, padding='post')

# Model definition

In [None]:
### RNN for language input

# Symbolic input: one padded sequence of max_question_length word indices per question
encoder_input = tf.keras.Input(shape=[max_question_length])

# Embedding layer: maps every word index of a question to a dense, trainable vector of size EMBEDDING_SIZE

encoder_embedding_layer = tf.keras.layers.Embedding(input_dim=vocabulary_question_size,       # number of indices the layer accepts: words in the question dictionary + 2 (not MAX_NUM_WORDS, because the dataset may contain fewer words)
                                                    output_dim=EMBEDDING_SIZE,                # size m of the dense vector representing each word
                                                    input_length=max_question_length,         # length of each (padded) input sequence
                                                    mask_zero=True)                           # index 0 is treated as padding and masked out
encoder_embedding_out = encoder_embedding_layer(encoder_input)
encoder = tf.keras.layers.LSTM(units=128, return_state=True)
# return_sequences (not set here, so left at its default): whether to return the full output sequence instead of only the last output.
# return_state=True: the layer returns its final states in addition to the output.

# Only the LSTM output is used as the question encoding; the two returned states are discarded
encoded_question, _, _ = encoder(encoder_embedding_out)


### CNN for image input

# Load VGG16 with ImageNet weights and without its top classifier, then freeze it so that it is not trained

vgg = tf.keras.applications.VGG16(weights='imagenet', include_top=False, input_shape=(img_h, img_w, 3))
vgg.trainable = False

# Build the CNN: frozen VGG16 -> Flatten -> Dense(128, relu), which yields the image encoding

vision_model = tf.keras.Sequential()
vision_model.add(vgg)
vision_model.add(tf.keras.layers.Flatten())
vision_model.add(tf.keras.layers.Dense(128, activation='relu'))

image_input = tf.keras.Input(shape=(img_h, img_w, 3))
encoded_image = vision_model(image_input)


### Combine the 2 models

from tensorflow.keras.models import Model

# Concatenate the question and image encodings and classify over the answer vocabulary with a softmax.
# Note the order of the model inputs: image first, question second.
merged = tf.keras.layers.concatenate([encoded_question, encoded_image])
output = tf.keras.layers.Dense(vocabulary_answer_size, activation='softmax')(merged)
vqa_model = Model(inputs=[image_input, encoder_input], outputs=output)

### Model summary

vqa_model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 21)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 21, 64)       222464      input_1[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 64, 64, 3)]  0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 128), (None, 9

# Image preprocessing

In [None]:
# Function that loads the image and processes it with VGG16 preprocessing function

from tensorflow.keras.applications.vgg16 import preprocess_input

def load_and_process_image(image_path):
  """Load an image, resize it to the network input size and apply the VGG16 preprocessing.

  Args:
    image_path: path of the image file to open with PIL.

  Returns:
    The result of the VGG16 `preprocess_input` applied to the (img_h, img_w, 3) pixel array.
  """
  # The context manager closes the image file once the pixels have been read
  with Image.open(image_path) as im:
    # Grayscale / palette images have no colour axis to slice: expand them to RGB first
    if im.mode not in ('RGB', 'RGBA'):
      im = im.convert('RGB')
    # PIL expects the size as (width, height); LANCZOS is the filter formerly exposed as ANTIALIAS
    im = im.resize((img_w, img_h), resample=Image.LANCZOS)
    # Keep only the three colour channels (drops the alpha channel when present)
    im = np.array(im.convert('RGB'))
  return preprocess_input(im)

# Function that creates the proper numpy array for storing the images

def read_images(paths):
  """Load and preprocess every image listed in `paths` into a single numpy array.

  Args:
    paths: iterable of image file paths.

  Returns:
    Array of shape (number of paths, img_h, img_w, 3); row i holds the processed image of the i-th path.
  """
  paths = list(paths)
  # Size the buffer on the actual number of paths instead of the global num_data: with fewer
  # paths the result would contain trailing all-zero images, with more paths the loop would overflow
  ims = np.zeros((len(paths), img_h, img_w, 3))
  for i, image_path in enumerate(paths):
    ims[i] = load_and_process_image(image_path)
  return ims

# Build the path of the .png image of every training sample, then load them all into the numpy array used for training

images_dir = os.path.join(dataset_dir, 'Images')
image_list = [os.path.join(images_dir, image_id + '.png') for image_id in df['image_id'].tolist()]

train_X_images = read_images(image_list)

# Model training

In [None]:
# Optimization settings

# Optimizer: Adam with the chosen learning rate
lr = 1e-3
optim = tf.keras.optimizers.Adam(learning_rate=lr)

# Loss: categorical cross-entropy
ls = tf.keras.losses.CategoricalCrossentropy()

# Metrics reported during training and validation
val_metric = ['accuracy']

# Compile the model with the settings above
vqa_model.compile(loss=ls, optimizer=optim, metrics=val_metric)

In [None]:
# We set the proper callbacks for training

from datetime import datetime

cwd = os.getcwd()

# Folder layout (inside env): vqa_experiments/<exp_name>_<timestamp>/{ckpts, tb_logs}

exps_dir = os.path.join(env, 'vqa_experiments')

now = datetime.now().strftime('%b%d_%H-%M-%S')

exp_name = 'vqa_exp'

exp_dir = os.path.join(exps_dir, exp_name + '_' + str(now))
ckpt_dir = os.path.join(exp_dir, 'ckpts')
tb_dir = os.path.join(exp_dir, 'tb_logs')

# exist_ok=True creates each folder only when it is missing, in a single call
# (no separate "exists" check that could race with another process)

for directory in (exps_dir, exp_dir, ckpt_dir, tb_dir):
    os.makedirs(directory, exist_ok=True)

callbacks = []

# Model checkpoints: weights are saved at the end of every epoch, one file per epoch

ckpt_callback = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(ckpt_dir, 'cp_{epoch:02d}.ckpt'),
                                                   save_best_only=False,
                                                   save_weights_only=True)  # False to save the model directly
callbacks.append(ckpt_callback)

# Visualize Learning on Tensorboard
# By default shows losses and metrics for both training and validation

tb_callback = tf.keras.callbacks.TensorBoard(log_dir=tb_dir,
                                             profile_batch=0,
                                             histogram_freq=1)  # if 1 shows weights histograms
callbacks.append(tb_callback)

# Early Stopping: stop when the validation loss has not improved for 6 consecutive epochs

early_stop = True
if early_stop:
    es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=6)
    callbacks.append(es_callback)

In [None]:
# Convert the answers into categorical (one-hot) targets

from tensorflow.keras.utils import to_categorical

# The model emits a single softmax over the answer vocabulary, so every answer must be exactly one token
if answer_encoder_inputs.ndim != 2 or answer_encoder_inputs.shape[1] != 1:
    raise ValueError('Expected exactly one token per answer, got padded answers of shape '
                     + str(answer_encoder_inputs.shape))

# num_classes is pinned to the width of the softmax layer instead of being inferred from the
# largest index present in the data, so targets and model output always have the same size
y_train = to_categorical(answer_encoder_inputs[:, 0], num_classes=vocabulary_answer_size)

# Training parameters

num_epochs = 20
bs = 1
val_split = 0.1

# Fit the model (inputs are passed in the same order used to build the model: images, then questions)

vqa_model.fit(x=[train_X_images, question_encoder_inputs],
              y=y_train,
              epochs=num_epochs,
              batch_size = bs,
              validation_split=val_split,
              shuffle=True,
              callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


<tensorflow.python.keras.callbacks.History at 0x7f3342019fd0>

In [None]:
# Load the TensorBoard notebook extension and open it on the experiments folder
# (the 'vqa_experiments' directory inside env, where the training callbacks write their logs)
%load_ext tensorboard
%tensorboard --logdir '/content/drive/My Drive/ANNDL_Homeworks3/Experiment_10/vqa_experiments'

# Predictions creation

In [None]:
# Restore the weights of the chosen model: checkpoint 'cp_04.ckpt' of the run 'vqa_exp_Jan01_20-00-36'

chosen_ckpt = os.path.join(env, 'vqa_experiments', 'vqa_exp_Jan01_20-00-36', 'ckpts', 'cp_04.ckpt')
vqa_model.load_weights(chosen_ckpt)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f333a2f78d0>

In [None]:
## Create the proper dictionaries

# Integer -> word dictionary, i.e. the inverse of our word -> integer dictionary answer_wtoi

answer_itow = dict(zip(answer_wtoi.values(), answer_wtoi.keys()))

# Given dictionary to be used for the submission (with 'monkey bars' changed to 'monkeybars').
# Each answer is mapped to its position in the list below, i.e. to the labels 0..57.

labels_dict = {
    answer: label
    for label, answer in enumerate([
        '0', '1', '2', '3', '4', '5',
        'apple', 'baseball', 'bench', 'bike', 'bird', 'black', 'blanket', 'blue',
        'bone', 'book', 'boy', 'brown', 'cat', 'chair', 'couch', 'dog',
        'floor', 'food', 'football', 'girl', 'grass', 'gray', 'green', 'left',
        'log', 'man', 'monkeybars', 'no', 'nothing', 'orange', 'pie', 'plant',
        'playing', 'red', 'right', 'rug', 'sandbox', 'sitting', 'sleeping', 'soccer',
        'squirrel', 'standing', 'stool', 'sunny', 'table', 'tree', 'watermelon', 'white',
        'wine', 'woman', 'yellow', 'yes',
    ])
}

In [None]:
# Load the .json test dictionary

json_test_q_dir = os.path.join(dataset_dir, 'test_questions.json')
with open(json_test_q_dir) as json_file:
    test_questions_dict = json.load(json_file)

# Question id -> predicted label (as required by the submission format)

results = {}

for key, sample in test_questions_dict.items():

  # Process the question exactly as the training questions: tokenize, then left-pad to max_question_length

  question = sample['question']
  question_t = question_tokenizer.texts_to_sequences([question])
  question_p = pad_sequences(question_t, maxlen=max_question_length, padding = 'pre', truncating = 'pre')

  # Process the image and add the batch dimension expected by the model

  image_name = sample['image_id']
  image_path = os.path.join(dataset_dir,'Images',image_name + '.png')

  test_image = np.expand_dims(load_and_process_image(image_path), axis=0)

  # Generate the prediction.
  # Output 0 of the softmax is the padding index, which has no entry in answer_itow:
  # take the argmax over the real classes only (1..N) so the lookup below cannot fail on it.

  pred = vqa_model.predict(x=[test_image, question_p])
  pred_id = int(np.argmax(pred[0, 1:])) + 1
  pred_word = answer_itow[pred_id]
  pred_label = labels_dict[pred_word]

  # Save the prediction

  results[key] = pred_label

In [None]:
# Required function for csv creation

import os
from datetime import datetime

def create_csv(results, results_dir='./'):
    """Write the predictions to a timestamped csv file in the submission format.

    The file is named 'results_<timestamp>.csv' and contains the header
    'Id,Category' followed by one 'id,label' line per entry of `results`.

    Args:
        results: dictionary mapping each question id to its predicted label.
        results_dir: directory in which the csv file is created.

    Returns:
        The path of the csv file that was written.
    """
    csv_fname = 'results_' + datetime.now().strftime('%b%d_%H-%M-%S') + '.csv'
    csv_path = os.path.join(results_dir, csv_fname)

    with open(csv_path, 'w') as f:

        f.write('Id,Category\n')

        # str() on both fields, so that ids which are not strings are accepted as well
        f.writelines(str(key) + ',' + str(value) + '\n' for key, value in results.items())

    # The file name contains the creation time: return the path so the caller can locate the file
    return csv_path

# Write the submission csv with all the predictions into the Drive folder env

create_csv(results_dir=env, results=results)