In [None]:
# Upload Input Data
# -----------------
# Parameters:

# Upload Input Data (True, False)
# When True, the branch below imports `google.colab`, so it can only run
# inside a Google Colab runtime.
UPLOAD_DATA = False

# -----------------

if UPLOAD_DATA:
    # Imported lazily so the notebook still runs outside Colab when uploads are disabled.
    from google.colab import files
    files.upload()

# -----------------

In [None]:
# Modules Import
# --------------

import csv
import datetime
import enum
import glob
import json
import math
import operator
import os
import pathlib
import random
import re
import shutil
import time
import warnings
from collections import defaultdict
from contextlib import contextmanager
from operator import itemgetter

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn import preprocessing
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Dense, Input, LSTM, Bidirectional, Activation, Conv1D, GRU, add, Conv2D, Reshape
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D, multiply
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import VGG16, VGG19
from tensorflow.keras.applications.vgg19 import preprocess_input
from tensorflow.keras.utils import Sequence
from tensorflow.keras import utils
from tensorflow.keras.preprocessing import image, text, sequence
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
from tensorflow_addons.metrics import F1Score

warnings.filterwarnings('ignore')

@contextmanager
def timer(name):
    """Context manager that prints how long the wrapped block took.

    Args:
        name: Label shown in square brackets in the printed message.

    The message is printed only when the wrapped block finishes without
    raising; elapsed wall-clock time is rounded to whole seconds.
    """
    started_at = time.time()
    yield
    elapsed_seconds = time.time() - started_at
    print(f'[{name}] done in {elapsed_seconds:.0f} s')


# NOTE(review): "Agg" is a non-interactive backend, and the `%matplotlib inline`
# magic on the next line selects the inline backend right after it — confirm
# whether the explicit "Agg" call is still needed.
matplotlib.use("Agg")
%matplotlib inline

# --------------

In [None]:
# Feature Extraction Available Models
# -----------------------------------

class MODEL(enum.Enum):
    """Feature-extraction models that `FEATURE_EXTRACTION` can select.

    The notebook compares `FEATURE_EXTRACTION` against the members' string
    `.value`, not against the members themselves.
    """
    VGG19 = "VGG19"
    AUTOENCODER = "Autoencoder"

# -----------------------------------

In [None]:
# Module Parameters
# -----------------
# Parameters:

# Image Height
# (initial value; the "Feature Extraction Dimensions" cell later reassigns it)
IMG_H = 400

# Image Width
# (initial value; the "Feature Extraction Dimensions" cell later reassigns it)
IMG_W = 700

# Text Data Augmentation (True, False)
# Selects which annotation file is processed and which processed file name is used.
TEXT_AUGMENTATION = False

# Feature Extraction Model
# Stored as the enum member's string value; later cells compare against `MODEL.<X>.value`.
FEATURE_EXTRACTION = MODEL.AUTOENCODER.value

# Train Autoencoder Model (True, False)
# When False, the autoencoder is loaded from `autoencoder_checkpoint` instead of being trained.
TRAIN_AUTOENCODER = False

# Train Attention Model (True, False)
TRAIN_ATTENTION = True

# Test Prediction (True, False)
TEST_PREDICTION = False

# -----------------

# Answer vocabulary in class-index order: position in this list == class id.
_ANSWER_VOCABULARY = [
    '0', '1', '2', '3', '4', '5',
    'apple', 'baseball', 'bench', 'bike', 'bird', 'black', 'blanket', 'blue',
    'bone', 'book', 'boy', 'brown', 'cat', 'chair', 'couch', 'dog', 'floor',
    'food', 'football', 'girl', 'grass', 'gray', 'green', 'left', 'log', 'man',
    'monkey bars', 'no', 'nothing', 'orange', 'pie', 'plant', 'playing', 'red',
    'right', 'rug', 'sandbox', 'sitting', 'sleeping', 'soccer', 'squirrel',
    'standing', 'stool', 'sunny', 'table', 'tree', 'watermelon', 'white',
    'wine', 'woman', 'yellow', 'yes',
]

# Maps each answer string to its integer class id (0..57).
labels_dict = {answer: class_id for class_id, answer in enumerate(_ANSWER_VOCABULARY)}

# -----------------

In [None]:
# GPU Configuration
# -----------------

physical_devices = tf.config.list_physical_devices('GPU')

# Enable memory growth on every detected GPU. Looping (instead of indexing [0])
# keeps the cell runnable on CPU-only machines, where the list is empty, and
# keeps the setting consistent across devices on multi-GPU machines.
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

print(physical_devices)

# -----------------

In [None]:
# Define Directories
# ------------------
# Parameters:

# Root Directory
ROOT_DIR = pathlib.Path(".")

# ------------------

# Raw annotations and their text-augmented counterpart.
annotations_file = ROOT_DIR / "train_questions_annotations.json"
augmented_file = ROOT_DIR / "train_augmented_questions_annotations.json"

# The processed file name depends on whether text augmentation is enabled,
# so the two variants never overwrite each other.
if TEXT_AUGMENTATION:
    processed_file = ROOT_DIR / "processed_train_augmented_questions_annotations.json"
else:
    processed_file = ROOT_DIR / "processed_train_questions_annotations.json"

temp_directory = ROOT_DIR / "temps"

images_directory = ROOT_DIR / "images"
features_directory = ROOT_DIR / 'features'

# Autoencoder checkpoints live under the temp directory.
autoencoder_checkpoint_directory = temp_directory
autoencoder_checkpoint = autoencoder_checkpoint_directory / "autoencoder_checkpoint"

# ------------------

In [None]:
# Random Seed
# -----------
# Parameters:

# Random Seed
SEED = 1000

# -----------

# Seed every random number generator the notebook draws from, not only
# TensorFlow: the dataframe split uses `np.random`, and `random` is imported.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# -----------

In [None]:
# Text Augmentation
# -----------------
# Parameters:

# Force Reload Text Augmentation (True, False)
FORCE_RELOAD = False

# Augmentation Frequency
FREQUENCY = 5000

# -----------------

if TEXT_AUGMENTATION and (FORCE_RELOAD or not os.path.exists(augmented_file)):
    !pip install textattack

    from textattack.augmentation import EmbeddingAugmenter
    aug = EmbeddingAugmenter()

    augmented_data = []
    class_frequency = {}

    for i in range(58):
        class_frequency[i] = 0
        
    for i in tqdm(annotations_file):
        value = anno[i]
        answer = value['answer']
        class_frequency[labels_dict[answer]] += 1
        
    for i in tqdm(annotations_file):
        value = annotations_file[i]
        image_path = value['image_id']
        question = value['question']
        answer = value['answer']
        
        weight = int(FREQUENCY / class_frequency[labels_dict[answer]])
        
        if weight <= 1:
            augmented_data.append({
                'image_id': str(image_path), 
                'question': question, 
                'answer': answer})
        else:
            for j in range(weight):
                augmented_question = aug.augment(question)
                augmented_data.append({
                    'image_id': str(image_path), 
                    'question': augmented_question[0], 
                    'answer': answer})

    json.dump(augmented_data, open(augmented_file, 'w'))

# -----------------

In [None]:
# Annotations Processing
# ----------------------

def process_question_annotation(annotations):
    """Convert an annotation file into the processed training records file.

    Reads `annotations` (JSON), builds one record per annotation with keys
    `img_path`, `question` and `answer`, and writes the list to the
    module-level `processed_file`.

    Args:
        annotations: Path of the JSON annotation file. Both a list of records
            and a dict of records (values are used, keys ignored) are accepted.
    """
    with open(annotations, 'r') as annotations_fp:
        anno = json.load(annotations_fp)

    # Iterating a dict yields its keys, so take the values when the file is keyed.
    records = list(anno.values()) if isinstance(anno, dict) else anno

    data = []
    for value in tqdm(records):
        image_path = images_directory / (value['image_id'] + '.png')

        # A question stored as a list contributes its first element; a plain
        # string is kept whole (indexing it with [0] would keep one character).
        question = value['question']
        if isinstance(question, (list, tuple)):
            question = question[0]

        answer = value['answer']
        data.append({
            'img_path': str(image_path),
            'question': question,
            'answer': answer})

    with open(processed_file, 'w') as processed_fp:
        json.dump(data, processed_fp)

# ----------------------

if TEXT_AUGMENTATION:
    process_question_annotation(augmented_file)
else:
    process_question_annotation(annotations_file)

# Later cells iterate `train_data` as a list of records (`itemgetter('answer')`
# etc.), so load the processed JSON instead of keeping only its path.
with open(processed_file, 'r') as processed_fp:
    train_data = json.load(processed_fp)

# ----------------------

In [None]:
# Answers Statistics
# ------------------

# Processed records: a list of dicts with an 'answer' key.
with open(processed_file, 'r') as processed_fp:
    anno = json.load(processed_fp)

new_data = []

# Samples per class id, including classes with no samples at all.
class_frequency = {class_id: 0 for class_id in range(len(labels_dict))}

for value in tqdm(anno):
    answer = value['answer']
    class_frequency[labels_dict[answer]] += 1

print(class_frequency)

# Samples per answer string; counted from the records loaded above so this
# cell does not depend on what `train_data` is bound to.
answer_freq = defaultdict(int)

for value in anno:
    answer_freq[value['answer']] += 1

# Number of distinct answers actually present in the data.
max_answers = len(answer_freq)

# ------------------

In [None]:
# Answer Data Reduction
# ---------------------

def select_top_answers_data(questions_list, answer_list, images_list, k):
    """Keep only the samples whose answer is among the `k` most frequent answers.

    Args:
        questions_list: Questions, aligned index-wise with the other two lists.
        answer_list: Answers (hashable values), one per sample.
        images_list: Image identifiers/paths, one per sample.
        k: Number of most frequent answers to keep.

    Returns:
        Tuple `(questions, answers, images, top_answers)`: the three filtered
        lists in their original order, and `top_answers` as a tuple ordered by
        decreasing frequency (ties keep first-seen order). With no answers, or
        `k == 0`, the result is `([], [], [], ())`.
    """
    answer_freq = defaultdict(int)
    for answer in answer_list:
        answer_freq[answer] += 1

    sorted_freq = sorted(answer_freq.items(), key=operator.itemgetter(1), reverse=True)[0:k]

    # `zip(*[])` cannot be unpacked into two names, so handle "nothing selected" explicitly.
    if not sorted_freq:
        return ([], [], [], ())

    top_answers, _top_freq = zip(*sorted_freq)

    # Set membership avoids a linear scan of the tuple for every sample.
    allowed_answers = set(top_answers)

    new_questions_list = []
    new_answer_list = []
    new_images_list = []

    for question, answer, image in zip(questions_list, answer_list, images_list):
        if answer in allowed_answers:
            new_questions_list.append(question)
            new_answer_list.append(answer)
            new_images_list.append(image)

    return (new_questions_list, new_answer_list, new_images_list, top_answers)

# ---------------------

# Pull the three parallel columns out of the training records.
train_questions = [record['question'] for record in train_data]
train_answers = [record['answer'] for record in train_data]
train_image_paths = [record['img_path'] for record in train_data]

# Keep the samples whose answer is among the `max_answers` most frequent ones.
questions_train, answer_train, images_train, top_answers = select_top_answers_data(
    train_questions, train_answers, train_image_paths, max_answers)

# ---------------------

In [None]:
# Feature Extraction Dimensions
# -----------------------------

if FEATURE_EXTRACTION == MODEL.VGG19.value:
    # Halve the configured image size. Floor division keeps the dimensions
    # integral, which image `target_size` and model input shapes require.
    # NOTE: re-running this cell halves the size again.
    IMG_W //= 2
    IMG_H //= 2
else:
    # Fixed input size used by the autoencoder branch.
    IMG_W = 648
    IMG_H = 324

# -----------------------------

In [None]:
# Dataframe Splitting
# -------------------
# Parameters:

# Dataframe Split
DF_SPLIT = 0.8

# -------------------

train_dir = images_directory

# Files directly inside the image directory; sorted because the directory
# listing order is not guaranteed, which would make the split non-reproducible.
image_filenames = sorted(next(os.walk(train_dir))[2])

data = list(image_filenames)

DATAFRAME = pd.DataFrame(data, columns=['filename'])

# Draw one uniform number per image from a generator seeded with SEED so the
# train/validation assignment is identical on every run of this cell.
probs = np.random.RandomState(SEED).rand(len(DATAFRAME))
training_mask = probs < DF_SPLIT
validation_mask = ~training_mask

DF_TRAIN = DATAFRAME[training_mask]
DF_VAL = DATAFRAME[validation_mask]

# -------------------

In [None]:
# Autoencoder Data Generator
# --------------------------

if FEATURE_EXTRACTION == MODEL.AUTOENCODER.value:
    # class_mode "input" makes the generator yield each image as its own target.
    CLASS_MODE = "input"
    CLASSES = None

    # Both generators only rescale pixel values to [0, 1].
    train_datagen = ImageDataGenerator(rescale=1./255)
    valid_datagen = ImageDataGenerator(rescale=1./255)

    # Options shared by the training and validation generators.
    # NOTE(review): AUTOENCODER_BATCH_SIZE is not defined in any cell visible
    # above this one — confirm where it is set.
    flow_options = dict(
        batch_size=AUTOENCODER_BATCH_SIZE,
        target_size=(IMG_H, IMG_W),
        class_mode=CLASS_MODE,
        classes=CLASSES,
        shuffle=True,
        seed=SEED,
    )

    # Training Data Generator
    train_gen = train_datagen.flow_from_dataframe(DF_TRAIN, train_dir, **flow_options)

    # Validation Data Generator
    val_gen = valid_datagen.flow_from_dataframe(DF_VAL, train_dir, **flow_options)

# --------------------------

In [None]:
# Autoencoder Model Architecture
# ------------------------------

if FEATURE_EXTRACTION == MODEL.AUTOENCODER.value:
    input_shape = [IMG_H, IMG_W, 3]

    # Encoder: four Conv(3x3) -> ReLU -> MaxPool(3x3) stages with a growing
    # filter count, then a 1024-unit dense bottleneck.
    encoder = tf.keras.Sequential()
    encoder.add(tf.keras.layers.Input(input_shape))

    for encoder_filters in (8, 16, 32, 64):
        encoder.add(tf.keras.layers.Conv2D(filters=encoder_filters, kernel_size=(3, 3), strides=(1, 1), padding="same"))
        encoder.add(tf.keras.layers.ReLU())
        encoder.add(tf.keras.layers.MaxPool2D(pool_size=(3, 3)))

    encoder.add(tf.keras.layers.Flatten())
    encoder.add(Dense(1024, activation='relu'))

    # Decoder: dense expansion reshaped to a 4x8x64 map, then Upsample(3x3) -> Conv
    # stages mirroring the encoder; the last stage emits 3 channels through a sigmoid.
    decoder = tf.keras.Sequential()
    decoder.add(tf.keras.layers.Input([1024]))
    decoder.add(Dense(2048, activation='relu'))
    decoder.add(tf.keras.layers.Reshape((4, 8, 64)))

    for decoder_filters in (32, 16, 8):
        decoder.add(tf.keras.layers.UpSampling2D(size=(3, 3), interpolation="bilinear"))
        decoder.add(tf.keras.layers.Conv2D(filters=decoder_filters, kernel_size=(3, 3), strides=(1, 1), padding="same"))
        decoder.add(tf.keras.layers.ReLU())

    decoder.add(tf.keras.layers.UpSampling2D(size=(3, 3), interpolation="bilinear"))
    decoder.add(tf.keras.layers.Conv2D(filters=3, kernel_size=(3, 3), strides=(1, 1), padding="same", activation=tf.keras.activations.sigmoid))

    # Autoencoder: encoder followed by decoder.
    autoencoder = tf.keras.Sequential()
    autoencoder.add(tf.keras.layers.Input(input_shape))
    autoencoder.add(encoder)
    autoencoder.add(decoder)

    autoencoder.summary()

# ------------------------------

In [None]:
# Autoencoder Model Optimization
# ------------------------------
# Parameters:

# Early Stopping (True, False)
EARLY_STOP = True

# Learning Rate 
LR = 1e-4

# Validation Metrics
METRICS = ['accuracy']

# ------------------------------

optimizer = tf.keras.optimizers.Adam(learning_rate=LR)

if FEATURE_EXTRACTION == MODEL.AUTOENCODER.value:
    autoencoder.compile(optimizer=optimizer, loss="mse", metrics=METRICS)

    # NOTE: this list rebinds the name `callbacks`, which the imports cell also
    # binds to the `tensorflow.keras.callbacks` module; the fitting cell reads
    # the list under this name, so the name is kept.
    callbacks = []

    # Honour the EARLY_STOP switch instead of always enabling early stopping.
    if EARLY_STOP:
        es_callback = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
        callbacks.append(es_callback)

    # Save the best full model to the same path the fitting cell loads from
    # (`autoencoder_checkpoint`), not to the bare checkpoint directory.
    # NOTE(review): `period` is a legacy argument alongside `save_freq`; kept
    # as in the original — confirm it is still wanted.
    cp_callback = tf.keras.callbacks.ModelCheckpoint(str(autoencoder_checkpoint), monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', save_freq='epoch', period=3)
    callbacks.append(cp_callback)

# ------------------------------

In [None]:
# Autoencoder Fitting
# -------------------
# Parameters:

# Number of Epochs
AE_EPOCHS = 200

# -------------------

if FEATURE_EXTRACTION == MODEL.AUTOENCODER.value:
    if TRAIN_AUTOENCODER:
        # `fit` accepts generators directly; the epoch count comes from the
        # AE_EPOCHS parameter defined at the top of this cell.
        autoencoder.fit(train_gen,
            epochs=AE_EPOCHS,
            validation_data=val_gen,
            callbacks=callbacks)
    else:
        # Reuse a previously trained autoencoder and recover its two halves.
        autoencoder = tf.keras.models.load_model(str(autoencoder_checkpoint))
        # The model was assembled as [encoder, decoder]; read the sub-models
        # through the public `layers` list rather than popping the private one.
        decoder = autoencoder.layers[-1]
        encoder = autoencoder.layers[-2]

# -------------------

In [None]:
# Encoder Summary
# ---------------

# `encoder` only exists when the autoencoder is the selected feature extractor.
if FEATURE_EXTRACTION == MODEL.AUTOENCODER.value:
    encoder.summary()

# ---------------

In [None]:
# Feature Extraction Function
# ---------------------------

def image_feature_extractor(target_path, image_list, batch_size, model, model_type):
    """Run `model` over the images in batches and save one feature file per image.

    Args:
        target_path: Directory the `.npy` feature files are written to.
        image_list: Image file names, resolved relative to `ROOT_DIR / 'images'`.
        batch_size: Number of images passed to `model.predict` at once.
        model: Model exposing `predict(batch)`.
        model_type: A `MODEL` member's `.value`; selects the input preprocessing
            and how the predicted features are reshaped.

    Each feature array is saved with `np.save` under the last six characters
    of the image name's stem.
    NOTE(review): the dataset loader derives the feature file name from the
    full image stem — confirm image ids never exceed six characters.
    """
    num_batches = int(np.ceil(len(image_list) / float(batch_size)))
    progbar = utils.Progbar(num_batches)
    is_autoencoder = model_type == MODEL.AUTOENCODER.value

    for batch_index, start in enumerate(range(0, len(image_list), batch_size)):
        # Slice by the `batch_size` argument (the original used an unrelated global).
        batch_paths = image_list[start:start + batch_size]

        batch_images = []
        batch_ids = []

        for image_path in batch_paths:
            img = image.load_img(str(ROOT_DIR / 'images' / image_path), target_size=(IMG_H, IMG_W))
            img = image.img_to_array(img)
            img = np.expand_dims(img, axis=0)

            if is_autoencoder:
                # Match the 1/255 rescaling the autoencoder generators apply.
                img = img / 255.0
            else:
                img = preprocess_input(img)

            batch_images.append(img)
            batch_ids.append(image_path.split('.')[0][-6:])

        batch_images = np.vstack(batch_images)

        features = model.predict(batch_images)

        if is_autoencoder:
            # (batch, code) -> (batch, 1, code): treat the code as a single region.
            features = np.expand_dims(features, axis=1)
        else:
            # (batch, h, w, channels) -> (batch, h*w, channels): one row per spatial location.
            features = np.reshape(features, (features.shape[0], -1, features.shape[3]))

        for feature_id, feat in zip(batch_ids, features):
            np.save(os.path.join(target_path, feature_id), feat)

        progbar.update(batch_index + 1)

# ---------------------------

In [None]:
# Feature Extraction
# -----------------
# Parameters:

# Force Reload Feature Extraction (True, False)
FORCE_RELOAD = False

# Batch Size
FE_BATCH = 100

# -----------------

if FEATURE_EXTRACTION == MODEL.AUTOENCODER.value:
    FE_MODEL = encoder
else:
    # The input shape must be integral; cast in case the sizes were produced by a division.
    FE_MODEL = VGG19(weights="imagenet", include_top=False, input_tensor=Input(shape=(int(IMG_H), int(IMG_W), 3)))

# -----------------

# Extract features only when forced or when the feature directory does not exist yet.
if FORCE_RELOAD or not os.path.exists(features_directory):

    image_list = os.listdir(images_directory)

    os.makedirs(features_directory, exist_ok=True)

    # The extractor needs the model type to pick preprocessing and feature reshaping.
    image_feature_extractor(features_directory, image_list, FE_BATCH, FE_MODEL, FEATURE_EXTRACTION)

# -----------------

In [None]:
# Text Processing Function
# ------------------------

# Lookup tables for `process_sentence`, built once instead of on every call.

# Removes a period unless it is followed by a digit.
# NOTE(review): the original pattern began with `(?!<=\d)`, a mistyped
# lookbehind that can never reject a match; the effective behaviour (only the
# "not followed by a digit" test) is preserved here.
_PERIOD_STRIP = re.compile(r"(\.)(?!\d)")
# Detects a comma between two digits, e.g. "1,000".
_COMMA_STRIP = re.compile(r"(\d)(\,)(\d)")
_PUNCTUATION = [';', r"/", '[', ']', '"', '{', '}',
                '(', ')', '=', '+', '\\', '_', '-',
                '>', '<', '@', '`', ',', '?', '!']
_CONTRACTIONS = {
    "aint": "ain't", "arent": "aren't", "cant": "can't", "couldve": "could've", "couldnt": "couldn't",
    "couldn'tve": "couldn't've", "couldnt've": "couldn't've", "didnt": "didn't", "doesnt": "doesn't", "dont": "don't", "hadnt": "hadn't",
    "hadnt've": "hadn't've", "hadn'tve": "hadn't've", "hasnt": "hasn't", "havent": "haven't", "hed": "he'd", "hed've": "he'd've",
    "he'dve": "he'd've", "hes": "he's", "howd": "how'd", "howll": "how'll", "hows": "how's", "Id've": "I'd've", "I'dve": "I'd've",
    "Im": "I'm", "Ive": "I've", "isnt": "isn't", "itd": "it'd", "itd've": "it'd've", "it'dve": "it'd've", "itll": "it'll", "let's": "let's",
    "maam": "ma'am", "mightnt": "mightn't", "mightnt've": "mightn't've", "mightn'tve": "mightn't've", "mightve": "might've",
    "mustnt": "mustn't", "mustve": "must've", "neednt": "needn't", "notve": "not've", "oclock": "o'clock", "oughtnt": "oughtn't",
    "ow's'at": "'ow's'at", "'ows'at": "'ow's'at", "'ow'sat": "'ow's'at", "shant": "shan't", "shed've": "she'd've", "she'dve": "she'd've",
    "she's": "she's", "shouldve": "should've", "shouldnt": "shouldn't", "shouldnt've": "shouldn't've", "shouldn'tve": "shouldn't've",
    "somebody'd": "somebodyd", "somebodyd've": "somebody'd've", "somebody'dve": "somebody'd've", "somebodyll": "somebody'll",
    "somebodys": "somebody's", "someoned": "someone'd", "someoned've": "someone'd've", "someone'dve": "someone'd've",
    "someonell": "someone'll", "someones": "someone's", "somethingd": "something'd", "somethingd've": "something'd've",
    "something'dve": "something'd've", "somethingll": "something'll", "thats": "that's", "thered": "there'd", "thered've": "there'd've",
    "there'dve": "there'd've", "therere": "there're", "theres": "there's", "theyd": "they'd", "theyd've": "they'd've",
    "they'dve": "they'd've", "theyll": "they'll", "theyre": "they're", "theyve": "they've", "twas": "'twas", "wasnt": "wasn't",
    "wed've": "we'd've", "we'dve": "we'd've", "weve": "we've", "werent": "weren't", "whatll": "what'll", "whatre": "what're",
    "whats": "what's", "whatve": "what've", "whens": "when's", "whered": "where'd", "wheres": "where's", "whereve": "where've",
    "whod": "who'd", "whod've": "who'd've", "who'dve": "who'd've", "wholl": "who'll", "whos": "who's", "whove": "who've", "whyll": "why'll",
    "whyre": "why're", "whys": "why's", "wont": "won't", "wouldve": "would've", "wouldnt": "wouldn't", "wouldnt've": "wouldn't've",
    "wouldn'tve": "wouldn't've", "yall": "y'all", "yall'll": "y'all'll", "y'allll": "y'all'll", "yall'd've": "y'all'd've",
    "y'alld've": "y'all'd've", "y'all'dve": "y'all'd've", "youd": "you'd", "youd've": "you'd've", "you'dve": "you'd've",
    "youll": "you'll", "youre": "you're", "youve": "you've"}


def process_sentence(sentence):
    """Normalise a sentence: strip punctuation, lowercase, restore contractions.

    Args:
        sentence: Raw text.

    Returns:
        The cleaned sentence as a single space-separated, lowercased string.

    Steps: newlines and tabs become spaces; each punctuation mark is deleted when
    it touches a space (or when the text contains a digit-comma-digit group)
    and replaced by a space otherwise; periods not followed by a digit are
    removed; words are lowercased and apostrophe-less contractions such as
    "dont" are mapped to "don't".
    """
    inText = sentence.replace('\n', ' ').replace('\t', ' ').strip()
    outText = inText

    # Loop-invariant: whether the text contains a number written like "1,000".
    has_comma_number = _COMMA_STRIP.search(inText) is not None

    for p in _PUNCTUATION:
        if (p + ' ' in inText or ' ' + p in inText) or has_comma_number:
            outText = outText.replace(p, '')
        else:
            outText = outText.replace(p, ' ')

    # Remove every qualifying period. The original passed `re.UNICODE` as the
    # third positional argument of `sub`, which is `count`, so at most 32
    # periods were ever removed.
    outText = _PERIOD_STRIP.sub("", outText)

    words = outText.lower().split()
    words = [_CONTRACTIONS.get(word, word) for word in words]
    return ' '.join(words)

# ------------------------

In [None]:
# Text Processing
# ---------------

# Clean every training question.
questions_train_processed = pd.Series(questions_train).apply(process_sentence)

# Fit the tokenizer on the cleaned questions (no extra character filtering)
# and convert each question to a sequence of word indices.
tok = text.Tokenizer(filters='')
tok.fit_on_texts(questions_train_processed)

question_data_train = tok.texts_to_sequences(questions_train_processed)

# Number of tokens per question, used to choose the padding length.
question_len = [len(token_ids) for token_ids in question_data_train]

plt.figure(figsize=(7,5))
sns.distplot(question_len, color='red')
plt.title('Distribution of Question length')
plt.xlabel('Length of Question')
plt.ylabel('Question count')
plt.xlim(0, 30)
plt.show()

# Deciles first, then every percentile from 90 to 100.
for percentile in range(0, 101, 10):
    print(percentile, 'percentile value is', np.percentile(question_len, percentile))

for percentile in range(90, 101):
    print(percentile, 'percentile value is',np.percentile(question_len, percentile))

# Fixed sequence length: questions are zero-padded at the end up to this length.
MAX_LEN = 21

question_data_train = sequence.pad_sequences(
    question_data_train,
    maxlen=MAX_LEN,
    padding='post',
)

# ---------------

In [None]:
# Answers Processing
# ------------------

def get_answers_matrix(answers, encoder):
    """One-hot encode `answers` using an already fitted label encoder.

    Args:
        answers: Answer labels known to `encoder`.
        encoder: Fitted encoder exposing `transform` and `classes_`.

    Returns:
        Matrix with one row per answer and one column per encoder class.
    """
    class_ids = encoder.transform(answers)
    num_classes = len(encoder.classes_)
    return utils.to_categorical(class_ids, num_classes)

# ------------------

# Fit the label encoder on the training answers.
labelencoder = preprocessing.LabelEncoder()
labelencoder = labelencoder.fit(answer_train)

# ------------------

In [None]:
# Annotations Splitting
# ---------------------

# Single stratified 75/25 split on the answers.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)

# With n_splits=1 the splitter yields exactly one (train, validation) index pair.
train_index, val_index = next(sss.split(images_train, answer_train))
TRAIN_INDEX = train_index
VAL_INDEX = val_index

# Image paths
images_array = np.array(images_train)
image_list_tr = images_array[TRAIN_INDEX.astype(int)]
image_list_vl = images_array[VAL_INDEX.astype(int)]

# Padded question sequences
question_tr = question_data_train[TRAIN_INDEX]
question_vl = question_data_train[VAL_INDEX]

# One-hot answers
answer_matrix = get_answers_matrix(answer_train, labelencoder)
answer_tr = answer_matrix[TRAIN_INDEX]
answer_vl = answer_matrix[VAL_INDEX]

# ---------------------

In [None]:
# Attention Maps
# --------------

class AttentionMaps(tf.keras.layers.Layer):
  """Co-attention maps between image features and question features.

  Builds an affinity matrix between the two inputs and uses it to mix each
  modality's projected features into the other's, returning one hidden map
  per modality.

  Shape notation used in the comments below (assumed from the batched
  matmul/transposes — TODO confirm against the caller):
  b = batch, N = image regions, T = question positions, k = dim_k.

  Args:
    dim_k: Output width of the two internal projection layers.
    reg_value: L2 regularisation factor for the projection kernels.
  """
  def __init__(self, dim_k, reg_value, **kwargs):
    super(AttentionMaps, self).__init__(**kwargs)

    self.dim_k = dim_k
    self.reg_value = reg_value

    # Linear projections of image (Wv) and question (Wq) features to dim_k;
    # fixed initializer seeds keep weight initialisation reproducible.
    self.Wv = Dense(self.dim_k, activation=None,\
                        kernel_regularizer=tf.keras.regularizers.l2(self.reg_value),\
                            kernel_initializer=tf.keras.initializers.glorot_uniform(seed=2))
    self.Wq = Dense(self.dim_k, activation=None,\
                        kernel_regularizer=tf.keras.regularizers.l2(self.reg_value),\
                            kernel_initializer=tf.keras.initializers.glorot_uniform(seed=3))

  def call(self, image_feat, ques_feat):
    # Affinity matrix C = tanh(Q V^T); no extra weight matrix is applied in this layer.
    C = tf.matmul(ques_feat, tf.transpose(image_feat, perm=[0,2,1])) # [b, T, N]
    # Squash affinities to (-1, 1)
    C = tf.keras.activations.tanh(C) 

    # (Wv)V
    WvV = self.Wv(image_feat)                             # [b, N, k]
    # (Wq)Q
    WqQ = self.Wq(ques_feat)                              # [b, T, k]

    # ((Wq)Q)C : question projection carried over to image regions
    WqQ_C = tf.matmul(tf.transpose(WqQ, perm=[0,2,1]), C) # [b, k, N]
    WqQ_C = tf.transpose(WqQ_C, perm =[0,2,1])            # [b, N, k]

    # ((Wv)V)CT : image projection carried over to question positions; [b, k, T]
    WvV_C = tf.matmul(tf.transpose(WvV, perm=[0,2,1]), tf.transpose(C, perm=[0,2,1]))  
                        
    WvV_C = tf.transpose(WvV_C, perm =[0,2,1])            # [b, T, k]

    #---------------image attention map------------------
    # Hv = tanh((Wv)V + ((Wq)Q)C) ; H_v shape [b, N, k]

    H_v = WvV + WqQ_C                                     # (Wv)V + ((Wq)Q)C
    H_v = tf.keras.activations.tanh(H_v)                  # tanh((Wv)V + ((Wq)Q)C) 

    #---------------question attention map---------------
    # Hq = tanh((Wq)Q + ((Wv)V)CT) ; H_q shape [b, T, k]

    H_q = WqQ + WvV_C                                     # (Wq)Q + ((Wv)V)CT
    H_q = tf.keras.activations.tanh(H_q)                  # tanh((Wq)Q + ((Wv)V)CT) 
        
    return [H_v, H_q]                                     # [b, N, k], [b, T, k]
  
  def get_config(self):
    # Expose the constructor arguments so the layer can be rebuilt via from_config.
    config = {
        'dim_k': self.dim_k,
        'reg_value': self.reg_value
    }
    base_config = super(AttentionMaps, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

# --------------

# Config round-trip check: build a layer, dump its config, rebuild from it.
layer = AttentionMaps(64, 0.001)
config = layer.get_config()
print(config)
new_layer = AttentionMaps.from_config(config)

# --------------

In [None]:
# Context Vector
# --------------

class ContextVector(tf.keras.layers.Layer):
  """Attention-weighted context vectors for the image and the question.

  Scores every image region / question position from its attention map,
  normalises the scores with a softmax across regions / positions, and
  returns the weighted sums of the corresponding features.

  Args:
    reg_value: L2 regularisation factor for the two scoring kernels.
  """
  def __init__(self, reg_value, **kwargs):
    super(ContextVector, self).__init__(**kwargs)

    self.reg_value = reg_value

    # Linear scoring heads producing one score per region / position. The
    # softmax is applied in call() across axis 1: a 'softmax' activation on a
    # Dense(1) layer normalises over the size-1 last axis and therefore always
    # outputs 1.0, which turns the attention into a plain sum.
    self.w_hv = Dense(1, activation=None,\
                        kernel_regularizer=tf.keras.regularizers.l2(self.reg_value),\
                            kernel_initializer=tf.keras.initializers.glorot_uniform(seed=4))
    self.w_hq = Dense(1, activation=None,\
                        kernel_regularizer=tf.keras.regularizers.l2(self.reg_value),\
                            kernel_initializer=tf.keras.initializers.glorot_uniform(seed=5))

  def call(self, image_feat, ques_feat, H_v, H_q):
    # Attention probabilities over image regions: a_v = softmax(wT_hv * H_v)
    a_v = tf.nn.softmax(self.w_hv(H_v), axis=1)        # [b, regions, 1]

    # Attention probabilities over question positions: a_q = softmax(wT_hq * H_q)
    a_q = tf.nn.softmax(self.w_hq(H_q), axis=1)        # [b, positions, 1]

    # Context vector for the image: attention-weighted sum over regions
    v = a_v * image_feat                               # [b, regions, dim_d]
    v = tf.reduce_sum(v, 1)                            # [b, dim_d]

    # Context vector for the question: attention-weighted sum over positions
    q = a_q * ques_feat                                # [b, positions, dim_d]
    q = tf.reduce_sum(q, 1)                            # [b, dim_d]

    return [v, q]

  def get_config(self):
    # Expose the constructor argument so the layer can be rebuilt via from_config.
    config = {
        'reg_value': self.reg_value
    }
    base_config = super(ContextVector, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

# --------------

# Config round-trip check: build a layer, dump its config, rebuild from it.
layer = ContextVector(0.001)
config = layer.get_config()
print(config)
new_layer = ContextVector.from_config(config)

# --------------

In [None]:
# Phrase Level Features
# ---------------------

class PhraseLevelFeatures(tf.keras.layers.Layer):
  """Phrase-level question features from unigram, bigram and trigram convolutions.

  Applies three 1-D convolutions (kernel sizes 1, 2 and 3) to the word-level
  features and keeps, for every position and channel, the maximum of the three
  responses.

  Args:
    dim_d: Number of filters (output channels) of each convolution.
  """
  def __init__(self, dim_d, **kwargs):
    super(PhraseLevelFeatures, self).__init__(**kwargs)
    
    self.dim_d = dim_d
    
    # One convolution per n-gram size; the bigram and trigram ones use 'same'
    # padding. Fixed initializer seeds keep initialisation reproducible.
    self.conv_unigram = Conv1D(self.dim_d, kernel_size=1, strides=1,\
                            kernel_initializer=tf.keras.initializers.glorot_uniform(seed=6)) 
    self.conv_bigram =  Conv1D(self.dim_d, kernel_size=2, strides=1, padding='same',\
                            kernel_initializer=tf.keras.initializers.glorot_uniform(seed=7)) 
    self.conv_trigram = Conv1D(self.dim_d, kernel_size=3, strides=1, padding='same',\
                            kernel_initializer=tf.keras.initializers.glorot_uniform(seed=8)) 


  def call(self, word_feat):
    # T below = question sequence length (assumed from the caller — TODO confirm)

    # phrase level unigram features
    x_uni = self.conv_unigram(word_feat)                    # [b, T, dim_d]

    # phrase level bigram features
    x_bi  = self.conv_bigram(word_feat)                     # [b, T, dim_d]

    # phrase level trigram features
    x_tri = self.conv_trigram(word_feat)                    # [b, T, dim_d]

    # Stack the three n-gram responses along a new last axis
    x = tf.concat([tf.expand_dims(x_uni, -1),\
                    tf.expand_dims(x_bi, -1),\
                    tf.expand_dims(x_tri, -1)], -1)         # [b, T, dim_d, 3]

    # https://stackoverflow.com/a/36853403
    # Max-pool across n-gram features; over-all phrase level feature
    x = tf.reduce_max(x, -1)                                # [b, T, dim_d]

    return x

  def get_config(self):
    # Expose the constructor argument so the layer can be rebuilt via from_config.
    config = {
        'dim_d': self.dim_d
    }
    base_config = super(PhraseLevelFeatures, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

# ---------------------

# Config round-trip check: build a layer, dump its config, rebuild from it.
layer = PhraseLevelFeatures(32)
config = layer.get_config()
print(config)
new_layer = PhraseLevelFeatures.from_config(config)

# ---------------------

In [None]:
# Model Architecture Function
# ---------------------------

def build_model(max_answers, max_seq_len, vocab_size, dim_d, dim_k, l_rate, d_rate, reg_value):
    """Build the hierarchical (word / phrase / sentence) co-attention VQA model.

    Args:
        max_answers: Number of answer classes (width of the softmax output).
        max_seq_len: Length of the padded question sequences.
        vocab_size: Size of the question vocabulary (embedding input dimension).
        dim_d: Width of the shared feature space for image and question features.
        dim_k: Width of the attention-map projections.
        l_rate: Unused here; kept so existing callers' positional arguments still line up.
        d_rate: Dropout rate.
        reg_value: L2 regularisation factor for dense and attention kernels.

    Returns:
        An uncompiled Keras `Model` taking `[image_features, question_tokens]`
        and producing a probability distribution over answers.
    """
    # Inputs: image features are fixed at one region of 1024 values; the question
    # length follows the `max_seq_len` argument rather than a global constant.
    image_input = Input(shape=(1, 1024, ), name='Image_Input')
    ques_input = Input(shape=(max_seq_len, ), name='Question_Input')

    # Image features projected into the shared dim_d space
    image_feat = Dense(dim_d, activation=None, name='Image_Feat_Dense',\
                            kernel_regularizer=tf.keras.regularizers.l2(reg_value),\
                                kernel_initializer=tf.keras.initializers.glorot_uniform(seed=1))(image_input)
    image_feat = Dropout(d_rate, seed=1)(image_feat)

    # ---- word level ----
    ques_feat_w = Embedding(input_dim=vocab_size, output_dim=dim_d, input_length=max_seq_len,\
                            mask_zero=True)(ques_input)

    Hv_w, Hq_w = AttentionMaps(dim_k, reg_value, name='AttentionMaps_Word')(image_feat, ques_feat_w)
    v_w, q_w = ContextVector(reg_value, name='ContextVector_Word')(image_feat, ques_feat_w, Hv_w, Hq_w)
    feat_w = tf.add(v_w,q_w)
    h_w = Dense(dim_d, activation='tanh', name='h_w_Dense',\
                    kernel_regularizer=tf.keras.regularizers.l2(reg_value),\
                        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=13))(feat_w)

    # ---- phrase level ----
    ques_feat_p = PhraseLevelFeatures(dim_d, name='PhraseLevelFeatures')(ques_feat_w)

    Hv_p, Hq_p = AttentionMaps(dim_k, reg_value, name='AttentionMaps_Phrase')(image_feat, ques_feat_p)
    v_p, q_p = ContextVector(reg_value, name='ContextVector_Phrase')(image_feat, ques_feat_p, Hv_p, Hq_p)
    # Fuse the phrase-level context with the word-level representation
    feat_p = concatenate([tf.add(v_p,q_p), h_w], -1)
    h_p = Dense(dim_d, activation='tanh', name='h_p_Dense',\
                    kernel_regularizer=tf.keras.regularizers.l2(reg_value),\
                        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=14))(feat_p)

    # ---- sentence level ----
    ques_feat_s = LSTM(dim_d, return_sequences=True, input_shape=(None, max_seq_len, dim_d),\
                        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=16))(ques_feat_p)

    Hv_s, Hq_s = AttentionMaps(dim_k, reg_value, name='AttentionMaps_Sent')(image_feat, ques_feat_s)
    # The sentence-level attention must weight the sentence-level features
    # (`ques_feat_s`); the phrase-level features were passed here before.
    v_s, q_s = ContextVector(reg_value, name='ContextVector_Sent')(image_feat, ques_feat_s, Hv_s, Hq_s)
    # Fuse the sentence-level context with the phrase-level representation
    feat_s = concatenate([tf.add(v_s,q_s), h_p], -1)
    h_s = Dense(2*dim_d, activation='tanh', name='h_s_Dense',\
                    kernel_regularizer=tf.keras.regularizers.l2(reg_value),\
                        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=15))(feat_s)

    z   = Dense(2*dim_d, activation='tanh', name='z_Dense',\
                    kernel_regularizer=tf.keras.regularizers.l2(reg_value),\
                        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=16))(h_s)
    z   = Dropout(d_rate, seed=16)(z)

    # Answer distribution
    result = Dense(max_answers, activation='softmax')(z)

    model = Model(inputs=[image_input, ques_input], outputs=result)

    return model

# ---------------------------

In [None]:
# Dataset Creation
# ----------------
# Parameters:

# Batch Size
BATCH_SIZE = 300

# Buffer Size
BUFFER_SIZE = 5000

# ----------------

def map_func(img_name, ques, ans):
    """Load the stored feature array that corresponds to an image path.

    The feature path is derived from the image path by substituting every
    occurrence of 'images' with 'features' and of 'png' with 'npy'.

    Args:
        img_name: UTF-8 encoded image path (bytes).
        ques: question item, passed through untouched.
        ans: answer item, passed through untouched.

    Returns:
        Tuple (array loaded with np.load, ques, ans).
    """
    feature_file = (
        img_name.decode("utf-8")
        .replace('images', 'features')
        .replace('png', 'npy')
    )
    return np.load(feature_file), ques, ans

# ----------------

def _make_dataset(images, questions, answers, shuffle):
    """Build a batched, prefetched tf.data pipeline over (image, question, answer).

    Args:
        images: image paths, one per sample (resolved to feature files by map_func).
        questions: encoded questions, aligned with `images`.
        answers: encoded answers, aligned with `images`.
        shuffle: whether to shuffle the samples (training only).

    Returns:
        A tf.data.Dataset yielding (features, question, answer) batches of
        BATCH_SIZE elements.
    """
    dataset = tf.data.Dataset.from_tensor_slices((images, questions, answers))

    if shuffle:
        # Shuffle the lightweight (path, question, answer) tuples *before* the
        # features are loaded, so the shuffle buffer does not have to keep
        # BUFFER_SIZE feature arrays in memory.
        dataset = dataset.shuffle(BUFFER_SIZE)

    dataset = dataset.map(
        lambda img_name, ques, ans: tf.numpy_function(
            map_func, [img_name, ques, ans], [tf.float32, tf.int32, tf.float32]),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset = dataset.batch(BATCH_SIZE)
    return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Training data is reshuffled on every pass; validation order does not affect
# the aggregated metrics, so it is left unshuffled.
dataset_tr = _make_dataset(image_list_tr, question_tr, answer_tr, shuffle=True)
dataset_vl = _make_dataset(image_list_vl, question_vl, answer_vl, shuffle=False)

# ----------------

In [None]:
# Model Architecture
# ------------------
# Parameters:

# Upper bound (exclusive) of the epoch range used by the training loop
EPOCHS      = 50

# `max_answers` (size of the softmax output layer) is defined in an earlier
# cell and is used as-is; re-assigning it to itself was a no-op.
max_seq_len = MAX_LEN
vocab_size  = len(tok.word_index) + 1

dim_d       = 512    # units of the Dense / LSTM layers inside build_model
dim_k       = 256    # first argument of the AttentionMaps layers
l_rate      = 1e-4   # initial learning rate of the optimizer schedule
d_rate      = 0.5    # dropout rate applied before the output layer
reg_value   = 0.01   # L2 factor of the kernel regularizers

# ------------------

# Root directory under which the checkpoint folder is created
base_path = temp_directory

model = build_model(max_answers, max_seq_len, vocab_size, dim_d, dim_k, l_rate, d_rate, reg_value)

model.summary()

# ------------------

In [None]:
# Model Parameters
# ----------------

# A checkpoint is written every SAVE_CKPT_FREQ epochs by the training loop.
SAVE_CKPT_FREQ = 5
steps_per_epoch = int(np.ceil(len(image_list_tr) / BATCH_SIZE))

# Piecewise-constant schedule: l_rate up to the boundary step, l_rate / 10 after it.
# NOTE(review): the boundary is placed after 50 epochs' worth of steps; while
# EPOCHS is also 50 the reduced rate is never reached in a single run - confirm
# this is intended.
boundaries = [50 * steps_per_epoch]
values = [l_rate, l_rate / 10]

learning_rate_fn = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries, values)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate_fn)

loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=False, reduction='auto')

checkpoint_directory = base_path / ('checkpoint_'+str(l_rate)+"_"+str(dim_k))

# exist_ok avoids the check-then-create race, and makedirs also creates any
# missing parent directory.
os.makedirs(checkpoint_directory, exist_ok=True)

# `step` counts completed epochs and is stored together with model and optimizer.
ckpt = tf.train.Checkpoint(step=tf.Variable(0), optimizer=optimizer, model=model)
manager = tf.train.CheckpointManager(ckpt, checkpoint_directory, max_to_keep=3)

# Running means of the per-batch losses; reset at the end of every epoch.
train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
val_loss = tf.keras.metrics.Mean('val_loss', dtype=tf.float32)

# Micro-averaged F1 over all answer classes (previously instantiated twice).
train_score = F1Score(num_classes=max_answers, average='micro', name='train_score')
val_score = F1Score(num_classes=max_answers, average='micro', name='val_score')

train_summary_writer = tf.summary.create_file_writer(train_log_dir)
val_summary_writer = tf.summary.create_file_writer(val_log_dir)

# ----------------

In [None]:
# Train and Test Functions
# ------------------------

# @tf.function
def train_step(model, img, ques, ans, optimizer):
  """Run one optimisation step on a single batch.

  The optimised objective is the cross-entropy from `loss_object` plus the
  regularization penalties collected in `model.losses`. In a hand-written
  training loop those penalties are not applied automatically, so the
  `kernel_regularizer` settings of the layers had no effect before.

  The `train_loss` metric keeps tracking the data loss only, so it stays
  comparable with `val_loss`.

  Args:
    model: Keras model called as model([img, ques], training=True).
    img: batch of image features.
    ques: batch of encoded questions.
    ans: batch of target answers, compared against the model output.
    optimizer: optimizer used to apply the gradients.

  Returns:
    List of (gradient, variable) pairs for the model's trainable variables.
  """
  with tf.GradientTape() as tape:
    predictions = model([img, ques], training=True)
    loss = loss_object(ans, predictions)
    # model.losses is populated by the forward pass above; sum([]) is 0, so
    # models without regularizers are unaffected.
    total_loss = loss + sum(model.losses)

  variables = model.trainable_variables
  grads = tape.gradient(total_loss, variables)
  optimizer.apply_gradients(zip(grads, variables))

  train_loss(loss)
  train_score(ans, predictions)

  return list(zip(grads, variables))

def test_step(model, img, ques, ans):
  """Evaluate one validation batch and update the validation metrics.

  Args:
    model: Keras model called as model([img, ques], training=False).
    img: batch of image features.
    ques: batch of encoded questions.
    ans: batch of target answers, compared against the model output.
  """
  # Inference mode is requested explicitly (dropout disabled), mirroring the
  # explicit training=True used by train_step.
  predictions = model([img, ques], training=False)
  loss = loss_object(ans, predictions)

  val_loss(loss)
  val_score(ans, predictions)

# ------------------------

# Resume from the most recent checkpoint if there is one, otherwise start at epoch 0.
if manager.latest_checkpoint:
    ckpt.restore(manager.latest_checkpoint)
    print("Restored from {}".format(manager.latest_checkpoint))
    # ckpt.step is incremented once per epoch and saved with the checkpoint,
    # so the restored value is the number of completed epochs. Reading it
    # avoids parsing the checkpoint file name and stays correct even if
    # SAVE_CKPT_FREQ is changed between runs.
    START_EPOCH = int(ckpt.step)
    print("Resume training from epoch: {}".format(START_EPOCH))
else:
    print("Initializing from scratch")
    START_EPOCH = 0

# ------------------------

In [None]:
# Model Fitting
# -------------

if TRAIN_ATTENTION:
    for epoch in range(START_EPOCH, EPOCHS):

        start = time.time()

        # (gradient, variable) pairs of the last training batch of this epoch.
        # Initialised here so an empty dataset cannot leave the name undefined
        # or pointing at values from a previous epoch / notebook run.
        grads = []
        for img, ques, ans in tqdm(dataset_tr):
            grads = train_step(model, img, ques, ans, optimizer)

        with train_summary_writer.as_default():
            tf.summary.scalar('loss', train_loss.result(), step=epoch)
            tf.summary.scalar('f1_score', train_score.result(), step=epoch)

            for var in model.trainable_variables:
                tf.summary.histogram(var.name, var, step=epoch)
            for grad, var in grads:
                # tape.gradient returns None for variables that did not
                # contribute to the loss; there is nothing to log for them.
                if grad is not None:
                    tf.summary.histogram(var.name + '/gradient', grad, step=epoch)

        for img, ques, ans in tqdm(dataset_vl):
            test_step(model, img, ques, ans)

        with val_summary_writer.as_default():
            tf.summary.scalar('loss', val_loss.result(), step=epoch)
            tf.summary.scalar('f1_score', val_score.result(), step=epoch)

        template = 'Epoch {}, loss: {:.4f}, f1_score: {:.4f}, val loss: {:.4f}, val f1_score: {:.4f}, time: {:.0f} sec'
        print(template.format(epoch + 1,
                              train_loss.result(),
                              train_score.result(),
                              val_loss.result(),
                              val_score.result(),
                              (time.time() - start)))

        # Metrics accumulate across calls, so they are cleared once per epoch.
        train_loss.reset_states()
        train_score.reset_states()
        val_loss.reset_states()
        val_score.reset_states()

        # ckpt.step counts completed epochs; save every SAVE_CKPT_FREQ of them.
        ckpt.step.assign_add(1)
        if int(ckpt.step) % SAVE_CKPT_FREQ == 0:
            manager.save()
            print('Saved checkpoint.')

# -------------

In [None]:
# Prediction Auxiliary Functions
# ------------------------------

def predict(image_feature,processed_question):
    """Return the decoded answer label(s) predicted for the given inputs.

    The model output is reduced with an argmax over axis 1 and the resulting
    class indices are mapped back through `labelencoder.inverse_transform`.

    Args:
        image_feature: image features fed to the model as first input.
        processed_question: encoded question fed to the model as second input.

    Returns:
        The result of `labelencoder.inverse_transform` on the predicted indices.
    """
    scores = model([image_feature, processed_question])
    class_ids = tf.argmax(scores, axis=1, output_type=tf.int32)
    return labelencoder.inverse_transform(class_ids)

def preprocess_data(image_id,question):
    """Prepare one (image, question) pair for prediction.

    Args:
        image_id: identifier of the image; its features are read from
            ROOT_DIR / 'features' / '<image_id>.npy'.
        question: raw question text, cleaned with `process_sentence`.

    Returns:
        Tuple (features, processed_question): the feature array with a leading
        axis of size 1 added, and the question encoded with `tok` and
        post-padded to MAX_LEN.
    """
    features = np.load(str(ROOT_DIR / 'features' / (image_id+'.npy')))
    features = np.expand_dims(features, axis=0)

    # The question must be encoded with the vocabulary the model was trained
    # on. The tokenizer is deliberately NOT re-fitted here: fit_on_texts
    # updates its word counts and index, so calling it on test questions could
    # change the word -> id mapping the trained embedding relies on.
    processed_question = process_sentence(question)
    processed_question = tok.texts_to_sequences([processed_question])
    processed_question = sequence.pad_sequences(processed_question, maxlen=MAX_LEN, padding='post')

    return features, processed_question

def create_csv(results, results_dir='./'):
    """Write the predictions to a timestamped CSV file.

    The file is named 'results_<Mon><day>_<HH-MM-SS>.csv' and contains an
    'Id,Category' header followed by one row per entry of `results`.

    Args:
        results: mapping from sample id to predicted category.
        results_dir: directory in which the file is created.

    Returns:
        Path of the written CSV file.
    """
    # Imported locally under a private alias: the notebook's import cell binds
    # the name `datetime` to the *module*, on which `.now()` does not exist.
    from datetime import datetime as _datetime

    csv_fname = 'results_' + _datetime.now().strftime('%b%d_%H-%M-%S') + '.csv'
    csv_path = os.path.join(results_dir, csv_fname)

    with open(csv_path, 'w', newline='') as f:
        # csv.writer quotes values containing separators; '\n' keeps the line
        # endings of the previous hand-written output.
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['Id', 'Category'])
        # str(value) keeps the earlier rendering (e.g. None is written as 'None').
        writer.writerows((key, str(value)) for key, value in results.items())

    return csv_path

# ------------------------------

In [None]:
# Model Evaluation
# ----------------

# Read the test questions; the context manager closes the file handle, which
# json.load(open(...)) left open.
with open(ROOT_DIR / 'test_questions.json', 'r') as annotations_file:
    test_annotations = json.load(annotations_file)
tf.get_logger().setLevel('ERROR')

# Maps each test question id to its predicted category.
results = {}

if TEST_PREDICTION:
    for question_id, value in tqdm(test_annotations.items()):
        image_features, processed_question = preprocess_data(value['image_id'],value['question'])
        answer = predict(image_features,processed_question)
        # predict returns one decoded label per input row; a single row is fed here.
        results[question_id] = labels_dict.get(answer[0])

    create_csv(results, str(ROOT_DIR))

# ----------------