In [36]:
import json
import tqdm
import time
from operator import itemgetter
import random
import matplotlib.pyplot as plt
import os
import cv2
import tensorflow as tf
import seaborn as sns
import numpy as np
import re
import joblib
import pandas as pd
from collections import defaultdict
from prettytable import PrettyTable
from wordcloud import WordCloud, STOPWORDS

In [37]:
from sklearn import preprocessing
from sklearn.model_selection import StratifiedShuffleSplit

In [38]:
from tensorflow.keras.applications import VGG19
from tensorflow.keras.layers import Input, Dense, Conv1D, Dropout, Embedding, concatenate, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras import utils
from tensorflow.keras.preprocessing import text, image, sequence
from tensorflow.keras.applications.vgg19 import preprocess_input

from tensorflow_addons.metrics import F1Score

In [43]:
print('Using tf version: {}'.format(tf.__version__))
print('Using numpy version: {}'.format(np.__version__))

Using tf version: 2.5.0
Using numpy version: 1.19.2


In [13]:
# folder of data
DATA_FOLDER = '../content'
# folder of processed data
PROCESSED_DATA_FOLDER = '../content/processed_data'
# folder of dataset evaluation
EVALUATION_DATA_FOLDER = '../content/evaluation'
# path image format
IMAGE_FORMAT = '../content/%s/COCO_%s_%012d.jpg'

# Preparing the data matrices

In [14]:
# loading from disk
with open(f'{PROCESSED_DATA_FOLDER}/vqa_raw_trainVal2014_top1000.json', 'rb') as f:
    clQuestionTrainVal, clAnswerTrainVal, clAnswersTrainVal, clPathImageTrainVal = joblib.load(f)

In [15]:
# load from disk : question tokenization class
with open(f'{DATA_FOLDER}/question_tokenizer.pkl', 'rb') as f:
    oQuestionTok = joblib.load(f)

In [16]:
# load from disk : Tokenized questions
with open(f'{DATA_FOLDER}/tokenised_question_paddingPost_train_val.pkl', mode='rb') as f:
    caQuestionTrainValTokenized = joblib.load(f)

In [17]:
# load from disk: Encode target answers with value between 0 and n_classes-1
with open(f'{DATA_FOLDER}/AnswerEncoderTrainVal.pkl', 'rb') as f:
    oAnswerEncoder = joblib.load(f)

In [18]:
"""
    Function name: GetAnswersMatrix
    
    Objective: One-hot-encode the answers
    
    Summary algorithmic description: Transform answers to normalized encoding.
                                     Convert the normalized encoding to one-hot-coding
    
    Input parameters: clAnswer : list of answers
                      cEncoder : LabelEncoder class
    
    Return : binary class matrix
    
    Date : 09/11/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
def GetAnswersMatrix(clAnswer, cEncoder):
    # Transform labels to normalized encoding.
    y = cEncoder.transform(clAnswer) 
    # Number of classes 
    iNbClasses = cEncoder.classes_.shape[0]
    # Convert a class vector (integers) to binary class matrix.
    Y = utils.to_categorical(y, iNbClasses)
    # Return the binary class matrix.
    return Y

In [19]:
# One-hot-encode the train answers 
caAnswerMatrix = GetAnswersMatrix(clAnswerTrainVal, oAnswerEncoder)

In [20]:
# Shape of answer matrix
caAnswerMatrix.shape

(574913, 1000)

# Model

In [21]:
# Resets all state generated by Keras
tf.keras.backend.clear_session()

In [22]:
"""
    Class name: AttentionMaps 
    
    Objective: Compute the image (or question) attention
    
    Summary algorithmic description: Calculate the affinity matrix
                                     Predict image and question attention maps
    
    Input parameters: iDim_k : hidden attention dimention
                      fRegValue : Regularization value
    
    Date : 14/11/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
class AttentionMaps(tf.keras.layers.Layer):
    """
        Function name: __init__

        Objective: Define custom layer attributes

        Summary algorithmic description: Define two attributes hidden attention dimention and regularization value
                                         Define the weight parameters by layer Dense

        Input parameters: iDim_k : hidden attention dimention
                          fRegValue : Regularization value

        Return : None

        Date : 14/11/2021

        Coding: INSA CVL - Van Tuan BUI  
    """
    def __init__(self, iDim_k, fRegValue, **kwargs):
        # Default layer attributes
        super(AttentionMaps, self).__init__(**kwargs)

        # Get the value of hidden attention dimention
        self.iDim_k = iDim_k
        # Get the regularization value 
        self.fRegValue = fRegValue

        # Dense layer with Xavier uniform initializer for image weight parameters 
        self.Wv = Dense(self.iDim_k, activation=None,\
                        kernel_regularizer=tf.keras.regularizers.l2(self.fRegValue),\
                        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=2))
        # Dense layer with Xavier uniform initializer for question weight parameters
        self.Wq = Dense(self.iDim_k, activation=None,\
                        kernel_regularizer=tf.keras.regularizers.l2(self.fRegValue),\
                        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=3))
        
        
    """
        Function name: call

        Objective: Perform the logic of applying the layer to the input tensors

        Summary algorithmic description: Calculate the affinity matrix
                                         Predict image and question attention maps

        Input parameters: caImageFeat : image feature (b, N,  d) 
                          caQuestionFeat : question feature (b, T,  d)

        Return : image attention map and question attention map

        Date : 14/11/2021

        Coding: INSA CVL - Van Tuan BUI  
    """
    def call(self, caImageFeat, caQuestionFeat):
        # Affinity Matrix C = tanh((QT)(Wb)V) (b, T, N)
        C = tf.matmul(caQuestionFeat, tf.transpose(caImageFeat, perm=[0,2,1])) 
        C = tf.keras.activations.tanh(C) 

        # (Wv)V (b, N, k)
        Wv_V = self.Wv(caImageFeat)                             
        # (Wq)Q (b, T, k)
        Wq_Q = self.Wq(caQuestionFeat)                              

        # ((Wq)Q)C (b, k, N)
        Wq_Q_C = tf.matmul(tf.transpose(Wq_Q, perm=[0,2,1]), C) 
        # ((Wq)Q)C (b, N, k)
        Wq_Q_C = tf.transpose(Wq_Q_C, perm =[0,2,1])       

        # ((Wv)V)CT (b, k, T)                                         
        Wv_V_CT = tf.matmul(tf.transpose(Wv_V, perm=[0,2,1]), tf.transpose(C, perm=[0,2,1])) 
        # ((Wv)V)CT (b, T, k) 
        Wv_V_CT = tf.transpose(Wv_V_CT, perm =[0,2,1])         

        # image attention map Hv = tanh((Wv)V + ((Wq)Q)C) (b, N, k)
        Hv = Wv_V + Wq_Q_C                                     
        Hv = tf.keras.activations.tanh(Hv)              

        # question attention map Hq = tanh((Wq)Q + ((Wv)V)CT) (b, T, k)
        Hq = Wq_Q + Wv_V_CT                                     
        Hq = tf.keras.activations.tanh(Hq)                   

        # Return image attention map and question attention map
        return [Hv, Hq]                                     
  

    """
        Function name: get_config

        Objective: Collect the input shape and other information about the layer.

        Summary algorithmic description: Return a dictionary containing the configuration used to initialize this layer

        Input parameters: None

        Return : a dictionary containing informations about the layer.

        Date : 14/11/2021

        Coding: INSA CVL - Van Tuan BUI  
    """
    def get_config(self):
        # Dictionnary of two attributes of this layer
        cdConfig = {
            'iDim_k': self.iDim_k,
            'fRegValue': self.fRegValue
        }
        # Dictionnary of base configurations
        cdBasaConfig = super(AttentionMaps, self).get_config()
        # Return a dictionary containing informations about the layer.
        return dict(list(cdBasaConfig.items()) + list(cdConfig.items()))

In [23]:
cLayerAttentionMaps = AttentionMaps(64, 0.001)
cdConfig = cLayerAttentionMaps.get_config()
print(cdConfig)
cNewLayerAttentionMaps = AttentionMaps.from_config(cdConfig)

{'name': 'attention_maps', 'trainable': True, 'dtype': 'float32', 'iDim_k': 64, 'fRegValue': 0.001}


In [24]:
"""
    Class name: ContextVector 
    
    Objective: Find context vector of the image and text features
    
    Summary algorithmic description: 
    
    Input parameters: iDim_k : hidden attention dimention
                      fRegValue : Regularization value
    
    Date : 14/11/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
class ContextVector(tf.keras.layers.Layer):
    """
        Function name: __init__

        Objective: Define custom layer attributes

        Summary algorithmic description: Define attribute regularization value
                                         Define the weight parameters by layer Dense

        Input parameters: fRegValue : Regularization value

        Return : None

        Date : 14/11/2021

        Coding: INSA CVL - Van Tuan BUI  
    """
    def __init__(self, fRegValue, **kwargs):
        # Default layer attributes
        super(ContextVector, self).__init__(**kwargs)

        # Get the regularization value 
        self.fRegValue = fRegValue
        # Image weight parameters (Dense layer with Xavier uniform initializer)
        self.whv = Dense(1, activation='softmax',\
                            kernel_regularizer=tf.keras.regularizers.l2(self.fRegValue),\
                            kernel_initializer=tf.keras.initializers.glorot_uniform(seed=4))
        # Image weight parameters (Dense layer with Xavier uniform initializer)
        self.whq = Dense(1, activation='softmax',\
                            kernel_regularizer=tf.keras.regularizers.l2(self.fRegValue),\
                            kernel_initializer=tf.keras.initializers.glorot_uniform(seed=5)) 
    

    """
        Function name: call

        Objective: Perform the logic of applying the layer to the input tensors

        Summary algorithmic description: Find context vector of the image and text features

        Input parameters: caImageFeat : image feature (b, N,  d) 
                          caQuestionFeat : question feature (b, T,  d)
                          Hv : image attention map (b, N, k)
                          Hq : question attention map (b, T, k)

        Return : context vector of the image and text features

        Date : 14/11/2021

        Coding: INSA CVL - Van Tuan BUI  
    """
    def call(self, caImageFeat, caQuestionFeat, Hv, Hq):
        # attention probabilities of each image region vn
        # av = softmax(wT_hv * H_v) (b, N, 1)
        av = self.whv(Hv)                               

        # attention probabilities of each word qt      
        # aq = softmax(wT_hq * H_q) (b, T, 1)
        aq = self.whq(Hq)                              

        # context vector for image (b, N, d)
        v = av * caImageFeat 
        # context vector for image (b, d)
        v = tf.reduce_sum(v, 1)                            

        # context vector for question (b, T, d)
        q = aq * caQuestionFeat                               
        # context vector for question (b, d)
        q = tf.reduce_sum(q, 1)            
        
        # Return context vector of the image and text features
        return [v, q]

    
    """
        Function name: get_config

        Objective: Collect the input shape and other information about the layer.

        Summary algorithmic description: Return a dictionary containing the configuration used to initialize this layer

        Input parameters: None

        Return : a dictionary containing informations about the layer.

        Date : 14/11/2021

        Coding: INSA CVL - Van Tuan BUI  
    """
    def get_config(self):
        # Dictionnary of attributes of this layer
        cdConfig = {
            'fRegValue': self.fRegValue
        }
        # Dictionnary of base configurations
        cdBaseConfig = super(ContextVector, self).get_config()
        # Return a dictionary containing informations about the layer
        return dict(list(cdBaseConfig.items()) + list(cdConfig.items()))

In [25]:
cLayerContextVector = ContextVector(0.001)
cdConfig = cLayerContextVector.get_config()
print(cdConfig)
cNewLayerContextVector = ContextVector.from_config(cdConfig)

{'name': 'context_vector', 'trainable': True, 'dtype': 'float32', 'fRegValue': 0.001}


In [26]:
"""
    Class name: PhraseLevelFeatures 
    
    Objective: Compute the phrase features by applying 1-D convolution on the word embedding vectors
    
    Summary algorithmic description: Applying 1-D convolution with filters of three window sizes: unigram, bigram and trigram
                                     Max-pooling across different n-grams at each word location
    
    Input parameters: iDim_d : hidden dimension
    
    Date : 14/11/2021
    
    Coding: INSA CVL - Van Tuan BUI  
"""
class PhraseLevelFeatures(tf.keras.layers.Layer):
    """
        Function name: __init__

        Objective: Define custom layer attributes

        Summary algorithmic description: Define attribute hidden dimension
                                         Define the weight parameters by 1D convolution layer

        Input parameters: iDim_d : hidden dimension

        Return : None

        Date : 14/11/2021

        Coding: INSA CVL - Van Tuan BUI  
    """
    def __init__(self, iDim_d, **kwargs):
        # Default layer attributes
        super(PhraseLevelFeatures, self).__init__(**kwargs)

        # Get the hidden dimension
        self.iDim_d = iDim_d
        # Convolution with filter unigram
        self.conv_unigram = Conv1D(self.iDim_d, kernel_size=1, strides=1,\
                                    kernel_initializer=tf.keras.initializers.glorot_uniform(seed=6)) 
        # Convolution with filter bigram
        self.conv_bigram =  Conv1D(self.iDim_d, kernel_size=2, strides=1, padding='same',\
                                    kernel_initializer=tf.keras.initializers.glorot_uniform(seed=7)) 
        # Convolution with filter trigram
        self.conv_trigram = Conv1D(self.iDim_d, kernel_size=3, strides=1, padding='same',\
                                kernel_initializer=tf.keras.initializers.glorot_uniform(seed=8)) 

        
    """
        Function name: call

        Objective: Perform the logic of applying the layer to the input tensors

        Summary algorithmic description: Compute the n-gram phrase embeddings (n = 1,2,3)

        Input parameters: caWordFeat : word feature (b, T, d) 

        Return : Phrase level features of the question (b, T, d)

        Date : 14/11/2021

        Coding: INSA CVL - Van Tuan BUI  
    """
    def call(self, caWordFeat):
        # phrase level unigram features (b, T, d)
        qpUnigram = self.conv_unigram(caWordFeat)                    

        # phrase level bigram features (b, T, d)
        qpBigram  = self.conv_bigram(caWordFeat)                     

        # phrase level trigram features (b, T, d)
        qpTrigram = self.conv_trigram(caWordFeat)             

        # phrase level features (b, T, d, 3)
        qp = tf.concat([tf.expand_dims(qpUnigram, -1),\
                        tf.expand_dims(qpBigram, -1),\
                        tf.expand_dims(qpTrigram, -1)], -1)         

        # Max-pool across n-gram features, over-all phrase level feature (b, T, d)
        qp = tf.reduce_max(qp, -1)                                

        # Return phrase level features
        return qp

    """
        Function name: get_config

        Objective: Collect the input shape and other information about the layer.

        Summary algorithmic description: Return a dictionary containing the configuration used to initialize this layer

        Input parameters: None

        Return : a dictionary containing informations about the layer.

        Date : 14/11/2021

        Coding: INSA CVL - Van Tuan BUI  
    """
    def get_config(self):
        # Dictionnary of attributes of this layer
        cdConfig = {
            'iDim_d': self.iDim_d
        }
        # Dictionnary of base configurations
        cdBaseConfig = super(PhraseLevelFeatures, self).get_config()
        # Return a dictionary containing informations about the layer
        return dict(list(cdBaseConfig.items()) + list(cdConfig.items()))

In [27]:
cLayerPhraseLevelFeatures = PhraseLevelFeatures(32)
cdConfig = cLayerPhraseLevelFeatures.get_config()
print(cdConfig)
cNewLayerPhraseLevelFeatures = PhraseLevelFeatures.from_config(cdConfig)

{'name': 'phrase_level_features', 'trainable': True, 'dtype': 'float32', 'iDim_d': 32}


# Architecture

In [28]:
"""
    Function name: BuildModel

    Objective: build the training model

    Summary algorithmic description: Calculate co-attended image and question features from all three levels
                                     Use a multi-layer perceptron (MLP) to recursively encode the attention features
    
    Input parameters: iMaxAnswers : Number of output targets of the model
                      iMaxSequenceLength : Maximum length of input sequences
                      iVocabSize : Size of the vocabulary
                      iDim_d : Hidden dimension
                      iDim_k : Hidden attention dimension
                      fLearningRate : Learning rate for the model
                      fDropoutRate : Dropout rate
                      fRegValue : Regularization value

    Return : The training model

    Date : 14/11/2021

    Coding: INSA CVL - Van Tuan BUI  
"""
def BuildModel(iMaxAnswers, iMaxSequenceLength, iVocabSize, iDim_d, iDim_k, fLearningRate, fDropoutRate, fRegValue):
    # inputs 
    caImageInput = Input(shape=(49, 2048, ), name='Image_Input')
    caQuestionInput = Input(shape=(22, ), name='Question_Input')

    # image feature (Wb)V (b, N, d)
    caImageFeature = Dense(iDim_d, activation=None, name='Image_Feat_Dense',\
                        kernel_regularizer=tf.keras.regularizers.l2(fRegValue),\
                        kernel_initializer=tf.keras.initializers.glorot_uniform(seed=1))(caImageInput)
    # image feature (b, N, d)
    caImageFeature = Dropout(fDropoutRate, seed=1)(caImageFeature)

    # word embedding feature
    caWordFeature = Embedding(input_dim=iVocabSize, output_dim=iDim_d, input_length=iMaxSequenceLength,\
                            mask_zero=True)(caQuestionInput)
    # image and question attention maps at word level
    Hvw, Hqw = AttentionMaps(iDim_k, fRegValue, name='AttentionMaps_Word')(caImageFeature, caWordFeature)
    # context vector of the image and text features at word level
    vw, qw = ContextVector(fRegValue, name='ContextVector_Word')(caImageFeature, caWordFeature, Hvw, Hqw)
    # attention features at word level
    caWordAttentionFeature = tf.add(vw, qw)
    # co-attended image and question features from word level
    hw = Dense(iDim_d, activation='tanh', name='hw_Dense',\
                kernel_regularizer=tf.keras.regularizers.l2(fRegValue),\
                kernel_initializer=tf.keras.initializers.glorot_uniform(seed=13))(caWordAttentionFeature)

    # phrase level features
    caPhraseFeature = PhraseLevelFeatures(iDim_d, name='PhraseLevelFeatures')(caWordFeature)
    # image and question attention maps at phrase level
    Hvp, Hqp = AttentionMaps(iDim_k, fRegValue, name='AttentionMaps_Phrase')(caImageFeature, caPhraseFeature)
    # context vector of the image and text features at phrase level
    vp, qp = ContextVector(fRegValue, name='ContextVector_Phrase')(caImageFeature, caPhraseFeature, Hvp, Hqp)
    # attention features at phrase level
    caPhraseAttentionFeature = concatenate([tf.add(vp, qp), hw], -1) 
    # co-attended image and question features from phrase level
    hp = Dense(iDim_d, activation='tanh', name='hp_Dense',\
                kernel_regularizer=tf.keras.regularizers.l2(fRegValue),\
                kernel_initializer=tf.keras.initializers.glorot_uniform(seed=14))(caPhraseAttentionFeature)

    # sentence level
    caSentenceFeature = LSTM(iDim_d, return_sequences=True, input_shape=(None, iMaxSequenceLength, iDim_d),\
                            kernel_initializer=tf.keras.initializers.glorot_uniform(seed=16))(caPhraseFeature)
    # image and question attention maps at sentence level
    Hvs, Hqs = AttentionMaps(iDim_k, fRegValue, name='AttentionMaps_Sent')(caImageFeature, caSentenceFeature)
    # context vector of the image and text features at sentence level
    vs, qs = ContextVector(fRegValue, name='ContextVector_Sent')(caImageFeature, caSentenceFeature, Hvs, Hqs)
    # attention features at sentence level
    caSentenceAttentionFeature = concatenate([tf.add(vs, qs), hp], -1) 
    # co-attended image and question features from sentence level
    hs = Dense(2 * iDim_d, activation='tanh', name='hs_Dense',\
                kernel_regularizer=tf.keras.regularizers.l2(fRegValue),\
                kernel_initializer=tf.keras.initializers.glorot_uniform(seed=15))(caSentenceAttentionFeature)

    # encode the attention features
    z = Dense(2 * iDim_d, activation='tanh', name='z_Dense',\
                kernel_regularizer=tf.keras.regularizers.l2(fRegValue),\
                kernel_initializer=tf.keras.initializers.glorot_uniform(seed=16))(hs)
    # encode the attention features
    z = Dropout(fDropoutRate, seed=16)(z)

    # probability of the final answer
    result = Dense(iMaxAnswers, activation='softmax')(z)
    
    # group layers
    model = Model(inputs=[caImageInput, caQuestionInput], outputs=result)

    # Return the model
    return model

# Create tf.Dataset

In [29]:
# batch size
BATCH_SIZE = 300
# buffer siez
BUFFER_SIZE = 5000

In [30]:
"""
    Function name: GetImageFeatures

    Objective: Load the image features from numpy files

    Summary algorithmic description: Load the image features from numpy files

    Input parameters: sPathImage : path image
                      caQuestion : Tokenization of question
                      caAnswer : answer one-hot-enconding 

    Return : caImageTensor : image features
             caQuestion : Tokenization of question
             caAnswer : answer one-hot-enconding 

    Date : 15/11/2021

    Coding: INSA CVL - Van Tuan BUI  
"""
def GetImageFeatures(sPathImage, caQuestion, caAnswer):
    # Load the image features from numpy files
    caImageTensor = np.load(f'{DATA_FOLDER}/featuresResNet50/' + sPathImage.decode('utf-8').split('.')[2][-6:] + '.npy')
    # Return image features, tokenization of question and answer one-hot-enconding 
    return caImageTensor, caQuestion, caAnswer

In [31]:
# Create a Dataset whose elements are slices along their first dimension
oTrainDataset = tf.data.Dataset.from_tensor_slices((clPathImageTrainVal, caQuestionTrainValTokenized, caAnswerMatrix))

# Use map to load the numpy files in parallel
oTrainDataset = oTrainDataset.map(lambda sPathImage, caQuestion, caAnswer: tf.numpy_function(
    GetImageFeatures, [sPathImage, caQuestion, caAnswer], [tf.float32, tf.int32, tf.float32]),
    num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Shuffle and batch
oTrainDataset = oTrainDataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
# allow later elements to be prepared while the current element is being processed.
oTrainDataset = oTrainDataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Training

In [32]:
# params 1
MAX_ANSWERS = 1000
MAX_SEQUENCE_LENGTH = 22
VOCAB_SIZE = len(oQuestionTok.word_index) + 1
EPOCHS      = 200
DIM_D = 512
DIM_K = 256
LEARNING_RATE = 1e-4
DROPOUT_RATE = 0.5
REG_VALUE = 0.01

BASE_PATH = f'{DATA_FOLDER}/temps'

In [33]:
VOCAB_SIZE

17984

In [60]:
# create model
oModel = BuildModel(MAX_ANSWERS, MAX_SEQUENCE_LENGTH, VOCAB_SIZE, DIM_D, DIM_K, LEARNING_RATE, DROPOUT_RATE, REG_VALUE)

In [61]:
oModel.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Image_Input (InputLayer)        [(None, 49, 2048)]   0                                            
__________________________________________________________________________________________________
Image_Feat_Dense (Dense)        (None, 49, 512)      1049088     Image_Input[0][0]                
__________________________________________________________________________________________________
Question_Input (InputLayer)     [(None, 22)]         0                                            
__________________________________________________________________________________________________
dropout (Dropout)               (None, 49, 512)      0           Image_Feat_Dense[0][0]           
______________________________________________________________________________________________

In [62]:
# number of steps per epoch
iStepsPerEpoch = int(np.ceil(len(clPathImageTrainVal)/BATCH_SIZE))
# interval boundaries for changed learning rates.
clBoundary = [50 * iStepsPerEpoch]
# the learning rate values for the intervals defined by boundaries.
clLearningRateValue = [LEARNING_RATE, LEARNING_RATE / 10]

In [63]:
# we reduce the learning rate after 50th epoch (from 1e-4 to 1e-5)
learningRateSchedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(clBoundary, clLearningRateValue)
# Optimizer that implements the Adam algorithm
oOptimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# the crossentropy loss between the labels and predictions
oLoss = tf.keras.losses.CategoricalCrossentropy(from_logits=False, reduction='auto')

In [64]:
# checkpoint directory
sCheckpointDirectory = BASE_PATH + "/training_checkpoints/" + str(LEARNING_RATE) + "_" + str(DIM_K)
# save checkpoint every SAVE_CKPT_FREQ step
SAVE_CKPT_FREQ = 5

In [65]:
# Create a Checkpoint that will manage three objects with trackable state
oCheckpoint = tf.train.Checkpoint(step=tf.Variable(0), optimizer=oOptimizer, model=oModel)
# keep only 3 newest checkpoints 
oCheckPointManager = tf.train.CheckpointManager(oCheckpoint, sCheckpointDirectory, max_to_keep=3)



In [66]:
# Compute the (weighted) mean of the given values
oTrainLoss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)

In [67]:
# Compute F-1 Score
oTrainScore = F1Score(num_classes=MAX_ANSWERS, average='micro', name='train_score')

In [68]:
# training log directory
sTrainLogDir = BASE_PATH + '/logs/' + str(LEARNING_RATE) + "_" + str(DIM_K) + '/train'

# Create a summary file writer for the training log directory
oTrainSummaryWriter = tf.summary.create_file_writer(sTrainLogDir)

In [69]:
"""
    Function name: TrainStep

    Objective: Operate a traning step

    Summary algorithmic description: Make a forward pass
                                     Make a backward pass
                                     Record results : loss mean, F1 score

    Input parameters: oModel : training model
                      tsImageFeature : image feature tensor
                      tsQuestionFeature : question feature tensor
                      tsAnswerLabel : answer label tensor
                      oOptimizer : Optimizer object

    Return : clGradient_ : all gradients against trainable variables

    Date : 15/11/2021

    Coding: INSA CVL - Van Tuan BUI  
"""
def TrainStep(oModel, tsImageFeature, tsQuestionFeature, tsAnswerLabel, oOptimizer):
    # Record operations for automatic differentiation
    with tf.GradientTape() as oGradientTape:
        # forward pass
        # résultat de la prédiction
        tsPrediction = oModel([tsImageFeature, tsQuestionFeature], training=True)
        # the crossentropy loss
        fLoss = oLoss(tsAnswerLabel, tsPrediction)

    # backward pass
    # Compute the gradient of the loss
    clGradient = oGradientTape.gradient(fLoss, oModel.trainable_variables)
    # Apply gradients to variables
    oOptimizer.apply_gradients(zip(clGradient, oModel.trainable_variables))

    # record results
    # Loss Mean
    oTrainLoss(fLoss)
    # F-1 Score
    oTrainScore(tsAnswerLabel, tsPrediction)

    # all gradients
    clGradient_ = list(zip(clGradient, oModel.trainable_variables))
    # Return all gradients
    return clGradient_

In [70]:
# If the prefix of the most recent checkpoint exist
if oCheckPointManager.latest_checkpoint:
    # Restore the latest checkpoint
    oCheckpoint.restore(oCheckPointManager.latest_checkpoint)
    # Display the prefix of the most recent checkpoint
    print("Restored from {}".format(oCheckPointManager.latest_checkpoint))
    # latest training epoch
    START_EPOCH = int(oCheckPointManager.latest_checkpoint.split('-')[-1]) * SAVE_CKPT_FREQ
    print("Resume training from epoch: {}".format(START_EPOCH))
# If the prefix of the most recent checkpoint doesn't exist
else:
    print("Initializing from scratch")
    # Train from scratch
    START_EPOCH = 0

Restored from ../content/temps/training_checkpoints/0.0001_256\ckpt-9
Resume training from epoch: 45


In [71]:
# Loop over all epochs
for iEpoch in range(START_EPOCH, EPOCHS):
    # time in seconds since the epoch as a floating point number
    fStartTime = time.time()

    # Loop over all elements in training dataset
    for tsImageFeature, tsQuestionFeature, tsAnswerLabel in (oTrainDataset):
        # Make a training step
        clGradient = TrainStep(oModel, tsImageFeature, tsQuestionFeature, tsAnswerLabel, oOptimizer)

    # tensorboard 
    # set default writer
    with oTrainSummaryWriter.as_default():
        # Write the crossentropy loss for later analysis in TensorBoard
        tf.summary.scalar('loss', oTrainLoss.result(), step=iEpoch)
        # Write the F1 score for later analysis in TensorBoard
        tf.summary.scalar('f1_score', oTrainScore.result(), step=iEpoch)
        # Create summaries to visualize weights
        # Loop over all trainable variables
        for tsVariable in oModel.trainable_variables:
            # Writes a weights histogram for later analysis in TensorBoard
            tf.summary.histogram(tsVariable.name, tsVariable, step=iEpoch)
        # Summarize all gradients
        # Loop over all gradients
        for fGradient, tsVariable in clGradient:
            # Writes a gradient histogram for later analysis in TensorBoard
            tf.summary.histogram(tsVariable.name + '/gradient', fGradient, step=iEpoch)

    sTemplateEpoch = 'Epoch {}, loss: {:.4f}, f1_score: {:.4f}, time: {:.0f} sec'
    print (sTemplateEpoch.format(iEpoch + 1,
                         oTrainLoss.result(), 
                         oTrainScore.result(),
                         (time.time() - fStartTime)))

    # Reset metric state variables every epoch
    oTrainLoss.reset_states()
    oTrainScore.reset_states()

    # save checkpoint every SAVE_CKPT_FREQ step
    # Add 1 to step
    oCheckpoint.step.assign_add(1)
    # After SAVE_CKPT_FREQ step
    if int(oCheckpoint.step) % SAVE_CKPT_FREQ == 0:
        # Save a checkpoint
        oCheckPointManager.save()
        print('Saved checkpoint.')

Epoch 46, loss: 1.4928, f1_score: 0.4776, time: 6570 sec
Epoch 47, loss: 1.4874, f1_score: 0.4788, time: 6801 sec
Epoch 48, loss: 1.4843, f1_score: 0.4801, time: 6977 sec
Epoch 49, loss: 1.4779, f1_score: 0.4815, time: 7089 sec
Epoch 50, loss: 1.4768, f1_score: 0.4819, time: 7178 sec
Saved checkpoint.
Epoch 51, loss: 1.4718, f1_score: 0.4837, time: 7616 sec
Epoch 52, loss: 1.4704, f1_score: 0.4843, time: 7684 sec
Epoch 53, loss: 1.4686, f1_score: 0.4841, time: 7830 sec
Epoch 54, loss: 1.4607, f1_score: 0.4872, time: 8081 sec
Epoch 55, loss: 1.4601, f1_score: 0.4883, time: 8134 sec
Saved checkpoint.
Epoch 56, loss: 1.4508, f1_score: 0.4890, time: 8352 sec
Epoch 57, loss: 1.4534, f1_score: 0.4893, time: 8613 sec
Epoch 58, loss: 1.4463, f1_score: 0.4905, time: 8884 sec
Epoch 59, loss: 1.4422, f1_score: 0.4913, time: 8915 sec
Epoch 60, loss: 1.4394, f1_score: 0.4926, time: 9281 sec
Saved checkpoint.
Epoch 61, loss: 1.4370, f1_score: 0.4943, time: 9248 sec
Epoch 62, loss: 1.4354, f1_score: 

KeyboardInterrupt: 

In [72]:
# Save the trained model
oModel.save(f'{DATA_FOLDER}/TrainVal_CoAttention_Resnet50_model.h5')



In [6]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [7]:
%tensorboard --logdir '../content/temps/logs/0.0001_256'