<h1 style="padding-top: 25px;padding-bottom: 25px;text-align: left; padding-left: 10px; background-color: #DDDDDD; 
    color: black;"> <img style="float: left; padding-right: 10px; width: 45px" src="https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/iacs.png"> AC295: Advanced Practical Data Science </h1>

## Practicum 2: Visual Question Answering

**Harvard University, Fall 2020**  
**Instructors**: Pavlos Protopapas  

### **Team: $\alpha\beta normal$ $Distri\beta ution$**
#### **Roht Beri, Eduardo Peynetti, Jessica Wijaya, Stuart Neilson**

## Basic Model for Training

### Install Packages

In [1]:
!pip3 install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/9c/34/fb092588df61bf33f113ade030d1cbe74fb73a0353648f8dd938a223dce7/transformers-3.5.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 14.0MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 52.2MB/s 
Collecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 65.4MB/s 
[?25hCollecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB

### Imports

In [6]:
import os
import requests
import tempfile
import zipfile
import shutil
import json
import time
import sys
import cv2
import numpy as np
import pandas as pd
from collections import Counter
from glob import glob
from google.colab import drive
from tqdm.notebook import trange, tqdm
import subprocess
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.python.keras import backend as Kb
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import layers
from tensorflow.keras import activations
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import initializers
from tensorflow.keras import regularizers
from tensorflow.keras.utils import to_categorical
from keras.utils.layer_utils import count_params
from tensorflow_addons.metrics import F1Score

from transformers import BertTokenizer, TFBertModel

### Download Data

In [3]:
"""
!mkdir 'data'
#shutil.rmtree('/content/data/train2014_tf')
#shutil.rmtree('/content/data/val2014_tf')
!cp -r '/content/drive/My Drive/Practicum2Data/big/val2014_tf' /content/data
!cp -r '/content/drive/My Drive/Practicum2Data/big/train2014_tf' /content/data
!cp -r "/content/drive/My Drive/Practicum2Data/big/answers.csv" /content/data
"""
if not os.path.exists('/content/data'):
    os.mkdir('/content/data')

!gsutil cp -r gs://practicum2-abnormal-distribution /content/data

# https://storage.googleapis.com/practicum2-abnormal-distribution/train2014_tf
# https://storage.googleapis.com/practicum2-abnormal-distribution/val2014_tf

In [4]:
#drive.mount('/content/drive', force_remount=False)
if not os.path.exists('/content/data'):
    os.mkdir('/content/data')

#!cp -r '/content/drive/My Drive/Practicum2Data/big' /content/data

Mounted at /content/drive


### Utils

In [7]:
# we use the following to save the models
class JsonEncoder(json.JSONEncoder):
  def default(self, obj):
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, decimal.Decimal):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    else:
        return super(JsonEncoder, self).default(obj)

# save_model saves everything. weights, statuses and results. 
def save_model(model,training_results,execution_time, learning_rate, epochs, optimizer, evaluation_results,path="models"):
  model_name=model.name
  # Get the model train history
  model_train_history = training_results.history

  # Ensure path exists
  if not os.path.exists(path):
      os.mkdir(path)

  # Save the enitire model (structure + weights)
  model.save(os.path.join(path,model_name+".hdf5"))

  # Save only the weights
  model.save_weights(os.path.join(path,model_name+".h5"))

  # Save the structure only
  model_json = model.to_json()
  with open(os.path.join(path,model_name+".json"), "w") as json_file:
      json_file.write(model_json)
    
  model_size = get_model_size(model_name=model_name)

  # Save model history
  with open(os.path.join("models",model_name+"_train_history.json"), "w") as json_file:
      json_file.write(json.dumps(model_train_history,cls=JsonEncoder))

  trainable_parameters = count_params(model.trainable_weights)
  non_trainable_parameters = count_params(model.non_trainable_weights)
  total_params = trainable_parameters + non_trainable_parameters

  # Save model metrics
  metrics ={
      "total_params":total_params,
      "execution_time":execution_time,
      "loss":evaluation_results[0],
      "accuracy":evaluation_results[1],
      "model_size":model_size,
      "learning_rate":learning_rate,
      "epochs":epochs,
      "optimizer":type(optimizer).__name__,
      "name": model_name,
      "id": int(time.time())
  }

  with open(os.path.join("models",model.name+"_metrics.json"), "w") as json_file:
      json_file.write(json.dumps(metrics,cls=JsonEncoder))

def get_model_size(path="models",model_name="model01"):
  model_size = os.stat(os.path.join(path,model_name+".hdf5")).st_size
  return model_size

def evaluate_model(model,test_data, training_results,execution_time, learning_rate, epochs, 
                   optimizer,save=True, 
                   loss_metrics=["loss","val_loss"],
                   acc_metrics=["accuracy","val_accuracy"]):
    
  # Get the model train history
  model_train_history = training_results.history
  # Get the number of epochs the training was run for
  num_epochs = len(model_train_history[loss_metrics[0]])

  # Plot training results
  fig = plt.figure(figsize=(15,5))
  axs = fig.add_subplot(1,2,1)
  axs.set_title('Loss')
  # Plot all metrics
  for metric in loss_metrics:
      axs.plot(np.arange(0, num_epochs), model_train_history[metric], label=metric)
  axs.legend()
  
  axs = fig.add_subplot(1,2,2)
  axs.set_title('Accuracy')
  # Plot all metrics
  for metric in acc_metrics:
      axs.plot(np.arange(0, num_epochs), model_train_history[metric], label=metric)
  axs.legend()

  plt.show()
  
  # Evaluate on test data
  evaluation_results = model.evaluate(test_data, return_dict=True)
  print(evaluation_results)

  evaluation_results = [evaluation_results[loss_metrics[0]], evaluation_results[acc_metrics[0]]]
  
  if save:
      # Save model
      save_model(model,training_results,execution_time, learning_rate, epochs, optimizer, evaluation_results)
  
  return evaluation_results

In [8]:
# Constants
IMG_WIDTH = 224
IMG_HEIGHT = 224
IMG_CHANNELS = 3

K = 10

AUTOTUNE = tf.data.experimental.AUTOTUNE

# Pipeline variables
batch_size = 256
train_buffer_size = 100
val_buffer_size = 20
prefetch = AUTOTUNE

In [9]:
# Get Top K answers
def get_top_K_answers(k):
    answers = pd.read_csv("/content/data/big/answers.csv", index_col=0)
    answers = answers.iloc[:k]
    return answers

In [10]:
# Function to parse data features
def _parse_features_function(example):
    # Parse the input tf.train.Example proto using the dictionary above.
    tf_records_features = {
        'image_raw': tf.io.FixedLenFeature([], tf.string), 
        'question' : tf.io.FixedLenFeature([], tf.string),
        'input_ids': tf.io.FixedLenFeature([], tf.string),
        'token_type_ids': tf.io.FixedLenFeature([], tf.string),
        'attention_mask': tf.io.FixedLenFeature([], tf.string), 
        'answer': tf.io.FixedLenFeature([], tf.string)
    }
    return tf.io.parse_single_example(example, tf_records_features)


# Filter if answer is no
def filter_fn(x):
    #use broadcasting for element-wise tensor operation
    broadcast_equal = tf.equal(TOP_ANSWERS, x['answer'])
    broadcast_equal_int = tf.cast(broadcast_equal, tf.int8)
    broadcast_sum = tf.reduce_sum(broadcast_equal_int)
    return broadcast_sum > 0


# Read image and resize it
def read_and_decode(img):
    img = tf.image.decode_jpeg(img, channels=IMG_CHANNELS)
    img = tf.cast(img, tf.float32)/255.0
    return img


# Structure the data for training
def structure_data(data):
    image = data['image_raw']
    image = read_and_decode(image)
    
    input_ids = tf.io.decode_raw(data['input_ids'], tf.int32)
    attention_mask = tf.io.decode_raw(data['attention_mask'], tf.int32)
    token_type_ids = tf.io.decode_raw(data['token_type_ids'], tf.int32)
    
    answer = tf.io.decode_raw(data['answer'], tf.int32)

    return ((image, (input_ids, token_type_ids, attention_mask)), answer)

### Build Pipeline

In [11]:
# ############## #
# # Train data # #
# ############## #
tfrecords_pattern_path = "/content/data/big/train2014_tf/vaq_raw_train2014_*-of-*.records"
train_files = tf.io.matching_files(tfrecords_pattern_path)
train_files = tf.random.shuffle(train_files)
train_shards = tf.data.Dataset.from_tensor_slices(train_files)

train = train_shards.interleave(tf.data.TFRecordDataset)
train = train.map(_parse_features_function, num_parallel_calls=AUTOTUNE)
#train = train.filter(filter_fn)
train = train.map(structure_data, num_parallel_calls=AUTOTUNE)
train = train.shuffle(buffer_size=train_buffer_size).batch(batch_size)
train = train.cache().prefetch(prefetch)

# ################### #
# # Validation data # #
# ################### #
tfrecords_pattern_path = "/content/data/big/val2014_tf/vaq_raw_val2014_*-of-*.records"
val_files = tf.io.matching_files(tfrecords_pattern_path)
val_files = tf.random.shuffle(val_files)
val_shards = tf.data.Dataset.from_tensor_slices(val_files)

valid = val_shards.interleave(tf.data.TFRecordDataset)

valid = valid.map(_parse_features_function, num_parallel_calls=AUTOTUNE)
#valid = valid.filter(filter_fn)
valid = valid.map(structure_data, num_parallel_calls=AUTOTUNE)
valid = valid.shuffle(buffer_size=val_buffer_size).batch(batch_size)
valid = valid.cache().prefetch(prefetch)

### Get Top Answers

In [12]:
top_answers = get_top_K_answers(K)
#top_answers = tf.constant(top_answers)

### Build Model

In [13]:
def build_vqa_model(image_height, image_width, num_channels, num_classes):
    # Handle to pretrained model (Use a different model here)
    input_shape=[image_height, image_width, num_channels]
    resnet = keras.applications.Xception(
        include_top=False, 
        weights='imagenet', 
        input_shape=input_shape
    )
    resnet.trainable = False
    image_hidden_states = layers.Flatten()(resnet.output)

    input_ids = layers.Input(shape=(24,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(24,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(24,), dtype=tf.int32)
    bert = TFBertModel.from_pretrained('bert-base-uncased')
    bert.trainable = False
    question = bert(
        input_ids, 
        token_type_ids=token_type_ids, 
        attention_mask=attention_mask
    )
    last_hidden_states = layers.Flatten()(question[0])

    x = layers.concatenate([image_hidden_states, last_hidden_states])

    output = layers.Dense(units=num_classes)(x)

    model = Model(inputs=[resnet.input, (input_ids, token_type_ids, attention_mask)], outputs=output)

    return model

In [14]:
Kb.clear_session()

model = build_vqa_model(224, 224, 3, K)

# Optimizer
learning_rate = 0.001
optimizer = optimizers.Adam(lr=learning_rate)

# Loss
loss = losses.CategoricalCrossentropy(from_logits=True)

# Compile
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy']
)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


### Train Model

In [None]:
# Train model
drive.mount('/content/drive', force_remount=False)

start_time = time.time()

epochs = 10
iteration = int(top_answers.sum().values)//(batch_size * 100) + 1

for epoch in trange(epochs):
    for i in trange(iteration):
        training_results = model.fit(
            train.take(100),
            validation_data=valid.take(10),
            batch_size=batch_size,
            callbacks = keras.callbacks.ModelCheckpoint( 
                filepath='/content/drive/My Drive/Practicum2Data/vqa_model.h5', 
                monitor='val_accuracy', 
                save_best_only=True, 
                save_weights_only=True
            ),
            epochs=1, 
            verbose=0
        )
    model.evaluate(valid.take(100))

execution_time = (time.time() - start_time)/60.0
print("Training execution time (mins)",execution_time)

evaluate_save_model(
    model, 
    valid, 
    training_results, 
    execution_time, 
    learning_rate, 
    batch_size, 
    epochs, 
    optimizers,
    save=True
)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

In [None]:
int(top_answers.sum().values)