**Installing particular dependencies**

In [None]:
#!pip install yacs cython matplotlib
!pip install --upgrade matplotlib
!pip install sentencepiece
!pip install torch pytorch-lightning

**Installing MMF**

In [42]:
%cd /content/
%rm -rf mmf
!git clone https://github.com/albertkjoller/explainableVQA.git explainableVQA
%cd /content/explainableVQA/mmf
# Don't modify torch version
!sed -i '/torch/d' requirements.txt
!pip install -e .

import sys
sys.path.append("/content/explainableVQA/mmf/mmf")

**Downloading dataset** for visualization (not working)

In [None]:
# Importing
# The registry is needed to register the dataset or our new model so that MMF can discover it
from mmf.common.registry import registry

from mmf.models.mmbt import MMBT
from mmf.utils.build import build_dataset
from mmf.utils.env import setup_imports

import matplotlib.pyplot as plt


In [None]:
# Build the OK-VQA dataset through MMF. `setup_imports()` is called first, as
# in the original cell, before asking MMF for the dataset by its key.
setup_imports()
dataset = build_dataset("okvqa")

# Use a large default figure size, then show a grid of 8 samples
# (4 per row, each resized to 512x512).
plt.rcParams.update({"figure.figsize": (20, 20)})
dataset.visualize(
    num_samples=8,
    size=(512, 512),
    nrow=4,
)

**Building the model**

In [15]:
# importing
import torch

# All models using MMF need to inherit from BaseModel
from mmf.models.base_model import BaseModel

# Builder methods for image encoder and classifier
from mmf.utils.build import (
    build_classifier_layer,
    build_image_encoder,
    build_text_encoder,
)


In [29]:

# Register the model with MMF under the key "first_model"; this is the key that
# `model=first_model` (CLI / opts) refers to.
@registry.register_model("first_model")
class First_Model(BaseModel):
    """Late-fusion model: concatenates text-encoder and image-encoder features
    and feeds the result to a classifier.

    The three sub-modules are built from the ``image_encoder``, ``text_encoder``
    and ``classifier`` sections of the model config.
    """

    def __init__(self, config):
        """Store the model config via the parent class and build the sub-modules.

        Args:
            config: The model's config node (hyperparameters); must provide the
                ``image_encoder``, ``text_encoder`` and ``classifier`` entries
                read by :meth:`build`.
        """
        # Calling the parent's init is what makes `self.config` available.
        super().__init__(config)
        self.build()

    @classmethod
    def config_path(cls):
        """Return the location of this model's default config file.

        Returns:
            str: A *relative* path. The previous value started with "/", which
            made it an absolute filesystem path ("/mmf/configs/...") instead of
            a path relative to the MMF package.
        """
        # NOTE(review): assumes the file lives at
        # <mmf package>/configs/models/first_model/defaults.yaml - confirm.
        return "configs/models/first_model/defaults.yaml"

    def build(self):
        """Create the vision, language and classifier modules from the config."""
        # Image encoder, built from `config.image_encoder`. Example yaml:
        #
        #   # "type" specifies the encoder to use, here resnet152
        #   type: resnet152
        #   # "params" are passed to the underlying encoder class by
        #   # build_image_encoder
        #   params:
        #       # Whether to use a pretrained version
        #       pretrained: true
        #       # Pooling type, use max to use AdaptiveMaxPool2D
        #       pool_type: avg
        #       # Number of output features from the encoder, -1 for original,
        #       # otherwise supports values from 1 to 9
        #       num_output_features: 1
        self.vision_module = build_image_encoder(self.config.image_encoder)

        # Text encoder, built from `config.text_encoder`. Example yaml:
        #
        #   # Type of the language encoder, in this case a transformer
        #   type: transformer
        #   # "params" are passed to the encoder through build_text_encoder
        #   params:
        #       # BERT model type
        #       bert_model_name: bert-base-uncased
        #       hidden_size: 768
        #       # Number of BERT layers
        #       num_hidden_layers: 12
        #       # Number of attention heads in the BERT layers
        #       num_attention_heads: 12
        self.language_module = build_text_encoder(self.config.text_encoder)

        # Classifier, built from `config.classifier`. Example yaml:
        #
        #   # Type of the classifier, in this case mlp
        #   type: mlp
        #   # "params" are passed to the classifier through
        #   # build_classifier_layer
        #   params:
        #       # Dimension of the tensor coming into the classifier:
        #       # visual feature dim + language feature dim = 2048 + 768
        #       in_dim: 2816
        #       # Dimension of the tensor going out of the classifier
        #       out_dim: 2
        #       # Number of MLP layers in the classifier
        #       num_layers: 2
        self.classifier = build_classifier_layer(self.config.classifier)

    def forward(self, sample_list):
        """Compute classification scores for a batch.

        Args:
            sample_list: Dict-like batch; the "input_ids" (text) and "image"
                entries are read.

        Returns:
            dict: ``{"scores": logits}`` where ``logits`` is the classifier
            output for the concatenated text and image features.
        """
        # Text input is under the "input_ids" key, image input under "image".
        text = sample_list["input_ids"]
        image = sample_list["image"]

        # Encode both modalities. Only element [1] of the text encoder's output
        # is used (presumably the pooled sentence representation - TODO confirm).
        text_features = self.language_module(text)[1]
        image_features = self.vision_module(image)

        # Flatten everything after the batch dimension so both feature tensors
        # are 2-D before concatenation.
        image_features = torch.flatten(image_features, start_dim=1)
        text_features = torch.flatten(text_features, start_dim=1)

        # Concatenate along the feature dimension: text first, then image.
        combined = torch.cat([text_features, image_features], dim=1)

        # Pass the fused features to the classifier to get scores.
        logits = self.classifier(combined)

        # MMF computes the loss from the "scores" entry, using the loss
        # defined in the config, so the logits are returned under that key.
        output = {"scores": logits}
        return output




**Training**

In [18]:
from mmf_cli.run import run

# Launch MMF from the command line with the experiment config, selecting the
# "first_model" model and the "okvqa" dataset, and running training + validation.
# The leading `!` executes the command in a separate shell process.
# NOTE(review): a model class registered only inside this notebook's kernel is
# presumably not visible to that separate process - confirm that "first_model"
# is also defined inside the installed repo.
!mmf_run config="configs/experiments/first_model/defaults.yaml" \
    model=first_model \
    dataset=okvqa \
    run_type=train_val

In [39]:
# Reset the "state" entry of MMF's registry mapping to an empty dict before
# starting a run from inside the notebook.
registry.mapping["state"] = {}

# Command-line style overrides handed to MMF's `run` entry point:
# config file, model key, dataset key, and zero data-loading workers.
opts = [
    "config='mmf/configs/models/first_model/defaults.yaml'",
    "model=first_model",
    "dataset=okvqa",
    "training.num_workers=0",
]
run(opts=opts)

  "The `env` resolver is deprecated, see https://github.com/omry/omegaconf/issues/573"


FileNotFoundError: ignored