<a href="https://colab.research.google.com/github/j-min/VL-T5/blob/main/inference_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# VL-T5 inference on custom images

## Download code and install dependencies

In [None]:
!git clone https://github.com/j-min/VL-T5

In [None]:
cd VL-T5

In [None]:
!pip uninstall param -y # to resolve name conflict with src.param.py

In [None]:
!pip uninstall param -y # to resolve name conflict with src.param.py
!pip install -r requirements.txt
!python download_backbones.py

## Download the pretrained checkpoint

In [None]:
import gdown

In [None]:
!mkdir -p VL-T5/snap/pretrain/VLT5

In [None]:
gdown.download('https://drive.google.com/uc?id=100qajGncE_vc4bfjVxxICwz3dwiAxbIZ', 'VL-T5/snap/pretrain/VLT5/Epoch30.pth', quiet=False)

## Add source code path

In [None]:
import sys

In [None]:
sys.path.append('/content/VL-T5/VL-T5/src')
sys.path.append('/content/VL-T5/VL-T5/inference')

In [None]:
cd VL-T5

## Build a model and load weights from the pretrained checkpoint

In [None]:
!pip uninstall param -y

In [None]:
!pip install wandb

In [None]:
pip install sentencepiece

In [None]:
pip install transformers==4.2.1

In [16]:
pip install wget

Defaulting to user installation because normal site-packages is not writeable
[0mCollecting wget
  Using cached wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9657 sha256=259c62251c20610661b1db83156e35654ff0ab5d03badee95de7800987c67227
  Stored in directory: /home/vivek.trivedi/.cache/pip/wheels/04/5f/3e/46cc37c5d698415694d83f607f833f83f0149e49b3af9d0f38
Successfully built wget
[0mInstalling collected packages: wget
Successfully installed wget-3.2
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
import wget

In [2]:
cd

/home/vivek.trivedi


In [2]:
import sentencepiece

In [4]:
cd "VL-T5/src"

/scratch/vivek.trivedi/VL_adapter/VL-T5/src


In [5]:
import transformers

In [6]:
from param import parse_args

In [10]:
args = parse_args(
    parse=False,
    backbone='facebook/bart-base',
    load="/scratch/vivek.trivedi/VL_adapter/VL-T5/snap/VLBart_multitask/4tasks_hard_RN101_LMfull_bs100_image224_lr1e-4/LAST"
)
args.gpu = 1

In [11]:
from vqa import Trainer

In [12]:
trainer = Trainer(args,
                  train=False
                  )

Building Model at GPU 1


Some weights of VLBartVQA were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['encoder.visual_embedding.feat_embedding.0.weight', 'encoder.visual_embedding.feat_embedding.0.bias', 'encoder.visual_embedding.feat_embedding.1.weight', 'encoder.visual_embedding.feat_embedding.1.bias', 'encoder.visual_embedding.absolute_vis_pos_embedding.0.weight', 'encoder.visual_embedding.absolute_vis_pos_embedding.0.bias', 'encoder.visual_embedding.absolute_vis_pos_embedding.1.weight', 'encoder.visual_embedding.absolute_vis_pos_embedding.1.bias', 'encoder.visual_embedding.obj_order_embedding.weight', 'encoder.visual_embedding.img_order_embedding.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded from  /scratch/vivek.trivedi/VL_adapter/VL-T5/snap/VLBart_multitask/4tasks_hard_RN101_LMfull_bs100_image224_lr1e-4/LAST.pth
<All keys matched successfully>
Model Launching at GPU 1
model.encoder.visual_embedding.feat_embedding.0.weight is trainable...
model.encoder.visual_embedding.feat_embedding.0.bias is trainable...
model.encoder.visual_embedding.feat_embedding.1.weight is trainable...
model.encoder.visual_embedding.feat_embedding.1.bias is trainable...
model.encoder.visual_embedding.absolute_vis_pos_embedding.0.weight is trainable...
model.encoder.visual_embedding.absolute_vis_pos_embedding.0.bias is trainable...
model.encoder.visual_embedding.absolute_vis_pos_embedding.1.weight is trainable...
model.encoder.visual_embedding.absolute_vis_pos_embedding.1.bias is trainable...
model.encoder.visual_embedding.img_order_embedding.weight is trainable...
Trainable param percentage: 1.12% (1582848/141156864)
It took 10.1s


# Faster R-CNN inference script (from [Huggingface transformers LXMERT demo](https://github.com/huggingface/transformers/tree/master/examples/research_projects/lxmert))

In [17]:
cd "/scratch/vivek.trivedi/VL_adapter/VL-T5/VL-T5/VL-T5/"

/scratch/vivek.trivedi/VL_adapter/VL-T5/VL-T5/VL-T5


In [18]:
from IPython.display import clear_output, Image, display
import PIL.Image
import io
import json
import torch
import numpy as np
from inference.processing_image import Preprocess
from inference.visualizing_image import SingleImageViz
from inference.modeling_frcnn import GeneralizedRCNN
from inference.utils import Config, get_data

import wget
import pickle
import os


URL = "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/images/input.jpg"
OBJ_URL = "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/genome/1600-400-20/objects_vocab.txt"
ATTR_URL = "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/genome/1600-400-20/attributes_vocab.txt"
GQA_URL = "https://raw.githubusercontent.com/airsplay/lxmert/master/data/gqa/trainval_label2ans.json"
VQA_URL = "https://raw.githubusercontent.com/airsplay/lxmert/master/data/vqa/trainval_label2ans.json"

objids = get_data(OBJ_URL) 
attrids = get_data(ATTR_URL)
gqa_answers = get_data(GQA_URL) 
vqa_answers = get_data(VQA_URL) 
frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg) 
image_preprocess = Preprocess(frcnn_cfg) 

# for visualizing output
def showarray(a, fmt='jpeg'):
    a = np.uint8(np.clip(a, 0, 255))
    f = io.BytesIO()
    PIL.Image.fromarray(a).save(f, fmt)
    display(Image(data=f.getvalue()))

%s not found in cache or force_download set to True, downloading to %s https://s3.amazonaws.com/models.huggingface.co/bert/unc-nlp/frcnn-vg-finetuned/config.yaml /home/vivek.trivedi/.cache/torch/transformers/tmpduu7w2ex


Downloading:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

loading configuration file cache
%s not found in cache or force_download set to True, downloading to %s https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin /home/vivek.trivedi/.cache/torch/transformers/tmpbvt_xu6e


Downloading:   0%|          | 0.00/262M [00:00<?, ?B/s]

loading weights file https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin from cache at /home/vivek.trivedi/.cache/torch/transformers/57f6df6abe353be2773f2700159c65615babf39ab5b48114d2b49267672ae10f.77b59256a4cf8343ae0f923246a81489fc8d82f98d082edc2d2037c977c0d9d0
All model checkpoint weights were used when initializing GeneralizedRCNN.

All the weights of GeneralizedRCNN were initialized from the model checkpoint at unc-nlp/frcnn-vg-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GeneralizedRCNN for predictions without further training.




In [17]:
image_filename = wget.download(URL)

URLError: <urlopen error EOF occurred in violation of protocol (_ssl.c:1129)>

In [None]:
image_dirname = image_filename
frcnn_visualizer = SingleImageViz(image_filename, id2obj=objids, id2attr=attrids) 

images, sizes, scales_yx = image_preprocess(image_filename) 

output_dict = frcnn(
    images, 
    sizes, 
    scales_yx = scales_yx, 
    padding = 'max_detections', 
    max_detections = frcnn_cfg.max_detections, 
    return_tensors = 'pt' 
)

# add boxes and labels to the image 
frcnn_visualizer.draw_boxes(
    output_dict.get("boxes"), 
    output_dict.get("obj_ids"),
    output_dict.get("obj_probs"),
    output_dict.get("attr_ids"), 
    output_dict.get("attr_probs"),
)

showarray(frcnn_visualizer._get_buffer())

normalized_boxes = output_dict.get("normalized_boxes") 
features = output_dict.get("roi_features") 

## Load Tokenizer

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

## Inference

In [111]:
questions = ["vqa: What is the main doing?", 
             "vqa: What color is the clothing the man wears?", 
             "vqa: What color is the horse?",
            "vqa: whatis color of streat"] 

In [112]:
for question in questions:
    input_ids = tokenizer(question, return_tensors='pt', padding=True).input_ids
    batch = {}
    batch['input_ids'] = input_ids
    batch['vis_feats'] = features
    #batch['boxes'] = normalized_boxes
    batch['task']="vqa"

    result = trainer.model.test_step(batch)
    print(f"Q: {question}")
    print(f"A: {result['pred_ans'][0]}")

Q: What is the main doing?
A: 
Q: What color is the clothing the man wears?
A: 
Q: What color is the horse?
A: 
Q: what is color of streat?
A: 


In [103]:
trainer.model

VLBartVQA(
  (model): VLBartModel(
    (shared): Embedding(50465, 768)
    (encoder): JointEncoder(
      (embed_tokens): Embedding(50465, 768)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768, padding_idx=1)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
      (layernorm_embeddin