In [2]:
# Environment setup: install the system development packages libzmq3-dev and
# libffi-dev, then the Python packages listed in requirements.txt with pip's
# cache disabled (--no-cache-dir).
# NOTE(review): `apt-get` is a shell command and is normally invoked from a
# notebook as `!apt-get ...`; confirm that `%apt-get` resolves in this
# environment (it is not one of IPython's built-in magics).
%apt-get install -y libzmq3-dev libffi-dev
%pip install --no-cache-dir -r requirements.txt

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libzmq3-dev is already the newest version (4.3.4-2).
The following NEW packages will be installed:
  libffi-dev
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 63.7 kB of archives.
After this operation, 336 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libffi-dev amd64 3.4.2-4 [63.7 kB]
Fetched 63.7 kB in 1s (102 kB/s)
Selecting previously unselected package libffi-dev:amd64.
(Reading database ... 123622 files and directories currently installed.)
Preparing to unpack .../libffi-dev_3.4.2-4_amd64.deb ...
Unpacking libffi-dev:amd64 (3.4.2-4) ...
Setting up libffi-dev:amd64 (3.4.2-4) ...
Processing triggers for man-db (2.10.2-1) ...
Collecting git+https://github.com/huggingface/transformers.git (from -r /content/drive/MyDrive/copy/requirements.txt (line 92))
  Cloning https://github.com/huggingface/transformers.git

In [20]:
from IPython.display import Image, display
import PIL.Image
import io
import torch
import numpy as np
import torch.nn.functional as F
from processing_image import Preprocess
from visualizing_image import SingleImageViz
from modeling_frcnn import GeneralizedRCNN
from utils import Config
import utils
from transformers import VisualBertForQuestionAnswering, BertTokenizerFast
from datasets import load_dataset
import matplotlib.pyplot as plt

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ScienceQA dataset, fetched through the `datasets` library.
dataset = load_dataset("derek-thomas/ScienceQA")

# Vocabulary files: object labels, attribute labels and VQA answer labels.
OBJ_URL = (
    "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/"
    "master/demo/data/genome/1600-400-20/objects_vocab.txt"
)
ATTR_URL = (
    "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/"
    "master/demo/data/genome/1600-400-20/attributes_vocab.txt"
)
VQA_URL = "https://dl.fbaipublicfiles.com/pythia/data/answers_vqa.txt"


# for visualizing output
def showarray(a, fmt="jpeg"):
    """Display an array inline as an encoded image.

    Args:
        a: Array-like of pixel values. Values are clipped to [0, 255] and
            cast to uint8 before encoding.
        fmt: Image format name handed to PIL when encoding (default "jpeg").

    Returns:
        None. The encoded image is shown through IPython's `display`.
    """
    pixels = np.uint8(np.clip(a, 0, 255))
    picture = PIL.Image.fromarray(pixels)
    # Encode into memory rather than to disk; the bytes are copied out of the
    # buffer before it is closed.
    with io.BytesIO() as buffer:
        picture.save(buffer, fmt)
        encoded = buffer.getvalue()
    display(Image(data=encoded))

In [21]:
# Fetch the three label vocabularies, in order: object labels, attribute
# labels, VQA answer labels.
objids, attrids, vqa_answers = (
    utils.get_data(label_url) for label_url in (OBJ_URL, ATTR_URL, VQA_URL)
)

In [22]:
# Detector side: configuration and weights from the
# "unc-nlp/frcnn-vg-finetuned" checkpoint, placed on `device`, plus the image
# preprocessor built from the same configuration.
frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)
frcnn = frcnn.to(device)
image_preprocess = Preprocess(frcnn_cfg)

# Language side: BERT tokenizer and the VisualBERT question-answering model,
# the latter placed on `device`.
bert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
visualbert_vqa = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa")
visualbert_vqa = visualbert_vqa.to(device)

loading configuration file cache
loading weights file https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin from cache at /root/.cache/torch/transformers/57f6df6abe353be2773f2700159c65615babf39ab5b48114d2b49267672ae10f.77b59256a4cf8343ae0f923246a81489fc8d82f98d082edc2d2037c977c0d9d0
All model checkpoint weights were used when initializing GeneralizedRCNN.

All the weights of GeneralizedRCNN were initialized from the model checkpoint at unc-nlp/frcnn-vg-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GeneralizedRCNN for predictions without further training.


In [27]:
# Walk every example of the ScienceQA train split. For each example:
#   1. if it has an image, run the detector to get region features and show
#      the detected boxes;
#   2. score every answer choice with VisualBERT (text only when there is no
#      image);
#   3. print the highest-scoring choice next to the ground-truth answer.
for example in dataset['train']:  # the train split is used as the example
    image = example['image']  # used directly as a PIL image; may be None
    question = example['question']
    choices = example['choices']
    correct_answer_index = example['answer']
    task_type = example['task']

    # Visual inputs for VisualBERT. They are reset for every example and built
    # at most once, instead of being re-derived for each answer choice, so a
    # text-only example can never pick up features from a previous image.
    visual_embeds = None
    visual_attention_mask = None

    # Only examples that actually carry an image go through the detector.
    if image is not None:
        # Convert the PIL image to a NumPy array and preprocess it.
        image_array = np.array(image)
        images, sizes, scales_yx = image_preprocess(image_array)

        # Make sure the detector input is a FloatTensor living on `device`.
        images = torch.FloatTensor(images).to(device)

        # Visualizer used to draw the detections on top of the original image.
        frcnn_visualizer = SingleImageViz(image_array, id2obj=objids, id2attr=attrids)

        # Run the detector; gradients are disabled to save memory.
        with torch.no_grad():
            output_dict = frcnn(
                images,
                sizes,
                scales_yx=scales_yx,
                padding="max_detections",
                max_detections=frcnn_cfg.max_detections,
                return_tensors="pt",
            )

        # Region features (and normalized boxes) produced by the detector.
        normalized_boxes = output_dict.get("normalized_boxes")
        features = output_dict.get("roi_features").to(device)  # keep on `device`

        # One attention-mask entry per visual feature vector: the mask has the
        # shape of `features` without its last dimension.
        visual_embeds = features
        visual_attention_mask = torch.ones(features.shape[:-1], device=device)

        # Draw the detected boxes with their object / attribute predictions.
        frcnn_visualizer.draw_boxes(
            output_dict.get("boxes"),
            output_dict.pop("obj_ids"),
            output_dict.pop("obj_probs"),
            output_dict.pop("attr_ids"),
            output_dict.pop("attr_probs"),
        )

        # Show the annotated image. The figure is created explicitly and
        # closed afterwards so a loop over the whole split does not accumulate
        # open figures.
        fig, ax = plt.subplots()
        ax.imshow(frcnn_visualizer._get_buffer())
        ax.axis('off')
        plt.show()
        plt.close(fig)

    else:
        # No image: report it and fall through to text-only scoring.
        print("Image not available for this example.")

    # Score each choice by feeding "question + choice" through VisualBERT.
    choice_scores = []
    for choice in choices:
        question_choice_pair = question + " " + choice

        # Encode the text input.
        # NOTE(review): question and choice are joined into a single string
        # and truncated to 30 tokens, so a long question can cut off part or
        # all of the choice -- confirm this limit is intended.
        inputs = bert_tokenizer(
            [question_choice_pair],
            padding="max_length",
            max_length=30,
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        # Move every encoded tensor to `device`.
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Predict with VisualBERT; gradients are disabled to save memory.
        with torch.no_grad():
            output_vqa = visualbert_vqa(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                visual_embeds=visual_embeds,  # None when the example has no image
                visual_attention_mask=visual_attention_mask,
                token_type_ids=inputs['token_type_ids'],
                output_attentions=False,
            )

        # The score of a choice is the largest softmax probability over the
        # model's output logits.
        # NOTE(review): this is the model's peak confidence over its own
        # output classes, not a probability assigned to this particular
        # choice -- confirm that this is the intended ranking signal.
        logits = output_vqa["logits"]
        score = F.softmax(logits, dim=-1).max().item()
        choice_scores.append(score)

    # Index of the best-scoring choice (the first one wins on ties).
    predicted_answer_index = choice_scores.index(max(choice_scores))
    predicted_answer = choices[predicted_answer_index]

    # Report the question, the prediction and the ground truth.
    print(f"Question:\t{question}")
    print(f"Predicted Answer:\t{predicted_answer}")
    print(f"Correct Answer:\t{choices[correct_answer_index]}")
    print(f"Task Type:\t{task_type}")
    print("=" * 50)


Output hidden; open in https://colab.research.google.com to view.