In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import torch
from transformers import ViltProcessor, ViltForQuestionAnswering
import pandas as pd
from PIL import Image
import numpy as np

In [6]:
# Hugging Face hub id that supplies the ViLT architecture and its processor.
base_model = 'dandelin/vilt-b32-mlm'
# Fine-tuned checkpoint to evaluate. Set the VQA_MODEL_PATH environment variable
# to point at a checkpoint on another machine; the default is the original location.
model_path = os.environ.get(
    "VQA_MODEL_PATH",
    "/nfs/home/scg1143/MLDS/Quarter3/DeepLearning/Project/VQA-ObjectDetection/runs/1/model_state_15.pth",
)

In [7]:
# Resolve the dataset locations relative to the notebook's working directory.
dataset_root = os.path.abspath('../dataset')
image_dir = os.path.join(dataset_root, 'images')
eval_file_path = os.path.join(dataset_root, 'data_eval.csv')

In [8]:
# Load the evaluation split; each row holds a question, its answer and an image id.
test_df = pd.read_csv(filepath_or_buffer=eval_file_path)
print(test_df.shape)
test_df.head(n=5)

(2494, 3)


Unnamed: 0,question,answer,image_id
0,what is the colour of the bag on the chair,pink,image399
1,what is at the right bottom,table,image1341
2,what are found on the rack,toy,image1320
3,what is left of printer,mirror,image529
4,what is the colour of television,black,image201


In [9]:
# Derive the on-disk PNG path of every row from its image id.
test_df['image_path'] = [
    os.path.join(image_dir, f"{image_id}.png") for image_id in test_df['image_id']
]
# Sanity check: the first resolved path must point at a real file.
first_image_path = test_df['image_path'].tolist()[0]
assert os.path.exists(first_image_path)

In [10]:
class VQAPredictor:
    """Inference wrapper around a fine-tuned ViLT question-answering model.

    The processor and model architecture come from ``base_model``; the weights
    and the label vocabulary come from the checkpoint file at ``model_path``.
    """

    def __init__(self, base_model, model_path, device='cpu'):
        """Load the processor, the checkpoint and the model onto ``device``.

        Args:
            base_model: Name or path passed to ``from_pretrained`` for both the
                processor and the model architecture.
            model_path: Checkpoint file; it is read as a dict with the keys
                ``best_epoch_acc``, ``id2label``, ``label2id`` and ``state_dict``.
            device: Torch device the model and the inputs are moved to.

        Raises:
            AssertionError: If ``model_path`` does not exist.
        """
        self.processor = ViltProcessor.from_pretrained(base_model)
        assert os.path.exists(model_path), f"{model_path} does not exists"
        # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
        # map_location='cpu' lets a checkpoint saved from a GPU run load on a host
        # without CUDA; the model is moved to the requested device further down.
        model_dict = torch.load(model_path, map_location='cpu')
        print(f"Accuracy : {model_dict['best_epoch_acc']}")
        self.id2label = model_dict['id2label']
        self.label2id = model_dict['label2id']
        self.device = device
        # The label maps size the classification head; its weights are then
        # replaced by the fine-tuned ones from the checkpoint's state_dict.
        self.model = ViltForQuestionAnswering.from_pretrained(base_model, id2label=self.id2label, label2id=self.label2id)
        self.model.load_state_dict(model_dict['state_dict'])
        self.model.eval()
        self.model.to(self.device)
        print("Model Loaded")

    def predict(self, image_path, text, topk=3):
        """Answer ``text`` about the image stored at ``image_path``.

        Args:
            image_path: Path of the image file to open.
            text: The question.
            topk: Maximum number of candidate answers to return. Values larger
                than the number of labels are clamped instead of raising.

        Returns:
            A list of ``{"answer": label, "prob": score}`` dicts ordered by
            descending score, where ``score`` is the sigmoid of the label's logit.
        """
        # The context manager releases the file handle once the pixels are read.
        # convert("RGB") normalises grayscale/palette/RGBA files to 3 channels -
        # presumably the cause of the "Unable to create tensor" ValueError seen
        # during evaluation; TODO confirm against the failing image.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        encoding = self.processor(image, text, padding="max_length", truncation=True, return_tensors="pt")
        pixel_mask = self.processor.image_processor.pad(encoding['pixel_values'], return_tensors="pt")['pixel_mask']
        encoding['pixel_mask'] = pixel_mask
        encoding.to(self.device)

        with torch.no_grad():
            outputs = self.model(**encoding)
            # Independent per-label scores (sigmoid, not softmax) for the single sample.
            probs = torch.sigmoid(outputs.logits).detach().cpu().numpy()[0]

        # Never ask for more answers than there are labels.
        topk = max(0, min(topk, len(probs)))
        top_indices = np.argsort(probs)[::-1][:topk]
        return [
            {"answer": self.id2label[int(label_idx)], "prob": probs[label_idx]}
            for label_idx in top_indices
        ]


In [11]:
# Prefer the GPU, but fall back to the CPU so the notebook also runs on hosts without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
predictor = VQAPredictor(base_model=base_model, model_path=model_path, device=device)

Accuracy : 86.0523


Some weights of ViltForQuestionAnswering were not initialized from the model checkpoint at dandelin/vilt-b32-mlm and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.1.bias', 'classifier.1.weight', 'classifier.3.bias', 'classifier.3.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded


In [22]:
# Spot-check one evaluation row: ask the model its question and show the top answers.
test_case = test_df.iloc[6]
out = predictor.predict(test_case['image_path'], test_case['question'])
out

[{'answer': 'photo', 'prob': 0.041434072},
 {'answer': 'papers', 'prob': 0.015667612},
 {'answer': 'toy', 'prob': 0.008851868}]

In [23]:
# Show the full evaluation row (question, ground-truth answer, image) for the sample queried above.
test_case

question                          what is the ball on the table
answer                                               basketball
image_id                                               image477
image_path    /nfs/home/scg1143/MLDS/Quarter3/DeepLearning/P...
Name: 6, dtype: object

In [94]:
# Inspect what the image processor alone returns for the sample image.
# `encoding` was never defined at notebook scope (it only exists as a local inside
# VQAPredictor.predict), so this cell failed on a fresh kernel; build it explicitly.
# NOTE(review): presumably this is how the saved output was produced - confirm.
with Image.open(test_case['image_path']) as sample_image:
    encoding = predictor.processor.image_processor(sample_image.convert("RGB"), return_tensors="pt")
encoding.keys()

dict_keys(['pixel_values', 'pixel_mask'])

In [87]:
# Top-1 exact-match evaluation over the whole evaluation split.
num_corrects = 0
num_rows = len(test_df)
eval_rows = zip(test_df['image_path'], test_df['question'], test_df['answer'])
for idx, (image_path, text, expected_answer) in enumerate(eval_rows):
    result = predictor.predict(image_path=image_path, text=text)
    # `result` is empty only if the model has no labels; treat that as a miss.
    if result and result[0]['answer'] == expected_answer:
        num_corrects += 1

    # Progress is keyed on the row position, so it does not depend on the frame's index labels.
    if idx % 200 == 0:
        print(f"Idx : {idx} Num Corrects : {num_corrects}")

# Report the final score; the periodic print above stops at the last multiple of 200.
accuracy = 100.0 * num_corrects / num_rows if num_rows else float('nan')
print(f"Final : {num_corrects}/{num_rows} correct ({accuracy:.2f}%)")

Idx : 0 Num Corrects : 0
Idx : 200 Num Corrects : 56
Idx : 400 Num Corrects : 109
Idx : 600 Num Corrects : 170
Idx : 800 Num Corrects : 248


ValueError: Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length.

In [72]:
# NOTE(review): the saved output below is row 157, but the only visible assignment of
# `test_case` is test_df.iloc[6]; this cell depends on kernel state from a cell that is
# not in the notebook. Re-run after Restart Kernel -> Run All before relying on it.
test_case

question                       how many plastic boxes are there
answer                                                        2
image_id                                               image447
image_path    /nfs/home/scg1143/MLDS/Quarter3/DeepLearning/P...
Name: 157, dtype: object