In [3]:
# !pip install transformers
# !pip install peft
# !pip install accelerate
# !pip install torchvision
# !pip install datasets
# !pip install pillow
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [4]:
import os
import pandas as pd
import torch
from transformers import ViltProcessor, ViltConfig, ViltForQuestionAnswering
from peft import PeftModel
from PIL import Image
from sklearn.metrics import accuracy_score, f1_score
from evaluate import load
from tqdm import tqdm



In [5]:
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"



In [6]:
# Load saved model, processor, and config
root = "/content/drive/Othercomputers/MyMacBookPro/VR_PROJECT_2"
save_directory = os.path.join(root, "vilt-finetuned-vqa")

processor = ViltProcessor.from_pretrained(save_directory)
config = ViltConfig.from_pretrained(save_directory)
base_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa", config=config, ignore_mismatched_sizes=True)
model = PeftModel.from_pretrained(base_model, save_directory)

model.to(device)
model.eval()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of ViltForQuestionAnswering were not initialized from the model checkpoint at dandelin/vilt-b32-finetuned-vqa and are newly initialized because the shapes did not match:
- classifier.3.weight: found shape torch.Size([3129, 1536]) in the checkpoint and torch.Size([526, 1536]) in the model instantiated
- classifier.3.bias: found shape torch.Size([3129]) in the checkpoint and torch.Size([526]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): ViltForQuestionAnswering(
      (vilt): ViltModel(
        (embeddings): ViltEmbeddings(
          (text_embeddings): TextEmbeddings(
            (word_embeddings): Embedding(30522, 768)
            (position_embeddings): Embedding(40, 768)
            (token_type_embeddings): Embedding(2, 768)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (patch_embeddings): ViltPatchEmbeddings(
            (projection): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32))
          )
          (token_type_embeddings): Embedding(2, 768)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (encoder): ViltEncoder(
          (layer): ModuleList(
            (0-11): 12 x ViltLayer(
              (attention): ViltAttention(
                (attention): ViltSelfAttention(
                  (query): lora.Linear

In [11]:
#!pip install bert_score
!pip install evaluate bert_score nltk




In [13]:
# Load test data
test_csv = os.path.join(root, "vqa_test_cleaned.csv")
df = pd.read_csv(test_csv)

# Initialize metric libraries
bertscore = load("bertscore")
meteor = load("meteor")

# Lists to store predictions and ground truth
true_answers = []
predicted_answers = []



Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [14]:
# Evaluation loop
for _, row in tqdm(df.iterrows(), total=len(df)):
    try:
        image_path = os.path.join(root, "images", row["path"])
        image = Image.open(image_path).convert("RGB")
        image = image.resize((384, 384))
        question = row["generated_question"]
        true_answer = row["generated_answer"].lower()

        # Preprocess
        inputs = processor(images=image, text=question, return_tensors="pt").to(device)

        # Inference
        with torch.no_grad():
            outputs = model(**inputs)
            pred_id = outputs.logits.argmax(-1).item()
            predicted_answer = model.config.id2label[pred_id].lower()

        true_answers.append(true_answer)
        predicted_answers.append(predicted_answer)
    except Exception as e:
        print(f"Skipping row due to error: {e}")
        continue



  5%|▌         | 168/3318 [01:50<30:00,  1.75it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (100 > 40). Running this sequence through the model will result in indexing errors


Skipping row due to error: The size of tensor a (100) must match the size of tensor b (40) at non-singleton dimension 1


  7%|▋         | 221/3318 [02:33<32:48,  1.57it/s]

Skipping row due to error: The size of tensor a (41) must match the size of tensor b (40) at non-singleton dimension 1


 23%|██▎       | 771/3318 [10:36<26:53,  1.58it/s]

Skipping row due to error: The size of tensor a (165) must match the size of tensor b (40) at non-singleton dimension 1


 23%|██▎       | 779/3318 [10:41<26:26,  1.60it/s]

Skipping row due to error: The size of tensor a (96) must match the size of tensor b (40) at non-singleton dimension 1


 24%|██▍       | 795/3318 [10:57<29:36,  1.42it/s]

Skipping row due to error: The size of tensor a (76) must match the size of tensor b (40) at non-singleton dimension 1


 30%|██▉       | 984/3318 [13:36<21:54,  1.78it/s]

Skipping row due to error: The size of tensor a (45) must match the size of tensor b (40) at non-singleton dimension 1


 35%|███▌      | 1167/3318 [16:13<22:32,  1.59it/s]

Skipping row due to error: The size of tensor a (43) must match the size of tensor b (40) at non-singleton dimension 1


 38%|███▊      | 1247/3318 [17:21<23:43,  1.46it/s]

Skipping row due to error: The size of tensor a (49) must match the size of tensor b (40) at non-singleton dimension 1


 50%|█████     | 1673/3318 [23:26<16:08,  1.70it/s]

Skipping row due to error: The size of tensor a (60) must match the size of tensor b (40) at non-singleton dimension 1


 51%|█████     | 1699/3318 [23:48<19:20,  1.40it/s]

Skipping row due to error: The size of tensor a (46) must match the size of tensor b (40) at non-singleton dimension 1


 58%|█████▊    | 1936/3318 [27:16<14:47,  1.56it/s]

Skipping row due to error: The size of tensor a (89) must match the size of tensor b (40) at non-singleton dimension 1


 61%|██████▏   | 2039/3318 [28:48<15:10,  1.40it/s]

Skipping row due to error: The size of tensor a (49) must match the size of tensor b (40) at non-singleton dimension 1


 64%|██████▎   | 2113/3318 [29:51<15:05,  1.33it/s]

Skipping row due to error: The size of tensor a (50) must match the size of tensor b (40) at non-singleton dimension 1


 72%|███████▏  | 2384/3318 [33:50<11:25,  1.36it/s]

Skipping row due to error: The size of tensor a (53) must match the size of tensor b (40) at non-singleton dimension 1


 73%|███████▎  | 2436/3318 [34:35<09:50,  1.49it/s]

Skipping row due to error: The size of tensor a (46) must match the size of tensor b (40) at non-singleton dimension 1


 78%|███████▊  | 2578/3318 [36:41<08:39,  1.42it/s]

Skipping row due to error: The size of tensor a (50) must match the size of tensor b (40) at non-singleton dimension 1


 81%|████████  | 2685/3318 [38:12<06:19,  1.67it/s]

Skipping row due to error: The size of tensor a (124) must match the size of tensor b (40) at non-singleton dimension 1


 92%|█████████▏| 3050/3318 [43:24<03:08,  1.42it/s]

Skipping row due to error: The size of tensor a (81) must match the size of tensor b (40) at non-singleton dimension 1


 99%|█████████▉| 3301/3318 [47:00<00:11,  1.51it/s]

Skipping row due to error: The size of tensor a (52) must match the size of tensor b (40) at non-singleton dimension 1


100%|█████████▉| 3312/3318 [47:09<00:03,  1.52it/s]

Skipping row due to error: The size of tensor a (101) must match the size of tensor b (40) at non-singleton dimension 1


100%|██████████| 3318/3318 [47:14<00:00,  1.17it/s]


In [15]:
# Compute standard metrics
accuracy = accuracy_score(true_answers, predicted_answers)
f1 = f1_score(true_answers, predicted_answers, average='weighted')



In [16]:
# Compute advanced metrics
bertscore_results = bertscore.compute(predictions=predicted_answers, references=true_answers, model_type="bert-base-uncased")
# bartscore_results = bartscore.compute(predictions=predicted_answers, references=true_answers, model_type="facebook/bart-large")
meteor_results = meteor.compute(predictions=predicted_answers, references=true_answers)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [17]:
# Output results
print(f"\n--- Evaluation Results ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"BERTScore F1: {sum(bertscore_results['f1']) / len(bertscore_results['f1']):.4f}")
# print(f"BARTScore F1: {sum(bartscore_results['f1']) / len(bartscore_results['f1']):.4f}")
print(f"METEOR Score: {meteor_results['meteor']:.4f}")



--- Evaluation Results ---
Accuracy: 0.7817
F1 Score: 0.7559
BERTScore F1: 0.9337
METEOR Score: 0.4016
