In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Base model

### Inference

In [1]:
import json
import os
import pandas as pd
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering
import torch
import re

def normalize_answer(ans):
    # Lowercase
    ans = ans.lower()
    # Split on underscores
    tokens = ans.split('_')
    # Split camelCase in each token
    final_tokens = []
    for token in tokens:
        camel_split = re.sub(r'([a-z])([A-Z])', r'\1 \2', token).split()
        final_tokens.extend(camel_split)
    # Join into a single string
    return ' '.join(final_tokens)


# Load model and processor
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa").to("cuda" if torch.cuda.is_available() else "cpu")

# Load data
with open("/kaggle/input/abo-vqa-dataset/input/test/qna_test.json", "r") as f:
    data = json.load(f)

root_img_dir = "/kaggle/input/abo-vqa-dataset/input/test/images_test"  # top-level folder where /images is stored
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Run inference
predictions = []
skipped_items = []

for item in data:
    try:
        image_path = os.path.join(root_img_dir, item["image_path"].lstrip("/"))
        image = Image.open(image_path).convert("RGB")
        question = item["question"]
        answer = normalize_answer(item["answer"])

        inputs = processor(image, question, return_tensors="pt").to(device)
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_idx = logits.argmax(-1).item()
        predicted_answer = model.config.id2label[predicted_idx]

        predictions.append({
            "image_id": item["image_id"],
            "question": question,
            "answer": answer,
            "generated_answer": predicted_answer
        })

    except FileNotFoundError:
        # print(f"[Missing Image] Skipping item with image_path: {item['image_path']}")
        skipped_items.append(item)
    except Exception as e:
        print(f"[Error] Skipping item due to: {e}")
        skipped_items.append(item)

# Create DataFrame
df = pd.DataFrame(predictions)
df.to_csv("results.csv")
print(f"Total items {len(data)}, skipped items: {len(skipped_items)}")


2025-05-16 14:57:45.036069: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747407465.272723      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747407465.345073      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/136k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/470M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/470M [00:00<?, ?B/s]

Total items 1490, skipped items: 0


### Evaluation (BERTScore and BARTScore)

In [7]:
pip install bert-score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert-score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.0.0->bert-score)
  

In [5]:
import torch
import numpy as np
from bert_score import score as bert_score
from transformers import BartTokenizer, BartForConditionalGeneration

# Prepare refs and hyps
refs = df["answer"].astype(str).tolist()
hyps = df["generated_answer"].astype(str).tolist()

df["is_correct"] = df["answer"] == df["generated_answer"]
accuracy = df["is_correct"].mean()
print(f"Exact Match Accuracy: {accuracy:.4f}")

# BERTScore
P, R, F1 = bert_score(hyps, refs, lang="en", rescale_with_baseline=True)
df["bertscore_f1"] = F1.tolist()
mean_bertscore = F1.mean().item()
print(f"{mean_bertscore=}")

# BARTScore
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)
bart_model.eval()

def compute_bartscore(hyp, ref):
    with torch.no_grad():
        input_ids = tokenizer(hyp, return_tensors="pt").input_ids.to(device)
        labels = tokenizer(ref, return_tensors="pt").input_ids.to(device)
        outputs = bart_model(input_ids=input_ids, labels=labels)
        return -outputs.loss.item()  # log-likelihood (higher = better)

# Compute BARTScore for each sample
bart_scores = [compute_bartscore(h, r) for h, r in zip(hyps, refs)]
df["bartscore"] = bart_scores
mean_bartscore = np.mean(bart_scores)
print(f"{mean_bartscore=}")


print("-------------")
print(f"Mean BERTScore (F1): {mean_bertscore:.4f}")
print(f"Mean BARTScore    : {mean_bartscore:.4f}")
print(f"Exact Match Acc.  : {accuracy:.4f}")



Exact Match Accuracy: 0.1047


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mean_bertscore=0.4109959900379181
mean_bartscore=-5.987912863052931
-------------
Mean BERTScore (F1): 0.4110
Mean BARTScore    : -5.9879
Exact Match Acc.  : 0.1047


In [6]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Labels for scikit-learn must be exact matches of all possible classes
y_true = df["answer"]
y_pred = df["generated_answer"]

# If you're dealing with multiple possible labels, use macro or weighted average
precision = precision_score(y_true, y_pred, average="micro")
recall = recall_score(y_true, y_pred, average="micro")
f1 = f1_score(y_true, y_pred, average="micro")

print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")


Precision: 0.1047
Recall   : 0.1047
F1 Score : 0.1047


## Finetuned ViLT

### Inference

In [2]:
import os
os.getcwd()

'/kaggle/working'

In [9]:
from transformers import ViltProcessor, ViltConfig, ViltForQuestionAnswering
from peft import PeftModel
from PIL import Image
import torch
import json
import os
import pandas as pd
from PIL import Image
import re

def normalize_answer(ans):
    # Lowercase
    ans = ans.lower()
    # Split on underscores
    tokens = ans.split('_-')
    # Split camelCase in each token
    final_tokens = []
    for token in tokens:
        camel_split = re.sub(r'([a-z])([A-Z])', r'\1 \2', token).split()
        final_tokens.extend(camel_split)
    # Join into a single string
    return ' '.join(final_tokens)

device = "cuda" if torch.cuda.is_available() else "cpu"

# ✅ Load processor and config from finetuned LoRA folder
# processor = ViltProcessor.from_pretrained("vilt-finetuned-vqa")
# config = ViltConfig.from_pretrained("vilt-finetuned-vqa")  # Must include correct num_labels = 841
processor = ViltProcessor.from_pretrained("/kaggle/input/vilt-finetuned/vilt-finetuned-vqa")
config = ViltConfig.from_pretrained("/kaggle/input/vilt-finetuned/vilt-finetuned-vqa")  # Must include correct num_labels = 841

# ✅ Load base model with config — but DO NOT load weights from vilt-finetuned-vqa
base_model = ViltForQuestionAnswering.from_pretrained(
    "dandelin/vilt-b32-finetuned-vqa",
    config=config,
    ignore_mismatched_sizes=True
)

# ✅ Attach the LoRA adapter trained on top of this config
# model = PeftModel.from_pretrained(base_model, "/kaggle/input/vilt-fintuned-vqa/transformers/default/1/vilt-finetuned-vqa")
model = PeftModel.from_pretrained(base_model, "/kaggle/input/vilt-finetuned/vilt-finetuned-vqa")


model.to(device)
model.eval()

# Load data
with open("/kaggle/input/abo-vqa-dataset/input/test/qna_test.json", "r") as f:
    data = json.load(f)

root_img_dir = "/kaggle/input/abo-vqa-dataset/input/test/images_test"  # top-level folder where /images is stored
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Run inference
predictions = []
skipped_items = []

for item in data:
    try:
        image_path = os.path.join(root_img_dir, item["image_path"].lstrip("/"))
        image = Image.open(image_path).convert("RGB")
        question = item["question"]
        answer = normalize_answer(item["answer"])

        inputs = processor(image, question, return_tensors="pt").to(device)
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_idx = logits.argmax(-1).item()
        predicted_answer = model.config.id2label[predicted_idx]

        predictions.append({
            "image_id": item["image_id"],
            "question": question,
            "answer": answer,
            "generated_answer": normalize_answer(predicted_answer)
        })

    except FileNotFoundError:
        # print(f"[Missing Image] Skipping item with image_path: {item['image_path']}")
        skipped_items.append(item)
    except Exception as e:
        print(f"[Error] Skipping item due to: {e}")
        skipped_items.append(item)

# Create DataFrame
df1 = pd.DataFrame(predictions)
df1.to_csv("ft-results.csv")
print(f"Total items {len(data)}, skipped items: {len(skipped_items)}")


Some weights of ViltForQuestionAnswering were not initialized from the model checkpoint at dandelin/vilt-b32-finetuned-vqa and are newly initialized because the shapes did not match:
- classifier.3.weight: found shape torch.Size([3129, 1536]) in the checkpoint and torch.Size([1964, 1536]) in the model instantiated
- classifier.3.bias: found shape torch.Size([3129]) in the checkpoint and torch.Size([1964]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total items 1490, skipped items: 0


### Evaluation

In [10]:
import torch
import numpy as np
from bert_score import score as bert_score
from transformers import BartTokenizer, BartForConditionalGeneration

# Prepare refs and hyps
refs = df1["answer"].astype(str).tolist()
hyps = df1["generated_answer"].astype(str).tolist()

df1["is_correct"] = df1["answer"] == df1["generated_answer"]
accuracy = df1["is_correct"].mean()
print(f"Exact Match Accuracy: {accuracy:.4f}")

# BERTScore
P, R, F1 = bert_score(hyps, refs, lang="en", rescale_with_baseline=True)
df1["bertscore_f1"] = F1.tolist()
mean_bertscore = F1.mean().item()
print(f"{mean_bertscore=}")

# BARTScore
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)
bart_model.eval()

def compute_bartscore(hyp, ref):
    with torch.no_grad():
        input_ids = tokenizer(hyp, return_tensors="pt").input_ids.to(device)
        labels = tokenizer(ref, return_tensors="pt").input_ids.to(device)
        outputs = bart_model(input_ids=input_ids, labels=labels)
        return -outputs.loss.item()  # log-likelihood (higher = better)

# Compute BARTScore for each sample
bart_scores = [compute_bartscore(h, r) for h, r in zip(hyps, refs)]
df1["bartscore"] = bart_scores
mean_bartscore = np.mean(bart_scores)
print(f"{mean_bartscore=}")


print("-------------")
print(f"Mean BERTScore (F1): {mean_bertscore:.4f}")
print(f"Mean BARTScore    : {mean_bartscore:.4f}")
print(f"Exact Match Acc.  : {accuracy:.4f}")


Exact Match Accuracy: 0.4913


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mean_bertscore=0.6480774879455566
mean_bartscore=-4.223890054905975
-------------
Mean BERTScore (F1): 0.6481
Mean BARTScore    : -4.2239
Exact Match Acc.  : 0.4913
