In [31]:
from transformers import ViltProcessor, ViltForQuestionAnswering
import torch
from PIL import Image
import requests
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
#from word2number import w2n

In [32]:
# Mount Google Drive (Colab only) so the '/content/drive/...' paths used by the
# following cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
# Location of the merged VQA annotation file on the mounted Drive.
json_path = '/content/drive/MyDrive/VR Project 2/Curated Dataset/final_VQA_dict_merged_1234_2.json'

# Parse the annotations into a dict keyed by image id. Each value maps question
# text to its answer record and also carries a 'path' entry for the image file.
with open(json_path, encoding='utf-8') as annotations_file:
    vqa_data = json.loads(annotations_file.read())

In [34]:
# Inspect a single record to see the per-image annotation layout.
vqa_data['91wdUx01uXL']

{'What is this product?': {'correct_option': 'Wall Clock',
  'plausible_options': ['Desk Clock', 'Alarm Clock', 'Clock Radio']},
 'Which category best describes this product?': {'correct_option': 'Wall Clocks',
  'plausible_options': ['Clocks', 'Home Decor', 'Kitchen Decor']},
 'What is the primary color of this Wall Clock?': {'correct_option': 'Black',
  'plausible_options': ['Brown', 'White', 'Gold']},
 'What year is displayed on the clock face?': {'correct_option': '1965',
  'plausible_options': ['1975', '1985', '2000']},
 'What is the shape of the clock?': {'correct_option': 'Round',
  'plausible_options': ['Square', 'Rectangular', 'Oval']},
 'Is the clock made of wood?': {'correct_option': 'yes',
  'plausible_options': ['no', 'partially', 'not visible']},
 'path': '/content/drive/MyDrive/VR Project 2/Data/abo-images-small/images/small/cc/cc9f405c.jpg'}

In [36]:
# Flatten the nested {image_id: {question: answer_record, ..., 'path': ...}}
# annotations into one row per (image, question) pair.
# The image location is taken directly from rec['path']; no base directory is
# prepended (the previously computed, never-used joined path has been removed).
examples = []
for image_id, rec in vqa_data.items():
    for question, qa in rec.items():
        # 'path' sits next to the questions but holds the image location,
        # not a question/answer record.
        if question == 'path':
            continue
        # Records name the ground-truth answer either 'correct_option' or 'correct'.
        correct = qa['correct_option'] if 'correct_option' in qa else qa['correct']
        examples.append({
            'Image_ID': image_id,
            'Question': question,
            'Image_Path': rec['path'],
            'Answer': correct
        })

In [37]:
# Build the question table, then keep a reproducible 60% random subset with a
# fresh 0..n-1 index.
df = (
    pd.DataFrame(examples)
    .sample(frac=0.6, random_state=42)
    .reset_index(drop=True)
)
df

Unnamed: 0,Image_ID,Question,Image_Path,Answer
0,71ovuTiG8AL,What is the primary color of this Desk?,/content/drive/MyDrive/VR Project 2/Data/abo-i...,Light Brown
1,71P7CM5zw-L,What is the primary color of this TV Trolley?,/content/drive/MyDrive/VR Project 2/Data/abo-i...,Black
2,71IjzZFmmiL,What type of earring setting is used?,/content/drive/MyDrive/VR Project 2/Data/abo-i...,Stud
3,21t4oVa+FRL,What is the shape of the containers?,/content/drive/MyDrive/VR Project 2/Data/abo-i...,Square
4,810jM9UbU7L,Is the fitted sheet shown made of cotton?,/content/drive/MyDrive/VR Project 2/Data/abo-i...,Yes
...,...,...,...,...
15509,51Ztzh1HAHL,Which category best describes this product?,/content/drive/MyDrive/VR Project 2/Data/abo-i...,Disposable Tableware
15510,81MXFSxSGbL,What is this product?,/content/drive/MyDrive/VR Project 2/Data/abo-i...,TV Mount
15511,81CI0nfUdVL,How many cubes are there in this storage unit?,/content/drive/MyDrive/VR Project 2/Data/abo-i...,Four
15512,91l2nw9egVL,Which category best describes this product?,/content/drive/MyDrive/VR Project 2/Data/abo-i...,Storage


In [38]:
# Spot-check the stored image path of the row labelled 42.
df.loc[42, 'Image_Path']

'/content/drive/MyDrive/VR Project 2/Data/abo-images-small/images/small/7c/7c944a0c.jpg'

In [39]:
# Split by image, so every question about a given image lands in exactly one split.

# Fixed seed for reproducibility (also seeds NumPy's global RNG).
random_seed = 42
np.random.seed(random_seed)

# Distinct image ids present in the sampled table.
unique_ids = df["Image_ID"].unique()

# 70% of the images go to training; the remaining 30% is halved into
# validation and test (15% each).
train_ids, temp_ids = train_test_split(unique_ids, test_size=0.3, random_state=random_seed)
val_ids, test_ids = train_test_split(temp_ids, test_size=0.5, random_state=random_seed)

# Select the rows that belong to each group of images.
train_df = df.loc[df["Image_ID"].isin(train_ids)]
val_df = df.loc[df["Image_ID"].isin(val_ids)]
test_df = df.loc[df["Image_ID"].isin(test_ids)]

print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")


Train size: 10807, Val size: 2359, Test size: 2348


In [40]:
"""Normalizing and mapping non-existing answers to semantically similar existing answers in label2id"""
# Load the pretrained ViLT VQA checkpoint and its matching processor.
# `model_config_source` is read later for its config's label2id / id2label
# vocabulary; `processor` is handed to the datasets and the collate function.
model_config_source = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# Spelled-out numbers -> digit strings
def get_text_to_num_mapping():
    """Return the lookup table used to canonicalise numeric answers.

    The table maps the words "zero" .. "twenty" to "0" .. "20" and additionally
    contains identity entries for the digit strings "21" .. "1000".
    """
    number_words = (
        "zero", "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty",
    )
    # The position of each word in the tuple is its numeric value.
    text_to_num = {word: str(value) for value, word in enumerate(number_words)}
    text_to_num.update((str(n), str(n)) for n in range(21, 1001))
    return text_to_num

text_to_num_map = get_text_to_num_mapping()

def normalize_answer(answer_str):
    """Canonicalise an answer: stringify, trim, lower-case, and turn number words into digits.

    Values without an entry in `text_to_num_map` are returned in their
    trimmed, lower-cased form.
    """
    cleaned = str(answer_str).strip().lower()
    if cleaned in text_to_num_map:
        return text_to_num_map[cleaned]
    return cleaned

# Take independent copies of each split so that new columns can be added to
# them without writing into a view of the parent frame.
train_df = df.loc[df["Image_ID"].isin(train_ids)].copy()
val_df = df.loc[df["Image_ID"].isin(val_ids)].copy()
test_df = df.loc[df["Image_ID"].isin(test_ids)].copy()

# Attach the canonical form of every ground-truth answer.
train_df['normalized_answer'] = train_df['Answer'].map(normalize_answer)
val_df['normalized_answer'] = val_df['Answer'].map(normalize_answer)
test_df['normalized_answer'] = test_df['Answer'].map(normalize_answer)

# Answer vocabulary of the pretrained checkpoint (answer string -> class id).
original_label2id = model_config_source.config.label2id
original_answers = list(original_label2id)

# Sentence embeddings + cosine similarity are used to match unseen answers
# to vocabulary entries.
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

print("Loading sentence transformer model for semantic matching...")
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the whole vocabulary once; every later lookup reuses these vectors.
print("Computing embeddings for original vocabulary...")
original_embeddings = semantic_model.encode(original_answers, show_progress_bar=False)

def find_semantically_similar_answer(new_answer, original_answers, original_embeddings):
    """Return the vocabulary answer closest to `new_answer` and its cosine similarity.

    `new_answer` is embedded with the module-level `semantic_model` and
    compared against `original_embeddings`, whose rows must line up with
    `original_answers`.

    Returns:
        (best_answer, similarity) for the highest-scoring vocabulary entry.
    """
    query_vector = semantic_model.encode([new_answer], show_progress_bar=False)
    # One query row against every vocabulary row -> a single row of scores.
    scores = cosine_similarity(query_vector, original_embeddings)[0]
    best = np.argmax(scores)
    return original_answers[best], scores[best]

# Create a mapping dictionary for unseen answers
answer_mapping = {}      # normalized answer -> closest in-vocabulary answer
similarity_scores = {}   # normalized answer -> cosine similarity of that match

print("Creating semantic mappings for unseen answers...")
# Process all datasets to create mappings.
# The loop variable is deliberately NOT called `df`: at notebook (module) level
# that name would overwrite the full question table and leave it bound to the
# test split, so re-running the split cells would operate on the wrong data.
for dataset_name, split_df in [("train", train_df), ("val", val_df), ("test", test_df)]:
    mapped_count = 0
    for ans in split_df['normalized_answer'].unique():
        # Only answers missing from the vocabulary need a mapping, and each
        # distinct answer is resolved once across all splits.
        if ans not in original_label2id and ans not in answer_mapping:
            similar_ans, score = find_semantically_similar_answer(ans, original_answers, original_embeddings)
            answer_mapping[ans] = similar_ans
            similarity_scores[ans] = score
            mapped_count += 1

    print(f"Dataset {dataset_name}: Mapped {mapped_count} unseen answers to semantically similar existing answers")

# Translate a normalized answer into its training label
def map_to_similar_answer(answer):
    """Return `answer` if it is a vocabulary label, otherwise its semantic substitute.

    Answers that are neither in `original_label2id` nor in `answer_mapping`
    are returned unchanged.
    """
    in_vocabulary = answer in original_label2id
    return answer if in_vocabulary else answer_mapping.get(answer, answer)

# Resolve every normalized answer to the label it will be trained/evaluated against.
train_df['mapped_answer'] = train_df['normalized_answer'].map(map_to_similar_answer)
val_df['mapped_answer'] = val_df['normalized_answer'].map(map_to_similar_answer)
test_df['mapped_answer'] = test_df['normalized_answer'].map(map_to_similar_answer)

# Report the ten most confident substitutions.
print("\nAnswer mapping examples (with similarity scores):")
if not answer_mapping:
    print("  No mappings were created (all answers already in vocabulary)")
else:
    # (unseen answer, substitute, similarity), best matches first
    sorted_mappings = sorted(
        ((unseen, substitute, similarity_scores[unseen])
         for unseen, substitute in answer_mapping.items()),
        key=lambda entry: entry[2],
        reverse=True,
    )

    for new_ans, similar_ans, score in sorted_mappings[:10]:
        print(f"  '{new_ans}' -> '{similar_ans}' (similarity: {score:.3f})")

    if len(answer_mapping) > 10:
        print(f"  ... and {len(answer_mapping) - 10} more mappings")

# --- Final Check ---
print("\nFinal sizes:")
for split_label, split_size in (("Train", len(train_df)), ("Val", len(val_df)), ("Test", len(test_df))):
    print(f"{split_label}: {split_size}")
print(f"Original vocabulary size: {len(original_label2id)}")
print(f"Total answer mappings created: {len(answer_mapping)}")

Loading sentence transformer model for semantic matching...
Computing embeddings for original vocabulary...
Creating semantic mappings for unseen answers...
Dataset train: Mapped 1477 unseen answers to semantically similar existing answers
Dataset val: Mapped 173 unseen answers to semantically similar existing answers
Dataset test: Mapped 169 unseen answers to semantically similar existing answers

Answer mapping examples (with similarity scores):
  'none' -> 'None' (similarity: 1.000)
  'spoon and fork' -> 'fork and spoon' (similarity: 0.994)
  'yellow and pink' -> 'pink and yellow' (similarity: 0.993)
  'one inch' -> '1 inch' (similarity: 0.974)
  'grey' -> 'gray' (similarity: 0.968)
  'flip-flops' -> 'flip flops' (similarity: 0.966)
  'ball catch' -> 'catch ball' (similarity: 0.965)
  '1-inch' -> '1 inch' (similarity: 0.962)
  'aluminium' -> 'aluminum' (similarity: 0.960)
  'recliners' -> 'recliner' (similarity: 0.952)
  ... and 1809 more mappings

Final sizes:
Train: 10807
Val: 235

In [41]:
"""Create a custom dataset class"""

class QnADataset(Dataset):
    """Dataset of (PIL image, question text, answer class id) items.

    Each row of `dataframe` must provide the columns 'Image_Path', 'Question'
    and 'mapped_answer'. The answer is translated to a class id via `label2id`.

    Args:
        dataframe: table with one question per row.
        image_dir: stored on the instance but not used for loading; images are
            opened from each row's 'Image_Path'.
        processor: accepted for interface compatibility; not used here (the
            processor is applied in the collate function).
        label2id: mapping from answer string to class id.
    """

    def __init__(self, dataframe, image_dir, processor, label2id): # Processor is not strictly needed here anymore, but label2id is
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.label2id = label2id
        self.text_to_num = self.generate_text_to_num_mapping()

    def generate_text_to_num_mapping(self):
        """Return number words "zero".."twenty" mapped to digits, plus identity entries "21".."1000"."""
        text_to_num = {
            "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4",
            "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9",
            "ten": "10", "eleven": "11", "twelve": "12", "thirteen": "13",
            "fourteen": "14", "fifteen": "15", "sixteen": "16", "seventeen": "17",
            "eighteen": "18", "nineteen": "19", "twenty": "20",
        }
        for i in range(21, 1001):
            text_to_num[str(i)] = str(i)
        return text_to_num

    def __len__(self):
        return len(self.dataframe)

    def _resolve_answer(self, raw_answer):
        """Return (answer_string, class_id); class_id is None when no spelling is in the vocabulary.

        The lower-cased, number-normalized form is tried first, exactly as
        before. If that fails, the answer is tried as stored (whitespace
        trimmed only): vocabulary keys are case-sensitive, so lower-casing an
        already-mapped label such as 'None' would otherwise make it unresolvable.
        """
        stripped = str(raw_answer).strip()
        lowered = stripped.lower()
        normalized = self.text_to_num.get(lowered, lowered)
        for candidate in (normalized, stripped):
            if candidate in self.label2id:
                return candidate, self.label2id[candidate]
        return normalized, None

    def __getitem__(self, idx):
        """Load the image and encode the answer of row `idx`.

        Returns:
            dict with "image" (RGB PIL image), "question" (raw string) and
            "labels" (0-dim long tensor holding the class id).

        Raises:
            FileNotFoundError / other image-loading errors: re-raised after logging.
            ValueError: if the mapped answer cannot be found in `label2id`.
        """
        row = self.dataframe.iloc[idx]
        image_path = row['Image_Path']
        question_text = row["Question"]  # Keep as raw text

        # Load PIL image; failures are logged and re-raised so they are
        # visible instead of silently producing a broken sample.
        try:
            pil_image = Image.open(image_path).convert("RGB")
        except FileNotFoundError:
            print(f"Error: Image not found at {image_path}")
            raise
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
            raise

        # Encode the (already mapped) answer string to a class id.
        processed_answer_str, answer_id = self._resolve_answer(row["mapped_answer"])
        if answer_id is None:
            print(f"Warning: Mapped answer '{processed_answer_str}' not found in label2id mapping. Item index: {idx}, Image: {row['Image_Path']}")
            raise ValueError(f"Mapped answer '{processed_answer_str}' (from original '{row.get('Answer', 'N/A')}') not found in label2id mapping for image {row['Image_Path']}.")

        return {
            "image": pil_image,          # Return the PIL Image object
            "question": question_text,   # Return the raw question string
            "labels": torch.tensor(answer_id, dtype=torch.long) # Return the label as a tensor
        }

In [42]:
""" Prepare dataloaders """
from functools import partial

# Size of the classification head: the pretrained vocabulary is used unchanged
# (original_label2id rather than an extended label set).
num_labels = len(original_label2id)

# Directory containing the images
image_dir = "/kaggle/input/filtered-small-amazon-qna"

# One dataset per split. Each uses the ORIGINAL label map and the full
# (unfiltered) split frame, which carries the mapped_answer column.
train_dataset, val_dataset, test_dataset = (
    QnADataset(split_frame, image_dir, processor, original_label2id)
    for split_frame in (train_df, val_df, test_df)
)

# Collate function with original num_labels
def collate_fn(batch, processor, num_classes=num_labels, max_length=40):
    """ViLT-compatible collate function with one-hot encoding.

    Args:
        batch: list of dataset items (dicts with "image", "question", "labels");
            malformed or None entries are dropped.
        processor: callable that turns images + texts into model inputs.
        num_classes: width of the one-hot label vectors.
        max_length: maximum number of text tokens kept per question. Defaults
            to 40, the ViltConfig default for `max_position_embeddings`; the
            previous hard-coded 512 let over-long questions through to a model
            that cannot embed them.

    Returns:
        The processor's encoding with an added "labels" entry of shape
        (batch_size, num_classes), or None if no valid item remains or the
        processor fails on the batch.
    """
    # Filter out invalid entries
    valid_batch = [
        item for item in batch
        if item is not None
        and isinstance(item.get("image"), Image.Image)
        and item.get("question")
        and item.get("labels") is not None
    ]

    if not valid_batch:
        return None

    # Process valid items
    images = [item["image"] for item in valid_batch]
    texts = [item["question"] for item in valid_batch]
    labels = [item["labels"] for item in valid_batch]  # class indices

    # Best effort: a batch the processor cannot handle is reported and dropped,
    # so callers must be prepared to receive None.
    try:
        encoding = processor(
            images=images,
            text=texts,
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=max_length
        )
    except Exception as e:
        print(f"Skipping batch: {str(e)}")
        return None

    # Convert class indices to one-hot rows (the float targets the training
    # loop feeds to the model as `labels`).
    batch_size = len(labels)
    one_hot_labels = torch.zeros(batch_size, num_classes)
    label_indices = torch.tensor([int(label) for label in labels], dtype=torch.long)
    one_hot_labels[torch.arange(batch_size), label_indices] = 1.0

    encoding["labels"] = one_hot_labels
    return encoding

# Build the three loaders. The processor is bound to collate_fn by keyword, so
# DataLoader only has to supply the batch.

# Training: reshuffled every epoch, ragged final batch dropped.
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    drop_last=True,
    collate_fn=partial(collate_fn, processor=processor),
)

# Validation: fixed order, all samples kept.
val_loader = DataLoader(
    val_dataset,
    batch_size=16,
    shuffle=False,
    collate_fn=partial(collate_fn, processor=processor),
)

# Test: fixed order, all samples kept.
test_loader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
    collate_fn=partial(collate_fn, processor=processor),
)

# Fine tuning part

In [43]:
import os
import time
import torch
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup
import torch.optim as optim
from peft import LoraConfig, get_peft_model

# --- Config ---
NUM_EPOCHS    = 20
LEARNING_RATE = 1e-4
WEIGHT_DECAY  = 1e-2
WARMUP_RATIO  = 0.1       # 10% of total steps
MAX_GRAD_NORM = 1.0
# NOTE(review): this directory differs from the OUTPUT_DIR the evaluation cells
# load from (".../Training and Evaluation Scripts/Fine Tune/models/VILT_best").
# Confirm how the checkpoint gets from here to there.
OUTPUT_DIR    = "/content/drive/MyDrive/VR Project 2/Fine Tune/models/best"
os.makedirs(OUTPUT_DIR, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Model + LoRA setup (UPDATED) ---
# The classification head keeps the pretrained vocabulary, so its size matches
# the checkpoint and no weights need to be re-initialised.
original_model = ViltForQuestionAnswering.from_pretrained(
    "dandelin/vilt-b32-finetuned-vqa",
    # Using original vocabulary from the pretrained model
    num_labels=len(original_label2id),
    id2label=model_config_source.config.id2label,
    label2id=original_label2id
    # Removed ignore_mismatched_sizes since we're using original sizes
)

# LoRA adapters on the modules named "query" and "value".
# NOTE(review): modules_to_save=["classifier"] presumably keeps the
# classification head trainable and stored with the adapter; confirm against
# the PEFT documentation.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"]
)
model = get_peft_model(original_model, lora_config)
model.to(device)
model.print_trainable_parameters()

# --- Optimizer + Scheduler + AMP Scaler ---
optimizer = optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY
)

# The schedule is stepped once per training batch, so its length is
# batches-per-epoch times epochs; the first WARMUP_RATIO of it is warmup.
total_steps   = len(train_loader) * NUM_EPOCHS
warmup_steps  = int(WARMUP_RATIO * total_steps)
scheduler     = get_linear_schedule_with_warmup(
    optimizer, warmup_steps, total_steps
)

# Gradient scaler used together with the autocast blocks in the training loop.
scaler = torch.cuda.amp.GradScaler()

# Early-stopping state: best validation loss so far, and the number of
# consecutive non-improving epochs tolerated before stopping.
best_val_loss = float('inf')
patience, patience_counter = 10, 0

# Train for up to NUM_EPOCHS epochs: one mixed-precision training pass and one
# validation pass per epoch, checkpointing whenever the validation loss improves
# and stopping early after `patience` epochs without improvement.
for epoch in range(1, NUM_EPOCHS + 1):
    print(f"\n-- Epoch {epoch}/{NUM_EPOCHS} --")
    t0_epoch = time.time()

    # ---- TRAIN ----
    model.train()
    train_loss = 0.0
    train_batches = 0
    pbar = tqdm(train_loader, desc="Train", leave=False)
    for batch in pbar:
        # collate_fn returns None when a batch has no valid items or the
        # processor failed; such batches are skipped.
        if batch is None:
            continue

        batch = {k: v.to(device) for k,v in batch.items() if v is not None}

        optimizer.zero_grad()
        with torch.amp.autocast('cuda'):  # Updated to new format
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        # Gradients are unscaled before clipping so MAX_GRAD_NORM applies to
        # their true magnitude.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
        scaler.step(optimizer)
        scaler.update()
        # NOTE(review): the scheduler advances on every batch, including any
        # batch for which the scaler may have skipped the optimizer step.
        scheduler.step()

        train_loss += loss.item()
        train_batches += 1
        pbar.set_postfix(loss=loss.item())

    avg_train = train_loss / max(train_batches, 1)  # Avoid division by zero

    # ---- VALIDATION ----
    model.eval()
    val_loss = 0.0
    val_batches = 0
    with torch.no_grad():
        pbar = tqdm(val_loader, desc="Valid", leave=False)
        for batch in pbar:
            # Skip None batches
            if batch is None:
                continue

            batch = {k: v.to(device) for k,v in batch.items() if v is not None}
            with torch.amp.autocast('cuda'):  # Updated to new format
                loss = model(**batch).loss
            val_loss += loss.item()
            val_batches += 1
            pbar.set_postfix(loss=loss.item())

    avg_val = val_loss / max(val_batches, 1)  # Avoid division by zero
    print(f"Train Loss: {avg_train:.4f} | Val Loss: {avg_val:.4f} | Time: {(time.time()-t0_epoch):.1f}s")

    # ---- Early Stopping & Checkpointing ----
    if avg_val < best_val_loss:
        best_val_loss = avg_val
        patience_counter = 0
        print(f" New best! Saving to {OUTPUT_DIR}")
        # `model` is the PEFT-wrapped model created by get_peft_model.
        # NOTE(review): presumably this writes the adapter (plus saved modules)
        # rather than a full checkpoint; confirm before loading it elsewhere.
        model.save_pretrained(OUTPUT_DIR)
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Stopping early (no improvement for {patience} epochs).")
            break

print("\n=== Training Complete ===")
print(f"Best Validation Loss: {best_val_loss:.4f}")
print(f"Best model saved at: {OUTPUT_DIR}")

# Metrics

In [44]:
!pip install bert-score
!git clone https://github.com/neulab/BARTScore.git
import sys
sys.path.append("./BARTScore")
# Now import
from bart_score import BARTScorer

fatal: destination path 'BARTScore' already exists and is not an empty directory.


# Baseline

In [50]:
import sys
import time
import torch
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# 1. Add local BARTScore code to Python's import path
sys.path.append("./BARTScore")

# 2. Semantic-similarity imports
from bert_score import score as bert_score
from bart_score import BARTScorer

# 3. PEFT & model imports
from transformers import ViltForQuestionAnswering
from peft import PeftModel

# 4. Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 5. Load the BASELINE: the pretrained checkpoint with no fine-tuned weights.
#    OUTPUT_DIR is assigned for parity with the fine-tuned evaluation cell but
#    is not read in this cell.
OUTPUT_DIR = "/content/drive/MyDrive/VR Project 2/Training and Evaluation Scripts/Fine Tune/models/VILT_best"
# The label space is the checkpoint's own, so the head sizes match. No
# ignore_mismatched_sizes here: a silent re-initialisation of the classifier
# would invalidate the baseline, so a mismatch should fail loudly.
model = ViltForQuestionAnswering.from_pretrained(
    "dandelin/vilt-b32-finetuned-vqa",
    num_labels=len(original_label2id),
    id2label=model_config_source.config.id2label,
    label2id=original_label2id
)
model.to(device)
model.eval()

# 6. Accumulators
all_pred_ids   = []
all_true_ids   = []
all_pred_texts = []
all_true_texts = []

# --- Start overall timer ---
t0_overall = time.time()

# 7. Inference + gather labels/texts with progress bar
t0_loop = time.time()
# Inference only: without no_grad every forward pass would also build an
# autograd graph, wasting memory and time.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating batches"):
        # collate_fn returns None for batches it could not build.
        if batch is None:
            continue

        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits  = outputs.logits

        # Predicted class = highest logit; true class = position of the 1 in
        # the one-hot label row.
        pred_ids = logits.argmax(dim=-1)
        true_ids = batch["labels"].argmax(dim=-1)

        # Flatten to plain Python ints for the metrics and the id2label lookup
        pred_flat = pred_ids.view(-1).cpu().tolist()
        true_flat = true_ids.view(-1).cpu().tolist()
        all_pred_ids.extend(pred_flat)
        all_true_ids.extend(true_flat)

        # Convert to label strings
        all_pred_texts.extend([model.config.id2label[i] for i in pred_flat])
        all_true_texts.extend([model.config.id2label[i] for i in true_flat])
t1_loop = time.time()
print(f"\nInference & gathering took {t1_loop - t0_loop:.2f}s")

# 8. Classification metrics (macro-averaged over the classes that occur)
t0_cls = time.time()
accuracy  = accuracy_score(all_true_ids, all_pred_ids)
precision = precision_score(all_true_ids, all_pred_ids, average="macro", zero_division=0)
recall    = recall_score(all_true_ids, all_pred_ids, average="macro", zero_division=0)
f1        = f1_score(all_true_ids, all_pred_ids, average="macro", zero_division=0)
t1_cls = time.time()

print(f"\nClassification metrics computed in {t1_cls - t0_cls:.2f}s")
print("=== Classification Metrics ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision (M) : {precision:.4f}")
print(f"Recall    (M) : {recall:.4f}")
print(f"F1 Score  (M) : {f1:.4f}")

# 9. BERTScore (semantic similarity) between predicted and true answer strings
t0_bert = time.time()
bert_p, bert_r, bert_f1 = bert_score(
    all_pred_texts,
    all_true_texts,
    lang="en",
    model_type="bert-base-uncased",
    rescale_with_baseline=True
)
t1_bert = time.time()
print(f"\nBERTScore computed in {t1_bert - t0_bert:.2f}s")
print("=== BERTScore ===")
print(f"Precision : {bert_p.mean().item():.4f}")
print(f"Recall    : {bert_r.mean().item():.4f}")
print(f"F1        : {bert_f1.mean().item():.4f}")

# 10. BARTScore (semantic entailment)
t0_bart = time.time()
bart_scorer = BARTScorer(device=device.type, checkpoint="facebook/bart-large-cnn")
bart_scores = bart_scorer.score(
    all_pred_texts,
    all_true_texts,
    batch_size=8
)
t1_bart = time.time()
mean_bart = sum(bart_scores) / len(bart_scores)
print(f"\nBARTScore computed in {t1_bart - t0_bart:.2f}s")
print("=== BARTScore ===")
print(f"Mean score: {mean_bart:.4f}")

# --- End overall timer ---
t1_overall = time.time()
print(f"\nTotal evaluation time: {t1_overall - t0_overall:.2f}s")


Using device: cuda


Evaluating batches: 100%|██████████| 147/147 [00:41<00:00,  3.50it/s]



Inference & gathering took 41.96s

Classification metrics computed in 0.02s
=== Classification Metrics ===
Accuracy      : 0.2777
Precision (M) : 0.0510
Recall    (M) : 0.0585
F1 Score  (M) : 0.0452

BERTScore computed in 0.93s
=== BERTScore ===
Precision : 0.6376
Recall    : 0.6286
F1        : 0.6314

BARTScore computed in 8.28s
=== BARTScore ===
Mean score: -5.4490

Total evaluation time: 51.19s


# Finetune

In [49]:
import sys
import time
import torch
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# 1. Add local BARTScore code to Python's import path
sys.path.append("./BARTScore")

# 2. Semantic-similarity imports
from bert_score import score as bert_score
from bart_score import BARTScorer

# 3. PEFT & model imports
from transformers import ViltForQuestionAnswering
from peft import PeftModel

# 4. Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 5. Load the fine-tuned model directly from OUTPUT_DIR.
# NOTE(review): this loads OUTPUT_DIR as a complete ViltForQuestionAnswering
# checkpoint. The training cell in this notebook saves the PEFT-wrapped model
# to a different directory (".../Fine Tune/models/best"); confirm that a full
# (merged) checkpoint was exported to the path below.
OUTPUT_DIR = "/content/drive/MyDrive/VR Project 2/Training and Evaluation Scripts/Fine Tune/models/VILT_best"

model = ViltForQuestionAnswering.from_pretrained(OUTPUT_DIR)
model.to(device)
model.eval()

# 6. Accumulators
all_pred_ids   = []
all_true_ids   = []
all_pred_texts = []
all_true_texts = []

# --- Start overall timer ---
t0_overall = time.time()

# 7. Inference + gather labels/texts with progress bar
t0_loop = time.time()
# Inference only: without no_grad every forward pass would also build an
# autograd graph, wasting memory and time.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating batches"):
        # collate_fn returns None for batches it could not build.
        if batch is None:
            continue

        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits  = outputs.logits

        # Predicted class = highest logit; true class = position of the 1 in
        # the one-hot label row.
        pred_ids = logits.argmax(dim=-1)
        true_ids = batch["labels"].argmax(dim=-1)

        # Flatten to plain Python ints for the metrics and the id2label lookup
        pred_flat = pred_ids.view(-1).cpu().tolist()
        true_flat = true_ids.view(-1).cpu().tolist()
        all_pred_ids.extend(pred_flat)
        all_true_ids.extend(true_flat)

        # Convert to label strings
        all_pred_texts.extend([model.config.id2label[i] for i in pred_flat])
        all_true_texts.extend([model.config.id2label[i] for i in true_flat])
t1_loop = time.time()
print(f"\nInference & gathering took {t1_loop - t0_loop:.2f}s")

# 8. Classification metrics (macro-averaged over the classes that occur)
t0_cls = time.time()
accuracy  = accuracy_score(all_true_ids, all_pred_ids)
precision = precision_score(all_true_ids, all_pred_ids, average="macro", zero_division=0)
recall    = recall_score(all_true_ids, all_pred_ids, average="macro", zero_division=0)
f1        = f1_score(all_true_ids, all_pred_ids, average="macro", zero_division=0)
t1_cls = time.time()

print(f"\nClassification metrics computed in {t1_cls - t0_cls:.2f}s")
print("=== Classification Metrics ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision (M) : {precision:.4f}")
print(f"Recall    (M) : {recall:.4f}")
print(f"F1 Score  (M) : {f1:.4f}")

# 9. BERTScore (semantic similarity) between predicted and true answer strings
t0_bert = time.time()
bert_p, bert_r, bert_f1 = bert_score(
    all_pred_texts,
    all_true_texts,
    lang="en",
    model_type="bert-base-uncased",
    rescale_with_baseline=True
)
t1_bert = time.time()
print(f"\nBERTScore computed in {t1_bert - t0_bert:.2f}s")
print("=== BERTScore ===")
print(f"Precision : {bert_p.mean().item():.4f}")
print(f"Recall    : {bert_r.mean().item():.4f}")
print(f"F1        : {bert_f1.mean().item():.4f}")

# 10. BARTScore (semantic entailment)
t0_bart = time.time()
bart_scorer = BARTScorer(device=device.type, checkpoint="facebook/bart-large-cnn")
bart_scores = bart_scorer.score(
    all_pred_texts,
    all_true_texts,
    batch_size=8
)
t1_bart = time.time()
mean_bart = sum(bart_scores) / len(bart_scores)
print(f"\nBARTScore computed in {t1_bart - t0_bart:.2f}s")
print("=== BARTScore ===")
print(f"Mean score: {mean_bart:.4f}")

# --- End overall timer ---
t1_overall = time.time()
print(f"\nTotal evaluation time: {t1_overall - t0_overall:.2f}s")


Using device: cuda


Evaluating batches: 100%|██████████| 147/147 [00:42<00:00,  3.49it/s]



Inference & gathering took 42.13s

Classification metrics computed in 0.02s
=== Classification Metrics ===
Accuracy      : 0.6231
Precision (M) : 0.3336
Recall    (M) : 0.3432
F1 Score  (M) : 0.3159

BERTScore computed in 0.87s
=== BERTScore ===
Precision : 0.8163
Recall    : 0.8141
F1        : 0.8143

BARTScore computed in 8.30s
=== BARTScore ===
Mean score: -3.8496

Total evaluation time: 51.32s
