In [1]:
!pip install -q torch torchvision transformers==4.44.2 datasets==2.20.0 accelerate==0.32.1 peft==0.8.2 evaluate==0.4.1 pillow scikit-learn tqdm fsspec==2024.5.0
!pip uninstall -y sentence-transformers
!pip uninstall -y jax jaxlib opencv-python opencv-python-headless opencv-contrib-python shap rasterio pytensor tobler

[0m

In [2]:
import torch
import random
import hashlib
import io
import string
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from sklearn.metrics import accuracy_score

# Hugging Face Libraries (Matches Model Choices in Report)
from datasets import load_dataset
from transformers import (
    BertTokenizer, Blip2Processor, Blip2ForConditionalGeneration,
    TrainingArguments, Trainer, DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
import evaluate

# PyTorch & TorchVision (For CNN-LSTM Baseline)
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

# Set Seeds for Reproducibility (Required for Experiment Integrity)
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.cuda.manual_seed_all(seed)

In [3]:
# Load VQA-RAD dataset (open-source, as specified in assignment)
ds = load_dataset("flaviagiammarino/vqa-rad")
print("Dataset Structure:", ds)
print("Sample Train Item:", ds["train"][0])

# Hash images to group multiple questions per image (for multi-turn setup)
def hash_image(img):
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return hashlib.md5(buf.getvalue()).hexdigest()

image_question_map = defaultdict(list)
for split in ["train", "test"]:
    for item in ds[split]:
        img_hash = hash_image(item["image"])
        image_question_map[img_hash].append(item)

print(f"Unique Images: {len(image_question_map)}")

# Body Part Detection (Anatomical Region Grouping, as in Preliminary Results)
body_parts = {
    "brain": ["brain", "cerebrum", "cerebellum", "ventricle", "cortex"],
    "lung": ["lung", "lungs", "pulmonary", "pleura", "chest"],
    "heart": ["heart", "cardiac", "ventricle", "atrium", "pericardium"],
    "abdomen": ["abdomen", "liver", "kidney", "pancreas", "stomach", "spleen", "intestine", "gallbladder"],
    "pelvis": ["pelvis", "bladder", "prostate", "uterus", "ovary", "pelvic"],
    "spine": ["spine", "vertebra", "cervical", "thoracic", "lumbar", "sacrum"],
    "eye": ["eye", "ocular", "retina", "cornea", "optic"],
    "other": []
}

def detect_body_part(question):
    q = question.lower().translate(str.maketrans("", "", string.punctuation))
    for part, keywords in body_parts.items():
        if any(k in q for k in keywords):
            return part
    return "other"

# Add body part annotations to all samples
for img_hash, qas in image_question_map.items():
    for qa in qas:
        qa["body_part"] = detect_body_part(qa["question"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1793 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/451 [00:00<?, ? examples/s]

Dataset Structure: DatasetDict({
    train: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 1793
    })
    test: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 451
    })
})
Sample Train Item: {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=566x555 at 0x7EB96E5F25D0>, 'question': 'are regions of the brain infarcted?', 'answer': 'yes'}
Unique Images: 314


In [54]:
from transformers import BertTokenizer, BertModel

# Tokenizer
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", use_fast=False)

# BERT model to extract embeddings
bert_model = BertModel.from_pretrained("bert-base-uncased")
for param in bert_model.parameters():
    param.requires_grad = False  # freeze BERT embeddings
bert_embedding_weight = bert_model.embeddings.word_embeddings.weight


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [83]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from transformers import BertTokenizer, BertModel
import torch.optim as optim
from tqdm import tqdm

# ----------------------
# 1️⃣ Preprocessors
# ----------------------
cnn_image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

# ----------------------
# 2️⃣ Dataset
# ----------------------
class SingleTurnDataset(Dataset):
    def __init__(self, samples, tokenizer, processor, max_seq_len=32):
        self.samples = samples
        self.tokenizer = tokenizer
        self.processor = processor
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]

        # Image
        image = self.processor(sample["image"])

        # Question token IDs
        q_encoding = self.tokenizer(
            sample["question"],
            padding="max_length",
            truncation=True,
            max_length=self.max_seq_len,
            return_tensors="pt"
        )
        q_ids = q_encoding["input_ids"].squeeze(0)
        q_mask = q_encoding["attention_mask"].squeeze(0)

        # Answer index
        ans_idx = torch.tensor(ans_to_idx[sample["answer"].strip().lower()], dtype=torch.long)

        return {"image": image, "q_ids": q_ids, "q_mask": q_mask, "ans_idx": ans_idx}

# ----------------------
# 3️⃣ CNN-BERT Med-VQA Model
# ----------------------
class CNNBERTMedVQA(nn.Module):
    def __init__(self, num_classes, lstm_hidden=512, dropout=0.3):
        super().__init__()

        # Image encoder
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        self.img_feat_dim = self.resnet.fc.in_features
        self.resnet.fc = nn.Identity()

        # Question encoder
        self.bert_model = BertModel.from_pretrained("bert-base-uncased")

        # Fusion layer
        self.fusion_input_dim = self.img_feat_dim + 768  # BERT pooler output size
        self.fusion = nn.Sequential(
            nn.Linear(self.fusion_input_dim, 1024),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(1024, 512),
            nn.ReLU()
        )

        # Classifier
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, image, q_ids, q_mask):
        # Image features
        img_feat = self.resnet(image)

        # Question features
        bert_out = self.bert_model(input_ids=q_ids, attention_mask=q_mask)
        q_feat = bert_out.pooler_output  # [batch_size, 768]

        # Concatenate and classify
        combined = torch.cat([img_feat, q_feat], dim=1)
        fused = self.fusion(combined)
        logits = self.classifier(fused)
        return logits

# ----------------------
# 4️⃣ Filter closed-ended QAs
# ----------------------
closed_answers = ["yes", "no", "normal", "abnormal"]

train_cnn_samples = [s for s in train_single if s["answer"].strip().lower() in closed_answers]
test_cnn_samples  = [s for s in test_single if s["answer"].strip().lower() in closed_answers]

# Build vocab
all_answers = sorted(list({s["answer"].strip().lower() for s in train_cnn_samples}))
ans_to_idx = {a: i for i, a in enumerate(all_answers)}
idx_to_ans = {i: a for i, a in enumerate(all_answers)}

print("Answer vocab:", all_answers)

# ----------------------
# 5️⃣ Create Datasets & Loaders
# ----------------------
train_cnn_dataset = SingleTurnDataset(train_cnn_samples, bert_tokenizer, cnn_image_transform)
test_cnn_dataset  = SingleTurnDataset(test_cnn_samples, bert_tokenizer, cnn_image_transform)

train_cnn_loader = DataLoader(train_cnn_dataset, batch_size=8, shuffle=True)
test_cnn_loader  = DataLoader(test_cnn_dataset, batch_size=8, shuffle=False)

# ----------------------
# 6️⃣ Training Setup
# ----------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model = CNNBERTMedVQA(num_classes=len(all_answers)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# ----------------------
# 7️⃣ Training Loop
# ----------------------
num_epochs = 10

for epoch in range(1, num_epochs + 1):
    model.train()
    running_loss = 0.0

    for batch in tqdm(train_cnn_loader, desc=f"Epoch {epoch}/{num_epochs}"):
        images = batch["image"].to(device)
        q_ids  = batch["q_ids"].to(device)
        q_mask = batch["q_mask"].to(device)
        labels = batch["ans_idx"].to(device)

        optimizer.zero_grad()
        outputs = model(images, q_ids, q_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)

    epoch_loss = running_loss / len(train_cnn_dataset)
    print(f"Epoch {epoch} | Train Loss: {epoch_loss:.4f}")

    # ----------------------
    # Evaluation
    # ----------------------
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in test_cnn_loader:
            images = batch["image"].to(device)
            q_ids  = batch["q_ids"].to(device)
            q_mask = batch["q_mask"].to(device)
            labels = batch["ans_idx"].to(device)

            outputs = model(images, q_ids, q_mask)
            preds = torch.argmax(outputs, dim=1)

            correct += (preds == labels).sum().item()
            total += labels.size(0)

    acc = correct / total * 100
    print(f"Epoch {epoch} | Test Accuracy: {acc:.2f}%\n")

Answer vocab: ['abnormal', 'no', 'normal', 'yes']
Using device: cuda


Epoch 1/10: 100%|██████████| 25/25 [00:05<00:00,  4.97it/s]


Epoch 1 | Train Loss: 0.9361
Epoch 1 | Test Accuracy: 51.02%



Epoch 2/10: 100%|██████████| 25/25 [00:04<00:00,  5.10it/s]


Epoch 2 | Train Loss: 0.6683
Epoch 2 | Test Accuracy: 71.43%



Epoch 3/10: 100%|██████████| 25/25 [00:04<00:00,  5.03it/s]


Epoch 3 | Train Loss: 0.6046
Epoch 3 | Test Accuracy: 63.27%



Epoch 4/10: 100%|██████████| 25/25 [00:04<00:00,  5.04it/s]


Epoch 4 | Train Loss: 0.5761
Epoch 4 | Test Accuracy: 65.31%



Epoch 5/10: 100%|██████████| 25/25 [00:05<00:00,  4.99it/s]


Epoch 5 | Train Loss: 0.4543
Epoch 5 | Test Accuracy: 59.18%



Epoch 6/10: 100%|██████████| 25/25 [00:05<00:00,  4.97it/s]


Epoch 6 | Train Loss: 0.4586
Epoch 6 | Test Accuracy: 65.31%



Epoch 7/10: 100%|██████████| 25/25 [00:04<00:00,  5.02it/s]


Epoch 7 | Train Loss: 0.4737
Epoch 7 | Test Accuracy: 71.43%



Epoch 8/10: 100%|██████████| 25/25 [00:04<00:00,  5.03it/s]


Epoch 8 | Train Loss: 0.3660
Epoch 8 | Test Accuracy: 65.31%



Epoch 9/10: 100%|██████████| 25/25 [00:04<00:00,  5.09it/s]


Epoch 9 | Train Loss: 0.3841
Epoch 9 | Test Accuracy: 69.39%



Epoch 10/10: 100%|██████████| 25/25 [00:04<00:00,  5.03it/s]


Epoch 10 | Train Loss: 0.3893
Epoch 10 | Test Accuracy: 73.47%



In [84]:
# ---------------------- CNN+BERT Med-VQA ----------------------

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from transformers import BertTokenizer, BertModel
import torch.optim as optim
from tqdm import tqdm

# ----------------------
# 1️⃣ Preprocessors
# ----------------------
cnn_image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

# ----------------------
# 2️⃣ Dataset
# ----------------------
class SingleTurnDataset(Dataset):
    def __init__(self, samples, tokenizer, processor, max_seq_len=32):
        self.samples = samples
        self.tokenizer = tokenizer
        self.processor = processor
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]

        # Image
        image = self.processor(sample["image"])

        # Question token IDs
        q_encoding = self.tokenizer(
            sample["question"],
            padding="max_length",
            truncation=True,
            max_length=self.max_seq_len,
            return_tensors="pt"
        )
        q_ids = q_encoding["input_ids"].squeeze(0)
        q_mask = q_encoding["attention_mask"].squeeze(0)

        # Answer index
        ans_idx = torch.tensor(ans_to_idx[sample["answer"].strip().lower()], dtype=torch.long)

        return {"image": image, "q_ids": q_ids, "q_mask": q_mask, "ans_idx": ans_idx}

# ----------------------
# 3️⃣ CNN+BERT Med-VQA Model
# ----------------------
class CNNBertMedVQA(nn.Module):
    def __init__(self, num_classes):
        super().__init__()

        # ---------------- Image Encoder ----------------
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        self.img_feat_dim = self.resnet.fc.in_features
        self.resnet.fc = nn.Identity()

        # ---------------- Question Encoder (BERT) ----------------
        self.bert_model = BertModel.from_pretrained("bert-base-uncased")
        # Freeze BERT optionally for small dataset
        for param in self.bert_model.parameters():
            param.requires_grad = False

        # ---------------- Fusion ----------------
        self.fusion_input_dim = self.img_feat_dim + 768  # 768 = BERT pooler_output
        self.fusion = nn.Sequential(
            nn.Linear(self.fusion_input_dim, 1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.ReLU()
        )

        # ---------------- Classifier ----------------
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, image, q_ids, q_mask):
        # Image features
        img_feat = self.resnet(image)

        # Question features from BERT
        outputs = self.bert_model(input_ids=q_ids, attention_mask=q_mask)
        q_feat = outputs.pooler_output  # [batch_size, 768]

        # Fusion and classification
        combined = torch.cat([img_feat, q_feat], dim=1)
        fused = self.fusion(combined)
        logits = self.classifier(fused)
        return logits

# ----------------------
# 4️⃣ Filter closed-ended QAs
# ----------------------
closed_answers = ["yes", "no", "normal", "abnormal"]

train_cnn_samples = [s for s in train_single if s["answer"].strip().lower() in closed_answers]
test_cnn_samples  = [s for s in test_single if s["answer"].strip().lower() in closed_answers]

# Build vocab
all_answers = sorted(list({s["answer"].strip().lower() for s in train_cnn_samples}))
ans_to_idx = {a: i for i, a in enumerate(all_answers)}
idx_to_ans = {i: a for i, a in enumerate(all_answers)}

print("Answer vocab:", all_answers)

# ----------------------
# 5️⃣ Create Datasets & Loaders
# ----------------------
train_cnn_dataset = SingleTurnDataset(train_cnn_samples, bert_tokenizer, cnn_image_transform)
test_cnn_dataset  = SingleTurnDataset(test_cnn_samples, bert_tokenizer, cnn_image_transform)

train_cnn_loader = DataLoader(train_cnn_dataset, batch_size=8, shuffle=True)
test_cnn_loader  = DataLoader(test_cnn_dataset, batch_size=8, shuffle=False)

# ----------------------
# 6️⃣ Training Setup
# ----------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model = CNNBertMedVQA(num_classes=len(all_answers)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# ----------------------
# 7️⃣ Training Loop
# ----------------------
num_epochs = 5

for epoch in range(1, num_epochs + 1):
    model.train()
    running_loss = 0.0

    for batch in tqdm(train_cnn_loader, desc=f"Epoch {epoch}/{num_epochs}"):
        images = batch["image"].to(device)
        q_ids  = batch["q_ids"].to(device)
        q_mask = batch["q_mask"].to(device)
        labels = batch["ans_idx"].to(device)

        optimizer.zero_grad()
        outputs = model(images, q_ids, q_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)

    epoch_loss = running_loss / len(train_cnn_dataset)
    print(f"Epoch {epoch} | Train Loss: {epoch_loss:.4f}")

    # ----------------------
    # Evaluation
    # ----------------------
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in test_cnn_loader:
            images = batch["image"].to(device)
            q_ids  = batch["q_ids"].to(device)
            q_mask = batch["q_mask"].to(device)
            labels = batch["ans_idx"].to(device)

            outputs = model(images, q_ids, q_mask)
            preds = torch.argmax(outputs, dim=1)

            correct += (preds == labels).sum().item()
            total += labels.size(0)

    acc = correct / total * 100
    print(f"Epoch {epoch} | Test Accuracy: {acc:.2f}%\n")

Answer vocab: ['abnormal', 'no', 'normal', 'yes']
Using device: cuda


Epoch 1/5: 100%|██████████| 25/25 [00:03<00:00,  7.15it/s]


Epoch 1 | Train Loss: 0.9579
Epoch 1 | Test Accuracy: 51.02%



Epoch 2/5: 100%|██████████| 25/25 [00:03<00:00,  7.36it/s]


Epoch 2 | Train Loss: 0.8235
Epoch 2 | Test Accuracy: 71.43%



Epoch 3/5: 100%|██████████| 25/25 [00:03<00:00,  7.33it/s]


Epoch 3 | Train Loss: 0.5941
Epoch 3 | Test Accuracy: 71.43%



Epoch 4/5: 100%|██████████| 25/25 [00:03<00:00,  7.21it/s]


Epoch 4 | Train Loss: 0.4997
Epoch 4 | Test Accuracy: 63.27%



Epoch 5/5: 100%|██████████| 25/25 [00:03<00:00,  7.31it/s]


Epoch 5 | Train Loss: 0.4583
Epoch 5 | Test Accuracy: 69.39%

