In [1]:
from google.colab import drive
drive.mount('/content/drive')

# Create project folders on Drive
import os
base = '/content/drive/MyDrive/AMD_TNBC'
for folder in ['data', 'checkpoints', 'outputs']:
    os.makedirs(f'{base}/{folder}', exist_ok=True)
print(" Drive mounted and folders ready!")

Mounted at /content/drive
 Drive mounted and folders ready!


In [2]:
!pip install datasets transformers Pillow -q

In [3]:
from datasets import load_dataset
from PIL import Image
import os

# Load PathVQA dataset
dataset = load_dataset("flaviagiammarino/path-vqa", split="train")

# Keywords relevant to TNBC pathology
TNBC_KEYWORDS = [
    "triple negative", "invasive ductal", "basal-like",
    "breast cancer", "carcinoma", "necrosis", "mitotic"
]

def is_tnbc_relevant(sample):
    question = sample.get("question", "").lower()
    answer = sample.get("answer", "").lower()
    return any(kw in question or kw in answer for kw in TNBC_KEYWORDS)

filtered = dataset.filter(is_tnbc_relevant)
print(f"Total samples after filtering: {len(filtered)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00007-f2d0e9ef9f022d(…):   0%|          | 0.00/42.8M [00:00<?, ?B/s]



data/train-00001-of-00007-47d8e0220bf6c9(…):   0%|          | 0.00/81.0M [00:00<?, ?B/s]

data/train-00002-of-00007-7fb5037c4c5da7(…):   0%|          | 0.00/104M [00:00<?, ?B/s]

data/train-00003-of-00007-74b9b7b81cc55f(…):   0%|          | 0.00/90.0M [00:00<?, ?B/s]

data/train-00004-of-00007-77eea90af4a55d(…):   0%|          | 0.00/46.1M [00:00<?, ?B/s]

data/train-00005-of-00007-5332ec423be520(…):   0%|          | 0.00/55.8M [00:00<?, ?B/s]

data/train-00006-of-00007-637a58c700b604(…):   0%|          | 0.00/57.3M [00:00<?, ?B/s]

data/validation-00000-of-00003-90a5518d2(…):   0%|          | 0.00/41.3M [00:00<?, ?B/s]

data/validation-00001-of-00003-cbfe947a3(…):   0%|          | 0.00/45.7M [00:00<?, ?B/s]

data/validation-00002-of-00003-9ec816895(…):   0%|          | 0.00/64.7M [00:00<?, ?B/s]

data/test-00000-of-00003-e9adadb4799f44d(…):   0%|          | 0.00/41.2M [00:00<?, ?B/s]

data/test-00001-of-00003-7ea98873fc91981(…):   0%|          | 0.00/45.3M [00:00<?, ?B/s]

data/test-00002-of-00003-162830843501982(…):   0%|          | 0.00/69.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19654 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6259 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6719 [00:00<?, ? examples/s]

Filter:   0%|          | 0/19654 [00:00<?, ? examples/s]



Total samples after filtering: 1079


In [4]:
OUTPUT_DIR = "/content/drive/MyDrive/AMD_TNBC/data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

for idx, sample in enumerate(filtered):
    # Resize image to 512x512
    img = sample["image"].convert("RGB").resize((512, 512), Image.LANCZOS)
    img_path = os.path.join(OUTPUT_DIR, f"tnbc_{idx:04d}.png")
    img.save(img_path)

    # Generate caption text file
    caption = (
        f"Histopathology slide of triple-negative breast cancer. "
        f"{sample.get('question', '')} {sample.get('answer', '')}"
    ).strip()

    txt_path = os.path.join(OUTPUT_DIR, f"tnbc_{idx:04d}.txt")
    with open(txt_path, "w") as f:
        f.write(caption)

print(f"Saved {idx + 1} image-caption pairs to {OUTPUT_DIR}")

Saved 1079 image-caption pairs to /content/drive/MyDrive/AMD_TNBC/data


In [7]:
import os
import re

DATA_DIR = "/content/drive/MyDrive/AMD_TNBC/data"

def clean_caption(question, answer):
    # Remove question format and combine into a clinical statement
    question = question.strip().rstrip("?").lower()
    answer = answer.strip().lower()

    # If answer is just yes/no, use the question as the descriptor
    if answer in ["yes", "no"]:
        polarity = "showing" if answer == "yes" else "not showing"
        return f"Histopathology slide of triple-negative breast cancer {polarity} {question}."
    else:
        return f"Histopathology slide of triple-negative breast cancer demonstrating {answer}."

# Reload the filtered dataset to get original fields
from datasets import load_dataset

dataset = load_dataset("flaviagiammarino/path-vqa", split="train")

TNBC_KEYWORDS = [
    "triple negative", "invasive ductal", "basal-like",
    "breast cancer", "carcinoma", "necrosis", "mitotic"
]

def is_tnbc_relevant(sample):
    question = sample.get("question", "").lower()
    answer = sample.get("answer", "").lower()
    return any(kw in question or kw in answer for kw in TNBC_KEYWORDS)

filtered = dataset.filter(is_tnbc_relevant)

# Rewrite caption files
for idx, sample in enumerate(filtered):
    caption = clean_caption(
        sample.get("question", ""),
        sample.get("answer", "")
    )
    txt_path = os.path.join(DATA_DIR, f"tnbc_{idx:04d}.txt")
    with open(txt_path, "w") as f:
        f.write(caption)

print(f"Captions cleaned for {idx + 1} samples.")

# Verify first 3
for i in range(3):
    with open(os.path.join(DATA_DIR, f"tnbc_{i:04d}.txt")) as f:
        print(f"Sample {i}: {f.read()}")

Filter:   0%|          | 0/19654 [00:00<?, ? examples/s]

Captions cleaned for 1079 samples.
Sample 0: Histopathology slide of triple-negative breast cancer demonstrating foci of fat necrosis.
Sample 1: Histopathology slide of triple-negative breast cancer demonstrating excellent photo typical adenocarcinoma extending through muscularis to serosa.
Sample 2: Histopathology slide of triple-negative breast cancer showing does this image show excellent photo typical adenocarcinoma extending through muscularis to serosa.
