In [243]:
# Standard Libraries
import os

# Data Manipulation Libraries
import pandas as pd

# Machine Learning Libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torchvision import transforms

# Progress Bar
from tqdm import tqdm

# **Step 1: Load Data**

In [244]:
# Paths
ZIP_PATH = '/cluster/home/bjorneme/projects/Data/vinbigdata-chest-xray-abnormalities-detection.zip'
EXTRACTED_PATH = '/cluster/home/bjorneme/projects/Data/vinbigdata-chest-xray-abnormalities-detection-extracted'

SEED =  42

In [245]:
def extract_data(zip_path, extracted_path):
    """
    Extracts the ZIP file of the dataset.
    """
    os.makedirs(extracted_path, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extracted_path)
    print(f"Data extracted to {extracted_path}")

# Uncomment the line below to extract data (if not already extracted)
# extract_data(ZIP_PATH, EXTRACTED_PATH)

# **Step 2: Data Preprocessing**

In [246]:
# Define Disease Labels
disease_labels = [
    "Aortic enlargement",
    "Atelectasis",
    "Calcification",
    "Cardiomegaly",
    "Consolidation",
    "ILD",
    "Infiltration",
    "Lung Opacity",
    "Nodule/Mass",
    "Other lesion",
    "Pleural effusion",
    "Pleural thickening",
    "Pneumothorax",
    "Pulmonary fibrosis"
]

def load_labels(csv_path, image_path):
    """
    Loads and preprocesses the labels from the CSV file.
    Maps each image to its corresponding file path and binary labels for each disease.
    """

    # Read the CSV file containing labels
    labels_df = pd.read_csv(csv_path)

    # Create binary columns for each disease label
    for disease in disease_labels:
        labels_df[disease] = labels_df['class_name'].str.contains(disease).astype(int)

    # Create a binary column for 'No Finding'
    labels_df['No finding'] = labels_df['class_name'].apply(lambda x: 1 if 'No finding' in x else 0)

    # Map image filenames to their full paths
    labels_df['Path'] = labels_df['image_id'].map(lambda x: os.path.join(image_path, 'train', f"{x}.dicom"))
    
    return labels_df

# Path to the labels CSV file
labels_csv_path = os.path.join(EXTRACTED_PATH, 'train.csv')

# Load and preprocess the labels
labels_df = load_labels(labels_csv_path, EXTRACTED_PATH)

**Print Dataframe**

In [247]:
labels_df.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,Aortic enlargement,Atelectasis,...,Infiltration,Lung Opacity,Nodule/Mass,Other lesion,Pleural effusion,Pleural thickening,Pneumothorax,Pulmonary fibrosis,No finding,Path
0,50a418190bc3fb1ef1633bf9678929b3,No finding,14,R11,,,,,0,0,...,0,0,0,0,0,0,0,0,1,/cluster/home/bjorneme/projects/Data/vinbigdat...
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,14,R7,,,,,0,0,...,0,0,0,0,0,0,0,0,1,/cluster/home/bjorneme/projects/Data/vinbigdat...
2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0,0,0,...,0,0,0,0,0,0,0,0,0,/cluster/home/bjorneme/projects/Data/vinbigdat...
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0,1,0,...,0,0,0,0,0,0,0,0,0,/cluster/home/bjorneme/projects/Data/vinbigdat...
4,063319de25ce7edb9b1c6b8881290140,No finding,14,R10,,,,,0,0,...,0,0,0,0,0,0,0,0,1,/cluster/home/bjorneme/projects/Data/vinbigdat...


**Split Dataset by image_id**

In [248]:
# Split patients into training/validation and test sets
unique_patients = labels_df['image_id'].unique()
train_val_patients, test_patients = train_test_split(
    unique_patients, test_size=0.2, random_state=SEED
)

# Create training/validation and test dataframes
train_df = labels_df[labels_df['image_id'].isin(train_val_patients)].reset_index(drop=True)
val_df = labels_df[labels_df['image_id'].isin(test_patients)].reset_index(drop=True)

# Verify Split Sizes
print(f"Train size: {train_val_df.shape[0]}")
print(f"Val size: {test_df.shape[0]}")

Train size: 54370
Val size: 13544


# **Step 3: Pre-training using BYOL**

In [249]:
# TODO

# **Step 4: Data Preprocessing**

**Define Dataset for VinDr-CXR**

In [250]:
class VinDrCXRDataset(Dataset):
    def __init__(self, image_ids, labels_df, image_dir, transforms=None):
        self.image_ids = image_ids
        self.labels_df = labels_df
        self.image_dir = image_dir
        self.transforms = transforms

    def __len__(self):
        return len(self.image_ids)
    
    def __getitem__(self, idx):
        image_id = self.image_ids[idx]
        # Load image (adjust extension as needed, e.g., .dicom or .jpg)
        img_path = os.path.join(self.image_dir, f"{image_id}.jpg")
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
        
        # Get bounding boxes and (optionally) labels for this image
        records = self.labels_df[self.labels_df['image_id'] == image_id]
        # Expecting boxes in COCO format: [x, y, width, height]
        boxes = records[['x_min', 'y_min', 'x_max', 'y_max']].values
        # Convert from [x_min, y_min, x_max, y_max] to [x, y, w, h]
        boxes[:,2] = boxes[:,2] - boxes[:,0]
        boxes[:,3] = boxes[:,3] - boxes[:,1]
        # Use a default label (e.g., 1) if you only have one object type; otherwise use your actual label column.
        labels = records.get('class_id', pd.Series(np.ones(len(boxes)))).values.tolist()

        sample = {"image": image, "bboxes": boxes, "labels": labels}
        if self.transforms:
            sample = self.transforms(**sample)
        image = sample['image']
        # After transforms, boxes remain in COCO format (normalized if your transform does so)
        target = {
            'boxes': torch.as_tensor(sample['bboxes'], dtype=torch.float32),
            'labels': torch.as_tensor(sample['labels'], dtype=torch.long),
            'image_id': torch.tensor([idx])
        }
        return image, target

**Define Data Transformations**

In [251]:
# Define transformations for training data
train_transforms = transforms.Compose([

    # Convert image to PIL format for further transformations
    transforms.ToPILImage(),

    # Convert to grayscale and change to 3 channels
    transforms.Grayscale(num_output_channels=3),

    # Resize the image to 224x224
    transforms.Resize((224, 224)),

    # Apply random horizontal flip to augment the data
    transforms.RandomHorizontalFlip(),

    # Randomly rotate the image within a range of ±10 degrees
    transforms.RandomRotation(10),

    # Convert the image to a PyTorch tensor
    transforms.ToTensor(),

    # Normalize using ImageNet mean and std
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Define transformations for test data
val_transforms = transforms.Compose([

    # Convert image to PIL format for further transformations
    transforms.ToPILImage(),

    # Convert to grayscale and change to 3 channels
    transforms.Grayscale(num_output_channels=3),

    # Resize the image to 224x224
    transforms.Resize((224, 224)),

    # Convert the image to a PyTorch tensor
    transforms.ToTensor(),

    # Normalize using ImageNet mean and std
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

**Create Datasets**

In [252]:
train_dataset = VinDrCXRDataset(train_df, labels_df, image_dir=os.path.join(EXTRACTED_PATH, 'train'),
                                   transforms=train_transforms)
val_dataset = VinDrCXRDataset(val_df, labels_df, image_dir=os.path.join(EXTRACTED_PATH, 'train'),
                                   transforms=val_transforms)

**Create DataLoaders**

In [253]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=32)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=32)

# **Step 7: Build the Model**

In [254]:
class DETRModel(nn.Module):
    def __init__(self, num_classes, num_queries=100):
        super().__init__()
        self.model = torch.hub.load('facebookresearch/detr', 'detr_resnet50', pretrained=True)
        in_features = self.model.class_embed.in_features
        # Update classification head to match number of classes (note: background is handled separately)
        self.model.class_embed = nn.Linear(in_features, num_classes)
        self.model.num_queries = num_queries

    def forward(self, imgs):
        return self.model(imgs)


NUM_CLASSES = 2  # for example, object vs. background (adjust if needed)
NUM_QUERIES = 100

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DETRModel(num_classes=NUM_CLASSES, num_queries=NUM_QUERIES).to(device)

Using cache found in /cluster/home/bjorneme/.cache/torch/hub/facebookresearch_detr_main


**Define Loss Function, Optimizer and Scheduler**

In [255]:
# Import DETR helper modules (ensure that DETR is cloned and its path appended)
DETR_DIR = 'detr'

if os.path.exists(DETR_DIR) == False:
    !git clone https://github.com/facebookresearch/detr.git

import sys
sys.path.append(DETR_DIR)

from detr.models.matcher import HungarianMatcher
from detr.models.detr import SetCriterion

matcher = HungarianMatcher()
weight_dict = {'loss_ce': 1, 'loss_bbox': 1, 'loss_giou': 1}
losses = ['labels', 'boxes', 'cardinality']
criterion = SetCriterion(NUM_CLASSES - 1, matcher, weight_dict, eos_coef=0.5, losses=losses).to(device)

optimizer = optim.Adam(model.parameters(), lr=2e-5)

# **Step 8: Train the Model**

In [256]:
EPOCHS = 3
for epoch in range(EPOCHS):
    model.train()
    train_loss = 0.0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1} Training")
    for imgs, targets in pbar:
        imgs = [img.to(device) for img in imgs]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        outputs = model(imgs)
        loss_dict = criterion(outputs, targets)
        loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        pbar.set_postfix(loss=train_loss/len(train_loader))
    print(f"Epoch {epoch+1} Training Loss: {train_loss/len(train_loader):.4f}")

Epoch 1 Training:   0%|          | 0/1700 [00:00<?, ?it/s]

Epoch 1 Training:   0%|          | 0/1700 [00:01<?, ?it/s]


KeyError: Caught KeyError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/cluster/home/bjorneme/.local/lib/python3.11/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 32613

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/cluster/home/bjorneme/.local/lib/python3.11/site-packages/torch/utils/data/_utils/worker.py", line 309, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/cluster/home/bjorneme/.local/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/cluster/home/bjorneme/.local/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/tmp/ipykernel_1243452/2255799058.py", line 12, in __getitem__
    image_id = self.image_ids[idx]
               ~~~~~~~~~~~~~~^^^^^
  File "/cluster/home/bjorneme/.local/lib/python3.11/site-packages/pandas/core/frame.py", line 4102, in __getitem__
    indexer = self.columns.get_loc(key)
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/cluster/home/bjorneme/.local/lib/python3.11/site-packages/pandas/core/indexes/base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 32613


# **Step 9: Evaluate the Model**

In [None]:
# TODO

# Use mAP0.5 and mAP0.5:0.95