**Import Libraries**

In [17]:
# Standard Libraries
import os
import random

# Data Manipulation Libraries
import numpy as np
import pandas as pd

# Visualization Libraries
import matplotlib.pyplot as plt
import pydicom
from PIL import Image

# Progress Bar
from tqdm import tqdm

# Machine Learning Libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader, random_split

**Define Parameters**

In [18]:
# Paths
ZIP_PATH = '/cluster/home/bjorneme/projects/Data/vinbigdata-chest-xray-abnormalities-detection.zip'
EXTRACTED_PATH = '/cluster/home/bjorneme/projects/Data/vinbigdata-chest-xray-abnormalities-detection-extracted'

# Define parameters
SEED = 42
LEARNING_RATE = 0.005
EPOCHS =  1
BATCH_SIZE = 16

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


**Set Seed for Reproducibility**

In [19]:
def seed_everything(seed=SEED):
    """
    Sets the seed to ensure reproducibility.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Ensure deterministic behavior in CUDA operations
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Apply the seed
seed_everything()

# **Step 1: Load Data**

In [20]:
def extract_data(zip_path, extracted_path):
    """
    Extracts the ZIP file of the dataset.
    """
    os.makedirs(extracted_path, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extracted_path)
    print(f"Data extracted to {extracted_path}")

# Uncomment the line below to extract data (if not already extracted)
# extract_data(ZIP_PATH, EXTRACTED_PATH)

# **Step 2: Data Preprocessing**

In [21]:
# Define Disease Labels
disease_labels = [
    "Aortic enlargement",
    "Atelectasis",
    "Calcification",
    "Cardiomegaly",
    "Consolidation",
    "ILD",
    "Infiltration",
    "Lung Opacity",
    "Nodule/Mass",
    "Other lesion",
    "Pleural effusion",
    "Pleural thickening",
    "Pneumothorax",
    "Pulmonary fibrosis"
]

def load_labels(csv_path, image_path):
    """
    Loads and preprocesses the labels from the CSV file.
    Maps each image to its corresponding file path and binary labels for each disease.
    """

    # Read the CSV file containing labels
    labels_df = pd.read_csv(csv_path)

    # Create binary columns for each disease label
    for disease in disease_labels:
        labels_df[disease] = labels_df['class_name'].str.contains(disease).astype(int)

    # Create a binary column for 'No Finding'
    labels_df['No finding'] = labels_df['class_name'].apply(lambda x: 1 if 'No finding' in x else 0)

    # Map image filenames to their full paths
    labels_df['Path'] = labels_df['image_id'].map(lambda x: os.path.join(image_path, 'train', f"{x}.dicom"))
    
    return labels_df

# Path to the labels CSV file
train_csv_path = os.path.join(EXTRACTED_PATH, 'train.csv')
# test_csv_path = os.path.join(EXTRACTED_PATH, 'test.csv')

# Load and preprocess the labels
train_val_df = load_labels(train_csv_path, EXTRACTED_PATH)
# test_df = load_labels(test_csv_path, EXTRACTED_PATH)

train_val_df = train_val_df[train_val_df['class_id'] != 14]

# Verify Split Sizes
print(f"Train size: {train_val_df.shape[0]}")
# print(f"Test size: {test_df.shape[0]}")

Train size: 36096


In [22]:
df = train_val_df
df.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,Aortic enlargement,Atelectasis,...,Infiltration,Lung Opacity,Nodule/Mass,Other lesion,Pleural effusion,Pleural thickening,Pneumothorax,Pulmonary fibrosis,No finding,Path
2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0,0,0,...,0,0,0,0,0,0,0,0,0,/cluster/home/bjorneme/projects/Data/vinbigdat...
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0,1,0,...,0,0,0,0,0,0,0,0,0,/cluster/home/bjorneme/projects/Data/vinbigdat...
5,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,627.0,357.0,947.0,433.0,0,0,...,0,0,0,0,0,1,0,0,0,/cluster/home/bjorneme/projects/Data/vinbigdat...
6,0c7a38f293d5f5e4846aa4ca6db4daf1,ILD,5,R17,1347.0,245.0,2188.0,2169.0,0,0,...,0,0,0,0,0,0,0,0,0,/cluster/home/bjorneme/projects/Data/vinbigdat...
7,47ed17dcb2cbeec15182ed335a8b5a9e,Nodule/Mass,8,R9,557.0,2352.0,675.0,2484.0,0,0,...,0,0,1,0,0,0,0,0,0,/cluster/home/bjorneme/projects/Data/vinbigdat...


**Define Dataset for VinDr-CXR images**

In [23]:


# %% [markdown]
# ## 2. Function to Convert Annotations to YOLO Format
# 
# YOLO expects a label file (one per image) with one line per object in this format:
# 
# ```
# <class_id> <x_center> <y_center> <width> <height>
# ```
# 
# where all coordinates are normalized by the image width/height.

# %% [code]
def convert_annotation_to_yolo(group, img_size_override=None):
    """
    Given a group of annotations for one image (grouped by image_id),
    read the image size and convert each bounding box.
    Returns the source image path and a list of YOLO-format lines.
    If img_size_override is provided as (w, h), it is used instead.
    """
    img_path = group.iloc[0]['Path']
    try:
        if img_size_override:
            img_width, img_height = img_size_override
        else:
            with Image.open(img_path) as img:
                img_width, img_height = img.size
    except Exception as e:
        print(f"Error reading image {img_path}: {e}")
        return None, []
    
    lines = []
    for _, row in group.iterrows():
        # Compute normalized coordinates
        x_min, y_min, x_max, y_max = row['x_min'], row['y_min'], row['x_max'], row['y_max']
        x_center = ((x_min + x_max) / 2.0) / img_width
        y_center = ((y_min + y_max) / 2.0) / img_height
        box_width = (x_max - x_min) / img_width
        box_height = (y_max - y_min) / img_height
        line = f"{int(row['class_id'])} {x_center:.6f} {y_center:.6f} {box_width:.6f} {box_height:.6f}"
        lines.append(line)
    return img_path, lines

# %% [markdown]
# ## 3. Split the Dataset into Training and Validation
# 
# We use an 80/20 split based on unique image IDs.

# %% [code]
unique_ids = df['image_id'].unique()
train_ids, val_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)
train_df = df[df['image_id'].isin(train_ids)]
val_df   = df[df['image_id'].isin(val_ids)]
print(f"Train images: {len(train_ids)}, Val images: {len(val_ids)}")

# %% [markdown]
# ## 4. Create Directory Structure and Save Images & Labels
# 
# We create:
# - `data/images/train` and `data/images/val`
# - `data/labels/train` and `data/labels/val`
# 
# For each unique image (grouped by `image_id`), we copy the image into the proper folder and write its label file.

# %% [code]
base_dir = "data"
for split in ['train', 'val']:
    os.makedirs(os.path.join(base_dir, "images", split), exist_ok=True)
    os.makedirs(os.path.join(base_dir, "labels", split), exist_ok=True)

def process_split(split_df, split_name):
    groups = split_df.groupby('image_id')
    for image_id, group in groups:
        img_path, yolo_lines = convert_annotation_to_yolo(group)
        if img_path is None:
            continue
        img_filename = os.path.basename(img_path)
        dst_img_path = os.path.join(base_dir, "images", split_name, img_filename)
        # Copy image file to the YOLO images folder
        try:
            shutil.copy(img_path, dst_img_path)
        except Exception as e:
            print(f"Error copying image {img_path}: {e}")
            continue
        # Write YOLO label file
        label_filename = os.path.splitext(img_filename)[0] + ".txt"
        label_path = os.path.join(base_dir, "labels", split_name, label_filename)
        with open(label_path, "w") as f:
            for line in yolo_lines:
                f.write(line + "\n")

print("Processing training set...")
process_split(train_df, "train")
print("Processing validation set...")
process_split(val_df, "val")

# %% [markdown]
# ## 5. Create the YOLO Data YAML File
# 
# This file tells YOLOv5 where your images are, how many classes there are, and the class names.
# Adjust the number of classes (`nc`) and the names list as needed.

# %% [code]
num_classes = 14
data_yaml = {
    "train": os.path.join(os.getcwd(), base_dir, "images", "train"),
    "val": os.path.join(os.getcwd(), base_dir, "images", "val"),
    "nc": num_classes,
    "names": [
        "Aortic enlargement", "Atelectasis", "Calcification", "Cardiomegaly",
        "Consolidation", "ILD", "Infiltration", "Lung Opacity",
        "Nodule/Mass", "Other lesion", "Pleural effusion", "Pleural thickening",
        "Pneumothorax", "Pulmonary fibrosis"
    ]
}

with open("data.yaml", "w") as f:
    yaml.dump(data_yaml, f, default_flow_style=False)

print("data.yaml contents:")
print(open("data.yaml").read())

# %% [markdown]
# ## 6. Train YOLOv5
# 
# In this cell, we clone the YOLOv5 repository (if not already present) and launch training.
# Adjust image size, batch size, number of epochs, and weights as needed.
# 
# If you prefer YOLOv9, swap the training script and weights accordingly.

# %% [code]
# Clone YOLOv5 repository if not present
if not os.path.exists("yolov5"):
    !git clone https://github.com/ultralytics/yolov5.git

# Launch YOLOv5 training:
# Here we use YOLOv5s (small) with image size 1024.
!python yolov5/train.py --img 1024 --batch 16 --epochs 50 --data data.yaml --weights yolov5s.pt --cache

# %% [markdown]
# ## 7. (Optional) Visualize Training Results
# 
# After training, YOLOv5 will save results (plots, metrics, etc.) under `yolov5/runs/train/exp/`.
# Here we display the training results plot if it exists.

# %% [code]
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

results_img = "yolov5/runs/train/exp/results.png"  # Adjust path if needed
if os.path.exists(results_img):
    plt.figure(figsize=(10, 8))
    plt.imshow(mpimg.imread(results_img))
    plt.title("Training Results")
    plt.axis("off")
    plt.show()
else:
    print("Results image not found.")


Train images: 3515, Val images: 879
Processing training set...


Error reading image /cluster/home/bjorneme/projects/Data/vinbigdata-chest-xray-abnormalities-detection-extracted/train/0005e8e3701dfb1dd93d53e2ff537b6e.dicom: cannot identify image file '/cluster/home/bjorneme/projects/Data/vinbigdata-chest-xray-abnormalities-detection-extracted/train/0005e8e3701dfb1dd93d53e2ff537b6e.dicom'
Error reading image /cluster/home/bjorneme/projects/Data/vinbigdata-chest-xray-abnormalities-detection-extracted/train/0007d316f756b3fa0baea2ff514ce945.dicom: cannot identify image file '/cluster/home/bjorneme/projects/Data/vinbigdata-chest-xray-abnormalities-detection-extracted/train/0007d316f756b3fa0baea2ff514ce945.dicom'
Error reading image /cluster/home/bjorneme/projects/Data/vinbigdata-chest-xray-abnormalities-detection-extracted/train/000d68e42b71d3eac10ccc077aba07c1.dicom: cannot identify image file '/cluster/home/bjorneme/projects/Data/vinbigdata-chest-xray-abnormalities-detection-extracted/train/000d68e42b71d3eac10ccc077aba07c1.dicom'
Error reading image /c