# Kaggle Notebook

In [None]:
# This will add the pydicom decompressors
!pip install pydicom pylibjpeg pylibjpeg-libjpeg
# Re-install a compatible version of numpy
!pip install numpy==1.26.4

## !!STOP AND RESTART SESSION!!
## YOU ONLY NEED TO DO THIS ONCE
## This makes sure the above dependencies are installed, and we're able to decompress images.
## You can skip the first two cells after restarting.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import pydicom
import h5py
import cv2
import os
from tqdm.auto import tqdm
import warnings
import json
from sklearn.model_selection import train_test_split

# Seed used for sampling the negative slices and for shuffling the combined subset
random_state = 24

# Suppress the DeprecationWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Set paths to the Kaggle data
# CSV of fracture bounding boxes (x, y, width, height per study and slice number)
bbox_csv_path = '/kaggle/input/rsna-2022-cervical-spine-fracture-detection/train_bounding_boxes.csv'

# We're using the clean version from the metadata dataset
meta_csv_path = '/kaggle/input/rsna-2022-spine-fracture-detection-metadata/meta_train_clean.csv'

# This is the path to all 136GB of images
# Slices are read from <image_data_path>/<StudyInstanceUID>/<slice number>.dcm
image_data_path = '/kaggle/input/rsna-2022-cervical-spine-fracture-detection/train_images/'

# Set the output path
hdf5_save_path = '/kaggle/working/fracture_dataset_subset.h5'

# Settings
image_size = 256 # side length in pixels that every slice is resized to before it is stored
negative_ratio = 3 # for each positive, we add 3 negative samples
max_boxes = 10 # number of box slots stored per slice in the HDF5 file; boxes beyond this are dropped

In [3]:
# Load and view metadata
# Each row describes one slice: study UID, slice number, image height/width,
# slice thickness and patient position (see the preview below).
meta_df = pd.read_csv(meta_csv_path)
meta_df.head(3)

Unnamed: 0,StudyInstanceUID,Slice,ImageHeight,ImageWidth,SliceThickness,ImagePositionPatient_x,ImagePositionPatient_y,ImagePositionPatient_z
0,1.2.826.0.1.3680043.10001,1,512,512,0.625,-52.308,-27.712,7.282
1,1.2.826.0.1.3680043.10001,2,512,512,0.625,-52.308,-27.712,6.657
2,1.2.826.0.1.3680043.10001,3,512,512,0.625,-52.308,-27.712,6.032


In [4]:
# Load and view bounding box data
# Each row is one box (x, y, width, height) on one slice of one study;
# a slice may appear in several rows if it has several boxes.
bbox_df = pd.read_csv(bbox_csv_path)
bbox_df.head(3)

Unnamed: 0,StudyInstanceUID,x,y,width,height,slice_number
0,1.2.826.0.1.3680043.10051,219.27715,216.71419,17.3044,20.38517,133
1,1.2.826.0.1.3680043.10051,221.5646,216.71419,17.87844,25.24362,134
2,1.2.826.0.1.3680043.10051,216.82151,221.62546,27.00959,26.37454,135


In [5]:
print('Loading and merging metadata...')

# Give the slice-index column the same name in both tables so they can be joined on it
meta_df = meta_df.rename(columns={'Slice': 'SliceNumber'})
bbox_df = bbox_df.rename(columns={'slice_number': 'SliceNumber'})

# Collapse all box rows of one (study, slice) pair into a single list of
# [x, y, width, height] entries
print('Aggregating bounding boxes per slice...')
slice_keys = ['StudyInstanceUID', 'SliceNumber']
boxes_per_slice = bbox_df.groupby(slice_keys)[['x', 'y', 'width', 'height']].apply(
    lambda group: group.values.tolist()
)
bbox_grouped = boxes_per_slice.reset_index(name='bboxes')
print(f"Found {len(bbox_grouped)} slices with bounding boxes.")

# Left join: every metadata slice is kept, slices without boxes get NaN in 'bboxes'.
# The label is derived first (1 where boxes exist, 0 otherwise), then the NaN
# placeholders are replaced by empty lists so the column always holds lists.
all_slices_df = meta_df.merge(bbox_grouped, on=slice_keys, how='left').assign(
    is_positive=lambda frame: frame['bboxes'].notna().astype(int),
    bboxes=lambda frame: frame['bboxes'].apply(
        lambda value: value if isinstance(value, list) else []
    ),
)

print('Metadata merge complete.')

Loading and merging metadata...
Aggregating bounding boxes per slice...
Found 7217 slices with bounding boxes.
Metadata merge complete.


In [6]:
print('Creating positive/negative subset...')

# Partition the slices by label
slice_labels = all_slices_df['is_positive']
positive_samples = all_slices_df.loc[slice_labels.eq(1)]
negative_samples = all_slices_df.loc[slice_labels.eq(0)]

# Ask for negative_ratio negatives per positive, but never more than exist
num_positives = len(positive_samples)
requested_negatives = int(num_positives * negative_ratio)
num_negatives_to_sample = min(requested_negatives, len(negative_samples))

print(f'Total positive slices: {num_positives}')
print(f'Sampling {num_negatives_to_sample} negative slices (Ratio: {negative_ratio})')

# Reproducible random draw of the negatives
negative_samples_subset = negative_samples.sample(
    n=num_negatives_to_sample, random_state=random_state
)

# Stack positives and sampled negatives, shuffle the rows, then renumber them 0..n-1
combined_samples = pd.concat([positive_samples, negative_samples_subset])
shuffled_samples = combined_samples.sample(frac=1, random_state=random_state)
final_subset_df = shuffled_samples.reset_index(drop=True)

print(f'\n--- Total samples to process: {len(final_subset_df)} ---')

Creating positive/negative subset...
Total positive slices: 7217
Sampling 21651 negative slices (Ratio: 3)

--- Total samples to process: 28868 ---


In [7]:
# Create a train/validation/test split

# Create a 70% train, 15% validation, 15% test split.
# We stratify on 'is_positive' to ensure all sets have a similar fracture ratio.
# The split uses its own fixed seed (42) so it stays stable even if random_state changes.
# NOTE(review): rows are individual slices, so slices from the same StudyInstanceUID
# can end up in different splits. If the evaluation must be patient-independent,
# split on study UIDs instead - confirm what downstream training expects.

# First, split into 70% train and 30% (val + test)
train_df, val_test_df = train_test_split(
    final_subset_df,
    test_size=0.30, # 30% will be for val+test
    random_state=42,
    stratify=final_subset_df['is_positive']
)

# Next, split the 30% (val + test) in half (15% and 15%)
val_df, test_df = train_test_split(
    val_test_df,
    test_size=0.50, # 50% of the 30% -> 15% of the total
    random_state=42,
    stratify=val_test_df['is_positive']
)

# Add a 'split' column to each DataFrame
# 0 = train
# 1 = validation
# 2 = test
# assign() builds a new frame rather than writing into the object returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
train_df = train_df.assign(split=0)
val_df = val_df.assign(split=1)
test_df = test_df.assign(split=2)

# Recombine them into one final DataFrame, restoring the original row order
final_subset_df = pd.concat([train_df, val_df, test_df]).sort_index()

# Sanity checks: no slice lost or duplicated by the split
assert len(final_subset_df) == len(train_df) + len(val_df) + len(test_df)
assert final_subset_df.index.is_unique

print('\n--- Final Dataset with Splits ---')
print(f'Total samples: {len(final_subset_df)}')
print(f'  Training samples:   {len(train_df)}')
print(f'  Validation samples: {len(val_df)}')
print(f'  Test samples:       {len(test_df)}')


--- Final Dataset with Splits ---
Total samples: 28868
  Training samples:   20207
  Validation samples: 4330
  Test samples:       4331


In [8]:
# Show the box lists as they are before the scaling cell below runs
final_subset_df['bboxes'] # before box scaling

0                                    []
1                                    []
2                                    []
3        [[141.0, 152.0, 235.0, 106.0]]
4                                    []
                      ...              
28863                                []
28864                                []
28865                                []
28866     [[195.0, 171.0, 115.0, 67.0]]
28867                                []
Name: bboxes, Length: 28868, dtype: object

In [9]:
# Scale bounding boxes from original image coordinates to the resized
# image_size x image_size grid.
# Nominal factor for the common case of a 512x512 slice resized to 256x256.
# It is kept for reference; the factors actually applied below are computed per
# slice from its recorded ImageWidth / ImageHeight, so slices that are not
# 512x512 are scaled correctly as well.
scale_factor = 0.5

def scale_bboxes(bbox_list, scale_factor):
    """Return a scaled copy of a list of [x, y, width, height] boxes.

    Args:
        bbox_list: list of 4-element boxes; an empty list (negative sample) is allowed.
        scale_factor: either a single number applied to all four values, or a
            (scale_x, scale_y) pair where scale_x multiplies x and width and
            scale_y multiplies y and height.

    Returns:
        A new list of scaled boxes; the input list is left untouched.
    """
    if not bbox_list:  # for negative sample
        return []

    if np.ndim(scale_factor) == 0:
        scale_x = scale_y = scale_factor
    else:
        scale_x, scale_y = scale_factor

    return [
        [x * scale_x, y * scale_y, w * scale_x, h * scale_y]
        for x, y, w, h in bbox_list
    ]

# Keep the unscaled boxes once, so re-running this cell always starts from the
# originals instead of shrinking already-scaled boxes a second time.
if 'bboxes_original' not in final_subset_df.columns:
    final_subset_df['bboxes_original'] = final_subset_df['bboxes']

# The image is resized to image_size in both directions, so the horizontal and
# vertical factors are image_size / width and image_size / height of each slice.
# NOTE(review): assumes ImageWidth / ImageHeight in the metadata match the pixel
# array that the boxes were annotated on - confirm against the DICOM headers.
final_subset_df['bboxes'] = pd.Series(
    [
        scale_bboxes(boxes, (image_size / width, image_size / height))
        for boxes, width, height in zip(
            final_subset_df['bboxes_original'],
            final_subset_df['ImageWidth'],
            final_subset_df['ImageHeight'],
        )
    ],
    index=final_subset_df.index,
    dtype=object,  # keep each cell a Python list, even if all rows have the same length
)

print(f'Bounding boxes successfully scaled down for {image_size}x{image_size} size.')
final_subset_df['bboxes'] # after box scaling

Bounding boxes successfully scaled down for 256x256 size.


0                                 []
1                                 []
2                                 []
3        [[70.5, 76.0, 117.5, 53.0]]
4                                 []
                    ...             
28863                             []
28864                             []
28865                             []
28866     [[97.5, 85.5, 57.5, 33.5]]
28867                             []
Name: bboxes, Length: 28868, dtype: object

In [11]:
# Image preprocessing function
def load_and_process_dicom(uid, slice_num, target_size):
    """Read one DICOM slice, min-max normalise it and resize it to a square.

    Args:
        uid: StudyInstanceUID; names the study folder under image_data_path.
        slice_num: slice number; the file read is '<slice_num>.dcm' in that folder.
        target_size: side length in pixels of the returned image.

    Returns:
        float32 image resized to (target_size, target_size), normalised to
        [0, 1] from the slice's own minimum and maximum.
        NOTE(review): assumes pixel_array is a single 2-D frame - confirm.

    Raises:
        Any error raised by pydicom while reading or decoding the file; the
        caller is expected to handle unreadable slices.
    """
    # os.path.join avoids the doubled slash produced when image_data_path
    # already ends with '/'
    file_path = os.path.join(image_data_path, str(uid), f'{slice_num}.dcm')

    ds = pydicom.dcmread(file_path)

    # Get pixel array (stored values as-is; no rescale or windowing is applied here)
    img = ds.pixel_array.astype(np.float32)

    # Perform min-max normalization (scales pixel intensity to range [0,1])
    img_min = np.min(img)
    img_max = np.max(img)
    if img_max > img_min:
        img = (img - img_min) / (img_max - img_min)
    else:
        # Constant image: return zeros, in float32 like every other slice
        img = np.zeros(img.shape, dtype=np.float32)

    # Resize
    img = cv2.resize(img, (target_size, target_size), interpolation=cv2.INTER_LINEAR)

    return img

In [12]:
## This cell was made using Google Gemini AI ##

num_samples = len(final_subset_df)
print(f"Creating HDF5 file at {hdf5_save_path} with {num_samples} total samples...")

# We'll write in chunks of 32 images at a time
image_chunk = (32, image_size, image_size) 
bbox_chunk = (32, max_boxes, 4)
text_chunk = (512,) # A good chunk size for 1D arrays

with h5py.File(hdf5_save_path, 'w') as hf:
    
    # --- Create datasets ---
    dset_images = hf.create_dataset('images', shape=(num_samples, image_size, image_size), dtype='f4', chunks=image_chunk)
    dset_labels = hf.create_dataset('labels', shape=(num_samples,), dtype='i1', chunks=text_chunk)
    dset_bboxes = hf.create_dataset('bboxes', shape=(num_samples, max_boxes, 4), dtype='f4', fillvalue=-1.0, chunks=bbox_chunk)
    dt_str = h5py.special_dtype(vlen=str)
    dset_uid = hf.create_dataset('StudyInstanceUID', (num_samples,), dtype=dt_str, chunks=text_chunk)
    dset_slice = hf.create_dataset('SliceNumber', (num_samples,), dtype=dt_str, chunks=text_chunk)
    dset_split = hf.create_dataset('split', shape=(num_samples,), dtype='i1', chunks=text_chunk)

    # --- Start the processing loop ---
    print("Starting processing loop... This will be fast!")
    
    for idx, row in tqdm(final_subset_df.iterrows(), total=num_samples, desc="Processing slices"):
        try:
            # 1. Load and process the image from the local disk
            img = load_and_process_dicom(
                row['StudyInstanceUID'], 
                row['SliceNumber'], 
                image_size
            )
            
            # 2. Save data to HDF5 file
            dset_images[idx] = img
            dset_labels[idx] = row['is_positive']
            dset_uid[idx] = row['StudyInstanceUID']
            dset_slice[idx] = str(row['SliceNumber'])
            dset_split[idx] = row['split']
            
            # 3. Process and save bounding boxes
            bboxes = row['bboxes']
            num_boxes = min(len(bboxes), max_boxes)
            
            if num_boxes > 0:
                dset_bboxes[idx, :num_boxes, :] = np.array(bboxes[:num_boxes])
                
        except Exception as e:
            # This will catch any corrupted files
            print(f"\n[Warning] Failed to process slice {row['StudyInstanceUID']}/{row['SliceNumber']}: {e}")

print("\n--- HDF5 file creation complete! ---")
print(f"File saved to: {hdf5_save_path}")
!ls -lh /kaggle/working/

Creating HDF5 file at /kaggle/working/fracture_dataset_subset.h5 with 28868 total samples...
Starting processing loop... This will be fast!


Processing slices:   0%|          | 0/28868 [00:00<?, ?it/s]


--- HDF5 file creation complete! ---
File saved to: /kaggle/working/fracture_dataset_subset.h5
total 7.1G
-rw-r--r-- 1 root root 7.1G Nov 17 05:22 fracture_dataset_subset.h5


In [13]:
# Load API credentials from your private api key dataset
# If you do not have this private dataset setup, see project README for instructions
# Doing this to work around Secret add-ons not working

private_secret_dataset_name = 'DATASET_NAME' ### REPLACE WITH YOUR PRIVATE DATASET NAME ###


CREDENTIALS_PATH = f'/kaggle/input/{private_secret_dataset_name}/kaggle.json'

# Create the hidden .kaggle directory
!mkdir -p ~/.kaggle

# Copy your key from the private dataset to the correct location
!cp '{CREDENTIALS_PATH}' ~/.kaggle/kaggle.json

# Set the correct permissions for the file
!chmod 600 ~/.kaggle/kaggle.json

print('Kaggle API credentials are now in place.')

Kaggle API credentials are now in place.


In [14]:
# Create the private Kaggle dataset

kaggle_username = 'KAGGLE_USERNAME' ### Set KAGGLE_USERNAME ###


# Define your dataset metadata
dataset_metadata = {
  "title": "RSNA 2022 HDF5 Subset",
  "id": f"{kaggle_username}/rsna-2022-hdf5-subset", 
  "licenses": [
    {
      "name": "CC0-1.0"
    }
  ]
}

# Write the metadata file
with open('/kaggle/working/dataset-metadata.json', 'w') as f:
    json.dump(dataset_metadata, f)

# Run the create command
# You may have to wait a few minutes for the dataset to fully load to the Kaggle API system
print('Starting dataset creation... This will upload 7.05 GB.')
!kaggle datasets create -p /kaggle/working/ -r zip

Starting dataset creation... This will upload 7.05 GB.
Starting upload for file fracture_dataset_subset.h5
100%|██████████████████████████████████████| 7.06G/7.06G [02:52<00:00, 43.9MB/s]
Upload successful: fracture_dataset_subset.h5 (7GB)
Starting upload for file .virtual_documents.zip
100%|█████████████████████████████████████████| 22.0/22.0 [00:00<00:00, 49.4B/s]
Upload successful: .virtual_documents.zip (22B)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/datasets/andymalinsky/rsna-2022-hdf5-subset
