# Pancreas Dataset


In [None]:
def make_if_dont_exist(folder_path,overwrite=False):
    """
    creates a folder if it does not exists
    input: 
    folder_path : relative path of the folder which needs to be created
    over_write :(default: False) if True overwrite the existing folder 
    """
    if os.path.exists(folder_path):
        
        if not overwrite:
            print(f"{folder_path} exists.")
        else:
            print(f"{folder_path} overwritten")
            shutil.rmtree(folder_path)
            os.makedirs(folder_path)

    else:
      os.makedirs(folder_path)
      print(f"{folder_path} created!")

# Maybe move path of preprocessed data directly on content - this may be signifcantely faster!
print("Current Working Directory {}".format(os.getcwd()))
path_dict = {
    "nnUNet_raw" : os.path.join(os.getcwd(), "nnUNet_raw"), 
    "nnUNet_preprocessed" : os.path.join(os.getcwd(), "nnUNet_preprocessed"), # 1 experiment: 1 epoch took 112s
    "nnUNet_results" : os.path.join(os.getcwd(), "nnUNet_results"),
    "RAW_DATA_PATH" : os.path.join(os.getcwd(), "original_data"), # This is used here only for convenience (not necessary for nnU-Net)!
}

# Write paths to environment variables
for env_var, path in path_dict.items():
  os.environ[env_var] = path 

# Check whether all environment variables are set correct!
for env_var, path in path_dict.items():
  if os.getenv(env_var) != path:
    print("Error:")
    print("Environment Variable {} is not set correctly!".format(env_var))
    print("Should be {}".format(path))
    print("Variable is {}".format(os.getenv(env_var)))
  make_if_dont_exist(path, overwrite=False)

print("If No Error Occured Continue Forward. =)")

In [None]:
# os.environ['nnUNet_raw_data_base'] = '/scratch/alif/nnUNet/original_data'
os.environ['nnUNet_raw_data_base'] = '/scratch/alif/nnUNet/nnUNet_raw_data_base'
os.environ['nnUNet_preprocessed'] = '/scratch/alif/nnUNet/nnUNet_preprocessed'
os.environ['RESULTS_FOLDER'] = '/scratch/alif/nnUNet/nnUNet_trained_models'

In [138]:
import os
import shutil
import json
import nibabel as nib
import numpy as np

# Define the source directories
source_train = 'original_data/UHN-MedImg3D-ML-quiz/train'
source_val = 'original_data/UHN-MedImg3D-ML-quiz/validation'
source_test = 'original_data/UHN-MedImg3D-ML-quiz/test'

# Define the target directories
target_base = 'original_data'
task_name = 'Task006_PancreasUHN'
target_task_dir = os.path.join(target_base, task_name)
images_tr_dir = os.path.join(target_task_dir, 'imagesTr')
images_ts_dir = os.path.join(target_task_dir, 'imagesTs')
labels_tr_dir = os.path.join(target_task_dir, 'labelsTr')

# Create the target directories if they don't exist
os.makedirs(images_tr_dir, exist_ok=True)
os.makedirs(images_ts_dir, exist_ok=True)
os.makedirs(labels_tr_dir, exist_ok=True)

# Initialize class mapping dictionary
class_mapping = {}

# Function to correct label values
def correct_labels(file_path):
    img = nib.load(file_path)
    data = img.get_fdata()

    print(f"Original unique labels in {file_path}: {np.unique(data)}")

    # Round to nearest valid value
    data_corrected = np.round(data)

    # Explicitly set near values to the exact intended labels
    data_corrected[np.isclose(data_corrected, 0.0)] = 0.0
    data_corrected[np.isclose(data_corrected, 1.0)] = 1.0
    data_corrected[np.isclose(data_corrected, 2.0)] = 2.0

    print(f"Corrected unique labels in {file_path}: {np.unique(data_corrected)}")
    print(f"----------------------------------------------------------------------\n\n")

    # Save the corrected label file to a temporary path
    temp_file_path = file_path.replace('.nii.gz', '_corrected.nii.gz')
    # corrected_img = nib.Nifti1Image(data_corrected, img.affine, img.header)
    corrected_img = nib.Nifti1Image(data_corrected, img.affine)
    
    nib.save(corrected_img, temp_file_path)
    
    return temp_file_path, np.unique(data_corrected), np.unique(data)

# Function to move files and record class labels
def move_files(source, image_dest, label_dest, class_mapping, case_id=0):
    
    if not label_dest:
        case_id = case_id
        total = len(os.listdir(source))
        for file in os.listdir(source):
            if file.endswith('0000.nii.gz'):
                src_image_file = os.path.join(source, file)

                case_str_image = f"case_{case_id:03d}_0000.nii.gz"
                dest_image_file = os.path.join(image_dest, case_str_image)
                shutil.copy(src_image_file, dest_image_file)
                
                print(f"""({case_id}/{total}) Source Image: {src_image_file} 
                      Dest Image: {dest_image_file}
                      Labels: No Class, Not Annotated
                """)
                case_id += 1
                
        return case_id
            
    else:
        case_id = case_id
        for subtype in os.listdir(source):
            subtype_path = os.path.join(source, subtype)
            if os.path.isdir(subtype_path):
                class_id = int(os.path.basename(subtype_path)[-1])
                total = len(os.listdir(subtype_path))
                for file in os.listdir(subtype_path):
                    if file.endswith('0000.nii.gz'):
                        src_image_file = os.path.join(subtype_path, file)

                        case_str_image = f"case_{case_id:03d}_0000.nii.gz"
                        dest_image_file = os.path.join(image_dest, case_str_image)
                        shutil.copy(src_image_file, dest_image_file)

                        src_label_file = src_image_file.replace('_0000.nii.gz', '.nii.gz')
                        
                        # Correct the label file before copying
                        corrected_label_file, unique_labels, orig_labels = correct_labels(src_label_file)
                        # print(corrected_label_file)
                        # print(unique_labels)
                        # print(orig_labels)

                        case_str_label = f"case_{case_id:03d}.nii.gz"
                        dest_label_file = os.path.join(label_dest, case_str_label)
                        
                        shutil.copy(corrected_label_file, dest_label_file)
                        
                        class_mapping[case_str_image] = class_id
                        print(f"""
                        
----------------------------------------------------------------------
({case_id}/{total}) 

Source Image: {src_image_file}
Source Label: {src_label_file}
Corrected Label: {corrected_label_file}
Dest Image: {dest_image_file}
Dest Label: {dest_label_file}
Labels: Class {class_id}, Annotated, Unique labels after correction: {unique_labels}
                        
                        """)
                        case_id += 1
                        # Remove the temporary corrected label file
                        os.remove(corrected_label_file)
                        
        return case_id

# Move training files and record class labels
case_id = move_files(source_train, images_tr_dir, labels_tr_dir, class_mapping, 1)
print("\n\n##################\n\n")
case_id = move_files(source_val, images_tr_dir, labels_tr_dir, class_mapping, case_id)
print("\n\n##################\n\n")
case_id = move_files(source_test, images_ts_dir, None, class_mapping, case_id)

# Save the class mapping to a JSON file
with open(os.path.join(target_task_dir, 'class_mapping.json'), 'w') as f:
    json.dump(class_mapping, f)

print("Data restructuring complete and class mapping saved.")


Original unique labels in original_data/UHN-MedImg3D-ML-quiz/train/subtype2/quiz_2_416.nii.gz: [0. 1. 2.]
Corrected unique labels in original_data/UHN-MedImg3D-ML-quiz/train/subtype2/quiz_2_416.nii.gz: [0. 1. 2.]
----------------------------------------------------------------------



                        
----------------------------------------------------------------------
(1/169) 

Source Image: original_data/UHN-MedImg3D-ML-quiz/train/subtype2/quiz_2_416_0000.nii.gz
Source Label: original_data/UHN-MedImg3D-ML-quiz/train/subtype2/quiz_2_416.nii.gz
Corrected Label: original_data/UHN-MedImg3D-ML-quiz/train/subtype2/quiz_2_416_corrected.nii.gz
Dest Image: original_data/Task006_PancreasUHN/imagesTr/case_001_0000.nii.gz
Dest Label: original_data/Task006_PancreasUHN/labelsTr/case_001.nii.gz
Labels: Class 2, Annotated, Unique labels after correction: [0. 1. 2.]
                        
                        
Original unique labels in original_data/UHN-MedImg3D-ML-quiz/train/subtype2

In [148]:
# import os


# # Directory containing the files
# directory = 'nnUNet_raw_data_base/nnUNet_raw_data/Task006_PancreasUHN/imagesTs'
# start_number = 289
# files = sorted(os.listdir(directory))


# for i, filename in enumerate(files):

#     new_filename = f"case_{start_number + i:03d}_0000.nii.gz"
    
#     old_file = os.path.join(directory, filename)
#     new_file = os.path.join(directory, new_filename)
    
#     os.rename(old_file, new_file)

#     print(f'Renamed: {filename} -> {new_filename}')

In [139]:
import os
import numpy as np
import nibabel as nib

def load_nifti_image(file_path):
    """Load a NIfTI image and return the data as a numpy array."""
    nifti_image = nib.load(file_path)
    return nifti_image.get_fdata()

def collect_unique_labels(labels_dir):
    """Collect and print all unique labels in the dataset."""
    unique_labels = set()
    
    label_files = sorted(os.listdir(labels_dir))
    
    for lbl_file in label_files:
        lbl_path = os.path.join(labels_dir, lbl_file)
        labels = load_nifti_image(lbl_path)
        unique_labels.update(np.unique(labels))
    
    return unique_labels

In [140]:
# Define the path to the labels directory
pancreas_labels_dir = 'original_data/Task006_PancreasUHN/labelsTr'

# Collect unique labels
pancreas_unique_labels = collect_unique_labels(pancreas_labels_dir)

# Print the unique labels
print(f"Unique labels in the dataset: {sorted(pancreas_unique_labels)}")

Unique labels in the dataset: [0.0, 1.0, 2.0]


In [141]:
! mv original_data/Task006_PancreasUHN/ nnUNet_raw_data_base/nnUNet_raw_data/

In [142]:
train_dir = '/scratch/alif/nnUNet/nnUNet_raw_data_base/nnUNet_raw_data/Task006_PancreasUHN/imagesTs'
test_dir = '/scratch/alif/nnUNet/nnUNet_raw_data_base/nnUNet_raw_data/Task006_PancreasUHN/imagesTs'

In [143]:
# remove_suffix(test_dir)

In [149]:
import os
import json

def create_dataset_json(base_dir, task_name, num_training, num_test):
    
    imagesTr_dir = os.path.join(base_dir, task_name, 'imagesTr')
    labelsTr_dir = os.path.join(base_dir, task_name, 'labelsTr')
    imagesTs_dir = os.path.join(base_dir, task_name, 'imagesTs')

    # Get list of training and test images
    training_images = sorted(os.listdir(imagesTr_dir))
    test_images = sorted(os.listdir(imagesTs_dir))

    # Ensure the number of requested training and test images does not exceed available images
    num_training = min(num_training, len(training_images))
    num_test = min(num_test, len(test_images))

    # Create the dataset dictionary
    dataset = {
        "name": "Pancreas_UHN",
        "description": "Segmentation of pancreatic structures using UHN dataset",
        "tensorImageSize": "3D",
        "reference": "",
        "licence": "",
        "release": "0.0",
        "modality": {
            "0": "CT"
        },
        "labels": {
            "0": "background",
            "1": "pancreas",
            "2": "lesion"
            # Add other structures as needed
        },
        "numTraining": num_training,
        "numTest": num_test,
        "training": [],
        "test": []
    }

    # Populate the training field
    # In the dataset file, no files should have _0000 suffix
    for i in range(num_training):
        dataset['training'].append({
            # "image": f"./imagesTr/{training_images[i]}",
            "image": f"./imagesTr/{training_images[i].replace('_0000.nii.gz', '.nii.gz')}",
            "label": f"./labelsTr/{training_images[i].replace('_0000.nii.gz', '.nii.gz')}"
        })

    # Populate the test field
    for i in range(num_test):
        # dataset['test'].append(f"./imagesTs/{test_images[i]}")
        dataset['test'].append(f"./imagesTs/{test_images[i].replace('_0000.nii.gz', '.nii.gz')}")

    # Save the dataset.json file
    with open(os.path.join(base_dir, task_name, 'dataset.json'), 'w') as f:
        json.dump(dataset, f, indent=4)

# Define the parameters
base_dir = '/scratch/alif/nnUNet/nnUNet_raw_data_base/nnUNet_raw_data/'
task_name = 'Task006_PancreasUHN'  # Replace 002 with the actual task number
num_training = 288  # Set the number of training samples
num_test = 72  # Set the number of test samples

# Create the dataset.json file
create_dataset_json(base_dir, task_name, num_training, num_test)
print('Done')

Done


In [150]:
! nnUNet_plan_and_preprocess -t 06 --verify_dataset_integrity



Please cite the following paper when using nnUNet:

Isensee, F., Jaeger, P.F., Kohl, S.A.A. et al. "nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation." Nat Methods (2020). https://doi.org/10.1038/s41592-020-01008-z


If you have questions or suggestions, feel free to open an issue at https://github.com/MIC-DKFZ/nnUNet

Verifying training set
checking case case_001
checking case case_002
checking case case_003
checking case case_004
checking case case_005
checking case case_006
checking case case_007
checking case case_008
checking case case_009
checking case case_010
checking case case_011
checking case case_012
checking case case_013
checking case case_014
checking case case_015
the spacing does not match between the images
(0.814453125, 0.814453125, 0.699999988079071)
(0.814453125, 0.814453125, 0.70001220703125)
The geometry of the image /scratch/alif/nnUNet/nnUNet_raw_data_base/nnUNet_raw_data/Task006_PancreasUHN/imagesTr/case_015 does not ma

In [154]:
! nnUNet_plan_and_preprocess -t 06 -tf 14 -tl 14



Please cite the following paper when using nnUNet:

Isensee, F., Jaeger, P.F., Kohl, S.A.A. et al. "nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation." Nat Methods (2020). https://doi.org/10.1038/s41592-020-01008-z


If you have questions or suggestions, feel free to open an issue at https://github.com/MIC-DKFZ/nnUNet

case_001
case_013
case_007
case_019
case_025
case_031
case_037
case_043
case_049
case_055
case_061
case_067
case_073
case_079
before crop: (1, 68, 143, 176) after crop: (1, 68, 143, 176) spacing: [1.5        0.62890625 0.62890625] 

before crop: (1, 73, 97, 161) after crop: (1, 73, 97, 161) spacing: [1.5        0.71679688 0.71679688] 

before crop: (1, 117, 149, 204) after crop: (1, 117, 149, 204) spacing: [1.        0.7265625 0.7265625] 

before crop: (1, 45, 157, 155) after crop: (1, 45, 157, 155) spacing: [3.         0.69921875 0.69921875] 

case_008
before crop: (1, 50, 124, 201) after crop: (1, 50, 124, 201) spacing: [3.     

In [156]:
! nnUNet_train 3d_fullres nnUNetTrainerV2_Custom Task006_PancreasUHN 0



Please cite the following paper when using nnUNet:

Isensee, F., Jaeger, P.F., Kohl, S.A.A. et al. "nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation." Nat Methods (2020). https://doi.org/10.1038/s41592-020-01008-z


If you have questions or suggestions, feel free to open an issue at https://github.com/MIC-DKFZ/nnUNet

###############################################
I am running the following nnUNet: 3d_fullres
My trainer class is:  <class 'nnunet.training.network_training.custom_trainer.nnUNetTrainerV2_Custom'>
For that I will be using the following configuration:
num_classes:  2
modalities:  {0: 'CT'}
use_mask_for_norm OrderedDict([(0, False)])
keep_only_largest_region None
min_region_size_per_class None
min_size_per_class None
normalization_schemes OrderedDict([(0, 'CT')])
stages...

stage:  0
{'batch_size': 2, 'num_pool_per_axis': [4, 5, 5], 'patch_size': array([ 64, 128, 192]), 'median_patient_size_in_voxels': array([ 59, 117, 181]), 'curre