## ------- How to : build a nnUNet_translation dataset -------

In [1]:
import glob
import json
import os
import shutil
import subprocess
from concurrent.futures import ThreadPoolExecutor

import nibabel as nib
import numpy as np
from tqdm import tqdm

# Folders holding the input modality (MR) and the target modality (CT).
data_dir = 'data/mr/'
target_dir = 'data/ct/'

# Folders nnU-Net reads from / writes to, exposed through environment variables.
os.environ.update({
    'nnUNet_results': 'results/',
    'nnUNet_raw': 'raw/',
    'nnUNet_preprocessed': 'preprocessed/',
})

# Example with a single input modality: one NIfTI file per case in each folder.
# Both lists are sorted so that the i-th input and the i-th target line up.
list_datas = glob.glob(os.path.join(data_dir, '*.nii.gz'))
list_datas.sort()
list_targets = glob.glob(os.path.join(target_dir, '*.nii.gz'))
list_targets.sort()

# Show how many files were found on each side, and which ones.
for found_files in (list_datas, list_targets):
    print(len(found_files), found_files)

5 ['data/mr/1PA070.nii.gz', 'data/mr/1PA073_0000.nii.gz', 'data/mr/1PA074_0000.nii.gz', 'data/mr/1PA076_0000.nii.gz', 'data/mr/1PA079_0000.nii.gz']
5 ['data/ct/1PA070_0000.nii.gz', 'data/ct/1PA073_0000.nii.gz', 'data/ct/1PA074_0000.nii.gz', 'data/ct/1PA076_0000.nii.gz', 'data/ct/1PA079_0000.nii.gz']


#### Define dataset ID and make paths

In [2]:
# /!\ two consecutive ids are used: dataset_id (input modality) and dataset_id + 1 (target modality)
dataset_id = 50
dataset_data_name = 'SynthRAD2023_Pelvis_MR'
dataset_target_name = 'SynthRAD2023_Pelvis_CT'

# The files are copied into two raw nnU-Net datasets, one per modality.
raw_root = os.environ['nnUNet_raw']
dataset_data_path = os.path.join(raw_root, f'Dataset{dataset_id:03d}_{dataset_data_name}')
dataset_target_path = os.path.join(raw_root, f'Dataset{dataset_id+1:03d}_{dataset_target_name}')

# NOTE: exist_ok=True means an already existing dataset folder is reused without any error.
# Switch to exist_ok=False if you prefer a failure when the dataset already exists.
for dataset_root in (dataset_data_path, dataset_target_path):
    os.makedirs(dataset_root, exist_ok=True)
    for subfolder in ('imagesTr', 'labelsTr'):
        os.makedirs(os.path.join(dataset_root, subfolder), exist_ok=True)

#### Copy files and create dummy masks

In [3]:
def process_file(data_path, dataset_path, mat=None):
    """Copy one NIfTI volume into a raw dataset folder and write its dummy mask.

    The volume is saved as ``imagesTr/<case>_0000.nii.gz`` and an all-ones mask of
    the same shape is saved as ``labelsTr/<case>.nii.gz``, both under ``dataset_path``.

    Args:
        data_path: path of the source ``.nii.gz`` file, with or without a trailing
            ``_0000`` channel tag before the extension.
        dataset_path: dataset folder that already contains ``imagesTr`` and ``labelsTr``.
        mat: affine stored in the mask file. When ``None`` (default), the affine of
            the loaded volume is used so the mask and its image share one geometry.
    """
    extension = '.nii.gz'
    channel_tag = '_0000'

    curr_nifti = nib.load(data_path)

    # Strip only the *trailing* extension and channel tag: a blanket
    # str.replace('_0000', '') would also mangle ids that contain '_0000' elsewhere.
    case_id = os.path.basename(data_path)
    if case_id.endswith(extension):
        case_id = case_id[:-len(extension)]
    if case_id.endswith(channel_tag):
        case_id = case_id[:-len(channel_tag)]

    curr_nifti.to_filename(os.path.join(dataset_path, f'imagesTr/{case_id}{channel_tag}{extension}'))

    # Adjust the mask as needed for your specific use case. By default, the mask is set to 1 for the entire volume.
    # This will be used for foreground preprocessing, cf https://github.com/MIC-DKFZ/nnUNet/blob/master/documentation/explanation_normalization.md
    # Only the shape is needed, so the voxel data is not loaded; uint8 keeps the label file small.
    mask = np.ones(curr_nifti.shape, dtype=np.uint8)

    affine = curr_nifti.affine if mat is None else mat
    # Masks carry no channel tag in their file name.
    nib.Nifti1Image(mask, affine).to_filename(os.path.join(dataset_path, f'labelsTr/{case_id}{extension}'))

# Each mask is written with the affine of its own image. Sharing the affine of a single
# file across all cases would give every other mask a geometry that disagrees with its image.
copy_jobs = (
    (list_datas, dataset_data_path),
    (list_targets, dataset_target_path),
)

for source_paths, destination in copy_jobs:
    with ThreadPoolExecutor() as executor:
        # `dst=destination` binds the current folder to the lambda explicitly.
        list(tqdm(
            executor.map(lambda path, dst=destination: process_file(path, dst, nib.load(path).affine), source_paths),
            total=len(source_paths),
        ))

#### without multithreading
# for data_path in tqdm(list_datas, total=len(list_datas)):
#     process_file(data_path, dataset_data_path, nib.load(data_path).affine)

# for target_path in tqdm(list_targets, total=len(list_targets)):
#     process_file(target_path, dataset_target_path, nib.load(target_path).affine)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.21s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.21it/s]


#### Create the dataset.json

In [4]:
# /!\ edit `channel_names` below if you use more than one input modality.
def _build_dataset_json(channel_name, num_training):
    """Return the dataset.json content for a single-channel dataset with one dummy label."""
    return {
        "labels": {
            # label values are integers, matching the type used for "background"
            "label_001": 1,
            "background": 0
        },
        "channel_names": {
            "0": channel_name,
        },
        "numTraining": num_training,
        "file_ending": ".nii.gz"
    }


# Input modality dataset
data_dataset_json = _build_dataset_json("MR", len(list_datas))
dump_data_datasets_path = os.path.join(dataset_data_path, 'dataset.json')
with open(dump_data_datasets_path, 'w') as f:
    json.dump(data_dataset_json, f)

# Target modality dataset
target_dataset_json = _build_dataset_json("CT", len(list_targets))
dump_target_datasets_path = os.path.join(dataset_target_path, 'dataset.json')
with open(dump_target_datasets_path, 'w') as f:
    json.dump(target_dataset_json, f)

#### Apply preprocessing and unpacking 

In [9]:
# Remove any inherited matplotlib backend setting to avoid conflicts in the nnU-Net commands.
os.environ.pop('MPLBACKEND', None)

# Plan, preprocess and unpack both datasets (input modality first, then target modality).
# check=True raises CalledProcessError as soon as a command fails, instead of silently
# continuing with a missing or partial preprocessed dataset.
for current_dataset_id in (dataset_id, dataset_id + 1):
    subprocess.run(
        ['nnUNetv2_plan_and_preprocess', '-d', str(current_dataset_id), '-c', '3d_fullres'],
        check=True,
    )
    subprocess.run(
        ['nnUNetv2_unpack', str(current_dataset_id), '3d_fullres', '0'],
        check=True,
    )

Using device: cuda:0

#######################################################################
Please cite the following paper when using nnU-Net:
Isensee, F., Jaeger, P. F., Kohl, S. A., Petersen, J., & Maier-Hein, K. H. (2021). nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation. Nature methods, 18(2), 203-211.
#######################################################################



0

#### Define 2nd modality raw data as gt_segmentations of 1st modality
##### originally used for computing metrics / postprocessing, not sure if needed

In [20]:
# NOTE: these two names are swapped relative to what they hold and are kept only because the
# next cell uses them:
#   nnunet_datas_preprocessed_dir   -> preprocessed folder of Dataset{dataset_id + 1} (target modality)
#   nnunet_targets_preprocessed_dir -> preprocessed folder of Dataset{dataset_id}     (input modality)
nnunet_datas_preprocessed_dir = os.path.join(os.environ['nnUNet_preprocessed'], f'Dataset{dataset_id+1:03d}_{dataset_target_name}')
nnunet_targets_preprocessed_dir = os.path.join(os.environ['nnUNet_preprocessed'], f'Dataset{dataset_id:03d}_{dataset_data_name}')

# Raw target-modality images, stored under a new name so that `list_targets` keeps its original content.
raw_target_images = sorted(glob.glob(os.path.join(dataset_target_path, 'imagesTr', '*.nii.gz')))
list_gt_segmentations_datas = sorted(glob.glob(os.path.join(nnunet_targets_preprocessed_dir, 'gt_segmentations', '*.nii.gz')))

print(nnunet_targets_preprocessed_dir)

if not list_gt_segmentations_datas:
    raise FileNotFoundError(f'no gt_segmentations found in {nnunet_targets_preprocessed_dir}; did preprocessing succeed?')

# Index the raw target images by case id, so pairs are matched by name rather than by position.
image_suffix = '_0000.nii.gz'
raw_target_by_case = {
    os.path.basename(path)[:-len(image_suffix)]: path
    for path in raw_target_images
    if path.endswith(image_suffix)
}

for gt_path in list_gt_segmentations_datas:
    # here, gt_path is the path to the gt_segmentation in nnUNet_preprocessed.
    case_id = os.path.basename(gt_path)[:-len('.nii.gz')]
    if case_id not in raw_target_by_case:
        raise ValueError(f'no raw target image found for case {case_id!r} ({gt_path})')
    preprocessed_path = raw_target_by_case[case_id]
    print(preprocessed_path, "->", gt_path) # shows the file pairing
    shutil.copy(src = preprocessed_path, dst = gt_path) # we use shutil.copy to ensure safety, but switching to shutil.move would be more efficient

preprocessed/Dataset050_SynthRAD2023_Pelvis_MR
raw/Dataset051_SynthRAD2023_Pelvis_CT/imagesTr/1PA070_0000.nii.gz -> preprocessed/Dataset050_SynthRAD2023_Pelvis_MR/gt_segmentations/1PA070.nii.gz
raw/Dataset051_SynthRAD2023_Pelvis_CT/imagesTr/1PA073_0000.nii.gz -> preprocessed/Dataset050_SynthRAD2023_Pelvis_MR/gt_segmentations/1PA073.nii.gz
raw/Dataset051_SynthRAD2023_Pelvis_CT/imagesTr/1PA074_0000.nii.gz -> preprocessed/Dataset050_SynthRAD2023_Pelvis_MR/gt_segmentations/1PA074.nii.gz
raw/Dataset051_SynthRAD2023_Pelvis_CT/imagesTr/1PA076_0000.nii.gz -> preprocessed/Dataset050_SynthRAD2023_Pelvis_MR/gt_segmentations/1PA076.nii.gz
raw/Dataset051_SynthRAD2023_Pelvis_CT/imagesTr/1PA079_0000.nii.gz -> preprocessed/Dataset050_SynthRAD2023_Pelvis_MR/gt_segmentations/1PA079.nii.gz


#### Define 2nd modality preprocessed files as ground truth of 1st modality
##### used in training, definitely needed

In [27]:
# Segmentation arrays of the input-modality dataset: these are the files that get overwritten.
list_preprocessed_datas_seg_path = sorted(glob.glob(os.path.join(nnunet_targets_preprocessed_dir, 'nnUNetPlans_3d_fullres/*_seg.npy')))

# Image arrays of the target-modality dataset. The `_seg` files are excluded by file name suffix,
# so a folder or case id that merely contains '_seg' does not filter everything out.
list_preprocessed_targets_path = [
    path
    for path in sorted(glob.glob(os.path.join(nnunet_datas_preprocessed_dir, 'nnUNetPlans_3d_fullres/*.npy')))
    if not os.path.basename(path).endswith('_seg.npy')
]

if not list_preprocessed_datas_seg_path:
    raise FileNotFoundError(f'no *_seg.npy files found in {nnunet_targets_preprocessed_dir}; did preprocessing/unpacking succeed?')

# Match files by case id rather than by position in the sorted lists, so that a missing
# case raises an error instead of silently shifting every following pair.
targets_by_case = {os.path.basename(path)[:-len('.npy')]: path for path in list_preprocessed_targets_path}

for datas_path in list_preprocessed_datas_seg_path:
    case_id = os.path.basename(datas_path)[:-len('_seg.npy')]
    if case_id not in targets_by_case:
        raise ValueError(f'no preprocessed target found for case {case_id!r} ({datas_path})')
    targets_path = targets_by_case[case_id]
    print(targets_path, "->", datas_path)
    shutil.copy(src = targets_path, dst = datas_path)

preprocessed/Dataset051_SynthRAD2023_Pelvis_CT/nnUNetPlans_3d_fullres/1PA070.npy -> preprocessed/Dataset050_SynthRAD2023_Pelvis_MR/nnUNetPlans_3d_fullres/1PA070_seg.npy
preprocessed/Dataset051_SynthRAD2023_Pelvis_CT/nnUNetPlans_3d_fullres/1PA073.npy -> preprocessed/Dataset050_SynthRAD2023_Pelvis_MR/nnUNetPlans_3d_fullres/1PA073_seg.npy
preprocessed/Dataset051_SynthRAD2023_Pelvis_CT/nnUNetPlans_3d_fullres/1PA074.npy -> preprocessed/Dataset050_SynthRAD2023_Pelvis_MR/nnUNetPlans_3d_fullres/1PA074_seg.npy
preprocessed/Dataset051_SynthRAD2023_Pelvis_CT/nnUNetPlans_3d_fullres/1PA076.npy -> preprocessed/Dataset050_SynthRAD2023_Pelvis_MR/nnUNetPlans_3d_fullres/1PA076_seg.npy
preprocessed/Dataset051_SynthRAD2023_Pelvis_CT/nnUNetPlans_3d_fullres/1PA079.npy -> preprocessed/Dataset050_SynthRAD2023_Pelvis_MR/nnUNetPlans_3d_fullres/1PA079_seg.npy


#### That's it!
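You should be able to start training with the commands below (replace the three example paths with the `nnUNet_raw`, `nnUNet_preprocessed` and `nnUNet_results` folders used in this notebook):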
```
export nnUNet_raw="/data/alonguefosse/nnUNet/raw"
export nnUNet_preprocessed="/data/alonguefosse/nnUNet/preprocessed"
export nnUNet_results="/data/alonguefosse/nnUNet/results"

nnUNetv2_train 50 3d_fullres 0 -tr nnUNetTrainerMRCT
```