In [1]:
# import os
# from audit.utils.commons.file_manager import (
#     list_dirs,
#     list_files,
#     delete_files_by_extension,
#     delete_folders_by_pattern,
#     move_files_to_parent,
#     organize_subfolders_into_named_folders,
#     rename_files,
#     add_string_filenames,
#     rename_directories
# )

In [1]:
import os

# The notebook is expected to live one level below the project root (assumption
# taken from the original unconditional `os.chdir("./..")` — confirm for your
# checkout). Moving to the root makes the `src` package importable and lets the
# relative "./datasets" paths used later resolve.
# The guard keeps this cell idempotent: without it, every re-run would climb one
# more directory level and break both the import and the dataset paths.
if not os.path.isdir("src"):
    os.chdir("./..")

from src.audit.utils.commons.file_manager import (
    list_dirs,
    list_files,
    delete_files_by_extension,
    delete_folders_by_pattern,
    move_files_to_parent,
    organize_subfolders_into_named_folders,
    rename_files,
    add_string_filenames,
    rename_directories
)

The first step when working with any dataset is to analyze its file structure and understand its content. This includes identifying the available sequences, checking whether segmentation data is provided, and organizing the files into the necessary structure to work with AUDIT.

In this tutorial, we will demonstrate the full potential of AUDIT for preprocessing datasets. For this purpose, we have chosen the [LUMIERE](https://doi.org/10.1038/s41597-022-01881-7) dataset, as it contains multiple time points and images that will need to be filtered or adjusted during preprocessing.

To follow along with this tutorial, download the LUMIERE dataset using the following link: [DOWNLOAD LUMIERE](https://doi.org/10.6084/m9.figshare.c.5904905.v1).

In [3]:
# Root folder of the downloaded LUMIERE dataset, relative to the current working directory.
# Keep the trailing slash: later cells build sub-paths by appending to it inside f-strings.
root_data_path = "./datasets/LUMIERE/"

# Data understanding

We can see that the LUMIERE dataset contains a total of 91 directories, one for each subject.

In [4]:
# Peek at both ends of the subject list instead of printing every folder.
subject_dirs = list_dirs(root_data_path)
print(subject_dirs[:5])
print(subject_dirs[-5:])

['Patient-001', 'Patient-002', 'Patient-003', 'Patient-004', 'Patient-005']
['Patient-087', 'Patient-088', 'Patient-089', 'Patient-090', 'Patient-091']


Inside each subject's folder, you will find subdirectories corresponding to the timepoints captured for each patient. The general structure of these directories follows the pattern "week-XXX", where XXX represents the week when the medical image was taken. Additionally, some directories may include a final suffix "-N" (e.g., for patient 001: week-000-1, week-000-2).

To simplify the tutorial and focus on the learning process, we will later remove these additional timepoints.

In [5]:
# Timepoint folders of two subjects, one listing per output line.
print(
    list_dirs(f"{root_data_path}Patient-001"),
    list_dirs(f"{root_data_path}Patient-091"),
    sep="\n",
)

['week-000-1', 'week-000-2', 'week-044', 'week-056']
['week-000', 'week-001', 'week-014', 'week-026', 'week-036', 'week-043']


If we examine the details of a specific timepoint, for example, week-000 for subject 091, we find that it contains different sequences along with predictions generated by two different models: "DeepBraTumIA-segmentation" and "HD-GLIO-AUTO-segmentation".

For the purpose of this tutorial, we will consider the segmentation provided by "DeepBraTumIA-segmentation" as the ground truth from the medical expert.

In [6]:
# Sub-folders first, then loose files, of a single timepoint.
timepoint_subdirs = list_dirs(f"{root_data_path}Patient-091/week-000")
timepoint_files = list_files(f"{root_data_path}Patient-091/week-000")
print(timepoint_subdirs)
print(timepoint_files)

['DeepBraTumIA-segmentation', 'HD-GLIO-AUTO-segmentation']
['CT1.nii.gz', 'FLAIR.nii.gz', 'T1.nii.gz', 'T2.nii.gz']


The four sequences that are of primary interest to us are located within the "atlas/skull_strip" directory of "DeepBraTumIA-segmentation", and their corresponding segmentation can be found in "atlas/segmentation".

In [7]:
# Skull-stripped sequences and their segmentation, one listing per output line.
print(
    list_files(f"{root_data_path}Patient-091/week-000/DeepBraTumIA-segmentation/atlas/skull_strip"),
    list_files(f"{root_data_path}Patient-091/week-000/DeepBraTumIA-segmentation/atlas/segmentation"),
    sep="\n",
)

['brain_mask.nii.gz', 'ct1_skull_strip.nii.gz', 'flair_skull_strip.nii.gz', 't1_skull_strip.nii.gz', 't2_skull_strip.nii.gz']
['measured_volumes_in_mm3.json', 'seg_mask.nii.gz']


# Files cleaning

The sequences located in the root of each timepoint ('CT1.nii.gz', 'FLAIR.nii.gz', 'T1.nii.gz', 'T2.nii.gz') are not of interest to us, since we will work with the skull-stripped versions instead, so they need to be removed. To do this, we will use the delete_files_by_extension function from AUDIT.

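This function has an argument called "safe_mode", which we highly recommend setting to True on a first run: in safe mode the function only prints the files it would delete, without removing anything, so we can check that we are only deleting the correct files.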

In [8]:
# Dry run: with safe_mode=True the matching files are only reported, nothing is removed.
delete_files_by_extension(root_dir=root_data_path, ext='CT1.nii.gz', safe_mode=True)

[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-001/week-000-1/CT1.nii.gz
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-001/week-000-2/CT1.nii.gz
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-001/week-044/CT1.nii.gz
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-001/week-056/CT1.nii.gz
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-002/week-000/CT1.nii.gz
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-002/week-003/CT1.nii.gz
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-002/week-021/CT1.nii.gz
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-002/week-037/CT1.nii.gz
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-002/week-040-1/CT1.nii.gz
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-002/week-040-2/CT1.nii.gz
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-002/week-047/CT1.nii.gz
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-003/week-000-1/CT1.nii.gz
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-003/week-000-

Once we are certain that the files to be deleted are the ones shown by the function, we can go ahead and execute it. We will apply this to all the unnecessary sequences.

In [9]:
# Raw sequence files stored at the root of every timepoint.
sequences_to_delete = ['CT1.nii.gz', 'FLAIR.nii.gz', 'T1.nii.gz', 'T2.nii.gz']

# safe_mode=False: this time the files are actually deleted.
for sequence_file in sequences_to_delete:
    delete_files_by_extension(root_dir=root_data_path, ext=sequence_file, safe_mode=False)

If we run the commands that list directories and files again, we can verify that all the unnecessary files have been successfully deleted.

In [10]:
# The timepoint should still hold both model folders but no loose files.
print(
    list_dirs(f"{root_data_path}Patient-091/week-000"),
    list_files(f"{root_data_path}Patient-091/week-000"),
    sep="\n",
)

['DeepBraTumIA-segmentation', 'HD-GLIO-AUTO-segmentation']
[]


Other files that we have identified as unnecessary are 'atlas/skull_strip/brain_mask.nii.gz' and 'atlas/segmentation/measured_volumes_in_mm3.json'. We will delete these as well.

In [11]:
# Auxiliary files that are not needed: the volume reports (.json) and the brain masks.
files_to_delete = ['.json', 'brain_mask.nii.gz']

# safe_mode=False: the matching files are actually deleted.
for name_ending in files_to_delete:
    delete_files_by_extension(root_dir=root_data_path, ext=name_ending, safe_mode=False)

# Folders cleaning

The next step will be to remove the unnecessary directories, such as 'HD-GLIO-AUTO-segmentation' and the 'native' folders inside 'DeepBraTumIA-segmentation' (the 'atlas' folder must be kept, since it holds the files we need). To do this, we will use a feature from AUDIT called delete_folders_by_pattern. This function is similar to the one seen in the previous step, but it takes an argument called 'pattern'. Using this argument, it will remove all directories that match the specified pattern.

In [12]:
# Dry run: report the folders matching "HD-GLIO" without removing them.
delete_folders_by_pattern(root_dir=root_data_path, pattern="HD-GLIO", safe_mode=True)

[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-088/week-000-2/HD-GLIO-AUTO-segmentation
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-088/week-000-1/HD-GLIO-AUTO-segmentation
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-014/week-000/HD-GLIO-AUTO-segmentation
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-014/week-001/HD-GLIO-AUTO-segmentation
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-014/week-012/HD-GLIO-AUTO-segmentation
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-046/week-000/HD-GLIO-AUTO-segmentation
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-046/week-064/HD-GLIO-AUTO-segmentation
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-046/week-016/HD-GLIO-AUTO-segmentation
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-046/week-001/HD-GLIO-AUTO-segmentation
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-042/week-000-2/HD-GLIO-AUTO-segmentation
[SAFE MODE] Would delete: ./datasets/LUMIERE/Patient-042/week-022/HD-GLIO-

In [13]:
# Same call as the dry run, now with safe_mode=False so the folders are really removed.
delete_folders_by_pattern(root_dir=root_data_path, pattern="HD-GLIO", safe_mode=False)

In [14]:
# Remove every folder matching "native" (no dry run here: safe_mode=False deletes directly).
delete_folders_by_pattern(root_dir=root_data_path, pattern="native", safe_mode=False)

The result we obtained after cleaning up the unnecessary files and directories is as follows:

In [15]:
# No unnecessary files are left at any of these three levels.
print(
    list_files(f"{root_data_path}Patient-091/week-000/"),
    list_files(f"{root_data_path}Patient-091/week-000/DeepBraTumIA-segmentation/"),
    list_files(f"{root_data_path}Patient-091/week-000/DeepBraTumIA-segmentation/atlas"),
    sep="\n",
)

# No unnecessary folders are left at any of these three levels either.
print(
    list_dirs(f"{root_data_path}Patient-091/week-000/"),
    list_dirs(f"{root_data_path}Patient-091/week-000/DeepBraTumIA-segmentation/"),
    list_dirs(f"{root_data_path}Patient-091/week-000/DeepBraTumIA-segmentation/atlas"),
    sep="\n",
)

[]
[]
[]
['DeepBraTumIA-segmentation']
['atlas']
['segmentation', 'skull_strip']


All the images we need—the four sequences and the segmentation—are located in the following directories:

In [16]:
# The files we keep: four sequences plus one segmentation mask.
kept_listings = [
    list_files(f"{root_data_path}Patient-091/week-000/DeepBraTumIA-segmentation/atlas/skull_strip"),
    list_files(f"{root_data_path}Patient-091/week-000/DeepBraTumIA-segmentation/atlas/segmentation"),
]
for listing in kept_listings:
    print(listing)

['ct1_skull_strip.nii.gz', 'flair_skull_strip.nii.gz', 't1_skull_strip.nii.gz', 't2_skull_strip.nii.gz']
['seg_mask.nii.gz']


# Files organization

Although we now have only the files we need, they are located at a very deep directory level, and we need to organize them into the parent folders. To do this, we will use the move_files_to_parent function.

In this case, let's set ext (extension) to None to move all the files.

In [17]:
# The files sit three levels below each timepoint folder
# (DeepBraTumIA-segmentation/atlas/<skull_strip|segmentation>), hence levels_up=3.
# ext=None applies the move to every file regardless of its extension.
move_files_to_parent(root_dir=root_data_path, levels_up=3, ext=None, safe_mode=False)

We can see that all the files have been reorganized as needed for each patient and their corresponding timepoints. Take a look at other features available in AUDIT, such as copy_files_by_extension, if you need to organize the images in a different way.

In [18]:
# The deep folders should now be empty and the timepoint folders should hold the five files.
listings_after_move = [
    list_files(f"{root_data_path}Patient-091/week-000/DeepBraTumIA-segmentation/atlas/skull_strip"),
    list_files(f"{root_data_path}Patient-091/week-000/DeepBraTumIA-segmentation/atlas/segmentation"),
    list_files(f"{root_data_path}Patient-091/week-000/DeepBraTumIA-segmentation/atlas/"),
    list_files(f"{root_data_path}Patient-091/week-000/DeepBraTumIA-segmentation/"),
    list_files(f"{root_data_path}Patient-091/week-000/"),
    list_files(f"{root_data_path}Patient-002/week-047/"),
]
for listing in listings_after_move:
    print(listing)

[]
[]
[]
[]
['ct1_skull_strip.nii.gz', 'flair_skull_strip.nii.gz', 'seg_mask.nii.gz', 't1_skull_strip.nii.gz', 't2_skull_strip.nii.gz']
['ct1_skull_strip.nii.gz', 'flair_skull_strip.nii.gz', 'seg_mask.nii.gz', 't1_skull_strip.nii.gz', 't2_skull_strip.nii.gz']


Let's delete the directories we no longer need.

In [19]:
# The model folders were emptied by the move above, so they can be dropped (safe_mode=False deletes).
delete_folders_by_pattern(root_dir=root_data_path, pattern="DeepBraTumIA-segmentation", safe_mode=False)

In [20]:
# The timepoint folder should now contain files only, with no sub-folders.
remaining_subdirs = list_dirs(f"{root_data_path}Patient-091/week-000/")
remaining_files = list_files(f"{root_data_path}Patient-091/week-000/")
print(remaining_subdirs)
print(remaining_files)

[]
['ct1_skull_strip.nii.gz', 'flair_skull_strip.nii.gz', 'seg_mask.nii.gz', 't1_skull_strip.nii.gz', 't2_skull_strip.nii.gz']


# Folder organization

To align with the structure required by AUDIT, it is recommended that each patient and timepoint be placed in a separate directory. Therefore, all timepoints must be moved to the root folder. Fortunately, the library provides a function called organize_subfolders_into_named_folders, which will reformat subdirectories (in this case, each timepoint) into subject directories.

The "join_char" argument defines the string used to concatenate the parent folder and the child folder. Check the documentation for more detailed information.

In [21]:
# Dry run: report how each <patient>/<timepoint> folder would become <patient>-<timepoint>.
organize_subfolders_into_named_folders(root_dir=root_data_path, join_char="-", safe_mode=True)
    

[SAFE MODE] Would move: ./datasets/LUMIERE/Patient-001/week-000-2/t2_skull_strip.nii.gz -> ./datasets/LUMIERE/Patient-001-week-000-2/t2_skull_strip.nii.gz
[SAFE MODE] Would move: ./datasets/LUMIERE/Patient-001/week-000-2/ct1_skull_strip.nii.gz -> ./datasets/LUMIERE/Patient-001-week-000-2/ct1_skull_strip.nii.gz
[SAFE MODE] Would move: ./datasets/LUMIERE/Patient-001/week-000-2/t1_skull_strip.nii.gz -> ./datasets/LUMIERE/Patient-001-week-000-2/t1_skull_strip.nii.gz
[SAFE MODE] Would move: ./datasets/LUMIERE/Patient-001/week-000-2/seg_mask.nii.gz -> ./datasets/LUMIERE/Patient-001-week-000-2/seg_mask.nii.gz
[SAFE MODE] Would move: ./datasets/LUMIERE/Patient-001/week-000-2/flair_skull_strip.nii.gz -> ./datasets/LUMIERE/Patient-001-week-000-2/flair_skull_strip.nii.gz
[SAFE MODE] Would move: ./datasets/LUMIERE/Patient-001/week-056/t2_skull_strip.nii.gz -> ./datasets/LUMIERE/Patient-001-week-056/t2_skull_strip.nii.gz
[SAFE MODE] Would move: ./datasets/LUMIERE/Patient-001/week-056/ct1_skull_stri

In [22]:
# Same call as the dry run, now with safe_mode=False so the files are actually moved.
organize_subfolders_into_named_folders(root_dir=root_data_path, join_char="-", safe_mode=False)
    

In [23]:
# First few folders after flattening: one folder per patient and timepoint.
first_flattened = list_dirs(root_data_path)[:6]
print(first_flattened)

['Patient-001-week-000-1', 'Patient-001-week-000-2', 'Patient-001-week-044', 'Patient-001-week-056', 'Patient-002-week-000', 'Patient-002-week-003']


As mentioned earlier, for simplicity, we will only keep the timepoints that do not have a "-N" suffix at the end of the corresponding week.

In [24]:
# Folder names of the form Patient-XXX-week-XXX-N, i.e. ending with a single-digit "-N" suffix.
# NOTE(review): presumably interpreted as a regular expression by delete_folders_by_pattern — confirm.
pattern_to_delete = r"^Patient-\d{3}-week-\d{3}-\d$"

# safe_mode=False: the matching folders are deleted without a preview.
delete_folders_by_pattern(root_dir=root_data_path, pattern=pattern_to_delete, safe_mode=False)

In [25]:
# Folders left once the "-N" timepoints are gone, and how many there are.
remaining_dirs = list_dirs(root_data_path)
print(remaining_dirs[:6])
print(len(remaining_dirs))

['Patient-001-week-044', 'Patient-001-week-056', 'Patient-002-week-000', 'Patient-002-week-003', 'Patient-002-week-021', 'Patient-002-week-037']
541


# Sequence name standardization

Finally, to follow a more standardized naming convention, such as the one used in the BraTS dataset, we will rename the sequences and the segmentation to follow a similar pattern. Typically, MRI sequences are named `t1`, `t2`, `t1ce`, and `flair`, and the segmentation is named `seg`. However, the names we currently have do not follow this convention. Let's use `rename_files` to modify them.

In [26]:
# File names before the renaming step.
current_files = list_files(f"{root_data_path}Patient-001-week-044")
print(current_files)

['ct1_skull_strip.nii.gz', 'flair_skull_strip.nii.gz', 'seg_mask.nii.gz', 't1_skull_strip.nii.gz', 't2_skull_strip.nii.gz']


In [29]:
# Current file name -> BraTS-style file name.
brats_names = {
    'ct1_skull_strip.nii.gz': 't1ce.nii.gz',
    'flair_skull_strip.nii.gz': 'flair.nii.gz',
    'seg_mask.nii.gz': 'seg.nii.gz',
    't1_skull_strip.nii.gz': 't1.nii.gz',
    't2_skull_strip.nii.gz': 't2.nii.gz',
}
# Parallel lists, kept because the following cell iterates over them.
old_names = list(brats_names)
new_names = list(brats_names.values())

# Dry run: only report the renames that would take place.
for current_name, target_name in brats_names.items():
    rename_files(root_dir=root_data_path, old_name=current_name, new_name=target_name, safe_mode=True)

[SAFE MODE] Would rename: ./datasets/LUMIERE/Patient-012-week-016/ct1_skull_strip.nii.gz -> ./datasets/LUMIERE/Patient-012-week-016/t1ce.nii.gz
[SAFE MODE] Would rename: ./datasets/LUMIERE/Patient-023-week-001/ct1_skull_strip.nii.gz -> ./datasets/LUMIERE/Patient-023-week-001/t1ce.nii.gz
[SAFE MODE] Would rename: ./datasets/LUMIERE/Patient-024-week-001/ct1_skull_strip.nii.gz -> ./datasets/LUMIERE/Patient-024-week-001/t1ce.nii.gz
[SAFE MODE] Would rename: ./datasets/LUMIERE/Patient-078-week-045/ct1_skull_strip.nii.gz -> ./datasets/LUMIERE/Patient-078-week-045/t1ce.nii.gz
[SAFE MODE] Would rename: ./datasets/LUMIERE/Patient-031-week-046/ct1_skull_strip.nii.gz -> ./datasets/LUMIERE/Patient-031-week-046/t1ce.nii.gz
[SAFE MODE] Would rename: ./datasets/LUMIERE/Patient-067-week-136/ct1_skull_strip.nii.gz -> ./datasets/LUMIERE/Patient-067-week-136/t1ce.nii.gz
[SAFE MODE] Would rename: ./datasets/LUMIERE/Patient-031-week-190/ct1_skull_strip.nii.gz -> ./datasets/LUMIERE/Patient-031-week-190/t1ce

In [30]:
# Apply the renames for real (safe_mode=False), pairing each old name with its new one.
for current_name, target_name in zip(old_names, new_names):
    rename_files(root_dir=root_data_path, old_name=current_name, new_name=target_name, safe_mode=False)

Additionally, to allow AUDIT to locate each image simply by the subject ID, we will name each image with the corresponding subject identifier along with the sequence name. To do this, we will use the add_string_filenames function, which allows us to add both suffixes and prefixes to specific files.

In [31]:
# Dry run on the first two subject folders only, to keep the preview short.
preview_subjects = list_dirs(root_data_path)[:2]
for subject in preview_subjects:
    subject_dir = os.path.join(root_data_path, subject)
    # Prefix every file (ext=None) with "<subject>_".
    add_string_filenames(root_dir=subject_dir, prefix=f"{subject}_", ext=None, safe_mode=True)

[SAFE MODE] Would rename: ./datasets/LUMIERE/Patient-001-week-044/t1ce.nii.gz -> ./datasets/LUMIERE/Patient-001-week-044/Patient-001-week-044_t1ce.nii.gz
[SAFE MODE] Would rename: ./datasets/LUMIERE/Patient-001-week-044/flair.nii.gz -> ./datasets/LUMIERE/Patient-001-week-044/Patient-001-week-044_flair.nii.gz
[SAFE MODE] Would rename: ./datasets/LUMIERE/Patient-001-week-044/t1.nii.gz -> ./datasets/LUMIERE/Patient-001-week-044/Patient-001-week-044_t1.nii.gz
[SAFE MODE] Would rename: ./datasets/LUMIERE/Patient-001-week-044/seg.nii.gz -> ./datasets/LUMIERE/Patient-001-week-044/Patient-001-week-044_seg.nii.gz
[SAFE MODE] Would rename: ./datasets/LUMIERE/Patient-001-week-044/t2.nii.gz -> ./datasets/LUMIERE/Patient-001-week-044/Patient-001-week-044_t2.nii.gz
Safe mode enabled: No files were renamed.
[SAFE MODE] Would rename: ./datasets/LUMIERE/Patient-001-week-056/t1ce.nii.gz -> ./datasets/LUMIERE/Patient-001-week-056/Patient-001-week-056_t1ce.nii.gz
[SAFE MODE] Would rename: ./datasets/LUMIE

In [32]:
# Prefix the files of every subject folder with "<subject>_" (safe_mode=False renames for real).
all_subjects = list_dirs(root_data_path)
for subject in all_subjects:
    subject_dir = os.path.join(root_data_path, subject)
    add_string_filenames(root_dir=subject_dir, prefix=f"{subject}_", ext=None, safe_mode=False)

Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.
Renaming completed.


With this, the project is organized as required to work with AUDIT. Additionally, we recommend placing the images (sequences and segmentations provided by the medical experts) in a directory called DATASET_images, and the segmentations produced by each model in a directory called DATASET_seg. Therefore, to conclude, we will rename the LUMIERE directory to LUMIERE_images.

In [33]:
# Final step: rename the dataset folder LUMIERE -> LUMIERE_images.
# Note that root_data_path still points to the old "LUMIERE" folder after this cell runs.
rename_directories(root_dir="./datasets/", old_name="LUMIERE", new_name="LUMIERE_images", safe_mode=False)