In [1]:
import pandas as pd
import numpy as np
import os
import nibabel as nib
from tqdm import tqdm
import pathlib

In [2]:
base_dir = "../data/MMs/"

base_dir = "/home/alalbiol/Data/mnms/OpenDataset"


In [3]:
def load_nii(img_path):
    """
    Function to load a 'nii' or 'nii.gz' file, The function returns
    everyting needed to save another 'nii' or 'nii.gz'
    in the same dimensional space, i.e. the affine matrix and the header
    :param img_path: (string) Path of the 'nii' or 'nii.gz' image file name
    :return: Three element, the first is a numpy array of the image values (height, width, slices, phases),
             ## (No) the second is the affine transformation of the image, and the
             ## (No) last one is the header of the image.
    """
    nimg = nib.load(img_path)
    return np.asanyarray(nimg.dataobj), nimg.affine, nimg.header

In [10]:
volumes_df = pd.read_csv(pathlib.Path(base_dir) / '211230_M&Ms_Dataset_information_diagnosis_opendataset.csv')
del volumes_df['Sex']
del volumes_df['Age']
del volumes_df['Weight']
del volumes_df['Height']
del volumes_df['Pathology']
del volumes_df['Unnamed: 0']
volumes_df.head()

Unnamed: 0,External code,VendorName,Vendor,Centre,ED,ES
0,A0S9V9,Siemens,A,1,0,9
1,A1D0Q7,Philips,B,2,0,9
2,A1D9Z7,Siemens,A,1,22,11
3,A1E9Q1,Siemens,A,1,0,9
4,A1K2P5,Canon,D,5,33,11


In [12]:
training_labeled = pathlib.Path(base_dir) / 'Training' / 'Labeled'
training_labeled_cases = [x.name for x in training_labeled.iterdir() if x.is_dir()]

training_unlabeled = pathlib.Path(base_dir) / 'Training' / 'Unlabeled'
training_unlabeled_cases = [x.name for x in training_unlabeled.iterdir() if x.is_dir()]

validation = pathlib.Path(base_dir) / 'Validation'
validation_cases = [x.name for x in validation.iterdir() if x.is_dir()]

testing = pathlib.Path(base_dir) / 'Testing'
testing_cases = [x.name for x in testing.iterdir() if x.is_dir()]


def get_partition(case):
    if case in training_labeled_cases:
        return 'Training'
    elif case in training_unlabeled_cases:
        return 'Training'
    elif case in validation_cases:
        return 'Validation'
    elif case in testing_cases:
        return 'Testing'
    else:
        return None
    
def get_labeled(case):
    if case in training_labeled_cases:
        return True
    elif case in training_unlabeled_cases:
        return False
    elif case in validation_cases:
        return True
    elif case in testing_cases:
        return True
    else:
        return False
    
volumes_df['Partition'] = volumes_df['External code'].apply(get_partition)
volumes_df['Labeled'] = volumes_df['External code'].apply(get_labeled)

volumes_df.head()


Unnamed: 0,External code,VendorName,Vendor,Centre,ED,ES,Partition,Labeled
0,A0S9V9,Siemens,A,1,0,9,Training,True
1,A1D0Q7,Philips,B,2,0,9,Training,True
2,A1D9Z7,Siemens,A,1,22,11,Training,True
3,A1E9Q1,Siemens,A,1,0,9,Training,True
4,A1K2P5,Canon,D,5,33,11,Testing,True


In [13]:
slices_info_dict = {
    "External code": [], "VendorName": [], "Vendor": [], 
    "Centre": [], "Partition": [], "Labeled": [], 
    "ED": [], "ES": [], "Slice": [], "Phase": []
}

In [14]:
for index, row in tqdm(volumes_df.iterrows(), total=volumes_df.shape[0]):
    external_code = row["External code"]
    partition = row["Partition"]
    
    if partition == "Training":
        label_str = "Labeled" if row["Labeled"] else "Unlabeled"
        partition = f"{partition}/{label_str}"
    
    nifit_path = os.path.join(
        base_dir, partition, external_code, f"{external_code}_sa.nii.gz"
    )
    nifti_volume = load_nii(nifit_path)[0]
    h,w, c_slices, c_phases = nifti_volume.shape  # h, w, slices, *phases*
    
    for s in range(c_slices):
        for p in range(c_phases):
            # Centre 4 (Vendor C) is 'not' segmented
            labeled = True if p in [row["ED"], row["ES"]] and row["Partition"] == "Training" and row["Centre"] != 4 else False
            
            slices_info_dict["External code"].append(row["External code"])
            slices_info_dict["VendorName"].append(row["VendorName"])
            slices_info_dict["Vendor"].append(row["Vendor"])
            slices_info_dict["Centre"].append(row["Centre"])
            slices_info_dict["Partition"].append(row["Partition"])
            slices_info_dict["Labeled"].append(labeled)
            slices_info_dict["ED"].append(row["ED"])
            slices_info_dict["ES"].append(row["ES"])
            slices_info_dict["Slice"].append(s)
            slices_info_dict["Phase"].append(p)

100%|██████████| 345/345 [02:28<00:00,  2.33it/s]


In [15]:
slices_info_df = pd.DataFrame.from_dict(slices_info_dict)
print(f"There are {len(slices_info_df)} entries")
slices_info_df.head()

There are 106159 entries


Unnamed: 0,External code,VendorName,Vendor,Centre,Partition,Labeled,ED,ES,Slice,Phase
0,A0S9V9,Siemens,A,1,Training,True,0,9,0,0
1,A0S9V9,Siemens,A,1,Training,False,0,9,0,1
2,A0S9V9,Siemens,A,1,Training,False,0,9,0,2
3,A0S9V9,Siemens,A,1,Training,False,0,9,0,3
4,A0S9V9,Siemens,A,1,Training,False,0,9,0,4


In [16]:
slices_info_df.to_csv(os.path.join(base_dir, "slices_info.csv"), index=False)

In [17]:
pd.read_csv(os.path.join(base_dir, "slices_info.csv"))

Unnamed: 0,External code,VendorName,Vendor,Centre,Partition,Labeled,ED,ES,Slice,Phase
0,A0S9V9,Siemens,A,1,Training,True,0,9,0,0
1,A0S9V9,Siemens,A,1,Training,False,0,9,0,1
2,A0S9V9,Siemens,A,1,Training,False,0,9,0,2
3,A0S9V9,Siemens,A,1,Training,False,0,9,0,3
4,A0S9V9,Siemens,A,1,Training,False,0,9,0,4
...,...,...,...,...,...,...,...,...,...,...
106154,Y6Y9Z2,Philips,B,3,Testing,False,29,9,11,25
106155,Y6Y9Z2,Philips,B,3,Testing,False,29,9,11,26
106156,Y6Y9Z2,Philips,B,3,Testing,False,29,9,11,27
106157,Y6Y9Z2,Philips,B,3,Testing,False,29,9,11,28
