In [34]:
pip install SimpleITK

Note: you may need to restart the kernel to use updated packages.


In [35]:
import SimpleITK as sitk
import numpy as np
import os
import sys
import shutil
from glob import glob
from collections import OrderedDict
import json

In [36]:
def copy_BraTS_segmentation_and_convert_labels(in_file, out_file):
    # use this for segmentation only!!!
    # nnUNet wants the labels to be continuous. BraTS is 0, 1, 2, 4 -> we make that into 0, 1, 2, 3
#   "0-background"
#   "1-non-enhancing"
#   "2-edema"
#   "3-enhancing"
    
    
    img = sitk.ReadImage(in_file)
    img_npy = sitk.GetArrayFromImage(img)

    uniques = np.unique(img_npy)
    for u in uniques:
        if u not in [0, 1, 2, 4]:
            raise RuntimeError('unexpected label')
            
    seg_new = np.zeros_like(img_npy)
    seg_new[img_npy == 4] = 3
    img_corr = sitk.GetImageFromArray(seg_new)
    img_corr.CopyInformation(img)
    sitk.WriteImage(img_corr, out_file)

In [46]:
# input: full path of user_dir and download_dir using pwd
def organize_data(user_dir, download_dir):
    task_name = 'Task100_Glioblastoma'
    data_dir = 'nnUNet/nnUNet_raw_data_base/nnUNet_raw_data'
    data_dir = os.path.join(user_dir,data_dir)
    data_dir = os.path.join(data_dir,task_name)
    train_data_name = os.path.join(data_dir,'imagesTr')
    train_label_name = os.path.join(data_dir, 'labelsTr')
    test_data_name = os.path.join(data_dir,'imagesTs')
    

    if not os.path.isdir(os.path.join(user_dir, 'nnUNet')):
        raise RuntimeError('Model has not been downloaded.')
  
    os.makedirs(train_data_name, exist_ok=True)
    os.makedirs(train_label_name, exist_ok=True)
    os.makedirs(test_data_name, exist_ok=True)
    
    patients_names = os.listdir(download_dir)
    patients_names =[sample for sample in patients_names if not sample.endswith('.csv')]
    
    ### Due to resource limitation, we choose 15 patients to test our model first 
    patients = ['data/TCGA-02-0033', 'data/TCGA-02-0009', 'data/TCGA-02-0027', 'data/TCGA-02-0011', 'data/TCGA-02-0006', 'data/TCGA-02-0064', 'data/TCGA-02-0069', 'data/TCGA-02-0068', 'data/TCGA-02-0034', 'data/TCGA-02-0047', 'data/TCGA-02-0046', 'data/TCGA-02-0037', 'data/TCGA-02-0059', 'data/TCGA-02-0054', 'data/TCGA-06-0122']
    patients_15 = [i.split('/')[-1] for i in patients]
    patients_names = [x for x in patients_names if x not in patients_15]
    
    
    ### split into train and test
    train_per = 1
    train_idx = int(len(patients_names)*train_per)
    nolabel_patient = []
    no_enough_modalities = []
    for i in range(len(patients_names)):
        directory = os.path.join(download_dir,patients_names[i])
        
        t1_path=glob(os.path.join(directory,'*t1.nii.gz'))
        t2_path=glob(os.path.join(directory,'*t2.nii.gz'))
        flair_path=glob(os.path.join(directory,'*flair.nii.gz'))
        t1_gd = glob(os.path.join(directory,'*t1Gd.nii.gz'))
        
        label_path=glob(os.path.join(directory,'*GlistrBoost_ManuallyCorrected.nii.gz'))
        
        if len(label_path)<1:
            nolabel_patient.append(patients_names[i])
            continue
        
        label_path = label_path[0]
        flag = len(t1_path)+len(t2_path)+len(flair_path)+len(t1_gd)
        if flag<4:
            no_enough_modalities.append(patients_names[i])
            continue
        
        if i<= train_idx:
            shutil.copy(flair_path[0],os.path.join(train_data_name,patients_names[i]+'_0000.nii.gz'))
            shutil.copy(t1_path[0],os.path.join(train_data_name,patients_names[i]+'_0001.nii.gz'))
            shutil.copy(t1_gd[0],os.path.join(train_data_name,patients_names[i]+'_0002.nii.gz'))
            shutil.copy(t2_path[0],os.path.join(train_data_name,patients_names[i]+'_0003.nii.gz'))
        
        else:
            shutil.copy(flair_path[0],os.path.join(test_data_name,patients_names[i]+'_0000.nii.gz'))
            shutil.copy(t1_path[0],os.path.join(test_data_name,patients_names[i]+'_0001.nii.gz'))
            shutil.copy(t1_gd[0],os.path.join(test_data_name,patients_names[i]+'_0002.nii.gz'))
            shutil.copy(t2_path[0],os.path.join(test_data_name,patients_names[i]+'_0003.nii.gz'))
            
        
#         print(label_path)
        copy_BraTS_segmentation_and_convert_labels(label_path, os.path.join(train_label_name,patients_names[i]+'.nii.gz'))

    
    
    patients_names =  [x for x in patients_names if x not in nolabel_patient]  
    print(len(patients_names))
    #### Create json file to save metadata
    json_dict = OrderedDict()
    json_dict['name'] = "TCGA-GBM"
    json_dict['description'] = "nothing"
    json_dict['tensorImageSize'] = "3D"
    json_dict['reference'] = "see TCGA-GBM"
    json_dict['licence'] = "see TCGA-GBM license"
    json_dict['release'] = "Version 1 (Current): 2017/07/17"
    json_dict['modality'] = {
        "0": "FLAIR",
        "1": "T1",
        "2": "T1Gd",   
        "3": "T2"
    }
    json_dict['labels'] = {
        "0": "background",
        "1": "non-enhancing",
        "2": "edema",
        "3": "enhancing",
    }
    json_dict['numTraining'] = len(patients_names)
    json_dict['numTest'] = len(patients_names)-train_idx
    json_dict['training'] = [{'image': "./imagesTr/%s.nii.gz" % i, "label": "./labelsTr/%s.nii.gz" % i} for i in patients_names[:train_idx+1]]
    json_dict['test'] = ["./imagesTs/%s.nii.gz" % i for i in patients_names[train_idx+1:]]
    
    json_p = os.path.join(data_dir,'dataset.json')
    with open(json_p, 'w', encoding='utf-8') as f:
        json.dump(json_dict, f, ensure_ascii=False, indent=4)
    
    return no_enough_modalities, nolabel_patient
            

In [44]:
def other_folders(user_dir):
    user_dir = os.path.join(user_dir,'nnUNet')
    preprocessed = os.path.join(user_dir,'nnUNet_preprocessed')
    output = os.path.join(user_dir, 'RESULTS_FOLDER') 
    
    os.makedirs(preprocessed, exist_ok=True)
    os.makedirs(output, exist_ok=True)

In [48]:
if __name__ == '__main__':
#     download_dir = sys.args[1]
    user_dir = '/Users/yuanqizhao/Desktop/Bioimage_Project'
    download_dir = '/Users/yuanqizhao/Desktop/Bioimage_Project/Pre-operative_TCGA_GBM_NIfTI_and_Segmentations'
    
    no_enough_modalities, nolabel_patient = organize_data(user_dir, download_dir)
    print('modality', no_enough_modalities)
    print('label', nolabel_patient)
    other_folders(user_dir)

82
modality []
label ['TCGA-12-3650', '.DS_Store', 'TCGA-08-0509', 'TCGA-06-0238', 'TCGA-02-0070', 'TCGA-08-0520']


Train = 87
Test = 10
label = 97
reference https://github.com/MIC-DKFZ/nnUNet/blob/master/nnunet/dataset_conversion/Task043_BraTS_2019.py