# Preprocess - BraTS 2018

This preprocessing pipeline follows the approach used in https://github.com/icerain-alt/brats-unet.

In [None]:
# import kagglehub

# Download latest version
# path = kagglehub.dataset_download("harshitsinghai/miccai-brats2018-original-dataset")

# print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/harshitsinghai/miccai-brats2018-original-dataset?dataset_version_number=1...


100%|██████████| 2.81G/2.81G [02:25<00:00, 20.8MB/s]

Extracting model files...





Path to dataset files: /home/yinn147/.cache/kagglehub/datasets/harshitsinghai/miccai-brats2018-original-dataset/versions/1


In [None]:

import h5py
import os
import numpy as np
import SimpleITK as sitk
from tqdm import tqdm
import torch.nn.functional as F
import torch
import random

In [4]:
## split
def set_seed(seed):
    """Seed all random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator, and PyTorch
    (CPU and every CUDA device), then configures cuDNN for deterministic
    behavior by disabling its benchmark mode.

    Args:
        seed: Integer seed applied to every generator.

    Returns:
        None. A confirmation line is printed to stdout.
    """
    # Python / NumPy global generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators (CPU first, then all CUDA devices).
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Trade cuDNN autotuning for run-to-run determinism.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    message = 'Random seed for this experiment is {} !'.format(seed)
    print(message)


set_seed(42)

# Build an 80/20 train/val split over the subject folders found under `path`
# and write each subject list to a text file (one subject name per line).
path = '/mnt/ExtData/Data/BraTS2018/MICCAI_BraTS_2018_Data_Training/LGG'

# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sorting first makes the seeded shuffle below produce the same split on
# every machine / run.
files = sorted(os.listdir(path))

# Number of training subjects (80 %, rounded down); the rest go to validation.
n_train = int(len(files) * 0.8)
print('total : ', len(files), 'train :', n_train, 'val :', len(files) - n_train)

# Shuffle without repetition; determinism relies on `random` having been
# seeded via set_seed(...) earlier in this cell.
random.shuffle(files)
train_subjects = files[:n_train]
val_subjects = files[n_train:]

with open('/mnt/ExtData/SFDA_DDFP/preprocess/train_brats2018.txt', 'w') as f:
    f.writelines(subject + '\n' for subject in train_subjects)

with open('/mnt/ExtData/SFDA_DDFP/preprocess/val_brats2018.txt', 'w') as f:
    f.writelines(subject + '\n' for subject in val_subjects)


Random seed for this experiment is 42 !
total :  75 train : 60 val : 15


In [None]:
# modalities = ('flair', 't1ce', 't1', 't2')

def process_h5(path, out_path, modalities):
    """Convert one case into per-slice ``.npy`` files (image + one-hot label).

    The segmentation is read from ``path + 'seg.nii.gz'`` and the image from
    ``path + modalities + '.nii.gz'``. Both arrays are transposed with
    ``(1, 2, 0)`` so that the slice axis comes last (the original notes give
    the resulting shape as (240, 240, 155)).

    The image volume is min-max scaled to [0, 1] over the whole volume.
    Label value 4 is remapped to 3 and the label is one-hot encoded into
    4 classes. Every slice whose label contains more than one distinct value
    is saved as a float32 array of shape (H, W, 5): channel 0 is the image,
    channels 1-4 are the one-hot label. Slices whose label holds a single
    value are skipped.

    Args:
        path: Case prefix, e.g. ``<root>/<case>/<case>_``; file suffixes are
            appended directly to it.
        out_path: Directory receiving the ``.npy`` files.
        modalities: Name of the single modality to load (e.g. ``'flair'``).
            Despite the plural name, exactly one modality is read.

    Returns:
        None. Files are written as
        ``<out_path>/<last component of path><modalities><slice:04d>.npy``.
    """
    label = sitk.GetArrayFromImage(sitk.ReadImage(path + 'seg.nii.gz')).transpose(1, 2, 0)
    images = sitk.GetArrayFromImage(sitk.ReadImage(path + modalities + '.nii.gz')).transpose(1, 2, 0)

    label = label.astype(np.int64)
    images = images.astype(np.float32)

    # Keep the input prefix and the output prefix in separate names instead of
    # rebinding the `path` parameter.
    case_name = path.split('/')[-1]
    out_prefix = os.path.join(out_path, case_name)

    # Min-max normalization; a constant volume would otherwise divide by zero
    # and fill the output with NaN.
    lo, hi = images.min(), images.max()
    if hi > lo:
        images = (images - lo) / (hi - lo)
    else:
        images = np.zeros_like(images)

    # Make label ids contiguous (0..3) before one-hot encoding.
    label[label == 4] = 3

    # Cast to float32 so the concatenation below stays float32; mixing float32
    # with the int64 one-hot tensor would silently promote the result to float64.
    one_hot_label = F.one_hot(torch.tensor(label), num_classes=4).numpy().astype(np.float32)  # (H, W, D, 4)

    for slice_i in range(images.shape[2]):
        # A single unique label value means the slice has nothing to segment.
        if len(np.unique(label[:, :, slice_i])) == 1:
            continue

        data = np.concatenate(
            (images[..., slice_i:slice_i + 1], one_hot_label[..., slice_i, :]),
            axis=-1,
        )
        np.save(out_prefix + modalities + str(slice_i).zfill(4) + '.npy', data)


def doit(dset):
    """Run ``process_h5`` on every subject listed in a dataset description.

    Args:
        dset: Mapping with the keys
            ``'root'``  - directory containing one folder per subject,
            ``'out'``   - base output directory,
            ``'mod'``   - modality name passed through to ``process_h5``,
            ``'flist'`` - text file with one subject name per line
                          (joined onto ``root``; an absolute path is used as is),
            ``'phase'`` - optional split name used in the output folder name.
                          When absent, the notebook-global ``phase`` is used,
                          as before.

    Returns:
        None. Slices are written under ``<out>/<mod>_<phase>/`` and
        ``'Finished'`` is printed when all subjects are processed.
    """
    root, out_path, modalities = dset['root'], dset['out'], dset['mod']

    # Prefer an explicit split name; fall back to the global for existing callers.
    split = dset['phase'] if 'phase' in dset else phase
    out_path = os.path.join(out_path, f'{modalities}_{split}')
    os.makedirs(out_path, exist_ok=True)

    file_list = os.path.join(root, dset['flist'])
    # Close the list file deterministically and ignore blank lines, which
    # would otherwise turn into bogus subject paths.
    with open(file_list) as f:
        subjects = [name for name in f.read().splitlines() if name]

    # Each case prefix looks like <root>/<name>/<name>_ ; process_h5 appends
    # the file suffixes to it.
    paths = [os.path.join(root, name, name + '_') for name in subjects]

    for path in tqdm(paths):
        process_h5(path, out_path, modalities)
    print('Finished')


# Export 2D slices for every (modality, split) combination, in the order
# flair/train, flair/val, t2/train, t2/val.
DATA_ROOT = '/mnt/ExtData/Data/BraTS2018/MICCAI_BraTS_2018_Data_Training/LGG'
OUT_ROOT = '/mnt/ExtData/Data/preprocess_brats2018/'

for modality in ('flair', 't2'):
    # `phase` is deliberately a module-level name: doit() reads it when
    # building the output folder name.
    for phase in ('train', 'val'):
        dataset_cfg = {
            'mod': modality,
            'root': DATA_ROOT,
            'out': OUT_ROOT,
            'flist': f'/mnt/ExtData/SFDA_DDFP/preprocess/{phase}_brats2018.txt',
            'phase': phase,
        }
        doit(dataset_cfg)

100%|██████████| 15/15 [00:18<00:00,  1.25s/it]
  0%|          | 0/60 [00:00<?, ?it/s]

Finished


100%|██████████| 60/60 [01:53<00:00,  1.90s/it]


Finished


100%|██████████| 15/15 [00:38<00:00,  2.59s/it]

Finished



