# Data creation for Pediatric GM Segmentation
1. Understand data skew
2. Obtain directories to put in training, validation, and test sets
3. Create numpy blocks of training, validation, and testing sets
4. Validate whether datasets match original numpy arrays from directories

In [None]:
import os
import nibabel as nib
import numpy as np
import subprocess

### Data Skew (1) and getting directory values (2)

Split the data into a 60/20/20 breakdown for train, val, and test respectively. The skew recorded in DTI_Normal_level.xlsx was handled by balancing Lower Thoracic, Thoracic, and Cervical data across the splits. Although cervical data predominates, the training data is fairly balanced, and the val and test sets are similarly balanced. *train_dirs, val_dirs, and test_dirs* list the directories of the dataset split that handles the skew. There will be imperfections, but this is a relatively balanced method.

In [None]:
# Directory names (relative to the base data directory) assigned to each split.
# Brackets already allow line continuation, so no backslashes are needed.

# 41 training directories
train_dirs = [
    '140', '162', '162_second', '172', '172_second', '166_second',
    '195', '21', '106', '208', '205', '237',
    '88', '93', '61', '113', '92_second', '114',
    '143', '163', '153', '90', '125', '105',
    '175', '144', '58', '196', '102', '235',
    '197', '207', '191', '124', '189', '226',
    '157', '194', '180', '210', '187',
]

# 13 validation directories
val_dirs = [
    '179_second', '199', '82', '142', '115', '135',
    '247', '127', '161', '55', '181', '244',
    '150',
]

# 14 testing directories
test_dirs = [
    '173_second', '86', '53', '70', '56', '160',
    '116', '52', '155', '174', '169', '148',
    '201', '112',
]

### Create the training, validation, and testing datasets (3)

Go through the directories, first appending each numpy array to a list and then converting the lists into a single numpy array.

In [None]:
# File names looked up inside every directory of the splits:
# the segmentation file is loaded into the *_targets arrays,
# the cropped image file into the *_data arrays.
mFFE_gmseg = 'mFFE_crop_r_gmseg.nii.gz'
mFFE_crop = 'mFFE_crop_r.nii.gz'

# Base directory; each entry of train_dirs / val_dirs / test_dirs is joined onto it
working_dir = '/Users/captain/Documents/combined_roi/pediatric_gmseg'

In [None]:
# Training data
# Load the image (mFFE_crop) and segmentation (mFFE_gmseg) volume from every
# training directory, join them along axis 2, and save the results as .npy
# files in the base directory.
train_data = []
train_targets = []

# Files are addressed by full path instead of os.chdir() + relative names, so a
# failed load cannot leave the notebook stranded inside a subject directory.
# The loop variable is not called `dir`, which would shadow the builtin.
for subject in train_dirs:
    subject_dir = os.path.join(working_dir, subject)
    train_data.append(nib.load(os.path.join(subject_dir, mFFE_crop)).get_fdata())
    train_targets.append(nib.load(os.path.join(subject_dir, mFFE_gmseg)).get_fdata())

# Keep the cell's post-condition: the current directory is the base directory
os.chdir(working_dir)

# Turn lists into numpy arrays (volumes are joined along axis 2)
train_data = np.concatenate(train_data, axis=2)
train_targets = np.concatenate(train_targets, axis=2)

# Save numpy arrays in the base directory
np.save(os.path.join(working_dir, 'train_data.npy'), train_data)
np.save(os.path.join(working_dir, 'train_targets.npy'), train_targets)

In [None]:
# Validation data
# Load the image (mFFE_crop) and segmentation (mFFE_gmseg) volume from every
# validation directory, join them along axis 2, and save the results as .npy
# files in the base directory.
val_data = []
val_targets = []

# Files are addressed by full path instead of os.chdir() + relative names, so a
# failed load cannot leave the notebook stranded inside a subject directory.
# The loop variable is not called `dir`, which would shadow the builtin.
for subject in val_dirs:
    subject_dir = os.path.join(working_dir, subject)
    val_data.append(nib.load(os.path.join(subject_dir, mFFE_crop)).get_fdata())
    val_targets.append(nib.load(os.path.join(subject_dir, mFFE_gmseg)).get_fdata())

# Keep the cell's post-condition: the current directory is the base directory
os.chdir(working_dir)

# Turn lists into numpy arrays (volumes are joined along axis 2)
val_data = np.concatenate(val_data, axis=2)
val_targets = np.concatenate(val_targets, axis=2)

# Save numpy arrays in the base directory
np.save(os.path.join(working_dir, 'val_data.npy'), val_data)
np.save(os.path.join(working_dir, 'val_targets.npy'), val_targets)

In [None]:
# Testing data
# Load the image (mFFE_crop) and segmentation (mFFE_gmseg) volume from every
# testing directory, join them along axis 2, and save the results as .npy
# files in the base directory.
test_data = []
test_targets = []

# Files are addressed by full path instead of os.chdir() + relative names, so a
# failed load cannot leave the notebook stranded inside a subject directory.
# The loop variable is not called `dir`, which would shadow the builtin.
for subject in test_dirs:
    subject_dir = os.path.join(working_dir, subject)
    test_data.append(nib.load(os.path.join(subject_dir, mFFE_crop)).get_fdata())
    test_targets.append(nib.load(os.path.join(subject_dir, mFFE_gmseg)).get_fdata())

# Keep the cell's post-condition: the current directory is the base directory
os.chdir(working_dir)

# Turn lists into numpy arrays (volumes are joined along axis 2)
test_data = np.concatenate(test_data, axis=2)
test_targets = np.concatenate(test_targets, axis=2)

# Save numpy arrays in the base directory
np.save(os.path.join(working_dir, 'test_data.npy'), test_data)
np.save(os.path.join(working_dir, 'test_targets.npy'), test_targets)

### Validating .npy arrays (4)

Validate that each .npy file truly contains the intended data by comparing it to the NIfTI files in the original directories.

In [None]:
# return to base dir
os.chdir(working_dir)

# Check that the saved test arrays reproduce the per-directory NIfTI volumes.
# Each directory's block along axis 2 is compared against the freshly loaded
# file; one boolean per directory is collected in data_equal / target_equal.
samp_data = np.load(os.path.join(working_dir, 'test_data.npy'))
samp_targets = np.load(os.path.join(working_dir, 'test_targets.npy'))

# Running offsets along axis 2. The extent of each block is taken from the
# volume itself instead of a fixed 14, so one volume with a different depth
# does not misalign every comparison after it.
data_start = 0
target_start = 0
data_equal = []
target_equal = []

# Full paths are used (no os.chdir in the loop), so the cell finishes in the
# base directory; `subject` avoids shadowing the builtin dir().
for subject in test_dirs:
    subject_dir = os.path.join(working_dir, subject)
    ref_data = nib.load(os.path.join(subject_dir, mFFE_crop)).get_fdata()
    ref_targets = nib.load(os.path.join(subject_dir, mFFE_gmseg)).get_fdata()

    data_stop = data_start + ref_data.shape[2]
    target_stop = target_start + ref_targets.shape[2]

    data_equal.append(np.array_equal(samp_data[:, :, data_start:data_stop], ref_data))
    target_equal.append(np.array_equal(samp_targets[:, :, target_start:target_stop], ref_targets))

    data_start = data_stop
    target_start = target_stop

# Show the outcome, as the SCT validation cell does for its own check
print(data_equal)
print(target_equal)

### Creating SCT GMSeg data for model comparison

Go through the testing directories and calculate the SCT GMSeg for each one. Load the resulting arrays and combine them into a single numpy array of GMSeg targets.

In [None]:
# Array to append SCT gmseg to
sct_gm = []

# file name
sct_gm_seg = 'mFFE_crop_r_gmseg_test.nii.gz'

# Go through the testing directories and compute gm seg.
# The tool is run with cwd set to the directory, so it still receives the same
# relative file names without changing the notebook's own working directory.
# check=True raises subprocess.CalledProcessError on a non-zero exit status,
# so a failed run cannot be followed by loading an output file left over from
# an earlier run.
for subject in test_dirs:
    print(subject)
    subject_dir = os.path.join(working_dir, subject)
    subprocess.run(
        ['sct_deepseg_gm', '-i', mFFE_crop, '-o', sct_gm_seg],
        cwd=subject_dir,
        check=True,
    )
    sct_gm.append(nib.load(os.path.join(subject_dir, sct_gm_seg)).get_fdata())

# Keep the cell's post-condition: the current directory is the base directory
os.chdir(working_dir)

# Turn lists into numpy arrays (volumes are joined along axis 2)
sct_targets = np.concatenate(sct_gm, axis=2)

# Save numpy arrays in the base directory
np.save(os.path.join(working_dir, 'sct_targets.npy'), sct_targets)

#### Validate SCT gmseg target files

Load the saved numpy file and perform a comparison similar to the one performed on the test data above.

In [None]:
# return to base dir
os.chdir(working_dir)

# file name
sct_gm_seg = 'mFFE_crop_r_gmseg_test.nii.gz'

# Check that the saved SCT target array reproduces the per-directory SCT
# segmentation files; one boolean per test directory goes into target_equal.
sct_targets = np.load(os.path.join(working_dir, 'sct_targets.npy'))

# Running offset along axis 2. The extent of each block is taken from the
# volume itself instead of a fixed 14, so one volume with a different depth
# does not misalign every comparison after it.
start_slice = 0
target_equal = []

# Full paths are used (no os.chdir in the loop), so the cell finishes in the
# base directory; `subject` avoids shadowing the builtin dir().
for subject in test_dirs:
    reference = nib.load(os.path.join(working_dir, subject, sct_gm_seg)).get_fdata()
    end_slice = start_slice + reference.shape[2]
    target_equal.append(np.array_equal(sct_targets[:, :, start_slice:end_slice], reference))
    start_slice = end_slice

print(target_equal)