In [1]:
from process.preproc_clinic import writeClinic
from process.preproc_img import imgRedir, mvRaw, preprocFSL, preprocCAT12
from process.preproc_combine import genHCData, combinePatData
from src.feature.base import genROI
from src.feature.texture import genTextureFeature, dropByCorrelation
from src.utils.data import getPandas, getConfig, writeConfig
from src.model.stats import stats_analyze
import os
import numpy as np
import pandas as pd

In [2]:
# Root directory as working directory.
# A bare `os.chdir('..')` is relative to the current directory, so re-running
# this cell would climb one more level each time. Resolve the target once per
# kernel session and always change to that absolute path instead.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath('..')
os.chdir(PROJECT_ROOT)
os.getcwd()

'/home/biobot/disk/ldopa'

## Preprocessing & Statistical Analysis

In [None]:
# One-time data setup. The calls below are project helpers whose internals are
# not visible in this notebook; they are kept in exactly this order.
# This cell has no execution count, i.e. it was not run in the saved session.
# Get Image Data
imgRedir()
# Get Clinical Data
writeClinic()
# Combine Data & Generate HC Data
combinePatData()
genHCData()
# Generate Subject Directory & Move Raw Image Data
# (the 'img_raw' table is loaded with getPandas and passed to mvRaw)
mvRaw(getPandas('img_raw'))
# Now all raw image data and clinical data are set up

In [None]:
# Image Preprocessing
# Each table is processed with preprocFSL and then preprocCAT12:
# first the patient table ('pat_data'), then the HC table ('hc_data').
#
# Time Consuming!!!
# All volume data should be saved automatically
for dataset_name in ('pat_data', 'hc_data'):
    preprocFSL(dataset_name)
    preprocCAT12(dataset_name)

In [13]:
# Dataset Split
# One patient (PATNO) may have multiple images, so the split is made on
# patient groups: all rows of a patient end up on the same side.
from sklearn.model_selection import GroupShuffleSplit

data = getPandas('pat_data')
data_conf = getConfig('data')

# Shuffle the rows reproducibly and keep each row's original label in an
# 'index' column, then keep one row per (PATNO, EVENT_ID) pair.
shuffled = data.sample(frac=1, random_state=0).reset_index(drop=False)
pat = shuffled.drop_duplicates(subset=['PATNO', 'EVENT_ID']).reset_index(drop=True)

# A single split with test_size=0.2, grouped by patient number.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
split = splitter.split(pat, groups=pat['PATNO'])
train_pos, test_pos = next(split)

# Map positions inside `pat` back to the row labels of `data`.
original_labels = pat['index'].values
train_inds = original_labels[train_pos]
test_inds = original_labels[test_pos]

In [8]:
# NOTE(review): this cell rebinds train_inds / test_inds, which the
# GroupShuffleSplit cell above also sets. Its execution count (8) is lower
# than that cell's (13), so on a top-to-bottom run THIS split is the one the
# following cells see — confirm which of the two splits is intended.
#
# Alternative split: the test set is sampled only from patients that have a
# single record, with CAT_MDS == 1 / == 0 counts chosen from the overall
# share of CAT_MDS == 1 in `data`.
ratio = len(data[data['CAT_MDS'] == 1]) / len(data)
test_size = int(len(data) * 0.18)
n_test_pos = int(test_size * ratio)
n_test_neg = test_size - n_test_pos

# keep=False marks every row of a PATNO that occurs more than once.
has_multiple_records = data.duplicated(subset=['PATNO'], keep=False)
unique_rec = data[~has_multiple_records]
unique_pos = unique_rec[unique_rec['CAT_MDS'] == 1].reset_index(drop=False)
unique_neg = unique_rec[unique_rec['CAT_MDS'] == 0].reset_index(drop=False)

# The 'index' column carries the row labels of `data`.
test_pos_inds = unique_pos.sample(n=n_test_pos, random_state=0)['index'].values
test_neg_inds = unique_neg.sample(n=n_test_neg, random_state=0)['index'].values
test_inds = np.concatenate([test_pos_inds, test_neg_inds])
# Every row label not picked for testing goes to the training set.
train_inds = np.setdiff1d(data.index.values, test_inds)

In [14]:
# Data Analysis
# Compare the train and test subsets on the demographic and clinical columns
# named in data_conf['data_group'] (plus PATNO) and on the CAT_MDS label.
group = data_conf['data_group']
x = data[group['demo'] + group['clinic'] + ['PATNO']]
y = data[['CAT_MDS']]
# train_inds / test_inds are row *labels* of `data` (they come from its index),
# so select with .loc. Positional .iloc only gives the same rows when the
# index happens to be 0..n-1.
x_clinic_train = x.loc[train_inds].reset_index(drop=True)
x_clinic_test = x.loc[test_inds].reset_index(drop=True)
y_train = y.loc[train_inds].reset_index(drop=True)
y_test = y.loc[test_inds].reset_index(drop=True)
# `print` is passed as the last argument (presumably the reporting callback).
stats_analyze(x_clinic_train, x_clinic_test, y_train, y_test, data_conf, print)

SEX chi2 p: 0.9060069399226178
CAT_MDS chi2 p: 0.27800152208237083
AGE Normaltest p_train: [0.06522379], p_test: [0.80105024]
AGE t-test p: Ttest_indResult(statistic=array([1.24480961]), pvalue=array([0.21479351]))
NUPDR3OF Normaltest p_train: [0.027186], p_test: [0.6951501]
NUPDR3OF ranksums p: RanksumsResult(statistic=array([-1.47081735]), pvalue=array([0.14134052]))
LEDD Normaltest p_train: [8.66247582e-10], p_test: [0.00554728]
LEDD ranksums p: RanksumsResult(statistic=array([-1.64758224]), pvalue=array([0.09943843]))
DURATION Normaltest p_train: [0.1202715], p_test: [0.19042587]
DURATION t-test p: Ttest_indResult(statistic=array([-1.18909099]), pvalue=array([0.23594449]))



In [15]:
# Class balance of the CAT_MDS label in the train and test subsets.
# value_counts() is evaluated once per subset and then looked up by label.
train_counts = y_train['CAT_MDS'].value_counts()
test_counts = y_test['CAT_MDS'].value_counts()

print('Number of CAT_MDS = 0 in train set: {}'.format(train_counts[0]))
print('Number of CAT_MDS = 0 in test set: {}'.format(test_counts[0]))
print('Number of CAT_MDS = 1 in train set: {}'.format(train_counts[1]))
print('Number of CAT_MDS = 1 in test set: {}'.format(test_counts[1]))
print('Ratio of CAT_MDS = 0 in train set: {}'.format(train_counts[0] / len(y_train)))
print('Ratio of CAT_MDS = 0 in test set: {}'.format(test_counts[0] / len(y_test)))

Number of CAT_MDS = 0 in train set: 67
Number of CAT_MDS = 0 in test set: 14
Number of CAT_MDS = 1 in train set: 78
Number of CAT_MDS = 1 in test set: 26
Ratio of CAT_MDS = 0 in train set: 0.46206896551724136
Ratio of CAT_MDS = 0 in test set: 0.35


In [16]:
# Save train/test indices to data config
# Whatever data_conf['indices'] held before is replaced by a dict that has
# only the 'pat' entry; the arrays are converted to plain lists first.
pat_indices = {
    'train': train_inds.tolist(),
    'test': test_inds.tolist(),
}
data_conf['indices'] = {'pat': pat_indices}
writeConfig('data', data_conf)

## Feature Extraction

In [None]:
## Texture Features
# This cell has no execution count, i.e. it was not run in the saved session.
# NOTE(review): 'ANTs_Reg' is also the name of a column that a later cell adds
# to the patient table. Presumably genTextureFeature reads that column, in
# which case the stored table must already contain it — confirm.
# Generate ROI
genROI()
# Generate Texture Features
genTextureFeature('pat_data', 'ANTs_Reg')

In [19]:
# Load the 'pat_data' table again; `data` is rebound to the freshly loaded frame.
data = getPandas('pat_data')

In [21]:
from xml.dom import minidom

# New column name -> path components below each row's IMG_ROOT.
IMG_PATH_COLUMNS = {
    'ANTs_Reg': ('fsl', 'reg.nii.gz'),
    'FSL_GM': ('fsl', 'reg_gm.nii'),
    'FSL_WM': ('fsl', 'reg_wm.nii'),
    'FSL_CSF': ('fsl', 'reg_csf.nii'),
    'FSL_SGM': ('fsl', 'sreg_gm_masked.nii'),
    'CAT12_GM': ('cat12', 'mri', 'mwp1raw.nii'),
    'CAT12_SGM': ('cat12', 'mri', 'smwp1raw_masked.nii'),
    'CAT12_WM': ('cat12', 'mri', 'mwp2raw.nii'),
    'CAT12_CSF': ('cat12', 'mri', 'mwp3raw.nii'),
}
# Columns produced from the CAT12 report of each row.
VOLUME_COLUMNS = ['TIV', 'GM_VOL', 'WM_VOL']


def _read_cat12_volumes(report_path):
    """Read TIV, GM and WM volumes from one CAT12 XML report.

    The values are taken from the second ``subjectmeasures`` element:
    ``vol_TIV`` is parsed as a single float, and the second and third
    space-separated fields of ``vol_abs_CGW`` are stored as GM_VOL and WM_VOL.

    Args:
        report_path: path of the ``cat_raw.xml`` report file.

    Returns:
        dict with float values under the keys 'TIV', 'GM_VOL' and 'WM_VOL'.
    """
    root = minidom.parse(report_path).documentElement
    measures = root.getElementsByTagName('subjectmeasures')[1]
    tiv_str = measures.getElementsByTagName('vol_TIV')[0].childNodes[0].data
    vol_str = measures.getElementsByTagName('vol_abs_CGW')[0].childNodes[0].data
    vol_fields = vol_str.split(' ')
    return {
        'TIV': float(tiv_str),
        'GM_VOL': float(vol_fields[1]),
        'WM_VOL': float(vol_fields[2]),
    }


# Add one path column per image type (same columns, same order, same strings
# as concatenating the components with os.sep one by one).
for column, parts in IMG_PATH_COLUMNS.items():
    data[column] = data['IMG_ROOT'] + os.sep + os.sep.join(parts)

report_list = list(data['IMG_ROOT'] + os.sep + 'cat12' + os.sep + 'report' + os.sep + 'cat_raw.xml')
# The records are produced in row order of `data`, so give them data's index:
# pd.concat(axis=1) aligns on index labels, and a fresh 0..n-1 index would
# misalign rows whenever `data` is not indexed 0..n-1.
vol_list = pd.DataFrame(
    [_read_cat12_volumes(report) for report in report_list],
    index=data.index,
    columns=VOLUME_COLUMNS,
)
# Drop volume columns left from an earlier run of this cell (the table is
# written back to disk afterwards) so re-running does not duplicate them.
data = pd.concat([data.drop(columns=VOLUME_COLUMNS, errors='ignore'), vol_list], axis=1)

In [23]:
# Hand the current `data` frame to the project helper writePandas under the
# name 'pat_data' (presumably this overwrites the stored patient table).
from src.utils.data import writePandas
writePandas('pat_data', data)

In [25]:
# Correlation-based feature filtering via the project helper.
# NOTE(review): 0.8 is presumably the correlation threshold and 'CAT_MDS' the
# label column — confirm against dropByCorrelation's signature.
dropByCorrelation('pat_data', 'pat_ANTs_Reg_radiomic', 'CAT_MDS', 0.8)

lRN
lSN
lSTN
lCAU
lPUT
lGPe
lGPi
lTHA
rRN
rSN
rSTN
rCAU
rPUT
rGPe
rGPi
rTHA


<Figure size 640x480 with 0 Axes>