# Identifying Gray Matter Markers of Irritability: a machine learning approach
This notebook is designed to analyze previously processed gray matter density volumes using support vector regression.

In [1]:
from nipype.pipeline.engine import Workflow, Node, MapNode
from nipype.interfaces.utility import IdentityInterface, Function
from nipype.interfaces.io import SelectFiles, DataSink, DataGrabber
from nipype.algorithms.misc import Gunzip
from nipype.interfaces.spm.preprocess import VBMSegment, Segment
from nipype.interfaces.ants import Atropos, Registration, ApplyTransforms, N4BiasFieldCorrection
from nipype.interfaces.fsl import ApplyMask, BET
from pandas import DataFrame, Series, read_csv

# Study specific variables
study_home = '/moochie/Cat/Aggregate_anats/GMD_ML'

sub_data_file = study_home + '/doc/subjectinfo.csv'
subject_info = read_csv(sub_data_file)
subjects_list = subject_info['freesurferID'].tolist()

preproc_dir = study_home + '/proc'
output_dir = study_home + '/ml_trainingset'

sample_template = study_home + '/templates/lcbd_template_1mm.nii.gz'
sample_template_brain = study_home + '/templates/lcbd_template_1mm_brain.nii.gz'
sample_template_mask = study_home + '/templates/lcbd_template_1mm_mask.nii.gz'


subject_info['age_inv'] = 1/subject_info['Age_yrs']
subject_info.describe()

Unnamed: 0,SubjectID,SequenceVersion,PotentiallyUsable,in_analysis,Age_yrs,Age_mos,male,cbq_activity,cbq_angfru,cbq_approach,...,cbq_impulsivity,cbq_inhibtctl,cbq_lip,cbq_perceptsens,cbq_sad,cbq_shy,cbq_smilelaugh,map_temploss,age_sq,age_inv
count,117.0,117.0,117.0,117.0,117.0,117.0,117.0,102.0,102.0,102.0,...,102.0,102.0,102.0,102.0,102.0,102.0,102.0,117.0,117.0,117.0
mean,970.25641,2.102564,0.991453,0.957265,8.106776,96.82906,0.581197,4.536415,4.049049,4.996765,...,4.230392,4.815359,5.237745,5.129085,4.1219,3.547451,5.37585,17.162393,69.2152,0.129964
std,787.790736,1.03711,0.09245,0.203129,1.877635,22.821757,0.495485,1.00984,1.536063,0.948968,...,0.953265,1.13619,1.197604,1.011188,0.990052,1.184342,0.782011,18.382881,32.500516,0.029889
min,2.0,1.0,0.0,0.0,4.62423,55.0,0.0,2.285714,1.0,2.5,...,2.0,2.166667,2.0,2.0,1.428571,1.0,2.666667,0.0,21.383503,0.080222
25%,208.0,1.0,1.0,1.0,6.78165,81.0,0.0,3.857143,3.0,4.333333,...,3.5,4.0,4.3125,4.5,3.571429,2.6675,4.833333,4.0,45.990777,0.108479
50%,1008.0,2.0,1.0,1.0,7.906913,94.0,1.0,4.5,3.833333,4.833333,...,4.166667,4.833333,5.5,5.333333,4.142857,3.666667,5.5,10.0,62.519274,0.126472
75%,2006.0,3.0,1.0,1.0,9.218344,110.0,1.0,5.25,5.166667,5.833333,...,4.833333,5.666667,6.125,5.833333,4.714286,4.333333,6.0,23.0,84.977859,0.147457
max,2038.0,4.0,1.0,1.0,12.465435,161.0,1.0,6.714286,6.833333,7.0,...,6.666667,7.0,7.0,7.0,6.571429,6.333333,7.0,81.0,155.387061,0.216252


In [2]:
from sklearn.preprocessing import StandardScaler, PowerTransformer
from numpy import squeeze

## Create a conditions list for the feature set
age_labels = subject_info[['Age_yrs','age_inv']].copy()
age_labels = age_labels.values
irr_labels = subject_info[['map_temploss']].copy()
irr_labels = irr_labels.values

scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
scaler.fit(age_labels)
sd_agedata = scaler.transform(age_labels)

pt = PowerTransformer()
pt.fit(irr_labels)
pt_irritability = pt.transform(irr_labels)
pt_irritability = squeeze(pt_irritability)

conditions = DataFrame(data=sd_agedata, index=None, columns=['age','ageinv'])
conditions['subject'] = Series(subjects_list, index=conditions.index)
conditions['irritability'] = Series(pt_irritability, index=conditions.index)

conditions.to_csv(output_dir + '/featureset_key.csv')
conditions.describe()

Unnamed: 0,age,agesq,ageinv,irritability
count,117.0,117.0,117.0,117.0
mean,-5.864255e-16,1.043799e-16,2.751835e-16,-5.427757e-16
std,1.004301,1.004301,1.004301,1.004301
min,-1.862729,-1.478051,-1.67135,-1.867147
25%,-0.7087775,-0.7176598,-0.7218803,-0.7193634
50%,-0.1069019,-0.2069114,-0.1173301,-0.04972326
75%,0.5945502,0.4870832,0.5877827,0.6939497
max,2.33134,2.662804,2.899351,2.052883


In [6]:
## Concatenate all the parameter estimates from preproc to create a feature set
from glob import glob
from nipype.interfaces.fsl.utils import Merge
from nipype.interfaces.fsl import SUSAN
files = glob(preproc_dir + '/soft_tissue_files/*/POSTERIOR_02.nii.gz')
files = sorted(files)

def brightthresh(img):
    import nibabel as nib
    from numpy import median, where
    
    from nipype import config, logging
    config.enable_debug_mode()
    logging.update_logging(config)
    
    img_nifti1 = nib.load(img)
    img_data = img_nifti1.get_data()
    img_data = img_data.astype(float)
    
    brain_values = where(img_data > 0)
    median_thresh = median(brain_values)
    bright_thresh = 0.75 * median_thresh
    
    return(bright_thresh)

sm = SUSAN()
sm.inputs.fwhm=6

for file in files:
    sm.inputs.brightness_threshold = brightthresh(file)
    sm.inputs.in_file = file
    sm.inputs.out_file = file.replace('POSTERIOR_02','smoothed_gm')
    sm.run()

gm_files = glob(preproc_dir + '/soft_tissue_files/*/smoothed_gm.nii.gz')
gm_files = sorted(gm_files)
gmd_feature_data = output_dir + '/gmd_smooth_combined.nii.gz'

merge = Merge()
merge.inputs.in_files = gm_files
merge.inputs.dimension = 't'
merge.inputs.merged_file = gmd_feature_data
#merge.run()

181120-13:35:10,139 interface DEBUG:
	 brightness_threshold_98.25
181120-13:35:10,143 interface DEBUG:
	 dimension_3
181120-13:35:10,144 interface DEBUG:
	 in_file_/moochie/Cat/Aggregate_anats/GMD_ML/proc/soft_tissue_files/101/POSTERIOR_02.nii.gz
181120-13:35:10,145 interface DEBUG:
	 out_file_/moochie/Cat/Aggregate_anats/GMD_ML/proc/soft_tissue_files/101/smoothed_gm.nii.gz
181120-13:35:10,146 interface DEBUG:
	 use_median_1
181120-13:38:26,616 interface DEBUG:
	 brightness_threshold_98.25
181120-13:38:26,619 interface DEBUG:
	 dimension_3
181120-13:38:26,620 interface DEBUG:
	 in_file_/moochie/Cat/Aggregate_anats/GMD_ML/proc/soft_tissue_files/102/POSTERIOR_02.nii.gz
181120-13:38:26,621 interface DEBUG:
	 out_file_/moochie/Cat/Aggregate_anats/GMD_ML/proc/soft_tissue_files/102/smoothed_gm.nii.gz
181120-13:38:26,622 interface DEBUG:
	 use_median_1
181120-13:41:42,981 interface DEBUG:
	 brightness_threshold_98.25
181120-13:41:42,983 interface DEBUG:
	 dimension_3
181120-13:41:42,984 inter

<nipype.interfaces.base.support.InterfaceResult at 0x7f337b118588>

In [7]:
analysis = 'irritability'

if analysis == 'age':
    labels = conditions['age']
    type_svm = 'regression'
elif analysis == 'ageinv':
    labels = conditions['ageinv']
    type_svm = 'regression'
elif analysis == 'irritability':
    labels = conditions['irritability']
    type_svm = 'regression'

results_file = open(output_dir + '/results_' + analysis + '.txt','w')
labels.describe()

count    1.170000e+02
mean    -5.427757e-16
std      1.004301e+00
min     -1.867147e+00
25%     -7.193634e-01
50%     -4.972326e-02
75%      6.939497e-01
max      2.052883e+00
Name: irritability, dtype: float64

In [None]:
if type_svm == 'regression':
    # Perform the support vector classification
    from nilearn.input_data import NiftiMasker
    from sklearn.feature_selection import f_regression, SelectPercentile
    from sklearn.svm import SVR
    from sklearn.pipeline import Pipeline

    # Set up the regression
    svr = SVR(kernel='linear', C=1)
    masker = NiftiMasker(mask_img=sample_template_mask,standardize=True, 
                         memory='nilearn_cache', memory_level=1)
    
    feature_selection = SelectPercentile(f_regression, percentile=5)
    fs_svr = Pipeline([('feat_select', feature_selection), ('svr', svr)])
    
    # Run the regression
    X = masker.fit_transform(gmd_feature_data)
    fs_svr.fit(X, labels)
        
    from sklearn.model_selection import cross_val_predict, LeaveOneGroupOut

    loso = LeaveOneGroupOut()
    y_pred = cross_val_predict(fs_svr, X, y=labels, n_jobs=6,
                               groups=conditions['subject'],cv=loso)
    # save weights
    coef = svr.coef_
    coef = feature_selection.inverse_transform(coef)
    coef_image = masker.inverse_transform(coef)
    coef_image.to_filename(output_dir + '/svrweights_' + analysis + '.nii.gz')
    
    from scipy.stats import linregress
    slope, intercept, r_val, p_val, stderr = linregress(labels, y_pred) 

    from sklearn.metrics import mean_squared_error
    mse = mean_squared_error(labels, y_pred)
    
    from scipy.stats import spearmanr
    spear_r, spear_p = spearmanr(labels, y_pred)

    print("prediction accuracy: %.4f / p-value: %f / MSE: %f // Spearman: %f / p-value: %f" % (r_val, p_val, mse, spear_r, spear_p))

    # plot the predicted versus actual values
    import matplotlib.pyplot as plt
    plt.scatter(labels, y_pred, color='b')
    plt.xlabel('Actual ' + analysis)
    plt.ylabel('Predicted ' + analysis)
    plt.savefig(output_dir + '/scatter_pred_actual_' + analysis + '_final.svg', transparent=True)
    plt.show()
    plt.close()

    results_file.write("Prediction accuracy r-value: %.4f / p-value: %f / MSE: %f // Spearman: %f / p-value: %f \n" % (r_val, p_val, mse, spear_r, spear_p))
    results_file.write('predicted: ' + str(y_pred) + '\n')
    results_file.write('actual: ' + str(labels) + '\n')

    results_file.close()