# Identifying Gray Matter Markers of Irritability: a machine learning approach
This notebook is designed to analyze previously processed gray matter density volumes using support vector regression.

In [None]:
from nipype.pipeline.engine import Workflow, Node, MapNode
from nipype.interfaces.utility import IdentityInterface, Function
from nipype.interfaces.io import SelectFiles, DataSink, DataGrabber
from nipype.algorithms.misc import Gunzip
from nipype.interfaces.spm.preprocess import VBMSegment, Segment
from nipype.interfaces.ants import Atropos, Registration, ApplyTransforms, N4BiasFieldCorrection
from nipype.interfaces.fsl import ApplyMask, BET
from pandas import DataFrame, Series, read_csv

# Study-specific locations; every other path in this cell hangs off study_home
study_home = '/moochie/user_data/CamachoCat/Aggregate_anats/GMD_ML'

# Input (preprocessed data) and output (training-set) directories
preproc_dir = '{0}/proc'.format(study_home)
output_dir = '{0}/ml_trainingset'.format(study_home)

# Template images stored under <study_home>/templates
sample_template = '{0}/templates/lcbd_template_1mm.nii.gz'.format(study_home)
sample_template_brain = '{0}/templates/lcbd_template_1mm_brain.nii.gz'.format(study_home)
sample_template_mask = '{0}/templates/lcbd_template_1mm_mask.nii.gz'.format(study_home)

# Subject table: the first CSV column becomes the index, and the
# 'freesurferID' column supplies the list of subject identifiers
sub_data_file = '{0}/doc/subject_info.csv'.format(study_home)
subject_info = read_csv(sub_data_file, index_col=0)
subjects_list = subject_info['freesurferID'].tolist()

# Last expression of the cell: summary statistics shown as cell output
subject_info.describe()

In [None]:
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import warnings
from scipy import stats
# Silence all warnings for the rest of the session and use the 'paper' plotting context
warnings.filterwarnings('ignore')
sns.set(context='paper',style='white')

# Optional per-variable histograms (disabled)
#for variable in ['Age_yrs', 'MAP_Temper_Loss','MAP_Noncompliance','MAP_General_Aggression','MAP_Low_Concern']:
#    plt.figure()
#    sns.distplot(subject_info[variable],hist=True,kde=False,bins=30, color='#171C43', hist_kws={'edgecolor':'black'})
#    plt.savefig(variable+'_hist.svg')

# For each irritability subscale, plot it against age with marginal histograms
# and annotate the Pearson correlation.
for variable in ['MAP_Temper_Loss','MAP_Noncompliance','MAP_General_Aggression','MAP_Low_Concern']:
    # jointplot creates its own figure, so no plt.figure() call is made here
    # (calling it first left an empty extra figure per variable).
    # x/y are passed by keyword with data=, which both old and current seaborn accept.
    a = sns.jointplot(x='Age_yrs', y=variable, data=subject_info,
                      marginal_kws={'kde':False,'bins':30})

    # JointGrid.annotate() no longer exists in current seaborn, so the
    # correlation is computed directly on rows where both values are present
    # and written onto the joint axes.
    complete_pairs = subject_info[['Age_yrs', variable]].dropna()
    age_r, age_p = stats.pearsonr(complete_pairs['Age_yrs'], complete_pairs[variable])
    a.ax_joint.annotate('pearsonr = {0:.2g}; p = {1:.2g}'.format(age_r, age_p),
                        xy=(0.05, 0.95), xycoords='axes fraction',
                        ha='left', va='top', fontsize=12)
    #plt.savefig(variable+'_age_corr.svg')
    plt.show()


In [None]:
from sklearn.preprocessing import StandardScaler, PowerTransformer
from numpy import squeeze

## Build the prediction targets: standardized age and power-transformed irritability scores
irritability_columns = ['MAP_Temper_Loss','MAP_Noncompliance','MAP_General_Aggression','MAP_Low_Concern']
transformed_columns = ['temploss_yj','noncomp_yj','genagg_yj','lowcon_yj']

# Raw values as 2-D arrays with one row per subject
age_labels = subject_info[['Age_yrs']].copy().values
irr_labels = subject_info[irritability_columns].copy().values

# Center age and scale it to unit variance
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
sd_agedata = scaler.fit(age_labels).transform(age_labels)

# Power-transform the four subscales with PowerTransformer's default settings
# (the '_yj' column suffix suggests Yeo-Johnson -- confirm against the installed sklearn default)
pt = PowerTransformer()
pt_irritability = squeeze(pt.fit(irr_labels).transform(irr_labels))

# Attach the transformed scores to the subject table, aligned on its index
transformed_scores = DataFrame(pt_irritability,
                               columns=transformed_columns,
                               index=subject_info.index)
subject_info = subject_info.merge(transformed_scores, left_index=True, right_index=True)
subject_info['age_cent'] = sd_agedata

#subject_info.to_csv(output_dir + '/featureset_key.csv')
subject_info.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Raw temper-loss score against its transformed value
sns.scatterplot(data=subject_info, x='MAP_Temper_Loss', y='temploss_yj')

# Distribution of the transformed score, leaving out non-finite entries
plt.figure()
temploss_transformed = subject_info['temploss_yj']
plt.hist(temploss_transformed[np.isfinite(temploss_transformed)])

In [None]:
## Concatenate all the parameter estimates from preproc to create a feature set
from nipype.interfaces.fsl.utils import Merge

# One smoothed gray matter volume per subject, located by freesurferID
gm_template = preproc_dir + '/final_gmd/{0}/final_smooth_gm.nii.gz'
gm_files = [gm_template.format(subject_id) for subject_id in subjects_list]

# Destination of the combined 4-D feature file
gmd_feature_data = output_dir + '/gmd_combined.nii.gz'
#print(gm_files)

# Configure the merge along the 't' dimension; the run call is left disabled
merge = Merge()
merge.inputs.in_files = gm_files
merge.inputs.dimension = 't'
merge.inputs.merged_file = gmd_feature_data
#merge.run()

In [None]:
## Concatenate all the parameter estimates from preproc to create a feature set
from glob import glob
from nipype.interfaces.fsl.utils import Merge
from nipype.interfaces.fsl import SUSAN, Threshold
# Every subject's POSTERIOR_02 volume, in sorted path order
files = sorted(glob(preproc_dir + '/soft_tissue_files/*/POSTERIOR_02.nii.gz'))

def brightthresh(img):
    """Return 75% of the median positive voxel intensity of an image.

    Parameters
    ----------
    img : str
        Path to an image file readable by ``nibabel.load``.

    Returns
    -------
    float
        ``0.75 * median(v)`` taken over all voxels with value ``v > 0``.

    Raises
    ------
    ValueError
        If the image contains no voxel greater than zero.
    """
    # Imports are kept inside the function body, as in the original
    # (presumably so it can be wrapped in a nipype Function node -- TODO confirm).
    import nibabel as nib
    from numpy import median

    from nipype import config, logging
    # Side effect retained from the original: turns on nipype debug mode globally.
    config.enable_debug_mode()
    logging.update_logging(config)

    img_nifti1 = nib.load(img)
    # get_fdata() replaces the deprecated get_data() and already returns floats.
    img_data = img_nifti1.get_fdata()

    # Boolean indexing selects the intensities themselves. The previous
    # where(img_data > 0) returned voxel *coordinates*, so the median was
    # computed over indices rather than image values.
    brain_values = img_data[img_data > 0]
    if brain_values.size == 0:
        raise ValueError('No voxels greater than zero in image: {0}'.format(img))

    median_thresh = median(brain_values)
    bright_thresh = 0.75 * median_thresh

    return float(bright_thresh)

# Smoothing interface (fwhm=4) and a thresholding interface (thresh=0.05, direction 'below')
sm = SUSAN(fwhm=4)
thr = Threshold(thresh=0.05, direction='below')

# Point both interfaces at each posterior map in turn. The run calls are
# disabled, so this loop only sets input/output paths.
for file in files:
    smoothed_path = file.replace('POSTERIOR_02','smoothed_gm')
    final_path = file.replace('POSTERIOR_02','final_smooth_gm')

    #sm.inputs.brightness_threshold = brightthresh(file)
    sm.inputs.in_file = file
    sm.inputs.out_file = smoothed_path
    #sm.run()

    # The threshold step reads whatever the smoothing step was told to write
    thr.inputs.in_file = sm.inputs.out_file
    thr.inputs.out_file = final_path
    #thr.run()


# Collect the thresholded maps (sorted by path) and configure the 4-D merge
gm_files = sorted(glob(preproc_dir + '/soft_tissue_files/*/final_smooth_gm.nii.gz'))
gmd_feature_data = output_dir + '/gmd_combined.nii.gz'

merge = Merge()
merge.inputs.in_files = gm_files
merge.inputs.dimension = 't'
merge.inputs.merged_file = gmd_feature_data
#merge.run()

In [None]:
from nipype.interfaces.fsl import GLM, Merge
from os.path import abspath
from subprocess import check_call

# Keep only subjects flagged for final inclusion and pull out their covariates
usable_subs = subject_info[subject_info['final_incl']==1]
subjects_list = usable_subs['freesurferID'].tolist()
ages = usable_subs['age_cent'].tolist()
male = usable_subs['male'].tolist()
seq1 = usable_subs['seq1'].tolist()
seq2 = usable_subs['seq2'].tolist()
seq3 = usable_subs['seq3'].tolist()
seq4 = usable_subs['seq4'].tolist()
eTIV = usable_subs['eTIV'].tolist()

if not subjects_list:
    raise ValueError("No subjects with final_incl == 1; cannot build a design matrix")

# One smoothed gray matter volume per included subject, in table order
final_data_list = [preproc_dir + '/final_gmd/{0}/final_smooth_gm.nii.gz'.format(subject_id)
                   for subject_id in subjects_list]

# Write one design-matrix row per subject: male, seq1-seq4, eTIV.
# NOTE(review): `ages` is extracted above but is not written to the design,
# although the residual file is named 'data_resids_noage' -- confirm intent.
# The context manager guarantees the file is closed even if a write fails.
with open('temp_text.txt','w') as text_file:
    for covariate_row in zip(male, seq1, seq2, seq3, seq4, eTIV):
        text_file.write('{0} {1} {2} {3} {4} {5}\n'.format(*covariate_row))

# Convert the plain-text table into design.mat with the Text2Vest command
file = abspath('temp_text.txt')
check_call(['Text2Vest',file,'design.mat'])
design_file = abspath('design.mat')

# Stack the subject volumes along the 't' dimension
me=Merge()
me.inputs.dimension='t'
me.inputs.in_files=final_data_list
me.inputs.merged_file='data_merged.nii.gz'
me.run()
merged_gmd = abspath('data_merged.nii.gz')

# Fit the design within the template mask and write the residuals
glm = GLM()
glm.inputs.in_file = merged_gmd
glm.inputs.design = design_file
glm.inputs.mask = sample_template_mask
glm.inputs.out_res_name = 'data_resids_noage.nii.gz'
glm.run()

final_data = abspath('data_resids_noage.nii.gz')

In [None]:
from nilearn.input_data import NiftiMasker

# Name of the analysis to run; must be a key of analysis_options below
analysis = 'factor4'

# analysis name -> (column used as the regression label,
#                   column that must be >= 0 for a subject to be kept, or None)
analysis_options = {
    'Age': ('Age_yrs', None),
    'Temper_Loss': ('temploss_yj', 'MAP_Temper_Loss'),
    'Noncompliance': ('noncomp_yj', 'MAP_Noncompliance'),
    'General_Aggression': ('genagg_yj', 'MAP_General_Aggression'),
    'Low_Concern': ('lowcon_yj', 'MAP_Low_Concern'),
    'factor1': ('factor1', 'smiling_laughter'),
    'factor2': ('factor2', 'smiling_laughter'),
    'factor3': ('factor3', 'smiling_laughter'),
    'factor4': ('factor4', 'smiling_laughter'),
}

# Fail early on a mistyped name. Without this check, the cell would carry on
# with labels/groups/X left over from a previous run (or hit a NameError).
if analysis not in analysis_options:
    raise ValueError("Unknown analysis '{0}'; expected one of: {1}".format(
        analysis, ', '.join(sorted(analysis_options))))
label_column, required_column = analysis_options[analysis]

# Extract the voxels inside the template mask from the combined 4-D file
masker = NiftiMasker(mask_img=sample_template_mask,standardize=True, 
                     memory='nilearn_cache', memory_level=1)
X = masker.fit_transform(gmd_feature_data)

# Row filter: always require final_incl == 1; additionally require the
# analysis-specific column to be >= 0 (a comparison that is False for NaN).
mask = subject_info['final_incl']==1
if required_column is not None:
    mask = (subject_info[required_column]>=0) & mask

# X is indexed with the same boolean filter, so its rows are assumed to be
# in subject_info row order -- TODO confirm against how gmd_feature_data was merged.
labels = subject_info[label_column][mask]
groups = subject_info['freesurferID'][mask]
X = X[mask]

# Opened here and left open: the following cell writes to it and closes it
results_file = open(output_dir + '/results_' + analysis + '.txt','w')
labels.describe()

In [None]:
# Perform the support vector regression: fit a feature-selection + linear SVR
# pipeline on X/labels, get cross-validated predictions, save the weight map,
# and report agreement between predicted and actual labels.
# Relies on X, labels, groups, masker, analysis, output_dir and the open
# results_file created by the cell that defines `analysis`.
from nilearn.input_data import NiftiMasker
from sklearn.feature_selection import f_regression, SelectPercentile
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from pandas import DataFrame, Series

# Set up the regression
svr = SVR(kernel='linear', C=1)

# Keep the top 5 percent of features as scored by f_regression, then fit the SVR
feature_selection = SelectPercentile(f_regression, percentile=5)
fs_svr = Pipeline([('feat_select', feature_selection), ('svr', svr)])

# Run the regression on the full sample (the weights saved below come from this fit)
fs_svr.fit(X, labels)

from sklearn.model_selection import cross_val_predict, LeaveOneGroupOut, RepeatedKFold

# Cross-validated predictions, holding out one value of `groups` at a time
cv = LeaveOneGroupOut()
#cv = RepeatedKFold(n_splits=10,n_repeats=10)
y_pred = cross_val_predict(fs_svr, X, y=labels, n_jobs=10,groups=groups,cv=cv)

# save weights: map the SVR coefficients back through feature selection and
# the masker so they can be written as an image
coef = svr.coef_
coef = feature_selection.inverse_transform(coef)
coef_image = masker.inverse_transform(coef)
coef_image.to_filename(output_dir + '/svrweights_' + analysis + '.nii.gz')

# Linear fit of predicted on actual values; r_val is the correlation that the
# messages below report as "prediction accuracy"
from scipy.stats import linregress
slope, intercept, r_val, p_val, stderr = linregress(labels, y_pred) 

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(labels, y_pred)

from scipy.stats import spearmanr
spear_r, spear_p = spearmanr(labels, y_pred)

print("prediction accuracy: %.4f / p-value: %f / MSE: %f // Spearman: %f / p-value: %f" % (r_val, p_val, mse, spear_r, spear_p))

# Pair actual and predicted values, aligned on the labels' index, for plotting
svr_results=DataFrame()
svr_results['labels']=labels
svr_results['y_pred']=Series(y_pred,index=labels.index)
# plot the predicted versus actual values
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context='poster',style='white')
sns.lmplot(x='labels', y='y_pred',ci=None,data=svr_results)
plt.xlabel('Actual ' + analysis)
plt.ylabel('Predicted ' + analysis)
plt.savefig(output_dir + '/scatter_pred_actual_' + analysis + '_poster.svg')
plt.show()
plt.close()

# Write the summary line plus the string forms of the predictions and labels
results_file.write("Prediction accuracy r-value: %.4f / p-value: %f / MSE: %f // Spearman: %f / p-value: %f \n" % (r_val, p_val, mse, spear_r, spear_p))
results_file.write('predicted: ' + str(y_pred) + '\n')
results_file.write('actual: ' + str(labels) + '\n')

# Closes the handle opened in the analysis-setup cell
results_file.close()

In [None]:
from sklearn.model_selection import permutation_test_score
import matplotlib.pyplot as plt
from numpy import savetxt

# Permutation tests for the fitted pipeline, one per scoring metric.
# Each entry: (sklearn scoring name, tag used in output file names,
#              legend label template, results-file line template)
permutation_runs = [
    ('neg_mean_squared_error', 'mse',
     'Mean Squared Error (pvalue %f)', 'MSE score %s (pvalue : %s) \n'),
    ('r2', 'r2',
     'R-squared (pvalue %f)', 'R square: %s (pvalue : %s) \n'),
]

# NOTE(review): when `groups` is given, permutation_test_score shuffles y only
# among samples sharing a group value. If every freesurferID occurs once, the
# labels are never actually shuffled -- confirm this is what is intended.
# NOTE(review): the 'neg_mean_squared_error' score is the negated MSE, although
# the legend and results line call it "Mean Squared Error" / "MSE score".

# The context manager closes the results file even if a (long) permutation run fails
with open(output_dir + '/perm_results_' + analysis + '.txt','w') as results_file:
    for scoring, tag, legend_label, result_line in permutation_runs:
        score, permutation_scores, pvalue = permutation_test_score(
            fs_svr, X, labels, scoring=scoring,
            cv=cv, n_permutations=500, n_jobs=20, groups=groups)
        savetxt(output_dir + '/permutation_scores_' + tag + '_' + analysis + '.txt',
                permutation_scores)

        # Save a figure of the permutation scores with the observed score
        # drawn as a dashed vertical line spanning the histogram's y range
        plt.hist(permutation_scores, 20, label='Permutation scores',
                 edgecolor='black')
        ylim = plt.ylim()
        plt.plot(2 * [score], ylim, '--g', linewidth=3,
                 label=legend_label % pvalue)
        plt.ylim(ylim)
        plt.legend()
        plt.xlabel('Score')
        plt.savefig(output_dir + '/permutation_plot_' + tag + '_' + analysis + '.svg',
                    transparent=True)
        plt.close()

        # save final pval/score for this metric
        results_file.write(result_line % (score, pvalue))

In [None]:
from nilearn.input_data import NiftiMasker

# Name of the classification analysis to run; must be a key of group_columns below
analysis = 'sequence_LOSO_6_10'

# analysis name -> column of `conditions` used to define cross-validation groups
group_columns = {
    'sequence_LOSO': 'subject',
    'sequence_LOGO': 'sequence',
    'sequence_LOSO_6_10': 'subject',
}

# Fail early on a mistyped name rather than silently reusing labels/groups
# left over from an earlier cell.
if analysis not in group_columns:
    raise ValueError("Unknown analysis '{0}'; expected one of: {1}".format(
        analysis, ', '.join(sorted(group_columns))))

# Extract the voxels inside the template mask from the combined 4-D file
masker = NiftiMasker(mask_img=sample_template_mask,standardize=True, 
                     memory='nilearn_cache', memory_level=1)
X = masker.fit_transform(gmd_feature_data)

# NOTE(review): `conditions` is not defined in the visible part of this
# notebook; it is expected to provide 'age_yrs', 'sequence' and 'subject'
# columns, one row per volume in gmd_feature_data -- confirm where it is created.
mask=(conditions.age_yrs<=10) & (conditions.age_yrs>=6)

# The class label is always the 'sequence' column; only the grouping differs
labels = conditions['sequence']
groups = conditions[group_columns[analysis]]

# The 6-10 variant additionally restricts everything to ages 6 through 10 inclusive
if analysis == 'sequence_LOSO_6_10':
    labels = labels[mask]
    groups = groups[mask]
    X=X[mask]

# Opened here and left open: the following cell writes to it and closes it
results_file = open(output_dir + '/results_' + analysis + '.txt','w')
labels.describe()

In [None]:
# Perform the support vector classification: fit a feature-selection + linear
# SVC pipeline, cross-validate it, save the weight map, and write per-class
# metrics. Relies on X, labels, groups, masker, analysis, output_dir and the
# open results_file created by the preceding setup cell.
from sklearn.svm import SVC
from sklearn.feature_selection import f_classif, SelectPercentile
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, LeaveOneGroupOut, cross_val_predict
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import confusion_matrix
from numpy import set_printoptions
import itertools
import matplotlib.pyplot as plt

# Set up the support vector classifier
svc = SVC(kernel='linear')

# Select the features contributing to the model
feature_selection = SelectPercentile(f_classif, percentile=5) #0.05/228453 voxels
fs_svc = Pipeline([('feat_select', feature_selection), ('svc', svc)])

# Run the classifier on the full sample (the weights saved below come from this fit)
fs_svc.fit(X, labels)

# Obtain per-fold accuracy and out-of-fold predictions, holding out one
# value of `groups` at a time
loso = LeaveOneGroupOut()
cv_scores = cross_validate(fs_svc, X, y=labels, n_jobs=20, return_train_score=True,
                           groups=groups, cv=loso, scoring='accuracy')
y_pred = cross_val_predict(fs_svc, X, y=labels, n_jobs=20,groups=groups, cv=loso)

## Save the SVM weights to a nifti
coef = svc.coef_
coef = feature_selection.inverse_transform(coef)
weight_img = masker.inverse_transform(coef)
weight_img.to_filename(output_dir + '/svmweights_'+ analysis +'.nii.gz')

## Calculate performance metrics
# Class names in order of first appearance; reused below so that the
# confusion matrix and its tick labels share one ordering.
classes = labels.unique()

classification_accuracy = cv_scores['test_score'].mean()
chance = 1. / len(classes)
print("Classification accuracy: %.4f / Chance level: %f" % 
      (classification_accuracy, chance))

# Per-class recall and precision from the out-of-fold predictions
for label in classes:
    sensitivity = recall_score(labels,y_pred,labels=[label],average='weighted')
    precision = precision_score(labels,y_pred,labels=[label],average='weighted')

    results_file.write("%s: classification accuracy: %.4f \n chance level: %f \n sensitivity: %f \n precision: %f \n" % 
    (label, classification_accuracy, chance, sensitivity, precision))

# compute a confusion matrix
# labels=classes pins the row/column order to `classes`. By default
# confusion_matrix sorts the labels, which did not necessarily match the
# order-of-appearance tick labels passed to the plotting function.
cnf_matrix = confusion_matrix(labels, y_pred, labels=classes)
set_printoptions(precision=2)

def plot_confusion_matrix(cm, classes):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Uses the module-level ``plt`` (matplotlib.pyplot) and ``itertools`` names.

    Parameters
    ----------
    cm : 2-D array of integers
        Matrix to display. Element ``cm[i, j]`` is drawn at row ``i`` (the
        'True label' axis) and column ``j`` (the 'Predicted label' axis) and
        is formatted with the ``'d'`` format code, so values must be integers.
    classes : sequence
        Tick labels, applied in the same order to both axes.

    Returns
    -------
    None
        The plot is drawn through ``plt``; nothing is saved or shown here.
    """
    from numpy import arange
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix')
    plt.colorbar()
    tick_marks = arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size=16)
    plt.yticks(tick_marks, classes, size=16)

    # Cells above half the maximum get white text so the count stays readable
    # on the darker end of the colour map.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j],  'd'),
                 horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black', size=16)

    plt.ylabel('True label', size=16)
    plt.xlabel('Predicted label', size=16)
    # tight_layout is called last so the axis labels added above are taken
    # into account; calling it before they existed left them out of the layout.
    plt.tight_layout()

# Draw the confusion matrix and save it as an SVG named after the analysis
plot_confusion_matrix(cnf_matrix, classes)
confusion_plot_path = '{0}/confusion_matrix_{1}.svg'.format(output_dir, analysis)
plt.savefig(confusion_plot_path, transparent=True)
plt.close()

# All classification metrics have been written; release the results file
results_file.close()