# Group-level analysis to test associations between symptomatology and gray matter density in children

### Multivariate Analysis

In [None]:
from pandas import DataFrame, Series, read_csv

# Study-specific roots; every other path in this cell is built from moochie_home.
study_home = '/home/camachocm2/Analysis/aggregate_anats'
moochie_home = '/moochie/user_data/CamachoCat/Aggregate_anats'

# Template images used as the common reference space / brain mask.
sample_template = '{0}/templates/lcbd_template_1mm.nii.gz'.format(moochie_home)
sample_template_brain = '{0}/templates/lcbd_template_1mm_brain.nii.gz'.format(moochie_home)
sample_template_mask = '{0}/templates/lcbd_template_1mm_mask.nii.gz'.format(moochie_home)

# Per-subject preprocessed inputs and the output folder for the SVR analysis.
preproc_dir = '{0}/proc/subj_data'.format(moochie_home)
output_dir = '{0}/proc/Alex_project/SVR'.format(moochie_home)

# Subject spreadsheet: the first column is used as the DataFrame index, and the
# 'freesurferID' column supplies the list of subject identifiers.
sub_data_file = '{0}/doc/subject_info_alex.csv'.format(moochie_home)
subject_info = read_csv(sub_data_file, index_col=0)
subjects_list = subject_info['freesurferID'].tolist()

# Summary statistics of the numeric columns (displayed as the cell output).
subject_info.describe()

In [None]:
from sklearn.preprocessing import StandardScaler, PowerTransformer
from numpy import squeeze

## Build the transformed label columns (age and symptom scores) for the feature set

# Raw label arrays pulled from the subject table.
age_labels = subject_info[['Age_yrs']].copy().values
agesq_labels = age_labels*age_labels
sx_labels = subject_info[['cbcl_intern','cbcl_extern','cbcl_total']].copy().values

# Z-score age, then refit the same scaler on squared age.
# NOTE(review): sd_agesqdata is computed here but never read in the visible code;
# the 'age_sq' column below is the square of the z-scored age instead. Confirm
# which of the two was intended.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
sd_agedata = scaler.fit(age_labels).transform(age_labels)
sd_agesqdata = scaler.fit(agesq_labels).transform(agesq_labels)

# Power-transform the three CBCL scores with the PowerTransformer defaults.
pt = PowerTransformer()
pt_sx = squeeze(pt.fit(sx_labels).transform(sx_labels))

# Attach the transformed symptom scores, aligned on the subject index.
transformed_sx = DataFrame(pt_sx,
                           columns=['cbcl_intern_yj','cbcl_extern_yj','cbcl_total_yj'],
                           index=subject_info.index)
subject_info = subject_info.merge(transformed_sx, left_index=True, right_index=True)

# Attach the standardized age and its square.
subject_info['age_cent'] = sd_agedata
subject_info['age_sq'] = sd_agedata*sd_agedata

# Persist the augmented table so later analyses can reuse the transformed labels.
subject_info.to_csv(output_dir + '/featureset_key.csv')
subject_info.describe()

In [None]:
## Concatenate all the parameter estimates from preproc to create a feature set
from nipype.interfaces.fsl.utils import Merge

# One smoothed gray-matter image per subject, listed in the same order as subjects_list.
gm_template = preproc_dir + '/final_gmd/{0}/final_smooth_gm_4.nii.gz'
gm_files = [gm_template.format(subject_id) for subject_id in subjects_list]
gmd_feature_data = output_dir + '/gmd_combined_4.nii.gz'
print(gm_files)

# Concatenate the per-subject images along the 't' dimension into a single file.
merge = Merge(in_files=gm_files,
              dimension='t',
              merged_file=gmd_feature_data)
merge.run()

In [None]:
from nilearn.input_data import NiftiMasker

# Which label to predict: 'age', 'age_sq', 'internalizing', 'externalizing' or 'total_sxs'.
analysis = 'age'

# Extract the in-mask voxels from the merged image as a (volumes x voxels) matrix,
# standardizing each voxel across volumes.
masker = NiftiMasker(mask_img=sample_template_mask,standardize=True, 
                     memory='nilearn_cache', memory_level=1)
X = masker.fit_transform(gmd_feature_data)

# Age analyses use every subject; symptom analyses keep only rows whose raw CBCL
# score is >= 0 and predict the power-transformed ('_yj') score.
age_label_columns = {'age': 'age_cent', 'age_sq': 'age_sq'}
symptom_raw_columns = {'internalizing': 'cbcl_intern',
                       'externalizing': 'cbcl_extern',
                       'total_sxs': 'cbcl_total'}

if analysis in age_label_columns:
    # No row filtering here. The original 'age_sq' branch indexed X with a `mask`
    # that was never defined in that branch (NameError on a fresh kernel, or a
    # stale mask left over from a previous run).
    labels = subject_info[age_label_columns[analysis]]
    groups = subject_info['freesurferID']
elif analysis in symptom_raw_columns:
    raw_column = symptom_raw_columns[analysis]
    mask = (subject_info[raw_column]>=0)
    labels = subject_info[raw_column + '_yj'][mask]
    groups = subject_info['freesurferID'][mask]
    # Rows of X follow the order of subject_info (the merged image was built from
    # subjects_list), so the same boolean filter applies positionally.
    X = X[mask.values]
else:
    # Fail loudly instead of silently reusing labels from an earlier run.
    raise ValueError("Unknown analysis '%s'; expected one of: %s"
                     % (analysis, ', '.join(sorted(list(age_label_columns) + list(symptom_raw_columns)))))

# Opened here and written/closed by the SVR cell that follows.
results_file = open(output_dir + '/results_' + analysis + '.txt','w')
labels.describe()

In [None]:
# Perform the support vector regression (linear SVR on the top 5% of voxels by
# univariate F-test) and evaluate out-of-sample predictions.
from nilearn.input_data import NiftiMasker
from sklearn.feature_selection import f_regression, SelectPercentile
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from pandas import DataFrame, Series

# Set up the regression
svr = SVR(kernel='linear', C=1)

feature_selection = SelectPercentile(f_regression, percentile=5)
fs_svr = Pipeline([('feat_select', feature_selection), ('svr', svr)])

# Run the regression on the full sample; these fitted weights are what gets saved below.
fs_svr.fit(X, labels)

from sklearn.model_selection import cross_val_predict, KFold, LeaveOneGroupOut, RepeatedKFold

# `cv` is kept as the repeated scheme because the permutation-testing cell reads it.
#cv = LeaveOneGroupOut()
cv = RepeatedKFold(n_splits=10,n_repeats=10)

# cross_val_predict needs a splitter in which every sample is in exactly one test
# fold. RepeatedKFold puts each sample in n_repeats test folds and makes sklearn
# raise "cross_val_predict only works for partitions", so a single shuffled
# 10-fold partition is used for the predictions. KFold does not use `groups`,
# so it is no longer passed.
predict_cv = KFold(n_splits=10, shuffle=True, random_state=0)
y_pred = cross_val_predict(fs_svr, X, y=labels, n_jobs=10, cv=predict_cv)

# save weights: map the SVR coefficients back to all masked voxels, then to image space
coef = svr.coef_
coef = feature_selection.inverse_transform(coef)
coef_image = masker.inverse_transform(coef)
coef_image.to_filename(output_dir + '/svrweights_' + analysis + '.nii.gz')

# Agreement between actual and cross-validated predicted labels
from scipy.stats import linregress
slope, intercept, r_val, p_val, stderr = linregress(labels, y_pred) 

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(labels, y_pred)

from scipy.stats import spearmanr
spear_r, spear_p = spearmanr(labels, y_pred)

print("prediction accuracy: %.4f / p-value: %f / MSE: %f // Spearman: %f / p-value: %f" % (r_val, p_val, mse, spear_r, spear_p))

svr_results=DataFrame()
svr_results['labels']=labels
svr_results['y_pred']=Series(y_pred,index=labels.index)
# plot the predicted versus actual values
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context='poster',style='white')
sns.lmplot(x='labels', y='y_pred',ci=None,data=svr_results)
plt.xlabel('Actual ' + analysis)
plt.ylabel('Predicted ' + analysis)
plt.savefig(output_dir + '/scatter_pred_actual_' + analysis + '_poster.svg')
plt.show()
plt.close()

# results_file was opened by the label-selection cell; write the summary and close it here.
results_file.write("Prediction accuracy r-value: %.4f / p-value: %f / MSE: %f // Spearman: %f / p-value: %f \n" % (r_val, p_val, mse, spear_r, spear_p))
results_file.write('predicted: ' + str(y_pred) + '\n')
results_file.write('actual: ' + str(labels) + '\n')

results_file.close()

In [None]:
from sklearn.model_selection import permutation_test_score
import matplotlib.pyplot as plt
from numpy import savetxt

# Permutation tests for the cross-validated MSE and R-squared of the SVR pipeline.
#
# `groups` is deliberately NOT passed to permutation_test_score. When groups are
# given, sklearn permutes the labels only among samples that share a group; with
# one row per freesurferID (assumed here -- confirm against the subject table)
# every group has a single member, so no labels would be permuted and the null
# distribution would be degenerate. RepeatedKFold does not use groups for
# splitting either.
permutation_runs = [
    # (scoring, file tag, legend label, n_permutations, results-file line)
    ('neg_mean_squared_error', 'mse', 'Mean Squared Error (pvalue %f)', 50,
     'MSE score %s (pvalue : %s) \n'),
    ## Perform permutation testing to get a p-value for r-squared
    ('r2', 'r2', 'R-squared (pvalue %f)', 500,
     'R square: %s (pvalue : %s) \n'),
]

# The context manager closes the results file even if a permutation run fails.
with open(output_dir + '/perm_results_' + analysis + '.txt','w') as results_file:
    for scoring, file_tag, legend_label, n_perm, result_line in permutation_runs:
        score, permutation_scores, pvalue = permutation_test_score(fs_svr, X, labels, scoring=scoring,
                                                                   cv=cv, n_permutations=n_perm, n_jobs=20)
        savetxt(output_dir + '/permutation_scores_' + file_tag + '_' + analysis + '.txt', permutation_scores)

        # Save a figure of the permutation scores, with the observed score as a dashed line
        plt.hist(permutation_scores, 20, label='Permutation scores',
                 edgecolor='black')
        ylim = plt.ylim()
        plt.plot(2 * [score], ylim, '--g', linewidth=3,
                 label=legend_label % pvalue)
        plt.ylim(ylim)
        plt.legend()
        plt.xlabel('Score')
        plt.savefig(output_dir + '/permutation_plot_' + file_tag + '_' + analysis + '.svg', transparent=True)
        plt.close()

        # save final pval/classifier score
        results_file.write(result_line % (score, pvalue))

### Mixed-Effects Linear Modeling

In [None]:
from nipype.pipeline.engine import Workflow, Node, MapNode
from nipype.interfaces.utility import IdentityInterface, Function
from nipype.interfaces.io import SelectFiles, DataSink, DataGrabber

# Study-specific roots; every other path in this cell is built from moochie_home.
study_home = '/home/camachocm2/Analysis/aggregate_anats'
moochie_home = '/moochie/user_data/CamachoCat/Aggregate_anats'

# Template images used as the common reference space / brain mask.
sample_template = '{0}/templates/lcbd_template_1mm.nii.gz'.format(moochie_home)
sample_template_brain = '{0}/templates/lcbd_template_1mm_brain.nii.gz'.format(moochie_home)
sample_template_mask = '{0}/templates/lcbd_template_1mm_mask.nii.gz'.format(moochie_home)

# Per-subject preprocessed inputs and the output folder for the mixed-effects models.
preproc_dir = '{0}/proc/subj_data'.format(moochie_home)
output_dir = '{0}/proc/Alex_project/MELM'.format(moochie_home)

# Subject spreadsheet (first column as index) and the list of subject identifiers.
sub_data_file = '{0}/doc/subject_info_alex.csv'.format(moochie_home)
subject_info = read_csv(sub_data_file, index_col=0)
subjects_list = subject_info['freesurferID'].tolist()

# NOTE(review): this points at the MELM folder, whereas the merge cell earlier in
# the notebook wrote gmd_combined_4.nii.gz into the SVR folder -- confirm that a
# copy exists here.
gmd_feature_data = '{0}/gmd_combined_4.nii.gz'.format(output_dir)
# Table with the transformed labels, written by the SVR section.
transf_data = '{0}/proc/Alex_project/SVR/featureset_key.csv'.format(moochie_home)

In [None]:
# Scaffold for a voxelwise model: reads the subject table, masks the brain data,
# preallocates per-term p-value arrays, and maps coefficients back to image space.
# NOTE(review): this cell is incomplete -- several names it reads are not defined
# anywhere in the visible notebook (see the notes below), and no model is fitted.
subject_dataframe = '/moochie/user_data/CamachoCat/Aggregate_anats/doc/subject_info_alex.csv'
subject_files = '/moochie/user_data/CamachoCat/Aggregate_anats/proc/group_data/mvpa/data_merged_smooth.nii.gz'

import statsmodels.formula.api as smf
from numpy import zeros_like
from pandas import DataFrame, read_csv, Series, concat
from nilearn.masking import apply_mask, unmask

# Subject table: first row is the header, first column the index.
subj_data = read_csv(subject_dataframe, header=0, index_col=0)
# NOTE(review): `func` is never assigned in the visible cells, and the only visible
# `mask` is the boolean row filter from the SVR label-selection cell, not a mask
# image. Presumably these should be subject_files and a brain-mask image -- TODO confirm.
func_data = apply_mask(func, mask)

# Load the brain data
# NOTE(review): `load` is not imported or defined in the visible code (presumably an
# image loader -- TODO confirm), and this assignment overwrites the subject table
# read into `subj_data` above.
subj_data = load(subject_files)

# Preallocate the output arrays

# for the model
# One float array per model term (intercept, age, sex), shaped like `mask`.
pval_intercept_data = zeros_like(mask).astype(float)
pval_age_data = zeros_like(mask).astype(float)
pval_sex_data = zeros_like(mask).astype(float)


# NOTE(review): `coefficients` is not defined in the visible code; the model-fitting
# step that should produce it appears to be missing.
coeff_image = unmask(coefficients, mask)

In [None]:
# Extract one signal per atlas region and put the result in a DataFrame whose
# columns are the region names.
# NiftiLabelsMasker was used here without being imported anywhere in the visible
# notebook; import it from the same nilearn module the notebook already uses.
from nilearn.input_data import NiftiLabelsMasker

# NOTE(review): `label_file`, `atlas` and `func` are not defined in the visible
# cells -- they must be set before this cell can run. This cell also rebinds
# `labels` and `masker`, which the SVR section above uses for different objects.
labels = read_csv(label_file, index_col=None)
labels=labels['region_name']

# Voxels labelled 0 in the atlas are treated as background.
masker = NiftiLabelsMasker(labels_img=atlas, background_label=0)
time_series = masker.fit_transform(func)
time_series_df = DataFrame(data=time_series, columns=labels)