# Build the Classifier
This notebook creates classifiers and runs the classification analyses of interest on neuroimaging data.

Can we accurately classify:
- adults vs. children
- condition within adults
- condition within children

In [22]:
from pandas import DataFrame, Series, concat, read_csv

# Study specific variables
# Every path below is derived from `study_home`, so pointing the notebook at a
# different copy of the study only requires changing this one line.
study_home = '/home/camachocm2/Analysis/KidVid_MVPA'
standard_mask = study_home + '/template/MNI152_T1_2mm_brain_mask_KV.nii.gz'
template = study_home + '/template/MNI152_T1_1mm_brain.nii.gz'
sub_data_file = study_home + '/doc/subjectinfo.csv'
preproc_dir = study_home + '/analysis/preproc/betas'
output_dir = study_home + '/analysis/classifier'

# condition_data: table read from conditionslist.csv (later cells use its 'labels' column).
# subject_info: table read from subjectinfo.csv (later cells use subjID, group,
# age_mos and the *_std score columns).
condition_data = read_csv(study_home + '/doc/conditionslist.csv')
subject_info = read_csv(sub_data_file)

# describe() must be the last expression of the cell for the notebook to render
# the summary table; a trailing print() would discard it.
subject_info.describe()

Unnamed: 0.1,Unnamed: 0,age_mos,age_yrs,CBCL_intern,CBCL_extern,MAPDB_temploss,CBQ_Anger_Frustration,CBQ_Activity_Level,CBQ_Approach_Positive_Anticipation,CBQ_Attentional_ Focusing,...,CBQ_Low_ Intensity_ Pleasure,CBQ_Perceptual_ Sensitivity,CBQ_Sadness,CBQ_Shyness,CBQ_Smiling_Laughter,age_mos_std,MAPDB_temploss_std,CBQ_Anger_Frustration_std,CBCL_intern_std,CBCL_extern_std
count,51.0,51.0,51.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,...,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,25.0,186.007843,15.137255,5.233333,5.366667,12.233333,4.54127,3.883444,4.994555,4.933222,...,5.258333,5.094445,3.971604,3.716889,5.377889,6.666682e-12,-3.333334e-12,-3.333319e-12,3.333327e-12,3.333339e-12
std,14.866069,119.838533,10.282061,5.969366,6.183869,7.933705,1.017132,0.981753,0.957611,1.168547,...,1.398095,1.252341,1.068094,1.450841,1.171197,1.017095,1.017095,1.017095,1.017095,1.017095
min,0.0,58.0,4.0,0.0,0.0,0.0,2.285714,2.0,2.833333,2.833333,...,0.0,0.0,0.0,0.0,0.0,-1.64883,-1.568304,-2.255473,-0.8916857,-0.8826855
25%,12.5,86.15,6.0,1.0,1.25,5.25,3.857143,3.083333,4.208333,4.041667,...,4.5625,4.666667,3.75,2.6675,5.0,-0.8812385,-0.8952583,-0.684102,-0.7212999,-0.6770911
50%,25.0,122.1,10.0,3.0,4.0,11.5,4.5,3.668333,5.0,5.0,...,5.625,5.333333,4.142857,3.916667,5.583333,-0.09481084,-0.0940128,-0.04126833,-0.3805283,-0.2247833
75%,37.5,288.0,24.0,6.75,7.75,16.75,5.178571,4.666667,5.833333,5.958333,...,6.21875,5.791667,4.678571,4.833333,6.0,0.8411323,0.5790334,0.6372783,0.2584185,0.3920001
max,50.0,528.0,44.0,24.0,31.0,31.0,6.428571,5.833333,6.67,7.0,...,7.0,7.0,5.0,5.5,6.833333,1.807685,2.405873,1.887233,3.197574,4.216057


In [23]:
## Create a conditions list for the feature set
# Builds `conditions`: the condition labels repeated once per subject, with that
# subject's ID, age group, age and questionnaire scores copied onto each of the
# subject's rows.
condition_labels = condition_data['labels'].tolist()
subjects_list = subject_info['subjID'].tolist()
age_group_list = subject_info['group'].tolist()
ages_mos_list = subject_info['age_mos'].tolist()
mapdb_temploss_list = subject_info['MAPDB_temploss_std'].tolist()
cbq_angfrust_list = subject_info['CBQ_Anger_Frustration_std'].tolist()
intern_list = subject_info['CBCL_intern_std'].tolist()
extern_list = subject_info['CBCL_extern_std'].tolist()

# (column name in `conditions`, per-subject values) in the order the columns
# should appear. Every list is indexed in the same order as subjects_list.
subject_columns = [
    ('subject', subjects_list),
    ('ageGroup', age_group_list),
    ('age', ages_mos_list),
    ('MAPDB', mapdb_temploss_list),
    ('CBQ', cbq_angfrust_list),
    ('CBCL_intern', intern_list),
    ('CBCL_extern', extern_list),
]

subject_frames = []
for a in range(len(subjects_list)):
    if a == 0:
        # The first subject's rows carry every column of condition_data.
        # Work on a copy so condition_data itself is left untouched and the
        # cell can be re-run safely.
        subject_frame = condition_data.copy()
    else:
        # Later subjects only carry the 'labels' column.
        subject_frame = DataFrame({'labels': condition_labels})

    # Assigning a scalar broadcasts it to every row of this subject's frame.
    for column, per_subject_values in subject_columns:
        subject_frame[column] = per_subject_values[a]

    subject_frames.append(subject_frame)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0, and
# growing a frame inside the loop copies all previous rows on every iteration.
conditions = concat(subject_frames, ignore_index=True)

#conditions.to_csv(output_dir + '/featureset_key.csv')
conditions.describe()

Unnamed: 0,age,MAPDB,CBQ,CBCL_intern,CBCL_extern
count,1224.0,720.0,720.0,720.0,720.0
mean,186.007843,-3.333353e-12,-3.333294e-12,3.333346e-12,3.333339e-12
std,118.70633,1.000695,1.000695,1.000695,1.000695
min,58.0,-1.568304,-2.255473,-0.8916857,-0.8826855
25%,84.5,-0.9273081,-0.684102,-0.7212999,-0.71821
50%,122.1,-0.0940128,-0.04126833,-0.3805283,-0.2247833
75%,288.0,0.6110832,0.7444172,0.3010149,0.433119
max,528.0,2.405873,1.887233,3.197574,4.216057


In [24]:
## Temporally concatenate all the parameter estimates from preproc to create a feature set
from glob import glob
from nipype.interfaces.fsl.utils import Merge

# One betas.nii.gz per subdirectory of preproc_dir, in sorted path order.
files = sorted(glob(preproc_dir + '/*/betas.nii.gz'))

# Path of the merged feature-set image; later cells load it through the masker.
bold_feature_data = output_dir + '/featureset.nii.gz'

# Configure the merge along the 't' dimension. The run() call is left
# commented out, so this cell only sets the interface up.
merge = Merge(in_files=files, dimension='t', merged_file=bold_feature_data)
#merge.run()

## Perform binary support vector classification

In [4]:
# NOTE: this cell reads `type_svm`, `mask` and `labels`, which are set by the
# "determine which analysis to run" cell; run that cell first.
if type_svm == 'binary':
    # Perform the support vector classification
    from nilearn.input_data import NiftiMasker
    from sklearn.svm import SVC
    from sklearn.feature_selection import SelectPercentile, f_classif
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import cross_val_score, LeaveOneGroupOut

    # Set up the support vector classifier: ANOVA feature selection keeping the
    # top 5 percent of features, followed by a linear SVC.
    svc = SVC(kernel='linear')
    masker = NiftiMasker(mask_img=standard_mask,standardize=True, 
                         memory='nilearn_cache', memory_level=1)
    feature_selection = SelectPercentile(f_classif, percentile=5)
    anova_svc = Pipeline([('anova', feature_selection), ('svc', svc)])

    # Run the classifier on the rows selected by `mask`
    X = masker.fit_transform(bold_feature_data)
    X = X[mask]
    maskedlabels = labels[mask]
    anova_svc.fit(X, maskedlabels)
    # Predictions on the training rows themselves (not cross-validated).
    y_pred = anova_svc.predict(X)

    # Obtain prediction values via leave-one-subject-out cross validation.
    # LeaveOneGroupOut takes no constructor arguments; the groups are supplied
    # to split(). The folds are materialised as a list so that `cv` carries the
    # subject grouping with it wherever it is reused.
    cv = list(LeaveOneGroupOut().split(X, maskedlabels,
                                       groups=conditions['subject'][mask]))
    cv_scores = cross_val_score(anova_svc, X, maskedlabels, cv=cv)
    classification_accuracy = cv_scores.mean()

    # Chance is based on the classes actually present in the analysed rows,
    # not on every value in the unmasked label column.
    chance_level = 1. / len(maskedlabels.unique())

    print("Classification accuracy: %.4f / Chance level: %f" % 
          (classification_accuracy, chance_level))

    #results_file.write("Classification accuracy: %.4f / Chance level: %f \n" % (classification_accuracy, chance_level))

NameError: name 'type_svm' is not defined

Save the SVM weights as NIfTI images to aid visualization.

In [None]:
# Map the fitted SVC coefficients back to an image: undo the feature
# selection first, then undo the masking, and write the result to disk.
coef = feature_selection.inverse_transform(svc.coef_)
weight_img = masker.inverse_transform(coef)
weight_img.to_filename(
    output_dir + '/svmweights_' + analysis + '.nii.gz'
)

In [25]:
# determine which analysis to run
# This cell defines `analysis`, `mask`, `labels` and `type_svm`, which the
# classification, weight-saving and plotting cells all read. Run it before them.
analysis = 'intern'

# Row selectors over `conditions`, shared by the analyses below.
is_adult = conditions['ageGroup'].isin(['adult'])
is_child = conditions['ageGroup'] == 'child'
is_negative = conditions['labels'] == 'negative'
is_positive = conditions['labels'] == 'positive'
is_neutral = conditions['labels'] == 'neutral'
is_any_condition = conditions['labels'].isin(['negative','positive','neutral'])

# analysis name -> (rows to include, column of `conditions` to predict, SVM type)
analysis_specs = {
    'all_conditions':   (is_any_condition,       'labels',      'binary'),
    'adults':           (is_adult,               'labels',      'binary'),
    'children':         (is_child,               'labels',      'binary'),
    'allConds_predAge': (is_any_condition,       'ageGroup',    'binary'),
    'negative':         (is_negative,            'ageGroup',    'binary'),
    'positive':         (is_positive,            'ageGroup',    'binary'),
    'neutral':          (is_neutral,             'ageGroup',    'binary'),
    # 'age' previously set no type_svm at all, leaving it undefined or stale;
    # it predicts a continuous value like the other age analyses.
    'age':              (is_child,               'age',         'nonbinary'),
    'age_neg':          (is_child & is_negative, 'age',         'nonbinary'),
    'age_pos':          (is_child & is_positive, 'age',         'nonbinary'),
    'age_neu':          (is_child & is_neutral,  'age',         'nonbinary'),
    'intern':           (is_child,               'CBCL_intern', 'nonbinary'),
    'mapdb':            (is_child,               'MAPDB',       'nonbinary'),
    'cbq':              (is_child,               'CBQ',         'nonbinary'),
    'extern':           (is_child,               'CBCL_extern', 'nonbinary'),
}

# Fail loudly on a typo instead of silently reusing mask/labels from a
# previous run of this cell.
if analysis not in analysis_specs:
    raise ValueError("Unknown analysis %r; expected one of: %s"
                     % (analysis, ', '.join(sorted(analysis_specs))))

mask, label_column, type_svm = analysis_specs[analysis]
labels = conditions[label_column]

conditions[mask].describe()
#results_file = open(output_dir + '/results_' + analysis + '.txt','w')

Unnamed: 0,age,MAPDB,CBQ,CBCL_intern,CBCL_extern
count,720.0,720.0,720.0,720.0,720.0
mean,93.013333,-3.333363e-12,-3.333341e-12,3.333334e-12,3.333353e-12
std,21.250027,1.000695,1.000695,1.000695,1.000695
min,58.0,-1.568304,-2.255473,-0.8916857,-0.8826855
25%,73.4,-0.9273081,-0.684102,-0.7212999,-0.71821
50%,91.0,-0.0940128,-0.04126833,-0.3805283,-0.2247833
75%,114.0,0.6110832,0.7444172,0.3010149,0.433119
max,131.4,2.405873,1.887233,3.197574,4.216057


## Non-binary Classification

The cells below perform non-binary classification based on age and irritability scores.

In [None]:
if type_svm == 'nonbinary':
    # Everything this cell needs, imported up front.
    from nilearn.input_data import NiftiMasker
    from scipy.stats import linregress
    from sklearn.model_selection import cross_val_predict, LeaveOneGroupOut
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVR

    # Linear support vector regression and the masker that turns the merged
    # feature-set image into a 2D array.
    masker = NiftiMasker(mask_img=standard_mask,standardize=True, 
                         memory='nilearn_cache', memory_level=1)
    svr_lin = SVR(kernel='linear', C=1)

    # Keep only the rows (and matching targets) selected for this analysis.
    X = masker.fit_transform(bold_feature_data)[mask]
    maskedlabels = labels[mask]

    # Fit on every selected row and predict those same rows (in-sample fit).
    svr_lin.fit(X, maskedlabels)
    y_lin = svr_lin.predict(X)

    # Out-of-sample predictions, holding out one subject's rows at a time.
    loso = LeaveOneGroupOut()
    pred_y = cross_val_predict(svr_lin, X, y=maskedlabels, 
                               groups=conditions['subject'][mask],cv=loso)

    # Linear regression of the cross-validated predictions on the actual
    # values; its r value is what gets reported as prediction accuracy.
    slope, intercept, r_val, p_val, stderr = linregress(maskedlabels, pred_y)

    print("prediction accuracy: %.4f / p-value: %f" % 
          (r_val, p_val))

    #results_file.write("Classification accuracy: %.4f / Chance level: %f \n" % (classification_accuracy, 1. / len(labels.unique())))

Save SVR weights as a nifti to aid in visualization.

In [None]:
# Turn the fitted SVR coefficients back into an image via the masker and
# write it to the classifier output directory.
sample_weight = svr_lin.coef_
coef_image = masker.inverse_transform(sample_weight)
coef_image.to_filename(
    output_dir + '/linsvrweights_' + analysis + '.nii.gz'
)

In [None]:
import matplotlib.pyplot as plt

# Scatter of the actual values against the cross-validated predictions,
# saved as a transparent PNG named after the current analysis.
fig, ax = plt.subplots()
ax.scatter(maskedlabels, pred_y, color='b')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
fig.savefig(output_dir + '/scatter_pred_actual_' + analysis + '.png', transparent=True)
plt.show()

MAPDB:

overall: prediction accuracy: -0.0684 / p-value: 0.066543

irr_neg: prediction accuracy: -0.0570 / p-value: 0.379246

irr_pos: prediction accuracy: -0.0880 / p-value: 0.174417

irr_neu: prediction accuracy: -0.0406 / p-value: 0.531449

CBQ

overall: prediction accuracy: 0.0132 / p-value: 0.724230

## Perform permutation testing to get a p-value for the classifier

In [None]:
from sklearn.model_selection import permutation_test_score, LeaveOneGroupOut
import matplotlib.pyplot as plt
from numpy import savetxt

# This cell applies to the binary analyses: it reads anova_svc, X and
# maskedlabels from the binary classification cell, plus conditions, mask and
# analysis from the analysis-selection cell.

# Leave-one-subject-out folds, built here so the cell does not depend on a
# `cv` object from another cell. LeaveOneGroupOut takes the groups in split(),
# not in its constructor.
perm_cv = list(LeaveOneGroupOut().split(X, maskedlabels,
                                        groups=conditions['subject'][mask]))

# Perform permutation testing to get a p-value.
# The target must be the true labels (maskedlabels), not y_pred: y_pred holds
# the classifier's own training-set predictions. The estimator is the same
# ANOVA + SVC pipeline whose cross-validated accuracy is reported above.
score, permutation_scores, pvalue = permutation_test_score(
    anova_svc, X, maskedlabels, scoring="accuracy",
    cv=perm_cv, n_permutations=500, n_jobs=30)
savetxt(output_dir + '/permutation_scores_' + analysis + '.txt', permutation_scores)

print("Classification score %s (pvalue : %s)" % (score, pvalue))

# Chance level from the classes present in the analysed rows only.
chance_level = 1. / len(maskedlabels.unique())

plt.hist(permutation_scores, 20, label='Permutation scores',
         edgecolor='black')
ylim = plt.ylim()
plt.plot(2 * [score], ylim, '--g', linewidth=3,
         label='Classification Score'
         ' (pvalue %s)' % pvalue)
plt.plot(2 * [chance_level], ylim, '--k', linewidth=3, label='Luck')

plt.ylim(ylim)
plt.legend()
plt.xlabel('Score')
plt.savefig(output_dir + '/permutation_plot_' + analysis + '.png', transparent=True)
plt.show()

# The results file is opened here because no other active cell opens it.
# Append mode keeps anything already written for this analysis, and the
# with-block guarantees the handle is closed.
with open(output_dir + '/results_' + analysis + '.txt', 'a') as results_file:
    results_file.write("Classification score %s (pvalue : %s)\n" % (score, pvalue))


In [None]:
#Shelve the results
# Stores every picklable name in the notebook namespace in a shelf file named
# after the current analysis, and reports the names that could not be stored.
import pickle
import shelve

filename = output_dir + '/' + analysis + '_shelved.out'
my_shelf = shelve.open(filename,'n') # 'n' for new

try:
    for key in dir():
        try:
            my_shelf[key] = globals()[key]
        except (TypeError, AttributeError, pickle.PicklingError):
            #
            # __builtins__, my_shelf, and imported modules can not be shelved.
            # Pickling can also fail with AttributeError or PicklingError for
            # other objects; report those instead of aborting the loop.
            #
            print('ERROR shelving: {0}'.format(key))
finally:
    # Always close the shelf, even if an unexpected error escapes the loop.
    my_shelf.close()