# Build the Classifier
This notebook is dedicated to creating classifiers and running classification analyses of interest on neuroimaging data.

In [1]:
from pandas import DataFrame, Series, read_csv

# Study specific variables
# Every path below is built by string concatenation from study_home.
study_home = '/home/camachocm2/Analysis/aggregate_anats'
preproc_dir = study_home + '/proc/subj_data'
# Mask image passed to NiftiMasker in the support vector regression cell.
standard_mask = preproc_dir + '/sample_template/lcbd_template_mask.nii.gz'
# NOTE(review): template is not referenced by any other cell visible in this notebook.
template = preproc_dir + '/sample_template/lcbd_template0.nii.gz'
# CSV with the per-subject table; its first column becomes the DataFrame index below.
sub_data_file = study_home + '/results/feature_all_data_20190320.csv'
# Alternative feature images are kept commented out; only the last assignment is active.
#brain_feature_data = study_home + '/proc/group_data/mvpa/data_merged_smooth.nii.gz'
#brain_feature_data = study_home + '/proc/group_data/mvpa/fullCT_resids.nii.gz'
brain_feature_data = study_home + '/proc/group_data/mvpa/resids_CT_withage.nii.gz'
# Directory that receives the result text files, weight images and figures.
output_dir = study_home + '/proc/group_data/mvpa'

# Load the subject table and show summary statistics as the cell output.
subject_info = read_csv(sub_data_file, index_col=0)
subject_info.describe()

Unnamed: 0,Insula_f3,LIPL_f3,MFG_f3,RIFG_f1,eTIV,Left-Putamen,Right-Putamen,Left-Pallidum,Right-Pallidum,Left-Caudate,...,CBCL_internalizing_std,CBCL_externalizing_std,res_putamen,res_pall,res_caud,res_nacc,RSPS_f1,RMFG_f3,RIPL_F3,LOFC_f3
count,134.0,134.0,134.0,134.0,134.0,128.0,128.0,130.0,132.0,127.0,...,123.0,123.0,118.0,118.0,118.0,118.0,134.0,134.0,134.0,134.0
mean,1.537452,2.770503,3.705037,2.666904,1481158.0,5298.224219,5337.238281,2014.197692,1927.068939,3834.638583,...,-0.059289,-0.034229,0.022736,0.036025,0.005033,0.019501,2.7826,2.606423,2.444999,1.537452
std,0.245681,0.421733,0.440648,0.45098,146961.0,710.34156,634.935797,236.337617,249.043854,525.245559,...,1.005467,1.016694,0.681155,0.685004,0.724816,0.753716,0.714813,0.44968,0.326264,0.245681
min,1.016464,1.34123,2.764317,1.728269,1142335.0,3556.1,3940.8,1520.6,1396.6,2531.1,...,-1.661238,-1.235337,-1.794073,-1.621015,-1.429075,-1.358056,1.381433,1.163695,1.214467,1.016464
25%,1.35052,2.491446,3.362829,2.355866,1383378.0,4844.575,4912.5,1835.6,1753.5,3528.1,...,-0.803833,-1.023681,-0.448299,-0.479548,-0.557407,-0.500544,2.261016,2.311487,2.235609,1.35052
50%,1.517579,2.799807,3.751267,2.615858,1490017.0,5286.85,5295.4,1995.65,1907.4,3812.5,...,-0.191401,-0.247607,0.035165,0.080088,-0.008184,-0.05475,2.671038,2.605209,2.421827,1.517579
75%,1.716242,3.053427,4.037119,2.978805,1581896.0,5740.825,5818.7,2187.95,2104.875,4080.7,...,0.543517,0.59902,0.452938,0.534231,0.496811,0.404827,3.267962,2.853014,2.665709,1.716242
max,2.11239,4.005555,4.793296,4.017766,1975068.0,7658.5,7436.3,2712.7,2570.8,5572.7,...,2.421642,2.292272,1.446386,2.006917,1.638915,2.931471,4.721301,3.849567,3.416235,2.11239


## Step 1: Create feature set and labels

In [2]:
# determine which analysis to run
analysis = 'Factor1'
import numpy as np

# Each branch is expected to define:
#   mask     - boolean Series selecting the subjects with a usable label
#   labels   - the column of subject_info to predict
#   type_svm - 'binary' runs the classification cells, 'nonbinary' the regression cells
if analysis == 'Factor1':
    # NaN never compares equal to anything, so `column != np.nan` is True for
    # every row and filters nothing; notna() is the actual missing-value test.
    mask = subject_info['Factor1'].notna()
    labels = subject_info['Factor1']
    type_svm = 'nonbinary'
elif analysis == 'age':
    mask = subject_info['Age_yrs'] >0
    labels = subject_info['Age_yrs']
    type_svm = 'nonbinary'
elif analysis == 'extern':
    mask = subject_info['CBCL_externalizing']>0
    labels = subject_info['CBCL_externalizing']
    type_svm = 'nonbinary'
elif analysis=='age_factor1':
    # NOTE(review): this branch only defines the mask. labels and type_svm are
    # not set here, so on a fresh kernel the cell fails below with a NameError,
    # and otherwise it silently reuses values left over from a previous run.
    mask = subject_info['Factor1'].notna()
else:
    # Fail loudly instead of reusing a stale mask/labels from an earlier run.
    raise ValueError('Unknown analysis: %r' % analysis)

# The results file stays open; the analysis cells below write to it and close it.
results_file = open(output_dir + '/results_' + analysis + '_withagereal.txt','w')
out_file = output_dir + '/svrweights_' + analysis + '_withagereal.nii.gz'
labels[mask].describe()

count    134.000000
mean      -0.039009
std        0.943274
min       -1.962494
25%       -0.666022
50%       -0.081934
75%        0.680102
max        1.926131
Name: Factor1, dtype: float64

## Support Vector Classification

The cells below perform categorical classification.

In [None]:
# Categorical (binary) pipeline: feature selection + linear SVC, leave-one-group-out
# cross-validation, weight map export, per-label metrics and a confusion matrix figure.
# NOTE(review): gm_mask and conditions are not defined in any cell visible in this
# notebook, and none of the analyses in Step 1 sets type_svm to 'binary' - confirm
# where these names come from before enabling this path.
if type_svm == 'binary':
    # Perform the support vector classification
    from nilearn.input_data import NiftiMasker
    from sklearn.svm import SVC
    from sklearn.feature_selection import f_classif, SelectPercentile
    from sklearn.pipeline import Pipeline

    # Set up the support vector classifier
    svc = SVC(kernel='linear')
    masker = NiftiMasker(mask_img=gm_mask,standardize=True, 
                         memory='nilearn_cache', memory_level=1)
    
    # Select the features contributing to the model
    feature_selection = SelectPercentile(f_classif, percentile=5) #0.05/228453 voxels
    fs_svc = Pipeline([('feat_select', feature_selection), ('svc', svc)])

    # Run the classifier
    # The image is converted to a samples-by-features array, then restricted to the
    # rows selected by mask so that X and maskedlabels stay aligned.
    X = masker.fit_transform(brain_feature_data)
    X = X[mask]
    maskedlabels=labels[mask]
    # This fit on all selected samples is what provides svc.coef_ for the weight map below.
    fs_svc.fit(X, maskedlabels)
    
    # Obtain prediction values via cross validation
    from sklearn.model_selection import cross_validate, LeaveOneGroupOut, cross_val_predict

    # Folds are defined by the 'subject' column of conditions: one group is held out per fold.
    loso = LeaveOneGroupOut()
    cv_scores = cross_validate(fs_svc, X, y=maskedlabels, n_jobs=10, return_train_score=True,
                               groups=conditions['subject'][mask], cv=loso, scoring='accuracy')
    y_pred = cross_val_predict(fs_svc, X, y=maskedlabels, n_jobs=10,
                               groups=conditions['subject'][mask], cv=loso)
    
    ## Save the SVM weights to a nifti
    # Weights are mapped back through the feature selection and then through the masker.
    coef = svc.coef_
    coef = feature_selection.inverse_transform(coef)
    weight_img = masker.inverse_transform(coef)
    weight_img.to_filename(output_dir + '/svmweights_'+ analysis +'.nii.gz')
    
    ## Calculate performance metrics
    from sklearn.metrics import recall_score, precision_score
    
    classification_accuracy = cv_scores['test_score'].mean()
    # Chance is 1 / number of distinct values in labels (the unmasked column, not maskedlabels).
    chance = 1. / len(labels.unique())
    print("Classification accuracy: %.4f / Chance level: %f" % 
          (classification_accuracy, chance))
    
    # One results line per label, with recall and precision restricted to that label.
    for label in maskedlabels.unique():
        sensitivity = recall_score(maskedlabels,y_pred,labels=[label],average='weighted')
        precision = precision_score(maskedlabels,y_pred,labels=[label],average='weighted')
        
        results_file.write("%s: classification accuracy: %.4f \n chance level: %f \n sensitivity: %f \n precision: %f \n" % 
        (label, classification_accuracy, chance, sensitivity, precision))
    
    # compute and display a confusion matrix
    from sklearn.metrics import confusion_matrix
    from numpy import set_printoptions
    import itertools
    import matplotlib.pyplot as plt

    cnf_matrix = confusion_matrix(maskedlabels, y_pred)
    set_printoptions(precision=2)
    classes = maskedlabels.unique()

    def plot_confusion_matrix(cm, classes):
        """Draw the confusion matrix cm on the current matplotlib figure.

        cm is shown as an image with a colorbar, classes supplies the tick
        labels on both axes, and every cell is annotated with its count.
        """
        from numpy import arange
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.title('Confusion matrix')
        plt.colorbar()
        tick_marks = arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45, size=16)
        plt.yticks(tick_marks, classes, size=16)

        # Cells above half the maximum count get white text, the rest black.
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j],  'd'),
                     horizontalalignment='center',
                     color='white' if cm[i, j] > thresh else 'black', size=16)

        plt.tight_layout()
        plt.ylabel('True label', size=16)
        plt.xlabel('Predicted label', size=16)

    plot_confusion_matrix(cnf_matrix, classes)
    plt.savefig(output_dir + '/confusion_matrix_' + analysis + '.svg', transparent=True)
    plt.close()
    
    # The results file opened in Step 1 is closed here; later writes need a new handle.
    results_file.close()

### Perform permutation testing to get a p-value for the classifier

In [None]:
'''
N.B.: In order to use the function below (permutation_test_score) for this particular analysis,
I added the following code to the _shuffle function (which starts on line 966 of sklearn/model_selection/_validation.py):

     elif permute_groups==True:
        indices = np.arange(len(groups))
        indices = random_state.permutation(indices)

and I added an argument to the permutation_test_score function (permute_groups=True) that is passed to the _shuffle function. 
This enables groups to be used for cross validation, but ignores groups for permutation. This is useful if you have multiple
features per subject with the same label and you are using grouping to denote a whole subject for LOSO cross validation.
'''

In [None]:
from sklearn.model_selection import permutation_test_score
import matplotlib.pyplot as plt
from numpy import savetxt

# A fresh results file is opened for the permutation results regardless of type_svm;
# only the binary branch below writes to it and closes it in this cell.
permut_results_path = output_dir + '/permut_results_' + analysis + '_final.txt'
results_file = open(permut_results_path, 'w')

if type_svm == 'binary':
    # Permutation test for the cross-validated accuracy of the classifier.
    # permute_groups is the extra argument described in the note above this cell.
    subject_groups = conditions['subject'][mask]
    score, permutation_scores, pvalue = permutation_test_score(
        fs_svc, X, maskedlabels,
        groups=subject_groups, cv=loso, scoring='accuracy',
        n_permutations=500, n_jobs=10, permute_groups=True)
    savetxt(output_dir + '/permutation_scores_' + analysis + '.txt', permutation_scores)

    # The same summary line is printed and written to the results file.
    summary_line = "Classification score %s (pvalue : %s)" % (score, pvalue)
    print(summary_line)

    # Histogram of the permuted scores with the observed score and chance level marked.
    chance_level = 1. / len(labels.unique())
    plt.hist(permutation_scores, bins=20, edgecolor='black',
             label='Permutation scores')
    ylim = plt.ylim()
    plt.plot([score, score], ylim, '--g',
             label='Classification Score (pvalue %f)' % pvalue, linewidth=3)
    plt.plot([chance_level, chance_level], ylim, '--k', label='Luck', linewidth=3)
    plt.ylim(ylim)
    plt.legend()
    plt.xlabel('Score')
    plt.savefig(output_dir + '/permutation_plot_' + analysis + '.svg', transparent=True)
    plt.close()

    # Record the final score and p-value, then release the file.
    results_file.write(summary_line)
    results_file.close()

## Support Vector Regression

The cells below perform continuous prediction (i.e., they predict a continuous variable) for the label selected in Step 1 (e.g., age).

In [3]:
# Continuous pipeline: feature selection + linear SVR, leave-one-group-out predictions,
# weight map export, agreement metrics, a predicted-vs-actual scatter plot and a
# results file entry.
if type_svm == 'nonbinary':
    # Perform the support vector regression
    from nilearn.input_data import NiftiMasker
    from sklearn.feature_selection import f_regression, SelectPercentile
    from sklearn.svm import SVR
    from sklearn.pipeline import Pipeline

    # Set up the regression
    svr = SVR(kernel='linear', C=1)
    masker = NiftiMasker(mask_img=standard_mask,standardize=True,
                         memory='nilearn_cache', memory_level=1)

    feature_selection = SelectPercentile(f_regression, percentile=5)
    fs_svr = Pipeline([('feat_select', feature_selection), ('svr', svr)])

    # Run the regression
    # Rows of X are restricted with the same mask as the labels so they stay aligned.
    X = masker.fit_transform(brain_feature_data)
    X = X[mask]
    maskedlabels=labels[mask]
    # This fit on all selected subjects provides svr.coef_ for the weight map below.
    fs_svr.fit(X, maskedlabels)

    from sklearn.model_selection import cross_val_predict, LeaveOneGroupOut

    # Folds are defined by freesurferID: one group is held out per fold.
    loso = LeaveOneGroupOut()
    y_pred = cross_val_predict(fs_svr, X, y=maskedlabels, n_jobs=20,
                               groups=subject_info['freesurferID'][mask],cv=loso)
    # save weights
    coef = svr.coef_
    coef = feature_selection.inverse_transform(coef)
    coef_image = masker.inverse_transform(coef)
    coef_image.to_filename(out_file)

    # Take an explicit copy: adding a column to a boolean-indexed selection of
    # subject_info is chained assignment and triggers SettingWithCopyWarning.
    results_df = subject_info[mask].copy()
    results_df['pred' + analysis] = Series(y_pred, index=results_df.index)

    from scipy.stats import linregress
    slope, intercept, r_val, p_val, stderr = linregress(maskedlabels, y_pred)

    from sklearn.metrics import mean_squared_error
    mse = mean_squared_error(maskedlabels, y_pred)

    from scipy.stats import spearmanr
    spear_r, spear_p = spearmanr(maskedlabels, y_pred)

    print("prediction accuracy: %.4f / p-value: %f / MSE: %f // Spearman: %f / p-value: %f" % (r_val, p_val, mse, spear_r, spear_p))

    # plot the predicted versus actual values
    import matplotlib.pyplot as plt
    plt.scatter(maskedlabels, y_pred, color='b')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.savefig(output_dir + '/scatter_pred_actual_mean_' + analysis + '_final.svg', transparent=True)
    plt.show()
    plt.close()

    # If an earlier run of this cell already closed the handle, reopen the same
    # file in append mode instead of failing after all the computation is done.
    if results_file.closed:
        results_file = open(results_file.name, 'a')
    try:
        results_file.write("MEAN prediction accuracy r-value: %.4f / p-value: %f / MSE: %f // Spearman: %f / p-value: %f \n" % (r_val, p_val, mse, spear_r, spear_p))
        results_file.write('predicted: ' + str(y_pred) + '\n')
        # str() of a long Series is truncated by the pandas display settings;
        # to_string() writes every actual value to the results file.
        results_file.write('actual: ' + maskedlabels.to_string() + '\n')
    finally:
        # Close the handle even if a write fails.
        results_file.close()

prediction accuracy: -0.2119 / p-value: 0.013982 / MSE: 74.257216 // Spearman: -0.054901 / p-value: 0.528665


<Figure size 640x480 with 1 Axes>

### Perform permutation testing

In [None]:
# Permutation tests for the regression pipeline, once scored by negative mean squared
# error and once by r2. Each run saves the permuted scores, a histogram figure and a
# line in the results file.
if type_svm == 'nonbinary':
    # Imported here as well so this cell does not rely on the binary permutation
    # cell having been executed first.
    from sklearn.model_selection import permutation_test_score
    import matplotlib.pyplot as plt
    from numpy import savetxt

    # The regression cell closes results_file when it finishes, and writing to a
    # closed handle raises ValueError, so reopen the same file in append mode.
    if results_file.closed:
        results_file = open(results_file.name, 'a')

    # (scoring name, file tag, legend label format, results line format)
    permutation_specs = [
        ('neg_mean_squared_error', 'mse',
         'Mean Squared Error (pvalue %f)', 'MSE score %s (pvalue : %s) \n'),
        ('r2', 'r2',
         'R-squared (pvalue %f)', 'R square: %s (pvalue : %s) \n'),
    ]

    try:
        for scoring, tag, legend_fmt, result_fmt in permutation_specs:
            ## Perform permutation testing to get a p-value for this scoring metric
            score, permutation_scores, pvalue = permutation_test_score(
                fs_svr, X, maskedlabels, scoring=scoring,
                cv=loso, n_permutations=1000, n_jobs=20,
                groups=subject_info['freesurferID'][mask])
            savetxt(output_dir + '/permutation_scores_' + tag + '_' + analysis + '.txt',
                    permutation_scores)

            # Save a figure of the permutation scores with the observed score marked
            plt.hist(permutation_scores, 20, label='Permutation scores',
                     edgecolor='black')
            ylim = plt.ylim()
            plt.plot(2 * [score], ylim, '--g', linewidth=3,
                     label=legend_fmt % pvalue)
            plt.ylim(ylim)
            plt.legend()
            plt.xlabel('Score')
            plt.savefig(output_dir + '/permutation_plot_' + tag + '_' + analysis + '.svg',
                        transparent=True)
            plt.close()

            # save final pval/score for this metric
            results_file.write(result_fmt % (score, pvalue))
    finally:
        # Close the handle even if one of the permutation runs fails.
        results_file.close()

In [None]:
from glob import glob
from os.path import basename
from numpy import mean, std, loadtxt


def permutation_analysis_name(path):
    """Return the analysis tag embedded in a permutation-score file name.

    The leading 'permutation_scores_' and the trailing '.txt' are stripped from
    the base name of path, so the result does not depend on how long the
    directory part of the path is.
    """
    prefix, suffix = 'permutation_scores_', '.txt'
    name = basename(path)
    if name.startswith(prefix):
        name = name[len(prefix):]
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    return name


# Print the mean and standard deviation of every saved permutation-score file.
files = glob('/home/camachocm2/Analysis/KidVid_MVPA/analysis/classifier/final_SVM_linear_5percent/permutation_scores_*.txt')
# Sorted so the report order is the same on every run.
for file in sorted(files):
    # A separate name is used so the notebook-level `analysis` setting is not overwritten.
    analysis_name = permutation_analysis_name(file)
    scores = loadtxt(file)
    print(analysis_name + ' average = ' + str(mean(scores)) + ', SD = ' + str(std(scores)))