In [1]:
import os
import copy
import time
import math
import random
import itertools
import scipy.io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from IPython.display import clear_output

In [2]:
def df_to_arr(df):
    
    vals = []
    for _, row in df.iterrows():
        vals.extend(row.tolist())
    return np.array([x for x in vals if str(x) != 'nan'])

In [3]:
def get_subjects(path):
    
    '''
    Gets a list of subject IDs and the file suffix, given a path to the data files. 
    
    Note: subject ID must be only 2 characters for this to work, and all data files
    must have same suffix.
    
    Parameters
    ----------
    path: str
        directory to the data files
        
    Returns
    -------
    list
        a list of subject IDs
    str
        the suffix to the filenames
    '''
    
    files = os.listdir(path)
    subjects = [f[:2] for f in files]
    suffix = files[0][2:]
        
    subjects.sort()
    
    return subjects, suffix

In [4]:
def scramble_labels(y_data):
    
    '''
    Randomly selects half of the labels in the data to switch to the other class.
    
    Parameters
    ----------
    y_data: array-like
        label data to scramble
    '''
    
    classes = list(set(y_data))
    classes.sort()
    
    y_data_copy = y_data.copy()
    
    labels_0 = [i for i, x in enumerate(y_data) if x == classes[0]]
    labels_1 = [i for i, x in enumerate(y_data) if x == classes[1]]
    to_change = random.sample(labels_0, k=len(labels_0)//2)
    to_change.extend(random.sample(labels_1, k=len(labels_1)//2))
    
    for index in to_change:
        if y_data[index] == classes[0]:
            y_data[index] = classes[1]
        else:
            y_data[index] = classes[0]
    

In [28]:
def get_optimal_run(x_train, y_train, x_test, y_test, kernels, gamma_range, C_range):
    
    '''
    Gets best hyperparameters (kernel, C, and gamma values) that optimize SVM's predictions for given
    x and y test dataset.
    
    Parameters
    ----------
    x_train: array-like
        dataset of block data used to train classifier
    y_train: array-like
        dataset of label data used to train classifier
    x_test: array-like
        testing dataset of block data used to optimize hyperparameters on
    y_test: array-like
        testing dataset of label data used to optimize hyperparameters on
    kernels: list
        kernels to test (recommended options are 'linear', 'rbf', and 'sigmoid')
    gamma_range: dict
        dict that specifies the range of values of gamma to test; should include start, stop to range,
        num of values, and the exponential base
    C_range: dict
        dict that specifies the range of values of C to test; should include start, stop to range,
        num of values, and the exponential base
        
    Returns
    -------
    dict
        best combination of parameters found from grid search
    float
        best accuracy obtained from testing
    '''
    
    gamma_vals = np.logspace(gamma_range['start'], gamma_range['stop'], gamma_range['num'], base=gamma_range['base'])
    C_vals = np.logspace(C_range['start'], C_range['stop'], C_range['num'], base=C_range['base'])

    param_grid = ParameterGrid({'kernel': kernels, 'gamma': gamma_vals, 'C': C_vals})
    
    best_acc = 0
    best_params = None
    
    # Tests each parameter combination to find best one for given testing data
    for params in list(param_grid):
        
        svclassifier = SVC(kernel=params['kernel'], gamma=params['gamma'], C=params['C'], max_iter=-1)
        svclassifier.fit(x_train, y_train)
        
        curr_acc = svclassifier.score(x_test, y_test)
        
        if curr_acc > best_acc:
            best_acc = curr_acc
            best_params = params
            
    return best_params, best_acc

# Training Within Subjects

In [35]:
def extract_tr_subject_data(path, subject, suffix, roi, conds):
    
    '''
    Extracts individual subject data from the .mat files.
    
    Parameters
    ----------
    path: str
        directory to data files
    subject: str
        ID of subject to load data for
    suffix: str
        ending suffix of the data filename
    roi: int
        0 for V1 data, 1 for MT data
    conds: list
        list of integers specifying the conditional datasets to extract
        (0 for trained_cp, 1 for trained_ip, 2 for untrained_cp, 3 for untrained_ip)
        
    Returns
    -------
    Lists of voxel data (x_data) separated by individual TRs and the corresponding labels (y_data)
    '''
    
    x_data = []
    y_data = []
    
    path_to_file = path + subject + suffix
    mat = scipy.io.loadmat(path_to_file)['roi_scanData'][0][roi]
    scan_indices = []
    
    # Extract all TR data from all blocks from all scans
    for scan in range(len(mat[0])):

        for cond in conds:
            
            for block in range(len(mat[0][scan][0][cond][0])):
                
                block_x_data = []
                
                for tr in range(len(mat[0][scan][0][cond][0][block][0])):

                    tr_data = mat[0][scan][0][cond][0][block][0][tr][0][0][0].tolist()
                    block_x_data.extend(tr_data)
                    
                x_data.append(block_x_data)
                y_data.append(mat[0][scan][1][cond][0].replace('_post', ''))
                
        scan_indices.append(len(x_data))
    
    # MinMaxScaler scales each feature to values between 0 and 1 among all x data
    print(len(x_data))
    for i in x_data:
        print(len(i))
    
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_data = scaler.fit_transform(x_data)        
    
    x_data_by_scan = {}
    y_data_by_scan = {}
    for scan_id, idx in enumerate(scan_indices):
        if idx == scan_indices[0]:
            x_data_by_scan[scan_id] = x_data[0:idx]
            y_data_by_scan[scan_id] = y_data[0:idx]
        else:
            x_data_by_scan[scan_id] = x_data[scan_indices[scan_id-1]:idx]
            y_data_by_scan[scan_id] = y_data[scan_indices[scan_id-1]:idx]
    
    return x_data_by_scan, y_data_by_scan

In [14]:
def train_within_subjects_combined(data_params, grid_params, runs=50, scramble=False):
    
    '''
    Trains and tests the classifier for accuracy using SVMs. Combines post-training and pre-training
    data for training and inner testing of SVM for comparison purposes.
    
    Parameters
    ----------
    data_params: dict
        path_pre: str
            the path to the pre-training data files
        path_post: str
            the path to the post-training data files
        roi: int
            0 for V1 data, 1 for MT data
        conds: list
            list of integers specifying the conditional datasets to extract
            (0 for trained_cp, 1 for trained_ip, 2 for untrained_cp, 3 for untrained_ip)   
    grid_params: dict
        kernels: list
            kernels to test (recommended options are 'linear', 'rbf', and 'sigmoid')
        gamma: dict
            dict that specifies the range of values of gamma to test; should include start, stop to range,
            num of values, and the exponential base
        C: dict
            dict that specifies the range of values of C to test; should include start, stop to range,
            num of values, and the exponential base
    runs: int
        number of runs to test on for each subject
    scramble: boolean, optional
        whether or not to scramble the labels when training, 
        default is False
        
    Returns
    -------
    DataFrame
        data of inner subject combination testing accuracy
    DataFrame
        data of outer pre-training subject testing accuracy
    DataFrame
        data of outer post-training subject testing accuracy
    '''
    
    subjects, suffix_post = get_subjects(data_params['path_post'])
    _, suffix_pre = get_subjects(data_params['path_pre'])
    
    inner_result = {}
    outer_pre_result = {}
    outer_post_result = {}
    
    for subject in subjects:
        
        inner_result[subject] = []
        outer_pre_result[subject] = []
        outer_post_result[subject] = []
        
        print(f"Currently on subject {subject}.")
        
        x_data_pre, y_data_pre = extract_tr_subject_data(data_params['path_pre'], subject, suffix_pre, roi, conds)
        x_data_post, y_data_post = extract_tr_subject_data(data_params['path_post'], subject, suffix_post, roi, conds)
        
        x_data = x_data_pre.copy()
        y_data = y_data_pre.copy()
        for x_val, y_val in zip(x_data_post.values(), y_data_post.values()):
            x_data[len(x_data)] = x_val
            y_data[len(y_data)] = y_val
        
        scans = x_data.keys()
        for outer_scan in scans:
            
            inner_scans = [s for s in scans if s != outer_scan]
            
            opt_inner_acc = -1
            opt_inner_params = None
            for inner_scan in inner_scans:
                
                x_train, y_train, x_test, y_test = [], [], [], []
                for scan in scans:
                    if scan == inner_scan:
                        x_test.extend(x_data[scan])
                        y_test.extend(y_data[scan])
                    elif scan != outer_scan:
                        x_train.extend(x_data[scan])
                        y_train.extend(y_data[scan])
            
                opt_params, inner_acc = get_optimal_run(x_train, y_train, x_test, y_test, grid_params['kernels'], grid_params['gamma'], grid_params['C']) 
                if inner_acc > opt_inner_acc:
                    opt_inner_acc = inner_acc
                    opt_inner_params = opt_params
                    
                inner_result[subject].append(inner_acc)
                    
            x_train, y_train, x_test, y_test = [], [], [], []
            for scan in scans:
                if scan == outer_scan:
                    x_test.extend(x_data[scan])
                    y_test.extend(y_data[scan])
                else:
                    x_train.extend(x_data[scan])
                    y_train.extend(y_data[scan])
                    
            svclassifier = SVC(kernel=opt_inner_params['kernel'], gamma=opt_inner_params['gamma'], C=opt_inner_params['C'], max_iter=-1)
            svclassifier.fit(x_train, y_train)
            outer_acc = svclassifier.score(x_test, y_test)
            outer_result[subject].append(outer_acc)
            
    end_time = time.time()
    exec_time = end_time - start_time
    minutes = exec_time // 60
    print(f"Last turn took {round(minutes, 3)} minutes.")

    return inner_result, outer_result
    

In [30]:
def train_within_subjects(data_params, grid_params, scramble=False):
    
    '''
    Trains and tests the classifier for accuracy using SVMs.
    
    Parameters
    ----------
    data_params: dict
        path: str
            the path to the data files
        roi: int
            0 for V1 data, 1 for MT data
        conds: list
            list of integers specifying the conditional datasets to extract
            (0 for trained_cp, 1 for trained_ip, 2 for untrained_cp, 3 for untrained_ip)   
    grid_params: dict
        kernels: list
            kernels to test (recommended options are 'linear', 'rbf', and 'sigmoid')
        gamma: dict
            dict that specifies the range of values of gamma to test; should include start, stop to range,
            num of values, and the exponential base
        C: dict
            dict that specifies the range of values of C to test; should include start, stop to range,
            num of values, and the exponential base
    runs: int
        number of runs to test on for each subject
    scramble: boolean, optional
        whether or not to scramble the labels when training, 
        default is False
        
    Returns
    -------
    DataFrame
        data of inner subject combination testing accuracy
    DataFrame
        data of outer subject testing accuracy
    '''
    
    subjects, suffix = get_subjects(data_params['path'])
    inner_result = {}
    outer_result = {}
    
    start_time = time.time()
    subjects = ['CG']
    for subject in subjects:
        
        inner_result[subject] = []
        outer_result[subject] = []
        
        print(f"Currently on subject {subject}.")
        
        x_data, y_data = extract_tr_subject_data(path, subject, suffix, roi, conds)
        scans = x_data.keys()
        for outer_scan in scans:
            
            inner_scans = [s for s in scans if s != outer_scan]
            
            opt_inner_acc = -1
            opt_inner_params = None
            for inner_scan in inner_scans:
                
                x_train, y_train, x_test, y_test = [], [], [], []
                for scan in scans:
                    if scan == inner_scan:
                        x_test.extend(x_data[scan])
                        y_test.extend(y_data[scan])
                    elif scan != outer_scan:
                        x_train.extend(x_data[scan])
                        y_train.extend(y_data[scan])
            
                opt_params, inner_acc = get_optimal_run(x_train, y_train, x_test, y_test, grid_params['kernels'], grid_params['gamma'], grid_params['C']) 
                if inner_acc > opt_inner_acc:
                    opt_inner_acc = inner_acc
                    opt_inner_params = opt_params
                    
                inner_result[subject].append(inner_acc)
                    
            x_train, y_train, x_test, y_test = [], [], [], []
            for scan in scans:
                if scan == outer_scan:
                    x_test.extend(x_data[scan])
                    y_test.extend(y_data[scan])
                else:
                    x_train.extend(x_data[scan])
                    y_train.extend(y_data[scan])
                    
            svclassifier = SVC(kernel=opt_inner_params['kernel'], gamma=opt_inner_params['gamma'], C=opt_inner_params['C'], max_iter=-1)
            svclassifier.fit(x_train, y_train)
            outer_acc = svclassifier.score(x_test, y_test)
            outer_result[subject].append(outer_acc)
            
    end_time = time.time()
    exec_time = end_time - start_time
    minutes = exec_time // 60
    print(f"Last turn took {round(minutes, 3)} minutes.")

    return inner_result, outer_result
    

In [36]:
path = r'scans/output/PRE/'
roi = 1                            # V1-roi: 0, MT-roi: 1
conds = [0, 2]                     # trained_cp: 0, trained_ip: 1, untrained_cp: 2, untrained_ip: 3

gamma_range = {'start': -15, 'stop': 3, 'num': 19, 'base': 2.0}
C_range = {'start': -3, 'stop': 15, 'num': 19, 'base': 2.0}
kernels = ['rbf', 'sigmoid']

data_params = {'path': path, 'roi': roi, 'conds': conds}
grid_params = {'gamma': gamma_range, 'C': C_range, 'kernels': kernels}

inner_accs, outer_accs = train_within_subjects(data_params, grid_params)

Currently on subject CG.
12
624
624
702
702
624
624
624
624
624
624
702
702


ValueError: setting an array element with a sequence.

In [75]:
outer_acc_report

{'CC': [0.0,
  0.9,
  0.0,
  0.0,
  0.5,
  0.0,
  0.0,
  0.0,
  0.0,
  0.6,
  0.0,
  0.0,
  0.0,
  0.5,
  0.0,
  0.0],
 'GD': [0.875, 0.0, 0.0, 0.0, 0.0, 0.75, 0.0, 0.0, 0.25, 0.75, 0.0, 0.0],
 'JM': [0.0, 0.75, 0.0, 0.0, 0.0, 0.75, 0.0, 0.0, 0.75, 0.0, 0.0, 0.0],
 'JS': [0.0,
  0.0,
  0.1111111111111111,
  0.6666666666666666,
  0.0,
  0.0,
  0.1111111111111111,
  0.6666666666666666,
  0.0,
  0.0,
  0.0,
  0.3333333333333333],
 'NL': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'RK': [0.0,
  0.875,
  0.0,
  1.0,
  0.375,
  0.0,
  0.375,
  0.0,
  0.625,
  0.25,
  0.125,
  0.125],
 'SC': [0.0,
  0.6,
  0.0,
  0.0,
  0.0,
  0.2,
  0.0,
  0.0,
  0.6,
  0.4,
  0.0,
  0.0,
  1.0,
  0.2,
  0.0,
  0.0],
 'YY': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0]}

In [24]:
path_pre = r'scans/output/PRE/'
path_post = r'scans/output/cp&ip/'
roi = 0                            # V1-roi: 0, MT-roi: 1
conds = [0, 2]                     # trained_cp: 0, trained_ip: 1, untrained_cp: 2, untrained_ip: 3

gamma_range = {'start': -15, 'stop': 3, 'num': 19, 'base': 2.0}
C_range = {'start': -3, 'stop': 15, 'num': 19, 'base': 2.0}
kernels = ['rbf', 'sigmoid']

data_params = {'path_pre': path_pre, 'path_post': path_post, 'roi': roi, 'conds': conds}
grid_params = {'gamma': gamma_range, 'C': C_range, 'kernels': kernels}

inner_acc_report, outer_acc_report_pre, outer_acc_report_post = train_within_subjects_combined(data_params, grid_params, runs=200)

outer_acc_report_pre['Average'] = outer_acc_report_pre.mean(axis=1)
outer_acc_report_pre.to_csv('output/post_cp/outer_accs_within_pre.csv')
outer_acc_report_post['Average'] = outer_acc_report_post.mean(axis=1)
outer_acc_report_post.to_csv('output/post_cp/outer_accs_within_post.csv')
inner_acc_report['Average'] = inner_acc_report.mean(axis=1)
inner_acc_report.to_csv('output/post_cp/inner_accs_within_combined.csv')


## Permutation Runs

In [None]:
def permutation_within_subjects(data_params, grid_params, inner_dist, outer_dist, runs=50, train_runs=100, history=True):
    
    '''
    Performs a specified number of runs where data labels are scrambled.
    
    Parameters
    ----------
    data_params: dict
        contains specifications for data processing (see train method for documentation)
    grid_params: dict
        contains values for grid search (see train method for documentation)
    inner_dist: list
        holds accuracy values for individual inner subject tests
    outer_dist: list
        holds accuracy values for individual outer subject tests
    runs: int
        number of runs to perform, default is 50
    train_runs: int
        number of runs to train on each subject, default is 100
    history: boolean
        whether to track accuracy over runs and output permutation accuracy plot, 
        default is True
    '''
    
    subjects, suffix = get_subjects(data_params['path'])
    if history:
        outer_sample_means = []
        for i in range(len(outer_dist)//(len(subjects)*train_runs)):
            outer_sample_means.append(np.mean(outer_dist[i*len(subjects)*train_runs:(i+1)*len(subjects)*train_runs]))
        
        x = [i for i in range(1, len(outer_sample_means)+1)]
        if len(outer_sample_means) > 0:
            y = [outer_sample_means[0]]
            for i in range(2, len(outer_sample_means)+1):
                y.append(np.mean(outer_sample_means[:i]))
        else:
            y = []
        
    for n in range(runs):
        print(f'On run #{n+1} of {runs}.')
        inner_accs, outer_accs = train_within_subjects(data_params, grid_params, runs=train_runs, scramble=True)
        
        inner_dist.extend(df_to_arr(inner_accs).tolist())
        outer_dist.extend(df_to_arr(outer_accs).tolist())
        
        outer_sample_means.append(np.mean(df_to_arr(outer_accs)))
        
        if history:
            y.append(np.mean(outer_sample_means))
            x.append(len(y))

            plt.plot(x, y)
            plt.xlabel('Run')
            plt.ylabel('Overall Mean Accuracy')
            plt.title('Overall Outer Subject Accuracy')
            plt.savefig(f"output/cp/perm_hist.png")
        

In [None]:
path = r'scans/output/PRE/'
roi = 1                            # V1-roi: 0, MT-roi: 1
conds = [1, 3]                     # trained_cp: 0, trained_ip: 1, untrained_cp: 2, untrained_ip: 3

gamma_range = {'start': -15, 'stop': 3, 'num': 19, 'base': 2.0}
C_range = {'start': -3, 'stop': 15, 'num': 19, 'base': 2.0}
kernels = ['rbf', 'sigmoid']

data_params = {'path': path, 'roi': roi, 'conds': conds}
grid_params = {'gamma': gamma_range, 'C': C_range, 'kernels': kernels}

inner_dist = []
outer_dist = []
permutation_within_subjects(data_params, grid_params, inner_dist, outer_dist, runs=10, train_runs=200)