In [1]:
import os
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.io
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import ParameterGrid
from IPython.display import clear_output

In [4]:
import itertools

# SVM Training Functions

<a href="https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf">Guide to SVM Training</a>
### To-do:
<ul>
    <li> Refine grid search, increase accuracy of outer loop subjects </li>
    <li> Run scrambled label data (and fix the scramble method) </li>
    <li> Make data generation process more efficient/less redundant </li>
    <li> Shorten/simplify parameter list </li>
</ul>

In [2]:
def get_subjects(path, removeML = False):
    
    '''
    Gets a list of subject IDs and the file suffix, given a path to the data files. 
    
    Note: subject ID must be only 2 characters for this to work, and all data files
    must have same suffix.
    
    Parameters
    ----------
    path: str
        directory to the data files
    removeML: boolean, optional
        specifies whether subject ML should be skipped, default is False
        
    Returns
    -------
    list
        a list of subject IDs
    str
        the suffix to the filenames
    '''
    
    files = os.listdir(path)
    subjects = [f[:2] for f in files]
    suffix = files[0][2:]
    
    if removeML:
        subjects.remove('ML')
        
    subjects.sort()
    
    return subjects, suffix

In [26]:
cols = []
for combo in itertools.combinations(range(len(subjects)), 2):
    col = ''
    for subject in combo:
        col += '/' + subjects[subject]
    cols.append(col[1:])
        
cols

['AT/CC',
 'AT/CG',
 'AT/GD',
 'AT/JM',
 'AT/JR',
 'AT/JS',
 'AT/ML',
 'AT/NL',
 'AT/RK',
 'AT/SC',
 'AT/TP',
 'AT/YY',
 'CC/CG',
 'CC/GD',
 'CC/JM',
 'CC/JR',
 'CC/JS',
 'CC/ML',
 'CC/NL',
 'CC/RK',
 'CC/SC',
 'CC/TP',
 'CC/YY',
 'CG/GD',
 'CG/JM',
 'CG/JR',
 'CG/JS',
 'CG/ML',
 'CG/NL',
 'CG/RK',
 'CG/SC',
 'CG/TP',
 'CG/YY',
 'GD/JM',
 'GD/JR',
 'GD/JS',
 'GD/ML',
 'GD/NL',
 'GD/RK',
 'GD/SC',
 'GD/TP',
 'GD/YY',
 'JM/JR',
 'JM/JS',
 'JM/ML',
 'JM/NL',
 'JM/RK',
 'JM/SC',
 'JM/TP',
 'JM/YY',
 'JR/JS',
 'JR/ML',
 'JR/NL',
 'JR/RK',
 'JR/SC',
 'JR/TP',
 'JR/YY',
 'JS/ML',
 'JS/NL',
 'JS/RK',
 'JS/SC',
 'JS/TP',
 'JS/YY',
 'ML/NL',
 'ML/RK',
 'ML/SC',
 'ML/TP',
 'ML/YY',
 'NL/RK',
 'NL/SC',
 'NL/TP',
 'NL/YY',
 'RK/SC',
 'RK/TP',
 'RK/YY',
 'SC/TP',
 'SC/YY',
 'TP/YY']

In [24]:
[t for t in itertools.combinations(range(len(subjects)), 2)]

[(0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (0, 9),
 (0, 10),
 (0, 11),
 (0, 12),
 (1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (1, 6),
 (1, 7),
 (1, 8),
 (1, 9),
 (1, 10),
 (1, 11),
 (1, 12),
 (2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (2, 7),
 (2, 8),
 (2, 9),
 (2, 10),
 (2, 11),
 (2, 12),
 (3, 4),
 (3, 5),
 (3, 6),
 (3, 7),
 (3, 8),
 (3, 9),
 (3, 10),
 (3, 11),
 (3, 12),
 (4, 5),
 (4, 6),
 (4, 7),
 (4, 8),
 (4, 9),
 (4, 10),
 (4, 11),
 (4, 12),
 (5, 6),
 (5, 7),
 (5, 8),
 (5, 9),
 (5, 10),
 (5, 11),
 (5, 12),
 (6, 7),
 (6, 8),
 (6, 9),
 (6, 10),
 (6, 11),
 (6, 12),
 (7, 8),
 (7, 9),
 (7, 10),
 (7, 11),
 (7, 12),
 (8, 9),
 (8, 10),
 (8, 11),
 (8, 12),
 (9, 10),
 (9, 11),
 (9, 12),
 (10, 11),
 (10, 12),
 (11, 12)]

In [55]:
combo = [0,1,2]
for subject in range(len(subjects)):
    if subject in combo:
        print('hi')

hi
hi
hi


In [3]:
def scramble_labels(y_data, classes):
    
    '''
    Randomly selects half of the labels in the data to switch to the other class.
    
    Parameters
    ----------
    y_data: array-like
        label data to scramble
    classes: list
        the two different classes of labels
    '''
    
    for index in np.nditer(np.random.choice(len(y_data), size=len(y_data)//2, replace=False)):
        
        if y_data[index] == classes[0]:
            y_data[index] = classes[1]
        else:
            y_data[index] = classes[0]


In [39]:
def extract_subject_data(path, subject, suffix, roi, conds, block_length):
    
    '''
    Extracts individual subject data from the .mat files.
    
    Parameters
    ----------
    path: str
        directory to data files
    subject: str
        ID of subject to load data for
    suffix: str
        ending suffix of the data filename
    roi: int
        0 for V1 data, 1 for MT data
    conds: list
        list of integers specifying the conditional datasets to extract
        (0 for trained_cp, 1 for trained_ip, 2 for untrained_cp, 3 for untrained_ip)
    block_length: int
        the number of voxels to standardize every block in the dataset to
        
    Returns
    -------
    List of voxel data (x_data) separated by individual blocks and the corresponding labels (y_data)
    '''
    
    x_data = []
    y_data = []
    
    path_to_file = path + subject + suffix
    mat = scipy.io.loadmat(path_to_file)['roi_scanData'][0][roi]
        
    for scan in range(len(mat[0])):
            
        for cond in conds:
            
            for block in range(len(mat[0][scan][0][cond][0])):

                block_data = []
                for tr in range(len(mat[0][scan][0][cond][0][block][0])):
                    
                    # Extract all voxel data from individual TRs
                    block_data.extend(mat[0][scan][0][cond][0][block][0][tr][0][0][0].tolist())
                
                # Filters for most active voxels in each block
                block_data.sort()
                block_data = block_data[-block_length:]
                
                x_data.append(block_data)
                y_data.append(mat[0][scan][1][cond][0])
                
    data = {'x': x_data, 'y': y_data}
    return data

In [76]:
'''
Generates training and testing data.
'''
def generate_data(subjects, inner_test_subjects, outer_test_subject, path, suffix, roi, conds, block_length):
    
    '''
    Generates training and testing data, which is separated into training data, inside testing data,
    and outside testind data.
    
    Parameters
    ----------
    subjects: list
        a list of subject IDs to extract data from
    inner_test_subject: list
        list of indices of the inner test subjects
    outer_test_subject: int
        the index of the outer test subject
    path: str
        the path to the data files
    suffix: str
        ending suffix of the data filename
    roi: int
        0 for V1 data, 1 for MT data
    conds: list
        list of integers specifying the conditional datasets to extract
        (0 for trained_cp, 1 for trained_ip, 2 for untrained_cp, 3 for untrained_ip)    
    block_length: int
        the number of voxels to standardize every block in the dataset to
    
    Returns
    -------
    list
        blocks of voxel data for training use
    list
        training labels
    list
        inner test subject blocks of voxel data for testing use
    list 
        testing labels for inner test subject
    list
        outer test subject blocks of voxel data for testing use
    list
        testing labels for outer test subject
    '''
    
    data = []
    
    x_train = []
    y_train = []
    
    x_test_inner = []
    y_test_inner = []
    
    x_test_outer = []
    y_test_outer = []
    
    for subject in subjects:
        
        subject_data = extract_subject_data(path, subject, suffix, roi, conds, block_length)
        if subject == outer_test_subject:
            x_test_outer.extend(subject_data['x'])
            y_test_outer.extend(subject_data['y'])
        elif subject in inner_test_subjects:
            x_test_inner.extend(subject_data['x'])
            y_test_inner.extend(subject_data['y'])
        else:
            x_train.extend(subject_data['x'])
            y_train.extend(subject_data['y'])
    
    x_train_len = len(x_train)
    x_test_outer_len = len(x_test_outer)
    
    data.extend(x_train)
    data.extend(x_test_outer)
    data.extend(x_test_inner)
    
    # MinMaxScaler scales each feature to values between 0 and 1 among all x data
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_normalized = scaler.fit_transform(data)
    x_train, x_test_outer, x_test_inner = x_normalized[:x_train_len], x_normalized[x_train_len:x_train_len+x_test_outer_len], x_normalized[x_train_len+x_test_outer_len:]

    y_train = np.stack(y_train, axis=0)
    
    return x_train, y_train, x_test_inner, y_test_inner, x_test_outer, y_test_outer

In [28]:
def get_optimal_run(x_train, y_train, x_test, y_test, kernels, gamma_range, C_range):
    
    '''
    Gets best hyperparameters (kernel, C, and gamma values) that optimize SVM's predictions for given
    x and y test dataset.
    
    Parameters
    ----------
    x_train: array-like
        dataset of block data used to train classifier
    y_train: array-like
        dataset of label data used to train classifier
    x_test: array-like
        testing dataset of block data used to optimize hyperparameters on
    y_test: array-like
        testing dataset of label data used to optimize hyperparameters on
    kernels: list
        kernels to test (recommended options are 'linear', 'rbf', and 'sigmoid')
    gamma_range: dict
        dict that specifies the range of values of gamma to test; should include start, stop to range,
        num of values, and the exponential base
    C_range: dict
        dict that specifies the range of values of C to test; should include start, stop to range,
        num of values, and the exponential base
        
    Returns
    -------
    dict
        best combination of parameters found from grid search
    float
        best accuracy obtained from testing
    '''
    
    gamma_vals = np.logspace(gamma_range['start'], gamma_range['stop'], gamma_range['num'], base=gamma_range['base'])
    C_vals = np.logspace(C_range['start'], C_range['stop'], C_range['num'], base=C_range['base'])

    param_grid = ParameterGrid({'kernel': kernels, 'gamma': gamma_vals, 'C': C_vals})
    
    best_acc = 0
    best_params = None
    for params in list(param_grid):
        
        svclassifier = SVC(kernel=params['kernel'], gamma=params['gamma'], C=params['C'], max_iter=-1)
        svclassifier.fit(x_train, y_train)
        
        curr_acc = svclassifier.score(x_test, y_test)
        
        if curr_acc > best_acc:
            best_acc = curr_acc
            best_params = params
            
    return best_params, best_acc

In [79]:
def train_with_all(path, roi, conds, block_length, kernels, gamma_range, C_range, scramble=False):
    
    '''
    Trains and tests the classifier for accuracy.
    
    Parameters
    ----------
    path: str
        the path to the data files
    roi: int
        0 for V1 data, 1 for MT data
    conds: list
        list of integers specifying the conditional datasets to extract
        (0 for trained_cp, 1 for trained_ip, 2 for untrained_cp, 3 for untrained_ip)    
    block_length: int
        the number of voxels to standardize every block in the dataset to
    kernels: list
        kernels to test (recommended options are 'linear', 'rbf', and 'sigmoid')
    gamma_range: dict
        dict that specifies the range of values of gamma to test; should include start, stop to range,
        num of values, and the exponential base
    C_range: dict
        dict that specifies the range of values of C to test; should include start, stop to range,
        num of values, and the exponential base
    scramble: boolean, optional
        whether or not to scramble the labels when training, default is False
        
    Returns
    -------
    DataFrame
        data of inner subject testing accuracy
    DataFrame
        data of outer subject testing accuracy
    '''
    
    subjects, suffix = get_subjects(path)
    
    inner_acc_report = pd.DataFrame(index=subjects, columns=subjects)
    outer_acc_report = pd.DataFrame(index=subjects, columns=subjects)
    
    for outer_subject in range(len(subjects)):
        
        print("Currently on outer subject #%i." % (outer_subject+1))

        start_time = time.time()
        
        for inner_subject in range(len(subjects)):

            if inner_subject == outer_subject:
                continue

            print("Currently on inner subject #%i." % (inner_subject+1))    
            x_train, y_train, x_test_inner, y_test_inner, x_test_outer, y_test_outer = generate_data(subjects, inner_subject, outer_subject, path, suffix, roi, conds, block_length)
            
            x_whole = np.vstack((x_train, x_test_inner, x_test_outer))
            y_whole = np.concatenate((y_train, y_test_inner, y_test_outer))
            
            if scramble:
                scramble_labels(y_train, classes)
                
            # gets optimal params for training dataset from grid search
            params, inner_acc = get_optimal_run(x_whole, y_whole, x_test_inner, y_test_inner, kernels, gamma_range, C_range) 
            print('Found best params for current inner subject.')
            
            # train model using optimal params for this set
            svclassifier = SVC(kernel=params['kernel'], gamma=params['gamma'], C=params['C'], max_iter=-1)
            svclassifier.fit(x_whole, y_whole)
            
            print('Testing outer subject...')
            outer_acc = svclassifier.score(x_test_outer, y_test_outer)
            
            # logs inner and outer subject accuracy data in dataframe
            index = subjects[outer_subject]
            col = subjects[inner_subject]
            
            inner_acc_report.at[index, col] = inner_acc
            outer_acc_report.at[index, col] = outer_acc

        clear_output()
        
        end_time = time.time()
        exec_time = end_time - start_time
        minutes = exec_time // 60
        seconds = exec_time % 60
        print('Last turn took %i minutes and %f seconds.' % (minutes, seconds))
    
    clear_output()
    return inner_acc_report, outer_acc_report

In [81]:
def train(path, roi, conds, block_length, kernels, gamma_range, C_range, num_inner=1, scramble=False):
    
    '''
    Trains and tests the classifier for accuracy.
    
    Parameters
    ----------
    path: str
        the path to the data files
    roi: int
        0 for V1 data, 1 for MT data
    conds: list
        list of integers specifying the conditional datasets to extract
        (0 for trained_cp, 1 for trained_ip, 2 for untrained_cp, 3 for untrained_ip)    
    block_length: int
        the number of voxels to standardize every block in the dataset to
    kernels: list
        kernels to test (recommended options are 'linear', 'rbf', and 'sigmoid')
    gamma_range: dict
        dict that specifies the range of values of gamma to test; should include start, stop to range,
        num of values, and the exponential base
    C_range: dict
        dict that specifies the range of values of C to test; should include start, stop to range,
        num of values, and the exponential base
    scramble: boolean, optional
        whether or not to scramble the labels when training, default is False
        
    Returns
    -------
    DataFrame
        data of inner subject testing accuracy
    DataFrame
        data of outer subject testing accuracy
    '''
    
    subjects, suffix = get_subjects(path)
    
    cols = []
    for combo in itertools.combinations(range(len(subjects)), num_inner):
        col = ''
        for subject in combo:
            col += '/' + subjects[subject]
        cols.append(col[1:])

    inner_acc_report = pd.DataFrame(index=subjects, columns=cols)
    outer_acc_report = pd.DataFrame(index=subjects, columns=cols)
    
    for outer_subject in subjects:
        
        print("Currently on outer subject #%i." % (subjects.index(outer_subject)+1))

        start_time = time.time()
        
        inner_subjects = [s for s in subjects if s != outer_subject]
        for inner_subject_test in itertools.combinations((inner_subjects), num_inner):
            
            inner_subject_test = list(inner_subject_test)

            col = ''
            for subject in inner_subject_test:
                col += '/' + subject
            col = col[1:]
            print("Currently on combination of %s." % (col))    
            
            x_train, y_train, x_test_inner, y_test_inner, x_test_outer, y_test_outer = generate_data(subjects, inner_subject_test, outer_subject, path, suffix, roi, conds, block_length)
            print('Training size: %i \t Inner testing size: %i \t Outer testing size: %i' % (len(x_train), len(x_test_inner), len(x_test_outer)))
            
            if scramble:
                scramble_labels(y_train, classes)
                
            # gets optimal params for training dataset from grid search
            params, inner_acc = get_optimal_run(x_train, y_train, x_test_inner, y_test_inner, kernels, gamma_range, C_range) 
            print('Found best params for current inner subject.')
            
            # train model using optimal params for this set
            svclassifier = SVC(kernel=params['kernel'], gamma=params['gamma'], C=params['C'], max_iter=-1)
            svclassifier.fit(x_train, y_train)
            
            print('Testing outer subject...')
            outer_acc = svclassifier.score(x_test_outer, y_test_outer)
            
            # logs inner and outer subject accuracy data in dataframe
            index = subjects[outer_subject]
            
            inner_acc_report.at[index, col] = inner_acc
            outer_acc_report.at[index, col] = outer_acc

        clear_output()
        
        end_time = time.time()
        exec_time = end_time - start_time
        minutes = exec_time // 60
        seconds = exec_time % 60
        print('Last turn took %i minutes and %f seconds.' % (minutes, seconds))
    
    clear_output()
    return inner_acc_report, outer_acc_report

# Training Classifier/Visualizing Accuracies

In [82]:
path = r'scans/output/PRE/'
roi = 1                            # V1-roi: 0, MT-roi: 1
conds = [1, 3]                     # trained_cp: 0, trained_ip: 1, untrained_cp: 2, untrained_ip: 3
block_length = 624

gamma_range = {'start': -13, 'stop': 1, 'num': 16, 'base': 2.0}
C_range = {'start': -3, 'stop': 11, 'num': 16, 'base': 2.0}
kernels = ['rbf', 'sigmoid']

inner_accs, outer_accs = train(path, roi, conds, block_length, kernels, gamma_range, C_range, num_inner=2)

Currently on outer subject #1.
Currently on combination of CC/CG.
Training size: 128 	 Inner testing size: 28 	 Outer testing size: 12
Found best params for current inner subject.
Testing outer subject...


TypeError: list indices must be integers or slices, not str

In [13]:
path = r'scans/output/PRE/'
roi = 1                            # V1-roi: 0, MT-roi: 1
conds = [1, 3]                     # trained_cp: 0, trained_ip: 1, untrained_cp: 2, untrained_ip: 3
block_length = 624

gamma_range = {'start': -13, 'stop': 1, 'num': 32, 'base': 2.0}
C_range = {'start': -3, 'stop': 11, 'num': 32, 'base': 2.0}
kernels = ['rbf', 'sigmoid']

inner_accs, outer_accs = train_with_all(path, roi, conds, block_length, kernels, gamma_range, C_range)

In [16]:
inner_accs.to_csv('output/baseline_inner_accs', sep='\t')
outer_accs.to_csv('output/baseline_outer_accs', sep='\t')