# Training SVM

### To-do:
<ul>
    <li> Find and track optimal parameters for SVM </li>
    <li> Write training function with nested cross-validation AND scrambled labels </li>
    <li> Track accuracy report </li>
</ul>

In [2]:
from sklearn.svm import SVC
import scipy.io
import numpy as np
import os

In [12]:
subjects

['AT', 'CC', 'CG', 'GD', 'JM', 'JR', 'JS', 'NL', 'RK', 'SC', 'TP', 'YY']

In [6]:
'''
Trains the classifier, uses nested cross-validation.

x_data:           formatted array of all subject voxel data
y_data:           formatted array of labels corresponding to the x_data
num_of_subjects:  number of subjects in the data
kernel_type:      type of kernel to run SVC 
                  (options are 'linear', 'rbf', 'poly', 'sigmoid', 'precomputed')
'''
def train(x_data, y_data, subjects, kernel_type):
    
    # scramble labels
    
    for outer_subject in range(len(subjects)):

        # set aside outer subject

        for inner_subject in range(len(subjects)):

            if inner_subject == outer_subject:
                continue

            x_train, y_train, x_test, y_test = # get data    
                
            # train model
            # type of kernel will change
            svclassifier = SVC(kernel=kernel_type, gamma='auto', max_iter=-1)
            svclassifier.fit(x_train, y_train)
            
            # set aside testing subject for inner loop
            predict = svclassifier.predict(x_inner_test)
            acc = svclassifier.score(x_inner_test, y_inner_test)

        # test outer subject

SyntaxError: invalid syntax (<ipython-input-6-d81d3cd12370>, line 18)

In [7]:
'''
Randomly selects half of the labels in the data to switch to the other class. 

data:    dictionary containing the x and y data
classes: list of the two different classes of labels
'''
def scramble_labels(data, classes):
    
    y_data = data['y']
    for index in np.nditer(np.random.choice(len(y_data), size=len(y_data)//2, replace=False)):
        
        if y_data[index] == classes[0]:
            print("%i: %s -> %s" % (index, y_data[index], classes[1]))
            y_data[index] = classes[1]
        else:
            print("%i: %s -> %s" % (index, y_data[index], classes[0]))
            y_data[index] = classes[0]
            
    data['y'] = y_data

In [8]:
'''
Gets a list of subject IDs, given a path to the data files
Note: subject ID must be only 2 characters for this to work
'''
def get_subjects(path):
    
    files = os.listdir(path)
    subjects = [f[:2] for f in files]
    subjects.remove('ML')
    subjects.sort()
    
    return subjects

In [9]:
'''
Gets individuals subject data.

path:          directory to data files
subject:       specific subject data to load
suffix:        ending to filename
roi:           0 for MT data, 1 for V1 data
conds:         which conditional dataset to extract
block_length:  standardized number of voxels per block
'''
def extract_subject_data(path, subject, suffix, roi, conds, block_length):
    
    x_data = []
    y_data = []
    
    path_to_file = path + subject + suffix
    mat = scipy.io.loadmat(path_to_file)['roi_scanData'][0][roi]
        
    for scan in range(len(mat[0])):
            
        for cond in conds:
            
            for block in range(len(mat[0][scan][0][cond][0])):

                block_data = []
                for tr in range(len(mat[0][scan][0][cond][0][block][0])):
                    
                    # Extract all voxel data from individual TRs
                    block_data.extend(mat[0][scan][0][cond][0][block][0][tr][0][0][0].tolist())
                
                # Filters for most active voxels in each block
                block_data.sort()
                block_data = block_data[-block_length:]
                
                x_data.append(block_data)
                y_data.append(mat[0][scan][1][cond][0])
                
    data = {'x': x_data, 'y': y_data}
    return data

In [10]:
'''
Generates training and testing data
'''
def generate_data(subjects, inner_test_subject, outer_test_subject, path, suffix, block_length, roi, conds):
    
    x_train = []
    y_train = []
    x_test = []
    y_test = []
    
    for subject in range(len(subjects)):
        
        if subject == outer_test_subject or subject == inner_test_subject:
            continue
        subject_data = extract_subject_data(path, subjects[subject], suffix, roi, conds, block_length)
        x_train.extend(subject_data['x'])
        y_train.extend(subject_data['y'])
    
    test_data = extract_subject_data(path, subjects[inner_test_subject], suffix, roi, conds, block_length)
    x_test = test_data['x']
    y_test = test_data['y']
    
    x_train = np.stack(x_train, axis=0)
    y_train = np.stack(y_train, axis=0)
    
    return x_train, y_train, x_test, y_test

### Debugging

In [11]:
kernel_type = 'poly'
path = r'scans/output/PRE/'
suffix = "PRE_EVERY_TR_roi.mat"
roi = 1                            # V1-roi: 0, MT-roi: 1
conds = [1, 3]                     # trained_cp: 0, trained_ip: 1, untrained_cp: 2, untrained_ip: 3
subjects = get_subjects(path)
block_length = 624                 # minimum block length

inner_subject = 3
outer_subject = 0
x_train, y_train, x_test, y_test = generate_data(subjects, inner_subject, outer_subject, path, suffix, block_length, roi, conds)
svclassifier = SVC(kernel=kernel_type, gamma='auto', max_iter=-1)
svclassifier.fit(x_train, y_train)

svclassifier.score(x_test, y_test)


0.5

In [837]:
kernel_type = 'sigmoid'
path = r'scans/output/PRE/'
suffix = "PRE_EVERY_TR_roi.mat"
roi = 1                            # V1-roi: 0, MT-roi: 1
conds = [1, 3]                     # trained_cp: 0, trained_ip: 1, untrained_cp: 2, untrained_ip: 3
subjects = get_subjects(path)
block_length = 624                 # minimum block length

data = extract_subject_data(path, subjects[2], suffix, roi, conds, block_length)
#svclassifier = SVC(kernel=kernel_type, gamma='auto', max_iter=-1)
#svclassifier.fit(data['x'], data['y'])

In [734]:
test_data = extract_subject_data(path, subjects[1], suffix, roi, conds, block_length)
svclassifier.score(test_data['x'], test_data['y'])

0.625

In [735]:
classes = ['trained_ip', 'untrained_ip']

data = extract_subject_data(path, subjects[0], suffix, roi, conds, block_length)
scramble_labels(data, classes)

9: trained_ip -> untrained_ip
8: trained_ip -> untrained_ip
10: untrained_ip -> trained_ip
5: trained_ip -> untrained_ip
1: trained_ip -> untrained_ip
3: untrained_ip -> trained_ip


### Accuracy Report

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))