# Using a Deep Neural Network

DNN Guides:
<br><a href="https://www.dlology.com/blog/quick-notes-on-how-to-choose-optimizer-in-keras/">Optimizers</a>
<br><a href="https://towardsdatascience.com/a-guide-to-an-efficient-way-to-build-neural-network-architectures-part-i-hyper-parameter-8129009f131b">DNN Layers</a>

In [15]:
import os
import copy
import time
import math
import random
import itertools
import scipy.io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output

import tensorflow as tf
from tensorflow.keras import Sequential, layers, optimizers
from sklearn.preprocessing import MinMaxScaler

In [17]:
def labels_to_int(labels, classes=None):

    '''
    Encodes binary class labels as integers.

    The classes are ordered by sorting, so the same label always maps to the same
    integer no matter which label set (training, inner test, outer test) is encoded
    or in which order its labels appear.

    Parameters
    ----------
    labels: array-like
        class labels to encode
    classes: sequence, optional
        the two class labels, in the order that defines the encoding,
        default is the sorted unique values found in labels

    Returns
    -------
    ndarray
        0 where the label equals classes[1], 1 otherwise

    Raises
    ------
    ValueError
        if labels is non-empty and there are not exactly two classes
    '''

    labels = list(labels)

    # set() iteration order is arbitrary, so sort to get a reproducible mapping
    classes = sorted(set(labels)) if classes is None else list(classes)

    if labels and len(classes) != 2:
        raise ValueError(f"labels_to_int expects exactly 2 classes, got {len(classes)}: {classes}")

    return np.array([0 if label == classes[1] else 1 for label in labels])

In [2]:
def get_subjects(path):
    
    '''
    Gets a list of subject IDs and the file suffix, given a path to the data files. 
    
    Hidden entries (names starting with '.', e.g. .DS_Store or .ipynb_checkpoints)
    and sub-directories are ignored.
    
    Note: subject ID must be only 2 characters for this to work, and all data files
    must have same suffix.
    
    Parameters
    ----------
    path: str
        directory to the data files
        
    Returns
    -------
    list
        a sorted list of subject IDs (first 2 characters of each filename)
    str
        the suffix to the filenames (taken from the alphabetically first file)
        
    Raises
    ------
    FileNotFoundError
        if the directory contains no data files
    '''
    
    # Sorting the filenames also sorts the 2-character prefixes and makes the
    # file used for the suffix deterministic (os.listdir order is arbitrary).
    files = sorted(
        f for f in os.listdir(path)
        if not f.startswith('.') and os.path.isfile(os.path.join(path, f))
    )
    
    if not files:
        raise FileNotFoundError(f"No data files found in {path!r}")
    
    subjects = [f[:2] for f in files]
    suffix = files[0][2:]
    
    return subjects, suffix

In [3]:
def scramble_labels(y_data):
    
    '''
    Randomly selects half of the labels in the data to switch to the other class.
    
    The labels are modified in place and nothing is returned. Inputs with fewer
    than two labels are left untouched, since half of them (rounded down) is zero.
    
    Parameters
    ----------
    y_data: array-like
        label data to scramble (mutable sequence, e.g. list or ndarray)
        
    Raises
    ------
    ValueError
        if the data contains fewer than two distinct classes, or if the number
        of changed labels does not equal half the data length
    '''
    
    num_to_flip = len(y_data) // 2
    if num_to_flip == 0:
        # Nothing to scramble; also avoids iterating over an empty index array
        return
    
    classes = sorted(set(y_data))
    if len(classes) < 2:
        raise ValueError("scramble_labels needs at least two distinct classes to switch between")
    
    y_data_copy = list(y_data)
    flip_indices = np.random.choice(len(y_data), size=num_to_flip, replace=False)
    
    # tolist() yields plain Python ints, which index lists and arrays alike
    for index in flip_indices.tolist():
        y_data[index] = classes[1] if y_data[index] == classes[0] else classes[0]
    
    # Makes sure labels are scrambled properly
    num_diff = sum(i != j for i, j in zip(y_data, y_data_copy))
    if num_diff != num_to_flip:
        raise ValueError(f"Expected {num_to_flip} scrambled labels but {num_diff} changed")
    

In [4]:
def get_min_max_block_length(path, subjects, suffix, roi, conds):
    
    '''
    Gets the minimum and maximum lengths of the blocks in the data.
    
    The length of a block is the total number of voxel values over all of its TRs.
    
    Parameters
    ----------
    path: str
        directory to data files
    subjects: list
        IDs of the subjects to load data for
    suffix: str
        ending suffix of the data filename
    roi: int
        0 for V1 data, 1 for MT data
    conds: list
        list of integers specifying the conditional datasets to extract
        
    Returns
    -------
    int
        minimum block length
    int
        maximum block length
        
    Raises
    ------
    ValueError
        if no blocks were found for the given subjects and conditions
    '''
    
    min_bl, max_bl = math.inf, 0
    found_block = False
    
    for subject in subjects:
        
        # os.path.join works whether or not path ends with a separator
        path_to_file = os.path.join(path, subject + suffix)
        mat = scipy.io.loadmat(path_to_file)['roi_scanData'][0][roi]

        for scan in mat[0]:
            for cond in conds:
                for block in scan[0][cond][0]:
                    
                    # Only the count is needed, so sum the TR sizes instead of
                    # copying every voxel value into a list
                    block_len = sum(len(tr[0][0][0]) for tr in block[0])
                    
                    min_bl = min(min_bl, block_len)
                    max_bl = max(max_bl, block_len)
                    found_block = True
    
    if not found_block:
        raise ValueError("No blocks found for the given subjects and conditions")
                    
    print(f"Min block length: {min_bl}")
    print(f"Max block length: {max_bl}")

    return min_bl, max_bl

In [5]:
def _read_subject_blocks(mat, conds):
    
    '''
    Flattens every block of the requested conditions into a list of voxel values.
    
    Returns the blocks and their labels as two parallel lists, in scan -> cond -> block order.
    '''
    
    blocks = []
    labels = []
    
    for scan in mat[0]:
        for cond in conds:
            label = scan[1][cond][0]
            for block in scan[0][cond][0]:
                voxels = []
                for tr in block[0]:
                    # Extract all voxel data from individual TRs
                    voxels.extend(tr[0][0][0].tolist())
                blocks.append(voxels)
                labels.append(label)
    
    return blocks, labels


def extract_subject_data(path, subject, suffix, roi, conds, block_length, rank_first):
    
    '''
    Extracts individual subject data from the .mat files.
    
    Parameters
    ----------
    path: str
        directory to data files
    subject: str
        ID of subject to load data for
    suffix: str
        ending suffix of the data filename
    roi: int
        0 for V1 data, 1 for MT data
    conds: list
        list of integers specifying the conditional datasets to extract
        (0 for trained_cp, 1 for trained_ip, 2 for untrained_cp, 3 for untrained_ip)
    block_length: int
        the number of voxels to standardize every block in the dataset to
    rank_first: boolean
        if True, the subject's shortest block (the first one on ties) is used to pick
        and order the voxel positions applied to all of that subject's blocks;
        if False, each block independently keeps its block_length largest values,
        in ascending order
        
    Returns
    -------
    dict
        'x': list of voxel data separated by individual blocks,
        'y': list of the corresponding labels
    '''
    
    # os.path.join works whether or not path ends with a separator
    path_to_file = os.path.join(path, subject + suffix)
    mat = scipy.io.loadmat(path_to_file)['roi_scanData'][0][roi]
    
    # Read the file structure once; both strategies below work on these lists
    blocks, y_data = _read_subject_blocks(mat, conds)
    
    if rank_first:
        # Find the shortest block and rank its voxel positions from most to least
        # active; its indices are valid for every other (longer) block.
        ranked_indices = None
        shortest_block_length = math.inf
        for voxels in blocks:
            if len(voxels) < shortest_block_length:
                shortest_block_length = len(voxels)
                ranked_indices = np.flip(np.array(voxels).argsort()[-block_length:])
        
        x_data = [[voxels[i] for i in ranked_indices] for voxels in blocks]
    else:
        # Filters for most active voxels in each block
        x_data = [sorted(voxels)[-block_length:] for voxels in blocks]
    
    data = {'x': x_data, 'y': y_data}
    return data

In [6]:
def generate_dataset(subjects, path, suffix, roi, conds, block_length, rank_first):
    
    '''
    Builds the full dataset for a list of subjects and returns it keyed by subject.
    
    Every subject's blocks are extracted, all blocks are scaled together with a
    MinMaxScaler, and the scaled rows are then handed back to their subject.
    
    Parameters
    ----------
    subjects: list
        a list of subject IDs to extract data from
    path: str
        the path to the data files
    suffix: str
        ending suffix of the data filename
    roi: int
        0 for V1 data, 1 for MT data
    conds: list
        list of integers specifying the conditional datasets to extract
        (0 for trained_cp, 1 for trained_ip, 2 for untrained_cp, 3 for untrained_ip)    
    block_length: int
        the number of voxels to standardize every block in the dataset to
    rank_first: boolean
        forwarded to extract_subject_data to select its rank-ordering strategy
    
    Returns
    -------
    dict
        scaled voxel data with subject key
    dict
        label data with subject key
    '''
    
    all_blocks = []
    row_bounds = []                 # (first_row, last_row_exclusive) per subject
    labels_by_subject = {}
    
    for subject_id in subjects:
        extracted = extract_subject_data(path, subject_id, suffix, roi, conds, block_length, rank_first)
        
        first_row = len(all_blocks)
        all_blocks.extend(extracted['x'])
        row_bounds.append((first_row, len(all_blocks)))
        
        labels_by_subject[subject_id] = extracted['y']
    
    # Each feature (column) is rescaled to [0, 1] using the rows of ALL subjects.
    # NOTE(review): the scaler therefore also sees subjects later used for testing.
    scaled_blocks = MinMaxScaler(feature_range=(0, 1)).fit_transform(all_blocks)
    
    # Hand each subject back its own slice of the scaled rows
    blocks_by_subject = {
        subject_id: scaled_blocks[first_row:last_row]
        for subject_id, (first_row, last_row) in zip(subjects, row_bounds)
    }
    
    return blocks_by_subject, labels_by_subject

In [7]:
def split_dataset(x_data, y_data, inner_subjects, outer_subject, scramble=False):
    
    '''
    Splits voxel and label data into appropriate testing and training data for nested
    cross-validation.
    
    A subject goes to the outer test set if it is the outer subject, otherwise to the
    inner test set if it is listed in inner_subjects, otherwise to the training set.
    
    Parameters
    ----------
    x_data: dict
        voxel data with subject key
    y_data: dict
        label data with subject key
    inner_subjects: list
        list of subject IDs of the inner test subjects
    outer_subject: str
        the ID of the outer test subject
    scramble: boolean, optional
        whether or not to scramble the labels when training
        (only training labels are scrambled, on a copy), 
        default is False
    
    Returns
    -------
    list
        blocks of voxel data for training use
    list
        training labels for respective blocks
    list
        blocks of voxel data from inner test subject(s) for testing use
    list 
        labels for inner test subject(s)
    list
        blocks of voxel data from outer test subject for testing use
    list
        labels for outer test subject    
    '''
    
    x_train = []
    y_train = []
    
    x_test_inner = []
    y_test_inner = []
    
    x_test_outer = []
    y_test_outer = []
    
    for subject in x_data.keys():
        if subject == outer_subject:
            x_test_outer.extend(x_data[subject])
            y_test_outer.extend(y_data[subject])
        elif subject in inner_subjects:
            x_test_inner.extend(x_data[subject])
            y_test_inner.extend(y_data[subject])
        else:
            x_train.extend(x_data[subject])
            if scramble:
                # Scramble a copy so the caller's label data stays intact
                y_scrambled = y_data[subject].copy()
                scramble_labels(y_scrambled)
                y_train.extend(y_scrambled)
            else:
                y_train.extend(y_data[subject])
            
    return x_train, y_train, x_test_inner, y_test_inner, x_test_outer, y_test_outer

In [18]:
def trainNN(data_params, epochs=20, layer_size=256, num_inner=1, scramble=False, rank_first=True, shuffle=False):
    
    '''
    Trains and tests the classifier for accuracy using NNs.
    
    For every outer subject, and every combination of num_inner inner test subjects
    drawn from the remaining subjects, a fresh one-hidden-layer network is trained on
    the rest of the subjects and evaluated on the inner and outer test subjects.
    
    Parameters
    ----------
    data_params: dict
        path: str
            the path to the data files
        roi: int
            0 for V1 data, 1 for MT data
        conds: list
            list of integers specifying the conditional datasets to extract
            (0 for trained_cp, 1 for trained_ip, 2 for untrained_cp, 3 for untrained_ip)
    epochs: int
        number of iterations to train model on
    layer_size: int
        size of hidden layer
    num_inner: int
        number of inner subjects to test classifier on,
        default is 1
    scramble: boolean, optional
        whether or not to scramble the labels when training, 
        default is False
    rank_first: boolean
        passed on to the dataset generation to select the rank-ordering strategy,
        default is True
    shuffle: boolean
        intended to randomize which block is used in rank-ordering;
        currently not used by this function, 
        default is False
        
    Returns
    -------
    DataFrame
        outer subject testing accuracy, indexed by outer subject with one column per
        inner subject combination
    DataFrame
        inner subject combination testing accuracy, same layout
        
    Note: cells whose column combination contains the row's outer subject are never
    trained and stay NaN.
    '''
    
    path, roi, conds = data_params['path'], data_params['roi'], data_params['conds']
    subjects, suffix = get_subjects(path)
    
    # One column per combination of inner subjects, labelled like 'AA/BB'
    cols = ['/'.join(subjects[i] for i in combo)
            for combo in itertools.combinations(range(len(subjects)), num_inner)]

    outer_acc_report = pd.DataFrame(index=subjects, columns=cols)
    val_acc_report = pd.DataFrame(index=subjects, columns=cols)
    
    # Every block is standardized to the length of the shortest block in the data
    block_length, _ = get_min_max_block_length(path, subjects, suffix, roi, conds)
    x_data, y_data = generate_dataset(subjects, path, suffix, roi, conds, block_length, rank_first)
    
    for outer_number, outer_subject in enumerate(subjects, start=1):
        
        print(f"Currently on outer subject #{outer_number}.")

        start_time = time.time()
        
        inner_subjects = [s for s in subjects if s != outer_subject]
        for inner_test_subjects in itertools.combinations(inner_subjects, num_inner):
            
            inner_test_subjects = list(inner_test_subjects)

            col = '/'.join(inner_test_subjects)
            print(f"Currently on combination of {col}.")    
            
            x_train, y_train, x_test_inner, y_test_inner, x_test_outer, y_test_outer = split_dataset(x_data, y_data, inner_test_subjects, outer_subject, scramble)
            
            y_train = labels_to_int(y_train)
            y_test_inner = labels_to_int(y_test_inner)
            y_test_outer = labels_to_int(y_test_outer)
            
            x_train = np.array(x_train)
            x_test_inner = np.array(x_test_inner)
            x_test_outer = np.array(x_test_outer)
            
            # A new model is built for every combination; drop the previous model's
            # global Keras state so memory use does not grow over the whole run.
            tf.keras.backend.clear_session()
            
            model = Sequential([
                    layers.Dense(layer_size, input_shape=(block_length,), activation="relu"),
                    layers.Dense(1, activation="sigmoid")
            ])

            optimizer = optimizers.Adam(learning_rate=0.001)
            model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])

            model.fit(x_train, y_train, epochs=epochs, validation_data=(x_test_inner, y_test_inner), verbose=0)
            outer_loss, outer_acc = model.evaluate(x_test_outer, y_test_outer, verbose=0)
            val_loss, val_acc = model.evaluate(x_test_inner, y_test_inner, verbose=0)
                
            # logs inner and outer subject accuracy data in dataframe
            outer_acc_report.at[outer_subject, col] = outer_acc
            val_acc_report.at[outer_subject, col] = val_acc

        # Wipe the per-combination progress lines before reporting the timing
        clear_output()
        
        exec_time = time.time() - start_time
        minutes, seconds = divmod(exec_time, 60)
        print(f"Last turn took {minutes} minutes and {seconds} seconds.")
    
    clear_output()
    return outer_acc_report, val_acc_report


### Test Runs of the Neural Network

In [341]:
# Single training run: report the mean outer and validation accuracy.
path = r'scans/output/PRE/'
roi = 1                            # V1-roi: 0, MT-roi: 1
conds = [1, 3]                     # trained_cp: 0, trained_ip: 1, untrained_cp: 2, untrained_ip: 3
block_length = 624                 # not passed to trainNN, which derives the block length from the data

# Built from the settings above so the two cannot drift apart
data_params = {'path': path, 'roi': roi, 'conds': conds}

outer_accs, val_accs = trainNN(data_params, layer_size=256, scramble=False)

# NOTE(review): df_to_arr is not defined in the visible cells of this notebook;
# it has to be defined (or imported) before this cell is run.
print(f"Outer accuracy mean: {np.mean(df_to_arr(outer_accs))}")
print(f"Validation accuracy mean: {np.mean(df_to_arr(val_accs))}")

Outer accuracy mean: 0.5587606430053711
Validation accuracy mean: 0.558226466178894


In [None]:
# Repeated training runs with true and scrambled labels; the outer accuracies
# collected so far are saved after every run so an interrupted run keeps its results.
path = r'scans/output/PRE/'
roi = 1                            # V1-roi: 0, MT-roi: 1
conds = [1, 3]                     # trained_cp: 0, trained_ip: 1, untrained_cp: 2, untrained_ip: 3
block_length = 624                 # not passed to trainNN, which derives the block length from the data

# Built from the settings above so the two cannot drift apart
data_params = {'path': path, 'roi': roi, 'conds': conds}

num_runs = 20

# np.save does not create missing directories
os.makedirs('output', exist_ok=True)

outer_accs_unscrambled = []
outer_accs_scrambled = []

# NOTE(review): df_to_arr is not defined in the visible cells of this notebook;
# it has to be defined (or imported) before this cell is run.
for scramble, collected_accs, out_file in [
    (False, outer_accs_unscrambled, 'output/nn_outer.npy'),
    (True, outer_accs_scrambled, 'output/nn_outer_s.npy'),
]:
    for _ in range(num_runs):
        outer_acc_report, val_accs = trainNN(data_params, scramble=scramble)
        collected_accs.extend(df_to_arr(outer_acc_report))
        np.save(out_file, collected_accs)

Last turn took 0 minutes and 26.890951 seconds.
Currently on outer subject #9.
