In [1]:
#Run cell to mount Google Drive
#The repo code and the data/results folders referenced in later cells are all
#read from paths under /content/drive/MyDrive, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#import necessary packages

#our workhorses
import numpy as np
import pandas as pd
import scipy

#to visualize
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#style params for figures
sns.set(font_scale = 2)
#matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
#releases dropped the old names, so use whichever name this install provides
white_style = 'seaborn-white' if 'seaborn-white' in plt.style.available else 'seaborn-v0_8-white'
plt.style.use(white_style)
plt.rc("axes", labelweight="bold")
from IPython.display import display, HTML

#to load files
import os
import sys
import h5py

#append repo folder to search path
sys.path.append('/content/drive/MyDrive/limb-position-EMG-Repo/')
from utils import *

from sklearn.model_selection import KFold

In [3]:
def get_log_reg_model(input_shape, n_outputs, n_dense_pre = 0, drop_prob = 0.5, activation = 'tanh'):
    """
    Build a softmax classifier, optionally preceded by dense + dropout layers.

    With the default n_dense_pre = 0 the network is a single softmax layer,
    i.e. multinomial logistic regression on the input features.

    Args:
        input_shape: tuple with the shape of one input sample, e.g. (n_features,)
        n_outputs: number of output classes
        n_dense_pre: number of dense + dropout blocks placed before the output layer
        drop_prob: dropout rate applied after each of those dense layers
        activation: activation function of those dense layers

    Returns:
        model: uncompiled model mapping an input sample to class probabilities
    """

    #define model architecture
    X_input = Input(shape = input_shape)
    X = X_input
    for _ in range(n_dense_pre):
        #hidden layers keep the width of the feature axis. Index from the end so that
        #1-D shapes such as (n_features,) work (input_shape[1] raised IndexError for them)
        X = Dense(input_shape[-1],activation = activation)(X)
        X = Dropout(drop_prob)(X)
    X = Dense(n_outputs,activation = 'softmax')(X)
    model = Model(inputs = X_input, outputs = X)
    return model

def get_log_reg_f1(X, Y, model, average = 'weighted', mask_value = -100):
    """
    Compute the F1 score of a classifier's predictions against one-hot targets,
    skipping samples flagged with the mask value.

    Args:
        X: numpy array of model inputs, first axis indexes samples
        Y: numpy array of one-hot targets with classes along axis 1. A sample whose
            first class entry equals mask_value is left out of the score
        model: fitted model whose predict returns per-class scores with classes along axis 1
        average: averaging mode handed to f1_score. Usually 'macro' or 'weighted'
        mask_value: value marking samples to ignore

    Returns:
        f1: f1 score over the non-masked samples
    """
    # Positions that are not flagged with the mask value
    keep = np.where(Y[:,0].flatten() != mask_value)[0]
    # Integer class of each kept target (position of the largest entry along axis 1)
    true_classes = np.argmax(Y, 1).flatten()[keep]
    # Integer class predicted by the model for the same positions
    class_scores = model.predict(X)
    pred_classes = np.argmax(class_scores, 1).flatten()[keep]

    return f1_score(true_classes, pred_classes, average = average)

def prepare_data_for_log_reg(X,Y, select_idxs, exclude_labels, train = False,scaler = None):
    """
    Select, standardize and one-hot encode samples for the classifier.

    Args:
        X: numpy array of features with shape [samples, features]
        Y: numpy array of integer class labels, one per sample
        select_idxs: indices of the samples to use
        exclude_labels: labels whose samples are dropped (after scaling)
        train: if True a new StandardScaler is fit on the selected samples,
            otherwise the provided scaler is applied
        scaler: fitted scaler; required when train is False

    Returns:
        X_cube: standardized features of the retained samples
        Y_cube: one-hot labels of the retained samples
        scaler: the scaler that was applied (newly fit when train is True)

    Raises:
        ValueError: if train is False and no scaler is given
    """
    #fail early with a clear message instead of an AttributeError on None.transform
    if not train and scaler is None:
        raise ValueError('a fitted scaler must be provided when train is False')

    X_cube =  X[select_idxs,:]
    Y_cube = Y[select_idxs]

    if train:
        #statistics are estimated on every selected sample, including those excluded below
        scaler = StandardScaler().fit(X_cube)
    X_cube = scaler.transform(X_cube)

    include_idxs = np.where(np.isin(Y_cube,exclude_labels, invert = True))[0]

    X_cube = X_cube[include_idxs,:]
    Y_cube = Y_cube[include_idxs]
    #labels are shifted so the smallest retained label becomes class 0; the column <-> label
    #mapping therefore depends on which labels are present in this particular subset
    Y_cube = to_categorical(Y_cube-np.min(Y_cube))

    return X_cube, Y_cube, scaler
def shift_array(arr, num, fill_value=np.nan):
    """
    Return a copy of arr shifted along its first axis, padding the vacated slots.

    Args:
        arr: numpy array to shift
        num: number of positions to shift. Positive moves values towards higher
            indices, negative towards lower indices, 0 returns a plain copy
        fill_value: value written into the positions that have no source element

    Returns:
        shifted: array with the same shape and dtype as arr
    """
    shifted = np.empty_like(arr)
    if num == 0:
        shifted[:] = arr
        return shifted
    if num > 0:
        #values move towards the end; the first num slots are padding
        shifted[num:] = arr[:-num]
        shifted[:num] = fill_value
    else:
        #values move towards the start; the last |num| slots are padding
        shifted[:num] = arr[-num:]
        shifted[num:] = fill_value
    return shifted

def get_mv_preds(X, model, n_votes):
    """
    Get class predictions smoothed by a majority voting scheme.

    Each output is the most frequent predicted class among the prediction for that
    sample and the predictions for the n_votes samples before it (samples near the
    start vote over whatever history exists). Assumes the rows of X are in
    temporal order - TODO confirm against the caller's data layout.

    Args:
        X: numpy array of model inputs, one row per sample
        model: fitted model whose predict returns per-class scores with shape [samples, classes]
        n_votes: number of preceding predictions that vote alongside the current one.
            0 returns the unsmoothed predictions

    Returns:
        y_pred_mv: 1D float array with the majority-voted class index of every sample
    """
    #import the submodule explicitly: a bare `import scipy` does not guarantee scipy.stats is loaded
    from scipy import stats

    y_prob = np.squeeze(model.predict(X))
    y_pred = np.argmax(y_prob,1).astype('float')

    #row k holds the predictions delayed by k samples; NaN marks slots without history
    votes = [y_pred]
    for _ in range(n_votes):
        votes.append(shift_array(votes[-1],1))
    #always 2D (also for n_votes = 0) so that the mode is taken per sample, never over all samples
    y_stack = np.vstack(votes)

    y_pred_mv = stats.mode(y_stack,0,nan_policy='omit').mode
    #depending on the scipy version the result is a masked array or a plain ndarray;
    #np.ma.getdata returns the underlying values in both cases
    y_pred_mv = np.squeeze(np.ma.getdata(y_pred_mv))

    return y_pred_mv

def _mv_weighted_f1(X, Y, idxs, exclude, scaler, model, n_votes = 5):
    """Weighted F1 of majority-voted predictions for the samples in idxs, ignoring excluded labels."""
    #no label is dropped before predicting, so the vote runs over the full selected sequence
    X_cube, Y_cube, _ = prepare_data_for_log_reg(X,Y, idxs, [], train = False, scaler = scaler)
    #+1 converts the model's class index to a label value
    #NOTE(review): assumes the smallest non-excluded label is 1 and the smallest raw label is 0 - confirm
    y_pred = get_mv_preds(X_cube, model, n_votes= n_votes)+1
    y_true = np.squeeze(np.argmax(Y_cube,1))
    #excluded labels are removed only now, after voting
    include_idxs = np.where(np.isin(y_true,exclude, invert = True))[0]
    return f1_score(y_true[include_idxs],y_pred[include_idxs],average = 'weighted')

def within_subject_log_reg_performance(X, Y, series_labels, exclude,  verbose = 0, epochs = 40, batch_size = 2, mv = False, permute = False):
    """
    Train and score the softmax classifier with leave-one-series-out cross-validation.

    One fold is run per unique value in series_labels: the model is fit on all other
    series and tested on the held-out one. KFold needs at least two series.

    Args:
        X: numpy array of features with shape [samples, features]
        Y: numpy array of integer class labels, one per sample
        series_labels: numpy array with the series each sample belongs to
        exclude: labels left out of training and scoring
        verbose: verbosity passed to model.fit
        epochs: number of training epochs
        batch_size: training batch size
        mv: if True, scores are computed on majority-voted predictions (5 preceding votes)
        permute: if True, training labels are shuffled across samples before fitting.
            With mv = False the training score is then measured against the shuffled labels

    Returns:
        train_f1_scores: numpy array with shape (n_splits,), weighted F1 on the training series
        test_f1_scores: numpy array with shape (n_splits,), weighted F1 on the held-out series
    """
    #initialize object for k-fold cross-validation (one fold per series)
    unique_series = np.unique(series_labels)
    n_splits = unique_series.size
    kf = KFold(n_splits=n_splits,shuffle = True)
    #initialize empty arrays
    train_f1_scores = np.empty((n_splits,))
    test_f1_scores = np.empty((n_splits,))

    for split_count, (fold_train, fold_test) in enumerate(kf.split(unique_series)):
        print('Split Count: %i'% (split_count+1))
        #KFold yields positions into unique_series, not label values: map them back to
        #labels and use isin so that any label values and any number of series work
        train_idxs = np.where(np.isin(series_labels, unique_series[fold_train]))[0]
        test_idxs = np.where(np.isin(series_labels, unique_series[fold_test]))[0]
        #get training data cubes
        X_train_cube, Y_train_cube, scaler = prepare_data_for_log_reg(X,Y, train_idxs, exclude, train = True)
        if permute:
            perm_idxs = np.random.permutation(np.arange(Y_train_cube.shape[0]))
            Y_train_cube = Y_train_cube[perm_idxs,:]

        n_features, n_outputs = X_train_cube.shape[1], Y_train_cube.shape[1]

        model = get_log_reg_model((n_features,),n_outputs)
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        print('Training Model')
        # fit network
        model.fit(X_train_cube, Y_train_cube, epochs=epochs, batch_size=batch_size, verbose=verbose)

        # evaluate trained network
        print('Evaluate Model')

        if mv:
            #both scores use the scaler fit on the training series and the true (unpermuted) labels
            train_f1 = _mv_weighted_f1(X, Y, train_idxs, exclude, scaler, model)
            test_f1 = _mv_weighted_f1(X, Y, test_idxs, exclude, scaler, model)
        else:
            #get score for training data
            train_f1 = get_log_reg_f1(X_train_cube, Y_train_cube, model)
            # get testing data cubes, scaled with the training scaler
            X_test_cube, Y_test_cube, _ = prepare_data_for_log_reg(X,Y, test_idxs, exclude, train = False, scaler = scaler)
            #get score for testing data
            test_f1 = get_log_reg_f1(X_test_cube, Y_test_cube, model)
        #put scores in array
        train_f1_scores[split_count] = train_f1
        test_f1_scores[split_count] = test_f1

    return train_f1_scores, test_f1_scores

In [4]:
#folder on the mounted Drive that holds one sub-folder per subject
data_folder = '/content/drive/MyDrive/limb-position-EMG-Repo/EMG_data/'
nsubjects = 36
subject_id = 1 #only this subject is processed in this notebook

# User-defined parameters
lo_freq, hi_freq = 20, 450 #lower and upper bound of bandpass filter
win_size = 100 #window size over which to compute time-domain features
step = win_size #step equals the window size; kept separate in case we want to re-run later with some overlap

#excluded labels
exclude = [0,7]

#subject folders are named with a zero-padded two-digit id
subject_folder = os.path.join(data_folder, f'{subject_id:02d}')
print('=======================')
print(subject_folder)

# Process data and get features
#get features across segments and corresponding info
(feature_matrix_sub, target_labels_sub, window_tstamps_sub,
 block_labels_sub, series_labels_sub) = get_subject_data_for_classification(
    subject_folder, lo_freq, hi_freq, win_size, step)

/content/drive/MyDrive/limb-position-EMG-Repo/EMG_data/01


In [6]:
nreps = 10
exclude = [0,7]#labels to exclude

#training settings to sweep over
verbose = 0
epochs_list = [10,20,40]
batch_size_list = [2, 5, 10]

def _scores_to_frame(f1_scores, score_type, shuffled, rep, epochs, batch_size):
    """Wrap the per-fold F1 scores of one run in a long-format dataframe (one row per fold)."""
    n_folds = f1_scores.size
    return pd.DataFrame({'F1_score':f1_scores,
                         'Fold':np.arange(n_folds)+1,
                         'Rep':[rep+1]*n_folds,
                         'Type':[score_type]*n_folds,
                         'Shuffled':[shuffled]*n_folds,
                         'Subject':[subject_id]*n_folds,
                         'Epochs':[epochs]*n_folds,
                         'Batch_size':[batch_size]*n_folds,
                         })

results_frames = []#one dataframe per (run, Train/Test) pair
for epochs in epochs_list:
    for batch_size in batch_size_list:
        #for reproducibility: every parameter combination starts from the same NumPy state.
        #NOTE: this seeds NumPy only (label permutation, and fold shuffling when KFold has no
        #random_state); model initialization/training is not seeded in this cell
        np.random.seed(1)
        for rep in range(nreps):
            print('Epochs %d| Batch size %d|Rep %d'%(epochs, batch_size, rep+1))

            #true labels first, then the label-permuted control
            for run_name, permute in (('True Data', False), ('Permuted Data', True)):
                print(run_name)
                train_f1, test_f1 = within_subject_log_reg_performance(feature_matrix_sub, target_labels_sub, series_labels_sub, exclude,
                                                                       verbose = verbose, epochs = epochs, batch_size = batch_size, permute = permute)
                results_frames.append(_scores_to_frame(train_f1, 'Train', permute, rep, epochs, batch_size))
                results_frames.append(_scores_to_frame(test_f1, 'Test', permute, rep, epochs, batch_size))
results_df = pd.concat(results_frames, axis = 0)

Epochs 10| Batch size 2|Rep 1
Split Count: 1
Training Model
Evaluate Model
Split Count: 2
Training Model
Evaluate Model
Split Count: 1
Training Model
Evaluate Model
Split Count: 2
Training Model
Evaluate Model
Epochs 10| Batch size 2|Rep 2
Split Count: 1
Training Model
Evaluate Model
Split Count: 2
Training Model
Evaluate Model
Split Count: 1
Training Model
Evaluate Model
Split Count: 2
Training Model
Evaluate Model
Epochs 10| Batch size 2|Rep 3
Split Count: 1
Training Model
Evaluate Model
Split Count: 2
Training Model
Evaluate Model
Split Count: 1
Training Model
Evaluate Model
Split Count: 2
Training Model
Evaluate Model
Epochs 10| Batch size 2|Rep 4
Split Count: 1
Training Model
Evaluate Model
Split Count: 2
Training Model
Evaluate Model
Split Count: 1
Training Model
Evaluate Model
Split Count: 2
Training Model
Evaluate Model
Epochs 10| Batch size 2|Rep 5
Split Count: 1
Training Model
Evaluate Model
Split Count: 2
Training Model
Evaluate Model
Split Count: 1
Training Model
Evaluate M

In [8]:
#mean of every remaining column for each (Type, Shuffled, Epochs, Batch_size) combination
(
    results_df
    .groupby(['Type','Shuffled','Epochs','Batch_size'])
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,F1_score,Fold,Rep,Subject
Type,Shuffled,Epochs,Batch_size,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Test,False,10,2,0.764233,1.5,5.5,1.0
Test,False,10,5,0.634544,1.5,5.5,1.0
Test,False,10,10,0.485749,1.5,5.5,1.0
Test,False,20,2,0.810021,1.5,5.5,1.0
Test,False,20,5,0.776334,1.5,5.5,1.0
Test,False,20,10,0.687989,1.5,5.5,1.0
Test,False,40,2,0.821832,1.5,5.5,1.0
Test,False,40,5,0.815008,1.5,5.5,1.0
Test,False,40,10,0.789799,1.5,5.5,1.0
Test,True,10,2,0.142039,1.5,5.5,1.0


In [9]:
results_folder = '/content/drive/MyDrive/limb-position-EMG-Repo/results_data/single_subject_training/log_reg/'
#create the folder on first use; to_hdf does not create missing parent directories
os.makedirs(results_folder, exist_ok = True)
#save results to file (mode='w' replaces any existing file for this subject)
results_fn = 'subject_%02d_training_scheme_results.h5'%(subject_id)
results_df.to_hdf(os.path.join(results_folder,results_fn), key='results_df', mode='w')


In [None]:
#scratch setup for stepping through one subject by hand
#uses series_labels_sub: no variable named series_labels exists at notebook scope
n_splits = np.unique(series_labels_sub).size
X = feature_matrix_sub.copy()
Y = target_labels_sub.copy()
exclude =[0,7]

#training settings
verbose = 0
epochs = 40
batch_size = 2






    

Split Count: 1
Split Count: 2


Training Model


(593, 7)