In [1]:
# Run this cell to mount Google Drive at /content/drive.
# Later cells install the package and read the data from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the project package from the mounted Drive folder so that the
# EMG_gestures custom functions can be imported in the cells below.
# NOTE(review): `--use-feature=in-tree-build` looks tied to older pip releases;
# confirm that the pip version in this runtime still accepts/needs the flag.
%pip install /content/drive/MyDrive/EMG_gestures/ --use-feature=in-tree-build

Processing ./drive/MyDrive/EMG_gestures
Building wheels for collected packages: EMG-gestures
  Building wheel for EMG-gestures (setup.py) ... [?25l[?25hdone
  Created wheel for EMG-gestures: filename=EMG_gestures-0.1.0-py3-none-any.whl size=35315 sha256=2d0d3dcea8b6eceac3b007f2953f318e76408155ee7ecd88daeaa0bb7fc427af
  Stored in directory: /tmp/pip-ephem-wheel-cache-7164qdnr/wheels/a2/b7/61/2147fa082a9e51bef5dcc38dd3f0898fe0554d62203c0e383e
Successfully built EMG-gestures
Installing collected packages: EMG-gestures
Successfully installed EMG-gestures-0.1.0


In [3]:
#import necessary packages

#our workhorses
import numpy as np
import pandas as pd
import scipy

#to visualize
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#style params for figures
sns.set(font_scale = 2)
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer ship the old names, so use whichever one exists.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white')
plt.rc("axes", labelweight="bold")
from IPython.display import display, HTML

#to load files
import os
import sys
import h5py
import pickle

#the repo is pip-installed above, so the search path does not need to be modified
#import custom functions
from EMG_gestures.utils import *
from EMG_gestures.analysis import log_reg_xsubject_transform_module_train_frac_subjects, log_reg_xsubject_transform_module_train_all_subjects



In [4]:
# Hyper-parameters for each candidate model, keyed by model id (1-5).
# Each entry holds the 'n_dense_pre' and 'activation' values that are handed
# to the model builder when that model is trained.
model_dict = {
    model_id: {'n_dense_pre': n_dense_pre, 'activation': activation}
    for model_id, (n_dense_pre, activation) in enumerate(
        [(0, ''), (1, 'tanh'), (1, 'relu'), (2, 'tanh'), (2, 'relu')],
        start=1,
    )
}


In [20]:
# --- Where the data files live ---
data_folder = '/content/drive/MyDrive/EMG_gestures/EMG_data/'

# Subject folders 01..nsubjects are scanned when the training pool is built.
# NOTE(review): most of the held-out ids below are larger than nsubjects, so
# they never come up in that scan - confirm nsubjects is the intended value.
nsubjects = 5

# Randomly-selected subjects reserved as hold-out test data
test_subjects = [17, 23, 7, 8, 3]

# --- User-defined signal-processing parameters ---
# Band-pass filter bounds (lower, upper)
lo_freq, hi_freq = 20, 450

# Window size over which time-domain features are computed
win_size = 100
# Step equals the window size (no overlap); kept separate so overlap can be tried later
step = win_size


In [21]:
# Accumulators for the data pooled across all non-held-out subjects.
# They start empty and grow by one subject per loop iteration.
feature_matrix_all = np.empty((0,0))
target_labels_all = np.empty((0,))
window_tstamps_all = np.empty((0,))
block_labels_all  = np.empty((0,))
subject_id_all = np.empty((0,))
# Largest block label handed out so far (offset applied to the next subject)
block_count = 0

for subject_id in range(1, nsubjects + 1):
    # Held-out test subjects never enter the pooled training data
    if subject_id in test_subjects:
        continue

    subject_folder = os.path.join(data_folder, '%02d' % (subject_id))
    print('=======================')
    print(subject_folder)

    # Process this subject's recordings: features per window plus matching labels/info
    (feature_matrix, target_labels, window_tstamps,
     block_labels, series_labels) = get_subject_data_for_classification(
        subject_folder, lo_freq, hi_freq, win_size, step)

    # Shift block labels by the running maximum so they do not repeat across subjects
    block_labels = block_labels + block_count
    block_count = np.max([block_count, np.max(block_labels)])

    # Append this subject's data to the pooled arrays
    if feature_matrix_all.size:
        feature_matrix_all = np.vstack((feature_matrix_all, feature_matrix))
    else:
        # first subject: nothing to stack onto yet
        feature_matrix_all = feature_matrix
    target_labels_all = np.hstack((target_labels_all, target_labels))
    window_tstamps_all = np.hstack((window_tstamps_all, window_tstamps))
    block_labels_all = np.hstack((block_labels_all, block_labels))
    # one subject-id entry per block-label entry of this subject
    subject_id_all = np.hstack((subject_id_all, np.full(block_labels.size, float(subject_id))))
        

/content/drive/MyDrive/EMG_gestures/EMG_data/01
/content/drive/MyDrive/EMG_gestures/EMG_data/02
/content/drive/MyDrive/EMG_gestures/EMG_data/04
/content/drive/MyDrive/EMG_gestures/EMG_data/05


In [None]:
results_folder = '/content/drive/MyDrive/EMG_gestures/results_data/xsubject_joint_data/log_reg/'
figure_folder = '/content/drive/MyDrive/EMG_gestures/figures/training_history/xsubject_joint_data/log_reg'
# Make sure both output folders exist before anything is written into them
os.makedirs(results_folder, exist_ok=True)
os.makedirs(figure_folder, exist_ok=True)


def _scores_to_frame(f1_scores, rep, split_type, shuffled):
    """
    Tabulate the F1 scores of one run as a long-format DataFrame.

    Args:
        f1_scores: numpy array of F1 scores; element i is labelled as fold i+1
        rep: zero-based repetition index (stored one-based in the 'Rep' column)
        split_type: value for the 'Type' column ('Train' or 'Test')
        shuffled: value for the 'Shuffled' column (True for the permuted-label run)

    Returns:
        DataFrame with columns F1_score, Fold, Rep, Type, Shuffled
    """
    return pd.DataFrame({'F1_score': f1_scores,
                         'Fold': np.arange(f1_scores.size) + 1,
                         'Rep': rep + 1,
                         'Type': split_type,
                         'Shuffled': shuffled})


#model training args - identical for every model
verbose = 0
epochs = 30
batch_size = 5
validation_split = 0.1
# experiment params
n_splits = 4
nreps = 10
# NOTE(review): log_reg_xsubject_join_data is not among the explicit imports of
# this notebook - confirm it is exported by EMG_gestures.utils or defined earlier.
for model_id in range(2,5+1):
    np.random.seed(1)# Set seed for replicability
    results_df = []
    for rep in range(nreps):
        print('Model %d | Rep %d'%(model_id, rep+1))
        print('--True Data--')
        train_f1, test_f1, train_history = log_reg_xsubject_join_data(
            feature_matrix_all, target_labels_all, subject_id_all, block_labels_all,
            model_dict = model_dict[model_id], n_splits = n_splits,
            verbose = verbose, epochs = epochs, batch_size = batch_size,
            validation_split = validation_split, mv = False, permute = False)
        #put results in dataframes
        results_df.append(_scores_to_frame(train_f1, rep, 'Train', False))
        results_df.append(_scores_to_frame(test_f1, rep, 'Test', False))

        #plot training history (true-data run only)
        fig_title = 'Log reg model %02d; rep %i'%(model_id,rep)
        fig_fn = os.path.join(figure_folder,'log_reg_model_%02d_rep_%i_loss.png'%(model_id,rep))
        plot_training_history(train_history, fig_title,fig_fn)

        #repeat with shuffled data
        print('Model %d | Rep %d'%(model_id, rep+1))
        print('--Permuted Data--')
        # The control run uses the same architecture as the true-data run so that
        # the two sets of scores stored in one results file are comparable
        # (this call previously always used model_dict[1]).
        train_f1_perm, test_f1_perm, train_history = log_reg_xsubject_join_data(
            feature_matrix_all, target_labels_all, subject_id_all, block_labels_all,
            model_dict = model_dict[model_id], n_splits = n_splits,
            verbose = verbose, epochs = epochs, batch_size = batch_size,
            validation_split = validation_split, mv = False, permute = True)

        results_df.append(_scores_to_frame(train_f1_perm, rep, 'Train', True))
        results_df.append(_scores_to_frame(test_f1_perm, rep, 'Test', True))
    #concatenate all data frames
    results_df = pd.concat(results_df,axis = 0)

    #save results to file (one file per model)
    results_fn = 'model_%02d_results.h5'%(model_id)
    results_df.to_hdf(os.path.join(results_folder,results_fn), key='results_df', mode='w')


In [10]:
from sklearn.metrics import accuracy_score, f1_score,make_scorer, log_loss

In [11]:
def get_scores(X, Y, model, score_list ,average = 'weighted', mask_value = -100):
    """
    Get indicated performance scores for a trained model. can mask out samples

    Args:
        X: 2D numpy array with shape [samples, features]
        Y: 2D numpy array with shape [samples,classes]. one-hot coding of classes
        model: trained model object (must provide a predict method)
        score_list: collection of score names to compute; 'f1' and 'accuracy'
            are recognized, any other name is ignored
        average: string argument for f1_score function. Usually 'macro' or 'weighted'
        mask_value: value indicating which samples to mask out

    Returns:
        out_scores: the requested scores, always ordered f1 then accuracy.
            A single requested score is returned on its own, several scores are
            returned as a 1D numpy array, and an empty (0,0) array is returned
            when no recognized score was requested.
    """
    # Samples whose first target column equals mask_value are left out
    nonmasked_idxs = np.where(Y[:,0].flatten()!=mask_value)[0]
    # Get target labels for non-masked timepoints
    y_true = np.argmax(Y,1).flatten()[nonmasked_idxs]
    # Get model predictions for non-masked timepoints
    preds = model.predict(X)
    y_pred = np.argmax(preds,1).flatten()[nonmasked_idxs]

    # Collect scores in a list first so that aggregation does not depend on the
    # scorer's return value exposing numpy attributes such as `.size`
    scores = []
    if 'f1' in score_list:
        scores.append(f1_score(y_true,y_pred,average = average))
    if 'accuracy' in score_list:
        scores.append(accuracy_score(y_true,y_pred))

    if not scores:
        return np.empty((0,0))
    if len(scores) == 1:
        return scores[0]
    return np.hstack(scores)

In [30]:
def evaluate_trained_log_reg(X, Y, test_idxs, exclude, trained_model, score_list,scaler, mv):
    """
    Score an already-trained model on the indicated test samples.

    Args:
        X: feature matrix, indexed by sample along the first axis
        Y: array of class labels, one per sample
        test_idxs: indices of the samples to evaluate on
        exclude: labels to leave out of the evaluation
        trained_model: trained model object
        score_list: names of the scores to compute (passed to get_scores)
        scaler: scaler returned when the training data was prepared
        mv: if truthy, score majority-voted predictions (weighted F1 only)

    Returns:
        test_scores: weighted F1 score when mv is truthy, otherwise the output
            of get_scores for the requested score_list
    """
    #exclude indicated labels
    in_samples = np.where(np.isin(Y,exclude, invert = True))[0]
    test_idxs = np.intersect1d(test_idxs,in_samples)

    print('Evaluate Model')
    if mv:
        # get testing data cubes (excluded labels are filtered after voting)
        X_test_cube, Y_test_cube, scaler = prepare_data_for_log_reg(X,Y, test_idxs, [], train = False, scaler = scaler)
        # NOTE(review): the +1 offset presumably maps vote indices onto label values - confirm
        y_pred = get_mv_preds(X_test_cube, trained_model, n_votes= 5)+1
        y_true = np.squeeze(np.argmax(Y_test_cube,1))
        include_idxs = np.where(np.isin(y_true,exclude, invert = True))[0]
        y_true = y_true[include_idxs]
        y_pred = y_pred[include_idxs]
        test_scores = f1_score(y_true,y_pred,average = 'weighted')
    else:
        # get testing data cubes
        X_test_cube, Y_test_cube, scaler = prepare_data_for_log_reg(X,Y, test_idxs, exclude, train = False, scaler = scaler)
        # score the held-out data with the trained model that was passed in
        test_scores = get_scores(X_test_cube, Y_test_cube, trained_model, score_list)
    return test_scores

def get_trained_model(X, Y, train_idxs, exclude = None, model_dict = None,score_list = None, verbose = 0, epochs = 40, batch_size = 2,\
                      validation_split = 0, mv = False, permute = False):
    """
    Build, train and score a logistic-regression style model on the training samples.

    Args:
        X: feature matrix, indexed by sample along the first axis
        Y: array of class labels, one per sample
        train_idxs: indices of the samples to train on
        exclude: labels to leave out of training (default: none)
        model_dict: dict with keys 'n_dense_pre' and 'activation'
            (default: {'n_dense_pre':0, 'activation':''})
        score_list: names of the scores to compute on the training data (default: ['f1'])
        verbose, epochs, batch_size, validation_split: passed to model.fit
        mv: if truthy, score majority-voted predictions (weighted F1 only)
        permute: accepted for signature compatibility; not used by this function

    Returns:
        train_scores: scores of the trained model on the training data
        model: the trained model
        scaler: scaler returned by the data preparation step
        history: object returned by model.fit
    """
    # Defaults are created per call instead of sharing mutable default objects
    if exclude is None:
        exclude = []
    if score_list is None:
        score_list = ['f1']
    if not model_dict:
        model_dict = {'n_dense_pre':0, 'activation':''}

    #exclude indicated labels
    in_samples = np.where(np.isin(Y,exclude, invert = True))[0]
    train_idxs = np.intersect1d(train_idxs,in_samples)

    #get training data cubes
    X_train_cube, Y_train_cube, scaler = prepare_data_for_log_reg(X,Y, train_idxs, exclude, train = True)

    #test for equal number of samples
    assert X_train_cube.shape[0] == Y_train_cube.shape[0]
    n_features, n_outputs = X_train_cube.shape[1], Y_train_cube.shape[1]
    #build and compile the model for this feature/output size
    model = get_log_reg_model((n_features,),n_outputs, n_dense_pre=model_dict['n_dense_pre'], activation=model_dict['activation'])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    print('Training Model')
    # fit network
    history = model.fit(X_train_cube, Y_train_cube,validation_split = validation_split, \
                        epochs=epochs, batch_size=batch_size, verbose=verbose)
    # evaluate trained network
    print('Evaluate Model on Trained Data')

    if mv:
        # re-prepare the training samples without label exclusion; excluded
        # labels are filtered after voting
        X_test_cube, Y_test_cube, scaler = prepare_data_for_log_reg(X,Y, train_idxs, [], train = False, scaler = scaler)
        # NOTE(review): the +1 offset presumably maps vote indices onto label values - confirm
        y_pred = get_mv_preds(X_test_cube, model, n_votes= 5)+1
        y_true = np.squeeze(np.argmax(Y_test_cube,1))
        include_idxs = np.where(np.isin(y_true,exclude, invert = True))[0]
        y_true = y_true[include_idxs]
        y_pred = y_pred[include_idxs]
        train_scores = f1_score(y_true,y_pred,average = 'weighted')

    else:
        #get score for training data
        train_scores = get_scores(X_train_cube, Y_train_cube, model, score_list)
    return train_scores, model, scaler, history



In [None]:
#

In [24]:
#model_dir = '/content/drive/MyDrive/limb-position-EMG-Repo/model_data/xsubject_transform_module/log_reg/'
# figure_dir = '/content/drive/MyDrive/limb-position-EMG-Repo/figures/training_history/xsubject_transform_module/log_reg/'

# rep = 0
# figure_folder = os.path.join(figure_dir,'rep_%i'%(rep+1))
# if not os.path.isdir(figure_folder):
#     os.makedirs(figure_folder)
# model_folder = os.path.join(model_dir,'rep_%i'%(rep+1))
# if not os.path.isdir(model_folder):
#     os.makedirs(model_folder)

# Work on copies so the pooled arrays built earlier are left untouched
feature_matrix, target_labels = feature_matrix_all.copy(), target_labels_all.copy()
sub_labels, block_labels = subject_id_all.copy(), block_labels_all.copy()

# Scores to compute for every evaluation
score_list = ['f1', 'accuracy']

# Number of cross-validation folds (folds are made over subjects)
n_splits = 4

# Training arguments
verbose, epochs, batch_size = 0, 30, 5
# mv is only ever tested for truthiness, so None leaves majority voting disabled
mv = None
# no label permutation for this run
permute = False
# class labels to leave out of training and evaluation
exclude = [0, 7]

In [18]:
from sklearn.model_selection import KFold

array([1., 2.])

In [25]:
# Scratch, top-level version of the body of
#   log_reg_xsubject_join_data(feature_matrix, target_labels, sub_labels, block_labels, model_dict,
#                              n_splits = 4, verbose = 0, epochs = 40, batch_size = 2,
#                              validation_split = 0.1, mv = False, permute = False)
# Only the split indices are computed in this cell; after the loop, train_idxs
# and test_idxs hold the indices of the LAST split.

# Unique subject ids: these are the units over which the train/test split is made
subs = np.unique(sub_labels)

# Optionally permute the labels while ignoring excluded classes
if permute:
    target_labels = permute_class_within_sub(target_labels, block_labels, sub_labels, exclude)

# K-fold cross-validation object over subjects
# NOTE(review): shuffle=True with no random_state and no seed set in this cell,
# so the subject split presumably differs between runs - confirm this is intended.
kf = KFold(n_splits=n_splits, shuffle=True)

# Containers for the per-split scores and the training history
n_scores = len(score_list)
train_scores_all = np.empty((n_splits, n_scores))
test_scores_all = np.empty((n_splits, n_scores))
train_history = {'loss': np.empty((0, 0)), 'val_loss': np.empty((0, 0))}

for split_count, (subs_train_idxs, subs_test_idxs) in enumerate(kf.split(subs)):
    print('Split Count: %i'% (split_count+1))

    # Subjects assigned to each side of this split
    train_subs, test_subs = subs[subs_train_idxs], subs[subs_test_idxs]
    # Sample indices belonging to those subjects
    train_idxs = np.nonzero(np.isin(sub_labels, train_subs))[0]
    test_idxs = np.nonzero(np.isin(sub_labels, test_subs))[0]




Split Count: 1
Split Count: 2
Split Count: 3
Split Count: 4


In [31]:
#get trained model
validation_split = 0.1
np.random.seed(1)
train, trained_model, scaler, history = get_trained_model(feature_matrix, target_labels, train_idxs,exclude, model_dict[1],\
                                                             score_list,\
                                                            verbose = verbose, epochs = epochs, batch_size = batch_size,\
                                                            validation_split = validation_split,\
                                                            mv = mv, permute = permute)

Training Model
Evaluate Model on Trained Data


In [32]:
# Score the trained model on the subjects held out by the most recent split
test_scores = evaluate_trained_log_reg(
    feature_matrix, target_labels, test_idxs,
    exclude = exclude,
    trained_model = trained_model,
    score_list = score_list,
    scaler = scaler,
    mv = mv,
)


array([0.87826701, 0.87932422])

In [None]:

    



        #put scores in array
        train_scores_all[split_count,:] = train_scores
        test_scores_all[split_count],: = test_scores

        #append history
        train_history['loss'] = np.vstack((train_history['loss'],history.history['loss'])) if train_history['loss'].size else np.array(history.history['loss'])
        if validation_split>0:
            train_history['val_loss'] = np.vstack((train_history['val_loss'],history.history['val_loss'])) if train_history['val_loss'].size else np.array(history.history['val_loss'])

    return train_f1_scores, test_f1_scores, train_history