# This is the source code used in Ryerson-Intel project
Author: Alice

Created: Dec 2019

Modified for EMD analysis: Mar 2, 2020

Classification for EMD fixed frame

Using the EMD features and its delta features

In [0]:
# Mount the google drive to get the feature files
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Import system packages
import os
import glob

# Importing packages
import pandas as pd
import numpy as np
from numpy import set_printoptions

# Preparing dataset
from sklearn.preprocessing import MinMaxScaler
import random

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,RandomForestRegressor
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB



# Cross Validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Performance Measure
from sklearn import metrics
import statistics

# Plotting
import matplotlib.pyplot as plt

In [0]:
# Control setting to see which types of data
FCTOL = 'FC10Per' #FC10Per or FC15Per
SEG   = 'Triad'   #FixedFrameSize, Triad, or '' 

## Preparing the Dataset
Loading the feature csv file into arrays and normalized the features to range of [0,1]. There are 49 signals in total with 21 stress signals and 28 non-stress signals

In [0]:
# Setting folder directory
datapath = '/content/drive/My Drive/2019 DDK Paper/Journal/Features/EMD/GITA Features/'
# use the folder name for segmentation: '16K Fixed Frame Size', '16K Triad', '4k', and 'EMDDeltaEMD'
segmentation = 'EMDDeltaEMD'  
# The actual datafile is in Statistical Summary folder
datapath = datapath + segmentation +'/'

filename = SEG+'*'+FCTOL+'*.csv'
#addrs = glob.glob(datapath+'*.csv')
addrs = glob.glob(datapath+'*'+filename)

print(segmentation)
print(os.path.basename(addrs[0]))
print(os.path.basename(addrs[1]))



EMDDeltaEMD
EMDHCTriadFC10Per.csv
EMDPDTriadFC10Per.csv


In [0]:
# Construct dataframe for analysis

# Importing data matrices and remove rows containing nan
HCaddr = [addr for addr in addrs if 'HC' in addr ]
PDaddr = [addr for addr in addrs if 'PD' in addr ]

dfHCOriginal = pd.read_csv(HCaddr[0])
dfHCOriginal.dropna()
print('HC matrix dim: ', dfHCOriginal.shape)

dfPDOriginal = pd.read_csv(PDaddr[0])
dfPDOriginal.dropna()
print('PD matrix dim: ',dfPDOriginal.shape)




# Remove rows that contain 0 for all std features
def removeZeroData(dftmp):
  colnames = dftmp.columns
  stdcol = [header for header in colnames if 'Std' in header]
  stdtotal = dftmp[stdcol].sum(axis=1)
  dropIdx = [i for i,std in enumerate(stdtotal) if std==0]
  #print(stdtotal)
  print('Removing ', len(dropIdx), ' rows')
  for i in dropIdx:
    dftmp = dftmp.drop(dftmp.index[i])
  return dftmp

dfPDOriginal = removeZeroData(dfPDOriginal)
pdrow, pdcol = dfPDOriginal.shape
print(dfPDOriginal.shape)

dfHCOriginal = removeZeroData(dfHCOriginal)
hcrow, hccol = dfHCOriginal.shape
print(dfHCOriginal.shape)



# Combining the HC and PD dataframes
dfHCPD = pd.concat([dfHCOriginal,dfPDOriginal], axis=0)
hclabels = np.ones(hcrow, dtype=int)
pdlabels = np.zeros(pdrow, dtype=int)
                   
Labels = np.reshape(np.append(hclabels, pdlabels), (-1,1)) # Label for HC=1 and PD=0
print(hcrow,pdrow)
print('Label size', Labels.shape)

# features
colnames = dfHCPD.columns
hcpdfeatures = np.array(dfHCPD.iloc[:,1:].values)

print('Features: ', colnames)
print('dfHCPD matrix dim: ', dfHCPD.shape)
print('size of HCPD features: ', hcpdfeatures.shape)



HC matrix dim:  (50, 434)
PD matrix dim:  (50, 434)
Removing  0  rows
(50, 434)
Removing  0  rows
(50, 434)
50 50
Label size (100, 1)
Features:  Index(['subject', 'segmentNum', 'numIMFMean', 'OBW1Mean', 'OBW2Mean',
       'OBW3Mean', 'OBW4Mean', 'OBW5Mean', 'OBW6Mean', 'OBW7Mean',
       ...
       'deltaFCenter1Kurt', 'deltaFCenter2Kurt', 'deltaFCenter3Kurt',
       'deltaFCenter4Kurt', 'deltaFCenter5Kurt', 'deltaFCenter6Kurt',
       'deltaFCenter7Kurt', 'deltaFCenter8Kurt', 'deltaAmpKurt',
       'deltaDurKurt'],
      dtype='object', length=434)
dfHCPD matrix dim:  (100, 434)
size of HCPD features:  (100, 433)


In [0]:
# Normalizing the features into [0,1]
scaler = MinMaxScaler(feature_range = (0,1)) # scale the values to min 0, max 1
rescaledfeat = np.array(scaler.fit_transform(hcpdfeatures)) # fit the trainning feature X into scaler

set_printoptions(precision=3) # how many decimal places.
print(rescaledfeat[0:5,:])
print(rescaledfeat.shape)



[[0.278 0.429 0.499 ... 0.429 0.193 0.447]
 [0.667 0.467 0.592 ... 0.692 0.379 0.649]
 [0.333 0.567 0.    ... 0.266 0.381 0.107]
 [0.167 0.573 0.63  ... 0.181 0.095 0.127]
 [0.5   0.418 0.877 ... 0.412 0.196 0.433]]
(100, 433)


## Classification 
The classifiers tested in this project include Decision Tree, Each classifier uses 5-fold for validation.

In [0]:
# Classifiers
# Decision Tree
DT = DecisionTreeClassifier(random_state=0)
# Gradient Boost
GB = GradientBoostingClassifier(n_estimators=100, 
                                learning_rate=1.0, 
                                max_depth=1, 
                                random_state=0)

# Support Vector Machine
SVM = SVC(random_state = 0, gamma='scale',probability = True)

# Linear Discrimant Analysis
LDA = LinearDiscriminantAnalysis()

# Gaussian Naive Bayes
GNB = GaussianNB()



In [0]:
# Classificaiton function with K-Fold cross-validation with user define iterations to get the standard deviation 

def classify(model, Iterations, num_folds, rescaledX, y):
    acc = []
    pre = []
    rec = []
    rocauc = []
    f1 = []
    Skip = False

    bestAcc = 0
    beststd = 0



    for i in range(Iterations):

        # Shuffle the dataset for each iteration
        data = list(zip(rescaledX,y))
        random.shuffle(data)
        rescaledX, y = zip(*data)
        rescaledX = np.array(rescaledX)
        y = np.array(y)
        #y = y.reshape(-1,1)

        # Perform 5-fold validation
        kfold = KFold(n_splits = num_folds)#, shuffle=True, random_state = None)
        results = cross_val_score(model, rescaledX, y.ravel(), cv = kfold)
        if results.mean()>bestAcc:
          bestAcc = results.mean()
          beststd = results.std()
          print(results)

        for train_index, test_index in kfold.split(rescaledX):
            X_train, X_test = rescaledX[train_index], rescaledX[test_index]
            y_train, y_test = y[train_index], y[test_index]

            y_total = sum(y_test)
            if (y_total == 0) or (y_total == len(y_test)):
                Skip = True
                
            if not Skip:
                # perform training and testing
                if model == SVC:
                    model.fit(X_train,y_train)
                else:
                    model.fit(X_train,y_train.ravel())
                #dtscores = DT.score(X_test,y_test)
                yPred = model.predict(X_test)


                # record performance
                acc = np.append(acc,metrics.accuracy_score(y_test, yPred)) 
                pre = np.append(pre,metrics.precision_score(y_test,
                                                            yPred, 
                                                            pos_label=1, 
                                                            average='macro', 
                                                            labels=np.unique(yPred)))
                rec = np.append(rec,metrics.recall_score(y_test,
                                                         yPred, 
                                                         pos_label=1, 
                                                         average='macro', 
                                                         labels=np.unique(yPred)))
                rocauc = np.append(rocauc, metrics.roc_auc_score(y_test, 
                                                                 yPred,
                                                                 average='macro'))
                f1 = np.append(f1, metrics.f1_score(y_test,yPred))
    return acc, pre, rec, rocauc, f1, bestAcc, beststd

    

### Parameter setting
The classification are set to 100 iteration using 5-fold cross validation.

NOTE: You might encounter single predicted labels instead of 2 class labels. This will return an error. The easiest work around is to run the cell again to shuffle tthe data. 

In [0]:
Iterations = 200
num_folds = 10
seed = 15

## Support Vector Machine

In [0]:
print('Support Vector Machine')
acc, pre, rec, rocauc, f1,bestacc, beststd = classify(SVM, Iterations, num_folds,  rescaledfeat, Labels)
print('Accuracy (mean, std):', statistics.mean(acc), statistics.stdev(acc))
print('Precision (mean, std):', statistics.mean(pre), statistics.stdev(pre))
print('Recall (mean, std):', statistics.mean(rec), statistics.stdev(pre))
print('Area under the Receiver Operating Characteristic Curve', statistics.mean(rocauc), statistics.stdev(rocauc))
print('F1-score:', statistics.mean(f1), statistics.stdev(f1))
print('best iteration accuracy: ', bestacc, beststd)


Support Vector Machine
[0.6 0.8 0.8 0.6 0.8 0.9 0.8 0.6 0.6 0.7]
[0.7 0.9 0.6 0.6 0.9 0.9 0.7 0.7 0.7 0.8]
[0.9 0.8 0.7 0.7 0.7 0.9 0.8 0.8 0.7 0.8]
Accuracy (mean, std): 0.7081469648562301 0.14371151091015189
Precision (mean, std): 0.7372530934631574 0.14657084028895023
Recall (mean, std): 0.7208363443886607 0.14657084028895023
Area under the Receiver Operating Characteristic Curve 0.7180408172321112 0.14091972618621207
F1-score: 0.7348890480293239 0.1446805551229715
best iteration accuracy:  0.78 0.07483314773547886


In [0]:
# Classificaiton function with K-Fold cross-validation with user define iterations to get the standard deviation 

def findbestresults(model, Iterations, num_folds, rescaledX, y):
    
    Skip = False

    bestAcc = 0

    for i in range(Iterations):

        # Shuffle the dataset for each iteration
        data = list(zip(rescaledX,y))
        random.shuffle(data)
        rescaledX, y = zip(*data)
        rescaledX = np.array(rescaledX)
        y = np.array(y)
        #y = y.reshape(-1,1)

        # Perform 5-fold validation
        kfold = KFold(n_splits = num_folds)#, shuffle=True, random_state = None)
        results = cross_val_score(model, rescaledX, y.ravel(), cv = kfold)
        if results.mean()>bestAcc:
          bestAcc = results.mean()
          
          print(results)

          acc = []
          pre = []
          rec = []
          rocauc = []
          f1 = []

          for train_index, test_index in kfold.split(rescaledX):
              X_train, X_test = rescaledX[train_index], rescaledX[test_index]
              y_train, y_test = y[train_index], y[test_index]

              y_total = sum(y_test)
              if (y_total == 0) or (y_total == len(y_test)):
                  Skip = True
                  
              if not Skip:
                  # perform training and testing
                  if model == SVC:
                      model.fit(X_train,y_train)
                  else:
                      model.fit(X_train,y_train.ravel())
                  #dtscores = DT.score(X_test,y_test)
                  yPred = model.predict(X_test)


                  # record performance
                  acc = np.append(acc,metrics.accuracy_score(y_test, yPred)) 
                  pre = np.append(pre,metrics.precision_score(y_test,
                                                              yPred, 
                                                              pos_label=1, 
                                                              average='macro', 
                                                              labels=np.unique(yPred)))
                  rec = np.append(rec,metrics.recall_score(y_test,
                                                          yPred, 
                                                          pos_label=1, 
                                                          average='macro', 
                                                          labels=np.unique(yPred)))
                  rocauc = np.append(rocauc, metrics.roc_auc_score(y_test, 
                                                                  yPred,
                                                                  average='macro'))
                  f1 = np.append(f1, metrics.f1_score(y_test,yPred))
    return acc, pre, rec, rocauc, f1

In [0]:
Iterations = 200
num_folds = 10
seed =14

print('Support Vector Machine')
acc, pre, rec, rocauc, f1 = findbestresults(SVM, Iterations, num_folds,  rescaledfeat, Labels)
print('Accuracy (mean, std):', statistics.mean(acc), statistics.stdev(acc))
print('Precision (mean, std):', statistics.mean(pre), statistics.stdev(pre))
print('Recall (mean, std):', statistics.mean(rec), statistics.stdev(pre))
print('Area under the Receiver Operating Characteristic Curve', statistics.mean(rocauc), statistics.stdev(rocauc))
print('F1-score:', statistics.mean(f1), statistics.stdev(f1))

Support Vector Machine
[0.9 0.6 0.8 0.6 0.7 0.8 0.5 1.  0.8 0.8]
[0.3 0.8 1.  0.8 0.7 0.7 0.8 0.9 0.9 0.7]
[0.8 0.7 0.8 0.7 0.6 0.8 0.8 0.6 0.9 1. ]
Accuracy (mean, std): 0.77 0.12516655570345728
Precision (mean, std): 0.8153571428571429 0.11681834703107995
Recall (mean, std): 0.7722619047619048 0.11681834703107995
Area under the Receiver Operating Characteristic Curve 0.7722619047619048 0.12595321860875075
F1-score: 0.7968797868797869 0.1234353775790824
