# This is the source code used in Ryerson-Intel project
Author: Alice

Created: Dec 2019

Modified for EMD analysis: Mar 2, 2020

Classification for EMD Envelope downsampled at 4K

In [1]:
# Mount the google drive to get the feature files
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Import system packages
import os
import glob

# Importing packages
import pandas as pd
import numpy as np
from numpy import set_printoptions

# Preparing dataset
from sklearn.preprocessing import MinMaxScaler
import random

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,RandomForestRegressor
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB


# Cross Validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Performance Measure
from sklearn import metrics
import statistics

# Plotting
import matplotlib.pyplot as plt

In [0]:
# Control setting to see which types of data
FCTOL = 'FC15Per' #FC10Per or FC15Per
SEG   = ''   #FixedFrameSize, Triad, Delta, or '' 

# Set the folder according to segmentation: '16K Fixed Frame Size', '16K Triad', '4k', and 'EMDDeltaEMD'
if SEG=='FixedFrameSize':
  segmentation = '16K Fixed Frame Size'
elif SEG == 'Triad':
  segmentation = '16K Triad' 
elif SEG == 'Delta':
  segmentation = 'EMDDeltaEMD'
else:
  segmentation = '4K'

## Preparing the Dataset
Loading the feature csv file into arrays and normalized the features to range of [0,1]. There are 49 signals in total with 21 stress signals and 28 non-stress signals

In [4]:
# Setting folder directory
datapath = '/content/drive/My Drive/2019 DDK Paper/Journal/Features/EMD/GITA Features/'
 
# The actual datafile is in Statistical Summary folder
if SEG == 'Delta' or SEG == '':
  datapath = datapath + segmentation + '/'
else:
  datapath = datapath + segmentation + '/Statistical Summary/'


filename = SEG+'*'+FCTOL+'*.csv'
#addrs = glob.glob(datapath+'*.csv')
addrs = glob.glob(datapath+'*'+filename)

print(segmentation)
print(datapath)
print(os.path.basename(addrs[0]))
print(os.path.basename(addrs[1]))



4K
/content/drive/My Drive/2019 DDK Paper/Journal/Features/EMD/GITA Features/4K/
20200303EMDAmpHCEnvFC15Per.csv
20200303EMDAmpPDEnvFC15Per.csv


In [5]:
# Construct dataframe for analysis

# Importing data matrices
HCaddr = [addr for addr in addrs if 'HC' in addr ]
PDaddr = [addr for addr in addrs if 'PD' in addr ]
dfHCOriginal = pd.read_csv(HCaddr[0])
dfPDOriginal = pd.read_csv(PDaddr[0])

print('HC matrix dim: ', dfHCOriginal.shape)
print('PD matrix dim: ', dfPDOriginal.shape)


# Combining the HC and PD dataframes
dfHCPD = pd.concat([dfHCOriginal,dfPDOriginal], axis=0)
Labels = np.reshape([np.ones([50,1]), np.zeros([50,1])], (-1,1)) # Label for HC=1 and PD=0

# features
colnames = dfHCPD.columns
if SEG == '':
  hcpdfeatures = np.array(dfHCPD.iloc[:,4:].values)
else:
  hcpdfeatures = np.array(dfHCPD.iloc[:,1:].values)

print('Features: ', colnames)
print('HC matrix dim: ', dfHCPD.shape)
print('size of HCPD features: ', hcpdfeatures.shape)
#print(dfHCPD.dtypes)



HC matrix dim:  (50, 75)
PD matrix dim:  (50, 75)
Features:  Index(['subject', 'soundtype', 'recording', 'segmentNum', 'numIMF', 'OBW1',
       'OBW2', 'OBW3', 'OBW4', 'OBW5', 'OBW6', 'OBW7', 'OBW8', 'OBW9', 'OBW10',
       'OBWRatio1', 'OBWRatio2', 'OBWRatio3', 'OBWRatio4', 'OBWRatio5',
       'OBWRatio6', 'OBWRatio7', 'OBWRatio8', 'OBWRatio9', 'OBWRatio10',
       'PobwdB1', 'PobwdB2', 'PobwdB3', 'PobwdB4', 'PobwdB5', 'PobwdB6',
       'PobwdB7', 'PobwdB8', 'PobwdB9', 'PobwdB10', 'PobwRatio_23',
       'PobwRatio_34', 'PobwRatio_45', 'PobwRatio_56', 'PobwRatio_67',
       'PobwRatio_78', 'P3dB1', 'P3dB2', 'P3dB3', 'P3dB4', 'P3dB5', 'P3dB6',
       'P3dB7', 'P3dB8', 'P3dBBalancing_23', 'P3dBBalancing_34',
       'P3dBBalancing_45', 'P3dBBalancing_56', 'P3dBBalancing_67',
       'P3dBBalancing_78', 'FCenter1', 'FCenter2', 'FCenter3', 'FCenter4',
       'FCenter5', 'FCenter6', 'FCenter7', 'FCenter8', 'FCRatio1', 'FCRatio2',
       'FCRatio3', 'FCRatio4', 'FCRatio5', 'FCRatio6', 'SNRdB',

In [6]:
# Normalizing the features into [0,1]
scaler = MinMaxScaler(feature_range = (0,1)) # scale the values to min 0, max 1
rescaledfeat = np.array(scaler.fit_transform(hcpdfeatures)) # fit the trainning feature X into scaler

set_printoptions(precision=3) # how many decimal places.
print(rescaledfeat[0:5,:])



[[0.583 0.836 0.524 0.436 0.418 0.638 0.282 0.26  0.234 0.116 0.13  0.581
  0.292 0.043 0.363 0.362 0.218 0.816 0.214 0.132 0.539 0.288 0.41  0.505
  0.297 0.109 0.298 0.285 0.355 0.809 0.486 0.433 0.764 0.913 0.392 0.542
  0.53  0.098 0.468 0.535 0.28  0.084 0.399 0.427 0.456 0.623 0.09  0.103
  0.938 0.902 0.096 0.873 0.674 0.561 0.253 0.146 0.132 0.124 0.265 0.143
  0.168 0.211 0.125 0.213 0.069 0.518 0.306 0.28  0.31  0.134 0.137]
 [0.75  0.877 0.78  0.55  0.392 0.722 0.889 0.89  0.625 0.417 0.366 0.759
  0.489 0.    0.026 0.307 0.326 0.722 0.36  0.108 0.392 0.583 0.429 0.654
  0.854 0.669 0.198 0.051 0.235 0.434 0.525 0.51  0.499 0.766 0.961 0.857
  0.695 0.228 0.167 0.485 0.629 0.498 0.133 0.133 0.175 0.62  0.086 0.135
  0.941 0.916 0.097 0.721 0.407 0.564 0.517 1.    0.988 0.451 0.514 0.055
  0.048 0.011 0.069 0.936 0.269 0.032 0.195 0.567 0.453 0.324 0.097]
 [0.667 0.835 0.321 0.348 0.325 0.439 0.231 0.174 0.201 0.127 0.113 0.405
  0.286 0.113 0.296 0.461 0.127 0.599 0.336 0.10

## Classification 
The classifiers tested in this project include Decision Tree, Each classifier uses 5-fold for validation.

In [0]:
# Classifiers
# Decision Tree
DT = DecisionTreeClassifier(random_state=0)
# Gradient Boost
GB = GradientBoostingClassifier(n_estimators=100, 
                                learning_rate=1.0, 
                                max_depth=1, 
                                random_state=0)

# Support Vector Machine
SVM = SVC(random_state = 0, gamma='scale',probability = True)

# Linear Discrimant Analysis
LDA = LinearDiscriminantAnalysis()

# Gaussian Naive Bayes
GNB = GaussianNB()



In [0]:
# Classificaiton function with K-Fold cross-validation with user define iterations to get the standard deviation 

def classify(model, Iterations, num_folds, rescaledX, y):
    acc = []
    pre = []
    rec = []
    rocauc = []
    f1 = []
    Skip = False

    bestAcc = 0
    beststd = 0



    for i in range(Iterations):

        # Shuffle the dataset for each iteration
        data = list(zip(rescaledX,y))
        random.shuffle(data)
        rescaledX, y = zip(*data)
        rescaledX = np.array(rescaledX)
        y = np.array(y)
        #y = y.reshape(-1,1)

        # Perform 5-fold validation
        kfold = KFold(n_splits = num_folds)#, shuffle=True, random_state = None)
        results = cross_val_score(model, rescaledX, y.ravel(), cv = kfold)
        if results.mean()>bestAcc:
          bestAcc = results.mean()
          beststd = results.std()
          print(results)

        for train_index, test_index in kfold.split(rescaledX):
            X_train, X_test = rescaledX[train_index], rescaledX[test_index]
            y_train, y_test = y[train_index], y[test_index]

            y_total = sum(y_test)
            if (y_total == 0) or (y_total == len(y_test)):
                Skip = True
                
            if not Skip:
                # perform training and testing
                if model == SVC:
                    model.fit(X_train,y_train)
                else:
                    model.fit(X_train,y_train.ravel())
                #dtscores = DT.score(X_test,y_test)
                yPred = model.predict(X_test)


                # record performance
                acc = np.append(acc,metrics.accuracy_score(y_test, yPred)) 
                pre = np.append(pre,metrics.precision_score(y_test,
                                                            yPred, 
                                                            pos_label=1, 
                                                            average='macro', 
                                                            labels=np.unique(yPred)))
                rec = np.append(rec,metrics.recall_score(y_test,
                                                         yPred, 
                                                         pos_label=1, 
                                                         average='macro', 
                                                         labels=np.unique(yPred)))
                rocauc = np.append(rocauc, metrics.roc_auc_score(y_test, 
                                                                 yPred,
                                                                 average='macro'))
                f1 = np.append(f1, metrics.f1_score(y_test,yPred))
    return acc, pre, rec, rocauc, f1, bestAcc, beststd
    

    

### Parameter setting
The classification are set to 100 iteration using 5-fold cross validation.

NOTE: You might encounter single predicted labels instead of 2 class labels. This will return an error. The easiest work around is to run the cell again to shuffle tthe data. 

In [0]:
Iterations = 200
num_folds = 10
seed =15

### Support Vector Machine

In [0]:
print('Support Vector Machine')
acc, pre, rec, rocauc, f1, bestacc, beststd = classify(SVM, Iterations, num_folds,  rescaledfeat, Labels)
print('Accuracy (mean, std):', statistics.mean(acc), statistics.stdev(acc))
print('Precision (mean, std):', statistics.mean(pre), statistics.stdev(pre))
print('Recall (mean, std):', statistics.mean(rec), statistics.stdev(pre))
print('Area under the Receiver Operating Characteristic Curve', statistics.mean(rocauc), statistics.stdev(rocauc))
print('F1-score:', statistics.mean(f1), statistics.stdev(f1))
print('Best iteration', bestacc, beststd)

Support Vector Machine
[0.9 0.8 0.6 0.7 0.6 0.8 0.6 0.6 0.7 0.9]
[0.6 0.8 0.8 0.9 0.5 0.8 0.7 0.7 0.9 0.9]
Accuracy (mean, std): 0.6916666666666667 0.14018549196608265
Precision (mean, std): 0.7159768151087595 0.13863026826476252
Recall (mean, std): 0.706377222957084 0.13863026826476252
Area under the Receiver Operating Characteristic Curve 0.706377222957084 0.1347693414428491
F1-score: 0.7019724873891541 0.15807560526007083
Best iteration 0.7600000000000001 0.128062484748657


In [8]:
# Classificaiton function with K-Fold cross-validation with user define iterations to get the standard deviation 

def findbestresults(model, Iterations, num_folds, rescaledX, y):
    
    Skip = False

    bestAcc = 0

    for i in range(Iterations):

        # Shuffle the dataset for each iteration
        data = list(zip(rescaledX,y))
        random.shuffle(data)
        rescaledX, y = zip(*data)
        rescaledX = np.array(rescaledX)
        y = np.array(y)
        #y = y.reshape(-1,1)

        # Perform 5-fold validation
        kfold = KFold(n_splits = num_folds)#, shuffle=True, random_state = None)
        results = cross_val_score(model, rescaledX, y.ravel(), cv = kfold)
        if results.mean()>bestAcc:
          bestAcc = results.mean()
          
          print(results)

          acc = []
          pre = []
          rec = []
          rocauc = []
          f1 = []

          for train_index, test_index in kfold.split(rescaledX):
              X_train, X_test = rescaledX[train_index], rescaledX[test_index]
              y_train, y_test = y[train_index], y[test_index]

              y_total = sum(y_test)
              if (y_total == 0) or (y_total == len(y_test)):
                  Skip = True
                  
              if not Skip:
                  # perform training and testing
                  if model == SVC:
                      model.fit(X_train,y_train)
                  else:
                      model.fit(X_train,y_train.ravel())
                  #dtscores = DT.score(X_test,y_test)
                  yPred = model.predict(X_test)


                  # record performance
                  acc = np.append(acc,metrics.accuracy_score(y_test, yPred)) 
                  pre = np.append(pre,metrics.precision_score(y_test,
                                                              yPred, 
                                                              pos_label=1, 
                                                              average='macro', 
                                                              labels=np.unique(yPred)))
                  rec = np.append(rec,metrics.recall_score(y_test,
                                                          yPred, 
                                                          pos_label=1, 
                                                          average='macro', 
                                                          labels=np.unique(yPred)))
                  rocauc = np.append(rocauc, metrics.roc_auc_score(y_test, 
                                                                  yPred,
                                                                  average='macro'))
                  f1 = np.append(f1, metrics.f1_score(y_test,yPred))

    return acc, pre, rec, rocauc, f1


Iterations = 200
num_folds = 10
seed =14

print('Support Vector Machine')
acc, pre, rec, rocauc, f1 = findbestresults(SVM, Iterations, num_folds,  rescaledfeat, Labels)
print('Accuracy (mean, std):', statistics.mean(acc), statistics.stdev(acc))
print('Precision (mean, std):', statistics.mean(pre), statistics.stdev(pre))
print('Recall (mean, std):', statistics.mean(rec), statistics.stdev(pre))
print('Area under the Receiver Operating Characteristic Curve', statistics.mean(rocauc), statistics.stdev(rocauc))
print('F1-score:', statistics.mean(f1), statistics.stdev(f1))

Support Vector Machine
[0.6 0.8 0.6 0.7 0.9 0.8 0.5 0.6 0.7 0.9]
[0.7 0.6 0.6 0.7 0.9 0.7 0.8 0.6 0.9 0.8]
[0.7 0.9 0.7 0.7 0.6 0.8 0.7 0.8 0.5 0.9]
[0.6 0.9 0.4 0.7 0.9 0.7 0.7 0.9 0.7 0.9]
[0.7 0.8 0.7 0.8 1.  0.9 0.6 0.7 0.8 0.6]
[0.7 0.7 0.8 0.7 0.6 0.8 0.9 0.8 0.6 1. ]
Accuracy (mean, std): 0.76 0.1264911064067352
Precision (mean, std): 0.7945634920634921 0.1237795674992844
Recall (mean, std): 0.7625 0.1237795674992844
Area under the Receiver Operating Characteristic Curve 0.7625000000000001 0.12793450163852838
F1-score: 0.7882967032967032 0.11910447438555657
