# This is the source code used in Ryesron-Intel project
Author: Alice

Created: Dec 2019

Modified for EMD analysis: Mar 2, 2020

Classification for EMD Triad

In [1]:
# Mount the google drive to get the feature files
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Import system packages
import os
import glob

# Importing packages
import pandas as pd
import numpy as np
from numpy import set_printoptions

# Preparing dataset
from sklearn.preprocessing import MinMaxScaler
import random

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,RandomForestRegressor
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB


# Cross Validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Performance Measure
from sklearn import metrics
import statistics

# Plotting
import matplotlib.pyplot as plt

In [0]:
# Control setting to see which types of data
FCTOL = 'FC15Per' #FC10Per or FC15Per
SEG   = 'Triad'   #FixedFrameSize, Triad, Delta, or '' 

# Set the folder according to segmentation: '16K Fixed Frame Size', '16K Triad', '4k', and 'EMDDeltaEMD'
if SEG=='FixedFrameSize':
  segmentation = '16K Fixed Frame Size'
elif SEG == 'Triad':
  segmentation = '16K Triad' 
elif SEG == 'Delta':
  segmentation = 'EMDDeltaEMD'
else:
  segmentation = ''

## Preparing the Dataset
Loading the feature csv file into arrays and normalized the features to range of [0,1]. There are 49 signals in total with 21 stress signals and 28 non-stress signals

In [4]:
# Setting folder directory
datapath = '/content/drive/My Drive/2019 DDK Paper/Journal/Features/EMD/GITA Features/'
 
# The actual datafile is in Statistical Summary folder
datapath = datapath + segmentation + '/Statistical Summary/'


filename = SEG+'*'+FCTOL+'*.csv'
#addrs = glob.glob(datapath+'*.csv')
addrs = glob.glob(datapath+'*'+filename)

print(segmentation)
print(os.path.basename(addrs[0]))
print(os.path.basename(addrs[1]))



16K Triad
EMDAmpHCTriadFC15Per.csv
EMDAmpPDTriadFC15Per.csv


In [5]:
# Construct dataframe for analysis

# Importing data matrices
HCaddr = [addr for addr in addrs if 'HC' in addr ]
PDaddr = [addr for addr in addrs if 'PD' in addr ]
dfHCOriginal = pd.read_csv(HCaddr[0])
dfPDOriginal = pd.read_csv(PDaddr[0])

print('HC matrix dim: ', dfHCOriginal.shape)
print('PD matrix dim: ', dfPDOriginal.shape)


# Combining the HC and PD dataframes
dfHCPD = pd.concat([dfHCOriginal,dfPDOriginal], axis=0)
dfHCPD = dfHCPD.drop(columns=['AmplificationMean', 'Length(s)Mean', 'PowerOBW(W)Mean', 'Power3dB(W)Mean',
                               'AmplificationStd', 'Length(s)Std', 'PowerOBW(W)Std', 'Power3dB(W)Std',
                               'AmplificationSkew', 'Length(s)Skew', 'PowerOBW(W)Skew', 'Power3dB(W)Skew',
                               'AmplificationKurt', 'Length(s)Kurt', 'PowerOBW(W)Kurt', 'Power3dB(W)Kurt'])

Labels = np.reshape([np.ones([50,1]), np.zeros([50,1])], (-1,1)) # Label for HC=1 and PD=0

# features
colnames = dfHCPD.columns
hcpdfeatures = np.array(dfHCPD.iloc[:,1:].values)

print('Features: ', colnames)
print('HC matrix dim: ', dfHCPD.shape)
print('size of HCPD features: ', hcpdfeatures.shape)



HC matrix dim:  (50, 282)
PD matrix dim:  (50, 282)
Features:  Index(['subject', 'segmentNum', 'numIMFMean', 'OBW1Mean', 'OBW2Mean',
       'OBW3Mean', 'OBW4Mean', 'OBW5Mean', 'OBW6Mean', 'OBW7Mean',
       ...
       'FCenter6Kurt', 'FCenter7Kurt', 'FCenter8Kurt', 'FCRatio1Kurt',
       'FCRatio2Kurt', 'FCRatio3Kurt', 'FCRatio4Kurt', 'FCRatio5Kurt',
       'FCRatio6Kurt', 'SNRdBKurt'],
      dtype='object', length=266)
HC matrix dim:  (100, 266)
size of HCPD features:  (100, 265)


In [6]:
# Normalizing the features into [0,1]
scaler = MinMaxScaler(feature_range = (0,1)) # scale the values to min 0, max 1
rescaledfeat = np.array(scaler.fit_transform(hcpdfeatures)) # fit the trainning feature X into scaler

set_printoptions(precision=3) # how many decimal places.
print(rescaledfeat[0:5,:])



[[0.278 0.49  0.495 ... 0.03  0.312 0.426]
 [0.667 0.449 0.588 ... 0.044 0.295 0.014]
 [0.333 0.626 0.011 ... 0.311 0.079 0.508]
 [0.167 0.592 0.636 ... 0.034 0.188 0.265]
 [0.5   0.413 0.878 ... 0.304 0.331 0.751]]


## Classification 
The classifiers tested in this project include Decision Tree, Each classifier uses 5-fold for validation.

In [0]:
# Classifiers
# Decision Tree
DT = DecisionTreeClassifier(random_state=0)
# Gradient Boost
GB = GradientBoostingClassifier(n_estimators=100, 
                                learning_rate=1.0, 
                                max_depth=1, 
                                random_state=0)

# Support Vector Machine
SVM = SVC(random_state = 0, gamma='scale',probability = True)

# Linear Discrimant Analysis
LDA = LinearDiscriminantAnalysis()

# Gaussian Naive Bayes
GNB = GaussianNB()



In [0]:
# Classificaiton function with K-Fold cross-validation with user define iterations to get the standard deviation 

def classify(model, Iterations, num_folds, rescaledX, y):
    acc = []
    pre = []
    rec = []
    rocauc = []
    f1 = []
    Skip = False

    bestAcc = 0
    beststd = 0



    for i in range(Iterations):

        # Shuffle the dataset for each iteration
        data = list(zip(rescaledX,y))
        random.shuffle(data)
        rescaledX, y = zip(*data)
        rescaledX = np.array(rescaledX)
        y = np.array(y)
        #y = y.reshape(-1,1)

        # Perform 5-fold validation
        kfold = KFold(n_splits = num_folds)#, shuffle=True, random_state = None)
        results = cross_val_score(model, rescaledX, y.ravel(), cv = kfold)
        if results.mean()>bestAcc:
          bestAcc = results.mean()
          beststd = results.std()
          print(results)

        for train_index, test_index in kfold.split(rescaledX):
            X_train, X_test = rescaledX[train_index], rescaledX[test_index]
            y_train, y_test = y[train_index], y[test_index]

            y_total = sum(y_test)
            if (y_total == 0) or (y_total == len(y_test)):
                Skip = True
                
            if not Skip:
                # perform training and testing
                if model == SVC:
                    model.fit(X_train,y_train)
                else:
                    model.fit(X_train,y_train.ravel())
                #dtscores = DT.score(X_test,y_test)
                yPred = model.predict(X_test)


                # record performance
                acc = np.append(acc,metrics.accuracy_score(y_test, yPred)) 
                pre = np.append(pre,metrics.precision_score(y_test,
                                                            yPred, 
                                                            pos_label=1, 
                                                            average='macro', 
                                                            labels=np.unique(yPred)))
                rec = np.append(rec,metrics.recall_score(y_test,
                                                         yPred, 
                                                         pos_label=1, 
                                                         average='macro', 
                                                         labels=np.unique(yPred)))
                rocauc = np.append(rocauc, metrics.roc_auc_score(y_test, 
                                                                 yPred,
                                                                 average='macro'))
                f1 = np.append(f1, metrics.f1_score(y_test,yPred))
    return acc, pre, rec, rocauc, f1, bestAcc, beststd
    

    

### Parameter setting
The classification are set to 100 iteration using 5-fold cross validation.

NOTE: You might encounter single predicted labels instead of 2 class labels. This will return an error. The easiest work around is to run the cell again to shuffle tthe data. 

In [0]:
Iterations = 200
num_folds = 10
seed =15


### Support Vector Machine

In [0]:
print('Support Vector Machine')
acc, pre, rec, rocauc, f1, bestacc, beststd = classify(SVM, Iterations, num_folds,  rescaledfeat, Labels)
print('Accuracy (mean, std):', statistics.mean(acc), statistics.stdev(acc))
print('Precision (mean, std):', statistics.mean(pre), statistics.stdev(pre))
print('Recall (mean, std):', statistics.mean(rec), statistics.stdev(pre))
print('Area under the Receiver Operating Characteristic Curve', statistics.mean(rocauc), statistics.stdev(rocauc))
print('F1-score:', statistics.mean(f1), statistics.stdev(f1))
print('best iteration accuracy: ', bestacc, beststd)

Support Vector Machine
[0.7 0.5 0.7 0.8 0.5 0.5 0.7 0.7 0.7 1. ]
[0.5 0.5 0.6 0.7 0.5 0.9 0.6 0.7 1.  0.9]
[0.6 0.7 0.8 0.5 0.7 0.8 0.9 0.9 0.3 0.8]
[0.4 0.8 0.9 0.6 0.7 0.7 0.6 0.8 0.9 0.8]
Accuracy (mean, std): 0.6794117647058824 0.16350528178917872
Precision (mean, std): 0.7021796218487395 0.163775049378274
Recall (mean, std): 0.6856004901960784 0.163775049378274
Area under the Receiver Operating Characteristic Curve 0.6856004901960785 0.16930341597860044
F1-score: 0.6929617095707061 0.181656183485435
best iteration accuracy:  0.72 0.1469693845669907


In [8]:
# finding the best iteration from 200 iterations

def findbestresults(model, Iterations, num_folds, rescaledX, y):
    
    Skip = False

    bestAcc = 0

    for i in range(Iterations):

        # Shuffle the dataset for each iteration
        data = list(zip(rescaledX,y))
        random.shuffle(data)
        rescaledX, y = zip(*data)
        rescaledX = np.array(rescaledX)
        y = np.array(y)
        #y = y.reshape(-1,1)

        # Perform 5-fold validation
        kfold = KFold(n_splits = num_folds)#, shuffle=True, random_state = None)
        results = cross_val_score(model, rescaledX, y.ravel(), cv = kfold)
        if results.mean()>bestAcc:
          bestAcc = results.mean()
          
          print(results)

          acc = []
          pre = []
          rec = []
          rocauc = []
          f1 = []

          for train_index, test_index in kfold.split(rescaledX):
              X_train, X_test = rescaledX[train_index], rescaledX[test_index]
              y_train, y_test = y[train_index], y[test_index]

              y_total = sum(y_test)
              if (y_total == 0) or (y_total == len(y_test)):
                  Skip = True
                  
              if not Skip:
                  # perform training and testing
                  if model == SVC:
                      model.fit(X_train,y_train)
                  else:
                      model.fit(X_train,y_train.ravel())
                  #dtscores = DT.score(X_test,y_test)
                  yPred = model.predict(X_test)


                  # record performance
                  acc = np.append(acc,metrics.accuracy_score(y_test, yPred)) 
                  pre = np.append(pre,metrics.precision_score(y_test,
                                                              yPred, 
                                                              pos_label=1, 
                                                              average='macro', 
                                                              labels=np.unique(yPred)))
                  rec = np.append(rec,metrics.recall_score(y_test,
                                                          yPred, 
                                                          pos_label=1, 
                                                          average='macro', 
                                                          labels=np.unique(yPred)))
                  rocauc = np.append(rocauc, metrics.roc_auc_score(y_test, 
                                                                  yPred,
                                                                  average='macro'))
                  f1 = np.append(f1, metrics.f1_score(y_test,yPred))
    return acc, pre, rec, rocauc, f1



Iterations = 200
num_folds = 10
seed =14

print('Support Vector Machine')
acc, pre, rec, rocauc, f1 = findbestresults(SVM, Iterations, num_folds,  rescaledfeat, Labels)
print('Accuracy (mean, std):', statistics.mean(acc), statistics.stdev(acc))
print('Precision (mean, std):', statistics.mean(pre), statistics.stdev(pre))
print('Recall (mean, std):', statistics.mean(rec), statistics.stdev(pre))
print('Area under the Receiver Operating Characteristic Curve', statistics.mean(rocauc), statistics.stdev(rocauc))
print('F1-score:', statistics.mean(f1), statistics.stdev(f1))




Support Vector Machine
[0.8 0.7 0.7 0.4 0.6 0.7 0.9 0.6 0.6 0.6]
[0.6 0.9 0.6 0.5 0.6 0.8 0.6 0.9 0.5 0.8]
[0.8 0.8 0.8 0.6 0.5 0.7 0.9 0.7 0.5 0.6]
[0.5 0.7 0.9 0.7 0.9 0.7 0.6 0.6 0.6 0.8]
[0.9 0.8 0.6 0.6 0.5 0.6 0.8 0.7 0.9 0.8]
Accuracy (mean, std): 0.72 0.13984117975602023
Precision (mean, std): 0.719484126984127 0.14733685763539225
Recall (mean, std): 0.7162499999999999 0.14733685763539225
Area under the Receiver Operating Characteristic Curve 0.71625 0.15064739408857697
F1-score: 0.7230128205128205 0.13800919635629388
