# Import necessary modules and libraries

In [1]:
import PyQt5
%config InlineBackend.figure_format = 'retina'
%matplotlib qt5

#python packages
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
import pac
import warnings

import seaborn as sns
sns.set_style('white')

import imp
import shape
import utils
from load_features import load_WaveformShape_features, Bandpower_features, mean_and_peak_freqs, statistics, fractal_dimensions, entropies
from val_metrics import *

# Re-import the local helper modules so edits to utils.py / shape.py are picked
# up without restarting the kernel. importlib.reload is the supported
# replacement for imp.reload (the `imp` module is deprecated and was removed in
# Python 3.12).
import importlib

importlib.reload(utils)
importlib.reload(shape)

<module 'shape' from 'C:\\Users\\USER\\Documents\\Yachay_Tech\\Thesis_Project\\ParkinsonsDetection\\python_scripts\\shape.py'>

# Load data and meta-data

In [2]:
"""Which comparison to make:
    1. Off-med vs Controls
    2. On-med vs Controls
    3. Off-med vs On-med
    """
comparison = 3
dataset = 'UCSD' #UCSD or UNM

In [3]:
# Options forwarded to utils.loadPD.
all_chan = False
EO = False  # EO = eyes opened

# Frequency bands in Hz, one [low, high] pair per band.
# NOTE(review): no band covers 12-16 Hz -- confirm this gap is intentional.
bands = [
    [0.5, 4],   # Delta
    [4, 8],     # Theta
    [8, 12],    # Alpha
    [16, 32],   # Beta
    [32, 64],   # Gamma
]

In [4]:
# Load the recording metadata and the EEG signals through the project helpers.
# Later cells use Fs to convert sample indices to seconds, use S / Smed / Sc as
# the row counts of the 'off' / 'on' / 'C' groups, and pass flo / fhi to
# utils.measure_pac. `eeg` is indexed by group key ('off', 'on', 'C').
# NOTE(review): the meaning of `t` and `rejects` is not visible here -- see utils.
Fs, t, S, Sc, Smed, flo, fhi = utils.loadmeta()  
eeg,rejects = utils.loadPD(EO, all_chan, dataset) # EO means Eyes Opened

# Plotting a signal

In [7]:
# Draw one control-group recording against time (seconds = sample index / Fs).
sns.set(font_scale=1.2)
data = eeg['C'][1]
time = np.arange(data.size) / Fs

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(time, data, lw=1.5, color='k')
ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Voltage')
ax.set_xlim([time.min(), time.max()])
ax.set_title('EEG Signal for Healthy Subject')
sns.despine()

# plt.plot(eeg['off'][6,0:1000],label='control')
# plt.ticklabel_format(axis="x", style="sci", scilimits=(0,0))

# Waveform Shape Features

In [5]:
"""
This cell saves features from waveform shape and PAC in .pkl files -- Just run once, then commented--
"""
# import pickle
# shape_features = [ShR, PTR, StR, RDR, pac]
# shape_featuresStr = ["ShR", "PTR", "StR", "RDR", "pac"]
# for i in range(len(shape_features)):
#     f = open(shape_featuresStr[i] + ".pkl","wb")
#     pickle.dump(shape_features[i],f)
#     f.close()

'\nThis cell saves features from waveform shape and PAC in .pkl files -- Just run once, then commented--\n'

In [5]:
"""This function calculates the shape measures calculated for analysis
    of the PD data set

    1. Peak and trough times(pks,trs)
    2. Peak and trough sharpness(pksharp,trsharp)
    3. Rise and decay steepnes(risteep,desteep)
    3. Sharpness ratio(ShR)
    4. Steepness ratio(StR)
    5. Peak-to-trough ratio(PTR)
    6. Rise-to-decay ratio(RDR)
    """
widthS = 3 #To calculate Waveform Shape features

pks,trs,ShR,PTR,StR,RDR = utils.measure_shape(eeg, rejects, widthS=widthS)
"""
Algorithms for estimating phase-amplitude coupling
"""
pac = utils.measure_pac(eeg,rejects,flo,fhi,Fs=Fs)

# Calculate Spectral Features

In [6]:
"""
Absolute and Relative BandPower features, from all five bands: Delta, Theta, Alpha, Beta, Gamma
"""
abs_powerOff = Bandpower_features(eeg['off'], Fs, bands, S, False, 'welch')
abs_powerOn = Bandpower_features(eeg['on'], Fs, bands, Smed, False, 'welch')
abs_powerCtl = Bandpower_features(eeg['C'], Fs, bands, Sc, False, 'welch')

rel_powerOff = Bandpower_features(eeg['off'], Fs, bands, S, True, 'welch')
rel_powerOn = Bandpower_features(eeg['on'], Fs, bands, Smed, True, 'welch')
rel_powerCtl = Bandpower_features(eeg['C'], Fs, bands, Sc, True, 'welch')

In [7]:
"""
Mean and Peak Frequency from the spectrum
"""
meanFreqsOff = mean_and_peak_freqs(eeg['off'], Fs, S)[0] 
meanFreqsOn = mean_and_peak_freqs(eeg['on'], Fs, Smed)[0]
meanFreqsCtl = mean_and_peak_freqs(eeg['C'], Fs, Sc)[0]

peakFreqsOff = mean_and_peak_freqs(eeg['off'], Fs, S)[1]
peakFreqsOn = mean_and_peak_freqs(eeg['on'], Fs, Smed)[1]
peakFreqsCtl = mean_and_peak_freqs(eeg['C'], Fs, Sc)[1]

# Statistical Features

In [8]:
"""This cell calculates statistical measures extracted from EEG for analysis
    of the PD data set

    1. Mean
    2. Standard Deviation
    3. Skewness
    4. Kurtosis
    5. Maximum
    6. Minimum
    7. 5th percentile value
    8. 25th percentile value
    9. 75th percentile value
    10. 95th percentile value
    11. Median
    12. Variance
    13. Root Mean Square value
    """
statsOff = statistics(eeg['off'], S).get()
statsOn = statistics(eeg['on'], Smed).get()
statsCtl = statistics(eeg['C'], Sc).get()

# Non-linear analysis

In [9]:
# Fractal-dimension features, one result per group (off-med, on-med, control).
fractalOff, fractalOn, fractalCtl = (
    fractal_dimensions(eeg[group], n_rows)
    for group, n_rows in (('off', S), ('on', Smed), ('C', Sc))
)

In [10]:
# Entropy features, one result per group (off-med, on-med, control).
entOff, entOn, entCtl = (
    entropies(eeg[group], n_rows, Fs)
    for group, n_rows in (('off', S), ('on', Smed), ('C', Sc))
)

# Load dataset with all features

In [11]:
# Class I (off-medication): turn each shape/PAC measure into an (S, 1) column.
f1_B, f2_B, f3_B, f4_B, f5_B = (
    np.reshape(measure['off'], (S, 1))
    for measure in (pac, ShR, StR, PTR, RDR)
)
# Class label for this group: 1
cl_B = np.ones((S, 1))

In [12]:
# Class II (on-medication): turn each shape/PAC measure into an (Smed, 1) column.
f1_C = np.reshape(pac['on'], (Smed, 1))
f2_C = np.reshape(ShR['on'], (Smed, 1))
f3_C = np.reshape(StR['on'], (Smed, 1))
f4_C = np.reshape(PTR['on'], (Smed, 1))
f5_C = np.reshape(RDR['on'], (Smed, 1))

# The label depends on the comparison being run:
#   1 or 3 -> 0
#   2      -> 1 (the control group carries -1 in that comparison)
# Any other value is rejected here; otherwise cl_C would be left undefined, or
# stale from an earlier run of this cell.
if comparison in (1, 3):
    cl_C = np.zeros((Smed, 1))
elif comparison == 2:
    cl_C = np.ones((Smed, 1))
else:
    raise ValueError("comparison must be 1, 2 or 3, got {!r}".format(comparison))

In [13]:
# Class III (controls): turn each shape/PAC measure into an (Sc, 1) column.
f1_E, f2_E, f3_E, f4_E, f5_E = (
    np.reshape(measure['C'], (Sc, 1))
    for measure in (pac, ShR, StR, PTR, RDR)
)
# Class label for this group: -1
cl_E = -np.ones((Sc, 1))

In [14]:
# One matrix per group: every feature block side by side (column order matches
# the `features` name list), with the class label as the last column.
MftB = np.hstack([
    f1_B, f2_B, f3_B, f4_B, f5_B,
    rel_powerOff, abs_powerOff,
    meanFreqsOff, peakFreqsOff,
    statsOff, fractalOff, entOff,
    cl_B,
])
MftC = np.hstack([
    f1_C, f2_C, f3_C, f4_C, f5_C,
    rel_powerOn, abs_powerOn,
    meanFreqsOn, peakFreqsOn,
    statsOn, fractalOn, entOn,
    cl_C,
])
MftE = np.hstack([
    f1_E, f2_E, f3_E, f4_E, f5_E,
    rel_powerCtl, abs_powerCtl,
    meanFreqsCtl, peakFreqsCtl,
    statsCtl, fractalCtl, entCtl,
    cl_E,
])

In [15]:
# Column names for the feature matrices, grouped by feature family in the same
# order in which the blocks are concatenated.
features = (
    # waveform shape and phase-amplitude coupling
    ['PAC', 'ShR', 'StR', 'PtT', 'RtF']
    # relative, then absolute, band power
    + [kind + '_' + band
       for kind in ('rel', 'abs')
       for band in ('delta', 'theta', 'alpha', 'beta', 'gamma')]
    # spectral summary
    + ['meanFreq', 'peakFreq']
    # statistics
    + ['mean', 'std', 'skewness', 'kurtosis', 'maximum', 'minimum']
    + [rank + ' perc' for rank in ('5th', '25th', '75th', '95th')]
    + ['median', 'variance', 'RMS']
    # non-linear measures
    + ['detrended_fluctuation', 'higuchi_fd', 'katz_fd', 'petrosian_fd']
    + ['perm_entropy', 'svd_entropy']
)

In [16]:
# Wrap each group matrix in a DataFrame: feature columns plus a trailing 'class'.
FCM_B, FCM_C, FCM_E = (
    pd.DataFrame(matrix, columns=[*features, 'class'])
    for matrix in (MftB, MftC, MftE)
)

In [17]:
# Assemble the two-class dataset for the selected comparison.
#   TotalDataset - numeric labels, used for modelling
#   visDat       - deep copy with readable class names, used for display
# Each branch picks the frames to stack (in order) and the mapping from the
# numeric label to its name.
if comparison == 3:
    # Patients off-medication (1) vs patients on-medication (0)
    frames, class_names = [FCM_B, FCM_C], {1: 'off_med', 0: 'on_med'}
elif comparison == 2:
    # Patients on-medication (1 in this comparison) vs healthy controls (-1)
    frames, class_names = [FCM_C, FCM_E], {-1: 'control', 1: 'on_med'}
elif comparison == 1:
    # Healthy controls (-1) vs patients off-medication (1)
    frames, class_names = [FCM_E, FCM_B], {-1: 'control', 1: 'off_med'}
else:
    # Without this guard, an unsupported value would silently leave a
    # TotalDataset / visDat from an earlier run in the kernel.
    raise ValueError("comparison must be 1, 2 or 3, got {!r}".format(comparison))

TotalDataset = pd.concat(frames, ignore_index=True)
visDat = TotalDataset.copy(deep=True)
visDat['class'] = visDat['class'].map(class_names)

In [18]:
# Display the labelled dataset. head(n) returns at most n rows, so this shows
# the whole frame whenever it has fewer than 3240 rows.
# NOTE(review): confirm whether a short preview (e.g. visDat.head()) was intended.
visDat.head(3240)

Unnamed: 0,PAC,ShR,StR,PtT,RtF,rel_delta,rel_theta,rel_alpha,rel_beta,rel_gamma,...,median,variance,RMS,detrended_fluctuation,higuchi_fd,katz_fd,petrosian_fd,perm_entropy,svd_entropy,class
0,0.017733,0.098913,0.163642,1.255779,0.686054,0.136813,0.072965,0.159242,0.296707,0.148003,...,0.145518,14.431695,2.983523,0.971172,1.538859,3.156071,1.017937,2.312534,1.173157,off_med
1,0.015171,0.027426,0.034887,0.938801,0.922812,0.144274,0.306474,0.197352,0.221095,0.172563,...,-0.175242,12.177666,2.757860,1.124485,1.472822,2.393348,1.017533,2.318673,0.968671,off_med
2,0.074130,0.059304,0.180660,0.872361,0.659689,0.255378,0.102832,0.112770,0.184038,0.273906,...,0.184774,4.240290,1.602635,0.912102,1.667425,3.347637,1.018317,2.329433,1.272294,off_med
3,0.040598,0.079251,0.181069,1.200192,0.659069,0.080871,0.084503,0.151343,0.163961,0.199832,...,-0.249745,12.298036,2.791133,0.815680,1.759567,4.118678,1.019734,2.417424,1.377425,off_med
4,0.024312,0.050979,0.006251,1.124550,1.014498,0.064976,0.182431,0.240476,0.205108,0.181574,...,0.108688,8.402344,2.283840,1.018992,1.515206,3.096279,1.017425,2.305711,1.066772,off_med
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,0.019738,0.074054,0.014414,0.843230,1.033747,0.106573,0.260207,0.276501,0.199614,0.120696,...,0.059966,6.710670,2.081394,1.126202,1.359608,2.451823,1.016800,2.287053,0.885987,on_med
1076,0.002907,0.031093,0.052324,1.074220,1.128039,0.226736,0.111056,0.081581,0.265683,0.372943,...,-0.003529,6.073371,1.964574,0.980170,1.681824,3.713433,1.013435,2.131549,1.104992,on_med
1077,0.011754,0.009274,0.046552,1.021585,0.898356,0.111009,0.196301,0.274965,0.217806,0.137826,...,0.056445,8.597344,2.331193,1.081778,1.372105,2.855682,1.015550,2.234730,0.918034,on_med
1078,0.011085,0.023967,0.026912,1.056737,1.063928,0.144146,0.208713,0.203343,0.242096,0.131568,...,0.203333,8.454064,2.358044,1.026892,1.375314,2.879516,1.015037,2.240442,0.945589,on_med


In [None]:
# Bar chart comparing the per-group mean of feature columns 17..24
# ('mean' through '25th perc' in `features`).
interval = np.arange(17, 25)
off, on, control = (
    frame.iloc[:, interval].mean(axis=0)
    for frame in (FCM_B, FCM_C, FCM_E)
)

df = pd.DataFrame(
    {
        'PD Off-medication': off,
        'PD On-medication': on,
        'Healthy': control,
    },
    index=np.asarray(features)[interval],
)
ax = df.plot.bar(rot=0)

# Selecting the set of features

In [19]:
# Feature matrix and label column as NumPy arrays. The double brackets keep y
# two-dimensional (one column); the feature-selection and confusion-matrix
# cells flatten it with np.ravel.
X = np.asarray(TotalDataset[features])
y = np.asarray(TotalDataset[['class']])

# PCA

In [None]:
# import pandas as pd
# from sklearn import datasets
# from sklearn.decomposition import PCA

# # load dataset
# df = pd.DataFrame(X, columns=features)

# # Standarize Data
# from sklearn import preprocessing
# scaler = preprocessing.StandardScaler().fit(df)
# data_scaled = pd.DataFrame(scaler.transform(df),columns = df.columns) 

# # PCA
# pca = PCA(.95)
# # pca.fit_transform(data_scaled)
# X = pca.fit_transform(data_scaled)

In [None]:
# # get component loadings (correlation coefficient between original variables and the component) 
# # the squared loadings within the PCs always sums to 1
# loadings = pca.components_
# num_pc = pca.n_features_
# pc_list = ["PC"+str(i) for i in list(range(1, num_pc+1))]
# loadings_df = pd.DataFrame.from_dict(dict(zip(pc_list, loadings)))
# loadings_df['variable'] = df.columns.values
# loadings_df = loadings_df.set_index('variable')

# # positive and negative values in component loadings reflects the positive and negative correlation of the variables
# # with then PCs. 

# # get correlation matrix plot for loadings
# import seaborn as sns
# import matplotlib.pyplot as plt
# ax = sns.heatmap(loadings_df, annot=True, cmap='coolwarm')
# plt.show()

# ANOVA Feature selection, SelectKBest

In [20]:
# Standardise the features, then keep the best ones by ANOVA F-score.
#
# The inputs are re-read from TotalDataset instead of transforming X in place,
# so re-running this cell always starts from the unscaled, full-width feature
# matrix rather than scaling / selecting an already-reduced X.
#
# NOTE(review): the scaler and the selector are both fitted on the complete
# dataset before cross-validation, so the held-out splits influence which
# features are kept. Consider moving both steps into a Pipeline that is fitted
# inside each split.
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, SelectFpr
from sklearn.feature_selection import f_classif, f_regression

N_SELECTED_FEATURES = 26  # number of features kept by SelectKBest

# Scale dataset
X_raw = np.asarray(TotalDataset[features])
scaler = preprocessing.StandardScaler().fit(X_raw)
X_scaled = scaler.transform(X_raw)

# ANOVA feature selection for numeric input and categorical output
fs = SelectKBest(score_func=f_classif, k=N_SELECTED_FEATURES)
# fs = SelectKBest()
# fs = SelectFpr(score_func=f_classif, alpha=0.01)
X = fs.fit_transform(X_scaled, np.ravel(y))

print("The shape of the new dataset is = " + str(X.shape))

The shape of the new dataset is = (1080, 26)


In [68]:
# X_indices = np.arange(X.shape[-1])
# selector = SelectKBest(f_classif, k=4)
# selector.fit(X, np.ravel(y))
# scores = -np.log10(selector.pvalues_)
# scores /= scores.max()
# plt.bar(X_indices , scores, width=.2,
#         label=r'Univariate score ($-Log(p_{value})$)')


# plt.title("Comparing feature selection")
# plt.xlabel('Feature number')
# # plt.yticks(())
# plt.axis('tight')
# plt.legend(loc='upper right')
# plt.show()

In [69]:
# df = pd.DataFrame(fs.pvalues_,columns= ['pvalues'],index = features)
# df.sort_values(by=['pvalues'], ascending=True)

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import plot_confusion_matrix
# from sklearn.metrics import classification_report

In [22]:
from sklearn.datasets import make_classification
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split

# Cross-validation scores

In [23]:
from sklearn.model_selection import cross_val_score
from sklearn import model_selection
# Cross-validation configuration passed to cross_vald / roc_curve in the
# classifier cells: 10 random splits, each holding out 20% of the rows for
# testing, with a fixed seed so the splits are repeatable.
seed = 151461
folds = model_selection.ShuffleSplit(n_splits=10, test_size=0.20, random_state=seed)

# Gaussian Process

In [24]:
# Gaussian-process classifier with default settings, evaluated on the shared splits.
# cross_vald returns two values; the box-plot section later reshapes each to
# one entry per split -- presumably per-split accuracy and F1 (see val_metrics).
clf = GaussianProcessClassifier()
accuraccy_Gaussian, f1_Gaussian = cross_vald(clf, folds, X, y)

Accuracy: 0.828 (+/- 0.031)
F1-Score: 0.826 (+/- 0.034)
Precision: 0.851 (+/- 0.068)
Recall: 0.804 (+/- 0.033)


In [None]:
# ROC analysis for the current `clf` on the shared splits.
# NOTE(review): this roc_curve takes (clf, folds, X, y), so it is presumably the
# helper exported by `from val_metrics import *`, not sklearn.metrics.roc_curve.
roc_curve(clf, folds, X, y)

In [None]:
# Confusion matrix for a Gaussian-process classifier on a single 80/20 hold-out
# split (fixed random_state=0), independent of the cross-validation splits.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions use ConfusionMatrixDisplay.from_estimator instead.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y.ravel(),
    test_size=0.20,
    random_state=0,
)
clf = GaussianProcessClassifier().fit(X_train, y_train)

plot_confusion_matrix(clf, X_test, y_test)
plt.show()

# Gradient Boosting Classifier

In [None]:
# Gradient-boosting classifier evaluated on the shared splits.
# random_state is pinned to the notebook seed so repeated runs give the same scores.
clf = GradientBoostingClassifier(
    learning_rate=0.25,
    max_depth=2,
    n_estimators=100,
    random_state=seed,
)
accuraccy_GradBoost, f1_GradBoost = cross_vald(clf, folds, X, y)

In [None]:
# ROC analysis for the current `clf` (gradient boosting when the cell above ran
# last) on the shared splits.
roc_curve(clf, folds, X, y)

# SVM

In [25]:
# Support-vector classifier (degree-3 polynomial kernel, C=60, gamma='scale')
# evaluated on the shared splits; cross_vald returns the two values unpacked below.
clf = SVC(C=60, kernel = 'poly', degree = 3, gamma = 'scale')
accuraccy_SVC, f1_SVC = cross_vald(clf, folds, X, y)

Accuracy: 0.806 (+/- 0.030)
F1-Score: 0.809 (+/- 0.029)
Precision: 0.815 (+/- 0.065)
Recall: 0.805 (+/- 0.063)


In [None]:
# ROC analysis for the current `clf` (the SVC when the cell above ran last) on
# the shared splits.
roc_curve(clf, folds, X, y)

# KNN

In [26]:
# k-nearest-neighbours classifier: 10 neighbours, p=1 (Manhattan distance under
# the default Minkowski metric), votes weighted by inverse distance.
# Evaluated on the shared splits; cross_vald returns the two values unpacked below.
clf = KNeighborsClassifier(n_neighbors = 10, p=1, weights='distance')
accuraccy_kNN, f1_kNN = cross_vald(clf, folds, X, y)

Accuracy: 0.845 (+/- 0.039)
F1-Score: 0.842 (+/- 0.048)
Precision: 0.873 (+/- 0.056)
Recall: 0.813 (+/- 0.056)


In [None]:
# ROC analysis for the current `clf` (the kNN model when the cell above ran
# last) on the shared splits.
roc_curve(clf, folds, X, y)

# Random Forest

In [27]:
# Random-forest classifier evaluated on the shared splits.
# random_state is pinned to the notebook seed: bootstrap sampling and per-split
# feature sub-sampling are random, so without it every run gives different scores.
clf = RandomForestClassifier(
    criterion='gini',
    max_depth=10,
    max_features='sqrt',
    min_samples_split=5,
    n_estimators=100,
    random_state=seed,
)
accuraccy_RF, f1_RF = cross_vald(clf, folds, X, y)

Accuracy: 0.844 (+/- 0.038)
F1-Score: 0.850 (+/- 0.031)
Precision: 0.832 (+/- 0.072)
Recall: 0.871 (+/- 0.050)


In [None]:
# ROC analysis for the current `clf` (the random forest when the cell above ran
# last) on the shared splits.
roc_curve(clf, folds, X, y)

# MLP

In [28]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [29]:
# Multi-layer perceptron (three hidden layers: 50, 100, 50 units) evaluated on
# the shared splits.
# random_state is pinned to the notebook seed: weight initialisation is random,
# so without it every run gives different scores.
# NOTE(review): per the scikit-learn docs, learning_rate only applies to
# solver='sgd', so 'adaptive' has no effect with 'lbfgs'.
clf = MLPClassifier(
    activation='relu',
    alpha=0.05,
    hidden_layer_sizes=(50, 100, 50),
    learning_rate='adaptive',
    solver='lbfgs',
    random_state=seed,
)
accuraccy_MLP, f1_MLP = cross_vald(clf, folds, X, y)

Accuracy: 0.849 (+/- 0.038)
F1-Score: 0.846 (+/- 0.055)
Precision: 0.847 (+/- 0.058)
Recall: 0.855 (+/- 0.063)


In [None]:
# ROC analysis for the current `clf` (the MLP when the cell above ran last) on
# the shared splits.
roc_curve(clf, folds, X, y)

# Accuracy and F1-score boxplots

In [30]:
# One column of per-split scores per classifier, in the order
# Gaussian process, SVC, kNN, random forest, MLP.
acc = np.hstack([
    np.reshape(scores, (folds.get_n_splits(), 1))
    for scores in (accuraccy_Gaussian, accuraccy_SVC, accuraccy_kNN,
                   accuraccy_RF, accuraccy_MLP)
])
f1 = np.hstack([
    np.reshape(scores, (folds.get_n_splits(), 1))
    for scores in (f1_Gaussian, f1_SVC, f1_kNN, f1_RF, f1_MLP)
])

In [31]:
# Label the score columns with the classifier names (same order as the columns
# of acc / f1).
accdf, f1df = (
    pd.DataFrame(scores, columns=['GausProc', 'SVC', 'KNN', 'RandFor', 'MLP'])
    for scores in (acc, f1)
)

In [32]:
# Draw the accuracy and F1 box plots for all classifiers. EO and comparison are
# passed along with the two score frames.
# NOTE(review): boxplots is presumably exported by `from val_metrics import *`;
# how it uses EO / comparison is not visible here.
boxplots(EO, comparison, accdf, f1df)