In [3]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.activations import relu, sigmoid
import pandas as pd
import numpy as np

np.random.seed(1234)

After we have added all libraries, we will split the data into the associated x and y sets

In [21]:
speech_features = pd.read_csv("pd_speech_features.csv", header=1)
num_ids = speech_features["id"].max()
rows = []
#average all the trials per participant id
for i in range(num_ids):
    id_vals = speech_features[speech_features["id"]==i]
    id_mean = id_vals.mean(axis=0)
    rows.append(id_mean)

speech_features = pd.DataFrame(rows, columns=speech_features.columns)
y = speech_features["class"]
X = speech_features.drop(columns=["class", "id"])

Next we will split to train and test sets so we can validate for accuracy

In [22]:
X_trn, X_test, y_trn, y_test = train_test_split(X,y, test_size=.20, stratify=y, shuffle=True)
print("Train Shape: X ", X_trn.shape, " y ", y_trn.shape)
print("Val Shape: X ", X_test.shape, " y ", y_test.shape)

Train Shape: X  (200, 753)  y  (200,)
Val Shape: X  (51, 753)  y  (51,)


This will try to run the data through a nueral network with an architecture of 512 nuerons on the first layer and 128 in the middle layer, outputting a 0 or 1 for classification

In [24]:
model = Sequential()
model.add(Dense(512, input_shape=(X_trn.shape[1],), activation=relu))
model.add(Dense(128, activation=relu))
model.add(Dense(1, activation=sigmoid))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics = ['accuracy'])
model.fit(X_trn, y_trn, validation_data=(X_test, y_test), epochs=10, batch_size=24)

Train on 200 samples, validate on 51 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x21640474e48>

The validation accuracy is kind of all over the place and train accuracy doesn't really decrease the loss much after every epoch so nueral network may not be a good fit, or some of the features aren't very indicative of parkinsons so it is making a skewed judgement.

In [25]:
clf = SVC().fit(X_trn, y_trn)
print(clf.score(X_test, y_test))

0.7058823529411765


In [26]:
pca = PCA(n_components=3, svd_solver='full')
print("Before PCA, number of features is: ", X.shape[1])
X_new = pca.fit_transform(X)
X_trn, X_test, y_trn, y_test = train_test_split(X_new, y, test_size=.20, stratify=y)
print("After PCA, number of features is: ", X_new.shape[1])
pca.explained_variance_ratio_

Before PCA, number of features is:  753
After PCA, number of features is:  3


array([9.99990443e-01, 7.54056562e-06, 1.59136283e-06])

This shows that one feature out of the 752 hold 99.998% of the variance of all features.

In [28]:
clf = SVC().fit(X_trn, y_trn)
print(clf.score(X_test, y_test))

0.7254901960784313


As you can see, an SCV trained on all 753 features achieves around the same accuracy as an SVC with 3 features

In [32]:
model = Sequential()
model.add(Dense(512, input_shape=(X_trn.shape[1],), activation=relu))
model.add(Dense(128, activation=relu))
model.add(Dense(1, activation=sigmoid))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics = ['accuracy'])
model.fit(X_trn, y_trn, validation_data=(X_test, y_test), epochs=10, batch_size=24)

Train on 200 samples, validate on 51 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x21642f12388>

This also results in a slightly higher train accuracy than that from the nueral network above