In [97]:
# Libraries
import numpy as np
import pandas as pd
import numpy.random as rand
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Globals
# Fraction (not a percentage) of the dataset held out for evaluation; every
# model helper in this notebook passes it to train_test_split as `test_size`.
test_percent = 0.2

In [104]:
# Import Training Data
# Column labels: the subject identifier followed by 26 measurement columns.
# NOTE(review): the order is presumed to match the column order of the data
# file, which ships without a header row — confirm against the dataset docs.
names = [
    'Subject ID',
    # jitter measures
    'Jitter (local)', 'Jitter (local, abs)', 'Jitter (rap)',
    'Jitter (ppq5)', 'Jitter (ddp)',
    # shimmer measures
    'Shimmer (local)', 'Shimmer (local, dB)', 'Shimmer (apq3)',
    'Shimmer (apq5)', 'Shimmer (apq11)', 'Shimmer (dda)',
    'AC', 'NTH', 'HTN',
    # pitch statistics
    'Median Pitch', 'Mean Pitch', 'Std Dev Pitch', 'Min Pitch', 'Max Pitch',
    # pulse / period statistics
    'Num Pulses', 'Num Periods', 'Mean Period', 'Std Dev Periods',
    # voicing statistics
    'Frac Unvoiced Frames', 'Num  Breaks', 'Degree of Breaks',
]
# The training file is read with two extra columns after the shared ones.
train_names = [*names, 'UPDRS', 'class info']

# header=None: no row of the file is treated as a header, so the labels
# above are supplied explicitly.
df = pd.read_csv(
    "../Parkinson_Multiple_Sound_Recording/train_data.txt",
    names=train_names,
    header=None,
)
# Peek at five randomly chosen rows (unseeded, so the rows differ per run).
df.sample(5)

Unnamed: 0,Subject ID,Jitter (local),"Jitter (local, abs)",Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),"Shimmer (local, dB)",Shimmer (apq3),Shimmer (apq5),...,Max Pitch,Num Pulses,Num Periods,Mean Period,Std Dev Periods,Frac Unvoiced Frames,Num Breaks,Degree of Breaks,UPDRS,class info
536,21,4.18,0.000306,1.559,1.733,4.676,16.048,1.425,7.112,10.561,...,497.042,106,92,0.007315,0.00282,54.974,8,52.766,1,0
807,32,2.184,0.000117,0.955,0.921,2.865,22.509,1.827,12.848,15.39,...,498.411,228,186,0.005375,0.002305,4.167,1,1.901,1,0
960,37,2.4,0.000113,1.375,1.121,4.126,11.571,1.063,6.021,6.951,...,267.27,76,74,0.004689,0.0006,38.333,1,18.475,1,0
859,34,0.636,4.5e-05,0.259,0.306,0.777,5.871,0.574,2.827,3.202,...,159.183,123,121,0.007076,0.000313,11.0,0,0.0,1,0
435,17,1.921,0.000138,0.902,1.007,2.706,9.243,0.88,3.537,5.775,...,147.136,50,48,0.007208,0.000355,15.909,1,21.448,5,1


In [4]:
# Tag each row with an index cycling 1..26, repeated 40 times (1040 rows).
# NOTE(review): presumably 40 subjects x 26 recordings stored subject by
# subject, so the index identifies the recording type — confirm.
df['Type'] = np.arange(26 * 40) % 26 + 1
# Spot-check three random rows to see the new column.
df.sample(3)

Unnamed: 0,Subject ID,Jitter (local),"Jitter (local, abs)",Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),"Shimmer (local, dB)",Shimmer (apq3),Shimmer (apq5),...,Num Pulses,Num Periods,Mean Period,Std Dev Periods,Frac Unvoiced Frames,Num Breaks,Degree of Breaks,UPDRS,class info,Type
996,39,5.715,0.000372,3.66,2.912,10.981,12.264,1.047,6.657,7.264,...,40,38,0.006513,0.000757,50.0,1,41.414,1,0,9
712,28,2.06,0.000122,0.858,0.753,2.573,16.186,1.439,5.512,7.385,...,31,29,0.005934,0.000448,70.968,1,25.402,1,0,11
278,11,1.219,8.2e-05,0.741,0.847,2.223,7.466,0.687,3.677,4.914,...,69,68,0.006713,0.000216,0.0,0,0.0,24,1,19


In [11]:
# Examples: every column except the subject identifier and the two
# outcome columns ('UPDRS' and 'class info').
X = df.drop(columns=['Subject ID', 'UPDRS', 'class info'])
# Labels: the 'class info' column is the classification target.
Y = df['class info']

### Baseline: Linear Regression

In [149]:
# runs linear regression on X, Y, returns accuracy
def linear_model(X, Y, stats=False):
    """Fit an ordinary least-squares baseline and score it as a classifier.

    The data is split with ``train_test_split`` (``test_size=test_percent``,
    ``random_state=0``), a ``LinearRegression`` is fitted on the training part,
    and its continuous predictions are rounded to the nearest integer so they
    can be compared with the class labels.

    Args:
        X: examples (feature matrix).
        Y: numeric class labels, one per example.
        stats: when True, also print the confusion matrix and the
            classification report for the held-out split.

    Returns:
        Accuracy on the held-out split, as a percentage (0-100).
    """
    # Separate training and testing data
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_percent, random_state=0)
    # Create the regressor. The `normalize` argument was deprecated in
    # scikit-learn 1.0 and removed in 1.2 (passing it raises TypeError); in
    # exact arithmetic OLS predictions do not depend on feature scaling, so it
    # is simply omitted.
    lin_reg = LinearRegression(copy_X=True)
    # Fit the regressor
    lin_reg.fit(X_train, Y_train)
    # Prediction: round to the nearest integer, then clip into the range of
    # labels seen in training. An unconstrained regression can output values
    # that round to e.g. -1 or 2, which are not valid classes and would show
    # up as spurious extra classes in the reports.
    rounded = np.rint(lin_reg.predict(X_test))
    predict = np.clip(rounded, np.min(Y_train), np.max(Y_train))
    # Results
    if stats:
        print(confusion_matrix(Y_test, predict))
        print(classification_report(Y_test, predict))
    # accuracy
    return accuracy_score(Y_test, predict)*100
print("Accuracy: %.1f%%" % linear_model(X,Y))

Accuracy: 25.0%


### Logistic Regression

In [150]:
# runs cross-validated logistic regression on X, Y, returns accuracy
# INPUTS: X, Y are examples and labels
#         p -> penalty                         = 'l2'
#         d -> dual (boolean)                  = False
#         t -> tolerance                       = 1e-3
#         stats -> print detailed reports      = False
#         m -> max_iter                        = 1e4
#         c -> CV (number of folds)            = 3
def logistic_model(X, Y, p='l2', d=False, t=1e-3, stats=False, m=1e4, c=3):
    """Fit a ``LogisticRegressionCV`` model and score it on a held-out split.

    Args:
        X: examples (feature matrix).
        Y: class labels, one per example.
        p: forwarded as ``penalty``. NOTE(review): the solver is fixed to
            'lbfgs', which per the scikit-learn docs only supports 'l2'.
        d: forwarded as ``dual``.
        t: forwarded as ``tol`` (stopping tolerance).
        stats: when True, also print the confusion matrix and the
            classification report for the held-out split.
        m: maximum number of solver iterations. Floats such as the default
            ``1e4`` are accepted for backward compatibility and converted to
            ``int`` before being handed to scikit-learn.
        c: forwarded as ``cv``.

    Returns:
        Accuracy on the held-out split, as a percentage (0-100).
    """
    # Separate training and testing data
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_percent, random_state=0)
    # Create the classifier. max_iter must be an integer: recent scikit-learn
    # releases validate constructor parameters and reject a float like 1e4.
    log_reg = LogisticRegressionCV(penalty=p, dual=d, tol=t, max_iter=int(m),
                                   random_state=0, solver='lbfgs', cv=c, n_jobs=-1)
    # Fit the classifier
    log_reg.fit(X_train, Y_train)
    # Prediction
    predict = log_reg.predict(X_test)
    # Results
    if stats:
        print(confusion_matrix(Y_test, predict))
        print(classification_report(Y_test, predict))
    # return accuracy
    return accuracy_score(Y_test, predict)*100
print("Accuracy: %.1f%%" % logistic_model(X, Y, c=3))

Accuracy: 50.0%


### SVMs & Kernel Methods

In [157]:
# train a support-vector classifier with the requested kernel and score it
def SVM(X, Y, k='linear', p=3, stats=False):
    """Fit an ``SVC`` on a train split and report accuracy on the test split.

    Args:
        X: examples (feature matrix).
        Y: class labels, one per example.
        k: kernel name, forwarded to ``SVC(kernel=...)``.
        p: forwarded to ``SVC(degree=...)``.
        stats: when True, also print the confusion matrix and the
            classification report for the test split.

    Returns:
        Accuracy on the test split, as a percentage (0-100).
    """
    # Hold out `test_percent` of the rows; the fixed seed keeps the split stable.
    split = train_test_split(X, Y, test_size=test_percent, random_state=0)
    features_fit, features_eval, labels_fit, labels_eval = split

    # max_iter=-1 places no cap on solver iterations.
    svc = SVC(
        kernel=k,
        degree=p,
        gamma='scale',
        tol=1e-4,
        max_iter=-1,
        random_state=0,
    )
    svc.fit(features_fit, labels_fit)
    labels_hat = svc.predict(features_eval)

    if stats:
        print(confusion_matrix(labels_eval, labels_hat))
        print(classification_report(labels_eval, labels_hat))

    return accuracy_score(labels_eval, labels_hat)*100

# Only the linear kernel is currently evaluated; the other kernels are kept
# below for reference.
# pred = SVM(X, Y, k='linear', p=100)
print("Linear Accuracy: %.1f%%" % SVM(X, Y, k='linear'))
# print("Poly-1 Accuracy: %.1f%%" % SVM(X, Y, k='poly', p=1))
# print("Poly-3 Accuracy: %.1f%%" % SVM(X, Y, k='poly', p=3))
# print("Poly-10 Accuracy: %.1f%%" % SVM(X, Y, k='poly', p=10))
# print("RBF Accuracy: %.1f%%" % SVM(X, Y, k='rbf'))
# print("Sigmoid Accuracy: %.1f%%" % SVM(X, Y, k='sigmoid'))

Linear Accuracy: 75.0%


### Naive Bayes

In [158]:
def bayes(X, Y, stats=False):
    """Fit a Bernoulli naive Bayes classifier and score it on a held-out split.

    NOTE(review): BernoulliNB is designed for binary features; whether it is
    the intended model for these continuous measurements (as opposed to, say,
    GaussianNB) should be confirmed.

    Args:
        X: examples (feature matrix).
        Y: class labels, one per example.
        stats: when True, also print the confusion matrix and the
            classification report for the held-out split.

    Returns:
        Accuracy on the held-out split, as a percentage (0-100).
    """
    # Hold out `test_percent` of the rows with a fixed seed.
    split = train_test_split(X, Y, test_size=test_percent, random_state=0)
    features_fit, features_eval, labels_fit, labels_eval = split

    # Default-configured classifier, fitted on the training part only.
    nb = BernoulliNB()
    nb.fit(features_fit, labels_fit)
    labels_hat = nb.predict(features_eval)

    if stats:
        print(confusion_matrix(labels_eval, labels_hat))
        print(classification_report(labels_eval, labels_hat))

    return accuracy_score(labels_eval, labels_hat)*100
print("Accuracy: %.1f%%" % bayes(X, Y))

Accuracy: 50.0%


### KNN

In [164]:
def KNN(X, Y, stats=False, n_neighbors=3):
    """Fit a k-nearest-neighbours classifier and score it on a held-out split.

    Args:
        X: examples (feature matrix).
        Y: class labels, one per example.
        stats: when True, also print the confusion matrix and the
            classification report for the held-out split.
        n_neighbors: number of neighbours that vote on each prediction.
            Defaults to 3, the value that used to be hard-coded, so existing
            calls behave exactly as before.

    Returns:
        Accuracy on the held-out split, as a percentage (0-100).
    """
    # Separate training and testing data
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_percent, random_state=0)
    # Create the classifier
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    # Fit the classifier
    clf.fit(X_train, Y_train)
    # Predictions
    predict = clf.predict(X_test)
    # Results
    if stats:
        print(confusion_matrix(Y_test, predict))
        print(classification_report(Y_test, predict))
    # Return accuracy
    return accuracy_score(Y_test, predict)*100
print("Accuracy: %.1f%%" % KNN(X, Y))

Accuracy: 62.5%


## Use User Info

In [147]:
'''Average across all samples for each user'''
# Layout assumption: 40 patients with 26 consecutive rows each (the same
# 26 x 40 layout used when the 'Type' column is built) — TODO confirm
# against the data file.
n_patients, rows_per_patient = 40, 26
# One single-row frame (index label 'mean') per patient, keyed by position.
patients = {
    i: df.iloc[i * rows_per_patient:(i + 1) * rows_per_patient].agg(['mean'])
    for i in range(n_patients)
}
# Stack the per-patient averages in a single pass. DataFrame.append was
# removed in pandas 2.0, and appending inside a loop re-copied the growing
# frame on every iteration.
avg_df = pd.concat([patients[i] for i in range(n_patients)])
avg_df

Unnamed: 0,Subject ID,Jitter (local),"Jitter (local, abs)",Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),"Shimmer (local, dB)",Shimmer (apq3),Shimmer (apq5),...,Max Pitch,Num Pulses,Num Periods,Mean Period,Std Dev Periods,Frac Unvoiced Frames,Num Breaks,Degree of Breaks,UPDRS,class info
mean,1.0,2.319462,0.000115,1.108269,1.262692,3.324808,10.548308,1.067231,4.189692,6.317538,...,226.728654,171.5,169.384615,0.00502,0.000427,17.318192,0.730769,6.825923,23.0,1.0
mean,2.0,2.688038,0.000215,1.274769,1.453346,3.824231,12.371192,1.182192,5.537654,7.136808,...,163.935231,121.346154,119.038462,0.008001,0.000629,9.720462,0.884615,5.673423,8.0,1.0
mean,3.0,3.006423,0.000131,1.631077,1.689115,4.892885,17.155731,1.494769,8.115385,10.9622,...,308.129154,105.769231,101.846154,0.004544,0.000763,35.108115,1.115385,13.966962,40.0,1.0
mean,4.0,1.545038,6.3e-05,0.806769,0.828808,2.419923,10.647423,1.015308,5.114423,6.672269,...,272.9575,173.230769,170.846154,0.004114,0.000283,11.514115,0.884615,10.894308,5.0,1.0
mean,5.0,2.7496,0.00025,1.23496,1.2752,3.70484,14.94524,1.36904,7.24684,9.55324,...,163.00548,55.884615,53.769231,0.009014,0.000966,23.878,1.038462,11.456154,16.0,1.0
mean,6.0,2.797154,0.000209,1.381615,1.534692,4.145192,14.133038,1.302077,5.587654,8.600538,...,169.448808,115.769231,113.153846,0.007436,0.000602,17.221077,1.038462,8.463385,46.0,1.0
mean,7.0,2.7605,0.000215,1.358231,1.417885,4.074846,12.418423,1.152538,5.290192,7.008,...,182.085846,60.346154,58.269231,0.007847,0.000804,26.624962,0.692308,8.661,40.0,1.0
mean,8.0,2.216308,0.00015,1.031192,1.055462,3.093538,11.9725,1.155462,5.377923,7.255808,...,175.068,91.615385,89.769231,0.006826,0.000637,28.967808,0.615385,8.711692,20.0,1.0
mean,9.0,2.032962,0.000184,0.859885,1.039731,2.5795,11.910154,1.141462,5.1885,7.267577,...,135.174808,67.961538,66.038462,0.009003,0.00063,19.168615,0.884615,10.464231,11.0,1.0
mean,10.0,3.215538,0.000275,1.247269,1.39464,3.741885,13.704962,1.210154,6.20364,8.01752,...,253.655654,55.0,48.692308,0.009327,0.002105,36.007154,1.807692,18.766077,12.0,1.0


In [148]:
'''Average Data'''
# NOTE: this rebinds the notebook-level X and Y to the per-patient averages,
# so any model cell executed after this one is trained and scored on the
# aggregated rows rather than on the individual recordings.
# Examples: drop the identifier and the two outcome columns.
X = avg_df.drop(columns=['Subject ID', 'UPDRS', 'class info'])
# Labels: per-patient mean of 'class info'.
Y = avg_df['class info']