In [14]:
# Libraries
import numpy as np
import pandas as pd
import numpy.random as rand
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Globals
test_percent = 0.20 # percent of dataset withheld for validation

In [3]:
# Import Training Data
# column names
names = ['Subject ID',
            'Jitter (local)', 'Jitter (local, abs)', 'Jitter (rap)', 
            'Jitter (ppq5)', 'Jitter (ddp)', 'Shimmer (local)', 
            'Shimmer (local, dB)', 'Shimmer (apq3)', 'Shimmer (apq5)', 
            'Shimmer (apq11)', 'Shimmer (dda)', 'AC', 'NTH', 'HTN',
            'Median Pitch', 'Mean Pitch', 'Std Dev Pitch', 'Min Pitch', 
            'Max Pitch', 'Num Pulses', 'Num Periods', 'Mean Period',
            'Std Dev Periods', 'Frac Unvoiced Frames', 'Num  Breaks',
            'Degree of Breaks']
# training column names
train_names = names + ['UPDRS', 'class info']
               
df = pd.read_csv("../Parkinson_Multiple_Sound_Recording/train_data.txt", 
                       header=None,
                       names =train_names)
df.head()

Unnamed: 0,Subject ID,Jitter (local),"Jitter (local, abs)",Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),"Shimmer (local, dB)",Shimmer (apq3),Shimmer (apq5),...,Max Pitch,Num Pulses,Num Periods,Mean Period,Std Dev Periods,Frac Unvoiced Frames,Num Breaks,Degree of Breaks,UPDRS,class info
0,1,1.488,9e-05,0.9,0.794,2.699,8.334,0.779,4.517,4.609,...,187.576,160,159,0.006065,0.000416,0.0,0,0.0,23,1
1,1,0.728,3.8e-05,0.353,0.376,1.059,5.864,0.642,2.058,3.18,...,234.505,170,169,0.005181,0.000403,2.247,0,0.0,23,1
2,1,1.22,7.4e-05,0.732,0.67,2.196,8.719,0.875,4.347,5.166,...,211.442,1431,1427,0.006071,0.000474,10.656,1,0.178,23,1
3,1,2.502,0.000123,1.156,1.634,3.469,13.513,1.273,5.263,8.771,...,220.23,94,92,0.00491,0.00032,0.0,0,0.0,23,1
4,1,3.509,0.000167,1.715,1.539,5.145,9.112,1.04,3.102,4.927,...,225.162,117,114,0.004757,0.00038,18.182,1,13.318,23,1


In [4]:
# Get examples
X = df.drop(['UPDRS', 'class info'], axis=1)
# Get labels
Y = df['class info']

### Baseline: Linear Regression

In [22]:
# runs linear regression on X, Y, returns accuracy
def linear_model(X, Y, stats=False):
    # Separate Training and testing data
    X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=test_percent, random_state=0)
    # Create Classifier
    lin_reg = LinearRegression()
    # Fit the classifer
    lin_reg.fit(X_train, Y_train)
    # Prediction
    predict = np.rint(lin_reg.predict(X_test))
    # Results
    if stats:
        print(confusion_matrix(Y_test, predict))
        print(classification_report(Y_test, predict))
    # accuracy
    return accuracy_score(Y_test, predict)*100
print("Accruacy: %.1f%%" % linear_model(X,Y))

Accruacy: 97.1%


### Logistic Regression

In [23]:
# runs logistic regression on X, Y, returns results accuracy
# INPUTS: X, Y are examples and labels
# OUTPUTS:  p -> penalty ('l1', 'l2', etc.) = 'l2'
#           d -> dual (boolean)             = False
#           t -> tolerance                  = 1e-5
def logistic_model(X, Y, p='l2', d=False, t=1e-5, stats=False):
    # Separate Training and testing data
    X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=test_percent, random_state=0)
    # Create Classifier
    log_reg = LogisticRegression(penalty=p, dual=d, 
                                 random_state=0, solver='liblinear')
    # Fit the classifer
    log_reg.fit(X_train, Y_train)
    # Prediction
    predict = log_reg.predict(X_test)
    # Results
    if stats:
        print(confusion_matrix(Y_test, predict))
        print(classification_report(Y_test, predict))
    # return accuracy
    return accuracy_score(Y_test, predict)*100
print("Accruacy: %.1f%%" % logistic_model(X, Y))

Accruacy: 98.1%


### SVMs & Kernel Methods

In [24]:
# run an SVM with the specified Kernel
def SVM(X, Y, k='linear', p=3, stats=False):
    # Separate Training and testing data
    X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=test_percent, random_state=0)
    # Create Classifier
    clf = SVC(kernel=k, degree=p, random_state=0, gamma='scale')
    # Fit the Classifier
    clf.fit(X_train, Y_train)
    # Predictions
    predict = clf.predict(X_test)
    # Results
    if stats:
        print(confusion_matrix(Y_test, predict))
        print(classification_report(Y_test, predict))
    # Return accuracy
    return accuracy_score(Y_test, predict)*100

# pred = SVM(X, Y, k='linear', p=100)
print("Linear Accruacy: %.1f%%" % SVM(X, Y, k='linear'))
print("Poly-1 Accruacy: %.1f%%" % SVM(X, Y, k='poly', p=1))
print("Poly-3 Accruacy: %.1f%%" % SVM(X, Y, k='poly', p=3))
print("Poly-10 Accruacy: %.1f%%" % SVM(X, Y, k='poly', p=10))
print("RBF Accruacy: %.1f%%" % SVM(X, Y, k='rbf'))
print("Sigmoid Accruacy: %.1f%%" % SVM(X, Y, k='sigmoid'))

Linear Accruacy: 100.0%
Poly-1 Accruacy: 86.1%
Poly-3 Accruacy: 84.6%
Poly-10 Accruacy: 72.1%
RBF Accruacy: 84.6%
Sigmoid Accruacy: 55.8%
