In [1]:
# Libraries
import numpy as np
import pandas as pd
import numpy.random as rand
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the training data
# Column labels in file order: the subject identifier followed by the measurement columns
names = [
    'Subject ID',
    'Jitter (local)',
    'Jitter (local, abs)',
    'Jitter (rap)',
    'Jitter (ppq5)',
    'Jitter (ddp)',
    'Shimmer (local)',
    'Shimmer (local, dB)',
    'Shimmer (apq3)',
    'Shimmer (apq5)',
    'Shimmer (apq11)',
    'Shimmer (dda)',
    'AC',
    'NTH',
    'HTN',
    'Median Pitch',
    'Mean Pitch',
    'Std Dev Pitch',
    'Min Pitch',
    'Max Pitch',
    'Num Pulses',
    'Num Periods',
    'Mean Period',
    'Std Dev Periods',
    'Frac Unvoiced Frames',
    'Num  Breaks',
    'Degree of Breaks',
]
# The training table is labelled with two additional trailing columns
train_names = [*names, 'UPDRS', 'class info']

# The file has no header row, so the labels above are supplied explicitly
df = pd.read_csv(
    "Parkinson_Multiple_Sound_Recording/train_data.txt",
    names=train_names,
    header=None,
)
# Peek at five randomly chosen rows
df.sample(5)

Unnamed: 0,Subject ID,Jitter (local),"Jitter (local, abs)",Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),"Shimmer (local, dB)",Shimmer (apq3),Shimmer (apq5),...,Max Pitch,Num Pulses,Num Periods,Mean Period,Std Dev Periods,Frac Unvoiced Frames,Num Breaks,Degree of Breaks,UPDRS,class info
110,5,1.938,0.000171,0.636,0.78,1.908,14.284,1.423,5.575,8.74,...,122.45,36,34,0.008821,0.000612,26.19,1,3.908,16,1
295,12,5.931,0.000512,3.542,3.927,10.627,14.411,1.336,5.544,12.186,...,124.967,33,32,0.008633,0.000781,0.0,0,0.0,32,1
987,38,1.63,0.000131,0.457,0.634,1.37,9.52,0.946,3.788,6.239,...,153.723,35,34,0.008018,0.000981,47.143,0,0.0,1,0
720,28,1.703,6.4e-05,0.671,0.815,2.014,11.549,1.154,3.511,7.295,...,335.909,149,148,0.003747,0.000638,3.509,0,0.0,1,0
713,28,2.302,0.00012,1.041,1.148,3.123,14.581,1.377,6.232,6.639,...,225.281,51,49,0.005203,0.000782,44.231,1,27.036,1,0


In [3]:
'''Leave one out'''
def leave_one_out(patients, clf):
    """Patient-level accuracy of [clf] under leave-one-patient-out cross-validation.

    Each patient is held out in turn; [clf] is refit on the rows of all the
    other patients and asked to classify the held-out patient's rows. The
    per-row predictions are reduced to one verdict per patient by majority vote.

    Parameters
    ----------
    patients : dict or sequence of pandas.DataFrame
        One frame per patient. Every frame must contain the columns
        'Subject ID', 'UPDRS' and 'class info'; all remaining columns are used
        as features. A dict is consumed through its values, so its keys do not
        have to be exactly 0..n-1.
    clf : estimator
        Object exposing `fit(X, y)` and `predict(X)`; it is refit on every fold.

    Returns
    -------
    float
        Fraction of patients whose majority-vote prediction matches their label.

    Raises
    ------
    ValueError
        If fewer than two patients are supplied (nothing would be left to train on).
    """
    # one frame per patient, independent of how the caller keyed them
    frames = list(patients.values()) if isinstance(patients, dict) else list(patients)
    n_patients = len(frames)
    if n_patients < 2:
        raise ValueError('leave_one_out needs at least two patients, got %d' % n_patients)

    # identifier / target columns that must never be used as features
    non_features = ['Subject ID', 'UPDRS', 'class info']
    # number of patients classified correctly
    n_correct = 0

    for held_out, df_test in enumerate(frames):
        # train on ALL other patients (including the last one), combined in a single concat
        df_train = pd.concat(frames[:held_out] + frames[held_out + 1:])

        # Get examples
        X_train = df_train.drop(non_features, axis=1)
        X_test = df_test.drop(non_features, axis=1)
        # Get labels
        Y_train = df_train['class info'].values
        Y_test = df_test['class info'].values

        # fit the classifier
        clf.fit(X_train, Y_train)

        # binarise at 0.5: a no-op for 0/1 class predictions, and maps
        # continuous outputs onto a class
        prediction = np.where(clf.predict(X_test) > 0.5, 1, 0)
        # majority vote over the patient's rows; np.rint rounds an exact 0.5
        # tie to 0, so a tie counts as a miss
        n_correct += int(np.rint(np.mean(prediction == Y_test)))

    # return the accuracy
    return n_correct / n_patients

In [4]:
'''Break into Per Patient DataFrames'''
def patients(df):
    """Split [df] into one DataFrame per distinct 'Subject ID'.

    Returns a dict mapping (subject id - 1) to the rows of [df] carrying that
    subject id. Entries are inserted in order of first appearance of each id,
    and each sub-frame keeps the row index it had in [df].
    """
    subject = df['Subject ID']
    # one boolean-mask selection per distinct id
    return {sid - 1: df[subject == sid] for sid in subject.unique()}

In [5]:
'''Get Feature Features - mean + std'''
def stats(df):
    """Summarise every column of [df] by its mean and standard deviation.

    Returns a one-row DataFrame that holds, for each column c of [df] in
    order, the two columns '<c> mean' and '<c> std'.
    """
    # collect the one-element columns first, then build the frame in one go
    summary = {}
    for col in df.columns:
        series = df[col]
        summary[col + ' mean'] = [series.mean(axis=0)]
        summary[col + ' std'] = [series.std(axis=0)]
    return pd.DataFrame(summary)

In [6]:
'''Get Stats based Patients'''
def stats_patients(df):
    """Collapse each patient's rows into a single row of summary statistics.

    [df] is split per patient with `patients`, every sub-frame is summarised
    with `stats`, and the statistics of the identifier/target columns are
    replaced by the patient's own 'Subject ID', 'UPDRS' and 'class info'
    (taken from the patient's first row).

    Returns a dict with the same keys as `patients(df)`, each mapping to a
    one-row DataFrame.
    """
    meta = ['Subject ID', 'UPDRS', 'class info']
    # mean/std of the identifier and target columns carry no information
    derived_meta = [c + suffix for suffix in (' mean', ' std') for c in meta]

    summaries = {}
    for key, recordings in patients(df).items():
        row = stats(recordings).drop(derived_meta, axis=1)
        # restore the raw identifier and targets
        for c in meta:
            row[c] = recordings[c].values[0]
        summaries[key] = row
    return summaries

In [46]:
'''Normalize the features of each patient'''
def normalize_patients(df):
    """Standardise the feature columns of [df], then split the result per patient.

    All columns except 'Subject ID', 'UPDRS' and 'class info' are passed
    through a StandardScaler fitted on every row of [df] (the scaling is
    global, not per patient). The three excluded columns are re-attached
    unchanged and the frame is split with `patients`.

    NOTE: because the scaler is fitted on all rows, a patient that is later
    held out by leave_one_out has already contributed to the scaling
    statistics.

    Returns the dict produced by `patients` for the standardised frame.
    """
    meta = ['Subject ID', 'UPDRS', 'class info']
    # remove labels and ID
    data = df.drop(meta, axis=1)
    # create Scaler
    scale = StandardScaler()
    # fit and transform the data; keep the original row index and column
    # labels so the columns re-attached below align row-for-row even when
    # [df] does not have a default 0..n-1 index
    normalized = pd.DataFrame(scale.fit_transform(data),
                              columns=data.columns,
                              index=data.index)
    # put labels and ID back in
    for c in meta:
        normalized[c] = df[c]

    # break into patients and return
    return patients(normalized)

#### Actually Run it now...

In [56]:
# Alternative experiment: k-NN on the standardised per-row features
# leave_one_out(normalize_patients(df), KNeighborsClassifier(n_neighbors=7))

# Linear SVM on the per-patient mean/std features.
# `gamma` is not passed: it only parameterises the 'rbf', 'poly' and 'sigmoid'
# kernels, so with kernel='linear' it had no effect on the model.
leave_one_out(stats_patients(df), SVC(kernel='linear', C=10))

0.5