**Imports**

In [2]:
import numpy as np
import pandas as pd

# Libraries for plotting, pre-processing, modelling and evaluation
import seaborn as sns
import numpy.random as rand
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from random import sample

# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


**Pre-Processing Functions**

*Patients*

In [0]:
def patients(df):
    '''Break a DataFrame into per-patient DataFrames.

    Returns a dict that maps ``Subject ID - 1`` to the rows of `df`
    carrying that Subject ID; entries appear in the order in which the
    Subject IDs first occur in `df`.
    '''
    subject_ids = df['Subject ID']
    return {sid - 1: df.loc[subject_ids == sid] for sid in subject_ids.unique()}

*Summarize Patients*

In [0]:
def stats(df):
    '''Summarize every column of `df` by its mean and standard deviation.

    Returns a one-row DataFrame holding, for each input column ``c`` (in
    the original column order), the two columns ``'c mean'`` and ``'c std'``.
    '''
    summary = {}
    for col in df.columns:
        series = df[col]
        # one-element lists so the resulting frame has exactly one row
        summary[col + ' mean'] = [series.mean(axis=0)]
        summary[col + ' std'] = [series.std(axis=0)]
    return pd.DataFrame(summary)

In [0]:
def stats_patients(df, pat_func=patients):
    '''Build a one-row mean/std summary for every patient.

    `pat_func` splits `df` into a dict of per-patient DataFrames. Each of
    those is summarized with `stats`; the mean/std columns derived from the
    ID and label columns are dropped, and the patient's own 'Subject ID',
    'UPDRS' and 'class info' (taken from its first row) are appended instead.

    Returns a dict with the same keys as `pat_func(df)`.
    '''
    label_cols = ['Subject ID', 'UPDRS', 'class info']
    # summary statistics of the ID/label columns carry no information
    unwanted = [col + suffix for suffix in (' mean', ' std') for col in label_cols]

    summaries = {}
    for key, frame in pat_func(df).items():
        summary = stats(frame).drop(columns=unwanted)
        for col in label_cols:
            summary[col] = frame[col].values[0]
        summaries[key] = summary
    return summaries

*Normalize Patients*

In [0]:
def normalize_patients(df):
    '''Standardize the feature columns, then split the result per patient.

    Every column except 'Subject ID', 'UPDRS' and 'class info' is scaled
    with a StandardScaler fitted on all rows of `df` (i.e. across patients,
    not per patient). The three ID/label columns are re-attached unscaled
    after the features, and the frame is handed to `patients`.

    Returns the dict of per-patient DataFrames produced by `patients`.
    '''
    label_cols = ['Subject ID', 'UPDRS', 'class info']
    # remove labels and ID
    data = df.drop(label_cols, axis=1)
    # fit and transform the data
    scale = StandardScaler()
    # Take column names from the data itself (no dependency on a global
    # defined in a later cell) and keep df's index so that re-attaching the
    # label columns below aligns row-for-row even for a non-default index.
    normalized = pd.DataFrame(scale.fit_transform(data),
                              columns=data.columns,
                              index=df.index)
    # put labels and ID back in
    for col in label_cols:
        normalized[col] = df[col]

    # break into patients and return
    return patients(normalized)

*Normalize & Summarize Patients*

In [0]:
def stats_norm_patients(df):
    '''Normalize the features first, then build the per-patient mean/std summary.

    Equivalent to `stats_patients` with `normalize_patients` as the
    patient-splitting function.
    '''
    summaries = stats_patients(df, normalize_patients)
    return summaries

**Splitting Data**

In [0]:
def partition(patients, size, neurons=14, ratio=0.2):
    '''Train and score a fixed feed-forward network on one random train/test split.

    Parameters
    ----------
    patients : dict
        Maps a sortable key to a DataFrame of examples that contains the
        feature columns plus 'Subject ID', 'UPDRS' and 'class info'.
    size : int
        Number of feature columns (input dimension of the network).
    neurons : int, optional
        Width of each of the seven hidden layers (default 14).
    ratio : float, optional
        Fraction of rows held out for testing (default 0.2).

    Returns
    -------
    float
        Accuracy on the held-out rows.
    '''
    # Network: seven ReLU hidden layers of equal width, one sigmoid output.
    classifier = Sequential()
    classifier.add(Dense(neurons, input_dim=size, activation='relu'))
    for _ in range(6):
        classifier.add(Dense(neurons, activation='relu'))
    classifier.add(Dense(1, activation='sigmoid'))
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Stack every patient, in key order. The previous hard-coded
    # range(1, 39) silently left the last patient out, and
    # DataFrame.append no longer exists in pandas >= 2.0.
    df_all = pd.concat([patients[key] for key in sorted(patients)])
    labels = df_all['class info'].values

    # NOTE(review): this is a row-level split, so rows of the same patient
    # can land in both train and test -- confirm that this is intended.
    X_train, X_test, Y_train, Y_test = train_test_split(
        df_all, labels, test_size=ratio, random_state=42)

    label_cols = ['Subject ID', 'UPDRS', 'class info']
    X_train = X_train.drop(label_cols, axis=1)
    X_test = X_test.drop(label_cols, axis=1)

    # Fitting NN to the Training Set
    classifier.fit(X_train, Y_train, batch_size=10, epochs=150, verbose=0)
    print("fitting complete")

    # predict() returns shape (n, 1); flatten so it lines up with Y_test (n,)
    prediction = np.where(classifier.predict(X_test) > 0.5, 1, 0).ravel()
    accuracy = accuracy_score(Y_test, prediction)

    print("Accuracy: " + str(accuracy))
    return accuracy

**Leave One Out**

In [0]:
def leave_one_out(patients, classifier):
    '''Leave-one-patient-out evaluation of a compiled Keras-style classifier.

    Each patient is held out once; the classifier is trained on all other
    patients and the held-out patient counts as correctly classified when
    the rounded mean of its per-row correctness is 1.

    Parameters
    ----------
    patients : dict (or sequence)
        Per-patient DataFrames containing the feature columns plus
        'Subject ID', 'UPDRS' and 'class info'.
    classifier : model
        Must provide get_weights / set_weights / fit / predict.

    Returns
    -------
    float
        Fraction of patients classified correctly.

    Raises
    ------
    ValueError
        If fewer than two patients are given.
    '''
    label_cols = ['Subject ID', 'UPDRS', 'class info']

    # Iterate over the real keys instead of assuming they are exactly 0..n-1.
    if hasattr(patients, 'keys'):
        keys = list(patients.keys())
    else:
        keys = list(range(len(patients)))
    if len(keys) < 2:
        raise ValueError("leave_one_out needs at least two patients, got %d" % len(keys))

    # Snapshot the untrained weights: without restoring them, every fold
    # would continue training a model that has already seen the patient it
    # is about to be tested on. (Optimizer state is not reset by this.)
    initial_weights = classifier.get_weights()

    n_correct = 0
    for held_out in keys:
        classifier.set_weights(initial_weights)

        # get testing data
        df_test = patients[held_out]
        # get training data: ALL remaining patients (the former
        # train_index[1:-1] slice dropped the last one in every fold)
        df_train = pd.concat([patients[k] for k in keys if k != held_out])

        # Get examples
        X_train = df_train.drop(label_cols, axis=1)
        X_test = df_test.drop(label_cols, axis=1)
        # Get labels
        Y_train = df_train['class info'].values
        Y_test = df_test['class info'].values

        # Fitting NN to the Training Set
        classifier.fit(X_train, Y_train, batch_size=10, epochs=150, verbose=0)

        # predict() returns shape (n, 1); flatten so the comparison with
        # Y_test (n,) is element-wise instead of an (n, n) broadcast
        prediction = np.where(classifier.predict(X_test) > 0.5, 1, 0).ravel()
        correct = (prediction == Y_test)
        # patient-level verdict: 1 when the rounded share of correct rows is 1
        n_correct += np.rint(correct.mean())

    # return the accuracy
    a = n_correct / len(keys)
    print("accuracy = " + str(a))
    return a


**Importing Data**

In [0]:
# Load the training data. The file has no header row, so the column
# layout is spelled out here: the subject identifier followed by the
# 26 feature columns ...
names = [
    'Subject ID',
    'Jitter (local)',
    'Jitter (local, abs)',
    'Jitter (rap)',
    'Jitter (ppq5)',
    'Jitter (ddp)',
    'Shimmer (local)',
    'Shimmer (local, dB)',
    'Shimmer (apq3)',
    'Shimmer (apq5)',
    'Shimmer (apq11)',
    'Shimmer (dda)',
    'AC',
    'NTH',
    'HTN',
    'Median Pitch',
    'Mean Pitch',
    'Std Dev Pitch',
    'Min Pitch',
    'Max Pitch',
    'Num Pulses',
    'Num Periods',
    'Mean Period',
    'Std Dev Periods',
    'Frac Unvoiced Frames',
    'Num  Breaks',
    'Degree of Breaks',
]
# ... plus the two label columns that close every training row.
train_names = [*names, 'UPDRS', 'class info']

df = pd.read_csv("train_data.txt", header=None, names=train_names)

**Varying Preprocessing Method**

In [0]:
# Preprocessing variants under comparison, with the network dimensions
# that go with each of them (same position in all three lists).
P = [patients, stats_patients, normalize_patients, stats_norm_patients]
features = [26, 52, 26, 52]   # input dimension per variant
neurons = [14, 27, 14, 27]    # hidden-layer width per variant

# one leave-one-out accuracy per variant
results = np.zeros(len(P))

for i, preprocess in enumerate(P):
    n_inputs, width = features[i], neurons[i]
    # fresh network for this variant: two ReLU hidden layers, sigmoid output
    classifier = Sequential([
        Dense(width, input_dim = n_inputs, activation = 'relu'),
        Dense(width, activation = 'relu'),
        Dense(1, activation = 'sigmoid'),
    ])
    classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    # evaluate with leave-one-out
    results[i] = leave_one_out(preprocess(df), classifier)

print("results = " + str(results))


**Varying # Layers**

In [0]:
# Compare network depth on the normalized mean/std patient summaries.
P = [stats_norm_patients]
features = 52
neurons = 27


def _build_classifier(n_hidden, width, n_inputs):
    '''Return a compiled network with `n_hidden` ReLU layers of `width` units.

    The first hidden layer takes `n_inputs` features; the output layer is a
    single sigmoid unit trained with binary cross-entropy and Adam.
    '''
    model = Sequential()
    model.add(Dense(width, input_dim = n_inputs, activation = 'relu'))
    for _ in range(n_hidden - 1):
        model.add(Dense(width, activation = 'relu'))
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    return model


# The preprocessing does not depend on the network, so run it once.
patient_summaries = P[0](df)

# N = 3 hidden layers
c3 = _build_classifier(3, neurons, features)
result3 = leave_one_out(patient_summaries, c3)
print("N = 3 --> Accuracy = " + str(result3))

# N = 5 hidden layers
c5 = _build_classifier(5, neurons, features)
result5 = leave_one_out(patient_summaries, c5)
print("N = 5 --> Accuracy = " + str(result5))