In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from  google.colab  import  drive
drive.mount('/content/gdrive')
mydirectory = 'gdrive/My Drive/Colab Notebooks/PD/'

#Parkinson's dataset is downloaded from UCI ML repository: https://archive.ics.uci.edu/ml/datasets/parkinsons
fname = mydirectory + "parkinsons.data"       #195 rows/recordings, there are only 32 human subjects (8 of which are PD)
data = pd.read_csv(fname)                     #6 recordings per subject (7 recordings for only 3 of the subjects)
dataX = data.drop(columns=['status','name'])  #We have 6records*29subj + 7records*3subjects = 195
#print the first few rows
print(data)


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


FileNotFoundError: ignored

In [3]:
#FIRST ATTEMPT
#NOTE THAT THIS IS NOT A GOOD FIT WITH THE DATASET, WE NEED LEAVE SUBJECT OUT INSTEAD

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(dataX, data['status'], test_size=0.5, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)   #ranges, mean, stddev of the features in the training set
X_test = scaler.transform(X_test) 
#especially if you are going to use rbf-svm, this preprocessing step is very important
#because SVM will be calculateing the distance of the test example to each one of the training examples
#you dont want to have one feature ranging 100 to 200 and the other one (which is perhaps more important) ranging from 0 to 1
#Refer to the discussion in the next code cell about what normalization is.



#Now, we start training-optimizing-testing the classifier
#Find the optimal parameters by cross-validation, well, optimal only if you do a proper cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-1, 1, 10, 100],   #libsvm showed that each point has some influence around itself in the space
                     #gamma is the inverse of the sigma of the "sphere"  (the mean of that sphere is on that particular training example)
                     'C': [0.25, 0.5, 1, 2, 4, 8, 16, 32]},
                    {'kernel': ['linear'], 'C': [0.25, 0.5, 1, 2, 4, 8, 16, 32]}]

clf = GridSearchCV(SVC(), tuned_parameters, scoring='f1_macro')  #accuracy can be bad for this dataset because it is imbalanced, we have a lot more Parkinson's subjects.
#Here is the summary of the lecture on why the dataset is imbalanced, thus forcing us to use average of f1 scores of the classes. 
#Keep in mind the study is for telediagnosis of Parkinson's (a good story for COVID-19 era). 
#A good control group is needed, unfortunately we have only 8 healthy subjects.
#In the video I explained why that might be in more detail, the main idea is that finding a good control group (vs Parkinson's) is not easy. 
#You cannot use students as a control group because normally they are not going to be the calling the telediagnosis service to be tested for Parkinsonism. 
#It is important to match demographic groups etc.


clf.fit(X_train, y_train)
print("Best parameters set found on development set:",clf.best_params_)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

#WOW, we get 91% accuracy
#THAT 91% IS NOT REALISTIC THOUGH!  I EXPLAINED WHY IN THE VIDEO.

NameError: ignored

In [0]:
#A LITTLE DISCUSSION ABOUT WHAT NORMALIZATION IS:
#zero centered and unit standard deviation

X_normalized = scaler.fit_transform(dataX)   #ranges, mean, stddev etc of the features in the training set
print('original vs normalized values for the first feature, first 10 rows are shown:')
print(np.concatenate((X_normalized[:10,[0]],dataX.iloc[:10,[0]]), axis=1))
#print(X_normalized[:10,0])
#print(dataX.iloc[:10,0])


original vs normalized values for the first feature, first 10 rows are shown:
[[ -0.82929965 119.992     ]
 [ -0.77097169 122.4       ]
 [ -0.90947638 116.682     ]
 [ -0.90962172 116.676     ]
 [ -0.92565706 116.014     ]
 [ -0.81573501 120.552     ]
 [ -0.82263845 120.267     ]
 [ -1.13595747 107.332     ]
 [ -1.4169878   95.73      ]
 [ -1.43331382  95.056     ]]


In [0]:
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

logo = LeaveOneGroupOut()
print('N_subjects', logo.get_n_splits(dataX, data['status'],data['name'].astype(str).str[:-2]))

#SPLIT INDIVIDUALS
trains = []
tests = []
N_train = 0
for train_index, test_index in logo.split(dataX, data['status'],data['name'].astype(str).str[:-2]):
    if np.random.random()>0.25 :
        trains=np.append(trains, test_index)
        N_train=N_train+1
    else :
        tests=np.append(tests, test_index)

X_train, X_test = dataX.iloc[trains,:], dataX.iloc[tests,:]
y_train, y_test = data['status'][trains], data['status'][tests]

# Set the parameters by cross-validation
tuned_parameters = [{'svm__kernel': ['rbf'], 'svm__gamma': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
                     'svm__C': [0.25, 0.5, 1, 2, 4, 8, 16, 32]},
                    {'svm__kernel': ['linear'], 'svm__C': [0.25, 0.5, 1, 2, 4, 8, 16, 32]}]

from sklearn.pipeline import Pipeline
pipeline = Pipeline([('scale',StandardScaler()), ('svm',SVC())])

#CROSS VALIDATION IN GRIDSEARCH BELOW IS NOT USING LEAVE SUBJECT OUT, IT USES THE STANDARD K-FOLD
#THAT BETTER BE FIXED (FOR THE SAME REASON WHY 91% WAS NOT REALISTIC ABOVE)
clf = GridSearchCV(pipeline, tuned_parameters, scoring='f1_macro', cv = N_train)
clf.fit(X_train, y_train)
print("Best parameters set found on development set:",clf.best_params_)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


N_subjects 32
Best parameters set found on development set: {'svm__C': 32, 'svm__kernel': 'linear'}
              precision    recall  f1-score   support

           0       0.11      0.17      0.13        12
           1       0.77      0.67      0.72        49

    accuracy                           0.57        61
   macro avg       0.44      0.42      0.43        61
weighted avg       0.64      0.57      0.60        61

