In [13]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from  google.colab  import  drive
from sklearn.model_selection import LeaveOneOut

drive.mount('/content/gdrive')
mydirectory = 'gdrive/My Drive/Colab Notebooks/PD/'

#Parkinson's dataset is downloaded from UCI ML repository: https://archive.ics.uci.edu/ml/datasets/parkinsons
fname = mydirectory + "parkinsons.data"       #195 rows/recordings, there are only 32 human subjects (8 of which are PD)
data = pd.read_csv(fname)                     #6 recordings per subject (7 recordings for only 3 of the subjects)
dataX = data.drop(columns=['status','name'])  #We have 6records*29subj + 7records*3subjects = 195


logo = LeaveOneOut()

logo.get_n_splits(dataX)
#print the first few rows
print(data)
#print(logo)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
               name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  ...   spread2        D2       PPE
0    phon_R01_S01_1      119.992       157.302  ...  0.266482  2.301442  0.284654
1    phon_R01_S01_2      122.400       148.650  ...  0.335590  2.486855  0.368674
2    phon_R01_S01_3      116.682       131.111  ...  0.311173  2.342259  0.332634
3    phon_R01_S01_4      116.676       137.871  ...  0.334147  2.405554  0.368975
4    phon_R01_S01_5      116.014       141.781  ...  0.234513  2.332180  0.410335
..              ...          ...           ...  ...       ...       ...       ...
190  phon_R01_S50_2      174.188       230.978  ...  0.121952  2.657476  0.133050
191  phon_R01_S50_3      209.516       253.017  ...  0.129303  2.784312  0.168895
192  phon_R01_S50_4      174.688       240.005  ...  0.158453  2.679772  0.131728
193  phon_R01_S50_5      198.764       396.961  .

In [14]:
#FIRST ATTEMPT
#NOTE THAT THIS IS NOT A GOOD FIT WITH THE DATASET, WE NEED LEAVE SUBJECT OUT INSTEAD

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(dataX, data['status'], test_size=0.5, random_state=0)       #prev dataX

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)   #ranges, mean, stddev of the features in the training set
X_test = scaler.transform(X_test) 
#especially if you are going to use rbf-svm, this preprocessing step is very important
#because SVM will be calculateing the distance of the test example to each one of the training examples
#you dont want to have one feature ranging 100 to 200 and the other one (which is perhaps more important) ranging from 0 to 1
#Refer to the discussion in the next code cell about what normalization is.



#Now, we start training-optimizing-testing the classifier
#Find the optimal parameters by cross-validation, well, optimal only if you do a proper cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-1, 1, 10, 100],   #libsvm showed that each point has some influence around itself in the space
                     #gamma is the inverse of the sigma of the "sphere"  (the mean of that sphere is on that particular training example)
                     'C': [0.25, 0.5, 1, 2, 4, 8, 16, 32]},
                    {'kernel': ['linear'], 'C': [0.25, 0.5, 1, 2, 4, 8, 16, 32]}]

clf = GridSearchCV(SVC(), tuned_parameters, scoring='f1_macro')  #accuracy can be bad for this dataset because it is imbalanced, we have a lot more Parkinson's subjects.
#Here is the summary of the lecture on why the dataset is imbalanced, thus forcing us to use average of f1 scores of the classes. 
#Keep in mind the study is for telediagnosis of Parkinson's (a good story for COVID-19 era). 
#A good control group is needed, unfortunately we have only 8 healthy subjects.
#In the video I explained why that might be in more detail, the main idea is that finding a good control group (vs Parkinson's) is not easy. 
#You cannot use students as a control group because normally they are not going to be the calling the telediagnosis service to be tested for Parkinsonism. 
#It is important to match demographic groups etc.


clf.fit(X_train, y_train)
print("Best parameters set found on development set:",clf.best_params_)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

#WOW, we get 91% accuracy
#THAT 91% IS NOT REALISTIC THOUGH!  I EXPLAINED WHY IN THE VIDEO.

Best parameters set found on development set: {'C': 2, 'gamma': 0.1, 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.94      0.67      0.78        24
           1       0.90      0.99      0.94        74

    accuracy                           0.91        98
   macro avg       0.92      0.83      0.86        98
weighted avg       0.91      0.91      0.90        98



In [15]:
#A LITTLE DISCUSSION ABOUT WHAT NORMALIZATION IS:
#zero centered and unit standard deviation

X_normalized = scaler.fit_transform(dataX)   #ranges, mean, stddev etc of the features in the training set       dataX
print('original vs normalized values for the first feature, first 10 rows are shown:')
print(np.concatenate((X_normalized[:10,[0]],dataX.iloc[:10,[0]]), axis=1)) #dataX
print('\nX_normalized\n')
print(X_normalized[:10,0])
print('\ndataX.iloc\n')
print(dataX.iloc[:10,0])

original vs normalized values for the first feature, first 10 rows are shown:
[[ -0.82929965 119.992     ]
 [ -0.77097169 122.4       ]
 [ -0.90947638 116.682     ]
 [ -0.90962172 116.676     ]
 [ -0.92565706 116.014     ]
 [ -0.81573501 120.552     ]
 [ -0.82263845 120.267     ]
 [ -1.13595747 107.332     ]
 [ -1.4169878   95.73      ]
 [ -1.43331382  95.056     ]]

X_normalized

[-0.82929965 -0.77097169 -0.90947638 -0.90962172 -0.92565706 -0.81573501
 -0.82263845 -1.13595747 -1.4169878  -1.43331382]

dataX.iloc

0    119.992
1    122.400
2    116.682
3    116.676
4    116.014
5    120.552
6    120.267
7    107.332
8     95.730
9     95.056
Name: MDVP:Fo(Hz), dtype: float64


In [30]:
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

print('N_subjects', logo.get_n_splits(dataX, data['status'],data['name'].astype(str).str[:-2]))

groups = dataX
logo = LeaveOneGroupOut()
#logo = LeaveOneOut()
y=dataX.iloc

logo.get_n_splits(dataX, y, groups)

#SPLIT INDIVIDUALS
trains = []
tests = []
N_train = 0

def LeaveOneGroupOut():
  for train_index, test_index in logo.split(dataX, data['status'],data['name'].astype(str).str[:-2]):
      if (np.random.random() > 0.25) :
          trains=np.append(trains, test_index)
          N_train=N_train+1
      else :
          tests=np.append(tests, test_index)

X_train, X_test = dataX.iloc[trains,:], dataX.iloc[tests,:]
y_train, y_test = data['status'][trains], data['status'][tests]

# Set the parameters by cross-validation
tuned_parameters = [{'svm__kernel': ['rbf'], 'svm__gamma': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
                     'svm__C': [0.25, 0.5, 1, 2, 4, 8, 16, 32]},
                    {'svm__kernel': ['linear'], 'svm__C': [0.25, 0.5, 1, 2, 4, 8, 16, 32]}]

print(trains)



#from sklearn.pipeline import Pipeline
#pipeline = Pipeline([('scale',StandardScaler()), ('svm',SVC())])

#CROSS VALIDATION IN GRIDSEARCH BELOW IS NOT USING LEAVE SUBJECT OUT, IT USES THE STANDARD K-FOLD
#THAT BETTER BE FIXED (FOR THE SAME REASON WHY 91% WAS NOT REALISTIC ABOVE)
#clf = GridSearchCV(pipeline, tuned_parameters, scoring='f1_macro', cv = N_train)
#clf.fit(X_train, y_train)
#print("Best parameters set found on development set:",clf.best_params_)
#y_pred = clf.predict(X_test)
#print(classification_report(y_test, y_pred))


N_subjects 32
[]
[]
