In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# loading the data from the text file to pandas DataFrame
data = pd.read_csv('Parkinsons disease raw data.txt', header=None)

In [3]:
# printing the first 5 rows of the dataframe
data.head()

Unnamed: 0,0
0,phon_R01_S01_1
1,119.992
2,157.302
3,74.997
4,0.00784


In [4]:
# shape of the dataframe
data.shape

(4680, 1)

In [5]:
# getting the features and target for the first data point (first 24 rows)
data_separate = data.head(24)

In [6]:
type(data_separate)

pandas.core.frame.DataFrame

In [7]:
print(data_separate)

                       0
0         phon_R01_S01_1
1                119.992
2                157.302
3                 74.997
4                0.00784
5                  7e-05
6                 0.0037
7                0.00554
8   0.011090000000000001
9                0.04374
10                 0.426
11               0.02182
12                0.0313
13  0.029710000000000004
14               0.06545
15  0.022109999999999998
16                21.033
17                     1
18              0.414783
19              0.815285
20             -4.813031
21              0.266482
22    2.3014419999999998
23              0.284654


In [8]:
# transpose the array
transposed_data = data_separate.T

In [9]:
print(transposed_data)

               0        1        2       3        4      5       6        7   \
0  phon_R01_S01_1  119.992  157.302  74.997  0.00784  7e-05  0.0037  0.00554   

                     8        9   ...       14                    15      16  \
0  0.011090000000000001  0.04374  ...  0.06545  0.022109999999999998  21.033   

  17        18        19         20        21                  22        23  
0  1  0.414783  0.815285  -4.813031  0.266482  2.3014419999999998  0.284654  

[1 rows x 24 columns]


In [10]:
# removing the first data point from the dataframe
data = data.iloc[24:]

In [11]:
# 2nd data point
data.head(24)

Unnamed: 0,0
24,phon_R01_S01_2
25,122.4
26,148.65
27,113.819
28,0.00968
29,8e-05
30,0.00465
31,0.006959999999999999
32,0.013940000000000001
33,0.06134


In [12]:
# Last Data Point
data.tail(24)

Unnamed: 0,0
4656,phon_R01_S50_6
4657,214.289
4658,260.277
4659,77.973
4660,0.0056700000000000006
4661,2.9999999999999997e-05
4662,0.00295
4663,0.00317
4664,0.00885
4665,0.01884


In [13]:
transposed_data.shape

(1, 24)

In [14]:
column_header = list(range(0, 24))

In [15]:
print(column_header)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]


In [17]:
# creating a for loop to separate each data point and creating a new structured dataframe
#we have 4680 row and we convert 24 row into first row of data. so 4680/24 = 195 rows we need

j = 24

for i in range(194):

  data_separate = data.head(j)
  df = data_separate.T

  df = df.set_axis(column_header, axis=1)

  frames = [transposed_data, df]
  transposed_data = pd.concat(frames, axis=0)

  data = data.iloc[24:]



In [18]:
transposed_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.0221099999999999,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
0,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.0069599999999999,0.01394,0.06134,...,0.0940299999999999,0.01929,19.085,1,0.458359,0.8195209999999999,-4.075192,0.33559,2.486855,0.368674
0,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.0054399999999999,0.0078099999999999,0.01633,0.0523299999999999,...,0.0827,0.01309,20.651,1,0.4298949999999999,0.8252879999999999,-4.443179,0.311173,2.342259,0.332634
0,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.4349689999999999,0.819235,-4.117501,0.3341469999999999,2.405554,0.368975
0,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.0176699999999999,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [19]:
transposed_data.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,phon_R01_S50_2,174.188,230.978,94.261,0.0045899999999999,3e-05,0.00263,0.00259,0.0079,0.04087,...,0.0700799999999999,0.0276399999999999,19.517,0,0.448439,0.657899,-6.5385860000000005,0.121952,2.657476,0.13305
0,phon_R01_S50_3,209.516,253.017,89.488,0.00564,3e-05,0.00331,0.00292,0.00994,0.02751,...,0.0481199999999999,0.0181,19.147,0,0.431674,0.683244,-6.195325,0.129303,2.784312,0.168895
0,phon_R01_S50_4,174.688,240.005,74.287,0.0136,8e-05,0.00624,0.00564,0.01873,0.02308,...,0.03804,0.10715,17.883,0,0.407567,0.655683,-6.787197,0.158453,2.6797720000000003,0.1317279999999999
0,phon_R01_S50_5,198.764,396.961,74.904,0.0074,4e-05,0.0037,0.0039,0.01109,0.0229599999999999,...,0.0379399999999999,0.0722299999999999,19.02,0,0.451221,0.6439560000000001,-6.7445770000000005,0.207454,2.138608,0.123306
0,phon_R01_S50_6,214.289,260.277,77.973,0.00567,3e-05,0.00295,0.00317,0.00885,0.01884,...,0.03078,0.04398,21.209,0,0.4628029999999999,0.664357,-5.724056,0.190667,2.555477,0.148569


In [20]:
transposed_data.shape

(195, 24)

In [21]:
# reset the index of the dataframe
parkinsons_data = transposed_data.reset_index()

In [22]:
parkinsons_data.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,14,15,16,17,18,19,20,21,22,23
0,0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,...,0.06545,0.0221099999999999,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,0,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.0069599999999999,0.01394,...,0.0940299999999999,0.01929,19.085,1,0.458359,0.8195209999999999,-4.075192,0.33559,2.486855,0.368674
2,0,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.0054399999999999,0.0078099999999999,0.01633,...,0.0827,0.01309,20.651,1,0.4298949999999999,0.8252879999999999,-4.443179,0.311173,2.342259,0.332634
3,0,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,...,0.08771,0.01353,20.644,1,0.4349689999999999,0.819235,-4.117501,0.3341469999999999,2.405554,0.368975
4,0,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,...,0.1047,0.0176699999999999,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [23]:
# dropping the initial index column
parkinsons_data = parkinsons_data.drop(columns='index',axis=1)

In [24]:
parkinsons_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.0221099999999999,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.0069599999999999,0.01394,0.06134,...,0.0940299999999999,0.01929,19.085,1,0.458359,0.8195209999999999,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.0054399999999999,0.0078099999999999,0.01633,0.0523299999999999,...,0.0827,0.01309,20.651,1,0.4298949999999999,0.8252879999999999,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.4349689999999999,0.819235,-4.117501,0.3341469999999999,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.0176699999999999,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [25]:
# number of rows and columns in the dataframe
parkinsons_data.shape

(195, 24)

In [26]:
# checking for missing values in each column
parkinsons_data.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
dtype: int64

In [27]:
# distribution of target Variable
parkinsons_data[17].value_counts()

17
1    147
0     48
Name: count, dtype: int64

1  --> Parkinson's Positive

0 --> Healthy


Separating the features & Target

In [28]:
X = parkinsons_data.drop(columns=[0,17], axis=1)
Y = parkinsons_data[17]

In [29]:
print(X)

          1        2        3                      4                       5   \
0    119.992  157.302   74.997                0.00784                   7e-05   
1      122.4   148.65  113.819                0.00968                   8e-05   
2    116.682  131.111  111.555                 0.0105                   9e-05   
3    116.676  137.871  111.366                0.00997                   9e-05   
4    116.014  141.781  110.655                0.01284                 0.00011   
..       ...      ...      ...                    ...                     ...   
190  174.188  230.978   94.261  0.0045899999999999995  2.9999999999999997e-05   
191  209.516  253.017   89.488                0.00564  2.9999999999999997e-05   
192  174.688  240.005   74.287                 0.0136                   8e-05   
193  198.764  396.961   74.904                 0.0074                   4e-05   
194  214.289  260.277   77.973  0.0056700000000000006  2.9999999999999997e-05   

                        6  

In [30]:
print(Y)

0      1
1      1
2      1
3      1
4      1
      ..
190    0
191    0
192    0
193    0
194    0
Name: 17, Length: 195, dtype: object


In [31]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [32]:
print(X.shape, X_train.shape, X_test.shape)

(195, 22) (156, 22) (39, 22)


In [33]:
scaler = StandardScaler()

In [34]:
scaler.fit(X_train)

In [35]:
X_train = scaler.transform(X_train)

X_test = scaler.transform(X_test)

In [36]:
print(X_train)

[[ 0.63239631 -0.02731081 -0.87985049 ... -0.97586547 -0.55160318
   0.07769494]
 [-1.05512719 -0.83337041 -0.9284778  ...  0.3981808  -0.61014073
   0.39291782]
 [ 0.02996187 -0.29531068 -1.12211107 ... -0.43937044 -0.62849605
  -0.50948408]
 ...
 [-0.9096785  -0.6637302  -0.160638   ...  1.22001022 -0.47404629
  -0.2159482 ]
 [-0.35977689  0.19731822 -0.79063679 ... -0.17896029 -0.47272835
   0.28181221]
 [ 1.01957066  0.19922317 -0.61914972 ... -0.716232    1.23632066
  -0.05829386]]


Model Training

In [40]:
models = [LogisticRegression(), SVC(kernel='linear'), SVC(kernel='poly'), SVC(kernel='rbf'), SVC(kernel='sigmoid'), KNeighborsClassifier(), RandomForestClassifier(random_state=42)]

In [43]:
def compare_models():

  for model in models:
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, y_pred)
    print('Accuracy score for the ', model,' = ', accuracy)



In [44]:
compare_models()

Accuracy score for the  LogisticRegression()  =  0.8205128205128205
Accuracy score for the  SVC(kernel='linear')  =  0.8717948717948718
Accuracy score for the  SVC(kernel='poly')  =  0.8974358974358975
Accuracy score for the  SVC()  =  0.8974358974358975
Accuracy score for the  SVC(kernel='sigmoid')  =  0.7435897435897436
Accuracy score for the  KNeighborsClassifier()  =  0.7692307692307693
Accuracy score for the  RandomForestClassifier(random_state=42)  =  0.7948717948717948



# Support Vector Machine Classifier (kernal = poly, rbf ) has high accuracy in this case. It can be due to the high dimensionality of the dataset