# Parkinson's Disease Prediction

## Importing the dependencies

In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

## Data Collection and Analysis

In [3]:
parkinsons_data = pd.read_csv('parkinsons.csv')

In [4]:
parkinsons_data.shape

(195, 24)

In [5]:
parkinsons_data.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [6]:
parkinsons_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-null    float64
 1

In [7]:
parkinsons_data.isnull().sum()

name                0
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
status              0
RPDE                0
DFA                 0
spread1             0
spread2             0
D2                  0
PPE                 0
dtype: int64

In [8]:
parkinsons_data.describe()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
count,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,...,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0
mean,154.228641,197.104918,116.324631,0.00622,4.4e-05,0.003306,0.003446,0.00992,0.029709,0.282251,...,0.046993,0.024847,21.885974,0.753846,0.498536,0.718099,-5.684397,0.22651,2.381826,0.206552
std,41.390065,91.491548,43.521413,0.004848,3.5e-05,0.002968,0.002759,0.008903,0.018857,0.194877,...,0.030459,0.040418,4.425764,0.431878,0.103942,0.055336,1.090208,0.083406,0.382799,0.090119
min,88.333,102.145,65.476,0.00168,7e-06,0.00068,0.00092,0.00204,0.00954,0.085,...,0.01364,0.00065,8.441,0.0,0.25657,0.574282,-7.964984,0.006274,1.423287,0.044539
25%,117.572,134.8625,84.291,0.00346,2e-05,0.00166,0.00186,0.004985,0.016505,0.1485,...,0.024735,0.005925,19.198,1.0,0.421306,0.674758,-6.450096,0.174351,2.099125,0.137451
50%,148.79,175.829,104.315,0.00494,3e-05,0.0025,0.00269,0.00749,0.02297,0.221,...,0.03836,0.01166,22.085,1.0,0.495954,0.722254,-5.720868,0.218885,2.361532,0.194052
75%,182.769,224.2055,140.0185,0.007365,6e-05,0.003835,0.003955,0.011505,0.037885,0.35,...,0.060795,0.02564,25.0755,1.0,0.587562,0.761881,-5.046192,0.279234,2.636456,0.25298
max,260.105,592.03,239.17,0.03316,0.00026,0.02144,0.01958,0.06433,0.11908,1.302,...,0.16942,0.31482,33.047,1.0,0.685151,0.825288,-2.434031,0.450493,3.671155,0.527367


In [9]:
# distribution of target variable
parkinsons_data['status'].value_counts()

1    147
0     48
Name: status, dtype: int64

In [10]:
parkinsons_data.groupby('status').mean()

Unnamed: 0_level_0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,181.937771,223.63675,145.207292,0.003866,2.3e-05,0.001925,0.002056,0.005776,0.017615,0.162958,...,0.013305,0.028511,0.011483,24.67875,0.442552,0.695716,-6.759264,0.160292,2.154491,0.123017
1,145.180762,188.441463,106.893558,0.006989,5.1e-05,0.003757,0.0039,0.011273,0.033658,0.321204,...,0.0276,0.053027,0.029211,20.974048,0.516816,0.725408,-5.33342,0.248133,2.456058,0.233828


In [11]:
# creating a more class-balanced dataset
parkinsons_positive = parkinsons_data[parkinsons_data['status'] == 1]
parkinsons_negative = parkinsons_data[parkinsons_data['status'] == 0]

In [12]:
print(parkinsons_positive.shape)
print(parkinsons_negative.shape)

(147, 24)
(48, 24)


In [13]:
parkinsons_positive.describe()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
count,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,...,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0,147.0
mean,145.180762,188.441463,106.893558,0.006989,5.1e-05,0.003757,0.0039,0.011273,0.033658,0.321204,...,0.053027,0.029211,20.974048,1.0,0.516816,0.725408,-5.33342,0.248133,2.456058,0.233828
std,32.34805,88.33918,32.274358,0.00524,3.7e-05,0.003241,0.002998,0.009724,0.01997,0.207798,...,0.032391,0.044447,4.339143,0.0,0.101254,0.054786,0.970792,0.077809,0.375742,0.084271
min,88.333,102.145,65.476,0.00168,1e-05,0.00068,0.00092,0.00204,0.01022,0.09,...,0.01364,0.00231,8.441,1.0,0.263654,0.574282,-7.120925,0.063412,1.765957,0.093193
25%,117.572,133.7765,80.8755,0.004005,3e-05,0.00203,0.00219,0.006085,0.018295,0.168,...,0.0274,0.008445,18.782,1.0,0.439064,0.685569,-6.0383,0.199507,2.180933,0.170103
50%,145.174,163.335,99.77,0.00544,4e-05,0.00284,0.00314,0.00853,0.02838,0.263,...,0.04451,0.01658,21.414,1.0,0.530529,0.726652,-5.44004,0.240875,2.439597,0.222716
75%,170.071,207.1605,129.24,0.00767,6e-05,0.0041,0.00436,0.0123,0.042525,0.3945,...,0.068455,0.02796,24.1645,1.0,0.604573,0.764868,-4.664067,0.30366,2.668479,0.274397
max,223.361,588.518,199.02,0.03316,0.00026,0.02144,0.01958,0.06433,0.11908,1.302,...,0.16942,0.31482,29.928,1.0,0.685151,0.825288,-2.434031,0.450493,3.671155,0.527367


In [14]:
parkinsons_negative.describe()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
count,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,...,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0
mean,181.937771,223.63675,145.207292,0.003866,2.3e-05,0.001925,0.002056,0.005776,0.017615,0.162958,...,0.028511,0.011483,24.67875,0.0,0.442552,0.695716,-6.759264,0.160292,2.154491,0.123017
std,52.731067,96.727067,58.75707,0.002055,1.5e-05,0.001066,0.000943,0.003199,0.005544,0.057822,...,0.010368,0.019088,3.43454,0.0,0.092199,0.051346,0.642782,0.062982,0.310269,0.04482
min,110.739,113.597,74.287,0.00178,7e-06,0.00092,0.00106,0.00276,0.00954,0.085,...,0.01403,0.00065,17.883,0.0,0.25657,0.62671,-7.964984,0.006274,1.423287,0.044539
25%,120.9475,139.41325,98.24375,0.002655,1e-05,0.001332,0.00148,0.003998,0.014475,0.129,...,0.02206,0.004188,22.99325,0.0,0.372126,0.654291,-7.257665,0.120623,1.974217,0.094658
50%,198.996,231.1615,113.9385,0.003355,2.5e-05,0.001625,0.001775,0.004875,0.016705,0.154,...,0.02633,0.004825,24.997,0.0,0.435368,0.682527,-6.826448,0.167356,2.12951,0.115118
75%,229.077,251.23925,199.183,0.00453,3e-05,0.001908,0.002228,0.005725,0.02021,0.18925,...,0.03454,0.009213,26.13925,0.0,0.507748,0.742284,-6.350146,0.193766,2.339487,0.147761
max,260.105,592.03,239.17,0.0136,8e-05,0.00624,0.00564,0.01873,0.04087,0.405,...,0.07008,0.10715,33.047,0.0,0.663842,0.785714,-5.198864,0.291954,2.88245,0.252404


### Undersampling

In [16]:
parkinsons_positive_sample = parkinsons_positive.sample(n=48)

### Concating the dataframes

In [17]:
new_dataset = pd.concat([parkinsons_positive_sample, parkinsons_negative], axis=0)

In [18]:
new_dataset

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
39,phon_R01_S08_4,187.733,202.324,173.015,0.00316,0.00002,0.00168,0.00182,0.00504,0.01663,...,0.02488,0.00265,26.310,1,0.396793,0.758324,-6.006647,0.266892,2.382544,0.160691
143,phon_R01_S34_4,202.805,231.508,86.232,0.00370,0.00002,0.00189,0.00211,0.00568,0.01997,...,0.03350,0.02010,18.687,1,0.536102,0.632631,-5.898673,0.213353,2.470746,0.189032
8,phon_R01_S02_3,95.730,132.068,91.754,0.00551,0.00006,0.00293,0.00332,0.00880,0.02093,...,0.03218,0.01070,21.812,1,0.615551,0.773587,-5.498678,0.327769,2.322511,0.231571
38,phon_R01_S08_3,180.198,201.249,175.456,0.00284,0.00002,0.00153,0.00166,0.00459,0.01444,...,0.02177,0.00231,26.738,1,0.403884,0.766209,-6.452058,0.212294,2.269398,0.141929
178,phon_R01_S44_2,148.790,158.359,138.990,0.00309,0.00002,0.00152,0.00186,0.00456,0.01574,...,0.02518,0.00488,24.412,1,0.402591,0.762508,-6.311987,0.182459,2.251553,0.160306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,phon_R01_S50_2,174.188,230.978,94.261,0.00459,0.00003,0.00263,0.00259,0.00790,0.04087,...,0.07008,0.02764,19.517,0,0.448439,0.657899,-6.538586,0.121952,2.657476,0.133050
191,phon_R01_S50_3,209.516,253.017,89.488,0.00564,0.00003,0.00331,0.00292,0.00994,0.02751,...,0.04812,0.01810,19.147,0,0.431674,0.683244,-6.195325,0.129303,2.784312,0.168895
192,phon_R01_S50_4,174.688,240.005,74.287,0.01360,0.00008,0.00624,0.00564,0.01873,0.02308,...,0.03804,0.10715,17.883,0,0.407567,0.655683,-6.787197,0.158453,2.679772,0.131728
193,phon_R01_S50_5,198.764,396.961,74.904,0.00740,0.00004,0.00370,0.00390,0.01109,0.02296,...,0.03794,0.07223,19.020,0,0.451221,0.643956,-6.744577,0.207454,2.138608,0.123306


In [19]:
new_dataset.shape

(96, 24)

In [20]:
new_dataset['status'].value_counts()

1    48
0    48
Name: status, dtype: int64

The dataset is perfectly class-balanced, but the number of data points in it may be too small for the model to learn some meaningful relationships. Anyways, let's see!

In [22]:
new_dataset.groupby('status').mean()

Unnamed: 0_level_0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,181.937771,223.63675,145.207292,0.003866,2.3e-05,0.001925,0.002056,0.005776,0.017615,0.162958,...,0.013305,0.028511,0.011483,24.67875,0.442552,0.695716,-6.759264,0.160292,2.154491,0.123017
1,143.916083,164.843042,106.699146,0.005961,4.5e-05,0.003152,0.00339,0.009456,0.030195,0.281583,...,0.02381,0.048139,0.021152,21.532271,0.51085,0.731922,-5.386497,0.252616,2.460372,0.227624


In [23]:
new_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96 entries, 39 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              96 non-null     object 
 1   MDVP:Fo(Hz)       96 non-null     float64
 2   MDVP:Fhi(Hz)      96 non-null     float64
 3   MDVP:Flo(Hz)      96 non-null     float64
 4   MDVP:Jitter(%)    96 non-null     float64
 5   MDVP:Jitter(Abs)  96 non-null     float64
 6   MDVP:RAP          96 non-null     float64
 7   MDVP:PPQ          96 non-null     float64
 8   Jitter:DDP        96 non-null     float64
 9   MDVP:Shimmer      96 non-null     float64
 10  MDVP:Shimmer(dB)  96 non-null     float64
 11  Shimmer:APQ3      96 non-null     float64
 12  Shimmer:APQ5      96 non-null     float64
 13  MDVP:APQ          96 non-null     float64
 14  Shimmer:DDA       96 non-null     float64
 15  NHR               96 non-null     float64
 16  HNR               96 non-null     float64
 1

## Separating the features and the label

In [24]:
X = new_dataset.drop(columns=['name', 'status'])
y = new_dataset['status']

## Splitting the dataset into training set and test test

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=0)

In [34]:
X_train.shape

(81, 22)

## Feature Scaling 

In [26]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Training the Model

In [27]:
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(kernel='linear')

## Model Evaluation

### Evaluation on Training Set

In [30]:
y_pred_train = model.predict(X_train)
print('Accuracy score = {:.2f}%'.format(accuracy_score(y_train, y_pred_train)  * 100))

Accuracy score = 90.12%


In [31]:
print(confusion_matrix(y_train, y_pred_train))

[[38  2]
 [ 6 35]]


### Evaluation on Test Set

In [32]:
y_pred_test = model.predict(X_test)
print('Accuracy score = {:.2f}%'.format(accuracy_score(y_test, y_pred_test) * 100))

Accuracy score = 86.67%


In [33]:
print(confusion_matrix(y_test, y_pred_test))

[[7 1]
 [1 6]]


Yes!!! Balancing the classes really improved the accuracy score. 

And yes...this model was trained on 81 labeled examples, while the other one on 156 labeled examples