## Prepare

In [1]:
# importing packages
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, balanced_accuracy_score

In [2]:
# Data
# Location of the Parkinson's CSV, resolved relative to the current working
# directory. os.path.join inserts the separator of the host OS, so the path is
# valid on Windows, Linux and macOS (hard-coded "\\" separators only resolve on
# Windows). Note: despite its name, data_dir is the path of the CSV *file*.
print(os.getcwd())
data_dir = os.path.join(os.getcwd(), "Data set", "Parkinson disease.csv")
print(data_dir)

F:\Parkinson-Disease
F:\Parkinson-Disease\Data set\Parkinson disease.csv


In [3]:
# Load the comma-separated Parkinson's dataset into a DataFrame.
df_parkinson = pd.read_csv(data_dir, sep=",")

# Print dtypes and non-null counts for every column.
df_parkinson.info()

# Last expression of the cell: the column index is rendered as the cell output.
df_parkinson.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-null    float64
 1

Index(['name', 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
       'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
       'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
       'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'status', 'RPDE', 'DFA',
       'spread1', 'spread2', 'D2', 'PPE'],
      dtype='object')

## Data processing

## Split data

In [4]:
# Feature matrix: every column after the first one ("name"), minus the target.
X = df_parkinson.iloc[:, 1:].drop(columns="status")

# Target vector: the "status" column.
y = df_parkinson["status"]

print(X.shape, y.shape)

(195, 22) (195,)


In [5]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Training model 

## Logistic Regression

In [7]:
from sklearn.metrics import accuracy_score

In [8]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression (liblinear solver) on the unscaled training split.
LoR_model = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Predictions for both splits.
y_pred_train = LoR_model.predict(X_train)
y_pred_test = LoR_model.predict(X_test)

# Ground truth and predictions, each paired up in [train, test] order.
gt_array = [y_train, y_test]
pred_array = [y_pred_train, y_pred_test]

print('Model accuracy score : {:0.2%}'.format(accuracy_score(y_test, y_pred_test)))

Model accuracy score : 89.83%


In [9]:
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    roc_curve,
    auc,
)

# Score the test-set predictions currently held in y_pred_test.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
balanced_accuracy = balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
print("Accuracy:", accuracy)
print("Balanced accuracy:", balanced_accuracy)

# Per-class precision, recall and F1-score over the whole test set.
print('Classification report:\n', classification_report(y_test, y_pred_test))

Accuracy: 0.8983050847457628
Balanced accuracy: 0.8244147157190636
Classification report:
               precision    recall  f1-score   support

           0       0.82      0.69      0.75        13
           1       0.92      0.96      0.94        46

    accuracy                           0.90        59
   macro avg       0.87      0.82      0.84        59
weighted avg       0.89      0.90      0.90        59



### K-fold cross-validation

In [10]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn import datasets
from sklearn import model_selection

# Five shuffled folds; the fixed seed keeps the fold assignment reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# kf.split() yields (train indices, test indices) per fold; report their sizes.
for cnt, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')

# Accuracy of a fresh logistic regression on each fold, shown as percentages.
score1 = cross_val_score(
    LogisticRegression(solver='liblinear'),
    X,
    y,
    cv=kf,
    scoring="accuracy",
)
print(f'Scores for each fold are: {score1.round(4)*100}')
print(f'Average score: {score1.mean()*100:.2f}', "%")

Fold:1, Train set: 156, Test set:39
Fold:2, Train set: 156, Test set:39
Fold:3, Train set: 156, Test set:39
Fold:4, Train set: 156, Test set:39
Fold:5, Train set: 156, Test set:39
Scores for each fold are: [87.18 82.05 89.74 84.62 82.05]
Average score: 85.13 %


### Standard scaler

In [11]:
# Rebuild the features/target from the raw frame and repeat the 70/30 split
# (same seed as before) so this section starts from a known state.
X = df_parkinson.iloc[:, 1:].drop(columns="status")
y = df_parkinson["status"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [12]:
# Standardize the features, then refit logistic regression on the scaled data.
scaler = StandardScaler()

# Fit the scaler on the training split only, so no test-set statistics leak
# into the transformation.
X_train_scaled = scaler.fit_transform(X_train)

# Apply the statistics learned from the training split to the test split.
X_test_scaled = scaler.transform(X_test)

# Initialize and train the logistic regression model on the scaled features.
LoR_scaled_model = LogisticRegression(solver='liblinear')
LoR_scaled_model.fit(X_train_scaled, y_train)

# Backward-compatible alias: this estimator was previously bound to the
# misleading name `svm_model` although it is a LogisticRegression, not an SVM.
svm_model = LoR_scaled_model

# Evaluate the model: score() returns the mean accuracy on the scaled test split.
accuracy = LoR_scaled_model.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.864406779661017


# Naive Bayes

In [13]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the unscaled training split.
NB_model = GaussianNB().fit(X_train, y_train)

# Predictions for both splits.
y_pred_train = NB_model.predict(X_train)
y_pred_test = NB_model.predict(X_test)

# Ground truth and predictions, each paired up in [train, test] order.
gt_array = [y_train, y_test]
pred_array = [y_pred_train, y_pred_test]

print('Model accuracy score : {:0.2%}'.format(accuracy_score(y_test, y_pred_test)))

Model accuracy score : 67.80%


In [None]:
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    roc_curve,
    auc,
)

# Score the test-set predictions currently held in y_pred_test.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
balanced_accuracy = balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
print("Accuracy:", accuracy)
print("Balanced accuracy:", balanced_accuracy)

# Per-class precision, recall and F1-score over the whole test set.
print('Classification report:\n', classification_report(y_test, y_pred_test))

In [14]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn import datasets
from sklearn import model_selection
from sklearn.naive_bayes import GaussianNB

# Five shuffled folds; the fixed seed keeps the fold assignment reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# kf.split() yields (train indices, test indices) per fold; report their sizes.
for cnt, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')

# Accuracy of a fresh Gaussian Naive Bayes model on each fold, as percentages.
score1 = cross_val_score(
    GaussianNB(),
    X,
    y,
    cv=kf,
    scoring="accuracy",
)
print(f'Scores for each fold are: {score1.round(4)*100}')
print(f'Average score: {score1.mean()*100:.2f}', "%")

Fold:1, Train set: 156, Test set:39
Fold:2, Train set: 156, Test set:39
Fold:3, Train set: 156, Test set:39
Fold:4, Train set: 156, Test set:39
Fold:5, Train set: 156, Test set:39
Scores for each fold are: [69.23 74.36 71.79 64.1  69.23]
Average score: 69.74 %


In [14]:

# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Refit Gaussian Naive Bayes on the standardized features.
# Note: this rebinds NB_model, replacing the model trained on unscaled data.
NB_model = GaussianNB().fit(X_train_scaled, y_train)

# Report the test-set accuracy as a percentage.
accuracy = NB_model.score(X_test_scaled, y_test)
print("Accuracy:", accuracy*100)

Accuracy: 67.79661016949152
