<a href="https://colab.research.google.com/github/YashSingh23/Detecting-Parkinson-s-Disease-using-Machine-Learning/blob/main/Detecting_Parkinson%E2%80%99s_Disease_using_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
import os # Used for file system operations, though less critical if directly uploading
import sys # Used for system-specific parameters and functions, not directly used in this script for core logic

In [4]:
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
# --- Step 1: Data Loading ---
# DataFlair - Read the data
# Make sure 'parkinsons.data' is uploaded to your Colab session or downloaded using Kaggle API
# If you uploaded it, it will be in the /content/ directory by default.
try:
    df = pd.read_csv('parkinsons.data')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'parkinsons.data' not found. Please upload the file or use the Kaggle API to download it.")
    print("You can upload it via the 'Files' tab on the left sidebar in Colab.")
    # Exit or handle the error appropriately if the file is crucial
    sys.exit(1) # Exit the script if the file is not found

# Display the first 5 records to verify
print("\nFirst 5 records of the dataset:")
print(df.head())


Dataset loaded successfully.

First 5 records of the dataset:
             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1  phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2  phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3  phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4  phon_R01_S01_5      116.014       141.781       110.655         0.01284   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0           0.00007   0.00370   0.00554     0.01109       0.04374  ...   
1           0.00008   0.00465   0.00696     0.01394       0.06134  ...   
2           0.00009   0.00544   0.00781     0.01633       0.05233  ...   
3           0.00009   0.00502   0.00698     0.01505       0.05492  ...   
4           0.00011   0.00655   0.00908     0.01966       0.06425  ...   

   Shimmer:DDA      NHR 

In [8]:
# --- Step 2: Feature and Label Extraction ---
# DataFlair - Get the features and labels
# Features are all columns except 'status', labels are the 'status' column
# The original instruction says .values[:,1:] which implies skipping the first column (name)
features = df.loc[:, df.columns != 'status'].values[:, 1:]
labels = df.loc[:, 'status'].values

print("\nFeatures shape:", features.shape)
print("Labels shape:", labels.shape)


Features shape: (195, 22)
Labels shape: (195,)


In [10]:
# --- Step 3: Label Distribution Check ---
# DataFlair - Get the count of each label (0 and 1) in labels
count_ones = labels[labels == 1].shape[0]
count_zeros = labels[labels == 0].shape[0]
print(f"\nNumber of samples with Parkinson's (status=1): {count_ones}")
print(f"Number of samples without Parkinson's (status=0): {count_zeros}")


Number of samples with Parkinson's (status=1): 147
Number of samples without Parkinson's (status=0): 48


In [12]:
# --- Step 4: Feature Scaling ---
# DataFlair - Scale the features to between -1 and 1
# Initialize MinMaxScaler to scale features between -1 and 1
scaler = MinMaxScaler((-1, 1))
# Apply the scaler to the features
x = scaler.fit_transform(features)
# Labels (y) do not need scaling
y = labels

print("\nFeatures after scaling (first 5 rows):")
print(x[:5])


Features after scaling (first 5 rows):
[[-0.63138346 -0.77481654 -0.89037042 -0.60864041 -0.50197628 -0.70905588
  -0.50482315 -0.70942366 -0.37557057 -0.43960559 -0.33474576 -0.30529172
  -0.65510376 -0.33483117 -0.86338606  0.02349021 -0.26168916  0.92029673
   0.13975042  0.17153026 -0.21867743 -0.0053808 ]
 [-0.6033463  -0.81013911 -0.4433544  -0.49174079 -0.4229249  -0.61753372
  -0.35262594 -0.6179162  -0.05422677 -0.11092851  0.03197227  0.07137042
  -0.4411517   0.03209655 -0.88133813 -0.13484516 -0.05833903  0.95404891
   0.40655399  0.48267409 -0.05370956  0.34265204]
 [-0.66992292 -0.88174367 -0.46942324 -0.43964422 -0.34387352 -0.54142582
  -0.26152197 -0.54117836 -0.21873288 -0.34757601 -0.11325116 -0.10773406
  -0.56030324 -0.113365   -0.92080721 -0.00755913 -0.19116806  1.
   0.2734894   0.37274182 -0.18236124  0.19336492]
 [-0.66999278 -0.85414536 -0.47159948 -0.47331639 -0.34387352 -0.58188825
  -0.35048232 -0.58227645 -0.17144422 -0.29005752 -0.04892142 -0.06784261
 

In [14]:
# --- Step 5: Dataset Splitting ---
# DataFlair - Split the dataset into training and testing sets
# 80% for training, 20% for testing
# random_state ensures reproducibility of the split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)

print(f"\nTraining features shape: {x_train.shape}")
print(f"Testing features shape: {x_test.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Testing labels shape: {y_test.shape}")


Training features shape: (156, 22)
Testing features shape: (39, 22)
Training labels shape: (156,)
Testing labels shape: (39,)


In [16]:
# --- Step 6: Model Training ---
# DataFlair - Train the model
# Initialize the XGBClassifier model
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss') # Suppress warning by setting use_label_encoder and eval_metric
# Train the model using the training data
model.fit(x_train, y_train)

print("\nXGBoost model training complete.")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost model training complete.


In [17]:
# --- Step 7: Prediction and Accuracy Calculation ---
# DataFlair - Calculate the accuracy
# Make predictions on the test set
y_pred = model.predict(x_test)
# Calculate the accuracy score and convert to percentage
accuracy = accuracy_score(y_test, y_pred) * 100

print(f"\nModel Accuracy: {accuracy:.2f}%")


Model Accuracy: 94.87%
