In [1]:
# Import necessary libraries
import pandas as pd  # Used for handling datasets
import numpy as np  # Used for numerical computations
import lightgbm as lgb  # LightGBM library for gradient boosting
from sklearn.model_selection import train_test_split  # Splitting data into training and testing sets
from sklearn.preprocessing import StandardScaler  # Standardizing feature values
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # Model evaluation metrics
import joblib  # Used for saving and loading trained models

#  Load dataset
# Reads the cleaned Parkinson's dataset from the provided path
data = pd.read_csv(r'C:\Users\aryad\Downloads\Parkinson-s-Disease-Detection-main\Parkinson-s-Disease-Detection-main\My_New_Flask_app\cleaned_parkinsons_dataset.csv')

#  Define features and target variable
X = data.drop(columns=['status'])  # Features (all columns except 'status')
y = data['status']  # Target variable (binary: 0 = Healthy, 1 = Parkinson's)

#  Handle class imbalance dynamically
# Class weights calculation to balance data during training
class_weights = len(y) / (2 * np.bincount(y))  # Computes weights for each class
scale_pos_weight = class_weights[0] / class_weights[1]  # Gives more importance to the minority class

#  Split dataset into training and testing sets
# Maintains class distribution using stratify=y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#  Scale features using StandardScaler
scaler = StandardScaler()  # Initializes the scaler to normalize values
X_train = scaler.fit_transform(X_train)  # Fits the scaler to training data & transforms it
X_test = scaler.transform(X_test)  # Transforms the test data using the fitted scaler

#  Convert data to LightGBM dataset format
# LightGBM requires a specific dataset format for training
dtrain = lgb.Dataset(X_train, label=y_train)  # Training dataset
dtest = lgb.Dataset(X_test, label=y_test, reference=dtrain)  # Test dataset (reference helps in validation)

#  Define LightGBM parameters (optimized for classification)
params = {
    "objective": "binary",  # Binary classification (0 or 1)
    "metric": "binary_logloss",  # Logarithmic loss for evaluating model performance
    "learning_rate": 0.03,  # Controls how much the model learns in each iteration (lower = better generalization)
    "max_depth": 8,  # Maximum depth of trees (prevents overfitting)
    "num_leaves": 31,  # Number of leaf nodes in each tree (more leaves = complex model)
    "min_child_samples": 20,  # Minimum number of data points per leaf (avoids overfitting)
    "subsample": 0.8,  # Uses 80% of data in each boosting iteration (reduces overfitting)
    "colsample_bytree": 0.8,  # Uses 80% of features in each boosting iteration (improves diversity)
    "scale_pos_weight": scale_pos_weight,  # Balances class weights dynamically
    "random_state": 42  # Ensures reproducibility (same model each time)
}

#  Train the LightGBM model
lgb_model = lgb.train(
    params,  # Model parameters
    dtrain,  # Training dataset
    num_boost_round=500,  # Number of boosting rounds (higher = better accuracy)
    valid_sets=[dtest],  # Validation dataset to monitor performance
    callbacks=[lgb.early_stopping(50)]  # Stops training if no improvement in 50 rounds
)

#  Make predictions
y_pred_proba = lgb_model.predict(X_test)  # Get probability predictions for test data
y_pred = (y_pred_proba > 0.5).astype(int)  # Convert probabilities to class labels (threshold = 0.5)

#  Evaluate the model
accuracy = accuracy_score(y_test, y_pred)  # Calculates accuracy
conf_matrix = confusion_matrix(y_test, y_pred)  # Generates confusion matrix
report = classification_report(y_test, y_pred)  # Generates precision, recall, and F1-score

#  Print evaluation metrics
print(f" Model Accuracy: {accuracy:.4f}")  # Displays accuracy (4 decimal places)
print("Confusion Matrix:")
print(conf_matrix)  # Displays confusion matrix
print("Classification Report:")
print(report)  # Displays precision, recall, and F1-score for each class

#  Save model and scaler for future use
joblib.dump(lgb_model, "lgb_parkinsons_model.pkl")  # Saves trained model
joblib.dump(scaler, "scaler1.pkl")  # Saves scaler

print("🚀 Model and scaler saved successfully!")  # Confirms successful saving


[LightGBM] [Info] Number of positive: 118, number of negative: 38
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000834 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1128
[LightGBM] [Info] Number of data points in the train set: 156, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.756410 -> initscore=1.133098
[LightGBM] [Info] Start training from score 1.133098
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[205]	valid_0's binary_logloss: 0.261881
 Model Accuracy: 0.9231
Confusion Matrix:
[[ 7  3]
 [ 0 29]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.70      0.82        10
           1       0.91      1.00      0.95        29

    accuracy                           0.92        39
   macro avg

In [3]:

import librosa
import numpy as np
import joblib
import gradio as gr
import lightgbm as lgb  # Ensure LightGBM is imported

# Load trained LightGBM model and scaler
lgb_model = joblib.load('lgb_parkinsons_model.pkl')
scaler = joblib.load('scaler1.pkl')

def extract_features_from_audio(audio_path):
    """
    Extracts relevant audio features ensuring they match the trained model.
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)  # Load audio with original sampling rate
        
        # Extract MFCC, Chroma, and Spectral Contrast features
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
        chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1)
        spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr, fmin=sr * 0.01), axis=1)

        # Combine features into a single array
        features = np.hstack([mfccs, chroma, spectral_contrast])  

        # Ensure feature consistency with the trained model
        expected_features = scaler.n_features_in_  # Matches trained scaler
        if features.shape[0] < expected_features:
            features = np.pad(features, (0, expected_features - features.shape[0]), mode='constant')
        elif features.shape[0] > expected_features:
            features = features[:expected_features]

        # Reshape and scale input
        features = features.reshape(1, -1)
        features = scaler.transform(features)  # Scale using trained scaler

        return features  # Return processed features

    except Exception as e:
        return str(e)  # Return error message

def predict_parkinsons(audio_file):
    """
    Predicts Parkinson’s disease from a voice recording using LightGBM.
    """
    try:
        features = extract_features_from_audio(audio_file)
        
        if isinstance(features, str):  # If an error occurred
            return f"Feature Extraction Error: {features}"

        # Get prediction probability
        probability = lgb_model.predict(features)[0]  

        # Convert probability to a classification result
        result = "Parkinson's Detected 🟠" if probability > 0.5 else "Healthy ✅"
        return f"Prediction: {result} (Confidence: {probability:.2f})"

    except Exception as e:
        return f"Prediction Error: {str(e)}"

# Gradio UI
iface = gr.Interface(
    fn=predict_parkinsons,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Parkinson's Detection using LightGBM",
    description="Upload a .wav file to check if Parkinson’s is detected.",
)

# Launch the Gradio app
iface.launch()


* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


