In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier, DMatrix, train
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Load dataset
data = pd.read_csv(r'C:\Users\aryad\Downloads\Parkinson-s-Disease-Detection-main\Parkinson-s-Disease-Detection-main\My_New_Flask_app\cleaned_parkinsons_dataset.csv')

# Define features and target
# X to train the model, and y to evaluate its accuracy.
X = data.drop(columns=['status'])  # Features
y = data['status']  # Target (binary: 0 = Healthy, 1 = Parkinson's)

# Handle class imbalance dynamically
#Dividing by 2 * np.bincount(y) ensures both classes have an equal impact on training.
class_weights = len(y) / (2 * np.bincount(y))  # Class weight calculation
scale_pos_weight = class_weights[0] / class_weights[1] #more attention to the minority class.

# Split dataset
#Ensure reproducibility (same split every time).
#stratify=y	Maintains the same class distribution in train/test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()  #Initializes the scaler, which standardizes values using Z-score normalization.
X_train = scaler.fit_transform(X_train)#Fits scaler on training data & transforms it (mean=0, std=1).
X_test = scaler.transform(X_test)

# Converts NumPy arrays into an optimized format for XGBoost
dtrain = DMatrix(X_train, label=y_train)
dtest = DMatrix(X_test, label=y_test)

# Optimized XGBoost parameters
params = {
    "objective": "binary:logistic",# binary classification problems
    "eval_metric": "logloss",# how much the model learns at each step.
    "learning_rate": 0.03,  # Lowered for better generalization
    "max_depth": 8,  # Increased depth
    "min_child_weight": 2,  # Adjusted to prevent overfitting,how much data is needed to split a node.
    "gamma": 0.4,  
    "subsample": 0.95, #uses 95% of data 
    "colsample_bytree": 0.9,#90% of features used in each tree.
    "scale_pos_weight": scale_pos_weight,  # Dynamic weight for class balancing
    "reg_alpha": 0.05,# Reduces unnecessary features.(L1 Regularization)
    "reg_lambda": 0.05, # (L2 Regularization) → Smoothens weights to prevent extreme values.
    "random_state": 42, # Ensures same results every time
    "tree_method": "hist"  # Faster computation
}

# Train model with early stopping
xgb_model = train(
    params, dtrain,
    num_boost_round=500,  # Increased rounds
    evals=[(dtest, "validation")],
    early_stopping_rounds=50,  # Adjusted for better generalization
    verbose_eval=False
)

# Predictions
y_pred_proba = xgb_model.predict(dtest)# to make predictions on the test data (dtest).
y_pred = (y_pred_proba > 0.5).astype(int)  # Proper threshold

# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

report = classification_report(y_test, y_pred)

print(f"✅ Model Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(report)

# Save model and scaler
# joblib.dump(xgb_model, "xgb_parkinsons_model.pkl")
# joblib.dump(scaler, "scaler.pkl")

print("🚀 Updated model and scaler saved successfully!")

✅ Model Accuracy: 0.8718
Confusion Matrix:
[[ 6  4]
 [ 1 28]]
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.60      0.71        10
           1       0.88      0.97      0.92        29

    accuracy                           0.87        39
   macro avg       0.87      0.78      0.81        39
weighted avg       0.87      0.87      0.86        39

🚀 Updated model and scaler saved successfully!


In [10]:
import librosa # For audio processing and feature extraction  
import numpy as np
import joblib # For saving and loading trained models  
import gradio as gr
import xgboost as xgb  # Ensure XGBoost is imported for DMatrix

# Load trained model and scaler
model = joblib.load('xgb_parkinsons_model.pkl')
scaler = joblib.load('scaler.pkl')

def extract_features_from_audio(audio_path):
    """
    Extracts relevant audio features ensuring they match the trained model.
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)  # Load audio with original sampling rate
        
        # Extract MFCC, Chroma, and Spectral Contrast features
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)#Captures how the voice frequency changes over time.
        chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1) # Measures the pitch (musical tone) of the voice.
        spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr, fmin=sr * 0.01), axis=1)#Captures the difference between loud and quiet parts of speech.

        # Combine features into a single array
        features = np.hstack([mfccs, chroma, spectral_contrast])  

        # Ensure feature consistency with the trained model
        expected_features = scaler.n_features_in_  # Matches trained scaler
        if features.shape[0] < expected_features:
            features = np.pad(features, (0, expected_features - features.shape[0]), mode='constant')
        elif features.shape[0] > expected_features:
            features = features[:expected_features]

        # Reshape and scale input
        features = features.reshape(1, -1)
        features = scaler.transform(features)  # Scale using trained scaler

        return features  # Return processed features

    except Exception as e:
        return str(e)  # Return error message

def predict_parkinsons(audio_file):
    """
    Predicts Parkinson’s disease from a voice recording.
    """
    try:
        features = extract_features_from_audio(audio_file)
        
        if isinstance(features, str):  # If an error occurred
            return f"please enter .wav file: {features}"

        # Convert to DMatrix for XGBoost prediction
        dmatrix_features = xgb.DMatrix(features)

        # Get prediction probability
        probability = model.predict(dmatrix_features)[0]  

        # Convert probability to a classification result
        result = "Parkinson's Detected 🟠" if probability > 0.5 else "Healthy ✅"
        return f"Prediction: {result} (Confidence: {probability:.2f})"

    except Exception as e:
        return f"Prediction Error: {str(e)}"

#Gradio UI
iface = gr.Interface(
    fn=predict_parkinsons,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Parkinson's Detection from Voice",
    description="Upload a .wav file to check if Parkinson’s is detected.",
)

# Launch the Gradio app
iface.launch()

* Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.


