In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import joblib # NEW: For saving the k-NN model

from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Conv2D, MaxPooling2D, Dropout, Flatten,
                                     Dense, LSTM, MultiHeadAttention, Concatenate, Reshape)
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier # NEW: Import k-NN
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay # NEW: For k-NN evaluation

# =============================================================================
# --- Configuration --
# =============================================================================

# --- SELECT YOUR CONFIGURATION HERE ---
DATASET = "MPOWER_DATASET"
MODE = "ALL_VALIDS"
FEATURE_MODE = "ALL"
MODEL_NAME = "feature_extractor_cnn_att_lstm" # MODIFIED: Model name updated
# ------------------------------------

# Path Setup
dataset = "mPower"
FEATURES_FILE_PATH = os.path.join(os.getcwd(), dataset, "data", f"features_{MODE}_{FEATURE_MODE}.npz")
RESULTS_PATH = os.path.join(os.getcwd(), dataset, f"results_{MODE}_{FEATURE_MODE}")
MODEL_PATH = os.path.join(RESULTS_PATH, MODEL_NAME)
os.makedirs(MODEL_PATH, exist_ok=True)

HISTORY_SAVE_PATH = os.path.join(MODEL_PATH, "history.csv")
BEST_EXTRACTOR_PATH = os.path.join(MODEL_PATH, "best_feature_extractor.keras") # MODIFIED: Path name updated
KNN_MODEL_PATH = os.path.join(MODEL_PATH, "knn_classifier.joblib") # NEW: Path for k-NN model

# Hyperparameters
EPOCHS = 30
BATCH_SIZE = 32
LEARNING_RATE = 0.001
DROPOUT_RATE = 0.5
L2_STRENGTH = 0.01
K_NEIGHBORS = 5 # NEW: k for the k-NN classifier

# Model Checkpoint Callback
# We still need a final dense layer during training to learn the features effectively
checkpoint_cb = ModelCheckpoint(BEST_EXTRACTOR_PATH, monitor='val_loss', mode='min', save_best_only=True, verbose=1)

# =============================================================================
# --- Data Loading and Preparation ---
# =============================================================================

def load_data(path: str) -> tuple:
    """
    MODIFIED: Loads only features and labels from the .npz file, excluding age and sex.
    """
    print(f"--- Loading data from {path} ---")
    with np.load(path) as data:
        mel_spectrograms = data['mel_spectrogram']
        mfccs = data['mfcc']
        labels = data['labels']
        # Combine mel spectrograms and MFCCs
        X = np.concatenate((mel_spectrograms, mfccs), axis=1)
    print("Data loaded successfully.")
    return X, labels

# =============================================================================
# --- Model Architecture (Feature Extractor) ---
# =============================================================================
from tensorflow.keras.saving import register_keras_serializable

@register_keras_serializable()
class FeatureExtractor(Model):
    """
    MODIFIED: This model is now just the feature extractor.
    It stops before the final classification layers.
    """
    def __init__(self, input_shape, **kwargs):
        super(FeatureExtractor, self).__init__(**kwargs)
        self.input_shape_config = input_shape

        # CNN Layers
        self.reshape_in = Reshape((input_shape[0], input_shape[1], 1))
        self.conv1a = Conv2D(64, 5, activation='relu', kernel_regularizer=l2(L2_STRENGTH), padding='same')
        self.conv1b = Conv2D(64, 5, activation='relu', kernel_regularizer=l2(L2_STRENGTH), padding='same')
        self.pool1 = MaxPooling2D(5)
        self.drop1 = Dropout(DROPOUT_RATE)
        self.conv2a = Conv2D(64, 5, activation='relu', kernel_regularizer=l2(L2_STRENGTH), padding='same')
        self.conv2b = Conv2D(64, 5, activation='relu', kernel_regularizer=l2(L2_STRENGTH), padding='same')
        self.pool2 = MaxPooling2D(5)
        self.drop2 = Dropout(DROPOUT_RATE)
        self.flatten_cnn = Flatten()

        # Attention and LSTM Layers
        self.attention = MultiHeadAttention(num_heads=2, key_dim=64)
        self.flatten_att = Flatten()
        self.lstm1 = LSTM(128, return_sequences=True)
        self.lstm2 = LSTM(128, return_sequences=False)
        self.drop_lstm = Dropout(DROPOUT_RATE)

        # Final Concatenation to produce the feature vector (embedding)
        self.concat = Concatenate(name='embedding_output')

    def call(self, inputs):
        # CNN Path
        x = self.reshape_in(inputs)
        x = self.conv1a(x)
        x = self.conv1b(x)
        x = self.pool1(x)
        x = self.drop1(x)
        x = self.conv2a(x)
        x = self.conv2b(x)
        x = self.pool2(x)
        x = self.drop2(x)
        cnn_flat = self.flatten_cnn(x)

        # Attention and LSTM Path
        shape = tf.shape(x)
        sequence = tf.reshape(x, [-1, shape[1] * shape[2], shape[3]])
        att_out = self.attention(query=sequence, key=sequence, value=sequence)
        att_flat = self.flatten_att(att_out)
        lstm_seq = self.lstm1(sequence)
        lstm_out = self.lstm2(lstm_seq)
        lstm_out = self.drop_lstm(lstm_out)

        # MODIFIED: The output is the concatenated feature vector.
        return self.concat([cnn_flat, att_flat, lstm_out])

    def get_config(self):
        config = super(FeatureExtractor, self).get_config()
        config.update({"input_shape": self.input_shape_config})
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)

def build_training_model(input_shape: tuple) -> Model:
    """
    MODIFIED: Builds a temporary model for training the feature extractor.
    This includes a final dense layer needed for calculating loss during training.
    """
    print("--- Building the training model ---")
    inputs = Input(shape=input_shape)

    # Instantiate the feature extractor
    feature_extractor = FeatureExtractor(input_shape=input_shape)

    # Get the embedding output
    embeddings = feature_extractor(inputs)

    # Add a temporary classification head for training purposes
    outputs = Dense(1, activation='sigmoid', name='classification_head')(embeddings)

    # Create the training model
    model = Model(inputs=inputs, outputs=outputs)

    print("Training model built successfully.")
    return model

# =============================================================================
# --- NEW: k-NN Training and Evaluation ---
# =============================================================================
from sklearn.preprocessing import StandardScaler # NEW: For feature scaling
from sklearn.model_selection import GridSearchCV # NEW: For hyperparameter tuning

# =============================================================================
# --- MODIFIED: k-NN Training, Tuning, and Evaluation ---
# =============================================================================

def train_and_evaluate_knn(X_train_features, y_train, X_test_features, y_test):
    """
    MODIFIED: Scales features, finds the best k-NN hyperparameters using GridSearchCV,
    trains the best model, and evaluates it.
    """
    # --- 1. Scale the Extracted Features ---
    print("\n--- Scaling Features ---")
    scaler = StandardScaler()

    # Fit the scaler on the TRAINING data and transform it
    X_train_scaled = scaler.fit_transform(X_train_features)

    # Apply the SAME transformation to the test data
    X_test_scaled = scaler.transform(X_test_features)
    print("Features scaled successfully.")

    # --- 2. Hyperparameter Tuning with GridSearchCV ---
    print("\n--- Searching for the best k-NN hyperparameters ---")

    # Define the grid of parameters to search
    param_grid = {
        'n_neighbors': [3, 5, 7, 9, 11, 13, 15], # Test a range of odd numbers for k
        'weights': ['uniform', 'distance'],      # Test uniform vs. distance-weighted voting
        'metric': ['euclidean', 'manhattan']     # Test different distance metrics
    }

    # Set up the grid search with 5-fold cross-validation
    grid_search = GridSearchCV(
        KNeighborsClassifier(),
        param_grid,
        cv=5, # 5-fold cross-validation
        scoring='accuracy',
        verbose=1,
        n_jobs=-1 # Use all available CPU cores
    )

    # Run the search on the scaled training data
    grid_search.fit(X_train_scaled, y_train)

    # --- 3. Train and Evaluate the Best Model ---
    print("\n--- Best Hyperparameters Found ---")
    print(grid_search.best_params_)

    # Get the best model found by the grid search
    best_knn = grid_search.best_estimator_

    # Save the best trained k-NN model
    joblib.dump(best_knn, KNN_MODEL_PATH)
    print(f"\nBest k-NN model saved to '{KNN_MODEL_PATH}'")

    # --- Evaluation ---
    print("\n--- Evaluating Best k-NN on Test Set ---")
    y_pred = best_knn.predict(X_test_scaled)

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Healthy', 'Parkinson']))

    # Plot and save confusion matrix
    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Healthy', 'Parkinson'])
    disp.plot(cmap=plt.cm.Blues)
    plt.title("Best k-NN Confusion Matrix")
    cm_path = os.path.join(MODEL_PATH, "knn_best_confusion_matrix.png")
    plt.savefig(cm_path, dpi=300)
    plt.show()
    print(f"Confusion matrix saved to '{cm_path}'")

In [8]:
import os
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

def tune_and_evaluate_knn_pipeline(X_train, y_train, X_test, y_test, results_path):
    """
    Streamlined to find the best k-NN pipeline using SMOTE for class
    imbalance and NCA for dimensionality reduction.
    """
    # --- 1. Define the Optimized k-NN Pipeline and Search Space ---
    # This pipeline bundles scaling, resampling (SMOTE), dimensionality reduction (NCA),
    # and the final classifier (k-NN).
    knn_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('nca', NeighborhoodComponentsAnalysis(random_state=42)),
        ('classifier', KNeighborsClassifier())
    ])

    # Define the parameters to be tuned for each step of the pipeline
    param_dist = {
        'nca__n_components': [10, 13, 16, 20, 25, 31],
        'classifier__n_neighbors': [5, 7, 9, 11, 13, 15],
        'classifier__weights': ['distance'],
        'classifier__metric': ['manhattan']
    }

    # --- 2. Run the Automated Hyperparameter Search ---
    print("\n--- Tuning Optimized k-NN Pipeline ---")

    # Use RandomizedSearchCV for an efficient search
    search = RandomizedSearchCV(
        knn_pipeline,
        param_dist,
        n_iter=15,
        cv=5,
        scoring='roc_auc',
        random_state=42,
        verbose=1
    )
    search.fit(X_train, y_train)

    # --- 3. Evaluate, Report, and Save the Best Model ---
    best_pipeline = search.best_estimator_
    y_pred = best_pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    print(f"\nBest cross-validation score: {search.best_score_:.4f}")

    print("\n\n--- Detailed Report for Best k-NN Pipeline ---")
    print(f"Test Set Accuracy: {test_accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, digits=4))
    print("\nBest Hyperparameters Found:")
    print(search.best_params_)

    # Save the best pipeline object to a file for future use
    best_pipeline_path = os.path.join(results_path, "best_knn_pipeline.joblib")
    joblib.dump(best_pipeline, best_pipeline_path)
    print(f"\n✅ Best k-NN pipeline saved to: {best_pipeline_path}")

In [3]:

# =============================================================================
# --- Main Execution ---
# =============================================================================
if __name__ == '__main__':
    # MODIFIED: Load data without age and sex
    X, y = load_data(FEATURES_FILE_PATH)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"\nData split into training ({len(y_train)}) and testing ({len(y_test)}) sets.")

    # 1. Train the Feature Extractor
    training_model = build_training_model(input_shape=(X_train.shape[1], X_train.shape[2]))
    training_model.summary()

    optimizer = Adam(learning_rate=LEARNING_RATE)
    training_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    print("\n--- Starting feature extractor training ---")
    history = training_model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[checkpoint_cb],
        verbose=1
    )
    print("--- Feature extractor training finished ---")
    pd.DataFrame(history.history).to_csv(HISTORY_SAVE_PATH, index_label='epoch')
    print(f"\nTraining history saved to '{HISTORY_SAVE_PATH}'")

--- Loading data from D:\Projects\Voice\Parkinson-s-Disease-Detector-Using-AI\Parkinson-s-Disease-Detector-Using-AI\1\mPower\data\features_ALL_VALIDS_ALL.npz ---
Data loaded successfully.

Data split into training (1664) and testing (416) sets.
--- Building the training model ---

Training model built successfully.



--- Starting feature extractor training ---
Epoch 1/30
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 420ms/step - accuracy: 0.5356 - loss: 4.6076
Epoch 1: val_loss improved from None to 2.55848, saving model to D:\Projects\Voice\Parkinson-s-Disease-Detector-Using-AI\Parkinson-s-Disease-Detector-Using-AI\1\mPower\results_ALL_VALIDS_ALL\feature_extractor_cnn_att_lstm\best_feature_extractor.keras
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 465ms/step - accuracy: 0.5114 - loss: 3.2799 - val_accuracy: 0.5168 - val_loss: 2.5585
Epoch 2/30
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 517ms/step - accuracy: 0.4993 - loss: 2.5207
Epoch 2: val_loss improved from 2.55848 to 2.33255, saving model to D:\Projects\Voice\Parkinson-s-Disease-Detector-Using-AI\Parkinson-s-Disease-Detector-Using-AI\1\mPower\results_ALL_VALIDS_ALL\feature_extractor_cnn_att_lstm\best_feature_extractor.keras
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2

In [4]:
    # 2. Load the best feature extractor and extract embeddings
    print("\n--- Loading best feature extractor model ---")

    # --- MODIFIED: Correctly build the inference model from the subclass ---
    # First, load the entire model that was saved during training
    full_model_to_load = tf.keras.models.load_model(BEST_EXTRACTOR_PATH)

    # Now, build the standalone feature extractor model
    input_shape = (X_train.shape[1], X_train.shape[2])
    inputs = Input(shape=input_shape)

    # Instantiate your custom FeatureExtractor class
    feature_extractor_instance = FeatureExtractor(input_shape=input_shape)

    # Call the instance on the input tensor to get the output
    outputs = feature_extractor_instance(inputs)

    # Create the final inference model from the inputs and outputs
    inference_extractor_model = Model(inputs, outputs)

    # The saved model has weights for the extractor AND the final dense layer.
    # We slice off the last two weights (weights and bias for the dense layer).
    extractor_weights = full_model_to_load.get_weights()[:-2]
    inference_extractor_model.set_weights(extractor_weights)

    print("Feature extractor model built and weights loaded successfully.")
    # --- End of MODIFIED section ---

    print("--- Extracting features (embeddings) ---")
    # Use the newly created model to predict
    X_train_features = inference_extractor_model.predict(X_train, batch_size=BATCH_SIZE)
    X_test_features = inference_extractor_model.predict(X_test, batch_size=BATCH_SIZE)

    print(f"Shape of training features: {X_train_features.shape}")
    print(f"Shape of testing features: {X_test_features.shape}")



--- Loading best feature extractor model ---
Feature extractor model built and weights loaded successfully.
--- Extracting features (embeddings) ---
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 109ms/step
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 114ms/step
Shape of training features: (1664, 896)
Shape of testing features: (416, 896)


In [5]:
    tune_and_evaluate_knn_pipeline(X_train_features, y_train, X_test_features, y_test, RESULTS_PATH)


--- Tuning Optimized k-NN Pipeline ---
Fitting 10 folds for each of 15 candidates, totalling 150 fits

Best cross-validation score: 0.7509


--- Detailed Report for Best k-NN Pipeline ---
Test Set Accuracy: 0.6827

Classification Report:
              precision    recall  f1-score   support

           0     0.6727    0.7115    0.6916       208
           1     0.6939    0.6538    0.6733       208

    accuracy                         0.6827       416
   macro avg     0.6833    0.6827    0.6824       416
weighted avg     0.6833    0.6827    0.6824       416


Best Hyperparameters Found:
{'nca__n_components': 31, 'classifier__weights': 'distance', 'classifier__n_neighbors': 15, 'classifier__metric': 'manhattan'}

✅ Best k-NN pipeline saved to: D:\Projects\Voice\Parkinson-s-Disease-Detector-Using-AI\Parkinson-s-Disease-Detector-Using-AI\1\mPower\results_ALL_VALIDS_ALL\best_knn_pipeline.joblib


In [11]:
    # --- FIX: Reshape the data from 3D to 2D before scaling ---
    # Get the original dimensions
    n_samples_train, d1, d2 = X_train.shape
    n_samples_test = X_test.shape[0]

    # Reshape by multiplying the last two dimensions together
    X_train_reshaped = X_train.reshape((n_samples_train, d1 * d2))
    X_test_reshaped = X_test.reshape((n_samples_test, d1 * d2))

    print(f"Reshaped X_train shape: {X_train_reshaped.shape}") # Will be (80, 2400)

    # Now, you can scale the reshaped data without error
    scaler = StandardScaler()
    scaler.fit(X_train_reshaped)

    X_train_scaled = scaler.transform(X_train_reshaped)
    X_test_scaled = scaler.transform(X_test_reshaped)

    print("\nData successfully reshaped and scaled!")
    tune_and_evaluate_knn_pipeline(X_train_scaled, y_train, X_test_scaled, y_test, RESULTS_PATH)

Reshaped X_train shape: (1664, 5640)

Data successfully reshaped and scaled!

--- Tuning Optimized k-NN Pipeline ---
Fitting 5 folds for each of 15 candidates, totalling 75 fits

Best cross-validation score: 0.7648


--- Detailed Report for Best k-NN Pipeline ---
Test Set Accuracy: 0.7067

Classification Report:
              precision    recall  f1-score   support

           0     0.6748    0.7981    0.7313       208
           1     0.7529    0.6154    0.6772       208

    accuracy                         0.7067       416
   macro avg     0.7139    0.7067    0.7043       416
weighted avg     0.7139    0.7067    0.7043       416


Best Hyperparameters Found:
{'nca__n_components': 31, 'classifier__weights': 'distance', 'classifier__n_neighbors': 9, 'classifier__metric': 'manhattan'}

✅ Best k-NN pipeline saved to: D:\Projects\Voice\Parkinson-s-Disease-Detector-Using-AI\Parkinson-s-Disease-Detector-Using-AI\1\mPower\results_ALL_VALIDS_ALL\best_knn_pipeline.joblib
