In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (Input, Conv2D, MaxPooling2D, Dropout, Flatten,
                                     Dense, LSTM, MultiHeadAttention, Concatenate, Reshape)
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# =============================================================================
# --- Configuration ---
# =============================================================================

# Identifiers of the datasets this script knows about.
# NOTE: the path setup further down only special-cases ITALIAN_DATASET; every
# other identifier resolves to the "Neurovoz" directory.
ITALIAN_DATASET = "ITALIAN_DATASET"
UAMS_DATASET = "UAMS_DATASET"
NEUROVOZ_DATASET = "NEUROVOZ_DATASET"
MPOWER_DATASET = "MPOWER_DATASET"

# Mode tags. The selected value is interpolated verbatim into the feature file
# name ("features_<MODE>_<FEATURE_MODE>.npz") and the results directory name.
MODE_A = "A"
MODE_ALL_VALIDS = "ALL_VALIDS"

# Feature-set tags, interpolated into the same file and directory names.
FEATURE_MODE_BASIC = "BASIC"
FEATURE_MODE_ALL = "ALL"


# --- SELECT YOUR CONFIGURATION HERE ---
# These four values determine which feature file is read and where every
# artifact of the run (model, history, evaluation, layer features) is written.
DATASET = ITALIAN_DATASET
MODE = MODE_A
FEATURE_MODE = FEATURE_MODE_ALL
# Name of the sub-directory, under the results directory, that holds this
# model's artifacts.
MODEL_NAME = "cnn_att_lstm"
# ------------------------------------

# Path Setup
# On-disk directory name for each dataset identifier. Only the Italian and
# Neurovoz datasets have a dedicated directory.
_DATASET_DIRECTORIES = {
    ITALIAN_DATASET: "Italian",
    NEUROVOZ_DATASET: "Neurovoz",
}
if DATASET not in _DATASET_DIRECTORIES:
    # Every non-Italian dataset has always been routed to the Neurovoz paths.
    # That fallback is kept for compatibility, but it is no longer silent:
    # reading or overwriting another dataset's files should never go unnoticed.
    print(f"Warning: no data directory is mapped for dataset '{DATASET}'; using the 'Neurovoz' paths.")
dataset = _DATASET_DIRECTORIES.get(DATASET, "Neurovoz")

# Input: pre-computed features. Output root: one directory per model, nested
# under a per-(MODE, FEATURE_MODE) results directory.
FEATURES_FILE_PATH = os.path.join(os.getcwd(), dataset, "data", f"features_{MODE}_{FEATURE_MODE}.npz")
MODEL_PATH = os.path.join(os.getcwd(), dataset, f"results_{MODE}_{FEATURE_MODE}", MODEL_NAME)
# Created eagerly so that every writer below can assume the directory exists.
os.makedirs(MODEL_PATH, exist_ok=True)

# Artifacts written during and after training.
HISTORY_SAVE_PATH = os.path.join(MODEL_PATH, "history.csv")
EVALUATION_SAVE_PATH = os.path.join(MODEL_PATH, "evaluation_results.npz")
CONFUSION_MATRIX_SAVE_PATH = os.path.join(MODEL_PATH, "confusion_matrix.csv")
BEST_MODEL_PATH = os.path.join(MODEL_PATH, "best_model.keras")
# Activations of the named layers of the best checkpoint, plus labels and
# demographics, for later analysis.
LAYER_FEATURES_PATH = os.path.join(MODEL_PATH, "best_model_layer_features.npz")

# Hyperparameters
EPOCHS = 30            # number of passes over the training set given to model.fit
BATCH_SIZE = 32        # samples per gradient step given to model.fit
LEARNING_RATE = 0.001  # learning rate of the Adam optimizer
DROPOUT_RATE = 0.5     # rate of every Dropout layer in build_model
L2_STRENGTH = 0.01     # L2 kernel-regularization factor of every Conv2D layer

# Model Checkpoint Callback
# Writes the complete model (not just its weights) to BEST_MODEL_PATH each time
# the monitored validation AUC reaches a new maximum.
checkpoint_cb = ModelCheckpoint(
    filepath=BEST_MODEL_PATH,
    # What counts as "best": the highest value of the 'val_auc' metric.
    monitor='val_auc', mode='max',
    # What gets written: the full model, and only when it improves.
    save_best_only=True, save_weights_only=False,
    verbose=1,
)

# =============================================================================
# --- Data Loading and Preparation ---
# =============================================================================

def load_data(path: str) -> tuple:
    """
    Load model inputs, labels and demographics from a features ``.npz`` file.

    The archive must contain the arrays ``mel_spectrogram``, ``mfcc`` and
    ``labels``. The two feature arrays are concatenated along axis 1 to form
    the model input, so they must agree on every other axis.

    ``age`` and ``sex`` are optional (older feature files do not have them).
    Each one that is missing is replaced, independently of the other, by a
    float array of NaN with the same shape as ``labels``.

    Args:
        path: Location of the ``.npz`` feature file.

    Returns:
        A tuple ``(X, y, ages, sexes)``.

    Raises:
        KeyError: If a required array is absent from the archive.
    """
    print(f"--- Loading data from {path} ---")
    with np.load(path) as data:
        mel_spectrograms = data['mel_spectrogram']
        mfccs = data['mfcc']
        labels = data['labels']

        def _optional(key: str) -> np.ndarray:
            """Return ``data[key]``, or a NaN placeholder when the key is absent."""
            if key in data.files:
                return data[key]
            print(f"Warning: '{key}' not found in .npz file. Creating placeholder NaN array.")
            # The placeholder must be a float array: np.full_like(labels, nan)
            # would inherit an integer label dtype, which cannot represent NaN.
            return np.full(labels.shape, np.nan, dtype=float)

        ages = _optional('age')
        sexes = _optional('sex')

        X = np.concatenate((mel_spectrograms, mfccs), axis=1)
        y = labels

    print("Data loaded successfully.")
    print(f"  - Input shape (X): {X.shape}")
    print(f"  - Labels shape (y): {y.shape}")
    print(f"  - Ages shape: {ages.shape}")
    print(f"  - Sexes shape: {sexes.shape}")
    return X, y, ages, sexes

# =============================================================================
# --- Model Architecture ---
# =============================================================================

def build_model(input_shape: tuple) -> Model:
    """
    Assemble the hybrid CNN / self-attention / LSTM binary classifier.

    The 2-D input gets a trailing channel axis and passes through two identical
    CNN stages (two 5x5 convolutions, 5x5 max-pooling, dropout). The resulting
    feature map feeds three branches whose outputs are concatenated:

      * the flattened feature map itself,
      * multi-head self-attention over the feature map's spatial positions,
      * a two-layer LSTM over the same position sequence.

    A 128-unit dense bottleneck and a single sigmoid unit follow. The layers
    named 'cnn_output', 'attention_output', 'lstm_output' and
    'bottleneck_features' exist so their activations can be looked up by name.

    Args:
        input_shape: Shape of one sample; its first two entries are used as the
            two spatial dimensions.

    Returns:
        The uncompiled Keras model.
    """
    print("--- Building the model ---")
    height, width = input_shape[0], input_shape[1]
    inputs = Input(shape=input_shape)
    x = Reshape((height, width, 1))(inputs)

    def conv5(tensor):
        """Apply one L2-regularized 5x5 'same' convolution with 64 filters."""
        return Conv2D(64, kernel_size=5, activation='relu',
                      kernel_regularizer=l2(L2_STRENGTH), padding='same')(tensor)

    # --- CNN Blocks ---
    # Two identical stages; only the pooling layer of the last one is named.
    for pool_name in (None, 'cnn_output'):
        x = conv5(conv5(x))
        x = MaxPooling2D(pool_size=(5, 5), name=pool_name)(x)
        x = Dropout(DROPOUT_RATE)(x)

    cnn_flat = Flatten()(x)
    # Collapse the two spatial axes into a single sequence axis.
    _, n_rows, n_cols, n_channels = x.shape
    sequence = Reshape((n_rows * n_cols, n_channels))(x)

    # --- Attention and LSTM ---
    self_attention = MultiHeadAttention(num_heads=2, key_dim=64, name='attention_output')
    attended = self_attention(query=sequence, key=sequence, value=sequence)
    attention_flat = Flatten()(attended)

    recurrent = LSTM(128, return_sequences=True)(sequence)
    recurrent = LSTM(128, return_sequences=False, name='lstm_output')(recurrent)
    recurrent = Dropout(DROPOUT_RATE)(recurrent)

    # --- Concatenation and Output ---
    merged = Concatenate()([cnn_flat, attention_flat, recurrent])
    bottleneck = Dense(128, activation='relu', name='bottleneck_features')(merged)
    prediction = Dense(1, activation='sigmoid')(bottleneck)

    model = Model(inputs=inputs, outputs=prediction)
    print("Model built successfully.")
    return model

# =============================================================================
# --- Feature Extraction and Evaluation ---
# =============================================================================

def evaluate_and_save_results(model: Model, X_test: np.ndarray, y_test: np.ndarray):
    """
    Evaluate ``model`` on the test set and persist the results.

    Prints loss, accuracy and AUC, prints a per-class classification report,
    writes the 2x2 confusion matrix to ``CONFUSION_MATRIX_SAVE_PATH`` and the
    true labels with predicted probabilities to ``EVALUATION_SAVE_PATH``.

    Args:
        model: Compiled model. ``model.evaluate`` must return exactly
            ``(loss, accuracy, auc)``, i.e. the model is compiled with the
            accuracy and AUC metrics, in that order.
        X_test: Test inputs.
        y_test: Binary test labels (0 = healthy control, 1 = Parkinson patient).
    """
    print("\n--- Evaluating model on test data ---")
    loss, accuracy, auc = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy: {accuracy:.4f}, Test Loss: {loss:.4f}, Test AUC: {auc:.4f}")

    y_pred_probs = model.predict(X_test).flatten()
    y_pred_classes = (y_pred_probs > 0.5).astype(int)

    # Pin the class set explicitly. Without it scikit-learn infers the classes
    # from the data, so a test set or prediction vector containing a single
    # class would yield a 1x1 matrix that fits neither the two target names
    # nor the 2x2 DataFrame labels below.
    class_ids = [0, 1]

    print("\nClassification Report:")
    report = classification_report(
        y_test, y_pred_classes,
        labels=class_ids,
        target_names=['Healthy Control', 'Parkinson Patient'],
    )
    print(report)
    cm = confusion_matrix(y_test, y_pred_classes, labels=class_ids)
    pd.DataFrame(cm, index=['True HC', 'True PD'], columns=['Pred HC', 'Pred PD']).to_csv(CONFUSION_MATRIX_SAVE_PATH)
    print(f"Confusion matrix saved to '{CONFUSION_MATRIX_SAVE_PATH}'")

    np.savez_compressed(EVALUATION_SAVE_PATH, y_true=y_test, y_pred_probs=y_pred_probs)
    print(f"Evaluation results saved to '{EVALUATION_SAVE_PATH}'")

def extract_and_save_layer_features(model_path: str, X_data: np.ndarray, y_data: np.ndarray, age_data: np.ndarray, sex_data: np.ndarray):
    """
    Dump the activations of the model's named layers for later analysis.

    Loads the model stored at ``model_path``, runs ``X_data`` through it and
    collects the outputs of the layers 'cnn_output', 'attention_output',
    'lstm_output' and 'bottleneck_features'. Each output is flattened to
    (samples, features) and stored, together with the labels and demographics,
    in one compressed archive at ``LAYER_FEATURES_PATH``.

    If ``model_path`` does not exist, a warning is printed and nothing is
    written.
    """
    print("\n--- Extracting features from best model's layers ---")
    if not os.path.exists(model_path):
        print(f"Warning: Best model file not found at '{model_path}'. Skipping feature extraction.")
        return

    best_model = load_model(model_path)

    # A second model sharing the loaded model's input, but emitting the
    # intermediate tensors of interest instead of the final prediction.
    layer_names = ['cnn_output', 'attention_output', 'lstm_output', 'bottleneck_features']
    extractor_model = Model(
        inputs=best_model.inputs,
        outputs=[best_model.get_layer(layer_name).output for layer_name in layer_names],
    )

    print(f"Extracting features for {X_data.shape[0]} samples...")
    activations_per_layer = extractor_model.predict(X_data)

    # Metadata first, then one 2-D (samples, feature_dim) array per layer.
    payload = {'labels': y_data, 'age': age_data, 'sex': sex_data}
    payload.update(
        (layer_name, activations.reshape(activations.shape[0], -1))
        for layer_name, activations in zip(layer_names, activations_per_layer)
    )

    np.savez_compressed(LAYER_FEATURES_PATH, **payload)
    print(f"Layer features saved successfully to '{LAYER_FEATURES_PATH}'")
    for entry_name, array in payload.items():
        print(f"  - Saved '{entry_name}' with shape {array.shape}")

# =============================================================================
# --- Main Execution ---
# =============================================================================
if __name__ == '__main__':
    # End-to-end run: load features, split, train with checkpointing, save the
    # training history, evaluate, then dump layer activations of the best
    # checkpoint.

    # 1. Load model inputs, labels and per-sample demographics.
    X, y, age, sex = load_data(FEATURES_FILE_PATH)

    # 2. Stratified 80/20 split with a fixed seed. Age and sex are split
    #    alongside X and y so they stay aligned with the samples.
    X_train, X_test, y_train, y_test, age_train, age_test, sex_train, sex_test = train_test_split(
        X, y, age, sex, test_size=0.2, random_state=42, stratify=y
    )
    print(f"\nData split into training ({len(y_train)}) and testing ({len(y_test)}) sets.")

    # 3. Build and compile the model. The AUC metric is named 'auc' so that
    #    Keras reports it as 'val_auc', the quantity the checkpoint monitors.
    model = build_model(input_shape=(X_train.shape[1], X_train.shape[2]))
    model.summary()
    optimizer = Adam(learning_rate=LEARNING_RATE)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

    # 4. Train.
    #    NOTE(review): the test split doubles as the validation data, so the
    #    "best" checkpoint is selected on the same samples that are used for
    #    the evaluation below. Consider carving a separate validation split
    #    out of the training data.
    print("\n--- Starting model training ---")
    history = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[checkpoint_cb],
        verbose=1
    )
    print("--- Model training finished ---")

    # 5. Save the per-epoch metrics, one row per epoch.
    pd.DataFrame(history.history).to_csv(HISTORY_SAVE_PATH, index_label='epoch')
    print(f"\nTraining history saved to '{HISTORY_SAVE_PATH}'")

    # 6. Evaluate the in-memory model, i.e. the weights after the last epoch,
    #    which are not necessarily those of the best checkpoint.
    evaluate_and_save_results(model, X_test, y_test)

    # 7. Extract layer features from the best checkpoint on disk, for the test
    #    samples only.
    extract_and_save_layer_features(BEST_MODEL_PATH, X_test, y_test, age_test, sex_test)

--- Loading data from D:\Projects\Voice\Parkinson-s-Disease-Detector-Using-AI\Parkinson-s-Disease-Detector-Using-AI\1\Italian\data\features_A_ALL.npz ---
Data loaded successfully.
  - Input shape (X): (440, 60, 94)
  - Labels shape (y): (440,)
  - Ages shape: (440,)
  - Sexes shape: (440,)

Data split into training (352) and testing (88) sets.
--- Building the model ---
Model built successfully.



--- Starting model training ---
Epoch 1/30
11/11 ━━━━━━━━━━━━━━━━━━━━ 0s 460ms/step - accuracy: 0.5275 - auc: 0.5494 - loss: 6.8362
Epoch 1: val_auc improved from None to 0.73528, saving model to D:\Projects\Voice\Parkinson-s-Disease-Detector-Using-AI\Parkinson-s-Disease-Detector-Using-AI\1\Italian\results_A_ALL\cnn_att_lstm\best_model.keras
11/11 ━━━━━━━━━━━━━━━━━━━━ 15s 632ms/step - accuracy: 0.5597 - auc: 0.5856 - loss: 4.6240 - val_accuracy: 0.6250 - val_auc: 0.7353 - val_loss: 2.6196
Epoch 2/30
11/11 ━━━━━━━━━━━━━━━━━━━━ 0s 430ms/step - accuracy: 0.6233 - auc: 0.6437 - loss: 2.7091
Epoch 2: val_auc improved from 0.73528 to 0.82696, saving model to D:\Projects\Voice\Parkinson-s-Disease-Detector-Using-AI\Parkinson-s-Disease-Detector-Using-AI\1\Italian\results_A_ALL\cnn_att_lstm\best_model.keras
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 490ms/step - accuracy: 0