In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

def get_acoustic_feature_names():
    """Build the ordered list of the 23 acoustic feature column names.

    The order is: four general measures, five jitter measures, six shimmer
    measures, then the mean of formants 1-4 followed by their standard
    deviations. A new list is created on every call.
    """
    general = ['duration', 'mean_f0', 'stdev_f0', 'hnr']

    jitter_kinds = ('local', 'local_absolute', 'rap', 'ppq5', 'ddp')
    jitter = [f'jitter_{kind}' for kind in jitter_kinds]

    shimmer_kinds = ('local', 'local_db', 'apq3', 'apq5', 'apq11', 'dda')
    shimmer = [f'shimmer_{kind}' for kind in shimmer_kinds]

    # All four means come first, then all four standard deviations.
    formants = [
        f'f{number}_{stat}'
        for stat in ('mean', 'stdev')
        for number in range(1, 5)
    ]

    return general + jitter + shimmer + formants

def _collect_feature_frames(data):
    """Convert every plottable array of an opened archive into a DataFrame.

    Args:
        data: Mapping-like object returned by ``np.load`` for a ``.npz`` file.

    Returns:
        A list of DataFrames, one per discovered feature key. The keys
        'labels', 'age' and 'sex' are never treated as features. Arrays with
        fewer than two dimensions are skipped.
    """
    frames = []
    for key in data.keys():
        if key in ('labels', 'age', 'sex'):
            continue

        feature_data = data[key]

        # Pre-computed acoustic features: one column per feature. Only a 2D
        # array can be labelled column by column; any other rank falls
        # through to the generic handling below instead of crashing.
        if key == 'acoustic_features' and feature_data.ndim == 2:
            print(f"   -> Found acoustic features ('{key}')")
            feature_names = get_acoustic_feature_names()
            if feature_data.shape[1] != len(feature_names):
                # Column count does not match the known names: use generic ones.
                feature_names = [f"{key}_{i}" for i in range(feature_data.shape[1])]
            frames.append(pd.DataFrame(feature_data, columns=feature_names))

        # Other 2D/3D features (MFCC, Mel spectrogram, ...): reduce to a single
        # representative value per sample.
        elif feature_data.ndim > 1:
            print(f"   -> Found matrix feature ('{key}'). Calculating mean per sample.")
            # Average over every axis except the first (sample) axis.
            axes_to_average = tuple(range(1, feature_data.ndim))
            mean_feature = np.mean(feature_data, axis=axes_to_average)
            frames.append(pd.DataFrame(mean_feature, columns=[f'mean_{key}']))

        else:
            print(f"   -> Skipping '{key}': expected at least 2 dimensions, got {feature_data.ndim}.")

    return frames


def plot_boxplots_from_npz(file_path, OUTPUT_PATH):
    """Plot standardized per-feature boxplots, split by condition, from a .npz file.

    The archive must contain a 'labels' array. Every other array (except
    'age' and 'sex') with at least two dimensions is treated as a feature:
    a 2D 'acoustic_features' array contributes one column per feature, any
    other array is averaged over all non-sample axes. Features are
    standardized with ``StandardScaler`` and drawn in one figure with a
    shared x-axis, which is saved as a PNG inside ``OUTPUT_PATH``.

    Labels 0 and 1 are mapped to 'Healthy Control' and 'Parkinson Patient';
    rows with any other label, or with missing values, are dropped.

    Args:
        file_path: Path to the .npz feature archive.
        OUTPUT_PATH: Directory for the PNG; created if it does not exist.

    Returns:
        None. A missing file, an unreadable archive, a missing 'labels'
        array, no plottable feature, or no remaining rows are reported with
        a printed message instead of an exception.
    """
    # --- 1. Validation and Loading ---
    if not os.path.exists(file_path):
        print(f"Error: File not found at '{file_path}'")
        return

    print(f"Loading data from '{os.path.basename(file_path)}'...")
    try:
        # NOTE(security): allow_pickle=True can execute arbitrary code when the
        # archive comes from an untrusted source. Kept so archives containing
        # object arrays still load; only use with trusted files.
        data = np.load(file_path, allow_pickle=True)
    except Exception as e:
        print(f"Error: Could not load the .npz file. Reason: {e}")
        return

    # --- 2. Dynamic Feature Extraction and Processing ---
    try:
        if 'labels' not in data:
            print("Error: 'labels' array not found in the file. Cannot create plots.")
            return

        # Flatten so column-shaped labels, e.g. (n, 1), still form a Series.
        labels = np.asarray(data['labels']).ravel()

        print("Discovering and processing features...")
        all_features_list = _collect_feature_frames(data)
    finally:
        # Release the underlying file handle. np.load returns a plain ndarray
        # for .npy input, which has nothing to close.
        if hasattr(data, 'close'):
            data.close()

    if not all_features_list:
        print("Error: No plottable features were found in the file.")
        return

    # --- 3. Data Combination, Standardization, and Preparation ---
    print("Combining, standardizing, and preparing data for plotting...")

    # Combine all discovered feature DataFrames
    features_df = pd.concat(all_features_list, axis=1)

    # Values other than 0/1 map to NaN and are removed by dropna() below.
    labels_series = pd.Series(labels, name='Condition').map({0: 'Healthy Control', 1: 'Parkinson Patient'})

    # Standardize the features so they share one y-axis scale
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features_df)
    scaled_df = pd.DataFrame(scaled_features, columns=features_df.columns)

    # Combine scaled features with labels for plotting
    df_plot = pd.concat([scaled_df, labels_series], axis=1).dropna()

    if df_plot.empty:
        print("Error: No valid data remains after cleaning. Cannot generate plots.")
        return

    # Melt the dataframe to a long format suitable for seaborn
    df_melted = df_plot.melt(id_vars='Condition', var_name='Feature', value_name='Value')

    # --- 4. Plotting ---
    print("Generating a single plot with a common x-axis...")

    custom_palette = {'Healthy Control': 'blue', 'Parkinson Patient': 'red'}

    # Adjust figure width based on the number of features
    num_features = len(df_plot.columns) - 1
    fig_width = max(15, num_features * 0.8)
    fig = plt.figure(figsize=(fig_width, 9))

    try:
        sns.boxplot(
            data=df_melted,
            x='Feature',
            y='Value',
            hue='Condition',
            palette=custom_palette
        )

        plt.title('Comparison of Standardized Features', fontsize=20, pad=15)
        plt.xlabel('Feature', fontsize=14)
        plt.ylabel('Standardized Value (Z-score)', fontsize=14)
        plt.xticks(rotation=45, ha="right", fontsize=10)
        plt.yticks(fontsize=10)
        plt.legend(title='Condition', fontsize=12)
        plt.grid(axis='y', linestyle='--', linewidth=0.5)
        plt.tight_layout()

        # Ensure the output directory exists
        os.makedirs(OUTPUT_PATH, exist_ok=True)

        # Build the file name from the archive name, stripping only a trailing
        # '.npz' extension (not a '.npz' occurring elsewhere in the name).
        base_name = os.path.basename(file_path)
        if base_name.endswith('.npz'):
            base_name = base_name[:-len('.npz')]
        plot_filename = f"combined_features_boxplot_{base_name}.png"
        save_path = os.path.join(OUTPUT_PATH, plot_filename)
        plt.savefig(save_path)
        print(f"\nSuccessfully saved plot to '{save_path}'")
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)


if __name__ == '__main__':

    # Dataset identifiers.
    ITALIAN_DATASET = "ITALIAN_DATASET"
    UAMS_DATASET = "UAMS_DATASET"
    NEUROVOZ_DATASET = "NEUROVOZ_DATASET"
    MPOWER_DATASET = "MPOWER_DATASET"
    SYNTHETIC_DATASET = "SYNTHETIC_DATASET"

    # Selection modes.
    MODE_ALL_VALIDS = "ALL_VALIDS"
    MODE_A = "A"

    # Feature-set modes.
    FEATURE_MODE_BASIC = "BASIC"        # mel_spectrogram, mfcc, spectrogram
    FEATURE_MODE_ALL = "ALL"            # basic + fsc
    FEATURE_MODE_ACOUSTIC = "ACOUSTIC"  # acoustic_features only
    FEATURE_MODE_DEFAULT = "DEFAULT"

    # Active configuration for this run.
    DATASET = ITALIAN_DATASET
    MODE = MODE_ALL_VALIDS
    FEATURE_MODE = FEATURE_MODE_ACOUSTIC

    # Folder (relative to the working directory) holding each dataset.
    # An unknown dataset identifier falls back to an empty folder name.
    DATASET_FOLDERS = {
        ITALIAN_DATASET: "Italian",
        UAMS_DATASET: "UAMS",
        NEUROVOZ_DATASET: "Neurovoz",
        MPOWER_DATASET: "mPower",
        SYNTHETIC_DATASET: "Synthetic",
    }
    dataset_folder_name = DATASET_FOLDERS.get(DATASET, "")

    # The input archive and the results folder share the dataset root and
    # the "<MODE>_<FEATURE_MODE>" tag.
    dataset_root = os.path.join(os.getcwd(), dataset_folder_name)
    run_tag = f"{MODE}_{FEATURE_MODE}"
    FEATURE_FILE_PATH = os.path.join(dataset_root, "data", f"features_{run_tag}.npz")
    OUTPUT_PATH = os.path.join(dataset_root, f"results_{run_tag}")

    plot_boxplots_from_npz(FEATURE_FILE_PATH, OUTPUT_PATH)



✅ Loading data from 'features_ALL_VALIDS_ALL.npz'...
🔍 Discovering and processing features...
   -> Found matrix feature ('spectrogram'). Calculating mean per sample.
   -> Found matrix feature ('mel_spectrogram'). Calculating mean per sample.
   -> Found matrix feature ('fsc'). Calculating mean per sample.
   -> Found matrix feature ('mfcc'). Calculating mean per sample.
📊 Combining, standardizing, and preparing data for plotting...
🎨 Generating a single plot with a common x-axis...

🎉 Successfully saved plot to 'feature_box_plots\combined_features_boxplot_features_ALL_VALIDS_ALL.png'
