In [None]:
import os
import gc
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# =============================================================================
# --- Configuration ---
# =============================================================================

# Class definitions. The position of a class in CLASSES is the integer label
# the plotting functions compare against (e.g. CLASSES[int(label)]).
HEALTHY_CLASS: str = "healthy_control"
PARKINSON_CLASS: str = "parkinson_patient"
CLASSES: List[str] = [HEALTHY_CLASS, PARKINSON_CLASS]

# Dataset identifiers
ITALIAN_DATASET = "ITALIAN_DATASET"
UAMS_DATASET = "UAMS_DATASET"
NEUROVOZ_DATASET = "NEUROVOZ_DATASET"
MPOWER_DATASET = "MPOWER_DATASET"
SYNTHETIC_DATASET = "SYNTHETIC_DATASET"

# Folder (relative to the current working directory) that holds each dataset.
DATASET_FOLDER_NAMES = {
    ITALIAN_DATASET: "Italian",
    UAMS_DATASET: "UAMS",
    NEUROVOZ_DATASET: "Neurovoz",
    MPOWER_DATASET: "mPower",
    SYNTHETIC_DATASET: "Synthetic",
}

# Recording modes; the value is embedded in the feature file name.
MODE_A = "A"
MODE_ALL_VALIDS = "ALL_VALIDS"

# NOTE(review): the comment on BASIC lists spectrogram, but get_feature_keys
# returns only mel_spectrogram and mfcc for "BASIC" — confirm which is intended.
FEATURE_MODE_BASIC = "BASIC"        # mel_spectrogram, mfcc, spectrogram
FEATURE_MODE_ALL = "ALL"            # basic + fsc
FEATURE_MODE_ACOUSTIC = "ACOUSTIC"  # acoustic_features only
FEATURE_MODE_COMBINED = "COMBINED"  # all spectral + acoustic features

# --- SELECT YOUR CONFIGURATION HERE ---
DATASET = UAMS_DATASET
MODE = MODE_ALL_VALIDS
FEATURE_MODE = FEATURE_MODE_ALL
FOLDER_NAME = "features_characteristics"
# ------------------------------------

# Path Setup
# Fail fast on an unknown DATASET instead of leaving dataset_folder_name
# undefined (or, in a notebook session, silently reusing a stale value).
if DATASET not in DATASET_FOLDER_NAMES:
    raise ValueError(f"Unknown DATASET: {DATASET}")
dataset_folder_name = DATASET_FOLDER_NAMES[DATASET]

# Input archive: <cwd>/<dataset folder>/data/features_<MODE>_<FEATURE_MODE>.npz
FEATURES_FILE: str = os.path.join(os.getcwd(), dataset_folder_name, "data", f"features_{MODE}_{FEATURE_MODE}.npz")
# Output folder for every figure; created on import/run if it does not exist.
RESULTS_OUTPUT_PATH: str = os.path.join(os.getcwd(), dataset_folder_name, f"results_{MODE}_{FEATURE_MODE}", FOLDER_NAME)
os.makedirs(RESULTS_OUTPUT_PATH, exist_ok=True)


# =============================================================================
# --- Data Loading and Preparation ---
# =============================================================================

def get_feature_keys(feature_mode: str) -> List[str]:
    """Returns the list of audio feature keys based on the selected mode.

    Supports every FEATURE_MODE_* value declared in the configuration section
    ("BASIC", "ALL", "ACOUSTIC", "COMBINED").

    Args:
        feature_mode: One of the feature-mode strings listed above.

    Returns:
        A new list with the feature-array names for that mode.

    Raises:
        ValueError: If ``feature_mode`` is not a known mode.
    """
    keys_by_mode = {
        "BASIC": ['mel_spectrogram', 'mfcc'],
        "ALL": ['spectrogram', 'mel_spectrogram', 'mfcc', 'fsc'],
        "ACOUSTIC": ['acoustic_features'],
        "COMBINED": ['spectrogram', 'mel_spectrogram', 'mfcc', 'fsc', 'acoustic_features'],
    }
    try:
        return keys_by_mode[feature_mode]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs, which are just as "unknown".
        raise ValueError(f"Unknown FEATURE_MODE: {feature_mode}") from None

def load_features(features_file: str) -> Dict[str, np.ndarray]:
    """Read every array stored in an .npz archive into a plain dictionary.

    Args:
        features_file: Path of the .npz file to read.

    Returns:
        A dict mapping each array name in the archive to its loaded array.

    Raises:
        FileNotFoundError: If ``features_file`` does not exist.
    """
    print(f"Loading features from {features_file}")
    if not os.path.exists(features_file):
        raise FileNotFoundError(f"Features file not found: {features_file}")

    # Materialise the arrays while the archive is open; it is closed on exit.
    features = {}
    with np.load(features_file) as archive:
        for name in archive.files:
            features[name] = archive[name]

    print("Loaded data shapes:")
    for name in features:
        print(f"  - {name}: {features[name].shape}")
    return features

def prepare_and_clean_features(
    features_dict: Dict[str, np.ndarray],
    feature_mode: str
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, bool]:
    """
    Prepares and cleans feature data for visualization using the full dataset.

    For every sample, the arrays named by get_feature_keys(feature_mode) are
    flattened and concatenated into one feature vector. When both 'age' and
    'sex' are present, samples with a missing age or sex are dropped.

    Args:
        features_dict: Loaded arrays; must contain 'labels' and every feature
            key of the selected mode, and may contain 'age' and 'sex'.
        feature_mode: Feature mode passed to get_feature_keys.

    Returns:
        (X, labels, ages, sexes, has_demographics). Without demographic data,
        ``ages`` and ``sexes`` are float arrays filled with NaN.
    """
    print(f"\n--- Preparing and Cleaning Features for Mode: {feature_mode} ---")
    feature_keys = get_feature_keys(feature_mode)
    labels = features_dict['labels']

    has_demographics = 'age' in features_dict and 'sex' in features_dict
    if has_demographics:
        ages = features_dict['age']
        sexes = features_dict['sex']
        print("Found 'age' and 'sex' data.")
    else:
        # Use an explicit float dtype: np.full_like would inherit the dtype of
        # `labels`, and an integer array cannot represent NaN.
        ages = np.full(np.shape(labels), np.nan, dtype=float)
        sexes = np.full(np.shape(labels), np.nan, dtype=float)

    # All samples are used (no subsampling).
    n_samples = len(labels)
    print(f"Using all {n_samples} samples for analysis. Note: This may be slow.")
    indices = np.arange(n_samples)

    labels_sub, ages_sub, sexes_sub = labels[indices], ages[indices], sexes[indices]
    # One row per sample: each feature array is flattened, then joined end to end.
    X_sub = np.array([
        np.concatenate([features_dict[key][idx].flatten() for key in feature_keys])
        for idx in indices
    ])

    # Clean data by removing rows with NaN in age or sex
    if has_demographics:
        nan_mask = pd.isna(ages_sub) | pd.isna(sexes_sub)
        num_removed = np.sum(nan_mask)
        if num_removed > 0:
            print(f"Removing {num_removed} samples with missing age/sex data for clean visualization.")
            clean_mask = ~nan_mask
            X_sub = X_sub[clean_mask]
            labels_sub = labels_sub[clean_mask]
            ages_sub = ages_sub[clean_mask]
            sexes_sub = sexes_sub[clean_mask]

    print(f"Final feature matrix shape for visualization: {X_sub.shape}")
    return X_sub, labels_sub, ages_sub, sexes_sub, has_demographics


# =============================================================================
# --- Visualization Functions ---
# =============================================================================

def plot_comprehensive_analysis(X: np.ndarray, y: np.ndarray, age: np.ndarray, output_path: str, min_age: float, max_age: float):
    """
    Creates a comprehensive 3x2 plot including PCA variance curves,
    class-based projections, and an age-colored t-SNE projection.

    Layout: row 1 shows per-component and cumulative PCA variance, row 2 the
    PCA and t-SNE projections colored by class, row 3 the LDA density per class
    and the t-SNE projection colored by age. The figure is written to
    ``comprehensive_analysis.png`` inside ``output_path``.

    Args:
        X: Feature matrix, one row per sample.
        y: Integer class label per sample (index into CLASSES).
        age: Age per sample, used for the color scale.
        output_path: Directory the PNG is saved to.
        min_age: Lower bound of the age color scale.
        max_age: Upper bound of the age color scale.

    Returns:
        None. Returns early, without plotting, when fewer than 2 samples are given.
    """
    print("\n--- Creating Comprehensive Analysis Plot ---")
    if X.shape[0] < 2:
        return

    X_scaled = StandardScaler().fit_transform(X)
    n_samples = X_scaled.shape[0]
    class_names = [cls.replace('_', ' ').title() for cls in CLASSES]
    colors = ['#2E86C1', '#E74C3C']

    fig, axes = plt.subplots(3, 2, figsize=(16, 21))
    fig.suptitle('Comprehensive Feature Analysis', fontsize=20)

    # --- Row 1: PCA Variance Analysis ---
    print("Computing PCA for variance curves...")
    n_curve_comps = min(50, X_scaled.shape[1], n_samples - 1)
    pca_variance = PCA(n_components=n_curve_comps, random_state=42)
    pca_variance.fit(X_scaled)
    component_numbers = range(1, n_curve_comps + 1)

    ax = axes[0, 0]
    ax.plot(component_numbers, pca_variance.explained_variance_ratio_, 'bo-', markersize=4)
    ax.set(title='PCA: Explained Variance per Component', xlabel='Principal Component', ylabel='Explained Variance Ratio')
    ax.grid(True, alpha=0.5, linestyle=':')

    cumulative_variance = np.cumsum(pca_variance.explained_variance_ratio_)
    ax = axes[0, 1]
    ax.plot(component_numbers, cumulative_variance, 'ro-', markersize=4)
    ax.axhline(y=0.95, color='k', linestyle='--', alpha=0.8, label='95% Variance')
    ax.set(title='PCA: Cumulative Explained Variance', xlabel='Number of Components', ylabel='Cumulative Variance')
    ax.legend()
    ax.grid(True, alpha=0.5, linestyle=':')

    # --- Row 2: Projections by Class ---
    pca_95 = PCA(n_components=0.95, random_state=42)
    X_pca = pca_95.fit_transform(X_scaled)
    if pca_95.n_components_ < 2:
        # A single component cannot be drawn as a 2-D scatter (PC2 would not
        # exist), so refit with exactly two components.
        print(f"  --> Forcing PCA to 2 components (95% rule gave {pca_95.n_components_}).")
        pca_95 = PCA(n_components=2, random_state=42)
        X_pca = pca_95.fit_transform(X_scaled)

    ax = axes[1, 0]
    for i, name in enumerate(class_names):
        in_class = y == i
        ax.scatter(X_pca[in_class, 0], X_pca[in_class, 1], c=colors[i], label=f'{name} (n={np.sum(in_class)})', alpha=0.7, s=25)
    ax.set(
        title=f'PCA Projection (First 2 of {pca_95.n_components_} Components)',
        xlabel=f'PC1 ({pca_95.explained_variance_ratio_[0]:.2%})',
        ylabel=f'PC2 ({pca_95.explained_variance_ratio_[1]:.2%})',
    )
    ax.legend()
    ax.grid(True, alpha=0.5, linestyle=':')

    print("Computing t-SNE...")
    # t-SNE requires perplexity < n_samples, hence the extra n_samples - 1 cap.
    perplexity = min(30, max(5, (n_samples // 5) - 1), n_samples - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, init='pca', learning_rate='auto')
    X_tsne = tsne.fit_transform(X_scaled)

    ax = axes[1, 1]
    for i, name in enumerate(class_names):
        in_class = y == i
        ax.scatter(X_tsne[in_class, 0], X_tsne[in_class, 1], c=colors[i], label=f'{name} (n={np.sum(in_class)})', alpha=0.7, s=25)
    ax.set(title='t-SNE Projection by Class', xlabel='t-SNE Dimension 1', ylabel='t-SNE Dimension 2')
    ax.legend()
    ax.grid(True, alpha=0.5, linestyle=':')

    # --- Row 3: LDA and t-SNE by Age ---
    lda = LinearDiscriminantAnalysis(n_components=1)
    X_lda = lda.fit_transform(X_scaled, y)
    ax = axes[2, 0]
    for i, name in enumerate(class_names):
        in_class = y == i
        sns.kdeplot(X_lda[in_class].ravel(), ax=ax, color=colors[i], label=f'{name} (n={np.sum(in_class)})', fill=True, alpha=0.5)
    ax.set(title='LDA Projection', xlabel='LD1', ylabel='Density')
    ax.legend()
    ax.grid(True, alpha=0.5, linestyle=':')

    ax = axes[2, 1]
    scatter = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=age, cmap='plasma', s=25, alpha=0.8, vmin=min_age, vmax=max_age)
    fig.colorbar(scatter, ax=ax, label='Age')
    ax.set(title='t-SNE Projection by Age', xlabel='t-SNE Dimension 1', ylabel='t-SNE Dimension 2')
    ax.grid(True, alpha=0.5, linestyle=':')

    # Leave room at the top for the suptitle.
    fig.tight_layout(rect=[0, 0.03, 1, 0.97])
    save_path = os.path.join(output_path, "comprehensive_analysis.png")
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved comprehensive analysis plots to '{os.path.basename(save_path)}'")

def plot_demographic_analysis(X: np.ndarray, y: np.ndarray, age: np.ndarray, sex: np.ndarray, output_path: str, min_age: float, max_age: float):
    """
    Creates a 2x2 plot showing demographic distributions and their relation to features,
    with age plots stacked and sex plots stacked.

    Left column: age distribution per class (top) and a 2-component PCA of the
    features colored by age (bottom). Right column: sex counts per class (top)
    and the same PCA colored by sex (bottom). The figure is written to
    ``demographic_analysis.png`` inside ``output_path``.

    Args:
        X: Feature matrix, one row per sample.
        y: Integer class label per sample (index into CLASSES).
        age: Age per sample.
        sex: Sex code per sample; 0 is labelled 'Female', anything else 'Male'.
        output_path: Directory the PNG is saved to.
        min_age: Lower bound of the age color scale.
        max_age: Upper bound of the age color scale.

    Returns:
        None. Returns early, without plotting, when fewer than 2 samples are given.
    """
    print("\n--- Creating Demographic Analysis Plots (Stacked) ---")
    if X.shape[0] < 2:
        return

    sex_labels = ['Female' if s == 0 else 'Male' for s in sex]
    df = pd.DataFrame({'Age': age, 'Sex': sex_labels, 'Class': [CLASSES[int(label)].replace('_', ' ').title() for label in y]})

    fig, axes = plt.subplots(2, 2, figsize=(16, 18))  # Increased height for better stacking
    fig.suptitle('Demographic and Feature Analysis', fontsize=20, y=1.02)
    class_colors = {'Healthy Control': '#2E86C1', 'Parkinson Patient': '#E74C3C'}
    sex_colors = {'Female': '#DB7093', 'Male': '#4682B4'}

    X_scaled = StandardScaler().fit_transform(X)
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X_scaled)
    pc1_label = f'PC1 ({pca.explained_variance_ratio_[0]:.2%})'
    pc2_label = f'PC2 ({pca.explained_variance_ratio_[1]:.2%})'

    # --- Column 1: Age Plots ---
    # 1. Age Distribution by Class (Top-Left)
    sns.violinplot(ax=axes[0, 0], data=df, x='Class', y='Age', hue='Class', palette=class_colors, legend=False)
    axes[0, 0].set_title('Age Distribution by Class')
    axes[0, 0].grid(True, alpha=0.5, linestyle=':')

    # 2. PCA of Audio Features, Colored by Age (Bottom-Left)
    # Title, labels and grid go on the same axes as the scatter they describe.
    ax_age = axes[1, 0]
    scatter_age = ax_age.scatter(X_pca[:, 0], X_pca[:, 1], c=age, cmap='plasma', s=25, alpha=0.8, vmin=min_age, vmax=max_age)
    fig.colorbar(scatter_age, ax=ax_age, label='Age')
    ax_age.set_title('PCA of Audio Features, Colored by Age')
    ax_age.set_xlabel(pc1_label)
    ax_age.set_ylabel(pc2_label)
    ax_age.grid(True, alpha=0.5, linestyle=':')

    # --- Column 2: Sex Plots ---
    # 3. Sex Distribution by Class (Top-Right)
    sns.countplot(ax=axes[0, 1], data=df, x='Sex', hue='Class', palette=class_colors, order=['Female', 'Male'])
    axes[0, 1].set_title('Sex Distribution by Class')
    axes[0, 1].set_ylabel('Count')

    # 4. PCA of Audio Features, Colored by Sex (Bottom-Right)
    ax_sex = axes[1, 1]
    sns.scatterplot(ax=ax_sex, x=X_pca[:, 0], y=X_pca[:, 1], hue=df['Sex'], palette=sex_colors, s=25, alpha=0.8)
    ax_sex.set_title('PCA of Audio Features, Colored by Sex')
    ax_sex.set_xlabel(pc1_label)
    ax_sex.set_ylabel(pc2_label)
    ax_sex.grid(True, alpha=0.5, linestyle=':')

    # Adjust layout to prevent labels from overlapping
    fig.tight_layout(rect=[0, 0, 1, 0.98])
    save_path = os.path.join(output_path, "demographic_analysis.png")
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved demographic analysis to '{os.path.basename(save_path)}'")

def plot_individual_feature_separation(features_dict: Dict[str, np.ndarray], output_path: str, feature_mode: str):
    """
    Creates separate PCA, t-SNE, and LDA plots for each audio feature type.
    Forces PCA to 2 components if the 95% variance rule results in fewer than 2.

    The figure has one column per key returned by get_feature_keys(feature_mode)
    and three rows (PCA scatter, t-SNE scatter, LDA density), colored by class.
    It is written to ``individual_feature_analysis.png`` inside ``output_path``.

    Args:
        features_dict: Loaded arrays; must contain 'labels' and every feature key.
        output_path: Directory the PNG is saved to.
        feature_mode: Feature mode passed to get_feature_keys.
    """
    print("\n--- Creating Individual Audio Feature Analysis ---")
    feature_keys = get_feature_keys(feature_mode)
    titles = {'spectrogram': 'Spectrogram', 'mel_spectrogram': 'Mel Spectrogram', 'mfcc': 'MFCC', 'fsc': 'Spectral Centroid'}
    labels, n_samples = features_dict['labels'], len(features_dict['labels'])

    # All samples are used (no subsampling).
    indices = np.arange(n_samples)
    labels_sub = labels[indices]
    class_names, colors = ["Healthy Control", "Parkinson Patient"], ['#2E86C1', '#E74C3C']

    fig, axes = plt.subplots(3, len(feature_keys), figsize=(7 * len(feature_keys), 15), squeeze=False, constrained_layout=True)
    fig.suptitle('Individual Audio Feature Analysis', fontsize=16)

    for col, feat_key in enumerate(feature_keys):
        feat_title = titles.get(feat_key, feat_key)
        print(f"  Processing {feat_title}...")
        X = np.array([features_dict[feat_key][i].flatten() for i in indices])
        X_scaled = StandardScaler().fit_transform(X)

        # --- PCA ---
        # Fit the 95%-variance PCA once and keep its projection; refit only in
        # the rare case where it yields fewer than the 2 components the scatter
        # plot needs.
        pca = PCA(n_components=0.95, random_state=42)
        X_pca = pca.fit_transform(X_scaled)
        if pca.n_components_ < 2:
            print(f"  --> Forcing PCA to 2 components for '{feat_title}' (95% rule gave {pca.n_components_}).")
            pca = PCA(n_components=2, random_state=42)
            X_pca = pca.fit_transform(X_scaled)

        ax = axes[0, col]
        for i, name in enumerate(class_names):
            in_class = labels_sub == i
            ax.scatter(X_pca[in_class, 0], X_pca[in_class, 1], c=colors[i], label=name, alpha=0.7, s=15)

        # Title reports how many components were kept and the variance they cover.
        total_var_explained = np.sum(pca.explained_variance_ratio_)
        title = f'PCA: {feat_title}\n({pca.n_components_} comps for {total_var_explained:.1%} var.)'
        ax.set(title=title, xlabel=f'PC1 ({pca.explained_variance_ratio_[0]:.2%})', ylabel=f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')
        ax.legend()
        ax.grid(True, alpha=0.5, linestyle=':')

        # --- t-SNE ---
        # t-SNE requires perplexity < n_samples, hence the extra n - 1 cap.
        n_rows = X_scaled.shape[0]
        perplexity = min(30, max(5, (n_rows // 5) - 1), max(1, n_rows - 1))
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, init='pca', learning_rate='auto')
        X_tsne = tsne.fit_transform(X_scaled)
        ax = axes[1, col]
        for i, name in enumerate(class_names):
            in_class = labels_sub == i
            ax.scatter(X_tsne[in_class, 0], X_tsne[in_class, 1], c=colors[i], label=name, alpha=0.7, s=15)
        ax.set(title=f't-SNE: {feat_title}', xlabel='t-SNE Dim 1', ylabel='t-SNE Dim 2')
        ax.legend()
        ax.grid(True, alpha=0.5, linestyle=':')

        # --- LDA ---
        lda = LinearDiscriminantAnalysis(n_components=1)
        X_lda = lda.fit_transform(X_scaled, labels_sub)
        ax = axes[2, col]
        for i, name in enumerate(class_names):
            sns.kdeplot(X_lda[labels_sub == i].ravel(), ax=ax, color=colors[i], label=name, fill=True, alpha=0.6)
        ax.set(title=f'LDA: {feat_title}', xlabel='LD1', ylabel='Density')
        ax.legend()
        ax.grid(True, alpha=0.5, linestyle=':')

        # Drop the per-feature matrices before building the next column.
        del X, X_scaled
        gc.collect()

    save_path = os.path.join(output_path, "individual_feature_analysis.png")
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved individual feature analysis to '{os.path.basename(save_path)}'")

def plot_individual_features_by_age(
    features_dict: Dict[str, np.ndarray],
    output_path: str,
    feature_mode: str,
    min_age: float,
    max_age: float
):
    """
    Creates separate PCA and t-SNE plots for each audio feature, colored by age.

    Samples whose age is missing are excluded. The figure has one column per
    key returned by get_feature_keys(feature_mode) and two rows (PCA, t-SNE),
    shares a single age colorbar, and is written to
    ``individual_feature_analysis_by_age.png`` inside ``output_path``.

    Args:
        features_dict: Loaded arrays; must contain 'labels', 'age' and every
            feature key of the selected mode.
        output_path: Directory the PNG is saved to.
        feature_mode: Feature mode passed to get_feature_keys.
        min_age: Lower bound of the age color scale.
        max_age: Upper bound of the age color scale.

    Returns:
        None. Skips plotting when fewer than 2 samples have a valid age.
    """
    print("\n--- Creating Individual Audio Feature Analysis (Colored by Age) ---")
    feature_keys = get_feature_keys(feature_mode)
    titles = {'spectrogram': 'Spectrogram', 'mel_spectrogram': 'Mel Spectrogram', 'mfcc': 'MFCC', 'fsc': 'Spectral Centroid'}

    # All samples are considered (no subsampling); only rows without an age are dropped.
    n_samples = len(features_dict['labels'])
    indices = np.arange(n_samples)

    ages_sub = features_dict['age'][indices]
    clean_mask = ~pd.isna(ages_sub)
    indices_clean = indices[clean_mask]
    ages_clean = ages_sub[clean_mask]

    if len(indices_clean) < 2:
        print("Not enough data with valid age information to create plots. Skipping.")
        return

    fig, axes = plt.subplots(
        2, len(feature_keys),
        figsize=(7 * len(feature_keys), 10),
        squeeze=False,
        constrained_layout=True
    )
    fig.suptitle('Individual Audio Feature Analysis by Age', fontsize=16)

    # First age-colored scatter; reused as the source of the shared colorbar.
    mappable = None

    for col, feat_key in enumerate(feature_keys):
        feat_title = titles.get(feat_key, feat_key)
        print(f"  Processing {feat_title}...")

        X = np.array([features_dict[feat_key][i].flatten() for i in indices_clean])
        X_scaled = StandardScaler().fit_transform(X)

        # --- PCA colored by Age ---
        # Fit the 95%-variance PCA once and keep its projection; refit only in
        # the rare case where it yields fewer than the 2 components the scatter
        # plot needs.
        pca = PCA(n_components=0.95, random_state=42)
        X_pca = pca.fit_transform(X_scaled)
        if pca.n_components_ < 2:
            print(f"  --> Forcing PCA to 2 components for '{feat_title}' (95% rule gave {pca.n_components_}).")
            pca = PCA(n_components=2, random_state=42)
            X_pca = pca.fit_transform(X_scaled)

        ax = axes[0, col]
        scatter1 = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=ages_clean, cmap='plasma', vmin=min_age, vmax=max_age, alpha=0.8, s=15)
        ax.set(title=f'PCA: {feat_title}', xlabel=f'PC1 ({pca.explained_variance_ratio_[0]:.2%})', ylabel=f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')
        ax.grid(True, alpha=0.5, linestyle=':')
        if mappable is None:
            mappable = scatter1

        # --- t-SNE colored by Age ---
        # t-SNE requires perplexity < n_samples, hence the extra n - 1 cap.
        n_rows = X_scaled.shape[0]
        perplexity = min(30, max(5, (n_rows // 5) - 1), n_rows - 1)
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, init='pca', learning_rate='auto')
        X_tsne = tsne.fit_transform(X_scaled)
        ax = axes[1, col]
        ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=ages_clean, cmap='plasma', vmin=min_age, vmax=max_age, alpha=0.8, s=15)
        ax.set(title=f't-SNE: {feat_title}', xlabel='t-SNE Dim 1', ylabel='t-SNE Dim 2')
        ax.grid(True, alpha=0.5, linestyle=':')

        # Drop the per-feature matrices before building the next column.
        del X, X_scaled
        gc.collect()

    # Every panel uses the same cmap/vmin/vmax, so one colorbar serves them all.
    fig.colorbar(mappable, ax=axes.ravel().tolist(), label='Age', pad=0.01, aspect=30)

    save_path = os.path.join(output_path, "individual_feature_analysis_by_age.png")
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved individual feature analysis by age to '{os.path.basename(save_path)}'")

# =============================================================================
# --- Main Execution ---
# =============================================================================
def main():
    """Run the full workflow: load features, prepare them, and write every plot.

    Demographic plots are produced only when prepare_and_clean_features reports
    that age and sex data are available. Errors are printed rather than raised,
    and the closing banner is printed in every case.
    """
    banner = "=" * 50
    print(banner)
    print("=== Feature & Demographic Analysis ===")
    print(banner)
    try:
        features = load_features(FEATURES_FILE)

        # Age bounds over the whole file, so all age color scales agree.
        if 'age' in features:
            global_min_age = np.nanmin(features['age'])
            global_max_age = np.nanmax(features['age'])
        else:
            global_min_age = global_max_age = None

        X, y, age, sex, has_demographics = prepare_and_clean_features(features, FEATURE_MODE)

        plot_individual_feature_separation(features, RESULTS_OUTPUT_PATH, FEATURE_MODE)

        if not has_demographics:
            print("\nDemographic data not found. Skipping demographic-related plots.")
        else:
            plot_comprehensive_analysis(X, y, age, RESULTS_OUTPUT_PATH, global_min_age, global_max_age)
            plot_demographic_analysis(X, y, age, sex, RESULTS_OUTPUT_PATH, global_min_age, global_max_age)
            plot_individual_features_by_age(features, RESULTS_OUTPUT_PATH, FEATURE_MODE, global_min_age, global_max_age)

    except FileNotFoundError as missing:
        print(f"\nERROR: {missing}")
    except Exception as err:
        print(f"\nAn unexpected error occurred: {err}")
        import traceback
        traceback.print_exc()
    finally:
        # Printed whether or not the run succeeded.
        print("\n" + banner)
        print("=== Analysis Complete ===")
        print(f"All generated files are in: {RESULTS_OUTPUT_PATH}")
        print(banner)


# Run the workflow when this file/cell is executed as the main program.
if __name__ == "__main__":
    main()

In [None]:
import os
import numpy as np

# =============================================================================
# --- Configuration ---
# Instructions: Set these variables to match the .npz file you want to inspect.
# NOTE: NEUROVOZ_DATASET is not defined in this cell; it must already exist in
# the session. These assignments also rebind the DATASET / MODE / FEATURE_MODE
# globals used by the analysis cell.
# =============================================================================
DATASET = NEUROVOZ_DATASET
MODE = "A"            # fills the {mode} part of features_{mode}_{feature_mode}.npz
FEATURE_MODE = "ALL"  # fills the {feature_mode} part of the same file name

# =============================================================================
# --- Inspection Script ---
# =============================================================================
def inspect_feature_file(dataset, mode, feature_mode):
    """
    Loads and inspects the contents of a .npz feature file.

    The file is looked up at
    ``<cwd>/<dataset folder>/data/features_<mode>_<feature_mode>.npz``. For
    every stored array the name, shape and dtype are printed; 1-D arrays also
    show their first five values. Problems are reported on stdout, not raised.

    Args:
        dataset: Dataset identifier, e.g. "ITALIAN_DATASET" or "UAMS_DATASET".
        mode: Mode part of the file name.
        feature_mode: Feature-mode part of the file name.
    """
    # Every dataset has its own folder; previously anything that was not the
    # Italian dataset was silently looked up under "Neurovoz".
    folder_by_dataset = {
        "ITALIAN_DATASET": "Italian",
        "UAMS_DATASET": "UAMS",
        "NEUROVOZ_DATASET": "Neurovoz",
        "MPOWER_DATASET": "mPower",
        "SYNTHETIC_DATASET": "Synthetic",
    }
    dataset_folder_name = folder_by_dataset.get(dataset)
    if dataset_folder_name is None:
        print(f"❌ ERROR: Unknown dataset: {dataset!r}")
        print("Please check that the configuration variables at the top of the script are correct.")
        return

    try:
        # Build the file path based on your project structure
        features_file = os.path.join(os.getcwd(), dataset_folder_name, "data", f"features_{mode}_{feature_mode}.npz")

        print(f"Inspecting file: {features_file}\n")

        # Check if the file exists
        if not os.path.exists(features_file):
            raise FileNotFoundError("The specified file was not found.")

        # Load the .npz file
        with np.load(features_file) as data:
            print("✅ File loaded successfully. Here are its contents:\n")

            # Get the list of arrays stored in the file
            array_keys = list(data.keys())
            print(f"Stored arrays: {array_keys}\n")

            # Print details for each array
            for key in array_keys:
                array = data[key]
                print("-" * 40)
                print(f"Array Name: '{key}'")
                print(f"  - Shape: {array.shape}")
                print(f"  - Data Type: {array.dtype}")

                # Show the first 5 elements for 1D arrays, or a note for multi-dimensional ones
                if array.ndim == 1:
                    print(f"  - First 5 values: {array[:5]}")
                else:
                    print("  - (Multi-dimensional array, showing shape only)")

    except FileNotFoundError as e:
        print(f"❌ ERROR: {e}")
        print("Please check that the configuration variables at the top of the script are correct.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Run the inspection
if __name__ == "__main__":
    inspect_feature_file(DATASET, MODE, FEATURE_MODE)

In [None]:
import os
import numpy as np

# =============================================================================
# --- Configuration ---
# Instructions: Set these variables to match the .npz file you want to inspect.
# NOTE: NEUROVOZ_DATASET is not defined in this cell; it must already exist in
# the session. These assignments also rebind the DATASET / MODE / FEATURE_MODE
# globals used by the analysis cell.
# =============================================================================
DATASET: str = NEUROVOZ_DATASET
MODE: str = "A"            # fills the {mode} part of features_{mode}_{feature_mode}.npz
FEATURE_MODE: str = "ALL"  # fills the {feature_mode} part of the same file name

# =============================================================================
# --- Analysis Script ---
# =============================================================================

def find_age_range(dataset: str, mode: str, feature_mode: str):
    """
    Loads a feature file and calculates the minimum and maximum age.

    The file is looked up at
    ``<cwd>/<dataset folder>/data/features_<mode>_<feature_mode>.npz`` and must
    contain an 'age' array. NaN entries are ignored. The result and any problem
    are printed to stdout; nothing is returned or raised.

    Args:
        dataset: Dataset identifier, e.g. "ITALIAN_DATASET" or "UAMS_DATASET".
        mode: Mode part of the file name.
        feature_mode: Feature-mode part of the file name.
    """
    # Every dataset has its own folder; previously anything that was not the
    # Italian dataset was silently looked up under "Neurovoz".
    folder_by_dataset = {
        "ITALIAN_DATASET": "Italian",
        "UAMS_DATASET": "UAMS",
        "NEUROVOZ_DATASET": "Neurovoz",
        "MPOWER_DATASET": "mPower",
        "SYNTHETIC_DATASET": "Synthetic",
    }
    dataset_folder_name = folder_by_dataset.get(dataset)
    if dataset_folder_name is None:
        print(f"❌ ERROR: Unknown dataset: {dataset!r}")
        return

    try:
        # Build the file path from configuration
        features_file: str = os.path.join(os.getcwd(), dataset_folder_name, "data", f"features_{mode}_{feature_mode}.npz")

        print(f"Analyzing file: {features_file}\n")

        if not os.path.exists(features_file):
            raise FileNotFoundError("The specified feature file was not found.")

        # Load the data
        with np.load(features_file) as data:
            if 'age' not in data.files:
                raise KeyError("The 'age' array was not found in the feature file.")

            age_array = data['age']

            # Calculate min and max, safely ignoring any NaN values
            min_age = np.nanmin(age_array)
            max_age = np.nanmax(age_array)

            print("📊 **Age Range Analysis Complete**")
            print(f"   - Minimum Age: {min_age}")
            print(f"   - Maximum Age: {max_age}")

    except (FileNotFoundError, KeyError) as e:
        print(f"❌ ERROR: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Run the analysis
if __name__ == "__main__":
    find_age_range(DATASET, MODE, FEATURE_MODE)

In [None]:
import os
import gc
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import NeighborhoodComponentsAnalysis

In [None]:
import os
import gc
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import NeighborhoodComponentsAnalysis

In [1]:
import os
import gc
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import pdist, cdist

# =============================================================================
# --- Configuration ---
# =============================================================================

# Class definitions
HEALTHY_CLASS: str = "healthy_control"
PARKINSON_CLASS: str = "parkinson_patient"
CLASSES: List[str] = [HEALTHY_CLASS, PARKINSON_CLASS]

# Dataset identifiers
ITALIAN_DATASET = "ITALIAN_DATASET"
UAMS_DATASET = "UAMS_DATASET"
NEUROVOZ_DATASET = "NEUROVOZ_DATASET"
MPOWER_DATASET = "MPOWER_DATASET"
SYNTHETIC_DATASET = "SYNTHETIC_DATASET"

# Recording modes; the value is embedded in the feature file name.
MODE_A = "A"
MODE_ALL_VALIDS = "ALL_VALIDS"

# NOTE(review): the comment on BASIC lists spectrogram, but get_feature_keys
# returns only mel_spectrogram and mfcc for "BASIC" — confirm which is intended.
FEATURE_MODE_BASIC = "BASIC"        # mel_spectrogram, mfcc, spectrogram
FEATURE_MODE_ALL = "ALL"            # basic + fsc
FEATURE_MODE_ACOUSTIC = "ACOUSTIC"  # acoustic_features only
FEATURE_MODE_COMBINED = "COMBINED"  # all spectral + acoustic features

# --- SELECT YOUR CONFIGURATION HERE ---
DATASET = UAMS_DATASET
MODE = MODE_ALL_VALIDS
FEATURE_MODE = FEATURE_MODE_ALL
FOLDER_NAME = "features_characteristics_complete"  # Changed to reflect both NCA and non-NCA
# ------------------------------------

# Path Setup and Dataset Names
# Human-readable dataset name.
DATASET_DISPLAY_NAMES = {
    ITALIAN_DATASET: "IPVS",
    UAMS_DATASET: "UAMS",
    NEUROVOZ_DATASET: "Neurovoz_3",
    MPOWER_DATASET: "mPower",
    SYNTHETIC_DATASET: "Synthetic"
}

# Folder (relative to the current working directory) that holds each dataset.
DATASET_FOLDER_NAMES = {
    ITALIAN_DATASET: "Italian",
    UAMS_DATASET: "UAMS",
    NEUROVOZ_DATASET: "Neurovoz",
    MPOWER_DATASET: "mPower",
    SYNTHETIC_DATASET: "Synthetic",
}

# Fail fast on an unknown DATASET instead of leaving dataset_folder_name
# undefined (or, in a notebook session, silently reusing a stale value).
if DATASET not in DATASET_FOLDER_NAMES:
    raise ValueError(f"Unknown DATASET: {DATASET}")
dataset_folder_name = DATASET_FOLDER_NAMES[DATASET]

DATASET_DISPLAY_NAME = DATASET_DISPLAY_NAMES[DATASET]

# Input archive: <cwd>/<dataset folder>/data/features_<MODE>_<FEATURE_MODE>.npz
FEATURES_FILE: str = os.path.join(os.getcwd(), dataset_folder_name, "data", f"features_{MODE}_{FEATURE_MODE}.npz")
# Output folder for generated results; created on import/run if it does not exist.
RESULTS_OUTPUT_PATH: str = os.path.join(os.getcwd(), dataset_folder_name, f"results_{MODE}_{FEATURE_MODE}", FOLDER_NAME)
os.makedirs(RESULTS_OUTPUT_PATH, exist_ok=True)

# =============================================================================
# --- Metrics Calculation Functions ---
# =============================================================================

def calculate_within_class_scatter(X: np.ndarray, y: np.ndarray) -> float:
    """Mean Euclidean distance over all pairs of samples that share a class.

    Pairs from every class are pooled before averaging. Classes with a single
    sample contribute no pairs; if no pair exists at all the result is 0.0.
    """
    distance_sum = 0.0
    pair_count = 0

    for label in np.unique(y):
        members = X[y == label]
        if len(members) <= 1:
            continue  # a lone sample forms no pair

        # Condensed vector of all pairwise distances inside this class
        pair_distances = pdist(members, metric='euclidean')
        distance_sum += np.sum(pair_distances)
        pair_count += len(pair_distances)

    if pair_count > 0:
        return distance_sum / pair_count
    return 0.0

def calculate_between_class_scatter(X: np.ndarray, y: np.ndarray) -> float:
    """Euclidean distance between the centroids of the first two classes.

    Classes are ordered as returned by np.unique(y). Only the first two
    centroids are compared; with fewer than two classes the result is 0.0.
    """
    centroids = [np.mean(X[y == label], axis=0) for label in np.unique(y)]

    if len(centroids) <= 1:
        return 0.0

    stacked = np.array(centroids)
    first, second = stacked[0], stacked[1]
    return np.linalg.norm(first - second)

def calculate_separation_ratio(X: np.ndarray, y: np.ndarray) -> float:
    """Between-class scatter divided by within-class scatter.

    When the within-class scatter is not positive, the result is infinity if
    the classes are still apart (between-class scatter > 0) and 0.0 otherwise.
    """
    within = calculate_within_class_scatter(X, y)
    between = calculate_between_class_scatter(X, y)

    if within > 0:
        return between / within
    if between > 0:
        return np.inf
    return 0.0

def calculate_silhouette_coefficient(X: np.ndarray, y: np.ndarray) -> float:
    """Calculate silhouette coefficient for cluster separation quality.

    Args:
        X: Sample matrix, one row per sample.
        y: Class label per sample.

    Returns:
        The silhouette score of ``X`` under the labelling ``y``; 0.0 when there
        are fewer than two classes or samples, or when the score cannot be
        computed.
    """
    if len(np.unique(y)) < 2 or X.shape[0] < 2:
        return 0.0

    try:
        return silhouette_score(X, y)
    except Exception as e:
        # Best-effort metric: report why it failed instead of hiding it, and do
        # not intercept KeyboardInterrupt/SystemExit as a bare `except:` would.
        print(f"Warning: Could not compute silhouette score: {e}")
        return 0.0

def calculate_all_metrics(X: np.ndarray, y: np.ndarray) -> Dict[str, float]:
    """Compute every separation metric for the given samples and labels.

    Returns a dict with the keys 'within_class_scatter',
    'between_class_scatter', 'separation_ratio' and 'silhouette_score'. If any
    metric raises, a warning is printed and all four values are reported as 0.0.
    """
    try:
        metrics = {}
        metrics['within_class_scatter'] = calculate_within_class_scatter(X, y)
        metrics['between_class_scatter'] = calculate_between_class_scatter(X, y)
        metrics['separation_ratio'] = calculate_separation_ratio(X, y)
        metrics['silhouette_score'] = calculate_silhouette_coefficient(X, y)
        return metrics
    except Exception as e:
        print(f"Warning: Error calculating metrics: {e}")

    # Fallback: partial results are discarded so the dict is always complete.
    return dict.fromkeys(
        ('within_class_scatter', 'between_class_scatter', 'separation_ratio', 'silhouette_score'),
        0.0,
    )

# =============================================================================
# --- Data Loading and Preparation ---
# =============================================================================

def get_feature_keys(feature_mode: str) -> List[str]:
    """Look up the feature-array names that belong to a feature mode.

    Known modes are "BASIC", "ALL", "ACOUSTIC" and "COMBINED"; any other value
    raises ValueError.
    """
    spectral_keys = ['spectrogram', 'mel_spectrogram', 'mfcc', 'fsc']

    if feature_mode == "BASIC":
        selected = ['mel_spectrogram', 'mfcc']
    elif feature_mode == "ALL":
        selected = spectral_keys
    elif feature_mode == "ACOUSTIC":
        selected = ['acoustic_features']
    elif feature_mode == "COMBINED":
        selected = spectral_keys + ['acoustic_features']
    else:
        raise ValueError(f"Unknown FEATURE_MODE: {feature_mode}")
    return selected

def load_features(features_file: str) -> Dict[str, np.ndarray]:
    """Read every array stored in an ``.npz`` archive into a dictionary.

    Args:
        features_file: Path of the ``.npz`` file to read.

    Returns:
        Mapping from each array name in the archive to the loaded array.

    Raises:
        FileNotFoundError: If ``features_file`` does not exist.
    """
    print(f"Loading features from {features_file}")
    if not os.path.exists(features_file):
        raise FileNotFoundError(f"Features file not found: {features_file}")

    features = {}
    # Copy the arrays out while the archive is open; the context manager
    # closes the file afterwards.
    with np.load(features_file) as archive:
        for name in archive.files:
            features[name] = archive[name]

    print("Loaded data shapes:")
    for name, array in features.items():
        print(f"  - {name}: {array.shape}")
    return features

def check_demographics_validity(ages: np.ndarray, sexes: np.ndarray) -> bool:
    """Report whether both demographic arrays hold enough non-missing values.

    Prints how many entries of each array are not NaN/None and returns a truthy
    value only when both arrays contain more than 10 such entries.
    """
    age_count = np.sum(pd.notna(ages))
    sex_count = np.sum(pd.notna(sexes))

    print(f"Demographics check:")
    print(f"  - Valid age values: {age_count} / {len(ages)}")
    print(f"  - Valid sex values: {sex_count} / {len(sexes)}")

    # A handful of valid entries is not enough for a meaningful analysis,
    # hence the threshold of more than 10 values per column.
    return age_count > 10 and sex_count > 10

def prepare_and_clean_features(
    features_dict: Dict[str, np.ndarray],
    feature_mode: str
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, bool]:
    """
    Prepares and cleans feature data for visualization using the full dataset.

    Every sample's selected features are flattened and concatenated into one
    row of the feature matrix. Demographics count as available only when both
    the 'age' and 'sex' entries exist and ``check_demographics_validity``
    accepts them; in that case samples with a missing age or sex are dropped.
    Otherwise the returned age/sex arrays are all-NaN placeholders.

    Args:
        features_dict: Loaded arrays; must contain 'labels' and every key
            selected by ``feature_mode``, and may contain 'age' and 'sex'.
        feature_mode: Feature mode name passed to ``get_feature_keys``.

    Returns:
        Tuple ``(X, labels, ages, sexes, has_demographics)`` whose first four
        elements are aligned sample by sample.
    """
    print(f"\n--- Preparing and Cleaning Features for Mode: {feature_mode} ---")
    feature_keys = get_feature_keys(feature_mode)
    labels = features_dict['labels']

    def _nan_placeholder() -> np.ndarray:
        # np.full_like(labels, np.nan) would inherit the dtype of `labels`;
        # integer labels cannot represent NaN, so build an explicit float
        # array of the same shape instead.
        return np.full(np.shape(labels), np.nan, dtype=float)

    has_demographics = 'age' in features_dict and 'sex' in features_dict
    if has_demographics:
        ages = features_dict['age']
        sexes = features_dict['sex']
        print("Found 'age' and 'sex' columns in dataset.")

        # Check if demographic data is actually usable
        if not check_demographics_validity(ages, sexes):
            print("Warning: Demographic columns contain insufficient valid data. Treating as unavailable.")
            has_demographics = False
            ages, sexes = _nan_placeholder(), _nan_placeholder()
    else:
        ages, sexes = _nan_placeholder(), _nan_placeholder()
        print("No demographic data found in dataset.")

    n_samples = len(labels)
    print(f"Using all {n_samples} samples for analysis.")
    indices = np.arange(n_samples)

    labels_sub, ages_sub, sexes_sub = labels[indices], ages[indices], sexes[indices]

    def _sample_vector(idx: int) -> np.ndarray:
        # Acoustic features are used as stored (already 1D vectors);
        # spectral features are flattened before concatenation.
        parts = [
            features_dict[key][idx] if key == 'acoustic_features'
            else features_dict[key][idx].flatten()
            for key in feature_keys
        ]
        return np.concatenate(parts)

    X_sub = np.array([_sample_vector(idx) for idx in indices])

    # Clean data by removing rows with NaN in demographics (only if we have valid demographics)
    if has_demographics:
        nan_mask = pd.isna(ages_sub) | pd.isna(sexes_sub)
        num_removed = np.sum(nan_mask)
        if num_removed > 0:
            print(f"Removing {num_removed} samples with missing age/sex data for clean visualization.")
            clean_mask = ~nan_mask
            X_sub = X_sub[clean_mask]
            labels_sub = labels_sub[clean_mask]
            ages_sub = ages_sub[clean_mask]
            sexes_sub = sexes_sub[clean_mask]

    print(f"Final feature matrix shape for visualization: {X_sub.shape}")
    return X_sub, labels_sub, ages_sub, sexes_sub, has_demographics

def apply_nca_transform(X: np.ndarray, y: np.ndarray, n_components: int = 2) -> np.ndarray:
    """Project ``X`` onto ``n_components`` dimensions with supervised NCA.

    The features are standardized before Neighborhood Components Analysis is
    fitted on ``(X, y)``. If any step raises an ``Exception``, a warning is
    printed and a PCA projection of the standardized features is returned
    instead.

    Args:
        X: Feature matrix with one row per sample.
        y: Class label for each row of ``X``.
        n_components: Number of output dimensions.

    Returns:
        The transformed sample matrix.
    """
    print(f"Applying NCA transformation to {n_components} components...")
    try:
        # Standardize first so that all features share a common scale.
        standardized = StandardScaler().fit_transform(X)

        reducer = NeighborhoodComponentsAnalysis(
            n_components=n_components,
            random_state=42,
            max_iter=1000
        )
        projected = reducer.fit_transform(standardized, y)

        print(f"NCA transformation successful. Output shape: {projected.shape}")
        return projected

    except Exception as e:
        print(f"Warning: NCA transformation failed ({e}). Falling back to PCA.")
        # Unsupervised fallback with the same output dimensionality.
        fallback = PCA(n_components=n_components, random_state=42)
        return fallback.fit_transform(StandardScaler().fit_transform(X))

# =============================================================================
# --- Visualization Functions with Dataset Names and Metrics ---
# =============================================================================

def plot_comprehensive_analysis_complete(X: np.ndarray, y: np.ndarray, age: np.ndarray, output_path: str, min_age: float, max_age: float, has_demographics: bool) -> Dict[str, Dict[str, float]]:
    """
    Creates a comprehensive analysis plot including both NCA and traditional methods.

    Row 1 shows 2D projections (NCA, PCA, t-SNE) colored by class, row 2 shows
    1D LDA densities (on scaled features, on PCA components, on NCA
    components) and, when ``has_demographics`` is true, row 3 repeats the 2D
    projections colored by ``age``. The figure is saved as
    ``comprehensive_analysis_complete.png`` inside ``output_path``.

    Args:
        X: Feature matrix with one row per sample.
        y: Class labels; samples are grouped with ``y == i`` for each index
            ``i`` into ``CLASSES``.
        age: Per-sample values used to color the third row.
        output_path: Directory the PNG is written to.
        min_age: Lower limit of the age color scale.
        max_age: Upper limit of the age color scale.
        has_demographics: Whether the age-colored row is drawn.

    Returns:
        Mapping from method name (``NCA_2D``, ``PCA_2D``, ``TSNE_2D``,
        ``LDA_Original``, ``LDA_PCA``, ``LDA_NCA``) to the dictionary returned
        by ``calculate_all_metrics``. Empty when ``X`` has fewer than two rows.
    """
    print("\n--- Creating Comprehensive Analysis Plot (NCA + Traditional) ---")
    n_samples = X.shape[0]
    if n_samples < 2:
        print("Not enough samples for analysis.")
        return {}

    X_scaled = StandardScaler().fit_transform(X)
    class_names = [cls.replace('_', ' ').title() for cls in CLASSES]
    colors = ['#2E86C1', '#E74C3C']

    # Insertion order of this dictionary is the row order of the exported CSV.
    comprehensive_metrics = {}

    def _scatter_by_class(ax, points, **axis_text):
        # One scatter series per class, labeled with the class size.
        for i, name in enumerate(class_names):
            members = y == i
            ax.scatter(points[members, 0], points[members, 1], c=colors[i],
                       label=f'{name} (n={np.sum(members)})', alpha=0.7, s=25)
        ax.set(**axis_text)
        ax.legend()
        ax.grid(True, alpha=0.5, linestyle=':')

    def _density_by_class(ax, projection, title):
        # One filled KDE curve per class over the single LDA axis.
        for i, name in enumerate(class_names):
            members = y == i
            sns.kdeplot(projection[members].ravel(), ax=ax, color=colors[i],
                        label=f'{name} (n={np.sum(members)})', fill=True, alpha=0.5)
        ax.set(title=title, xlabel='LD1', ylabel='Density')
        ax.legend()
        ax.grid(True, alpha=0.5, linestyle=':')

    def _scatter_by_age(ax, points, **axis_text):
        # Scatter colored by age with its own colorbar.
        mappable = ax.scatter(points[:, 0], points[:, 1], c=age, cmap='plasma',
                              s=25, alpha=0.8, vmin=min_age, vmax=max_age)
        fig.colorbar(mappable, ax=ax, label='Age')
        ax.set(**axis_text)
        ax.grid(True, alpha=0.5, linestyle=':')

    # Two rows without demographics, three (18 instead of 12 inches) with them.
    n_rows = 3 if has_demographics else 2
    fig, axes = plt.subplots(n_rows, 3, figsize=(21, 6 * n_rows))
    try:
        fig.suptitle(f'{DATASET_DISPLAY_NAME}: Comprehensive Feature Analysis - NCA vs Traditional Methods',
                     fontsize=20, y=0.98 if has_demographics else 0.96)

        # --- Row 1: NCA vs PCA vs t-SNE ---
        print("Computing NCA for 2D projection...")
        X_nca_2d = apply_nca_transform(X, y, n_components=2)
        comprehensive_metrics['NCA_2D'] = calculate_all_metrics(X_nca_2d, y)
        _scatter_by_class(axes[0, 0], X_nca_2d, title='NCA 2D Projection by Class',
                          xlabel='NCA Component 1', ylabel='NCA Component 2')

        print("Computing PCA for comparison...")
        pca = PCA(n_components=2, random_state=42)
        X_pca = pca.fit_transform(X_scaled)
        comprehensive_metrics['PCA_2D'] = calculate_all_metrics(X_pca, y)
        _scatter_by_class(axes[0, 1], X_pca, title='PCA 2D Projection by Class',
                          xlabel=f'PC1 ({pca.explained_variance_ratio_[0]:.2%})',
                          ylabel=f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')

        print("Computing t-SNE...")
        # t-SNE requires perplexity < n_samples; the extra bound only takes
        # effect for fewer than six samples, where the heuristic's floor of 5
        # would otherwise be rejected.
        perplexity = min(30, max(5, (n_samples // 5) - 1), n_samples - 1)
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, init='pca', learning_rate='auto')
        X_tsne = tsne.fit_transform(X_scaled)
        comprehensive_metrics['TSNE_2D'] = calculate_all_metrics(X_tsne, y)
        _scatter_by_class(axes[0, 2], X_tsne, title='t-SNE Projection by Class',
                          xlabel='t-SNE Dimension 1', ylabel='t-SNE Dimension 2')

        # --- Row 2: LDA Comparisons ---
        print("Computing LDA on original features...")
        X_lda_orig = LinearDiscriminantAnalysis(n_components=1).fit_transform(X_scaled, y)
        comprehensive_metrics['LDA_Original'] = calculate_all_metrics(X_lda_orig, y)
        _density_by_class(axes[1, 0], X_lda_orig, 'LDA on Original Features')

        # Shared cap for the intermediate PCA and NCA dimensionality.
        max_components = min(10, X.shape[1], n_samples - len(np.unique(y)))

        print("Computing LDA on PCA features...")
        X_pca_high = PCA(n_components=max_components, random_state=42).fit_transform(X_scaled)
        X_lda_pca = LinearDiscriminantAnalysis(n_components=1).fit_transform(X_pca_high, y)
        comprehensive_metrics['LDA_PCA'] = calculate_all_metrics(X_lda_pca, y)
        _density_by_class(axes[1, 1], X_lda_pca, 'LDA on PCA Features')

        print("Computing LDA on NCA features...")
        X_nca_high = apply_nca_transform(X, y, n_components=max_components)
        X_lda_nca = LinearDiscriminantAnalysis(n_components=1).fit_transform(X_nca_high, y)
        comprehensive_metrics['LDA_NCA'] = calculate_all_metrics(X_lda_nca, y)
        _density_by_class(axes[1, 2], X_lda_nca, 'LDA on NCA Features')

        # --- Row 3: Age-colored visualizations (only if demographics available) ---
        if has_demographics:
            _scatter_by_age(axes[2, 0], X_nca_2d, title='NCA Projection by Age',
                            xlabel='NCA Component 1', ylabel='NCA Component 2')
            _scatter_by_age(axes[2, 1], X_pca, title='PCA Projection by Age',
                            xlabel='PC1', ylabel='PC2')
            _scatter_by_age(axes[2, 2], X_tsne, title='t-SNE Projection by Age',
                            xlabel='t-SNE Dimension 1', ylabel='t-SNE Dimension 2')

        fig.tight_layout(rect=[0, 0.03, 1, 0.97])
        save_path = os.path.join(output_path, "comprehensive_analysis_complete.png")
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when a projection or the save step fails.
        plt.close(fig)
    print(f"Saved comprehensive analysis plots to '{os.path.basename(save_path)}'")

    return comprehensive_metrics

def plot_demographic_analysis_complete(X: np.ndarray, y: np.ndarray, age: np.ndarray, sex: np.ndarray, output_path: str, min_age: float, max_age: float):
    """
    Draws the demographic overview figure and saves it as a PNG.

    Row 1 holds the age distribution per class and the sex counts per class,
    row 2 the 2D NCA projection colored by age and by sex, and row 3 the same
    two views for a 2D PCA projection. Nothing is drawn when ``X`` has fewer
    than two rows. The figure is written to
    ``demographic_analysis_complete.png`` inside ``output_path``.
    """
    print("\n--- Creating Demographic Analysis Plots (Complete) ---")
    if X.shape[0] < 2:
        print("Not enough samples for analysis.")
        return

    # A sex value equal to 0 is labeled 'Female'; any other value 'Male'.
    frame = pd.DataFrame({
        'Age': age,
        'Sex': ['Female' if code == 0 else 'Male' for code in sex],
        'Class': [CLASSES[int(label)].replace('_', ' ').title() for label in y]
    })

    fig, axes = plt.subplots(3, 2, figsize=(16, 21))
    fig.suptitle(f'{DATASET_DISPLAY_NAME}: Demographic Analysis - NCA vs Traditional Methods', fontsize=20, y=0.98)
    class_palette = {'Healthy Control': '#2E86C1', 'Parkinson Patient': '#E74C3C'}
    sex_palette = {'Female': '#DB7093', 'Male': '#4682B4'}

    # Both 2D projections are computed up front and reused by rows 2 and 3.
    nca_points = apply_nca_transform(X, y, n_components=2)
    standardized = StandardScaler().fit_transform(X)
    pca_model = PCA(n_components=2, random_state=42)
    pca_points = pca_model.fit_transform(standardized)

    # --- Row 1: distributions ---
    age_ax, count_ax = axes[0, 0], axes[0, 1]
    sns.violinplot(ax=age_ax, data=frame, x='Class', y='Age', hue='Class', palette=class_palette, legend=False)
    age_ax.set_title('Age Distribution by Class')
    age_ax.grid(True, alpha=0.5, linestyle=':')

    sns.countplot(ax=count_ax, data=frame, x='Sex', hue='Class', palette=class_palette, order=['Female', 'Male'])
    count_ax.set_title('Sex Distribution by Class')
    count_ax.set_ylabel('Count')

    # --- Rows 2 and 3: NCA and PCA projections, by age and by sex ---
    pc1_label = f'PC1 ({pca_model.explained_variance_ratio_[0]:.2%})'
    pc2_label = f'PC2 ({pca_model.explained_variance_ratio_[1]:.2%})'
    panels = (
        (1, nca_points, 'NCA Projection by Age', 'NCA Projection by Sex',
         'NCA Component 1', 'NCA Component 2'),
        (2, pca_points, 'PCA Projection by Age', 'PCA Projection by Sex',
         pc1_label, pc2_label),
    )
    for row, points, age_title, sex_title, x_label, y_label in panels:
        by_age_ax, by_sex_ax = axes[row, 0], axes[row, 1]

        mappable = by_age_ax.scatter(points[:, 0], points[:, 1], c=age, cmap='plasma',
                                     s=25, alpha=0.8, vmin=min_age, vmax=max_age)
        fig.colorbar(mappable, ax=by_age_ax, label='Age')
        by_age_ax.set_title(age_title)
        by_age_ax.set_xlabel(x_label)
        by_age_ax.set_ylabel(y_label)
        by_age_ax.grid(True, alpha=0.5, linestyle=':')

        sns.scatterplot(ax=by_sex_ax, x=points[:, 0], y=points[:, 1], hue=frame['Sex'],
                        palette=sex_palette, s=25, alpha=0.8)
        by_sex_ax.set_title(sex_title)
        by_sex_ax.set_xlabel(x_label)
        by_sex_ax.set_ylabel(y_label)
        by_sex_ax.grid(True, alpha=0.5, linestyle=':')

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    save_path = os.path.join(output_path, "demographic_analysis_complete.png")
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved demographic analysis to '{os.path.basename(save_path)}'")

def plot_individual_feature_separation_complete(features_dict: Dict[str, np.ndarray], output_path: str, feature_mode: str) -> Dict[str, Dict[str, Dict[str, float]]]:
    """
    Creates comprehensive plots for each audio feature type with NCA, PCA, t-SNE, and LDA.

    The figure has one column per feature key selected by ``feature_mode`` and
    four rows: NCA, PCA and t-SNE scatter plots colored by class, and an LDA
    density plot comparing LDA on the scaled features with LDA on NCA
    components. It is saved as ``individual_feature_analysis_complete.png``
    inside ``output_path``.

    Args:
        features_dict: Loaded arrays; must contain 'labels' and every feature
            key selected by ``feature_mode``.
        output_path: Directory the PNG is written to.
        feature_mode: Feature mode name passed to ``get_feature_keys``.

    Returns:
        Mapping ``feature key -> method name -> metrics`` where the method
        names are ``NCA_2D``, ``PCA_2D``, ``TSNE_2D``, ``LDA_Original`` and
        ``LDA_NCA`` and the metrics come from ``calculate_all_metrics``.
    """
    print("\n--- Creating Individual Audio Feature Analysis (Complete) ---")
    feature_keys = get_feature_keys(feature_mode)

    titles = {
        'spectrogram': 'Spectrogram',
        'mel_spectrogram': 'Mel Spectrogram',
        'mfcc': 'MFCC',
        'fsc': 'Spectral Centroid',
        'acoustic_features': 'Acoustic Features'
    }

    labels, n_samples = features_dict['labels'], len(features_dict['labels'])
    indices = np.arange(n_samples)
    labels_sub = labels[indices]
    class_names, colors = ["Healthy Control", "Parkinson Patient"], ['#2E86C1', '#E74C3C']

    # Insertion order of this dictionary is the row order of the exported CSV.
    individual_metrics = {}

    def _scatter_by_class(ax, points, **axis_text):
        # One scatter series per class on the first two projection columns.
        for i, name in enumerate(class_names):
            members = labels_sub == i
            ax.scatter(points[members, 0], points[members, 1],
                       c=colors[i], label=name, alpha=0.7, s=15)
        ax.set(**axis_text)
        ax.legend()
        ax.grid(True, alpha=0.5, linestyle=':')

    fig, axes = plt.subplots(4, len(feature_keys), figsize=(7 * len(feature_keys), 20),
                           squeeze=False, constrained_layout=True)
    try:
        fig.suptitle(f'{DATASET_DISPLAY_NAME}: Individual Audio Feature Analysis - Complete Comparison', fontsize=16)

        for col, feat_key in enumerate(feature_keys):
            feat_title = titles.get(feat_key, feat_key)
            print(f"  Processing {feat_title}...")
            individual_metrics[feat_key] = {}

            # Acoustic features are used as stored; the others are flattened
            # to one vector per sample.
            if feat_key == 'acoustic_features':
                X = np.array([features_dict[feat_key][i] for i in indices])
            else:
                X = np.array([features_dict[feat_key][i].flatten() for i in indices])

            X_scaled = StandardScaler().fit_transform(X)

            # Row 1: NCA
            X_nca = apply_nca_transform(X, labels_sub, n_components=2)
            individual_metrics[feat_key]['NCA_2D'] = calculate_all_metrics(X_nca, labels_sub)
            _scatter_by_class(axes[0, col], X_nca, title=f'NCA: {feat_title}',
                              xlabel='NCA Component 1', ylabel='NCA Component 2')

            # Row 2: PCA
            pca = PCA(n_components=2, random_state=42)
            X_pca = pca.fit_transform(X_scaled)
            individual_metrics[feat_key]['PCA_2D'] = calculate_all_metrics(X_pca, labels_sub)
            _scatter_by_class(axes[1, col], X_pca, title=f'PCA: {feat_title}',
                              xlabel=f'PC1 ({pca.explained_variance_ratio_[0]:.2%})',
                              ylabel=f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')

            # Row 3: t-SNE
            # t-SNE requires perplexity < n_samples; the extra bound only
            # takes effect for fewer than six samples.
            n_rows_scaled = X_scaled.shape[0]
            perplexity = min(30, max(5, (n_rows_scaled // 5) - 1), max(1, n_rows_scaled - 1))
            tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, init='pca', learning_rate='auto')
            X_tsne = tsne.fit_transform(X_scaled)
            individual_metrics[feat_key]['TSNE_2D'] = calculate_all_metrics(X_tsne, labels_sub)
            _scatter_by_class(axes[2, col], X_tsne, title=f't-SNE: {feat_title}',
                              xlabel='t-SNE Dim 1', ylabel='t-SNE Dim 2')

            # Row 4: LDA (comparing NCA vs Original features)
            ax = axes[3, col]

            lda_orig = LinearDiscriminantAnalysis(n_components=1)
            X_lda_orig = lda_orig.fit_transform(X_scaled, labels_sub)
            individual_metrics[feat_key]['LDA_Original'] = calculate_all_metrics(X_lda_orig, labels_sub)

            max_components = min(5, X.shape[1], X.shape[0] - len(np.unique(labels_sub)))
            X_nca_high = apply_nca_transform(X, labels_sub, n_components=max_components)
            lda_nca = LinearDiscriminantAnalysis(n_components=1)
            X_lda_nca = lda_nca.fit_transform(X_nca_high, labels_sub)
            individual_metrics[feat_key]['LDA_NCA'] = calculate_all_metrics(X_lda_nca, labels_sub)

            # Solid curves: LDA on scaled features; dashed: LDA on NCA output.
            for i, name in enumerate(class_names):
                members = labels_sub == i
                sns.kdeplot(X_lda_orig[members].ravel(), ax=ax, color=colors[i],
                           label=f'{name} (Orig)', fill=False, alpha=0.8, linestyle='-')
                sns.kdeplot(X_lda_nca[members].ravel(), ax=ax, color=colors[i],
                           label=f'{name} (NCA)', fill=False, alpha=0.8, linestyle='--')

            ax.set(title=f'LDA: {feat_title}', xlabel='LD1', ylabel='Density')
            ax.legend()
            ax.grid(True, alpha=0.5, linestyle=':')

            # Drop the per-feature matrices before the next feature is built.
            del X, X_scaled
            gc.collect()

        save_path = os.path.join(output_path, "individual_feature_analysis_complete.png")
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when a projection or the save step fails.
        plt.close(fig)
    print(f"Saved individual feature complete analysis to '{os.path.basename(save_path)}'")

    return individual_metrics

def plot_individual_features_by_age_complete(
    features_dict: Dict[str, np.ndarray],
    output_path: str,
    feature_mode: str,
    min_age: float,
    max_age: float
):
    """
    Creates comprehensive age-colored plots for each audio feature with NCA, PCA, and t-SNE.

    Only samples with a non-missing 'age' are used. The figure has one column
    per feature key selected by ``feature_mode`` and three rows (NCA, PCA,
    t-SNE), all sharing one age colorbar. It is saved as
    ``individual_feature_analysis_by_age_complete.png`` inside
    ``output_path``. Nothing is drawn when fewer than two samples have a
    valid age.

    Args:
        features_dict: Loaded arrays; must contain 'labels', 'age' and every
            feature key selected by ``feature_mode``.
        output_path: Directory the PNG is written to.
        feature_mode: Feature mode name passed to ``get_feature_keys``.
        min_age: Lower limit of the age color scale.
        max_age: Upper limit of the age color scale.
    """
    print("\n--- Creating Individual Audio Feature Analysis by Age (Complete) ---")
    feature_keys = get_feature_keys(feature_mode)

    titles = {
        'spectrogram': 'Spectrogram',
        'mel_spectrogram': 'Mel Spectrogram',
        'mfcc': 'MFCC',
        'fsc': 'Spectral Centroid',
        'acoustic_features': 'Acoustic Features'
    }

    n_samples = len(features_dict['labels'])
    indices = np.arange(n_samples)
    ages_sub = features_dict['age'][indices]
    clean_mask = ~pd.isna(ages_sub)
    indices_clean = indices[clean_mask]
    ages_clean = ages_sub[clean_mask]

    if len(indices_clean) < 2:
        print("Not enough data with valid age information to create plots. Skipping.")
        return

    labels_clean = features_dict['labels'][indices_clean]

    def _scatter_by_age(ax, points, **axis_text):
        # Age-colored scatter; the artist is returned for the shared colorbar.
        artist = ax.scatter(points[:, 0], points[:, 1], c=ages_clean, cmap='plasma',
                            vmin=min_age, vmax=max_age, alpha=0.8, s=15)
        ax.set(**axis_text)
        ax.grid(True, alpha=0.5, linestyle=':')
        return artist

    fig, axes = plt.subplots(
        3, len(feature_keys),
        figsize=(7 * len(feature_keys), 15),
        squeeze=False,
        constrained_layout=True
    )
    try:
        fig.suptitle(f'{DATASET_DISPLAY_NAME}: Individual Audio Feature Analysis by Age - Complete Comparison', fontsize=16)

        # First scatter artist drawn; all panels share its color scale.
        mappable = None

        for col, feat_key in enumerate(feature_keys):
            feat_title = titles.get(feat_key, feat_key)
            print(f"  Processing {feat_title}...")

            # Acoustic features are used as stored; the others are flattened
            # to one vector per sample.
            if feat_key == 'acoustic_features':
                X = np.array([features_dict[feat_key][i] for i in indices_clean])
            else:
                X = np.array([features_dict[feat_key][i].flatten() for i in indices_clean])

            X_scaled = StandardScaler().fit_transform(X)

            # Row 1: NCA colored by Age
            X_nca = apply_nca_transform(X, labels_clean, n_components=2)
            nca_artist = _scatter_by_age(axes[0, col], X_nca, title=f'NCA: {feat_title}',
                                         xlabel='NCA Component 1', ylabel='NCA Component 2')
            if mappable is None:
                mappable = nca_artist

            # Row 2: PCA colored by Age
            pca = PCA(n_components=2, random_state=42)
            X_pca = pca.fit_transform(X_scaled)
            _scatter_by_age(axes[1, col], X_pca, title=f'PCA: {feat_title}',
                            xlabel=f'PC1 ({pca.explained_variance_ratio_[0]:.2%})',
                            ylabel=f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')

            # Row 3: t-SNE colored by Age
            # t-SNE requires perplexity < n_samples; the extra bound only
            # takes effect for fewer than six samples.
            n_rows_scaled = X_scaled.shape[0]
            perplexity = min(30, max(5, (n_rows_scaled // 5) - 1), n_rows_scaled - 1)
            tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, init='pca', learning_rate='auto')
            X_tsne = tsne.fit_transform(X_scaled)
            _scatter_by_age(axes[2, col], X_tsne, title=f't-SNE: {feat_title}',
                            xlabel='t-SNE Dim 1', ylabel='t-SNE Dim 2')

            # Drop the per-feature matrices before the next feature is built.
            del X, X_scaled
            gc.collect()

        fig.colorbar(mappable, ax=axes.ravel().tolist(), label='Age', pad=0.01, aspect=30)

        save_path = os.path.join(output_path, "individual_feature_analysis_by_age_complete.png")
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when a projection or the save step fails.
        plt.close(fig)
    print(f"Saved individual feature analysis by age (complete) to '{os.path.basename(save_path)}'")

def plot_method_comparison_summary(X: np.ndarray, y: np.ndarray, output_path: str) -> Dict[str, Dict[str, float]]:
    """
    Creates a summary comparison of all dimensionality reduction methods.

    The 2x2 figure shows NCA, PCA and t-SNE 2D projections colored by class
    and an LDA density plot comparing LDA on the scaled features with LDA on
    NCA components. It is saved as ``method_comparison_summary.png`` inside
    ``output_path``.

    Args:
        X: Feature matrix with one row per sample.
        y: Class labels; samples are grouped with ``y == i`` for each index
            ``i`` into ``CLASSES``.
        output_path: Directory the PNG is written to.

    Returns:
        Mapping from method name (``NCA``, ``PCA``, ``TSNE``,
        ``LDA_Original``, ``LDA_NCA``) to the dictionary returned by
        ``calculate_all_metrics``. Empty when ``X`` has fewer than two rows.
    """
    print("\n--- Creating Method Comparison Summary ---")
    n_samples = X.shape[0]
    if n_samples < 2:
        print("Not enough samples for analysis.")
        return {}

    X_scaled = StandardScaler().fit_transform(X)
    class_names = [cls.replace('_', ' ').title() for cls in CLASSES]
    colors = ['#2E86C1', '#E74C3C']

    # Insertion order of this dictionary is the row order of the exported CSV.
    summary_metrics = {}

    def _scatter_by_class(ax, points, **axis_text):
        # One scatter series per class, labeled with the class size.
        for i, name in enumerate(class_names):
            members = y == i
            ax.scatter(points[members, 0], points[members, 1], c=colors[i],
                       label=f'{name} (n={np.sum(members)})', alpha=0.7, s=25)
        ax.set(**axis_text)
        ax.legend()
        ax.grid(True, alpha=0.5, linestyle=':')

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    try:
        fig.suptitle(f'{DATASET_DISPLAY_NAME}: Dimensionality Reduction Methods Comparison', fontsize=18, y=0.98)

        # NCA
        print("Computing NCA...")
        X_nca = apply_nca_transform(X, y, n_components=2)
        summary_metrics['NCA'] = calculate_all_metrics(X_nca, y)
        _scatter_by_class(axes[0, 0], X_nca, title='Neighborhood Component Analysis (NCA)',
                          xlabel='NCA Component 1', ylabel='NCA Component 2')

        # PCA
        print("Computing PCA...")
        pca = PCA(n_components=2, random_state=42)
        X_pca = pca.fit_transform(X_scaled)
        summary_metrics['PCA'] = calculate_all_metrics(X_pca, y)
        _scatter_by_class(axes[0, 1], X_pca, title='Principal Component Analysis (PCA)',
                          xlabel=f'PC1 ({pca.explained_variance_ratio_[0]:.2%})',
                          ylabel=f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')

        # t-SNE
        print("Computing t-SNE...")
        # t-SNE requires perplexity < n_samples; the extra bound only takes
        # effect for fewer than six samples.
        perplexity = min(30, max(5, (n_samples // 5) - 1), n_samples - 1)
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, init='pca', learning_rate='auto')
        X_tsne = tsne.fit_transform(X_scaled)
        summary_metrics['TSNE'] = calculate_all_metrics(X_tsne, y)
        _scatter_by_class(axes[1, 0], X_tsne, title='t-Distributed Stochastic Neighbor Embedding (t-SNE)',
                          xlabel='t-SNE Dimension 1', ylabel='t-SNE Dimension 2')

        # LDA Comparison
        print("Computing LDA comparison...")
        lda_orig = LinearDiscriminantAnalysis(n_components=1)
        X_lda_orig = lda_orig.fit_transform(X_scaled, y)
        summary_metrics['LDA_Original'] = calculate_all_metrics(X_lda_orig, y)

        max_components = min(10, X.shape[1], n_samples - len(np.unique(y)))
        X_nca_high = apply_nca_transform(X, y, n_components=max_components)
        lda_nca = LinearDiscriminantAnalysis(n_components=1)
        X_lda_nca = lda_nca.fit_transform(X_nca_high, y)
        summary_metrics['LDA_NCA'] = calculate_all_metrics(X_lda_nca, y)

        # Solid curves: LDA on scaled features; dashed: LDA on NCA output.
        lda_ax = axes[1, 1]
        for i, name in enumerate(class_names):
            members = y == i
            sns.kdeplot(X_lda_orig[members].ravel(), ax=lda_ax, color=colors[i],
                       label=f'{name} (Original)', fill=False, alpha=0.8, linestyle='-', linewidth=2)
            sns.kdeplot(X_lda_nca[members].ravel(), ax=lda_ax, color=colors[i],
                       label=f'{name} (NCA-enhanced)', fill=False, alpha=0.8, linestyle='--', linewidth=2)

        lda_ax.set(title='Linear Discriminant Analysis (LDA) Comparison', xlabel='LD1', ylabel='Density')
        lda_ax.legend()
        lda_ax.grid(True, alpha=0.5, linestyle=':')

        fig.tight_layout(rect=[0, 0.03, 1, 0.97])
        save_path = os.path.join(output_path, "method_comparison_summary.png")
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when a projection or the save step fails.
        plt.close(fig)
    print(f"Saved method comparison summary to '{os.path.basename(save_path)}'")

    return summary_metrics

# =============================================================================
# --- CSV Export Functions ---
# =============================================================================

def save_comprehensive_metrics_to_csv(metrics: Dict[str, Dict[str, float]], output_path: str):
    """Write the comprehensive analysis metrics to a CSV file.

    Each method in ``metrics`` becomes one row holding the method name, the
    dataset display name and that method's metric values. The file is named
    ``comprehensive_analysis_metrics.csv`` and placed in ``output_path``.
    Nothing is written when ``metrics`` is empty.
    """
    if not metrics:
        return

    rows = [
        {'Method': method_name, 'Dataset': DATASET_DISPLAY_NAME, **values}
        for method_name, values in metrics.items()
    ]

    csv_path = os.path.join(output_path, "comprehensive_analysis_metrics.csv")
    pd.DataFrame(rows).to_csv(csv_path, index=False)
    print(f"Saved comprehensive metrics to '{os.path.basename(csv_path)}'")

def save_summary_metrics_to_csv(metrics: Dict[str, Dict[str, float]], output_path: str):
    """Write the method comparison summary metrics to a CSV file.

    Each method in ``metrics`` becomes one row holding the method name, the
    dataset display name and that method's metric values. The file is named
    ``method_comparison_metrics.csv`` and placed in ``output_path``. Nothing
    is written when ``metrics`` is empty.
    """
    if not metrics:
        return

    rows = [
        {'Method': method_name, 'Dataset': DATASET_DISPLAY_NAME, **values}
        for method_name, values in metrics.items()
    ]

    csv_path = os.path.join(output_path, "method_comparison_metrics.csv")
    pd.DataFrame(rows).to_csv(csv_path, index=False)
    print(f"Saved method comparison metrics to '{os.path.basename(csv_path)}'")

def save_individual_features_metrics_to_csv(metrics: Dict[str, Dict[str, Dict[str, float]]], output_path: str):
    """Write the per-feature analysis metrics to a CSV file.

    Each (feature, method) pair in ``metrics`` becomes one row holding the
    feature key, the method name, the dataset display name and the metric
    values. The file is named ``individual_features_metrics.csv`` and placed
    in ``output_path``. Nothing is written when ``metrics`` is empty.
    """
    if not metrics:
        return

    rows = []
    for feature_key, per_method in metrics.items():
        rows.extend(
            {'Feature': feature_key, 'Method': method_name, 'Dataset': DATASET_DISPLAY_NAME, **values}
            for method_name, values in per_method.items()
        )

    csv_path = os.path.join(output_path, "individual_features_metrics.csv")
    pd.DataFrame(rows).to_csv(csv_path, index=False)
    print(f"Saved individual features metrics to '{os.path.basename(csv_path)}'")

# =============================================================================
# --- Main Execution ---
# =============================================================================
def main():
    """Run the complete analysis workflow with both NCA and traditional methods.

    Steps, in order:
      1. Load the feature archive at ``FEATURES_FILE`` and derive the global
         age range from its ``age`` entry (if present).
      2. Build the cleaned feature matrix for ``FEATURE_MODE``.
      3. Produce the method comparison summary, the comprehensive analysis
         and the individual feature analysis, saving each set of returned
         metrics to CSV in ``RESULTS_OUTPUT_PATH``.
      4. Produce the demographic analyses when demographics are usable.
      5. Print a summary of the run and the list of generated files.

    A missing features file and any other exception are reported on stdout
    rather than propagated; the closing banner is printed in every case.
    """
    print("=" * 70)
    print("=== Comprehensive Feature & Demographic Analysis ===")
    print("=== (NCA + Traditional Methods + Metrics) ===")
    print("=" * 70)

    try:
        features = load_features(FEATURES_FILE)

        # Check for global age range (handle all-NaN case)
        global_min_age, global_max_age = None, None
        if 'age' in features:
            ages = features['age']
            valid_ages = ages[~pd.isna(ages)]
            if len(valid_ages) > 0:
                global_min_age, global_max_age = np.min(valid_ages), np.max(valid_ages)
                print(f"Global age range: {global_min_age:.1f} - {global_max_age:.1f}")
            else:
                print("Warning: All age values are NaN")
                global_min_age, global_max_age = 0, 100  # Default range

        X, y, age, sex, has_demographics = prepare_and_clean_features(features, FEATURE_MODE)

        # Fall back to the default range only when no age data exists at all.
        # Explicit None checks are required: a legitimate bound of 0 is falsy,
        # so `value or default` would silently replace it.
        plot_min_age = 0 if global_min_age is None else global_min_age
        plot_max_age = 100 if global_max_age is None else global_max_age

        # Single source of truth for "demographic plots are produced", used
        # both to run them and to report them in the generated-files summary.
        run_demographics = bool(has_demographics) and global_min_age is not None

        # Generate method comparison summary (always create this)
        summary_metrics = plot_method_comparison_summary(X, y, RESULTS_OUTPUT_PATH)
        save_summary_metrics_to_csv(summary_metrics, RESULTS_OUTPUT_PATH)

        # Generate comprehensive analysis
        comprehensive_metrics = plot_comprehensive_analysis_complete(
            X, y, age, RESULTS_OUTPUT_PATH, plot_min_age, plot_max_age, has_demographics
        )
        save_comprehensive_metrics_to_csv(comprehensive_metrics, RESULTS_OUTPUT_PATH)

        # Generate individual feature analysis (always create this)
        individual_metrics = plot_individual_feature_separation_complete(features, RESULTS_OUTPUT_PATH, FEATURE_MODE)
        save_individual_features_metrics_to_csv(individual_metrics, RESULTS_OUTPUT_PATH)

        # Generate demographic-specific analyses if data is available
        if run_demographics:
            plot_demographic_analysis_complete(X, y, age, sex, RESULTS_OUTPUT_PATH, global_min_age, global_max_age)
            plot_individual_features_by_age_complete(features, RESULTS_OUTPUT_PATH, FEATURE_MODE, global_min_age, global_max_age)
        else:
            print("\nDemographic data not available or insufficient. Skipping demographic-related plots.")
            print("Generated plots will focus on class separation using available features only.")

        print(f"\n{'='*70}")
        print("=== Analysis Summary ===")
        print(f"Dataset: {DATASET} ({DATASET_DISPLAY_NAME})")
        print(f"Mode: {MODE}")
        print(f"Feature Mode: {FEATURE_MODE}")
        print(f"Demographics Available: {has_demographics}")
        print(f"Total Samples: {len(X)}")
        print(f"Feature Dimensionality: {X.shape[1] if len(X.shape) > 1 else 'N/A'}")

        generated_files = [
            "method_comparison_summary.png",
            "method_comparison_metrics.csv",
            "comprehensive_analysis_complete.png",
            "comprehensive_analysis_metrics.csv",
            "individual_feature_analysis_complete.png",
            "individual_features_metrics.csv",
        ]

        # Only report the demographic plots when they were actually requested
        # above; previously this checked `has_demographics` alone and could
        # list files that were never produced.
        if run_demographics:
            generated_files.extend([
                "demographic_analysis_complete.png",
                "individual_feature_analysis_by_age_complete.png",
            ])

        print(f"\nGenerated Files ({len(generated_files)}):")
        for file_name in generated_files:
            print(f"  ✓ {file_name}")

    except FileNotFoundError as e:
        print(f"\nERROR: {e}")
        print("Please ensure the features file exists and the path is correct.")
    except Exception as e:
        # Top-level boundary: report the failure with its traceback instead of
        # crashing, so the closing banner below is still printed.
        print(f"\nAn unexpected error occurred: {e}")
        import traceback
        traceback.print_exc()
    finally:
        print(f"\n{'='*70}")
        print("=== Complete Analysis Finished ===")
        print(f"All generated files are in: {RESULTS_OUTPUT_PATH}")
        print("=" * 70)

# Run the workflow only when this file is executed as a script; importing it
# as a module does not trigger main().
if __name__ == "__main__":
    main()


=== Comprehensive Feature & Demographic Analysis ===
=== (NCA + Traditional Methods + Metrics) ===
Loading features from D:\Projects\Voice\Parkinson-s-Disease-Detector-Using-AI\Parkinson-s-Disease-Detector-Using-AI\1\UAMS\data\features_ALL_VALIDS_ALL.npz
Loaded data shapes:
  - spectrogram: (328, 1025, 94)
  - mel_spectrogram: (328, 30, 94)
  - fsc: (328, 1, 94)
  - mfcc: (328, 30, 94)
  - labels: (328,)
  - sex: (328,)
  - age: (328,)
Global age range: 18.0 - 85.0

--- Preparing and Cleaning Features for Mode: ALL ---
Found 'age' and 'sex' columns in dataset.
Demographics check:
  - Valid age values: 328 / 328
  - Valid sex values: 328 / 328
Using all 328 samples for analysis.
Final feature matrix shape for visualization: (328, 102084)

--- Creating Method Comparison Summary ---
Computing NCA...
Applying NCA transformation to 2 components...
NCA transformation successful. Output shape: (328, 2)
Computing PCA...
Computing t-SNE...
Computing LDA comparison...
Applying NCA transformation