In [1]:
import os
import gc
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# =============================================================================
# --- Configuration ---
# =============================================================================

# Class definitions
HEALTHY_CLASS: str = "healthy_control"
PARKINSON_CLASS: str = "parkinson_patient"
# Order matters: plotting code matches label value ``i`` to ``CLASSES[i]``.
CLASSES: List[str] = [HEALTHY_CLASS, PARKINSON_CLASS]

# Dataset identifiers
ITALIAN_DATASET = "ITALIAN_DATASET"
UAMS_DATASET = "UAMS_DATASET"
NEUROVOZ_DATASET = "NEUROVOZ_DATASET"
MPOWER_DATASET = "MPOWER_DATASET"
SYNTHETIC_DATASET = "SYNTHETIC_DATASET"

# Recording modes (used in the feature file name and results folder name)
MODE_A = "A"
MODE_ALL_VALIDS = "ALL_VALIDS"

# Feature-set modes (select which audio feature arrays are analysed)
FEATURE_MODE_BASIC = "BASIC"
FEATURE_MODE_ALL = "ALL"

# --- SELECT YOUR CONFIGURATION HERE ---
DATASET = UAMS_DATASET
MODE = MODE_ALL_VALIDS
FEATURE_MODE = FEATURE_MODE_ALL
MODEL_NAME = "cnn_att_lstm"
# ------------------------------------

# Path Setup

# Folder name (relative to the working directory) for each dataset that has
# a known on-disk location. Datasets missing from this mapping cannot be
# analysed by this script.
DATASET_FOLDER_NAMES: Dict[str, str] = {
    ITALIAN_DATASET: "Italian",
    UAMS_DATASET: "UAMS",
    NEUROVOZ_DATASET: "Neurovoz",
}

# Fail early with a clear message instead of hitting a NameError on
# ``dataset_folder_name`` when an unmapped dataset is selected.
if DATASET not in DATASET_FOLDER_NAMES:
    raise ValueError(
        f"No data folder is configured for DATASET={DATASET!r}. "
        f"Supported datasets: {sorted(DATASET_FOLDER_NAMES)}"
    )
dataset_folder_name = DATASET_FOLDER_NAMES[DATASET]

FEATURES_FILE: str = os.path.join(os.getcwd(), dataset_folder_name, "data", f"features_{MODE}_{FEATURE_MODE}.npz")
RESULTS_OUTPUT_PATH: str = os.path.join(os.getcwd(), dataset_folder_name, f"results_{MODE}_{FEATURE_MODE}", MODEL_NAME)
# Create the results folder up front so every plotting function can save into it.
os.makedirs(RESULTS_OUTPUT_PATH, exist_ok=True)


# =============================================================================
# --- Data Loading and Preparation ---
# =============================================================================

def get_feature_keys(feature_mode: str) -> List[str]:
    """
    Look up the audio feature keys that belong to a feature mode.

    Args:
        feature_mode: Either "BASIC" or "ALL".

    Returns:
        A new list of feature-array names for the requested mode.

    Raises:
        ValueError: If ``feature_mode`` is not one of the known modes.
    """
    known_modes = (
        ("BASIC", ('mel_spectrogram', 'mfcc')),
        ("ALL", ('spectrogram', 'mel_spectrogram', 'mfcc', 'fsc')),
    )
    for mode_name, mode_keys in known_modes:
        if feature_mode == mode_name:
            # Hand out a fresh list so callers can modify it freely.
            return list(mode_keys)
    raise ValueError(f"Unknown FEATURE_MODE: {feature_mode}")

def load_features(features_file: str) -> Dict[str, np.ndarray]:
    """
    Read every array stored in an .npz archive into a plain dictionary.

    The shape of each loaded array is printed as a quick sanity check.

    Args:
        features_file: Path of the .npz file to read.

    Returns:
        Mapping from array name to the loaded array.

    Raises:
        FileNotFoundError: If ``features_file`` does not exist.
    """
    print(f"Loading features from {features_file}")
    if not os.path.exists(features_file):
        raise FileNotFoundError(f"Features file not found: {features_file}")

    features: Dict[str, np.ndarray] = {}
    # Copy the arrays out while the archive is open; it is closed on exit.
    with np.load(features_file) as archive:
        for name in archive.files:
            features[name] = archive[name]

    print("Loaded data shapes:")
    for name in features:
        print(f"  - {name}: {features[name].shape}")
    return features

def prepare_and_clean_features(
    features_dict: Dict[str, np.ndarray],
    feature_mode: str
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, bool]:
    """
    Prepares and cleans feature data for visualization using the full dataset.

    For every sample, the arrays of all feature keys selected by
    ``feature_mode`` are flattened and concatenated into one row. When both
    'age' and 'sex' are present, samples missing either value are dropped.

    Args:
        features_dict: Loaded arrays; must contain 'labels' and every feature
            key of the selected mode. 'age' and 'sex' are optional.
        feature_mode: Feature mode understood by ``get_feature_keys``.

    Returns:
        A tuple ``(X, labels, ages, sexes, has_demographics)``. ``X`` has one
        row per kept sample. When demographics are unavailable, ``ages`` and
        ``sexes`` are float arrays filled with NaN and ``has_demographics``
        is False.

    Raises:
        ValueError: If ``feature_mode`` is unknown (from ``get_feature_keys``).
        KeyError: If 'labels' or a required feature key is missing.
    """
    print(f"\n--- Preparing and Cleaning Features for Mode: {feature_mode} ---")
    feature_keys = get_feature_keys(feature_mode)
    labels = features_dict['labels']
    n_samples = len(labels)

    has_demographics = 'age' in features_dict and 'sex' in features_dict
    if has_demographics:
        ages = features_dict['age']
        sexes = features_dict['sex']
        print("Found 'age' and 'sex' data.")
    else:
        # Build the placeholders as float explicitly. np.full_like(labels, ...)
        # would inherit the labels' dtype, and an integer array cannot hold NaN.
        ages = np.full(np.shape(labels), np.nan, dtype=float)
        sexes = np.full(np.shape(labels), np.nan, dtype=float)

    # No subsampling: every sample is used.
    print(f"Using all {n_samples} samples for analysis. Note: This may be slow.")

    # One row per sample: all selected feature arrays, flattened and joined.
    X_all = np.array([
        np.concatenate([features_dict[key][idx].flatten() for key in feature_keys])
        for idx in range(n_samples)
    ])

    # Clean data by removing rows with NaN in age or sex
    if has_demographics:
        nan_mask = pd.isna(ages) | pd.isna(sexes)
        num_removed = np.sum(nan_mask)
        if num_removed > 0:
            print(f"Removing {num_removed} samples with missing age/sex data for clean visualization.")
            clean_mask = ~nan_mask
            X_all, labels, ages, sexes = X_all[clean_mask], labels[clean_mask], ages[clean_mask], sexes[clean_mask]

    print(f"Final feature matrix shape for visualization: {X_all.shape}")
    return X_all, labels, ages, sexes, has_demographics


# =============================================================================
# --- Visualization Functions ---
# =============================================================================

def plot_comprehensive_analysis(X: np.ndarray, y: np.ndarray, age: np.ndarray, output_path: str, min_age: float, max_age: float):
    """
    Creates a comprehensive 3x2 plot including PCA variance curves,
    class-based projections, and an age-colored t-SNE projection.

    Panel layout (row, column):
      (0, 0) explained variance ratio per principal component
      (0, 1) cumulative explained variance with a 95% reference line
      (1, 0) first two principal components, colored by class
      (1, 1) t-SNE embedding, colored by class
      (2, 0) density of the single LDA discriminant, per class
      (2, 1) the same t-SNE embedding, colored by age

    Args:
        X: Feature matrix with one row per sample.
        y: Class label per sample; label value ``i`` is paired with ``CLASSES[i]``.
        age: Age per sample, used as the color value in the last panel.
        output_path: Directory where "comprehensive_analysis.png" is written.
        min_age: Lower bound of the age color scale.
        max_age: Upper bound of the age color scale.

    Returns:
        None. Nothing is plotted when fewer than two samples are given.
    """
    print("\n--- Creating Comprehensive Analysis Plot ---")
    n_samples = X.shape[0]
    if n_samples < 2:
        return

    X_scaled = StandardScaler().fit_transform(X)
    class_names = [cls.replace('_', ' ').title() for cls in CLASSES]
    colors = ['#2E86C1', '#E74C3C']
    grid_kwargs = {'alpha': 0.5, 'linestyle': ':'}

    fig, axes = plt.subplots(3, 2, figsize=(16, 21))
    fig.suptitle('Comprehensive Feature Analysis', fontsize=20)

    # --- Row 1: PCA Variance Analysis ---
    print("Computing PCA for variance curves...")
    # PCA cannot return more components than features or (samples - 1).
    n_curve_comps = min(50, X_scaled.shape[1], n_samples - 1)
    pca_variance = PCA(n_components=n_curve_comps, random_state=42)
    pca_variance.fit(X_scaled)
    component_numbers = range(1, n_curve_comps + 1)

    ax = axes[0, 0]
    ax.plot(component_numbers, pca_variance.explained_variance_ratio_, 'bo-', markersize=4)
    ax.set(title='PCA: Explained Variance per Component', xlabel='Principal Component', ylabel='Explained Variance Ratio')
    ax.grid(True, **grid_kwargs)

    cumulative_variance = np.cumsum(pca_variance.explained_variance_ratio_)
    ax = axes[0, 1]
    ax.plot(component_numbers, cumulative_variance, 'ro-', markersize=4)
    ax.axhline(y=0.95, color='k', linestyle='--', alpha=0.8, label='95% Variance')
    ax.set(title='PCA: Cumulative Explained Variance', xlabel='Number of Components', ylabel='Cumulative Variance')
    ax.legend()
    ax.grid(True, **grid_kwargs)

    # --- Row 2: Projections by Class ---
    pca_95 = PCA(n_components=0.95, random_state=42)
    X_pca = pca_95.fit_transform(X_scaled)
    if pca_95.n_components_ < 2:
        # The 95% rule can keep a single component; the 2-D scatter below
        # needs two, so refit with exactly two (as the per-feature plots do).
        print(f"  --> Forcing PCA to 2 components (95% rule gave {pca_95.n_components_}).")
        pca_95 = PCA(n_components=2, random_state=42)
        X_pca = pca_95.fit_transform(X_scaled)

    ax = axes[1, 0]
    for i, name in enumerate(class_names):
        in_class = y == i
        ax.scatter(X_pca[in_class, 0], X_pca[in_class, 1], c=colors[i], label=f'{name} (n={np.sum(in_class)})', alpha=0.7, s=25)
    ax.set(
        title=f'PCA Projection (First 2 of {pca_95.n_components_} Components)',
        xlabel=f'PC1 ({pca_95.explained_variance_ratio_[0]:.2%})',
        ylabel=f'PC2 ({pca_95.explained_variance_ratio_[1]:.2%})',
    )
    ax.legend()
    ax.grid(True, **grid_kwargs)

    print("Computing t-SNE...")
    # t-SNE requires perplexity < n_samples; the extra bound only takes
    # effect for very small inputs (fewer than six samples).
    perplexity = min(30, max(5, (n_samples // 5) - 1), n_samples - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, init='pca', learning_rate='auto')
    X_tsne = tsne.fit_transform(X_scaled)

    ax = axes[1, 1]
    for i, name in enumerate(class_names):
        in_class = y == i
        ax.scatter(X_tsne[in_class, 0], X_tsne[in_class, 1], c=colors[i], label=f'{name} (n={np.sum(in_class)})', alpha=0.7, s=25)
    ax.set(title='t-SNE Projection by Class', xlabel='t-SNE Dimension 1', ylabel='t-SNE Dimension 2')
    ax.legend()
    ax.grid(True, **grid_kwargs)

    # --- Row 3: LDA and t-SNE by Age ---
    lda = LinearDiscriminantAnalysis(n_components=1)
    X_lda = lda.fit_transform(X_scaled, y)

    ax = axes[2, 0]
    for i, name in enumerate(class_names):
        in_class = y == i
        sns.kdeplot(X_lda[in_class].ravel(), ax=ax, color=colors[i], label=f'{name} (n={np.sum(in_class)})', fill=True, alpha=0.5)
    ax.set(title='LDA Projection', xlabel='LD1', ylabel='Density')
    ax.legend()
    ax.grid(True, **grid_kwargs)

    ax = axes[2, 1]
    # Reuse the class-colored t-SNE embedding; only the coloring changes.
    scatter = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=age, cmap='plasma', s=25, alpha=0.8, vmin=min_age, vmax=max_age)
    fig.colorbar(scatter, ax=ax, label='Age')
    ax.set(title='t-SNE Projection by Age', xlabel='t-SNE Dimension 1', ylabel='t-SNE Dimension 2')
    ax.grid(True, **grid_kwargs)

    # Leave room at the top for the figure-level title.
    fig.tight_layout(rect=[0, 0.03, 1, 0.97])
    save_path = os.path.join(output_path, "comprehensive_analysis.png")
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved comprehensive analysis plots to '{os.path.basename(save_path)}'")

def plot_demographic_analysis(X: np.ndarray, y: np.ndarray, age: np.ndarray, sex: np.ndarray, output_path: str, min_age: float, max_age: float):
    """
    Creates a 2x2 plot showing demographic distributions and their relation to features,
    with age plots stacked and sex plots stacked.

    Panel layout (row, column):
      (0, 0) age distribution per class (violin plot)
      (1, 0) 2-component PCA of the features, colored by age
      (0, 1) sex counts per class
      (1, 1) 2-component PCA of the features, colored by sex

    Args:
        X: Feature matrix with one row per sample.
        y: Class label per sample; used as an index into ``CLASSES``.
        age: Age per sample.
        sex: Sex code per sample; 0 is shown as 'Female', any other value as 'Male'.
        output_path: Directory where "demographic_analysis.png" is written.
        min_age: Lower bound of the age color scale.
        max_age: Upper bound of the age color scale.

    Returns:
        None. Nothing is plotted when fewer than two samples are given.
    """
    print("\n--- Creating Demographic Analysis Plots (Stacked) ---")
    if X.shape[0] < 2:
        return

    sex_labels = ['Female' if s == 0 else 'Male' for s in sex]
    df = pd.DataFrame({'Age': age, 'Sex': sex_labels, 'Class': [CLASSES[int(label)].replace('_', ' ').title() for label in y]})

    fig, axes = plt.subplots(2, 2, figsize=(16, 18))  # Increased height for better stacking
    fig.suptitle('Demographic and Feature Analysis', fontsize=20, y=1.02)
    class_colors = {'Healthy Control': '#2E86C1', 'Parkinson Patient': '#E74C3C'}
    sex_colors = {'Female': '#DB7093', 'Male': '#4682B4'}

    X_scaled = StandardScaler().fit_transform(X)
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X_scaled)
    pc1_label = f'PC1 ({pca.explained_variance_ratio_[0]:.2%})'
    pc2_label = f'PC2 ({pca.explained_variance_ratio_[1]:.2%})'

    # --- Column 1: Age Plots ---
    # 1. Age Distribution by Class (Top-Left)
    sns.violinplot(ax=axes[0, 0], data=df, x='Class', y='Age', hue='Class', palette=class_colors, legend=False)
    axes[0, 0].set_title('Age Distribution by Class')
    axes[0, 0].grid(True, alpha=0.5, linestyle=':')

    # 2. PCA of Audio Features, Colored by Age (Bottom-Left)
    # Title, labels and grid go on the same axes as the scatter; they were
    # previously applied to the bottom-right (sex) panel by mistake.
    ax_age = axes[1, 0]
    scatter_age = ax_age.scatter(X_pca[:, 0], X_pca[:, 1], c=age, cmap='plasma', s=25, alpha=0.8, vmin=min_age, vmax=max_age)
    fig.colorbar(scatter_age, ax=ax_age, label='Age')
    ax_age.set_title('PCA of Audio Features, Colored by Age')
    ax_age.set_xlabel(pc1_label)
    ax_age.set_ylabel(pc2_label)
    ax_age.grid(True, alpha=0.5, linestyle=':')

    # --- Column 2: Sex Plots ---
    # 3. Sex Distribution by Class (Top-Right)
    sns.countplot(ax=axes[0, 1], data=df, x='Sex', hue='Class', palette=class_colors, order=['Female', 'Male'])
    axes[0, 1].set_title('Sex Distribution by Class')
    axes[0, 1].set_ylabel('Count')

    # 4. PCA of Audio Features, Colored by Sex (Bottom-Right)
    ax_sex = axes[1, 1]
    sns.scatterplot(ax=ax_sex, x=X_pca[:, 0], y=X_pca[:, 1], hue=df['Sex'], palette=sex_colors, s=25, alpha=0.8)
    ax_sex.set_title('PCA of Audio Features, Colored by Sex')
    ax_sex.set_xlabel(pc1_label)
    ax_sex.set_ylabel(pc2_label)
    ax_sex.grid(True, alpha=0.5, linestyle=':')

    # Adjust layout to prevent labels from overlapping
    fig.tight_layout(rect=[0, 0, 1, 0.98])
    save_path = os.path.join(output_path, "demographic_analysis.png")
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved demographic analysis to '{os.path.basename(save_path)}'")

def plot_individual_feature_separation(features_dict: Dict[str, np.ndarray], output_path: str, feature_mode: str):
    """
    Creates separate PCA, t-SNE, and LDA plots for each audio feature type.
    Forces PCA to 2 components if the 95% variance rule results in fewer than 2.

    The figure has one column per feature key of the selected mode and three
    rows: PCA scatter, t-SNE scatter, and LDA density, all colored by class.

    Args:
        features_dict: Loaded arrays; must contain 'labels' and every feature
            key of the selected mode.
        output_path: Directory where "individual_feature_analysis.png" is written.
        feature_mode: Feature mode understood by ``get_feature_keys``.
    """
    print("\n--- Creating Individual Audio Feature Analysis ---")
    feature_keys = get_feature_keys(feature_mode)
    titles = {'spectrogram': 'Spectrogram', 'mel_spectrogram': 'Mel Spectrogram', 'mfcc': 'MFCC', 'fsc': 'Spectral Centroid'}
    labels = features_dict['labels']
    n_samples = len(labels)

    # Label value i is drawn with class_names[i] / colors[i].
    class_names, colors = ["Healthy Control", "Parkinson Patient"], ['#2E86C1', '#E74C3C']

    fig, axes = plt.subplots(3, len(feature_keys), figsize=(7 * len(feature_keys), 15), squeeze=False, constrained_layout=True)
    fig.suptitle('Individual Audio Feature Analysis', fontsize=16)

    for col, feat_key in enumerate(feature_keys):
        feat_title = titles.get(feat_key, feat_key)
        print(f"  Processing {feat_title}...")
        X = np.array([features_dict[feat_key][i].flatten() for i in range(n_samples)])
        X_scaled = StandardScaler().fit_transform(X)

        # Fit the 95%-variance PCA once and keep it when it yields at least
        # two components. Only refit (with exactly two) when it does not, so
        # the common case avoids a second, redundant decomposition.
        pca = PCA(n_components=0.95, random_state=42)
        X_pca = pca.fit_transform(X_scaled)
        if pca.n_components_ < 2:
            print(f"  --> Forcing PCA to 2 components for '{feat_title}' (95% rule gave {pca.n_components_}).")
            pca = PCA(n_components=2, random_state=42)
            X_pca = pca.fit_transform(X_scaled)

        # The scatter plot now safely assumes X_pca has at least 2 columns.
        ax = axes[0, col]
        for i, name in enumerate(class_names):
            in_class = labels == i
            ax.scatter(X_pca[in_class, 0], X_pca[in_class, 1], c=colors[i], label=name, alpha=0.7, s=15)

        # Create a dynamic title
        total_var_explained = np.sum(pca.explained_variance_ratio_)
        title = f'PCA: {feat_title}\n({pca.n_components_} comps for {total_var_explained:.1%} var.)'
        ax.set(title=title, xlabel=f'PC1 ({pca.explained_variance_ratio_[0]:.2%})', ylabel=f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')
        ax.legend()
        ax.grid(True, alpha=0.5, linestyle=':')

        # t-SNE. Perplexity must stay below the sample count; the extra bound
        # only matters for very small inputs (fewer than six samples).
        perplexity = min(30, max(5, (n_samples // 5) - 1), n_samples - 1)
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, init='pca', learning_rate='auto')
        X_tsne = tsne.fit_transform(X_scaled)
        ax = axes[1, col]
        for i, name in enumerate(class_names):
            in_class = labels == i
            ax.scatter(X_tsne[in_class, 0], X_tsne[in_class, 1], c=colors[i], label=name, alpha=0.7, s=15)
        ax.set(title=f't-SNE: {feat_title}', xlabel='t-SNE Dim 1', ylabel='t-SNE Dim 2')
        ax.legend()
        ax.grid(True, alpha=0.5, linestyle=':')

        # LDA: a single discriminant axis, shown as one density per class.
        lda = LinearDiscriminantAnalysis(n_components=1)
        X_lda = lda.fit_transform(X_scaled, labels)
        ax = axes[2, col]
        for i, name in enumerate(class_names):
            sns.kdeplot(X_lda[labels == i].ravel(), ax=ax, color=colors[i], label=name, fill=True, alpha=0.6)
        ax.set(title=f'LDA: {feat_title}', xlabel='LD1', ylabel='Density')
        ax.legend()
        ax.grid(True, alpha=0.5, linestyle=':')

        # Release the per-feature matrices before building the next ones.
        del X, X_scaled
        gc.collect()

    save_path = os.path.join(output_path, "individual_feature_analysis.png")
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved individual feature analysis to '{os.path.basename(save_path)}'")

def plot_individual_features_by_age(
    features_dict: Dict[str, np.ndarray],
    output_path: str,
    feature_mode: str,
    min_age: float,
    max_age: float
):
    """
    Creates separate PCA and t-SNE plots for each audio feature, colored by age.

    The figure has one column per feature key of the selected mode and two
    rows: PCA scatter and t-SNE scatter. Samples whose age is missing are
    left out. All panels share one age color scale and one colorbar.

    Args:
        features_dict: Loaded arrays; must contain 'labels', 'age' and every
            feature key of the selected mode.
        output_path: Directory where "individual_feature_analysis_by_age.png"
            is written.
        feature_mode: Feature mode understood by ``get_feature_keys``.
        min_age: Lower bound of the age color scale.
        max_age: Upper bound of the age color scale.

    Returns:
        None. Nothing is plotted when fewer than two samples have a valid age.
    """
    print("\n--- Creating Individual Audio Feature Analysis (Colored by Age) ---")
    feature_keys = get_feature_keys(feature_mode)
    titles = {'spectrogram': 'Spectrogram', 'mel_spectrogram': 'Mel Spectrogram', 'mfcc': 'MFCC', 'fsc': 'Spectral Centroid'}

    # No subsampling: start from every sample, then keep those with a known age.
    n_samples = len(features_dict['labels'])
    indices = np.arange(n_samples)

    ages_all = features_dict['age'][indices]
    has_age = ~pd.isna(ages_all)
    indices_clean = indices[has_age]
    ages_clean = ages_all[has_age]
    n_clean = len(indices_clean)

    if n_clean < 2:
        print("Not enough data with valid age information to create plots. Skipping.")
        return

    fig, axes = plt.subplots(
        2, len(feature_keys),
        figsize=(7 * len(feature_keys), 10),
        squeeze=False,
        constrained_layout=True
    )
    fig.suptitle('Individual Audio Feature Analysis by Age', fontsize=16)

    # First scatter drawn; every panel uses the same cmap/vmin/vmax, so any
    # one of them can back the shared colorbar.
    mappable = None

    for col, feat_key in enumerate(feature_keys):
        feat_title = titles.get(feat_key, feat_key)
        print(f"  Processing {feat_title}...")

        X = np.array([features_dict[feat_key][i].flatten() for i in indices_clean])
        X_scaled = StandardScaler().fit_transform(X)

        # Fit the 95%-variance PCA once and keep it when it yields at least
        # two components. Only refit (with exactly two) when it does not, so
        # the common case avoids a second, redundant decomposition.
        pca = PCA(n_components=0.95, random_state=42)
        X_pca = pca.fit_transform(X_scaled)
        if pca.n_components_ < 2:
            print(f"  --> Forcing PCA to 2 components for '{feat_title}' (95% rule gave {pca.n_components_}).")
            pca = PCA(n_components=2, random_state=42)
            X_pca = pca.fit_transform(X_scaled)

        # PCA colored by Age
        ax = axes[0, col]
        scatter1 = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=ages_clean, cmap='plasma', vmin=min_age, vmax=max_age, alpha=0.8, s=15)
        ax.set(title=f'PCA: {feat_title}', xlabel=f'PC1 ({pca.explained_variance_ratio_[0]:.2%})', ylabel=f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')
        ax.grid(True, alpha=0.5, linestyle=':')
        if mappable is None:
            mappable = scatter1

        # t-SNE colored by Age. Perplexity must stay below the sample count;
        # the extra bound only matters for fewer than six samples.
        perplexity = min(30, max(5, (n_clean // 5) - 1), n_clean - 1)
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, init='pca', learning_rate='auto')
        X_tsne = tsne.fit_transform(X_scaled)
        ax = axes[1, col]
        ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=ages_clean, cmap='plasma', vmin=min_age, vmax=max_age, alpha=0.8, s=15)
        ax.set(title=f't-SNE: {feat_title}', xlabel='t-SNE Dim 1', ylabel='t-SNE Dim 2')
        ax.grid(True, alpha=0.5, linestyle=':')

        # Release the per-feature matrices before building the next ones.
        del X, X_scaled
        gc.collect()

    fig.colorbar(mappable, ax=axes.ravel().tolist(), label='Age', pad=0.01, aspect=30)

    save_path = os.path.join(output_path, "individual_feature_analysis_by_age.png")
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved individual feature analysis by age to '{os.path.basename(save_path)}'")

# =============================================================================
# --- Main Execution ---
# =============================================================================
def main():
    """
    Run the whole analysis: load the features, build the combined matrix,
    and produce every plot that the available data allows.

    Errors are reported on stdout instead of being propagated, and the
    closing banner is printed in every case.
    """
    banner = "=" * 50
    print(banner)
    print("=== Feature & Demographic Analysis ===")
    print(banner)
    try:
        features = load_features(FEATURES_FILE)

        # One age range for the whole dataset keeps color scales comparable
        # across all age-colored plots.
        if 'age' in features:
            global_min_age = np.nanmin(features['age'])
            global_max_age = np.nanmax(features['age'])
        else:
            global_min_age = global_max_age = None

        X, y, age, sex, has_demographics = prepare_and_clean_features(features, FEATURE_MODE)

        plot_individual_feature_separation(features, RESULTS_OUTPUT_PATH, FEATURE_MODE)

        if not has_demographics:
            print("\nDemographic data not found. Skipping demographic-related plots.")
        else:
            plot_comprehensive_analysis(X, y, age, RESULTS_OUTPUT_PATH, global_min_age, global_max_age)
            plot_demographic_analysis(X, y, age, sex, RESULTS_OUTPUT_PATH, global_min_age, global_max_age)
            plot_individual_features_by_age(features, RESULTS_OUTPUT_PATH, FEATURE_MODE, global_min_age, global_max_age)

    except FileNotFoundError as e:
        print(f"\nERROR: {e}")
    except Exception as e:
        # Top-level boundary: report the failure with its traceback rather
        # than letting the exception escape.
        print(f"\nAn unexpected error occurred: {e}")
        import traceback
        traceback.print_exc()
    finally:
        print("\n" + banner)
        print("=== Analysis Complete ===")
        print(f"All generated files are in: {RESULTS_OUTPUT_PATH}")
        print(banner)

if __name__ == "__main__":
    main()

=== Feature & Demographic Analysis ===
Loading features from D:\Projects\Voice\Parkinson-s-Disease-Detector-Using-AI\Parkinson-s-Disease-Detector-Using-AI\1\UAMS\data\features_ALL_VALIDS_ALL.npz
Loaded data shapes:
  - spectrogram: (328, 1025, 94)
  - mel_spectrogram: (328, 30, 94)
  - fsc: (328, 1, 94)
  - mfcc: (328, 30, 94)
  - labels: (328,)
  - sex: (328,)
  - age: (328,)

--- Preparing and Cleaning Features for Mode: ALL ---
Found 'age' and 'sex' data.
Using all 328 samples for analysis. Note: This may be slow.
Final feature matrix shape for visualization: (328, 102084)

--- Creating Individual Audio Feature Analysis ---
  Processing Spectrogram...
  Processing Mel Spectrogram...
  Processing MFCC...
  Processing Spectral Centroid...
Saved individual feature analysis to 'individual_feature_analysis.png'

--- Creating Comprehensive Analysis Plot ---
Computing PCA for variance curves...
Computing t-SNE...
Saved comprehensive analysis plots to 'comprehensive_analysis.png'

--- Creat

In [4]:
import os
import numpy as np

# =============================================================================
# --- Configuration ---
# Instructions: Set these variables to match the .npz file you want to inspect.
# =============================================================================
# NOTE: NEUROVOZ_DATASET is not defined in this cell (it only imports os and
# numpy); it comes from the configuration in the analysis cell, which must
# have been run first.
DATASET = NEUROVOZ_DATASET
# MODE and FEATURE_MODE are the two parts of the file name
# "features_{MODE}_{FEATURE_MODE}.npz" that will be inspected.
MODE = "A"
FEATURE_MODE = "ALL"

# =============================================================================
# --- Inspection Script ---
# =============================================================================
def inspect_feature_file(dataset, mode, feature_mode):
    """
    Loads and inspects the contents of a .npz feature file.

    The file is looked up at
    ``<cwd>/<dataset folder>/data/features_{mode}_{feature_mode}.npz``.
    For every stored array the name, shape and dtype are printed; 1-D arrays
    additionally show their first five values. Problems are reported on
    stdout rather than raised.

    Args:
        dataset: Dataset identifier: "ITALIAN_DATASET", "UAMS_DATASET" or
            "NEUROVOZ_DATASET".
        mode: Mode part of the feature file name (e.g. "A").
        feature_mode: Feature-mode part of the feature file name (e.g. "ALL").
    """
    # Every supported dataset gets its own folder. Previously anything other
    # than the Italian dataset (including UAMS) silently fell back to
    # "Neurovoz", so the wrong file was inspected.
    dataset_folders = {
        "ITALIAN_DATASET": "Italian",
        "UAMS_DATASET": "UAMS",
        "NEUROVOZ_DATASET": "Neurovoz",
    }
    dataset_folder_name = dataset_folders.get(dataset)
    if dataset_folder_name is None:
        print(f"❌ ERROR: Unsupported dataset: {dataset!r}. Expected one of {sorted(dataset_folders)}.")
        print("Please check that the configuration variables at the top of the script are correct.")
        return

    try:
        # Build the file path based on your project structure
        features_file = os.path.join(os.getcwd(), dataset_folder_name, "data", f"features_{mode}_{feature_mode}.npz")

        print(f"Inspecting file: {features_file}\n")

        # Check if the file exists
        if not os.path.exists(features_file):
            raise FileNotFoundError("The specified file was not found.")

        # Load the .npz file
        with np.load(features_file) as data:
            print("✅ File loaded successfully. Here are its contents:\n")

            # Get the list of arrays stored in the file
            array_keys = list(data.keys())
            print(f"Stored arrays: {array_keys}\n")

            # Print details for each array
            for key in array_keys:
                array = data[key]
                print("-" * 40)
                print(f"Array Name: '{key}'")
                print(f"  - Shape: {array.shape}")
                print(f"  - Data Type: {array.dtype}")

                # Show the first 5 elements for 1D arrays, or a note for multi-dimensional ones
                if array.ndim == 1:
                    print(f"  - First 5 values: {array[:5]}")
                else:
                    print("  - (Multi-dimensional array, showing shape only)")

    except FileNotFoundError as e:
        print(f"❌ ERROR: {e}")
        print("Please check that the configuration variables at the top of the script are correct.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Run the inspection
if __name__ == "__main__":
    inspect_feature_file(DATASET, MODE, FEATURE_MODE)

Inspecting file: D:\Projects\Voice\Parkinson-s-Disease-Detector-Using-AI\Parkinson-s-Disease-Detector-Using-AI\1\Neurovoz\data\features_A_ALL.npz

✅ File loaded successfully. Here are its contents:

Stored arrays: ['spectrogram', 'mel_spectrogram', 'fsc', 'mfcc', 'labels', 'sex', 'age']

----------------------------------------
Array Name: 'spectrogram'
  - Shape: (1064, 1025, 94)
  - Data Type: float32
  - (Multi-dimensional array, showing shape only)
----------------------------------------
Array Name: 'mel_spectrogram'
  - Shape: (1064, 30, 94)
  - Data Type: float32
  - (Multi-dimensional array, showing shape only)
----------------------------------------
Array Name: 'fsc'
  - Shape: (1064, 1, 94)
  - Data Type: float64
  - (Multi-dimensional array, showing shape only)
----------------------------------------
Array Name: 'mfcc'
  - Shape: (1064, 30, 94)
  - Data Type: float32
  - (Multi-dimensional array, showing shape only)
----------------------------------------
Array Name: 'lab

In [3]:
import os
import numpy as np

# =============================================================================
# --- Configuration ---
# Instructions: Set these variables to match the .npz file you want to inspect.
# =============================================================================
# NOTE: NEUROVOZ_DATASET is not defined in this cell (it only imports os and
# numpy); it comes from the configuration in the analysis cell, which must
# have been run first.
DATASET: str = NEUROVOZ_DATASET
# MODE and FEATURE_MODE are the two parts of the file name
# "features_{MODE}_{FEATURE_MODE}.npz" that will be analysed.
MODE: str = "A"
FEATURE_MODE: str = "ALL"

# =============================================================================
# --- Analysis Script ---
# =============================================================================

def find_age_range(dataset: str, mode: str, feature_mode: str):
    """
    Loads a feature file and calculates the minimum and maximum age.

    The file is looked up at
    ``<cwd>/<dataset folder>/data/features_{mode}_{feature_mode}.npz`` and
    must contain an 'age' array. NaN entries are ignored. The result and any
    problem are printed to stdout; nothing is returned or raised.

    Args:
        dataset: Dataset identifier: "ITALIAN_DATASET", "UAMS_DATASET" or
            "NEUROVOZ_DATASET".
        mode: Mode part of the feature file name (e.g. "A").
        feature_mode: Feature-mode part of the feature file name (e.g. "ALL").
    """
    # Every supported dataset gets its own folder. Previously anything other
    # than the Italian dataset (including UAMS) silently fell back to
    # "Neurovoz", so the wrong file was analysed.
    dataset_folders = {
        "ITALIAN_DATASET": "Italian",
        "UAMS_DATASET": "UAMS",
        "NEUROVOZ_DATASET": "Neurovoz",
    }
    dataset_folder_name = dataset_folders.get(dataset)
    if dataset_folder_name is None:
        print(f"❌ ERROR: Unsupported dataset: {dataset!r}. Expected one of {sorted(dataset_folders)}.")
        return

    try:
        # Build the file path from configuration
        features_file: str = os.path.join(os.getcwd(), dataset_folder_name, "data", f"features_{mode}_{feature_mode}.npz")

        print(f"Analyzing file: {features_file}\n")

        if not os.path.exists(features_file):
            raise FileNotFoundError("The specified feature file was not found.")

        # Load the data
        with np.load(features_file) as data:
            if 'age' not in data:
                raise KeyError("The 'age' array was not found in the feature file.")

            age_array = data['age']

            # Calculate min and max, safely ignoring any NaN values
            min_age = np.nanmin(age_array)
            max_age = np.nanmax(age_array)

            print("📊 **Age Range Analysis Complete**")
            print(f"   - Minimum Age: {min_age}")
            print(f"   - Maximum Age: {max_age}")

    except (FileNotFoundError, KeyError) as e:
        print(f"❌ ERROR: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Run the analysis
if __name__ == "__main__":
    find_age_range(DATASET, MODE, FEATURE_MODE)

Analyzing file: D:\Projects\Voice\Parkinson-s-Disease-Detector-Using-AI\Parkinson-s-Disease-Detector-Using-AI\1\Neurovoz\data\features_A_ALL.npz

📊 **Age Range Analysis Complete**
   - Minimum Age: 31.0
   - Maximum Age: 88.0
