# In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import seaborn as sns
import gc


# =============================================================================
# --- Configuration ---
# =============================================================================

# Class definitions. The plotting code pairs these names, in this order, with
# the integer labels 0 and 1.
HEALTHY_CLASS = "healthy_control"
PARKINSON_CLASS = "parkinson_patient"
CLASSES = [HEALTHY_CLASS, PARKINSON_CLASS]

# Dataset and mode selection
ITALIAN_DATASET = "ITALIAN_DATASET"
NEUROVOZ_DATASET = "NEUROVOZ_DATASET"
UAMS_DATASET = "UAMS_DATASET"
MPOWER_DATASET = "MPOWER_DATASET"
MODE_A = "A"
MODE_ALL_VALIDS = "ALL_VALIDS"

# Feature modes understood by get_feature_keys().
FEATURE_MODE_DEFAULT = "DEFAULT"
FEATURE_MODE_ALL = "ALL"

# --- SELECT YOUR CONFIGURATION HERE ---
DATASET = MPOWER_DATASET
MODE = MODE_A
FEATURE_MODE = FEATURE_MODE_DEFAULT
FOLDER_NAME = "plots"

# --- Path Setup ---
# Single source of truth mapping each dataset identifier to its folder name
# (resolved relative to the current working directory).
_DATASET_FOLDERS = {
    ITALIAN_DATASET: "Italian",
    NEUROVOZ_DATASET: "Neurovoz",
    UAMS_DATASET: "UAMS",
    MPOWER_DATASET: "mPower",
}
# An unrecognised identifier falls back to "", so the paths below then resolve
# directly under the working directory.
dataset_folder_name = _DATASET_FOLDERS.get(DATASET, "")

# Input archive with the extracted features, and the folder receiving plots/CSV.
FEATURES_FILE = os.path.join(os.getcwd(), dataset_folder_name, "data", f"features_{MODE}_{FEATURE_MODE}.npz")
RESULTS_OUTPUT_PATH = os.path.join(os.getcwd(), dataset_folder_name, f"results_{MODE}_{FEATURE_MODE}", FOLDER_NAME)
# The output folder is created eagerly so later savefig()/to_csv() calls cannot
# fail on a missing directory.
os.makedirs(RESULTS_OUTPUT_PATH, exist_ok=True)

# =============================================================================
# --- Helper Functions ---
# =============================================================================

def get_feature_keys(feature_mode):
    """Look up which feature arrays belong to a feature mode.

    Args:
        feature_mode: Either FEATURE_MODE_DEFAULT or FEATURE_MODE_ALL.

    Returns:
        A new list of feature-key strings for that mode.

    Raises:
        ValueError: If ``feature_mode`` matches neither known mode.
    """
    # Built on every call so each caller receives its own fresh list.
    known_modes = (
        (FEATURE_MODE_DEFAULT, ['mel_spectrogram', 'mfcc']),
        (FEATURE_MODE_ALL, ['spectrogram', 'mel_spectrogram', 'mfcc', 'fsc']),
    )
    for candidate, selected_keys in known_modes:
        if feature_mode == candidate:
            return selected_keys
    raise ValueError(f"Unknown FEATURE_MODE: {feature_mode}")

# =============================================================================
# --- Data Loading and Preparation ---
# =============================================================================

def load_features(features_file):
    """Read every array stored in a ``.npz`` archive into a plain dict.

    A per-array shape/size summary is printed, followed by the combined size
    of all arrays except the one stored under ``'labels'``.

    Args:
        features_file: Path to the ``.npz`` archive.

    Returns:
        Dict mapping each archive key to its loaded array.

    Raises:
        FileNotFoundError: If ``features_file`` does not exist.
    """
    print(f"Loading features from {features_file}")
    if not os.path.exists(features_file):
        raise FileNotFoundError(f"Features file not found: {features_file}")

    # Materialise each member while the archive is open; the resulting dict
    # stays usable after the file handle is closed.
    features = {}
    with np.load(features_file) as archive:
        for name in archive.files:
            features[name] = archive[name]

    bytes_per_mb = 1024 * 1024
    print("Loaded feature shapes:")
    feature_total_mb = 0
    for name, array in features.items():
        size_mb = array.nbytes / bytes_per_mb
        print(f"  - {name}: {array.shape} ({size_mb:.2f} MB)")
        if name == 'labels':
            # Labels are reported above but excluded from the feature total.
            continue
        feature_total_mb += size_mb
    print(f"Total feature memory: {feature_total_mb:.2f} MB")
    return features

def prepare_features_efficiently(features_dict, feature_mode, max_samples=None, use_subsample=True, random_state=None):
    """Build a 2-D (samples x flattened features) matrix for the selected mode.

    The feature arrays chosen by ``get_feature_keys(feature_mode)`` are
    flattened per sample and concatenated side by side. When the dataset holds
    more than ``max_samples`` samples (and ``use_subsample`` is true), a
    class-stratified random subset is used instead of all samples.

    Args:
        features_dict: Mapping with a ``'labels'`` entry plus one array per
            feature key; every array is indexed by sample along axis 0.
        feature_mode: Feature mode forwarded to ``get_feature_keys``.
        max_samples: Upper bound on the number of samples kept when
            subsampling. ``None`` means 2000. Each class receives
            ``max_samples // n_classes`` slots, so the result can be smaller
            than ``max_samples`` when a class has fewer members than its share.
        use_subsample: Set to False to always use every sample.
        random_state: Optional seed for a reproducible subsample. When None the
            global ``np.random`` state is used, as before.

    Returns:
        Tuple ``(X, labels)`` where ``X`` has one row per selected sample and
        ``labels`` holds the matching entries of ``features_dict['labels']``.

    Raises:
        KeyError: If ``'labels'`` or a required feature key is missing.
        ValueError: If ``feature_mode`` is unknown (from ``get_feature_keys``).
    """
    print(f"--- Preparing Features for Mode: {feature_mode} (Memory Efficient) ---")

    feature_keys = get_feature_keys(feature_mode)
    print(f"Using features: {feature_keys}")

    # Fail early with a readable message instead of a bare KeyError mid-way.
    missing_keys = [key for key in feature_keys if key not in features_dict]
    if missing_keys:
        raise KeyError(f"Feature arrays missing for mode {feature_mode}: {missing_keys}")

    labels = np.asarray(features_dict['labels'])
    n_samples = len(labels)

    if max_samples is None:
        max_samples = 2000

    # The cap itself is the threshold, so a request such as max_samples=1500 is
    # honoured for a 1800-sample dataset instead of being silently ignored.
    subsampled = bool(use_subsample and n_samples > max_samples)
    if subsampled:
        print(f"Large dataset ({n_samples} samples). Stratified subsampling to ~{max_samples} for visualization.")
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        unique_labels = np.unique(labels)
        samples_per_class = max_samples // len(unique_labels)
        per_class_picks = []
        for label in unique_labels:
            label_indices = np.where(labels == label)[0]
            take = min(len(label_indices), samples_per_class)
            per_class_picks.append(rng.choice(label_indices, take, replace=False))
        indices = np.concatenate(per_class_picks)
        # Shuffle so samples of one class are not stored contiguously.
        rng.shuffle(indices)
    else:
        indices = np.arange(n_samples)

    labels_subsampled = labels[indices]
    n_subsamples = len(labels_subsampled)

    # Flatten each feature array to (n_subsamples, width) in one vectorised
    # step rather than one Python-level flatten/concatenate per sample.
    blocks = []
    for key in feature_keys:
        feature_array = np.asarray(features_dict[key])
        # Fancy indexing copies, so it is skipped when every sample is kept.
        selected = feature_array[indices] if subsampled else feature_array
        # Explicit width keeps the reshape well-defined even for zero rows.
        width = int(np.prod(selected.shape[1:], dtype=np.int64))
        blocks.append(selected.reshape(n_subsamples, width))

    X = np.concatenate(blocks, axis=1)
    del blocks
    gc.collect()

    print(f"Final feature matrix shape: {X.shape}")
    return X, labels_subsampled

# =============================================================================
# --- Visualization and Analysis Functions ---
# =============================================================================

def create_dimensionality_reduction_plots(features_dict, output_path, feature_mode, mode):
    """Render PCA, t-SNE and LDA views of the combined feature matrix.

    A 3x2 figure is produced: PCA explained variance (per component and
    cumulative), a 2-D PCA scatter, a 2-D t-SNE scatter and a 1-D LDA density
    plot; the sixth panel is left blank. The figure is written to
    ``dimensionality_reduction.png`` inside ``output_path``.

    Any exception is caught, reported with a traceback and swallowed, so a
    plotting failure does not abort the rest of the workflow.

    Args:
        features_dict: Feature mapping as accepted by
            ``prepare_features_efficiently``.
        output_path: Existing directory that receives the PNG.
        feature_mode: Feature mode selecting which arrays are combined.
        mode: Currently unused; kept for interface compatibility.

    Returns:
        None.
    """
    print("\n--- Creating Combined Feature Dimensionality Reduction Plots ---")
    fig = None
    try:
        X, y = prepare_features_efficiently(features_dict, feature_mode, max_samples=1500)
        print("Standardizing features...")
        X_scaled = StandardScaler().fit_transform(X)

        class_names = [HEALTHY_CLASS, PARKINSON_CLASS]
        colors = ['#2E86C1', '#E74C3C']
        # One (legend label, colour, row mask) triple per class. The class at
        # position i is matched against label value i.
        class_styles = []
        for i, (name, color) in enumerate(zip(class_names, colors)):
            mask = y == i
            legend_label = f'{name.replace("_", " ").title()} (n={np.sum(mask)})'
            class_styles.append((legend_label, color, mask))

        # 3 rows x 2 columns: variance curves, 2-D projections, LDA density.
        fig, axes = plt.subplots(3, 2, figsize=(16, 18))
        fig.suptitle('Dimensionality Reduction Analysis: HC vs PD', fontsize=16)

        # --- PCA Explained Variance ---
        print("Computing PCA Explained Variance...")
        n_components_full = min(50, X_scaled.shape[0] - 1, X_scaled.shape[1])
        pca_full = PCA(n_components=n_components_full)
        pca_full.fit(X_scaled)
        component_numbers = range(1, n_components_full + 1)

        # Plot 1: Explained Variance per Component
        ax = axes[0, 0]
        ax.plot(component_numbers, pca_full.explained_variance_ratio_, 'bo-', markersize=4)
        ax.set_xlabel('Principal Component')
        ax.set_ylabel('Explained Variance Ratio')
        ax.set_title('PCA: Explained Variance per Component')
        ax.grid(True, alpha=0.5, linestyle=':')

        # Plot 2: Cumulative Explained Variance
        cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
        ax = axes[0, 1]
        ax.plot(component_numbers, cumulative_variance, 'ro-', markersize=4)
        ax.axhline(y=0.95, color='k', linestyle='--', alpha=0.8, label='95% Variance')
        ax.set_xlabel('Number of Components')
        ax.set_ylabel('Cumulative Explained Variance')
        ax.set_title('PCA: Cumulative Explained Variance')
        ax.legend()
        ax.grid(True, alpha=0.5, linestyle=':')

        # --- 2D PCA plot ---
        print("Computing 2D PCA Projection...")
        pca_2d = PCA(n_components=2)
        X_pca_2d = pca_2d.fit_transform(X_scaled)
        ax = axes[1, 0]
        for legend_label, color, mask in class_styles:
            ax.scatter(X_pca_2d[mask, 0], X_pca_2d[mask, 1], c=color, label=legend_label, alpha=0.7, s=20)
        ax.set_xlabel(f'PC1 ({pca_2d.explained_variance_ratio_[0]:.2%} variance)')
        ax.set_ylabel(f'PC2 ({pca_2d.explained_variance_ratio_[1]:.2%} variance)')
        ax.set_title('PCA: 2D Projection')
        ax.legend()
        ax.grid(True, alpha=0.5, linestyle=':')

        # --- t-SNE Analysis ---
        print("Computing t-SNE Projection...")
        n_points = X_scaled.shape[0]
        perplexity = min(30, max(5, (n_points // 5) - 1))
        # scikit-learn rejects perplexity >= n_samples, which the floor of 5
        # above would otherwise hit on very small inputs.
        perplexity = max(1, min(perplexity, n_points - 1))
        # The iteration count is left at the library default (1000): the
        # keyword was renamed from `n_iter` to `max_iter`, so naming either one
        # breaks on some scikit-learn versions.
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, init='pca', learning_rate='auto')
        X_tsne = tsne.fit_transform(X_scaled)
        ax = axes[1, 1]
        for legend_label, color, mask in class_styles:
            ax.scatter(X_tsne[mask, 0], X_tsne[mask, 1], c=color, label=legend_label, alpha=0.7, s=20)
        ax.set_xlabel('t-SNE Dimension 1')
        ax.set_ylabel('t-SNE Dimension 2')
        ax.set_title('t-SNE: 2D Projection')
        ax.legend()
        ax.grid(True, alpha=0.5, linestyle=':')

        # --- LDA Analysis ---
        print("Computing LDA Projection...")
        lda = LinearDiscriminantAnalysis(n_components=1)
        X_lda = lda.fit_transform(X_scaled, y)
        ax = axes[2, 0]
        for legend_label, color, mask in class_styles:
            sns.kdeplot(X_lda[mask].ravel(), ax=ax, color=color, label=legend_label, fill=True, alpha=0.5)
        ax.set_xlabel('LD1')
        ax.set_ylabel('Density')
        ax.set_title('LDA: 1D Projection')
        ax.legend()
        ax.grid(True, alpha=0.5, linestyle=':')

        # Turn off the last unused subplot
        axes[2, 1].axis('off')

        # Leave headroom at the top for the figure-level title.
        fig.tight_layout(rect=[0, 0, 1, 0.96])
        save_path = os.path.join(output_path, "dimensionality_reduction.png")
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Saved combined feature plots to '{os.path.basename(save_path)}'")

    except Exception as e:
        # Best-effort step: report the problem and let the caller continue.
        print(f"Could not generate dimensionality reduction plots: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Close the figure on failure too, otherwise it stays registered with
        # pyplot and keeps its memory.
        if fig is not None:
            plt.close(fig)
        gc.collect()

def create_individual_feature_analysis(features_dict, output_path, feature_mode, mode, max_samples=1000):
    """Render PCA, t-SNE and LDA views separately for each feature type.

    The figure has three rows (PCA scatter, t-SNE scatter, LDA density) and
    one column per feature key of ``feature_mode``. All columns share the same
    random subset of at most ``max_samples`` samples. The figure is written to
    ``individual_feature_analysis.png`` inside ``output_path``.

    Any exception is caught, reported with a traceback and swallowed, so a
    plotting failure does not abort the rest of the workflow.

    Args:
        features_dict: Mapping with ``'labels'`` plus one array per feature
            key, each indexed by sample along axis 0.
        output_path: Existing directory that receives the PNG.
        feature_mode: Feature mode forwarded to ``get_feature_keys``.
        mode: Currently unused; kept for interface compatibility.
        max_samples: Maximum number of randomly drawn samples to plot.

    Returns:
        None.
    """
    print("\n--- Creating Individual Feature Analysis ---")
    fig = None
    try:
        feature_keys = get_feature_keys(feature_mode)
        titles = {'spectrogram': 'Spectrogram', 'mel_spectrogram': 'Mel Spectrogram', 'mfcc': 'MFCC', 'fsc': 'Spectral Centroid'}

        num_features = len(feature_keys)
        labels = features_dict['labels']
        n_samples = len(labels)

        # Plain (non-stratified) random subset drawn from the global RNG.
        indices = np.random.choice(n_samples, min(n_samples, max_samples), replace=False)
        labels_sub = labels[indices]
        n_selected = len(indices)

        class_names = ["Healthy Control", "Parkinson Patient"]
        colors = ['#2E86C1', '#E74C3C']
        # Legend label, colour and row mask per class; identical for every
        # feature column, so they are computed once up front.
        class_styles = []
        for i, (name, color) in enumerate(zip(class_names, colors)):
            mask = labels_sub == i
            class_styles.append((f'{name} (n={np.sum(mask)})', color, mask))

        # Perplexity depends only on the sample count, which is the same for
        # every feature. scikit-learn requires perplexity < n_samples, hence
        # the final clamp for very small inputs.
        perplexity = min(30, max(5, (n_selected // 5) - 1))
        perplexity = max(1, min(perplexity, n_selected - 1))

        # Rows: PCA, t-SNE, LDA. Columns: one per feature key.
        fig, axes = plt.subplots(3, num_features, figsize=(7 * num_features, 15), squeeze=False)
        fig.suptitle('Individual Feature Analysis: PCA, t-SNE, and LDA', fontsize=16)

        for col, feat_key in enumerate(feature_keys):
            feat_title = titles.get(feat_key, feat_key.replace("_", " ").title())
            print(f"  Processing {feat_title}...")

            # Select the subset and flatten every sample to one row at once.
            X = np.asarray(features_dict[feat_key])[indices].reshape(n_selected, -1)
            X_scaled = StandardScaler().fit_transform(X)

            # --- PCA ---
            pca = PCA(n_components=2)
            X_pca = pca.fit_transform(X_scaled)
            ax = axes[0, col]
            for legend_label, color, mask in class_styles:
                ax.scatter(X_pca[mask, 0], X_pca[mask, 1], c=color, label=legend_label, alpha=0.7, s=15)
            ax.set_title(f'PCA: {feat_title}')
            ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%})')
            ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')
            ax.legend()
            ax.grid(True, alpha=0.5, linestyle=':')

            # --- t-SNE ---
            # The iteration count is left at the library default (1000): the
            # keyword was renamed from `n_iter` to `max_iter`, so naming either
            # one breaks on some scikit-learn versions.
            tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, init='pca', learning_rate='auto')
            X_tsne = tsne.fit_transform(X_scaled)
            ax = axes[1, col]
            for legend_label, color, mask in class_styles:
                ax.scatter(X_tsne[mask, 0], X_tsne[mask, 1], c=color, label=legend_label, alpha=0.7, s=15)
            ax.set_title(f't-SNE: {feat_title}')
            ax.set_xlabel('t-SNE Dim 1')
            ax.set_ylabel('t-SNE Dim 2')
            ax.legend()
            ax.grid(True, alpha=0.5, linestyle=':')

            # --- LDA ---
            lda = LinearDiscriminantAnalysis(n_components=1)
            X_lda = lda.fit_transform(X_scaled, labels_sub)
            ax = axes[2, col]
            for legend_label, color, mask in class_styles:
                sns.kdeplot(X_lda[mask].ravel(), ax=ax, color=color, label=legend_label, fill=True, alpha=0.6)
            ax.set_title(f'LDA: {feat_title}')
            ax.set_xlabel('LD1')
            ax.set_ylabel('Density')
            ax.legend()
            ax.grid(True, alpha=0.5, linestyle=':')

            # Free this feature's matrices before the next one is built.
            del X, X_scaled, X_pca, X_tsne, X_lda
            gc.collect()

        # Leave headroom at the top for the figure-level title.
        fig.tight_layout(rect=[0, 0, 1, 0.96])
        save_path = os.path.join(output_path, "individual_feature_analysis.png")
        fig.savefig(save_path, dpi=300, bbox_inches='tight')

        print(f"Saved individual feature analysis to '{os.path.basename(save_path)}'")

    except Exception as e:
        # Best-effort step: report the problem and let the caller continue.
        print(f"Could not generate individual feature analysis: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Close the figure on failure too, otherwise it stays registered with
        # pyplot and keeps its memory.
        if fig is not None:
            plt.close(fig)

def calculate_separability_metrics(features_dict, feature_mode, max_samples=2000):
    """Compute class-separability metrics on the standardized feature matrix.

    Metrics (classes are the rows labelled 0 and 1):
      * ``inter_class_distance``: Euclidean distance between class centroids.
      * ``intra_class_0`` / ``intra_class_1``: mean Euclidean distance of each
        class's samples to its own centroid.
      * ``fisher_ratio``: squared inter-class distance divided by the sum of
        the squared intra-class distances (0.0 when that sum is zero).
      * ``silhouette_score``: scikit-learn silhouette score of the labels.

    Any exception is caught, reported with a traceback, and turned into a
    ``None`` return value.

    Args:
        features_dict: Feature mapping as accepted by
            ``prepare_features_efficiently``.
        feature_mode: Feature mode selecting which arrays are combined.
        max_samples: Sample cap forwarded to ``prepare_features_efficiently``.

    Returns:
        Dict of metric name to float, or None if a class is absent from the
        sample or the computation fails.
    """
    print("\n--- Calculating Separability Metrics ---")
    try:
        X, y = prepare_features_efficiently(features_dict, feature_mode, max_samples=max_samples)
        X_scaled = StandardScaler().fit_transform(X)

        class_0_rows = X_scaled[y == 0]
        class_1_rows = X_scaled[y == 1]

        if len(class_0_rows) == 0 or len(class_1_rows) == 0:
            print("Warning: One or both classes are missing in the sample. Cannot calculate metrics.")
            return None

        # Calculate centroids for each class
        centroid_0 = class_0_rows.mean(axis=0)
        centroid_1 = class_1_rows.mean(axis=0)

        # 1. Inter-class distance (distance between centroids)
        inter_class_distance = float(np.linalg.norm(centroid_0 - centroid_1))

        # 2. Intra-class distances (average distance of samples from their
        #    centroid), computed with one vectorised row-wise norm per class.
        intra_class_0 = float(np.linalg.norm(class_0_rows - centroid_0, axis=1).mean())
        intra_class_1 = float(np.linalg.norm(class_1_rows - centroid_1, axis=1).mean())

        # 3. Fisher's Discriminant Ratio. The denominator is zero only when
        #    every sample coincides with its class centroid.
        denominator = intra_class_0**2 + intra_class_1**2
        fisher_ratio = inter_class_distance**2 / denominator if denominator > 0 else 0.0

        # 4. Silhouette Score
        silhouette = float(silhouette_score(X_scaled, y))

        metrics = {
            'inter_class_distance': inter_class_distance,
            'intra_class_0': intra_class_0,
            'intra_class_1': intra_class_1,
            'fisher_ratio': fisher_ratio,
            'silhouette_score': silhouette
        }

        print(f"Separability Metrics for '{feature_mode}' mode:")
        for key, value in metrics.items():
            print(f"  - {key}: {value:.4f}")

        return metrics

    except Exception as e:
        # Best-effort step: report the problem and signal failure with None.
        print(f"Could not calculate separability metrics: {e}")
        import traceback
        traceback.print_exc()
        return None

# =============================================================================
# --- Main Execution ---
# =============================================================================

def main():
    """Run the complete analysis workflow for the configured dataset.

    Steps: load the feature archive, create the combined and per-feature
    plots, compute the separability metrics and store them as
    ``separability_metrics.csv`` in RESULTS_OUTPUT_PATH.

    Errors are reported on stdout rather than propagated. The closing banner
    states whether the workflow ran through or stopped early, so a failed run
    is not reported as complete.
    """
    banner = "=" * 50
    print(banner)
    print("=== Feature Dimensionality Analysis ===")
    print(banner)
    print(f"Dataset: {DATASET}")
    print(f"Mode: {MODE}")
    print(f"Feature Mode: {FEATURE_MODE}")
    print(f"Output Path: {RESULTS_OUTPUT_PATH}\n")

    finished = False
    try:
        # Load features from disk
        features = load_features(FEATURES_FILE)

        # Create combined and individual feature visualizations
        create_dimensionality_reduction_plots(features, RESULTS_OUTPUT_PATH, FEATURE_MODE, MODE)
        create_individual_feature_analysis(features, RESULTS_OUTPUT_PATH, FEATURE_MODE, MODE)

        # Calculate and save separability metrics (None signals a failure that
        # calculate_separability_metrics has already reported).
        metrics = calculate_separability_metrics(features, FEATURE_MODE)
        if metrics is not None:
            metrics_df = pd.DataFrame([metrics])
            metrics_file = os.path.join(RESULTS_OUTPUT_PATH, "separability_metrics.csv")
            metrics_df.to_csv(metrics_file, index=False)
            print(f"\nMetrics saved to {os.path.basename(metrics_file)}")

        finished = True

    except FileNotFoundError as e:
        print(f"\nERROR: {e}")
        print("Please ensure the feature file exists and the configuration is correct.")
    except Exception as e:
        print(f"\nAn unexpected error occurred: {e}")
        import traceback
        traceback.print_exc()

    finally:
        print("\n" + banner)
        if finished:
            print("=== Analysis Complete ===")
            print(f"All generated files are saved in: {RESULTS_OUTPUT_PATH}")
        else:
            # Do not claim success after an error was reported above.
            print("=== Analysis Aborted ===")
            print(f"Any files generated before the error are in: {RESULTS_OUTPUT_PATH}")
        print(banner)

if __name__ == "__main__":
    main()