In [None]:
import pandas as pd
import os
os.environ['OMP_NUM_THREADS'] = '4'
#os.environ['OMP_NUM_THREADS'] = '1'
from sklearn.decomposition import PCA, FastICA, NMF
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from scipy.linalg import eigh
from numpy.linalg import pinv, norm
from scipy.stats import kurtosis
from itertools import combinations
import time
from sklearn.impute import SimpleImputer
import plotly.express as px
import plotly.graph_objects as go
import kaleido
from sklearn.cluster import KMeans
import warnings
from sklearn.exceptions import ConvergenceWarning

In [None]:
# Visualizations for PCA, ICA and NMF. PCA keeps 95% of the variance; ICA and NMF each pick their own number of components automatically by minimising the Frobenius-norm reconstruction error.

sns.set_theme(style="whitegrid")

# Folder that receives every saved figure. exist_ok=True makes the call safe to
# repeat and avoids the check-then-create race of a separate os.path.exists test.
output_dir = "output_plots"
os.makedirs(output_dir, exist_ok=True)

# Datasets to process, keyed by display name (these variables should be defined elsewhere)
datasets = {
    "Combined Autism": autismcombinedcommon,
    "Twitch": twitch,
    "Accent": accent,
    "Dim512": dim512,
    "Dim1024": dim1024,
}

# One summary dict per processed dataset is appended to this list
results = []

def find_extreme_loadings(components, feature_names, method_name, dataset_name):
    """
    Print the strongest loadings of the leading components.

    At most the first 5 rows of ``components`` are reported. For each of them the 7 entries
    with the largest absolute value are printed, strongest first, next to their feature name.
    """
    shown_rows = min(5, components.shape[0])
    for position, loadings in enumerate(components[:shown_rows], start=1):
        # argsort is ascending, so reverse it and keep the leading seven indices
        strongest = np.argsort(np.abs(loadings))[::-1][:7]
        print(f"\nTop 7 Extreme Loadings for {dataset_name} ({method_name}) - Component {position}:")
        for feature_idx in strongest:
            print(f"{feature_names[feature_idx]}: {loadings[feature_idx]:.4f}")

def apply_dim_reduction(dataset_name, df):
    """
    Run PCA, ICA and NMF on one dataset, record the reconstruction errors and plot the results.

    PCA is fitted on the standardised features with ``PCA(0.95)``. For ICA (standardised data)
    and NMF (min-max scaled data) every component count in ``range(1, n_features // 2)`` is
    fitted and the fit with the smallest Frobenius reconstruction error is kept. A summary
    row is appended to the module-level ``results`` list and ``visualize_dim_reduction`` is
    called with the selected fits.

    Parameters
    ----------
    dataset_name : str
        Label used in printed messages, in the summary row and in the plots.
    df : pandas.DataFrame
        Input table. Columns whose name starts with ``language_`` or ``Group_`` are treated
        as group indicators: they are kept out of the decompositions and passed on for colouring.

    Notes
    -----
    If no ICA or NMF fit succeeds (or the candidate range is empty), the corresponding
    result stays ``None``, the reported error stays ``inf`` and the loadings printout for
    that method is skipped.
    """
    print(f"\n--- Processing {dataset_name} Dataset ---")
    # Exclude the group/dummy columns (id columns have been manually removed)
    exclude_columns = [col for col in df.columns if col.startswith(("language_", "Group_"))]
    color_data = df[exclude_columns]
    features = df.drop(columns=exclude_columns)
    X = features.values
    feature_names = features.columns

    # ----------------------- PCA -----------------------
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Frobenius norm of the standardised input, shared by the PCA and ICA relative errors
    scaled_norm = np.linalg.norm(X_scaled, 'fro')

    pca = PCA(0.95, random_state=42)
    pca_result = pca.fit_transform(X_scaled)
    # Reconstruct the data using inverse transform
    reconstructed = pca.inverse_transform(pca_result)
    # Absolute reconstruction error (Frobenius norm)
    pca_reconstruction_error = np.linalg.norm(X_scaled - reconstructed, 'fro')
    # Relative error = error divided by norm of scaled input
    pca_relative_error = pca_reconstruction_error / scaled_norm
    pca_components = pca.n_components_
    find_extreme_loadings(pca.components_, feature_names, "PCA", dataset_name)

    # Component counts tried by both the ICA and the NMF search
    candidate_counts = range(1, X_scaled.shape[1] // 2)

    # ----------------------- ICA -----------------------
    best_ica_components, best_ica_result, best_ica_error, best_ica_mixing = 1, None, np.inf, None
    for n_components in candidate_counts:
        ica = FastICA(n_components=n_components, random_state=42, max_iter=30000)
        try:
            # Record warnings so a non-converged fit can be reported together with its error
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always", ConvergenceWarning)
                ica_result = ica.fit_transform(X_scaled)
                # Reconstruction using the transpose of the mixing matrix; no mean term is
                # added here (X_scaled was standardised above)
                reconstructed = ica_result @ ica.mixing_.T
                ica_error = np.linalg.norm(X_scaled - reconstructed, 'fro')
                if any(issubclass(warning.category, ConvergenceWarning) for warning in w):
                    print(f"[ICA Warning] n_components={n_components}, n_iter_={ica.n_iter_}, error={ica_error:.4f}")
        except Exception as e:
            # Best effort: a failing component count is reported and skipped
            print(f"ICA failed for n_components={n_components}: {e}")
            continue

        if ica_error < best_ica_error:
            best_ica_error = ica_error
            best_ica_components = n_components
            best_ica_result = ica_result
            best_ica_mixing = ica.mixing_
    if best_ica_mixing is not None:
        find_extreme_loadings(best_ica_mixing.T, feature_names, "ICA", dataset_name)
    # Relative error for ICA computed on the same scaled data:
    best_ica_relative_error = best_ica_error / scaled_norm

    # ----------------------- NMF -----------------------
    # NMF is fitted on min-max scaled data rather than on the standardised data
    minmax_scaler = MinMaxScaler()
    X_non_negative = minmax_scaler.fit_transform(X)

    best_nmf_components = 1
    best_nmf_result = None
    best_nmf_error = np.inf
    best_nmf_components_matrix = None

    for n_components in candidate_counts:
        nmf = NMF(n_components=n_components, max_iter=30000, tol=1e-3, random_state=42, init='nndsvda')
        try:
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always", ConvergenceWarning)
                nmf_result = nmf.fit_transform(X_non_negative)
                reconstructed = np.dot(nmf_result, nmf.components_)
                nmf_error = np.linalg.norm(X_non_negative - reconstructed, 'fro')
                if any(issubclass(warning.category, ConvergenceWarning) for warning in w):
                    print(f"[NMF Warning] n_components={n_components}, n_iter_={nmf.n_iter_}, error={nmf_error:.4f} (tol={nmf.tol})")
        except Exception as e:
            # Best effort: a failing component count is reported and skipped
            print(f"NMF failed for n_components = {n_components}: {e}")
            continue

        if nmf_error < best_nmf_error:
            best_nmf_error = nmf_error
            best_nmf_components = n_components
            best_nmf_result = nmf_result
            best_nmf_components_matrix = nmf.components_
    # Same guard as for ICA: there is nothing to report when no fit was kept
    if best_nmf_components_matrix is not None:
        find_extreme_loadings(best_nmf_components_matrix, feature_names, "NMF", dataset_name)
    # Relative error for NMF computed on nonnegative-scaled data:
    best_nmf_relative_error = best_nmf_error / np.linalg.norm(X_non_negative, 'fro')

    results.append({
        "Dataset": dataset_name,
        "PCA Components": pca_components,
        "PCA Reconstruction Error (abs)": round(pca_reconstruction_error, 4),
        "PCA Relative Error": round(pca_relative_error, 4),
        "ICA Components": best_ica_components,
        "ICA Reconstruction Error (abs)": round(best_ica_error, 4),
        "ICA Relative Error": round(best_ica_relative_error, 4),
        "NMF Components": best_nmf_components,
        "NMF Reconstruction Error (abs)": round(best_nmf_error, 4),
        "NMF Relative Error": round(best_nmf_relative_error, 4)
    })

    visualize_dim_reduction(dataset_name, pca, pca_result, best_ica_result, best_nmf_result,
                            pca.components_, color_data, best_nmf_components, best_ica_mixing,
                            best_nmf_components_matrix)

def visualize_dim_reduction(dataset_name, pca, pca_result, ica_result, nmf_result,
                            pca_components_matrix, color_data, nmf_components, ica_mixing,
                            nmf_components_matrix):
    """
    Plot and save the PCA, ICA and NMF results of one dataset.

    For every method that has a result, three figures are written to ``output_dir`` and shown:
    a 3D scatter plot of the first three components, a heatmap of the first five rows of the
    loading matrix, and a pairplot of up to five components.

    Parameters
    ----------
    dataset_name : str
        Used in titles and file names; a name containing "Accent" or "Autism" switches on
        colouring by group.
    pca, nmf_components
        Not used here; kept so existing calls keep working.
    pca_result, ica_result, nmf_result : 2D array or None
        Transformed data per method. A ``None`` entry is skipped.
    pca_components_matrix, nmf_components_matrix : 2D array or None
        Loading matrices with one row per component.
    color_data : pandas.DataFrame
        Group indicator columns (``language_*`` / ``Group_*``).
    ica_mixing : 2D array or None
        ICA mixing matrix; its transpose is used as the ICA loading matrix.
    """
    methods = {"PCA": pca_result, "ICA": ica_result, "NMF": nmf_result}
    # Loading matrix per method (rows = components); None means "no heatmap"
    loading_matrices = {
        "PCA": pca_components_matrix,
        "ICA": None if ica_mixing is None else ica_mixing.T,
        "NMF": nmf_components_matrix,
    }

    # Combine dummy columns into a single group column.
    # Each entry: (keyword in dataset name, dummy-column prefix, label for all-zero rows, colour count)
    group_settings = (
        ("Accent", "language_", "language_ES", 6),
        ("Autism", "Group_", "Group_Adolescent", 3),
    )
    group_labels = None
    custom_colors = None
    for keyword, prefix, fallback_label, n_colors in group_settings:
        if keyword not in dataset_name:
            continue
        group_cols = [col for col in color_data.columns if col.startswith(prefix)]
        if group_cols:
            def get_group(row):
                # Rows with no indicator set get the fallback label
                if row[group_cols].sum() == 0:
                    return fallback_label
                for col in group_cols:
                    if row[col] == 1:
                        return col
                return None
            color_data = color_data.copy()
            color_data['group'] = color_data.apply(get_group, axis=1)
            group_labels = color_data['group']
            custom_colors = px.colors.qualitative.Plotly[:n_colors]
        # Only the first matching keyword applies
        break

    for method_name, result in methods.items():
        if result is None:
            # No successful fit was passed in for this method
            print(f"\nSkipping {method_name} for {dataset_name}: no result available.")
            continue
        print(f"\nVisualizing {method_name} results for {dataset_name}...")

        # 3D scatter plot using Plotly Express
        axis_names = [f'{method_name} 1', f'{method_name} 2', f'{method_name} 3']
        coords = result[:, :3]
        if coords.shape[1] < 3:
            # Fewer than 3 components: pad with zero columns so the 3D plot still works
            padding = np.zeros((coords.shape[0], 3 - coords.shape[1]))
            coords = np.hstack([coords, padding])
        reduced_df = pd.DataFrame(coords, columns=axis_names)
        scatter_options = dict(
            x=axis_names[0],
            y=axis_names[1],
            z=axis_names[2],
            title=f'3D Scatter Plot for {dataset_name} ({method_name})',
        )
        if group_labels is not None:
            reduced_df['group'] = group_labels.values
            scatter_options.update(color='group', color_discrete_sequence=custom_colors)
        fig = px.scatter_3d(reduced_df, **scatter_options)
        plotly_filename = os.path.join(output_dir, f"{dataset_name}_{method_name}_3D.png")
        fig.write_image(plotly_filename)
        fig.show()

        # Heatmaps for feature loadings
        loadings = loading_matrices[method_name]
        if loadings is not None:
            plt.figure(figsize=(10, 6))
            sns.heatmap(loadings[:5], cmap='coolwarm')
            plt.title(f'{dataset_name} ({method_name}) Feature Loadings')
            filename = os.path.join(output_dir, f"{dataset_name}_{method_name}_heatmap.png")
            plt.savefig(filename, bbox_inches='tight')
            plt.show()

        # --------------------- 2D Pairwise Scatter Plots ---------------------
        n_components = min(result.shape[1], 5)
        pair_df = pd.DataFrame(result[:, :n_components], columns=[f'{method_name} {i+1}' for i in range(n_components)])
        if group_labels is not None:
            pair_df['group'] = group_labels.values
            pairplot = sns.pairplot(pair_df, hue='group', diag_kind="kde", palette=custom_colors)
        else:
            pairplot = sns.pairplot(pair_df, diag_kind="kde")
        plt.suptitle(f'Pairwise 2D Components for {dataset_name} ({method_name})', y=1.02)
        filename = os.path.join(output_dir, f"{dataset_name}_{method_name}_pairplot.png")
        pairplot.fig.savefig(filename, bbox_inches='tight')
        plt.show()

# Run the full PCA/ICA/NMF pipeline on every registered dataset
for dataset_name, df in datasets.items():
    apply_dim_reduction(dataset_name, df)

# Gather the per-dataset summary rows into one table and display it
results_df = pd.DataFrame(results)
print("\n--- Dimensionality Reduction Summary ---", results_df, sep="\n")

# Persist the summary table as a CSV file
results_df.to_csv("dimensionality_reduction_summary.csv", index=False)

In [None]:
# Visualizations for PCA, ICA and NMF. PCA keeps 95% of the variance; ICA and NMF each pick their own number of components automatically by minimising the L1 reconstruction error (not the Frobenius norm).

sns.set_theme(style="whitegrid")

# Folder that receives every saved figure. exist_ok=True makes the call safe to
# repeat and avoids the check-then-create race of a separate os.path.exists test.
output_dir = "output_plots"
os.makedirs(output_dir, exist_ok=True)

# Datasets to process, keyed by display name (these variables should be defined elsewhere)
datasets = {
    "Combined Autism": autismcombinedcommon,
    "Twitch": twitch,
    "Accent": accent,
    "Dim512": dim512,
    "Dim1024": dim1024,
}

# One summary dict per processed dataset is appended to this list
results = []

def find_extreme_loadings(components, feature_names, method_name, dataset_name):
    """
    Report the most extreme loadings of the leading components on stdout.

    Only the first 5 components are considered (all of them when there are fewer). Each
    component contributes a header line followed by its 7 largest-magnitude loadings in
    descending order of absolute value.
    """
    max_components = 5
    top_k = 7
    last_component = min(components.shape[0], max_components)
    for comp_no in range(1, last_component + 1):
        weights = components[comp_no - 1, :]
        # Ascending order by magnitude, then flipped so the strongest entry comes first
        by_magnitude = np.argsort(np.abs(weights))
        strongest = by_magnitude[::-1][:top_k]
        header = f"\nTop 7 Extreme Loadings for {dataset_name} ({method_name}) - Component {comp_no}:"
        print(header)
        for feature_idx in strongest:
            print(f"{feature_names[feature_idx]}: {weights[feature_idx]:.4f}")

def apply_dim_reduction(dataset_name, df):
    """
    Run PCA, ICA and NMF on one dataset, record the L1 reconstruction errors and plot the results.

    PCA is fitted on the standardised features with ``PCA(0.95)``. For ICA (standardised data)
    and NMF (min-max scaled data) every component count in ``range(1, n_features // 2)`` is
    fitted and the fit with the smallest L1 reconstruction error (sum of absolute differences)
    is kept. A summary row is appended to the module-level ``results`` list and
    ``visualize_dim_reduction`` is called with the selected fits.

    Parameters
    ----------
    dataset_name : str
        Label used in printed messages, in the summary row and in the plots.
    df : pandas.DataFrame
        Input table. Columns whose name starts with ``language_`` or ``Group_`` are treated
        as group indicators: they are kept out of the decompositions and passed on for colouring.

    Notes
    -----
    If no ICA or NMF fit succeeds (or the candidate range is empty), the corresponding
    result stays ``None``, the reported error stays ``inf`` and the loadings printout for
    that method is skipped.
    """
    print(f"\n--- Processing {dataset_name} Dataset ---")
    # Exclude the group/dummy columns (id columns have been manually removed)
    exclude_columns = [col for col in df.columns if col.startswith(("language_", "Group_"))]
    color_data = df[exclude_columns]
    features = df.drop(columns=exclude_columns)
    X = features.values
    feature_names = features.columns

    # ----------------------- PCA -----------------------
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # L1 mass of the standardised input, shared by the PCA and ICA relative errors
    scaled_l1 = np.sum(np.abs(X_scaled))

    pca = PCA(0.95, random_state=42)
    pca_result = pca.fit_transform(X_scaled)
    # Reconstruct the data using inverse transform
    reconstructed = pca.inverse_transform(pca_result)
    # Calculate L1 reconstruction error (sum of absolute differences)
    pca_reconstruction_error = np.sum(np.abs(X_scaled - reconstructed))
    # Relative error for PCA is based on the scaled data
    pca_relative_error = pca_reconstruction_error / scaled_l1
    pca_components = pca.n_components_
    find_extreme_loadings(pca.components_, feature_names, "PCA", dataset_name)

    # Component counts tried by both the ICA and the NMF search
    candidate_counts = range(1, X_scaled.shape[1] // 2)

    # ----------------------- ICA -----------------------
    best_ica_components, best_ica_result, best_ica_error, best_ica_mixing = 1, None, np.inf, None
    for n_components in candidate_counts:
        ica = FastICA(n_components=n_components, random_state=42, max_iter=30000)
        try:
            # Record warnings so a non-converged fit can be reported together with its error
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always", ConvergenceWarning)
                ica_result = ica.fit_transform(X_scaled)
                # Reconstruction using the transpose of the mixing matrix; no mean term is
                # added here (X_scaled was standardised above)
                reconstructed = ica_result @ ica.mixing_.T
                # Calculate L1 reconstruction error
                ica_error = np.sum(np.abs(X_scaled - reconstructed))
                if any(issubclass(warning.category, ConvergenceWarning) for warning in w):
                    print(f"[ICA Warning] n_components={n_components}, n_iter_={ica.n_iter_}, error={ica_error:.4f}")
        except Exception as e:
            # Best effort: a failing component count is reported and skipped
            print(f"ICA failed for n_components={n_components}: {e}")
            continue

        if ica_error < best_ica_error:
            best_ica_error = ica_error
            best_ica_components = n_components
            best_ica_result = ica_result
            best_ica_mixing = ica.mixing_
    if best_ica_mixing is not None:
        find_extreme_loadings(best_ica_mixing.T, feature_names, "ICA", dataset_name)
    # Relative error for ICA on the scaled data
    best_ica_relative_error = best_ica_error / scaled_l1

    # ----------------------- NMF -----------------------
    # NMF is fitted on min-max scaled data rather than on the standardised data
    minmax_scaler = MinMaxScaler()
    X_non_negative = minmax_scaler.fit_transform(X)

    best_nmf_components = 1
    best_nmf_result = None
    best_nmf_error = np.inf
    best_nmf_components_matrix = None

    for n_components in candidate_counts:
        nmf = NMF(n_components=n_components, max_iter=30000, tol=1e-3, random_state=42, init='nndsvda')
        try:
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always", ConvergenceWarning)
                nmf_result = nmf.fit_transform(X_non_negative)
                reconstructed = np.dot(nmf_result, nmf.components_)
                # Calculate L1 reconstruction error for NMF (on nonnegative data)
                nmf_error = np.sum(np.abs(X_non_negative - reconstructed))
                if any(issubclass(warning.category, ConvergenceWarning) for warning in w):
                    print(f"[NMF Warning] n_components={n_components}, n_iter_={nmf.n_iter_}, error={nmf_error:.4f} (tol={nmf.tol})")
        except Exception as e:
            # Best effort: a failing component count is reported and skipped
            print(f"NMF failed for n_components = {n_components}: {e}")
            continue

        if nmf_error < best_nmf_error:
            best_nmf_error = nmf_error
            best_nmf_components = n_components
            best_nmf_result = nmf_result
            best_nmf_components_matrix = nmf.components_
    # Same guard as for ICA: there is nothing to report when no fit was kept
    if best_nmf_components_matrix is not None:
        find_extreme_loadings(best_nmf_components_matrix, feature_names, "NMF", dataset_name)
    # Relative error for NMF based on the nonnegative data:
    best_nmf_relative_error = best_nmf_error / np.sum(np.abs(X_non_negative))

    results.append({
        "Dataset": dataset_name,
        "PCA Components": pca_components,
        "PCA Reconstruction Error": round(pca_reconstruction_error, 4),
        "PCA Relative Error": round(pca_relative_error, 4),
        "ICA Components": best_ica_components,
        "ICA Reconstruction Error": round(best_ica_error, 4),
        "ICA Relative Error": round(best_ica_relative_error, 4),
        "NMF Components": best_nmf_components,
        "NMF Reconstruction Error": round(best_nmf_error, 4),
        "NMF Relative Error": round(best_nmf_relative_error, 4)
    })

    visualize_dim_reduction(dataset_name, pca, pca_result, best_ica_result, best_nmf_result,
                            pca.components_, color_data, best_nmf_components, best_ica_mixing,
                            best_nmf_components_matrix)

def visualize_dim_reduction(dataset_name, pca, pca_result, ica_result, nmf_result,
                            pca_components_matrix, color_data, nmf_components, ica_mixing,
                            nmf_components_matrix):
    """
    Plot and save the PCA, ICA and NMF results of one dataset (file names carry an ``_L1`` suffix).

    For every method that has a result, three figures are written to ``output_dir`` and shown:
    a 3D scatter plot of the first three components, a heatmap of the first five rows of the
    loading matrix, and a pairplot of up to five components.

    Parameters
    ----------
    dataset_name : str
        Used in titles and file names; a name containing "Accent" or "Autism" switches on
        colouring by group.
    pca, nmf_components
        Not used here; kept so existing calls keep working.
    pca_result, ica_result, nmf_result : 2D array or None
        Transformed data per method. A ``None`` entry is skipped.
    pca_components_matrix, nmf_components_matrix : 2D array or None
        Loading matrices with one row per component.
    color_data : pandas.DataFrame
        Group indicator columns (``language_*`` / ``Group_*``).
    ica_mixing : 2D array or None
        ICA mixing matrix; its transpose is used as the ICA loading matrix.
    """
    methods = {"PCA": pca_result, "ICA": ica_result, "NMF": nmf_result}
    # Loading matrix per method (rows = components); None means "no heatmap".
    # For NMF this is the components matrix, not the transformed data.
    loading_matrices = {
        "PCA": pca_components_matrix,
        "ICA": None if ica_mixing is None else ica_mixing.T,
        "NMF": nmf_components_matrix,
    }

    # Combine dummy columns into a single group column.
    # Each entry: (keyword in dataset name, dummy-column prefix, label for all-zero rows, colour count)
    group_settings = (
        ("Accent", "language_", "language_ES", 6),
        ("Autism", "Group_", "Group_Adolescent", 3),
    )
    group_labels = None
    custom_colors = None
    for keyword, prefix, fallback_label, n_colors in group_settings:
        if keyword not in dataset_name:
            continue
        group_cols = [col for col in color_data.columns if col.startswith(prefix)]
        if group_cols:
            def get_group(row):
                # Rows with no indicator set get the fallback label
                if row[group_cols].sum() == 0:
                    return fallback_label
                for col in group_cols:
                    if row[col] == 1:
                        return col
                return None
            color_data = color_data.copy()
            color_data['group'] = color_data.apply(get_group, axis=1)
            group_labels = color_data['group']
            custom_colors = px.colors.qualitative.Plotly[:n_colors]
        # Only the first matching keyword applies
        break

    for method_name, result in methods.items():
        if result is None:
            # No successful fit was passed in for this method
            print(f"\nSkipping {method_name} for {dataset_name}: no result available.")
            continue
        print(f"\nVisualizing {method_name} results for {dataset_name}...")

        # 3D scatter plot using Plotly Express
        axis_names = [f'{method_name} 1', f'{method_name} 2', f'{method_name} 3']
        coords = result[:, :3]
        if coords.shape[1] < 3:
            # If the result has fewer than 3 dimensions, pad with zeros.
            padding = np.zeros((coords.shape[0], 3 - coords.shape[1]))
            coords = np.hstack([coords, padding])
        reduced_df = pd.DataFrame(coords, columns=axis_names)
        scatter_options = dict(
            x=axis_names[0],
            y=axis_names[1],
            z=axis_names[2],
            title=f'3D Scatter Plot for {dataset_name} ({method_name})',
        )
        if group_labels is not None:
            reduced_df['group'] = group_labels.values
            scatter_options.update(color='group', color_discrete_sequence=custom_colors)
        fig = px.scatter_3d(reduced_df, **scatter_options)
        plotly_filename = os.path.join(output_dir, f"{dataset_name}_{method_name}_3D_L1.png")
        fig.write_image(plotly_filename)
        fig.show()

        # Heatmaps for feature loadings
        loadings = loading_matrices[method_name]
        if loadings is not None:
            plt.figure(figsize=(10, 6))
            sns.heatmap(loadings[:5], cmap='coolwarm')
            plt.title(f'{dataset_name} ({method_name}) Feature Loadings')
            filename = os.path.join(output_dir, f"{dataset_name}_{method_name}_heatmap_L1.png")
            plt.savefig(filename, bbox_inches='tight')
            plt.show()

        # --------------------- 2D Pairwise Scatter Plots ---------------------
        n_components = min(result.shape[1], 5)
        pair_df = pd.DataFrame(result[:, :n_components], columns=[f'{method_name} {i+1}' for i in range(n_components)])
        if group_labels is not None:
            pair_df['group'] = group_labels.values
            pairplot = sns.pairplot(pair_df, hue='group', diag_kind="kde", palette=custom_colors)
        else:
            pairplot = sns.pairplot(pair_df, diag_kind="kde")
        plt.suptitle(f'Pairwise 2D Components for {dataset_name} ({method_name})', y=1.02)
        filename = os.path.join(output_dir, f"{dataset_name}_{method_name}_pairplot_L1.png")
        pairplot.fig.savefig(filename, bbox_inches='tight')
        plt.show()

# Run the full PCA/ICA/NMF pipeline (L1 error variant) on every registered dataset
for dataset_name, df in datasets.items():
    apply_dim_reduction(dataset_name, df)

# Gather the per-dataset summary rows into one table and display it
results_df = pd.DataFrame(results)
print("\n--- Dimensionality Reduction Summary ---", results_df, sep="\n")

# Persist the summary table as a CSV file (with _L1 in the filename)
results_df.to_csv("dimensionality_reduction_summary_L1.csv", index=False)