In [None]:
"""
This contains code to test orthogonality of expert specialization.
"""
None

In [None]:
"""
Imports
"""
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import numpy as np
import scipy
import cupy
import cuml

import importlib
import gc
import pickle

from tqdm import tqdm
from termcolor import colored
import plotly.express as px

from utils.memory import check_memory, clear_all_cuda_memory
from utils.quantize import compare_bf16_fp16_batched

main_device = 'cuda:0'
seed = 1234
clear_all_cuda_memory()
check_memory()

## Load base model

In [None]:
"""
Load the base tokenizer/model
"""
model_id = 'allenai/OLMoE-1B-7B-0125-Instruct'
model_prefix = 'olmoe'
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token = False, add_bos_token = False, padding_side = 'left', trust_remote_code = True)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype = torch.bfloat16, trust_remote_code = True).cuda().eval()

In [None]:
"""
Load dataset
"""
def load_data(model_prefix):
    all_pre_mlp_hs = torch.load(f'data/{model_prefix}/all-pre-mlp-hidden-states.pt')
    all_expert_outputs = torch.load(f'data/{model_prefix}/all-expert-outputs.pt')
    with open(f'data/{model_prefix}/metadata.pkl', 'rb') as f:
        metadata = pickle.load(f)
    
    return all_pre_mlp_hs, all_expert_outputs, metadata['sample_df'], metadata['topk_df'], metadata['all_pre_mlp_hidden_states_layers'], metadata['all_expert_outputs_layers']

all_pre_mlp_hs_import, all_expert_outputs_import, sample_df_import, topk_df_import, act_map, expert_map = load_data(model_prefix)

In [None]:
"""
Let's clean up the mappings here. We'll get everything to a sample_ix level first.
"""
sample_df_raw =\
    sample_df_import\
    .assign(sample_ix = lambda df: df.groupby(['batch_ix', 'sequence_ix', 'token_ix']).ngroup())\
    .assign(seq_id = lambda df: df.groupby(['batch_ix', 'sequence_ix']).ngroup())\
    .reset_index()

topk_df =\
    topk_df_import\
    .merge(sample_df_raw[['sample_ix', 'batch_ix', 'sequence_ix', 'token_ix']], how = 'inner', on = ['sequence_ix', 'token_ix', 'batch_ix'])\
    .drop(columns = ['sequence_ix', 'token_ix', 'batch_ix'])

sample_df =\
    sample_df_raw\
    .drop(columns = ['batch_ix', 'sequence_ix'])

del sample_df_import, topk_df_import
display(topk_df)
display(sample_df)

In [None]:
"""
Convert activations to fp16 (for compatability) + dict
"""
all_pre_mlp_hs = all_pre_mlp_hs_import.to(torch.float16)
compare_bf16_fp16_batched(all_pre_mlp_hs_import, all_pre_mlp_hs)
del all_pre_mlp_hs_import
all_pre_mlp_hs = {layer_ix: all_pre_mlp_hs[:, save_ix, :] for save_ix, layer_ix in enumerate(act_map)}

all_expert_outputs = all_expert_outputs_import.to(torch.float16)
compare_bf16_fp16_batched(all_expert_outputs_import, all_expert_outputs)
del all_expert_outputs_import
all_expert_outputs = {layer_ix: all_expert_outputs[:, save_ix, :, :] for save_ix, layer_ix in enumerate(expert_map)}

gc.collect()

## Compare activation clusters of single expert

In [None]:
"""
Let's filter for topk = 1 only
"""
topk1_df =\
    topk_df\
    .pipe(lambda df: df[df['topk_ix'] == 1])

topk1_df

In [None]:
"""
Visualize clusters
"""
layer_ix = 7
expert_id = 24


relevant_sample_ids =\
    topk1_df\
    .pipe(lambda df: df[df['layer_ix'] == layer_ix])\
    .pipe(lambda df: df[df['expert'] == expert_id])\
    .sort_values(by = 'sample_ix', ascending = True)\
    ['sample_ix']\
    .tolist()

# Get expert IDs of previous layer
prev_experts_df =\
    topk1_df\
    .pipe(lambda df: df[df['layer_ix'] == layer_ix - 1])\
    .pipe(lambda df: df[df['sample_ix'].isin(relevant_sample_ids)])\
    .sort_values(by = 'sample_ix', ascending = True)\
    .rename(columns = {'expert': 'prev_expert'})\
    [['sample_ix', 'prev_expert']]

# Get sample dfs of relevant sample IDs, include expert IDs of previous layer
relevant_samples_df =\
    sample_df[sample_df['sample_ix'].isin(relevant_sample_ids)]\
    .merge(prev_experts_df, on = 'sample_ix', how = 'inner')

display(relevant_samples_df)

relevant_pre_mlp_hs = all_pre_mlp_hs[layer_ix][relevant_sample_ids]


In [None]:
"""
Plot PCA + UMAP, color by previous expert ID
"""
def reduce_pca(input_tensor: torch.Tensor, n_components = 2):
    hs_cupy = cupy.asarray(input_tensor.to(torch.float32))
    model = cuml.PCA(
        iterated_power = 100,
        n_components = n_components,
        verbose = True
    )
    model.fit(hs_cupy)
    # print(f'Explained variance ratio: {model.explained_variance_ratio_}')
    print(f'Cumulative variance ratio: {np.cumsum(model.explained_variance_ratio_)[-1]}')
    pred = cupy.asnumpy(model.fit_transform(hs_cupy))
    clear_all_cuda_memory(False)
    return pred

def reduce_umap(input_tensor: torch.Tensor, n_components = 2, metric = 'cosine', n_epochs = 200):
    hs_cupy = cupy.asarray(input_tensor.to(torch.float32))
    model = cuml.UMAP(
        n_components = n_components, 
        n_neighbors = 20, # 15 for default, smaller = more local data preserved [2 - 100]
        metric = metric, # euclidean, cosine, manhattan, l2, hamming
        min_dist = 0.5, # 0.1 by default, effective distance between embedded points
        n_epochs = n_epochs, # 200 by default for large datasets
        random_state = 123, # Allow parallelism
        verbose = False
    )
    pred = cupy.asnumpy(model.fit_transform(hs_cupy))
    clear_all_cuda_memory(False)
    return pred

def plot_reduction(relevant_pre_mlp_hs, relevant_samples_df, reducer, *args, **kwargs):
    reduce_res = reducer(relevant_pre_mlp_hs, *args, **kwargs)
    px.scatter(
        pd.concat([pd.DataFrame({'d1': reduce_res[:, 0], 'd2': reduce_res[:, 1]}), relevant_samples_df], axis = 1)\
            .sample(5000)\
            .assign(prev_expert = lambda df: df['prev_expert'].astype(str)),
        x = 'd1', y = 'd2', color = 'prev_expert', hover_data = ['token']
    ).show()

plot_reduction(relevant_pre_mlp_hs, relevant_samples_df, reduce_pca, 2)
plot_reduction(relevant_pre_mlp_hs, relevant_samples_df, reduce_umap, 2, 'cosine', 500)

In [None]:
"""
Plot PCA + UMAP, but this time include token samples from outside this expert
"""
nonrelevant_sample_ids =\
    topk1_df\
    .pipe(lambda df: df[df['layer_ix'] == layer_ix])\
    .pipe(lambda df: df[df['expert'] != expert_id])\
    .sort_values(by = 'sample_ix', ascending = True)\
    ['sample_ix']\
    .tolist()

# Get sample dfs of relevant sample IDs, include expert IDs of previous layer
nonrelevant_samples_df =\
    sample_df[sample_df['sample_ix'].isin(nonrelevant_sample_ids)]

nonrelevant_pre_mlp_hs = all_pre_mlp_hs[layer_ix][nonrelevant_sample_ids]

def plot_reduction(relevant_pre_mlp_hs, nonrelevant_pre_mlp_hs, relevant_samples_df, nonrelevant_samples_df, reducer, *args, **kwargs):
    
    reduce_res = reducer(torch.concat([relevant_pre_mlp_hs, nonrelevant_pre_mlp_hs], dim = 0), *args, **kwargs)

    merged_samples_df = pd.concat([
        relevant_samples_df.assign(prev_expert = lambda df: df['prev_expert'].astype(str)), 
        nonrelevant_samples_df.assign(prev_expert = lambda df: 'NA')
    ]).reset_index(drop = True)

    merged_df = pd.concat([pd.DataFrame({'d1': reduce_res[:, 0], 'd2': reduce_res[:, 1]}), merged_samples_df], axis = 1)

    balanced_merged_df = pd.concat([
        merged_df[merged_df['prev_expert'] == 'NA'].sample(10_000),
        merged_df[merged_df['prev_expert'] != 'NA'].sample(10_000)
    ])

    px.scatter(
        balanced_merged_df,
        x = 'd1', y = 'd2', color = 'prev_expert', hover_data = ['token']
    ).show()

plot_reduction(relevant_pre_mlp_hs, nonrelevant_pre_mlp_hs, relevant_samples_df, nonrelevant_samples_df, reduce_pca, 2)
plot_reduction(relevant_pre_mlp_hs, nonrelevant_pre_mlp_hs, relevant_samples_df, nonrelevant_samples_df, reduce_umap, 2, 'cosine', 200)

In [None]:
"""
Now let's measure orthogonality
"""

relevant_pre_mlp_hs # A tensor of n_samples_routed_to_specific_expert x D shape containing pre-MLP activations
relevant_samples_df # A df with n_samples_routed_to_specific_expert length, including an identifier column 'sample_ix', and a seperate column 'prev_expert'


torch.Size([13958, 2048])


Unnamed: 0,index,token_ix,token_id,output_id,output_prob,token,source,sample_ix,seq_id,prev_expert
0,7,866,387,253,0.85,at,en,7,0,45
1,12,871,273,13662,0.92,of,en,12,0,28
2,58,917,273,253,0.23,of,en,58,0,28
3,59,918,253,6516,0.13,the,en,59,0,4
4,61,920,273,13662,0.79,of,en,61,0,54
...,...,...,...,...,...,...,...,...,...,...
13953,6160,828,372,3641,0.79,de,es,812572,1799,0
13954,6161,829,826,39368,0.76,la,es,812573,1799,4
13955,6310,978,372,2340,0.71,de,es,812722,1799,63
13956,6351,1019,372,877,0.15,de,es,812763,1799,46


In [None]:
      
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from scipy.linalg import subspace_angles
from collections import defaultdict
import math

# Ensure reproducibility if needed
# torch.manual_seed(42)
# np.random.seed(42)

def analyze_expert_input_clusters(
    relevant_pre_mlp_hs: torch.Tensor,
    relevant_samples_df: pd.DataFrame,
    cluster_column: str = 'prev_expert',
    min_cluster_size: int = 10,
    n_pca_components_subspace: int = 10 # Number of components for subspace analysis
) -> dict:
    """
    Analyzes the geometric relationships between clusters of hidden states
    activating a specific expert, where clusters are defined by a categorical column.

    Args:
        relevant_pre_mlp_hs: Tensor of shape (N, D) containing hidden states.
        relevant_samples_df: DataFrame of shape (N, ...) containing metadata,
                             including the cluster_column and 'sample_ix'.
                             Assumes DataFrame index aligns with tensor rows OR
                             'sample_ix' can be used if alignment isn't guaranteed
                             (this implementation assumes index alignment for simplicity).
        cluster_column: Name of the column in relevant_samples_df to use for grouping.
        min_cluster_size: Minimum number of samples required to analyze a cluster.
        n_pca_components_subspace: Number of PCA components to define subspaces.

    Returns:
        A dictionary containing various geometric analysis results.
    """
    if relevant_pre_mlp_hs.shape[0] != len(relevant_samples_df):
        raise ValueError("Mismatch between tensor rows and DataFrame length.")

    if not torch.is_tensor(relevant_pre_mlp_hs):
         relevant_pre_mlp_hs = torch.tensor(relevant_pre_mlp_hs)

    # Ensure float32 for calculations
    hs_data = relevant_pre_mlp_hs.float().cpu().numpy() # Work with numpy for sklearn/scipy
    D = hs_data.shape[1]

    results = {
        'cluster_stats': {},
        'pairwise_centroid_similarity': {},
        'pairwise_pc1_similarity': {},
        'pairwise_subspace_angles': {},
        'separation_metrics': {}
    }

    # 1. Group hidden states by cluster label
    grouped_hs = defaultdict(list)
    cluster_labels = relevant_samples_df[cluster_column].unique()

    print(f"Found {len(cluster_labels)} potential clusters based on '{cluster_column}'.")

    valid_clusters = {}
    for label in cluster_labels:
        # Get indices for the current cluster label
        indices = relevant_samples_df.index[relevant_samples_df[cluster_column] == label].tolist()
        if len(indices) >= min_cluster_size:
            cluster_hs = hs_data[indices, :]
            valid_clusters[label] = {
                'indices': indices,
                'hs': cluster_hs,
                'size': len(indices)
            }
        else:
             print(f"  Skipping cluster '{label}' (size {len(indices)} < {min_cluster_size})")


    if len(valid_clusters) < 2:
        print("Need at least two valid clusters to perform pairwise analysis.")
        return results

    print(f"Analyzing {len(valid_clusters)} valid clusters.")
    valid_cluster_labels = list(valid_clusters.keys())

    # 2. Calculate per-cluster stats (Centroids, PCA)
    cluster_centroids = {}
    cluster_pcas = {} # Store fitted PCA objects
    cluster_pc1s = {}
    cluster_top_k_pcs = {}
    cluster_avg_intra_dist = {}

    overall_mean = hs_data.mean(axis=0) # For comparison if needed

    for label in valid_cluster_labels:
        cluster_data = valid_clusters[label]['hs']

        # Centroid
        centroid = cluster_data.mean(axis=0)
        cluster_centroids[label] = centroid

        # Intra-cluster distance (average distance to centroid)
        distances_to_centroid = np.linalg.norm(cluster_data - centroid, axis=1)
        cluster_avg_intra_dist[label] = distances_to_centroid.mean()

        # PCA
        try:
            # Use min(n_samples, D) components, ensure enough for subspace analysis
            n_components = min(cluster_data.shape[0], D, max(n_pca_components_subspace, 1))
            if n_components == 0: continue # Should not happen with min_cluster_size > 0

            pca = PCA(n_components=n_components, svd_solver='auto')
            pca.fit(cluster_data)
            cluster_pcas[label] = pca
            cluster_pc1s[label] = pca.components_[0] # First principal component
            # Store top-k components for subspace analysis
            effective_k = min(n_pca_components_subspace, pca.n_components_)
            cluster_top_k_pcs[label] = pca.components_[:effective_k].T # Store as D x k

        except Exception as e:
             print(f"  PCA failed for cluster '{label}': {e}")
             # Handle cases where PCA might fail (e.g., data is rank deficient)


        results['cluster_stats'][label] = {
            'size': valid_clusters[label]['size'],
            'avg_intra_dist_to_centroid': cluster_avg_intra_dist.get(label, np.nan),
            'pca_explained_variance_ratio': cluster_pcas[label].explained_variance_ratio_.tolist() if label in cluster_pcas else []
        }


    # 3. Calculate pairwise inter-cluster relationships
    all_centroid_sims = []
    all_pc1_sims = []
    all_subspace_angles_deg = [] # Store max angle in degrees
    all_inter_centroid_dists = []

    for i in range(len(valid_cluster_labels)):
        for j in range(i + 1, len(valid_cluster_labels)):
            label_i = valid_cluster_labels[i]
            label_j = valid_cluster_labels[j]
            pair_key = f"{label_i}_vs_{label_j}"

            # Centroid Similarity
            if label_i in cluster_centroids and label_j in cluster_centroids:
                sim = cosine_similarity(
                    cluster_centroids[label_i].reshape(1, -1),
                    cluster_centroids[label_j].reshape(1, -1)
                )[0, 0]
                results['pairwise_centroid_similarity'][pair_key] = sim
                all_centroid_sims.append(abs(sim)) # Use absolute value for average orthogonality check

            # PC1 Similarity
            if label_i in cluster_pc1s and label_j in cluster_pc1s:
                sim = cosine_similarity(
                    cluster_pc1s[label_i].reshape(1, -1),
                    cluster_pc1s[label_j].reshape(1, -1)
                )[0, 0]
                results['pairwise_pc1_similarity'][pair_key] = sim
                all_pc1_sims.append(abs(sim))

            # Subspace Angles
            if label_i in cluster_top_k_pcs and label_j in cluster_top_k_pcs:
                 # Ensure both subspaces have the same dimension k for scipy
                 k_i = cluster_top_k_pcs[label_i].shape[1]
                 k_j = cluster_top_k_pcs[label_j].shape[1]
                 common_k = min(k_i, k_j, n_pca_components_subspace)

                 if common_k > 0:
                     subspace_i = cluster_top_k_pcs[label_i][:, :common_k]
                     subspace_j = cluster_top_k_pcs[label_j][:, :common_k]
                     # subspace_angles returns angles in radians, largest first
                     angles_rad = subspace_angles(subspace_i, subspace_j)
                     # Max angle: Larger angle means more orthogonal (closer to pi/2 or 90 deg)
                     max_angle_deg = np.rad2deg(angles_rad[0])
                     results['pairwise_subspace_angles'][pair_key] = {
                         'max_angle_degrees': max_angle_deg,
                         'all_angles_degrees': np.rad2deg(angles_rad).tolist()
                     }
                     all_subspace_angles_deg.append(max_angle_deg)
                 else:
                     results['pairwise_subspace_angles'][pair_key] = {'error': 'Subspace dimension is zero'}


            # Inter-Centroid Distance
            if label_i in cluster_centroids and label_j in cluster_centroids:
                dist = np.linalg.norm(cluster_centroids[label_i] - cluster_centroids[label_j])
                all_inter_centroid_dists.append(dist)

    # 4. Calculate overall separation metrics
    avg_inter_centroid_dist = np.mean(all_inter_centroid_dists) if all_inter_centroid_dists else np.nan
    avg_intra_dist = np.mean(list(cluster_avg_intra_dist.values())) if cluster_avg_intra_dist else np.nan

    results['separation_metrics']['average_inter_centroid_distance'] = avg_inter_centroid_dist
    results['separation_metrics']['average_intra_cluster_dist_to_centroid'] = avg_intra_dist
    if not np.isnan(avg_intra_dist) and avg_intra_dist > 1e-9: # Avoid division by zero
         results['separation_metrics']['inter_intra_distance_ratio'] = avg_inter_centroid_dist / avg_intra_dist
    else:
         results['separation_metrics']['inter_intra_distance_ratio'] = np.nan

    # Add average similarities (lower means more orthogonal on average)
    results['summary_orthogonality'] = {
        'average_abs_centroid_similarity': np.mean(all_centroid_sims) if all_centroid_sims else np.nan,
        'average_abs_pc1_similarity': np.mean(all_pc1_sims) if all_pc1_sims else np.nan,
        'average_max_subspace_angle_deg': np.mean(all_subspace_angles_deg) if all_subspace_angles_deg else np.nan,
    }

    print("\nAnalysis Summary:")
    print(f"  Avg Abs Centroid Similarity: {results['summary_orthogonality']['average_abs_centroid_similarity']:.4f} (Lower is more orthogonal)")
    print(f"  Avg Abs PC1 Similarity: {results['summary_orthogonality']['average_abs_pc1_similarity']:.4f} (Lower is more orthogonal)")
    print(f"  Avg Max Subspace Angle (Deg): {results['summary_orthogonality']['average_max_subspace_angle_deg']:.2f} (Higher is more orthogonal, max 90)")
    print(f"  Inter/Intra Distance Ratio: {results['separation_metrics']['inter_intra_distance_ratio']:.4f} (Higher is better separated)")

    return results

analyze_expert_input_clusters(relevant_pre_mlp_hs, relevant_samples_df, 'prev_expert', 10)

Found 53 potential clusters based on 'prev_expert'.
  Skipping cluster '60' (size 6 < 10)
  Skipping cluster '22' (size 2 < 10)
  Skipping cluster '62' (size 1 < 10)
  Skipping cluster '1' (size 4 < 10)
  Skipping cluster '52' (size 3 < 10)
  Skipping cluster '39' (size 5 < 10)
  Skipping cluster '17' (size 4 < 10)
  Skipping cluster '27' (size 1 < 10)
Analyzing 45 valid clusters.

Analysis Summary:
  Avg Abs Centroid Similarity: 0.5931 (Lower is more orthogonal)
  Avg Abs PC1 Similarity: 0.1128 (Lower is more orthogonal)
  Avg Max Subspace Angle (Deg): 89.31 (Higher is more orthogonal, max 90)
  Inter/Intra Distance Ratio: 0.4003 (Higher is better separated)


{'cluster_stats': {np.int64(45): {'size': 441,
   'avg_intra_dist_to_centroid': np.float32(21.44465),
   'pca_explained_variance_ratio': [0.05160859599709511,
    0.032154325395822525,
    0.018006838858127594,
    0.017352556809782982,
    0.015039470978081226,
    0.014631594531238079,
    0.012786870822310448,
    0.01169520616531372,
    0.011536695063114166,
    0.010659563355147839]},
  np.int64(28): {'size': 396,
   'avg_intra_dist_to_centroid': np.float32(21.886295),
   'pca_explained_variance_ratio': [0.03277204558253288,
    0.02342105098068714,
    0.02010740526020527,
    0.01715378277003765,
    0.016366254538297653,
    0.015381968580186367,
    0.013098567724227905,
    0.012781770899891853,
    0.01218391116708517,
    0.011295720003545284]},
  np.int64(4): {'size': 3310,
   'avg_intra_dist_to_centroid': np.float32(21.161522),
   'pca_explained_variance_ratio': [0.034403394907712936,
    0.02046547457575798,
    0.012653369456529617,
    0.011198612861335278,
    0.0105

In [116]:
"""
Compare against random intra-expert assignments
"""
def run_baseline_analysis(
    original_hs_tensor: torch.Tensor,
    original_samples_df: pd.DataFrame,
    cluster_column: str = 'prev_expert',
    n_shuffles: int = 100,
    min_cluster_size: int = 10,
    n_pca_components_subspace: int = 10
) -> pd.DataFrame:
    """
    Runs the cluster analysis multiple times with shuffled cluster labels
    to generate baseline metrics.

    Args:
        original_hs_tensor: The original tensor of hidden states (N, D).
        original_samples_df: The original DataFrame with metadata.
        cluster_column: The column containing labels to shuffle.
        n_shuffles: Number of times to shuffle and re-run the analysis.
        min_cluster_size: Minimum cluster size for analysis.
        n_pca_components_subspace: PCA components for subspace analysis.

    Returns:
        DataFrame containing summary metrics from each shuffle.
    """
    baseline_results = []
    print(f"Running {n_shuffles} baseline analyses with shuffled '{cluster_column}' labels...")

    for i in tqdm(range(n_shuffles)):
        # Create a copy to avoid modifying the original df
        shuffled_df = original_samples_df.copy()

        # Shuffle the cluster column - IMPORTANT: reset index ensures alignment after shuffling
        shuffled_labels = shuffled_df[cluster_column].sample(frac=1, random_state=i).values
        shuffled_df[cluster_column] = shuffled_labels

        try:
            # Run the analysis with the shuffled labels
            results = analyze_expert_input_clusters(
                original_hs_tensor,
                shuffled_df, # Use the df with shuffled labels
                cluster_column=cluster_column,
                min_cluster_size=min_cluster_size,
                n_pca_components_subspace=n_pca_components_subspace
            )

            # Store the summary metrics
            summary = results.get('summary_orthogonality', {})
            separation = results.get('separation_metrics', {})
            baseline_results.append({
                'shuffle_run': i,
                'avg_abs_centroid_similarity': summary.get('average_abs_centroid_similarity', np.nan),
                'avg_abs_pc1_similarity': summary.get('average_abs_pc1_similarity', np.nan),
                'avg_max_subspace_angle_deg': summary.get('average_max_subspace_angle_deg', np.nan),
                'inter_intra_distance_ratio': separation.get('inter_intra_distance_ratio', np.nan),
            })
        except Exception as e:
             print(f"\nError during shuffle run {i}: {e}. Skipping run.")
             baseline_results.append({
                 'shuffle_run': i,
                 'avg_abs_centroid_similarity': np.nan,
                 'avg_abs_pc1_similarity': np.nan,
                 'avg_max_subspace_angle_deg': np.nan,
                 'inter_intra_distance_ratio': np.nan,
             })

    return pd.DataFrame(baseline_results).dropna() # Drop rows where analysis might have failed completely

run_baseline_analysis(relevant_pre_mlp_hs, relevant_samples_df, 'prev_expert', 10, 10)

Running 10 baseline analyses with shuffled 'prev_expert' labels...


  0%|          | 0/10 [00:00<?, ?it/s]

Found 53 potential clusters based on 'prev_expert'.
  Skipping cluster '60' (size 6 < 10)
  Skipping cluster '39' (size 5 < 10)
  Skipping cluster '1' (size 4 < 10)
  Skipping cluster '17' (size 4 < 10)
  Skipping cluster '27' (size 1 < 10)
  Skipping cluster '22' (size 2 < 10)
  Skipping cluster '62' (size 1 < 10)
  Skipping cluster '52' (size 3 < 10)
Analyzing 45 valid clusters.


 10%|█         | 1/10 [00:03<00:31,  3.46s/it]


Analysis Summary:
  Avg Abs Centroid Similarity: 0.8671 (Lower is more orthogonal)
  Avg Abs PC1 Similarity: 0.5072 (Lower is more orthogonal)
  Avg Max Subspace Angle (Deg): 87.58 (Higher is more orthogonal, max 90)
  Inter/Intra Distance Ratio: 0.1825 (Higher is better separated)
Found 53 potential clusters based on 'prev_expert'.
  Skipping cluster '1' (size 4 < 10)
  Skipping cluster '17' (size 4 < 10)
  Skipping cluster '60' (size 6 < 10)
  Skipping cluster '22' (size 2 < 10)
  Skipping cluster '52' (size 3 < 10)
  Skipping cluster '39' (size 5 < 10)
  Skipping cluster '62' (size 1 < 10)
  Skipping cluster '27' (size 1 < 10)
Analyzing 45 valid clusters.


 20%|██        | 2/10 [00:06<00:25,  3.22s/it]


Analysis Summary:
  Avg Abs Centroid Similarity: 0.8700 (Lower is more orthogonal)
  Avg Abs PC1 Similarity: 0.5016 (Lower is more orthogonal)
  Avg Max Subspace Angle (Deg): 87.32 (Higher is more orthogonal, max 90)
  Inter/Intra Distance Ratio: 0.1805 (Higher is better separated)
Found 53 potential clusters based on 'prev_expert'.
  Skipping cluster '39' (size 5 < 10)
  Skipping cluster '1' (size 4 < 10)
  Skipping cluster '17' (size 4 < 10)
  Skipping cluster '60' (size 6 < 10)
  Skipping cluster '52' (size 3 < 10)
  Skipping cluster '27' (size 1 < 10)
  Skipping cluster '22' (size 2 < 10)
  Skipping cluster '62' (size 1 < 10)
Analyzing 45 valid clusters.


 30%|███       | 3/10 [00:09<00:22,  3.27s/it]


Analysis Summary:
  Avg Abs Centroid Similarity: 0.8655 (Lower is more orthogonal)
  Avg Abs PC1 Similarity: 0.5335 (Lower is more orthogonal)
  Avg Max Subspace Angle (Deg): 87.51 (Higher is more orthogonal, max 90)
  Inter/Intra Distance Ratio: 0.1827 (Higher is better separated)
Found 53 potential clusters based on 'prev_expert'.
  Skipping cluster '1' (size 4 < 10)
  Skipping cluster '60' (size 6 < 10)
  Skipping cluster '39' (size 5 < 10)
  Skipping cluster '22' (size 2 < 10)
  Skipping cluster '17' (size 4 < 10)
  Skipping cluster '62' (size 1 < 10)
  Skipping cluster '52' (size 3 < 10)
  Skipping cluster '27' (size 1 < 10)
Analyzing 45 valid clusters.


 40%|████      | 4/10 [00:13<00:20,  3.34s/it]


Analysis Summary:
  Avg Abs Centroid Similarity: 0.8684 (Lower is more orthogonal)
  Avg Abs PC1 Similarity: 0.5044 (Lower is more orthogonal)
  Avg Max Subspace Angle (Deg): 87.50 (Higher is more orthogonal, max 90)
  Inter/Intra Distance Ratio: 0.1815 (Higher is better separated)
Found 53 potential clusters based on 'prev_expert'.
  Skipping cluster '22' (size 2 < 10)
  Skipping cluster '39' (size 5 < 10)
  Skipping cluster '60' (size 6 < 10)
  Skipping cluster '1' (size 4 < 10)
  Skipping cluster '62' (size 1 < 10)
  Skipping cluster '17' (size 4 < 10)
  Skipping cluster '27' (size 1 < 10)
  Skipping cluster '52' (size 3 < 10)
Analyzing 45 valid clusters.


 50%|█████     | 5/10 [00:16<00:16,  3.38s/it]


Analysis Summary:
  Avg Abs Centroid Similarity: 0.8684 (Lower is more orthogonal)
  Avg Abs PC1 Similarity: 0.4874 (Lower is more orthogonal)
  Avg Max Subspace Angle (Deg): 87.52 (Higher is more orthogonal, max 90)
  Inter/Intra Distance Ratio: 0.1818 (Higher is better separated)
Found 53 potential clusters based on 'prev_expert'.
  Skipping cluster '22' (size 2 < 10)
  Skipping cluster '1' (size 4 < 10)
  Skipping cluster '27' (size 1 < 10)
  Skipping cluster '17' (size 4 < 10)
  Skipping cluster '62' (size 1 < 10)
  Skipping cluster '52' (size 3 < 10)
  Skipping cluster '39' (size 5 < 10)
  Skipping cluster '60' (size 6 < 10)
Analyzing 45 valid clusters.


 60%|██████    | 6/10 [00:20<00:13,  3.39s/it]


Analysis Summary:
  Avg Abs Centroid Similarity: 0.8692 (Lower is more orthogonal)
  Avg Abs PC1 Similarity: 0.4534 (Lower is more orthogonal)
  Avg Max Subspace Angle (Deg): 87.26 (Higher is more orthogonal, max 90)
  Inter/Intra Distance Ratio: 0.1816 (Higher is better separated)
Found 53 potential clusters based on 'prev_expert'.
  Skipping cluster '60' (size 6 < 10)
  Skipping cluster '39' (size 5 < 10)
  Skipping cluster '1' (size 4 < 10)
  Skipping cluster '17' (size 4 < 10)
  Skipping cluster '52' (size 3 < 10)
  Skipping cluster '27' (size 1 < 10)
  Skipping cluster '62' (size 1 < 10)
  Skipping cluster '22' (size 2 < 10)
Analyzing 45 valid clusters.


 70%|███████   | 7/10 [00:23<00:10,  3.37s/it]


Analysis Summary:
  Avg Abs Centroid Similarity: 0.8663 (Lower is more orthogonal)
  Avg Abs PC1 Similarity: 0.5050 (Lower is more orthogonal)
  Avg Max Subspace Angle (Deg): 87.67 (Higher is more orthogonal, max 90)
  Inter/Intra Distance Ratio: 0.1823 (Higher is better separated)
Found 53 potential clusters based on 'prev_expert'.
  Skipping cluster '39' (size 5 < 10)
  Skipping cluster '60' (size 6 < 10)
  Skipping cluster '22' (size 2 < 10)
  Skipping cluster '17' (size 4 < 10)
  Skipping cluster '52' (size 3 < 10)
  Skipping cluster '27' (size 1 < 10)
  Skipping cluster '1' (size 4 < 10)
  Skipping cluster '62' (size 1 < 10)
Analyzing 45 valid clusters.


 80%|████████  | 8/10 [00:26<00:06,  3.34s/it]


Analysis Summary:
  Avg Abs Centroid Similarity: 0.8682 (Lower is more orthogonal)
  Avg Abs PC1 Similarity: 0.5034 (Lower is more orthogonal)
  Avg Max Subspace Angle (Deg): 87.76 (Higher is more orthogonal, max 90)
  Inter/Intra Distance Ratio: 0.1821 (Higher is better separated)
Found 53 potential clusters based on 'prev_expert'.
  Skipping cluster '1' (size 4 < 10)
  Skipping cluster '60' (size 6 < 10)
  Skipping cluster '39' (size 5 < 10)
  Skipping cluster '22' (size 2 < 10)
  Skipping cluster '52' (size 3 < 10)
  Skipping cluster '62' (size 1 < 10)
  Skipping cluster '17' (size 4 < 10)
  Skipping cluster '27' (size 1 < 10)
Analyzing 45 valid clusters.


 90%|█████████ | 9/10 [00:30<00:03,  3.32s/it]


Analysis Summary:
  Avg Abs Centroid Similarity: 0.8740 (Lower is more orthogonal)
  Avg Abs PC1 Similarity: 0.4991 (Lower is more orthogonal)
  Avg Max Subspace Angle (Deg): 87.30 (Higher is more orthogonal, max 90)
  Inter/Intra Distance Ratio: 0.1799 (Higher is better separated)
Found 53 potential clusters based on 'prev_expert'.
  Skipping cluster '1' (size 4 < 10)
  Skipping cluster '39' (size 5 < 10)
  Skipping cluster '60' (size 6 < 10)
  Skipping cluster '22' (size 2 < 10)
  Skipping cluster '27' (size 1 < 10)
  Skipping cluster '62' (size 1 < 10)
  Skipping cluster '52' (size 3 < 10)
  Skipping cluster '17' (size 4 < 10)
Analyzing 45 valid clusters.


100%|██████████| 10/10 [00:33<00:00,  3.33s/it]


Analysis Summary:
  Avg Abs Centroid Similarity: 0.8700 (Lower is more orthogonal)
  Avg Abs PC1 Similarity: 0.5561 (Lower is more orthogonal)
  Avg Max Subspace Angle (Deg): 87.34 (Higher is more orthogonal, max 90)
  Inter/Intra Distance Ratio: 0.1828 (Higher is better separated)





Unnamed: 0,shuffle_run,avg_abs_centroid_similarity,avg_abs_pc1_similarity,avg_max_subspace_angle_deg,inter_intra_distance_ratio
0,0,0.867104,0.507186,87.579712,0.182521
1,1,0.870021,0.501633,87.322777,0.180477
2,2,0.865502,0.533488,87.509087,0.182679
3,3,0.86836,0.504447,87.504318,0.181496
4,4,0.86838,0.487438,87.517799,0.181823
5,5,0.869198,0.453447,87.259544,0.181601
6,6,0.866306,0.504985,87.669853,0.182342
7,7,0.868226,0.503418,87.760117,0.182094
8,8,0.874022,0.499135,87.304276,0.179905
9,9,0.869955,0.556128,87.335716,0.182774


## SVD-based clusters

"""
Does h_para show more distinct clustering? What about h_orth?
"""
