In [None]:
"""
Test clustering
"""
None

In [None]:
"""
Imports
"""
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import numpy as np
from tqdm import tqdm
from termcolor import colored
import importlib

from utils.memory import check_memory, clear_all_cuda_memory

# https://docs.rapids.ai/install/
import cupy
import cuml

import plotly.express as px
import pickle

# Device string and random seed for this run.
main_device = 'cuda:0'
seed = 1234
# Helpers from utils.memory; presumably they release cached GPU memory and report current usage - confirm in utils/memory.py.
clear_all_cuda_memory()
check_memory()

## Load base model

In [None]:
"""
Load the base tokenizer/model
"""
# Prefix used to locate the saved hidden states / metadata files under data/.
model_prefix = 'qwen2.5-3b'
# The tokenizer/model load below is disabled.
# NOTE(review): `model_id` is not defined in the visible part of the notebook - define it before re-enabling these lines.
# tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token = False, add_bos_token = False, padding_side = 'left', trust_remote_code = True)
# model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype = torch.bfloat16, trust_remote_code = True).cuda().eval()

## Load data

In [None]:
"""
Load dataset
"""
def load_data(model_prefix):
    """
    Read the saved hidden states and sample metadata for one model.

    Params:
        @model_prefix: The file-name prefix of the saved artifacts under data/

    Returns:
        A tuple of (the object stored in `data/{model_prefix}-all-post-layer-hidden-states.pt`,
        the 'sample_df' entry of the pickled metadata dict)
    """
    hidden_states_path = f'data/{model_prefix}-all-post-layer-hidden-states.pt'
    metadata_path = f'data/{model_prefix}-metadata.pkl'

    hidden_states = torch.load(hidden_states_path)
    # pickle executes arbitrary code on load - only point this at files you produced yourself
    with open(metadata_path, 'rb') as metadata_file:
        sample_frame = pickle.load(metadata_file)['sample_df']

    return hidden_states, sample_frame

# Load the saved hidden states and the sample metadata frame for `model_prefix`.
all_post_layer_hs, sample_df_import = load_data(model_prefix)

## Prep for Clustering

In [None]:
"""
Let's clean up the mappings here. We'll get everything to a sample_ix level first.
"""
sample_df = (
    sample_df_import
    .assign(
        # One id per distinct (batch_ix, sequence_ix, token_ix) combination
        sample_ix = lambda df: df.groupby(['batch_ix', 'sequence_ix', 'token_ix']).ngroup(),
        # One id per distinct (batch_ix, sequence_ix) combination
        seq_id = lambda df: df.groupby(['batch_ix', 'sequence_ix']).ngroup()
    )
    # Fresh 0..n-1 index; the previous index is kept as a column
    .reset_index()
    .drop(columns = ['batch_ix', 'sequence_ix'])
)

display(sample_df)

## Basic clustering

In [None]:
def print_samples(df, grouping_cols, n_groups = 25, min_group_size = 5, max_tokens = 10):
    """
    Takes a wide dataframe and groups it, then displays a random selection of the groups

    Params:
        @df: A dataframe containing a `token` column plus every column named in `grouping_cols`
        @grouping_cols: The column name(s) to group by
        @n_groups: The maximum number of groups to display; fewer are shown when fewer qualify
        @min_group_size: Groups with fewer rows than this are dropped before sampling
        @max_tokens: The maximum number of example tokens listed per group

    Returns:
        None; the sampled groups are rendered with `display`
    """
    res =\
        df\
        .groupby(grouping_cols, as_index = False)\
        .agg(
            n_samples = ('token', 'size'),
            samples = ('token', lambda s: s.sample(n = min(len(s), max_tokens)).tolist())
        )\
        .pipe(lambda g: g[g['n_samples'] >= min_group_size])\
        .pipe(lambda g: g.sample(n = min(len(g), n_groups))) # cap at the population size; sampling more rows than exist raises

    display(res)

In [None]:
"""
Base K-Means (note - returns imbalanced clusters)
""" 
def cluster_kmeans(layer_hs: torch.Tensor, n_clusters = 64):
    """
    Fit GPU K-Means on the hidden states of a single layer.

    Params:
        @layer_hs: A n_token_samples x D tensor for a single layer
        @n_clusters: How many clusters to fit

    Returns:
        A list of length n_token_samples holding the cluster id of each sample
    """
    # Cast to float32 before handing the data to CuPy
    gpu_hs = cupy.asarray(layer_hs.to(torch.float32))
    km = cuml.cluster.KMeans(n_clusters = n_clusters, max_iter = 1000, random_state = 123, verbose = True)
    km.fit(gpu_hs)
    # labels_ has shape (n_samples,); cluster_centers_ (num_clusters x D) is also on the fitted model but unused here
    return km.labels_.tolist()

# Cluster every layer independently. tqdm wraps the unbound tuple (not the enumerate object) so the bar knows how many layers there are.
# Note: `cluster_kmeans` is defined again in a later cell; this cell relies on the (layer_hs, n_clusters) version defined just above.
kmeans_res = [
    {'layer_ix': layer_ix, 'cluster_ids': cluster_kmeans(layer_hs, 64)}
    for layer_ix, layer_hs in enumerate(tqdm(all_post_layer_hs.unbind(dim = 1)))
]

# One `layer_{ix}_id` column per layer, placed side by side with the sample metadata (rows are matched on the index).
kmeans_df = pd.concat(
    [pd.DataFrame({f"layer_{x['layer_ix']}_id": x['cluster_ids'] for x in kmeans_res}), sample_df],
    axis = 1
)

# Cluster sizes, largest first, for two of the layers.
display(kmeans_df.groupby('layer_1_id', as_index = False).agg(n_samples = ('token', 'size')).sort_values(by = 'n_samples', ascending = False))
display(kmeans_df.groupby('layer_3_id', as_index = False).agg(n_samples = ('token', 'size')).sort_values(by = 'n_samples', ascending = False))
clear_all_cuda_memory()

# Example tokens for samples that share the same cluster ids across layers 2-6.
print_samples(kmeans_df, ['layer_2_id', 'layer_3_id', 'layer_4_id', 'layer_5_id', 'layer_6_id'])

## Dimension reduction clustering

In [None]:
""" 
Test decomp methods
"""
def reduce_pca(layer_hs: torch.Tensor, n_components = 2):
    """
    Project one layer's hidden states onto their leading principal components on the GPU.

    Params:
        @layer_hs: The hidden states for a single layer; cast to float32 before being handed to CuPy
        @n_components: The number of principal components to keep

    Returns:
        The transformed samples, copied back to the host with `cupy.asnumpy`
    """
    # https://docs.rapids.ai/api/cuml/stable/api/#principal-component-analysis
    gpu_hs = cupy.asarray(layer_hs.to(torch.float32))
    pca = cuml.PCA(n_components = n_components, iterated_power = 20, verbose = True)
    pca.fit(gpu_hs)

    # The last entry of the running sum is the total variance ratio covered by the kept components
    cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)[-1]
    print(f'Cumulative variance ratio: {cumulative_ratio}')

    projected = cupy.asnumpy(pca.transform(gpu_hs))
    clear_all_cuda_memory()
    return projected

# 2-component PCA of the first layer, plotted for a random 5,000 samples with the ' of' token highlighted.
pca_test = reduce_pca(all_post_layer_hs.unbind(dim = 1)[0], 2)
px.scatter(
    (
        pd.concat(
            [
                pd.DataFrame({'d1': pca_test[:, 0], 'd2': pca_test[:, 1]}),
                sample_df.head(pca_test.shape[0])
            ],
            axis = 1
        )
        .sample(5000)
        .assign(is_of = lambda df: np.where(df['token'] == ' of', 1, 0))
    ),
    x = 'd1',
    y = 'd2',
    color = 'is_of',
    hover_data = ['token']
).show()

# Per-layer PCA projections at 10 and 100 components.
pca_10 = [
    reduce_pca(layer_hs, 10)
    for layer_hs in tqdm(all_post_layer_hs.unbind(dim = 1))
]
pca_100 = [
    reduce_pca(layer_hs, 100)
    for layer_hs in tqdm(all_post_layer_hs.unbind(dim = 1))
]

In [None]:
def reduce_umap(layer_hs: torch.Tensor, n_components = 2, metric = 'cosine'):
    """
    Embed one layer's hidden states with GPU UMAP.

    Params:
        @layer_hs: The hidden states for a single layer; cast to float16 before being handed to CuPy
        @n_components: The dimensionality of the embedding
        @metric: The distance metric passed to UMAP (euclidean, cosine, manhattan, l2, hamming)

    Returns:
        The embedded samples, copied back to the host with `cupy.asnumpy`
    """
    # https://docs.rapids.ai/api/cuml/stable/api/#umap
    gpu_hs = cupy.asarray(layer_hs.to(torch.float16))

    umap_kwargs = dict(
        n_components = n_components,
        metric = metric,
        n_neighbors = 15, # 15 for default, smaller = more local data preserved [2 - 100]
        min_dist = 0.1, # 0.1 by default, effective distance between embedded points
        n_epochs = 200, # 200 by default for large datasets
        random_state = None, # Allow parallelism
        verbose = False
    )
    embedding = cupy.asnumpy(cuml.UMAP(**umap_kwargs).fit_transform(gpu_hs))
    clear_all_cuda_memory()
    return embedding

# 2-component cosine UMAP of the first layer, plotted for a random 5,000 samples with the ' of' token highlighted.
umap_test = reduce_umap(all_post_layer_hs.unbind(dim = 1)[0], 2, 'cosine') # 300k = 2min
px.scatter(
    (
        pd.concat(
            [
                pd.DataFrame({'d1': umap_test[:, 0], 'd2': umap_test[:, 1]}),
                sample_df.head(umap_test.shape[0])
            ],
            axis = 1
        )
        .sample(5000)
        .assign(is_of = lambda df: np.where(df['token'] == ' of', 1, 0))
    ),
    x = 'd1',
    y = 'd2',
    color = 'is_of',
    hover_data = ['token']
).show()

# Euclidean variants are currently disabled:
# umap_euc_10 = [reduce_umap(layer_hs, 10, 'euclidean') for layer_hs in tqdm(all_post_layer_hs.unbind(dim = 1))]
# umap_euc_100 = [reduce_umap(layer_hs, 100, 'euclidean') for layer_hs in tqdm(all_post_layer_hs.unbind(dim = 1))]

# Cosine most closely maps to router (dot product)
umap_cos_10 = [
    reduce_umap(layer_hs, 10, 'cosine')
    for layer_hs in tqdm(all_post_layer_hs.unbind(dim = 1))
]
umap_cos_100 = [
    reduce_umap(layer_hs, 100, 'cosine')
    for layer_hs in tqdm(all_post_layer_hs.unbind(dim = 1))
]

In [None]:
"""
Kmeans
"""
def cluster_kmeans(layer_hs_np: np.ndarray, n_clusters = 100, max_iter = 500):
    """
    Cluster a layer using Kmeans

    This definition replaces the earlier tensor-based `cluster_kmeans(layer_hs, n_clusters)` in the notebook
    namespace, so it accepts the same positional call shape (and torch tensors) to keep that earlier cell re-runnable.

    Params:
        @layer_hs_np: An np array of size n_samples x Dhat, where Dhat is some possibly compressed hidden state dimension.
            A torch tensor is also accepted and is cast to float32 first.
        @n_clusters: The number of clusters to fit.
        @max_iter: The maximum number of K-Means iterations.

    Returns:
        A list of length n_samples of cluster ids
    """
    # https://docs.rapids.ai/api/cuml/stable/api/#kmeans
    if isinstance(layer_hs_np, torch.Tensor):
        # Same cast the earlier tensor-based definition applies before handing data to CuPy
        layer_hs_np = layer_hs_np.to(torch.float32)
    hs_cupy = cupy.asarray(layer_hs_np)

    model = cuml.KMeans(
        n_clusters = n_clusters,
        max_iter = max_iter
    )
    cluster_labels = model.fit_predict(hs_cupy).tolist()
    return cluster_labels


def test_kmeans(layer_hs_list, layers_to_group):
    """
    Run K-Means on a chosen set of layers, show the cluster sizes of the first of them, then show sampled cross-layer groups.

    Params:
        @layer_hs_list: A list of np arrays, each of size n_samples x Dhat, where Dhat is some possibly compressed hidden state dimension
        @layers_to_group: The indices of layer_hs_list (0-indexed) to be used for grouping clusters across layers.

    Returns:
        A wide dataframe with one `layer_{ix}_id` column per clustered layer, followed by the columns of `sample_df`
    """
    id_frames = []
    for layer_ix in tqdm(layers_to_group):
        layer_ids = cluster_kmeans(layer_hs_list[layer_ix])
        id_frames.append(pd.DataFrame({f'layer_{layer_ix}_id': layer_ids}))

    ids_wide = pd.concat(id_frames, axis = 1)
    # sample_df is trimmed to the row count of the first entry of layer_hs_list
    cl_df = pd.concat([ids_wide, sample_df.head(layer_hs_list[0].shape[0])], axis = 1)

    id_cols = [f'layer_{layer_ix}_id' for layer_ix in layers_to_group]
    cluster_sizes = cl_df.groupby(id_cols[0], as_index = False).agg(n_samples = ('token', 'size'))
    display(cluster_sizes.sort_values(by = 'n_samples', ascending = False))
    print('Cross-layer clusters:')
    print_samples(cl_df, id_cols)

    return cl_df

# K-Means on the 100-component cosine UMAP embeddings (umap_cos_100), grouping across list indices 2-6.
kmeans_path_1 = test_kmeans(umap_cos_100, [2, 3, 4, 5, 6])

In [None]:
"""
Agglomerative clustering
"""
def cluster_aggc(layer_hs_np: np.ndarray, n_clusters = 100, metric = 'cosine'):
    """
    Cluster a layer using agglomerative (hierarchical) clustering

    Params:
        @layer_hs_np: An np array of size n_samples x Dhat, where Dhat is some possibly compressed hidden state dimension.
        @n_clusters: The number of clusters to return.
        @metric: The distance metric passed to cuML's AgglomerativeClustering.

    Returns:
        A list of length n_samples of cluster ids
    """
    # https://docs.rapids.ai/api/cuml/stable/api/#agglomerative-clustering
    hs_cupy = cupy.asarray(layer_hs_np)

    model = cuml.AgglomerativeClustering(
        n_clusters = n_clusters,
        metric = metric
    )
    cluster_labels = model.fit_predict(hs_cupy).tolist()
    return cluster_labels


def test_aggc(layer_hs_list, layers_to_group):
    """
    Run agglomerative clustering on a chosen set of layers, show the cluster sizes of the first of them,
    then show sampled cross-layer groups.

    Params:
        @layer_hs_list: A list of np arrays, each of size n_samples x Dhat, where Dhat is some possibly compressed hidden state dimension
        @layers_to_group: The indices of layer_hs_list (0-indexed) to be used for grouping clusters across layers.

    Returns:
        A wide dataframe with one `layer_{ix}_id` column per clustered layer, followed by the columns of `sample_df`
    """
    id_frames = []
    for layer_ix in tqdm(layers_to_group):
        layer_ids = cluster_aggc(layer_hs_list[layer_ix])
        id_frames.append(pd.DataFrame({f'layer_{layer_ix}_id': layer_ids}))

    ids_wide = pd.concat(id_frames, axis = 1)
    # sample_df is trimmed to the row count of the first entry of layer_hs_list
    cl_df = pd.concat([ids_wide, sample_df.head(layer_hs_list[0].shape[0])], axis = 1)

    id_cols = [f'layer_{layer_ix}_id' for layer_ix in layers_to_group]
    cluster_sizes = cl_df.groupby(id_cols[0], as_index = False).agg(n_samples = ('token', 'size'))
    display(cluster_sizes.sort_values(by = 'n_samples', ascending = False))
    print('Cross-layer clusters:')
    print_samples(cl_df, id_cols)

    return cl_df

# Agglomerative clustering on the 100-component cosine UMAP embeddings (umap_cos_100), grouping across list indices 2-6.
aggc_path_1 = test_aggc(umap_cos_100, [2, 3, 4, 5, 6])

In [None]:
"""
DBScan
"""
def cluster_dbscan(layer_hs_np: np.ndarray, metric = 'euclidean', eps = 0.5):
    """
    Cluster a layer using DBScan

    Params:
        @layer_hs_np: An np array of size n_samples x Dhat, where Dhat is some possibly compressed hidden state dimension.
        @metric: The distance metric to use. Either "euclidean" or "cosine" are reasonable.
        @eps: The neighborhood radius passed to DBSCAN; 0.5 matches the cuML default the call previously relied on.

    Returns:
        A list of length n_samples of cluster ids
    """
    # https://docs.rapids.ai/api/cuml/stable/api/#dbscan
    hs_cupy = cupy.asarray(layer_hs_np)

    # 1/100 of the size each cluster would have if samples were spread evenly over 64 clusters.
    # Floor at 1 so inputs with fewer than 6,400 rows do not produce min_samples = 0.
    min_samples = max(1, hs_cupy.shape[0] // 64 // 100)

    model = cuml.DBSCAN(
        eps = eps,
        metric = metric, # Or cosine
        min_samples = min_samples, # Number of samples st the group can be considered a core point
        verbose = False
    )
    cluster_labels = model.fit_predict(hs_cupy).tolist()
    return cluster_labels


def test_dbscan(layer_hs_list, metric, layers_to_group):
    """
    Cluster multiple layers and print diagnostics, then print cross-layer groups.

    Params:
        @layer_hs_list: A list of np arrays, each of size n_samples x Dhat, where Dhat is some possibly compressed hidden state dimension
        @metric: The distance metric to use. Either "euclidean" or "cosine" are reasonable.
        @layers_to_group: The indices of layer_hs_list (0-indexed) to be used for grouping clusters across layers.

    Returns:
        A wide dataframe with one `layer_{ix}_id` column per clustered layer, followed by the columns of `sample_df`
    """
    cl_res = [{'layer_ix': l, 'cluster_ids': cluster_dbscan(layer_hs_list[l], metric)} for l in tqdm(layers_to_group)]

    for r in cl_res:
        ids = r['cluster_ids']
        # -1 is the "unassigned" label, so it is left out of the cluster count and reported separately
        n_clusters = len(set(ids) - {-1})
        n_unassigned = ids.count(-1)
        print(f"Clusters {n_clusters:,} | Unassigned to clusters: {n_unassigned:,}/{len(ids):,}")

    cl_df =\
        pd.concat([pd.DataFrame({'layer_' + str(x['layer_ix']) + '_id': x['cluster_ids']}) for x in cl_res], axis = 1)\
        .pipe(lambda df: pd.concat([df, sample_df.head(layer_hs_list[0].shape[0])], axis = 1))

    # Cluster sizes (including the -1 group) for the first requested layer, largest first
    display(cl_df.groupby(f"layer_{str(layers_to_group[0])}_id", as_index = False).agg(n_samples = ('token', 'size')).sort_values(by = 'n_samples', ascending = False))
    print('Cross-layer clusters:')
    print_samples(cl_df, [f"layer_{str(l)}_id"  for l in layers_to_group])

    return cl_df

# DBSCAN with cosine distance on the 100-component cosine UMAP embeddings (umap_cos_100), grouping across list indices 2-6.
dbscan_paths_1 = test_dbscan(umap_cos_100, 'cosine', [2, 3, 4, 5, 6])

In [None]:
"""
HDBSCAN
"""

def cluster_hdbscan(layer_hs_np: np.ndarray, metric = 'euclidean'):
    """
    Cluster a layer using HDBScan

    Params:
        @layer_hs_np: An np array of size n_samples x Dhat, where Dhat is some possibly compressed hidden state dimension.
        @metric: The distance metric to use. NOTE(review): the cuML HDBSCAN docs appear to list only "euclidean" as an
            allowed value - confirm against the installed cuML version before passing anything else.

    Returns:
        A list of length n_samples of cluster ids
    """
    # https://docs.rapids.ai/api/cuml/stable/api/#hdbscan
    hs_cupy = cupy.asarray(layer_hs_np)

    # Size a cluster would have if samples were spread evenly over 64 clusters
    uniform_size = len(hs_cupy) // 64

    model = cuml.HDBSCAN(
        # Min 1/20 of the uniform dist value; floored at 2 so small inputs do not produce a cluster size of 0 or 1
        min_cluster_size = max(2, uniform_size // 20),
        max_cluster_size = uniform_size * 20, # Max 20x the uniform dist values
        metric = metric,
        min_samples = 1,
    )
    cluster_labels = model.fit_predict(hs_cupy).tolist()
    return cluster_labels

def test_hdbscan(layer_hs_list, metric, layers_to_group):
    """
    Cluster multiple layers and print diagnostics, then print cross-layer groups.

    Params:
        @layer_hs_list: A list of np arrays, each of size n_samples x Dhat, where Dhat is some possibly compressed hidden state dimension
        @metric: The distance metric forwarded to `cluster_hdbscan`.
        @layers_to_group: The indices of layer_hs_list (0-indexed) to be used for grouping clusters across layers.

    Returns:
        A wide dataframe with one `layer_{ix}_id` column per clustered layer, followed by the columns of `sample_df`
    """
    cl_res = [{'layer_ix': l, 'cluster_ids': cluster_hdbscan(layer_hs_list[l], metric)} for l in tqdm(layers_to_group)]

    for r in cl_res:
        ids = r['cluster_ids']
        # -1 is the "unassigned" label, so it is left out of the cluster count and reported separately
        n_clusters = len(set(ids) - {-1})
        n_unassigned = ids.count(-1)
        print(f"Clusters {n_clusters:,} | Unassigned to clusters: {n_unassigned:,}/{len(ids):,}")

    cl_df =\
        pd.concat([pd.DataFrame({'layer_' + str(x['layer_ix']) + '_id': x['cluster_ids']}) for x in cl_res], axis = 1)\
        .pipe(lambda df: pd.concat([df, sample_df.head(layer_hs_list[0].shape[0])], axis = 1))

    # Cluster sizes (including the -1 group) for the first requested layer, largest first
    display(cl_df.groupby(f"layer_{str(layers_to_group[0])}_id", as_index = False).agg(n_samples = ('token', 'size')).sort_values(by = 'n_samples', ascending = False))
    print('Cross-layer clusters:')
    print_samples(cl_df, [f"layer_{str(l)}_id"  for l in layers_to_group])

    return cl_df

# HDBSCAN with euclidean distance on the 10-component cosine UMAP embeddings (umap_cos_10), grouping across list indices 2-6.
hdbscan_paths_1 = test_hdbscan(umap_cos_10, 'euclidean', [2, 3, 4, 5, 6])