In [None]:
"""
Test clustering
"""
None

In [None]:
"""
Imports
"""
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import numpy as np
from tqdm import tqdm
from termcolor import colored
import importlib 

from utils.memory import check_memory, clear_all_cuda_memory
import gc

# https://docs.rapids.ai/install/
import cupy
import cuml

import plotly.express as px
import pickle

# Notebook-wide settings: the CUDA device identifier and a base random seed.
# NOTE(review): neither `main_device` nor `seed` is referenced by the cells visible in this
# notebook (the clustering/decomposition helpers hard-code a seed of 123) - confirm whether
# they should be wired in or removed.
main_device = 'cuda:0'
seed = 1234
# Helpers imported from utils.memory; presumably they release cached GPU memory and report
# current usage - verify against utils/memory.py.
clear_all_cuda_memory()
check_memory()

## Load base model

In [None]:
"""
Load the base tokenizer/model

Architectures supported currently:
- OlMoE architecture, includes OLMoE-1B-7B-0125-Instruct (1B/7B)
- Qwen2MoE architecture, includes Qwen1.5-MoE-A2.7B-Chat (2.7B/14.3B), Qwen2-57B-A14B (14B/57B)
- Deepseek v2 architecture, includes Deepseek-v2-Lite (2.4B/15.7B), Deepseek-v2 (21B/236B)
- Deepseek v3 architecture, includes Deepseek-v3 (37B/671B), Deepseek-R1 (37B/671B), Moonlight-16B-A3B (3B/16B)
"""
selected_model_index = 1

def get_model(index):
    """
    Look up one of the supported MoE checkpoints by its position in the list below.

    Params:
        @index: Position in the supported-model list (0 = OLMoE, 1 = Qwen1.5-MoE, 2 = DeepSeek-V2-Lite, 3 = Moonlight)

    Returns:
        A tuple of (Hugging Face model id, short model prefix, architecture key)
    """
    supported_models = [
        ('allenai/OLMoE-1B-7B-0125-Instruct', 'olmoe', 'olmoe'),
        ('Qwen/Qwen1.5-MoE-A2.7B-Chat', 'qwen1.5moe', 'qwen2moe'),
        ('deepseek-ai/DeepSeek-V2-Lite', 'dsv2', 'dsv2'),
        ('moonshotai/Moonlight-16B-A3B', 'moonlight', 'dsv3')
    ]
    chosen = supported_models[index]
    hf_id, prefix, architecture = chosen[0], chosen[1], chosen[2]
    return hf_id, prefix, architecture

model_id, model_prefix, model_architecture = get_model(selected_model_index)
# tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token = False, add_bos_token = False, padding_side = 'left', trust_remote_code = True)
# model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype = torch.bfloat16, trust_remote_code = True).cuda().eval()

## Load data

In [None]:
"""
Load dataset
"""
def load_data(model_prefix):
    """
    Read the saved hidden states and the pickled metadata for one model.

    Params:
        @model_prefix: The short model prefix used in the file names under data/

    Returns:
        A tuple of (object stored in the hidden-states .pt file, metadata['sample_df'], metadata['topk_df'])
    """
    hidden_states_path = f'data/{model_prefix}-all-pre-mlp-hidden-states.pt'
    metadata_path = f'data/{model_prefix}-metadata.pkl'

    hidden_states = torch.load(hidden_states_path)
    # pickle executes arbitrary code on load - only point this at files you produced yourself
    with open(metadata_path, 'rb') as metadata_file:
        meta = pickle.load(metadata_file)

    return hidden_states, meta['sample_df'], meta['topk_df']

# Load the hidden states plus the sample/top-k metadata frames for the selected model prefix.
all_pre_mlp_hs, sample_df_import, topk_df_import = load_data(model_prefix)

## Prep for Clustering

In [None]:
"""
Let's clean up the mappings here. We'll get everything to a sample_ix level first.
"""
# Number every (batch_ix, sequence_ix, token_ix) group with a flat sample_ix and every
# (batch_ix, sequence_ix) group with a seq_id, then move the old index into a column.
sample_df_raw = (
    sample_df_import
    .assign(
        sample_ix = lambda df: df.groupby(['batch_ix', 'sequence_ix', 'token_ix']).ngroup(),
        seq_id = lambda df: df.groupby(['batch_ix', 'sequence_ix']).ngroup()
    )
    .reset_index()
)

# Replace the composite key on the top-k table with the flat sample_ix.
topk_df = (
    topk_df_import
    .merge(
        sample_df_raw[['sample_ix', 'batch_ix', 'sequence_ix', 'token_ix']],
        how = 'inner',
        on = ['sequence_ix', 'token_ix', 'batch_ix']
    )
    .drop(columns = ['sequence_ix', 'token_ix', 'batch_ix'])
)

# token_ix is kept on the sample table; only the batch/sequence keys are dropped.
sample_df = sample_df_raw.drop(columns = ['batch_ix', 'sequence_ix'])

display(topk_df)
display(sample_df)

## Clustering

In [None]:
""" 
Cross-layer Topk = 1 Clusters
"""
def print_samples(df, grouping_cols, n_groups = 25, min_group_size = 5, max_tokens = 10):
    """
    Takes a wide dataframe and groups it, then displays a random selection of groups
    together with a random handful of the tokens that fell into each group.

    Params:
        @df: A dataframe with a `token` column plus every column named in grouping_cols
        @grouping_cols: The column name(s) to group by
        @n_groups: Max number of groups to display (fewer are shown if fewer qualify)
        @min_group_size: Groups with fewer rows than this are excluded
        @max_tokens: Max number of example tokens listed per group

    Returns:
        None; the resulting dataframe (grouping cols, n_samples, samples) is passed to display()
    """
    grouped = (
        df
        .groupby(grouping_cols, as_index = False)
        .agg(
            n_samples = ('token', 'size'),
            samples = ('token', lambda s: s.sample(n = min(len(s), max_tokens)).tolist())
        )
    )
    eligible = grouped[grouped['n_samples'] >= min_group_size]
    # DataFrame.sample raises if asked for more rows than exist, so cap the request at
    # the number of groups that actually passed the size filter.
    res = eligible.sample(min(len(eligible), n_groups))
    display(res)

# One row per (sample_ix, token); one `layer_<ix>_id` column per layer holding the expert
# recorded for topk_ix == 1 at that layer.
topk_wide = (
    topk_df
    .loc[lambda df: df['topk_ix'] == 1]
    .merge(sample_df[['sample_ix', 'token']], on = 'sample_ix', how = 'inner')
    .pivot(index = ['sample_ix', 'token'], columns = 'layer_ix', values = 'expert')
    .rename(columns = lambda c: f'layer_{c}_id')
    .reset_index()
)

# Group sizes for layer 1, largest first
display(
    topk_wide
    .groupby('layer_1_id', as_index = False)
    .agg(n_samples = ('token', 'size'))
    .sort_values(by = 'n_samples', ascending = False)
)
# Random groups sharing the same path across layers 2-6
print_samples(topk_wide, [f'layer_{layer}_id' for layer in range(2, 7)])

In [None]:
"""
Within layer clusters
"""
# Restrict to layers 4 and 6, then pivot so every (layer, topk slot) pair becomes its own
# column named expert_l<layer>_k<slot>, e.g. expert_l4_k1.
layer_topk_df = (
    topk_df
    .loc[lambda df: df['layer_ix'].isin([4, 6])]
    .merge(sample_df[['sample_ix', 'token']], on = 'sample_ix', how = 'inner')
    .assign(layer_topk_ix = lambda df: 'l' + df['layer_ix'].astype(str) + '_k' + df['topk_ix'].astype(str))
    .pivot(index = ['sample_ix', 'token'], columns = ['layer_topk_ix'], values = 'expert')
    .rename(columns = lambda c: f'expert_{c}')
    # Joins the token text back on via sample_ix after the pivot
    .merge(sample_df[['sample_ix', 'token']], on = 'sample_ix', how = 'inner')
    .reset_index(drop = True)
)

# Groups sharing all four expert slots within layer 4, then within layer 6
print_samples(layer_topk_df, [f'expert_l4_k{slot}' for slot in range(1, 5)])
print_samples(layer_topk_df, [f'expert_l6_k{slot}' for slot in range(1, 5)])
# Groups sharing the first two slots of both layers
print_samples(layer_topk_df, ['expert_l4_k1', 'expert_l4_k2', 'expert_l6_k1', 'expert_l6_k2'])

In [None]:
"""
K-Means (note - returns imbalanced clusters)
""" 
def cluster_kmeans(layer_hs: torch.Tensor, n_clusters = 64):
    """
    Run cuML K-Means over the hidden states of a single layer.

    Params:
        @layer_hs: A n_token_samples x D tensor for a single layer
        @n_clusters: The number of clusters to return

    Returns:
        A list of length n_token_samples of cluster ids
    """
    kmeans_model = cuml.cluster.KMeans(
        n_clusters = n_clusters,
        max_iter = 1000,
        random_state = 123,
        verbose = True
    )
    # Cast to float32 before handing the data to cupy/cuML
    kmeans_model.fit(cupy.asarray(layer_hs.to(torch.float32)))
    # labels_ has shape (n_samples,); cluster_centers_ (num_clusters, D) is also available on the model
    return kmeans_model.labels_.tolist()

# Cluster each layer independently (dim 1 of all_pre_mlp_hs is unbound into per-layer slices)
kmeans_res = [
    dict(layer_ix = layer_ix, cluster_ids = cluster_kmeans(layer_hs, 64))
    for layer_ix, layer_hs in tqdm(enumerate(all_pre_mlp_hs.unbind(dim = 1)))
]

# One `layer_<ix>_id` column per layer, placed side by side with the sample metadata
kmeans_df = pd.concat(
    [
        pd.concat(
            [pd.DataFrame({f"layer_{res['layer_ix']}_id": res['cluster_ids']}) for res in kmeans_res],
            axis = 1
        ),
        sample_df
    ],
    axis = 1
)

# Cluster sizes for layers 1 and 3, largest first
display(
    kmeans_df
    .groupby('layer_1_id', as_index = False)
    .agg(n_samples = ('token', 'size'))
    .sort_values(by = 'n_samples', ascending = False)
)
display(
    kmeans_df
    .groupby('layer_3_id', as_index = False)
    .agg(n_samples = ('token', 'size'))
    .sort_values(by = 'n_samples', ascending = False)
)
clear_all_cuda_memory()

print_samples(kmeans_df, [f'layer_{layer}_id' for layer in range(2, 7)])

In [None]:
""" 
Test decomp methods
"""
def reduce_pca(layer_hs: torch.Tensor, n_components = 2, fit_samples: None | int = 10_000):
    """
    Fit cuML PCA on a random subset of rows, then project every row.

    Params:
        @layer_hs: A n_token_samples x D tensor for a single layer
        @n_components: Number of principal components to keep
        @fit_samples: Max number of rows used for fitting (drawn without replacement with a fixed seed);
            a falsy value fits on every row

    Returns:
        A numpy array holding the PCA projection of all rows of layer_hs
    """
    # https://docs.rapids.ai/api/cuml/stable/api/#principal-component-analysis
    hs_cupy = cupy.asarray(layer_hs.to(torch.float32))
    n_rows = hs_cupy.shape[0]
    subset_indices = (
        np.random.default_rng(123).choice(n_rows, min(n_rows, fit_samples), replace = False)
        if fit_samples
        else list(range(n_rows))
    )

    model = cuml.PCA(iterated_power = 20, n_components = n_components, verbose = True)
    model.fit(hs_cupy[subset_indices, :])
    # Optional diagnostics:
    # print(f'Explained variance ratio: {model.explained_variance_ratio_}')
    # print(f'Cumulative variance ratio: {np.cumsum(model.explained_variance_ratio_)[-1]}')
    # print(f'Means by feature: {model.mean_}')
    # print(f'Max feature mean: {np.max(model.mean_)} | Min feature mean: {np.min(model.mean_)}')

    # The projection covers all rows, not just the fitted subset; copy it to host memory
    pred = cupy.asnumpy(model.transform(hs_cupy))
    clear_all_cuda_memory()
    return pred

# Sanity check on the first layer slice: 100 components, fit on at most 100k rows.
pca_test = reduce_pca(all_pre_mlp_hs.unbind(dim = 1)[0], 100, 100_000)
# Scatter the first two components for a random 5000 rows, colored by whether the token is ' of'.
# sample_df is truncated to the number of projected rows before being placed alongside them.
px.scatter(
    pd.concat([pd.DataFrame({'d1': pca_test[:, 0], 'd2': pca_test[:, 1]}), sample_df.head(pca_test.shape[0])], axis = 1)\
        .sample(5000)
        .assign(is_of = lambda df: np.where(df['token'] == ' of', 1, 0)),
    x = 'd1', y = 'd2', color = 'is_of', hover_data = ['token']
).show()

# Per-layer projections to 10 and 100 components, each fit on at most 1M rows.
pca_10 = [reduce_pca(layer_hs, 10, 1_000_000) for layer_hs in tqdm(all_pre_mlp_hs.unbind(dim = 1))]
pca_100 = [reduce_pca(layer_hs, 100, 1_000_000) for layer_hs in tqdm(all_pre_mlp_hs.unbind(dim = 1))]

In [None]:
import os
# Blank out LD_LIBRARY_PATH in this process's environment.
# NOTE(review): presumably a workaround for a shared-library conflict before the cuML UMAP cell
# below - confirm the reason and that setting it after kernel start has the intended effect.
os.environ["LD_LIBRARY_PATH"] = ""


In [None]:
def reduce_umap(layer_hs: torch.Tensor, n_components = 2, metric = 'euclidean', fit_samples: None | int = 10_000):
    """
    Fit cuML UMAP on a random subset of rows, then embed every row.

    Params:
        @layer_hs: A n_token_samples x D tensor for a single layer
        @n_components: Dimensionality of the embedding
        @metric: Accepted but currently NOT forwarded to cuml.UMAP (the `metric` argument is commented
            out below), so the model runs with cuML's default metric regardless of this value
        @fit_samples: Max number of rows used for fitting (drawn without replacement with a fixed seed);
            a falsy value fits on every row

    Returns:
        A numpy array holding the UMAP embedding of all rows of layer_hs
    """
    # https://docs.rapids.ai/api/cuml/stable/api/#umap
    hs_cupy = cupy.asarray(layer_hs.to(torch.float32))
    n_rows = hs_cupy.shape[0]
    subset_indices = (
        np.random.default_rng(123).choice(n_rows, min(n_rows, fit_samples), replace = False)
        if fit_samples
        else list(range(n_rows))
    )

    umap_kwargs = dict(
        n_components = n_components,
        n_neighbors = 15, # 15 for default, smaller = more local data preserved [2 - 100]
        # metric = metric, # euclidean, cosine, manhattan, l2, hamming
        # min_dist = 0.1, # 0.1 by default, effective distance between embedded points
        # n_epochs = 200, # 200 by default for large datasets
        # random_state = None, # Allow parallelism
        # verbose = True,
        build_algo = 'nn_descent',
        build_kwds = {'nnd_n_clusters': 10, 'nnd_do_batch': True}
    )
    model = cuml.UMAP(**umap_kwargs)
    model.fit(hs_cupy[subset_indices, :], data_on_host = True)

    # Embed all rows (not just the fitted subset) and copy the result to host memory
    pred = cupy.asnumpy(model.transform(hs_cupy))
    clear_all_cuda_memory()
    return pred

# Sanity check on the first layer slice: 100-dim embedding fit on at most 10k rows.
# NOTE(review): 'cosine' is passed as `metric`, but reduce_umap has the metric argument to
# cuml.UMAP commented out, so this value is currently ignored.
umap_test = reduce_umap(all_pre_mlp_hs.unbind(dim = 1)[0], 100, 'cosine', 10_000) # 300k = 2min
clear_all_cuda_memory()
# Scatter the first two embedding dims for a random 5000 rows, colored by whether the token is ' of'.
px.scatter(
    pd.concat([pd.DataFrame({'d1': umap_test[:, 0], 'd2': umap_test[:, 1]}), sample_df.head(umap_test.shape[0])], axis = 1)\
        .sample(5000)
        .assign(is_of = lambda df: np.where(df['token'] == ' of', 1, 0)),
    x = 'd1', y = 'd2', color = 'is_of', hover_data = ['token']
).show()

# umap_euc_10 = [reduce_umap(layer_hs, 10, 'euclidean', 1_000_000) for layer_hs in tqdm(all_pre_mlp_hs.unbind(dim = 1))]
# clear_all_cuda_memory()
# umap_cos_10 = [reduce_umap(layer_hs, 10, 'cosine', 1_000_000) for layer_hs in tqdm(all_pre_mlp_hs.unbind(dim = 1))]
# clear_all_cuda_memory()
# umap_euc_100 = [reduce_umap(layer_hs, 100, 'euclidean', 1_000_000) for layer_hs in tqdm(all_pre_mlp_hs.unbind(dim = 1))]
# clear_all_cuda_memory()
# umap_cos_100 = [reduce_umap(layer_hs, 100, 'cosine', 1_000_000) for layer_hs in tqdm(all_pre_mlp_hs.unbind(dim = 1))]
# clear_all_cuda_memory()

In [None]:
"""
DBScan
"""
def cluster_dbscan(layer_hs: torch.Tensor, fit_samples: None | int = 10_000):
    """
    Cluster one layer's vectors with cuML DBSCAN, optionally on a random subset of rows.

    DBSCAN has no out-of-sample prediction step, so only the rows that took part in the
    fit receive a cluster id; every other row is reported as -1.

    Params:
        @layer_hs: A n_token_samples x D tensor for a single layer (a numpy array is also accepted)
        @fit_samples: Max number of rows to cluster (drawn without replacement with a fixed seed);
            a falsy value clusters every row

    Returns:
        A list of length n_token_samples of cluster ids, where -1 means noise or not in the fitted subset
    """
    # https://docs.rapids.ai/api/cuml/stable/api/#dbscan
    # as_tensor is a no-op for tensors and lets numpy outputs of the reducers be passed in directly
    hs_cupy = cupy.asarray(torch.as_tensor(layer_hs).to(torch.float32))
    n_total = hs_cupy.shape[0]
    if fit_samples:
        subset_indices = np.random.default_rng(123).choice(n_total, min(n_total, fit_samples), replace = False)
    else:
        subset_indices = np.arange(n_total)

    dbscan_model = cuml.cluster.DBSCAN(
        metric = 'euclidean', # Or cosine
        min_samples = 5, # Number of samples st the group can be considered a core point
        verbose = True
    )
    dbscan_model.fit(hs_cupy[subset_indices, :])

    # labels_ only covers the fitted rows; scatter them back into a full-length array
    fitted_labels = cupy.asnumpy(dbscan_model.labels_)
    all_labels = np.full(n_total, -1, dtype = fitted_labels.dtype)
    all_labels[subset_indices] = fitted_labels
    pred = all_labels.tolist() # shape = n_samples

    # Report noise among the rows that were actually clustered
    n_noise = int((fitted_labels == -1).sum())
    print(f"Values unassigned to clusters: {n_noise}/{len(fitted_labels)}")
    del dbscan_model, hs_cupy
    gc.collect()
    return pred


# NOTE(review): `umap_euc_10` is only assigned in a commented-out line of the UMAP cell above,
# so on a fresh kernel this raises NameError - confirm which embedding should be clustered here.
dbscan_cls = cluster_dbscan(umap_euc_10[0], 100)

# kmeans_res = [{'layer_ix': layer_ix, 'cluster_ids': cluster_dbscan(layer_hs, 64)} for layer_ix, layer_hs in tqdm(enumerate(all_pre_mlp_hs.unbind(dim = 1)))]

# kmeans_df =\
#     pd.concat([pd.DataFrame({'layer_' + str(x['layer_ix']) + '_id': x['cluster_ids']}) for x in kmeans_res], axis = 1)\
#     .pipe(lambda df: pd.concat([df, sample_df], axis = 1))



In [None]:
"""
UMAP -> 100 + HDBSCAN
"""
import umap

def cluster_umap_to_hdbscan(layer_hs: torch.Tensor, umap_dim: int = 100, fit_samples: None | int = 10_000):
    """
    Fit cuML HDBSCAN on a random subset of rows, then assign every row to a cluster with
    approximate_predict.

    Params:
        @layer_hs: A n_token_samples x D tensor for a single layer
        @umap_dim: Currently unused - TODO: no UMAP reduction is applied before HDBSCAN yet,
            the clustering runs directly on layer_hs
        @fit_samples: Max number of rows used for fitting (drawn without replacement with a fixed seed);
            a falsy value fits on every row

    Returns:
        A list of length n_token_samples of cluster ids, where -1 means unassigned
    """
    # https://docs.rapids.ai/api/cuml/stable/api/#hdbscan
    from cuml.cluster.hdbscan import approximate_predict

    hs_cupy = cupy.asarray(layer_hs.to(torch.float32))
    n_total = hs_cupy.shape[0]

    if fit_samples:
        # Same seeded, without-replacement subsample as the other helpers in this notebook
        subset_indices = np.random.default_rng(123).choice(n_total, min(n_total, fit_samples), replace = False) # 50k = 3min
    else:
        subset_indices = np.arange(n_total)
    n_fit = len(subset_indices)

    hdbscan_model = cuml.cluster.HDBSCAN(
        # Size limits are relative to the rows actually fitted, and must be ints
        min_cluster_size = max(2, n_fit // (64 * 100)), # Min 1/100 of the uniform dist value (n / 64)
        max_cluster_size = (n_fit * 100) // 64, # Max 100x the uniform dist value (n / 64)
        metric = 'euclidean',
        min_samples = 1,
        prediction_data = True, # Required by approximate_predict
        verbose = True
    )
    hdbscan_model.fit(hs_cupy[subset_indices, :])

    # HDBSCAN has no transform/predict method; out-of-sample labels come from approximate_predict
    labels, _probabilities = approximate_predict(hdbscan_model, hs_cupy)
    cluster_labels = cupy.asnumpy(labels).tolist() # shape = (n_samples,)

    print(f"Values unassigned to clusters: {len([l for l in cluster_labels if l == -1])}/{len(cluster_labels)}")

    return cluster_labels

# Run on the first layer slice. The helper defined in this cell is cluster_umap_to_hdbscan
# (no `cluster_hdbscan` exists); the labels are kept in a variable so the cell does not
# print the full per-sample list.
hdbscan_cls = cluster_umap_to_hdbscan(all_pre_mlp_hs.unbind(dim = 1)[0])

## Compare to Dense Models