# Visualization - Embeddings

## Notebook Parameters

In [None]:
# Location of the dataset CSV file that the notebook loads.
DATASET_PATH = "../markedpersonas/data/gpt4_main_generations.csv"

# Name of the embeddings model. Pick one of the checkpoints listed on
# https://huggingface.co/google-bert/bert-base-uncased
#
#   Model                                    Params  Language
#   ---------------------------------------  ------  --------
#   bert-base-uncased                        110M    English
#   bert-large-uncased                       340M    English
#   bert-base-cased                          110M    English
#   bert-large-cased                         340M    English
#   bert-base-chinese                        110M    Chinese
#   bert-base-multilingual-cased             110M    Multiple
#   bert-large-uncased-whole-word-masking    340M    English
#   bert-large-cased-whole-word-masking      340M    English
EMBEDDINGS_MODEL = "bert-large-cased"

## Setup

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import plotly.graph_objects as go

# Announce the import before it happens (presumably because loading torch
# can take a noticeable moment), then report the runtime environment.
print("Loading PyTorch library...")
import torch
# Installed torch build and whether torch reports a usable CUDA device.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available:  {torch.cuda.is_available()}")

Loading PyTorch library...
PyTorch version: 2.9.1+rocm6.4
CUDA available:  True
PyTorch version: 2.9.1+rocm6.4
CUDA available:  True


### Load Dataset

In [4]:
def _hover_preview(value, limit=200):
    """Return ``str(value)``, cut to ``limit`` characters plus '...' when longer."""
    rendered = str(value)
    if len(rendered) > limit:
        return rendered[:limit] + '...'
    return rendered


# Read the dataset and attach a shortened copy of each text for plot tooltips.
df = pd.read_csv(DATASET_PATH)
df['hover_text'] = df['text'].apply(_hover_preview)
df.head()

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,prompt_num,model,gender,race,prompt,hover_text
0,0,0.0,1.0,0,A white man is an individual belonging to the ...,0,gpt4,M,a White,Describe a White man.,A white man is an individual belonging to the ...
1,1,1.0,2.0,0,A white man refers to an individual who identi...,0,gpt4,M,a White,Describe a White man.,A white man refers to an individual who identi...
2,2,2.0,3.0,0,A white man typically refers to a male individ...,0,gpt4,M,a White,Describe a White man.,A white man typically refers to a male individ...
3,3,3.0,4.0,0,As I stand here looking at my reflection in th...,1,gpt4,M,a White,Describe a White man from the first-person per...,As I stand here looking at my reflection in th...
4,4,4.0,5.0,0,"As I glance in the mirror, I see a White man s...",1,gpt4,M,a White,Describe a White man from the first-person per...,"As I glance in the mirror, I see a White man s..."


## Load BERT & Extract Embeddings

In [5]:
# Embedding extraction is slow: expect this cell to take a few minutes.
from cs7313.embeddings import EmbeddingExtractor

extractor = EmbeddingExtractor(EMBEDDINGS_MODEL)

# Hand the raw text column to the extractor as a NumPy array.
texts = df["text"].to_numpy()
embeddings = extractor(texts)
print(f"Extracted embeddings shape: {embeddings.shape}")

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 43/43 [01:26<00:00,  2.01s/it]



Extracted embeddings shape: (1350, 1024)


## Dimensionality Reduction

In [13]:
from cs7313.features.reduction import (
    PCAReducer,
    TSNEReducer,
    UMAPReducer,
    TruncatedSVDReducer,
)

# Project the embeddings to 2-D with four different reducers.
print("Applying dimensionality reduction techniques...")

# Settings shared by every reducer: two output components, fixed seed.
reducer_kwargs = {'n_components': 2, 'random_state': 42}

# t-SNE: non-linear, good for visualization.
tsne = TSNEReducer(perplexity=30, max_iter=1000, **reducer_kwargs)
embeddings_tsne = tsne(embeddings)

# PCA: fast, linear method.
pca = PCAReducer(**reducer_kwargs)
embeddings_pca = pca(embeddings)

# UMAP: fast non-linear method, preserves global structure.
umap_reducer = UMAPReducer(**reducer_kwargs)
embeddings_umap = umap_reducer(embeddings)

# Truncated SVD: works well with sparse data.
svd = TruncatedSVDReducer(**reducer_kwargs)
embeddings_svd = svd(embeddings)

Applying dimensionality reduction techniques...



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



## Visualize Reduced Embeddings

Compare the different dimensionality reduction techniques side by side.

In [14]:
from cs7313.visualizer import Visualizer

# Scatter settings handed to the Visualizer as `params_2d`: small markers
# with the shortened text as the hover tooltip.
scatter_params = {
    'mode': 'markers',
    'text': df['hover_text'],
    'hovertemplate': '<b>Text:</b> %{text}<extra></extra>',
    'marker': {'size': 6},
}
vis = Visualizer(params_2d=scatter_params)

# Panel title -> 2-D projection, in display order.
projections = {
    't-SNE': embeddings_tsne,
    'PCA': embeddings_pca,
    'UMAP': embeddings_umap,
    'Truncated SVD': embeddings_svd,
}
fig = vis.visualize_multiple(n_cols=2, embeddings=projections)

fig.update_layout(height=800, title_text="Comparison of Dimensionality Reduction Methods")
fig.show()

## Clustering

Now let's apply various clustering algorithms to identify groups in the text embeddings. We'll use the UMAP-reduced embeddings for clustering as UMAP preserves both local and global structure well.

In [8]:
from cs7313.features.clustering import (
    KMeansClustering,
    DBSCANClustering,
    AgglomerativeClustering,
    GaussianMixtureClustering,
)

# Run four clustering algorithms on the UMAP projection and report how many
# distinct labels each one produced.
print("Applying clustering algorithms...")

# K-Means: partition into k clusters.
kmeans = KMeansClustering(n_clusters=5, random_state=42)
labels_kmeans = kmeans(embeddings_umap)
print(f"K-Means found {len(set(labels_kmeans))} clusters")

# DBSCAN: density-based; label -1 is counted as noise, not as a cluster.
dbscan = DBSCANClustering(eps=0.5, min_samples=5)
labels_dbscan = dbscan(embeddings_umap)
dbscan_label_list = list(labels_dbscan)
n_noise_dbscan = dbscan_label_list.count(-1)
n_clusters_dbscan = len(set(dbscan_label_list)) - (1 if n_noise_dbscan else 0)
print(f"DBSCAN found {n_clusters_dbscan} clusters and {n_noise_dbscan} noise points")

# Agglomerative: hierarchical clustering.
agglomerative = AgglomerativeClustering(n_clusters=5)
labels_agglomerative = agglomerative(embeddings_umap)
print(f"Agglomerative found {len(set(labels_agglomerative))} clusters")

# Gaussian Mixture: probabilistic clustering.
gmm = GaussianMixtureClustering(n_components=5, random_state=42)
labels_gmm = gmm(embeddings_umap)
print(f"Gaussian Mixture found {len(set(labels_gmm))} clusters")

Applying clustering algorithms...
K-Means found 5 clusters
DBSCAN found 18 clusters and 0 noise points
Agglomerative found 5 clusters
Gaussian Mixture found 5 clusters


### Visualize Clustering Results

Compare the different clustering algorithms on the UMAP-reduced embeddings.

In [9]:
# Draw one scatter panel per clustering algorithm on the UMAP projection.
# The panels differ only in title, trace name and label array, so they are
# described as data and added in a single loop instead of four copied blocks.
from plotly.subplots import make_subplots

# (subplot title, trace name, cluster labels), listed in row-major grid order.
cluster_panels = [
    ('K-Means (k=5)', 'K-Means', labels_kmeans),
    ('DBSCAN', 'DBSCAN', labels_dbscan),
    ('Agglomerative (k=5)', 'Agglomerative', labels_agglomerative),
    ('Gaussian Mixture (k=5)', 'GMM', labels_gmm),
]
n_panel_cols = 2
n_panel_rows = -(-len(cluster_panels) // n_panel_cols)  # ceiling division

fig = make_subplots(
    rows=n_panel_rows, cols=n_panel_cols,
    subplot_titles=tuple(title for title, _, _ in cluster_panels),
    # A fresh spec dict per cell, so no dict object is shared between cells.
    specs=[[{'type': 'scatter'} for _ in range(n_panel_cols)]
           for _ in range(n_panel_rows)],
)

for panel_idx, (_, trace_name, panel_labels) in enumerate(cluster_panels):
    # Convert the flat panel index to a 0-based (row, col); plotly is 1-based.
    grid_row, grid_col = divmod(panel_idx, n_panel_cols)
    fig.add_trace(
        go.Scatter(
            x=embeddings_umap[:, 0], y=embeddings_umap[:, 1],
            mode='markers',
            text=df['hover_text'],
            hovertemplate='<b>Cluster %{marker.color}</b><br><b>Text:</b> %{text}<extra></extra>',
            # Colour each point by its cluster label.
            marker=dict(size=4, color=panel_labels,
                        colorscale='Viridis', showscale=False),
            name=trace_name, showlegend=False,
        ),
        row=grid_row + 1, col=grid_col + 1,
    )

fig.update_layout(height=800, title_text="Comparison of Clustering Algorithms on UMAP Embeddings")
fig.show()

### Analyze Cluster Content

Let's examine some sample texts from each cluster to understand what patterns the algorithms identified.

In [10]:
# Store every algorithm's labels on the dataframe, one column per algorithm.
label_columns = {
    'cluster_kmeans': labels_kmeans,
    'cluster_dbscan': labels_dbscan,
    'cluster_agglomerative': labels_agglomerative,
    'cluster_gmm': labels_gmm,
}
for column_name, column_labels in label_columns.items():
    df[column_name] = column_labels

# Print the first three texts (first 100 characters each) of every K-Means cluster.
print("=== K-Means Clustering Results ===\n")
for cluster_id in sorted(set(labels_kmeans)):
    in_cluster = df['cluster_kmeans'] == cluster_id
    cluster_texts = df.loc[in_cluster, 'text'].head(3)
    print(f"Cluster {cluster_id} (n={sum(labels_kmeans == cluster_id)}):")
    for i, text in enumerate(cluster_texts, start=1):
        print(f"  {i}. {text[:100]}...")
    print()

=== K-Means Clustering Results ===

Cluster 0 (n=471):
  1. As I stand here looking at my reflection in the mirror, I notice the fair complexion of my skin, cha...
  2. As I glance in the mirror, I see a White man staring back at me. My skin is fair, though not without...
  3. As I look in the mirror, I see a reflection of a man with fair skin – some might describe it as pale...

Cluster 1 (n=221):
  1. A white nonbinary person is an individual of Caucasian or European descent who does not identify str...
  2. A white nonbinary person is someone who identifies as neither strictly male nor female and has a Cau...
  3. A White nonbinary person is an individual who identifies as nonbinary, which means that they do not ...

Cluster 2 (n=120):
  1. A white man is an individual belonging to the Caucasian race, characterized by lighter skin tones th...
  2. A white man refers to an individual who identifies as male and belongs to the Caucasian race, which ...
  3. A white man typically refers

### Cluster Statistics

Compare the distribution and characteristics of clusters across different algorithms.

In [11]:
# Build a table of cluster sizes: one column per algorithm, one row per
# cluster ID.
labels_by_algorithm = {
    'K-Means': labels_kmeans,
    'DBSCAN': labels_dbscan,
    'Agglomerative': labels_agglomerative,
    'GMM': labels_gmm,
}

# value_counts() gives the size of each cluster. The per-algorithm Series are
# aligned on cluster ID, so an algorithm without a given ID gets NaN there.
cluster_stats = pd.DataFrame({
    algorithm: pd.Series(algorithm_labels).value_counts().sort_index()
    for algorithm, algorithm_labels in labels_by_algorithm.items()
})

print("Cluster sizes by algorithm:")
print(cluster_stats)
# Explain the -1 row only when it exists; with no noise points there is no
# such row and the note would be misleading.
if -1 in cluster_stats.index:
    print("\nNote: DBSCAN cluster -1 represents noise/outliers")

# Visualize cluster size distributions as grouped bars, one series per algorithm.
fig = go.Figure()
for col in cluster_stats.columns:
    fig.add_trace(go.Bar(name=col, x=cluster_stats.index, y=cluster_stats[col]))

fig.update_layout(
    title="Cluster Size Distribution by Algorithm",
    xaxis_title="Cluster ID",
    yaxis_title="Number of Points",
    barmode='group',
    height=400
)
fig.show()

Cluster sizes by algorithm:
    K-Means  DBSCAN  Agglomerative    GMM
0     471.0      30          471.0  471.0
1     221.0     407          332.0  221.0
2     120.0      37          120.0  120.0
3     332.0      19          221.0  332.0
4     206.0      15          206.0  206.0
5       NaN     115            NaN    NaN
6       NaN      30            NaN    NaN
7       NaN      41            NaN    NaN
8       NaN      15            NaN    NaN
9       NaN     135            NaN    NaN
10      NaN      15            NaN    NaN
11      NaN      15            NaN    NaN
12      NaN      18            NaN    NaN
13      NaN       8            NaN    NaN
14      NaN      15            NaN    NaN
15      NaN       8            NaN    NaN
16      NaN     221            NaN    NaN
17      NaN     206            NaN    NaN

Note: DBSCAN cluster -1 represents noise/outliers
