In [None]:
import pickle
import numpy as np

# Load embedding vectors (replace with actual path)
embedding_path = "./Mathlib4_embeddings/outputs/embeddings/best_model_dict_poincare_300_my_dataset"  # Hyperbolic embedding example

# Upper bound on the number of words kept (avoids an overly crowded visualization)
MAX_WORDS = 5000

# SECURITY: pickle.load can execute arbitrary code while unpickling.
# Only load embedding files that come from a trusted source.
with open(embedding_path, "rb") as f:
    embeddings_dict = pickle.load(f)  # Format: {word: vector, ...}

# Fail early with a readable message instead of an IndexError on vectors.shape[1] below
if not embeddings_dict:
    raise ValueError(f"No embeddings found in {embedding_path!r}")

# Keep only the first MAX_WORDS words, in the dictionary's own order.
# No frequency information is used here: this is a plain truncation.
n_samples = min(MAX_WORDS, len(embeddings_dict))
words = list(embeddings_dict.keys())[:n_samples]

# Build the vector matrix only for the retained words, so the full vocabulary
# is never materialized as one big array just to be sliced afterwards.
vectors = np.array([embeddings_dict[word] for word in words])

print(f"Loading completed: {len(words)} words, vector dimension {vectors.shape[1]}")

In [None]:
from sklearn.manifold import TSNE
import umap

# Method 1: t-SNE (suitable for local structure, relatively slow computation)
tsne_params = {
    "n_components": 3,    # Project down to 3D so the result can be plotted
    "perplexity": 30,     # Local/global trade-off, typically chosen between 5 and 50
    "random_state": 42,   # Fixed seed for reproducible results
    "max_iter": 1000,     # New parameter name, replacing the deprecated n_iter
}
tsne = TSNE(**tsne_params)
vectors_3d_tsne = tsne.fit_transform(vectors)

# Method 2: UMAP (suitable for preserving global structure, faster computation)
umap_params = {
    "n_components": 3,    # Project down to 3D
    "n_neighbors": 15,    # Neighbors considered per point; affects preservation of local structure
    "min_dist": 0.1,      # Smaller values result in tighter clusters
    "random_state": 42,   # Setting a seed disables parallel computation (n_jobs=1)
}
umap_model = umap.UMAP(**umap_params)
vectors_3d_umap = umap_model.fit_transform(vectors)

In [None]:
import plotly.express as px
import pandas as pd
import numpy as np

# The projected coordinates and the vocabulary must describe the same rows
assert len(vectors_3d_umap) == len(words), "Mismatch between vector and word count."

# One row per word: the three UMAP coordinates followed by the word itself
df = pd.DataFrame(vectors_3d_umap[:, :3], columns=["x", "y", "z"])
df["word"] = words

# Keywords that get a visible text label in the plot (modify as needed)
target_words = {"vector", "matrix"}

# Label column: the word itself for target words, an empty string for all others
is_target = df["word"].isin(target_words)
df["label"] = df["word"].where(is_target, "")

# Interactive 3D scatter plot
scatter_options = dict(
    x="x",
    y="y",
    z="z",
    text="label",        # Only target words carry a non-empty label
    hover_name="word",   # Every point shows its word on mouse hover
    title="3D Visualization of Hyperbolic Word Embeddings (UMAP)",
    opacity=0.7,
)
fig = px.scatter_3d(df, **scatter_options)

# Marker size, label font size, axis titles and margins
fig.update_traces(marker={"size": 3}, textfont_size=10)
fig.update_layout(
    scene={
        "xaxis_title": "Dimension 1",
        "yaxis_title": "Dimension 2",
        "zaxis_title": "Dimension 3",
    },
    margin={"l": 0, "r": 0, "b": 0, "t": 30},
)

# Display chart (plotly picks a renderer compatible with the environment)
fig.show()

In [None]:
def poincare_distance(u, v):
    """Calculate the geodesic distance between two points of the Poincaré ball.

    Uses the closed form
    d(u, v) = arccosh(1 + 2 * ||u - v||^2 / ((1 - ||u||^2) * (1 - ||v||^2))).

    Args:
        u: Coordinates of the first point (any 1-D array-like, e.g. list or ndarray).
        v: Coordinates of the second point, same length as ``u``.

    Returns:
        The distance as a float, or ``np.inf`` when either point has norm >= 1,
        i.e. lies on or outside the open unit ball where the metric is defined.
    """
    # Coerce so that plain lists/tuples work too (u - v is undefined for lists)
    u = np.asarray(u)
    v = np.asarray(v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    if norm_u >= 1 or norm_v >= 1:
        return np.inf  # Exceeds unit ball range
    delta = np.linalg.norm(u - v)
    return np.arccosh(1 + 2 * (delta ** 2) / ((1 - norm_u ** 2) * (1 - norm_v ** 2)))

# Example: word pairs whose hyperbolic distance should be reported
hierarchy = [
    ("vector", "matrix"),
]

# Measure every pair for which both words are present in the vocabulary;
# pairs with a missing word are skipped.
distances = [
    (w1, w2, poincare_distance(embeddings_dict[w1], embeddings_dict[w2]))
    for (w1, w2) in hierarchy
    if w1 in embeddings_dict and w2 in embeddings_dict
]

# Print distances (expected: deeper hierarchy has smaller distances, e.g., dog→poodle < mammal→dog)
for first_word, second_word, pair_distance in distances:
    print(f"Distance {first_word}→{second_word}: {pair_distance:.4f}")

In [None]:
import plotly.graph_objects as go

# Based on previous UMAP 3D data, filter coordinates of hierarchical words
plotted_words = df["word"].values
hierarchy_coords = []
for (w1, w2, _) in distances:
    # Only pairs whose two words were both kept in the plotted sample can be connected
    if w1 in plotted_words and w2 in plotted_words:
        coord1 = df[df["word"] == w1][["x", "y", "z"]].values[0]
        coord2 = df[df["word"] == w2][["x", "y", "z"]].values[0]
        hierarchy_coords.append((w1, coord1, w2, coord2))

# Create one red line segment per word pair
lines = []
for (w1, c1, w2, c2) in hierarchy_coords:
    lines.append(
        go.Scatter3d(
            x=[c1[0], c2[0]],
            y=[c1[1], c2[1]],
            z=[c1[2], c2[2]],
            mode="lines",
            line=dict(color="red", width=2),
            name=f"{w1}→{w2}",  # Legend entry; otherwise plotly shows a generic "trace N"
            hovertext=f"{w1}→{w2}"
        )
    )

# Draw the connections on a COPY of the scatter figure. Adding traces to `fig`
# directly would mutate the figure built in the earlier cell, so every re-run of
# this cell would stack another duplicate set of lines on top of it.
fig_hierarchy = go.Figure(fig)
fig_hierarchy.add_traces(lines)
fig_hierarchy.update_layout(title="3D Visualization with Hierarchy Connections")
fig_hierarchy.show()

# MuRP


In [None]:
import pickle
import numpy as np

# Load MuRP saved embedding dictionary (replace path with actual file location)
embedding_path = "./Mathlib4_embeddings/outputs/embeddings/best_model_dict_poincare_200_my_dataset"  # Hyperbolic embedding example
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open(embedding_path, "rb") as handle:
    embeddings_dict = pickle.load(handle)

# Entity names and their vectors, in the dictionary's own (matching) order
entities = list(embeddings_dict)
embeddings = np.array(list(embeddings_dict.values()))

# Optional sanity check of the hyperbolic constraint: every row should lie inside the unit ball
norms = np.linalg.norm(embeddings, axis=1)  # row-wise Euclidean (L2) norm
max_norm = norms.max()
print(f"Maximum embedding vector norm: {max_norm} (should be < 1.0)")

In [None]:
import numpy as np
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

def poincare_distance(u, v):
    """Geodesic distance between two points of the Poincaré ball, always finite.

    Uses the closed form
    d(u, v) = arccosh(1 + 2 * ||u - v||^2 / ((1 - ||u||^2) * (1 - ||v||^2))).

    Args:
        u: Coordinates of the first point (any 1-D array-like).
        v: Coordinates of the second point, same length as ``u``.

    Returns:
        The non-negative distance. Points on or outside the unit sphere are
        treated as lying just inside it, so the result is very large but finite
        (a distance matrix built from it stays usable for MDS).
    """
    eps = 1e-9
    u = np.asarray(u)
    v = np.asarray(v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    diff_norm = np.linalg.norm(u - v)
    # Clamp each factor on its own rather than adding eps to their product:
    #  * a single out-of-ball point no longer makes the denominator negative
    #    (which was clipped to arg=1 and reported as distance 0);
    #  * near-boundary points are no longer biased by an eps that can dwarf
    #    the true product (1 - |u|^2)(1 - |v|^2).
    factor_u = np.maximum(1 - norm_u ** 2, eps)
    factor_v = np.maximum(1 - norm_v ** 2, eps)
    arg = 1 + 2 * (diff_norm ** 2) / (factor_u * factor_v)
    arg = np.clip(arg, 1, None)  # guard arccosh against round-off below 1
    return np.arccosh(arg)

# Pairwise hyperbolic distance matrix (symmetric, zero diagonal)
n = embeddings.shape[0]
dist_matrix = np.zeros((n, n))
for i, point_i in enumerate(embeddings):
    for j in range(i + 1, n):
        # Compute each unordered pair once and mirror it across the diagonal
        dist_matrix[i, j] = dist_matrix[j, i] = poincare_distance(point_i, embeddings[j])

# Project to 3D with MDS, fed the precomputed hyperbolic distances
mds = MDS(n_components=3, dissimilarity='precomputed', random_state=42)
points_3d = mds.fit_transform(dist_matrix)

# Hyperbolic distance (not a Euclidean approximation) from every embedding to the ball's origin
origin = np.zeros(embeddings.shape[1])
dist_to_center = np.array([poincare_distance(p, origin) for p in embeddings])

# Rescale the distances to [0, 1) for the colormap; the 1e-9 avoids 0/0 when all distances are equal.
# Blues_r is the reversed Blues colormap, so larger distances get lighter colors.
from matplotlib import cm
nearest = dist_to_center.min()
farthest = dist_to_center.max()
norm_dist = (dist_to_center - nearest) / (farthest - nearest + 1e-9)
colors = cm.Blues_r(norm_dist)

# 3D scatter of the MDS coordinates, colored by distance from the center
fig, ax = plt.subplots(figsize=(10, 8), subplot_kw={"projection": "3d"})
scatter = ax.scatter(*points_3d.T, c=colors, alpha=0.8)

ax.set(xlabel="Dimension 1", ylabel="Dimension 2", zlabel="Dimension 3")
# Title intentionally left off:
# ax.set_title("3D MDS on Hyperbolic Distances with Color by Distance from Center")

plt.show()

In [None]:
import numpy as np
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
from matplotlib import cm

def poincare_distance(u, v):
    """Geodesic distance between two points of the Poincaré ball, always finite.

    Uses the closed form
    d(u, v) = arccosh(1 + 2 * ||u - v||^2 / ((1 - ||u||^2) * (1 - ||v||^2))).

    Args:
        u: Coordinates of the first point (any 1-D array-like).
        v: Coordinates of the second point, same length as ``u``.

    Returns:
        The non-negative distance. Points on or outside the unit sphere are
        treated as lying just inside it, so the result is very large but finite
        (a distance matrix built from it stays usable for MDS).
    """
    eps = 1e-9
    u = np.asarray(u)
    v = np.asarray(v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    diff_norm = np.linalg.norm(u - v)
    # Clamp each factor on its own rather than adding eps to their product:
    #  * a single out-of-ball point no longer makes the denominator negative
    #    (which was clipped to arg=1 and reported as distance 0);
    #  * near-boundary points are no longer biased by an eps that can dwarf
    #    the true product (1 - |u|^2)(1 - |v|^2).
    factor_u = np.maximum(1 - norm_u ** 2, eps)
    factor_v = np.maximum(1 - norm_v ** 2, eps)
    arg = 1 + 2 * (diff_norm ** 2) / (factor_u * factor_v)
    arg = np.clip(arg, 1, None)  # guard arccosh against round-off below 1
    return np.arccosh(arg)

# Local import: needed below to create the output directory before saving the figure
from pathlib import Path

# Where the rendered figure is written
OUTPUT_PATH = Path("./visualization_images/hyperbolic_embedding_3d.pdf")

# Calculate distance matrix
# NOTE(review): the distance matrix, MDS projection and colors computed here are
# identical to those of the preceding cell; consider reusing its results.
n = embeddings.shape[0]
dist_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        dist = poincare_distance(embeddings[i], embeddings[j])
        dist_matrix[i, j] = dist
        dist_matrix[j, i] = dist

# MDS 3D dimensionality reduction
mds = MDS(n_components=3, dissimilarity='precomputed', random_state=42)
points_3d = mds.fit_transform(dist_matrix)

# Calculate hyperbolic distance from each point to origin
origin = np.zeros(embeddings.shape[1])
dist_to_center = np.array([poincare_distance(p, origin) for p in embeddings])

# Normalize distance to map colors (1e-9 avoids 0/0 when all distances are equal)
norm_dist = (dist_to_center - dist_to_center.min()) / (dist_to_center.max() - dist_to_center.min() + 1e-9)
colors = cm.Blues_r(norm_dist)

# Specify points to label; labels that are not known entities are skipped
labels_to_mark = ["matrix", "vector", "isdiag", "diagonal_matrix"]
label_indices = [entities.index(label) for label in labels_to_mark if label in entities]


fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(points_3d[:, 0], points_3d[:, 1], points_3d[:, 2], c=colors, alpha=0.8)

# Add labels to specified points
for idx in label_indices:
    ax.text(points_3d[idx, 0], points_3d[idx, 1], points_3d[idx, 2], entities[idx],
            color='red', fontsize=10, weight='bold')

# 1. Draw connections between specified points
for i in range(len(label_indices)):
    for j in range(i + 1, len(label_indices)):
        p1 = points_3d[label_indices[i]]
        p2 = points_3d[label_indices[j]]
        ax.plot([p1[0], p2[0]], [p1[1], p2[1]], [p1[2], p2[2]], c='red', linestyle='--', alpha=0.6)

# 2. Draw connections from specified points to (0, 0, 0) of the MDS coordinates
# NOTE(review): nothing in this cell maps the Poincaré-ball origin to (0, 0, 0) in
# MDS space, so these lines are a visual reference only, not geodesics to the
# hyperbolic center. Confirm this is the intended reading.
origin_3d = np.zeros(3)
for idx in label_indices:
    p = points_3d[idx]
    ax.plot([origin_3d[0], p[0]], [origin_3d[1], p[1]], [origin_3d[2], p[2]], c='green', linestyle=':', alpha=0.6)

ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")
ax.set_zlabel("Dimension 3")
ax.set_title("3D MDS on Hyperbolic Distances with Highlighted Labels and Lines")

# Save before showing, and make sure the target directory exists first:
# savefig raises FileNotFoundError when the parent directory is missing.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(OUTPUT_PATH, format="pdf", bbox_inches='tight', dpi=300)
plt.show()

print("Hyperbolic distances from specified points to center:")
for idx in label_indices:
    print(f"{entities[idx]}: {dist_to_center[idx]:.4f}")

# Header and rows share one column width so the table lines up even for
# long entity names.
marked_names = [entities[idx] for idx in label_indices]
col_width = max([len(name) for name in marked_names] + [8]) + 2

print("\nHyperbolic distance matrix between specified points:")
print(" " * col_width + "".join(f"{name:>{col_width}}" for name in marked_names))
for i in label_indices:
    row = "".join(f"{dist_matrix[i, j]:>{col_width}.4f}" for j in label_indices)
    print(f"{entities[i]:<{col_width}}{row}")