In [None]:
import pandas as pd
import numpy as np
# from cuml.manifold import UMAP
import umap
import json

In [None]:
# Downsample to 50k points

df_raw = pd.read_csv('mammoth_a.csv')
# Fixed seed so that the sampled rows (and therefore every label, projection
# and JSON export derived from `df` below) are the same on every run.
df = df_raw.sample(50000, random_state=42)

In [None]:
# Test UMAP

# Single trial run; the settings are collected in one place so they are easy to tweak.
umap_test_kwargs = {"n_neighbors": 200, "init": 'spectral', "verbose": True}
reducer = umap.UMAP(**umap_test_kwargs)

# `embedding` holds the projected coordinates returned by the fit.
embedding = reducer.fit_transform(df)

In [None]:
# Plot UMAP projection

import matplotlib.pyplot as plt

# Large white canvas with the axes hidden, so only the points are drawn.
plt.figure(figsize=(40, 40), facecolor='w')
plt.axis('off')

# First two columns of the fitted embedding are used as the x / y positions.
umap_x = reducer.embedding_[:, 0]
umap_y = reducer.embedding_[:, 1]
plt.scatter(umap_x, umap_y, s=2.5, c='black')

In [None]:
# Cluster for colors

from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# NOTE: an earlier 12-cluster AgglomerativeClustering fit on the full frame was
# removed here: its result (`clustering`) was never read anywhere in the
# notebook, and clustering is only needed on the small subset below.

# Hold out 45000 rows so that hierarchical clustering only sees the remaining
# rows of `df`. The frame is passed once; the split itself is seeded.
X_train, X_test = train_test_split(df, test_size=45000, random_state=42)

# Ward-linkage clustering of the subset into 11 groups.
AC = AgglomerativeClustering(n_clusters=11, linkage='ward')
AC.fit(X_train)
labels_unscaled = AC.labels_

# Propagate the subset's cluster ids to every row of `df` with a
# 10-nearest-neighbour classifier trained on the clustered subset.
KN = KNeighborsClassifier(n_neighbors=10)
KN.fit(X_train, labels_unscaled)
labels = KN.predict(df)

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of the sampled points, coloured by the cluster label of each row.
fig = plt.figure(figsize=(48, 35))
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.set_axis_off()

x_3d, y_3d, z_3d = df['x'], df['y'], df['z']
ax.scatter(x_3d, y_3d, z_3d, s=5, c=labels, cmap='Spectral')

# Camera position: elevation 10, azimuth -170.
ax.view_init(elev=10, azim=-170)

plt.show()

In [None]:
# 2D scatter of the current `embedding`, coloured by cluster label.
plt.figure(figsize=(40, 40), facecolor='w')
plt.axis('off')

embedding_x = embedding[:, 0]
embedding_y = embedding[:, 1]
plt.scatter(embedding_x, embedding_y, s=5, c=labels, cmap='Spectral')

In [None]:
# Parameter grid, visited from the largest value to the smallest on both axes.
n_neighbor_parameters = list(reversed([3,5,10,15,20,50,100,200]))
min_distance_parameters = list(reversed([0.0, 0.1, 0.25, 0.5, 0.8, 0.99]))

# Maps "n=<n_neighbors>,d=<min_dist>" -> embedding as nested lists, in visit order.
projections_reverse = {}
first = True

for n in n_neighbor_parameters:
    for d in min_distance_parameters:
        # Settings shared by every run of the sweep.
        umap_kwargs = dict(n_components=2, n_neighbors=n, min_dist=d, metric='euclidean', verbose=True)

        if first:
            # Very first run: no explicit init / learning rate is passed.
            first = False
        else:
            # Every later run starts from the previous run's embedding.
            umap_kwargs.update(learning_rate=0.9, init=embedding)

        reducer = umap.UMAP(**umap_kwargs)
        reducer.fit(df)

        # Keep the result both as the seed for the next run and in the output dict.
        embedding = reducer.embedding_
        projections_reverse[f"n={n},d={d}"] = embedding.tolist()

In [None]:
# Choose which embedding you want to plot
embedding_key = "n=200,d=0.1"  # Replace with your desired parameters

# The sweep stored each embedding as nested lists; turn the chosen one back into an array.
embedding_array = np.asarray(projections_reverse[embedding_key])

# Same canvas setup as the earlier 2D plots: white background, axes hidden.
plt.figure(figsize=(40, 40), facecolor='w')
plt.axis('off')

selected_x = embedding_array[:, 0]
selected_y = embedding_array[:, 1]
plt.scatter(selected_x, selected_y, s=5, c=labels, cmap='Spectral')

In [None]:
# Same entries as `projections_reverse`, but with the insertion order flipped.
projections = {
    name: projections_reverse[name]
    for name in reversed(list(projections_reverse))
}

# Check the new order
print("New order:")
first_keys = list(projections)[:10]  # Show first 10
for i, key in enumerate(first_keys):
    print(f"{i+1}: {key}")

In [None]:
# Scale floating-point coordinates to 10-bit integers

def scale_coordinates_to_integers(projections, target_bits=10):
    """Min-max scale each 2D projection onto an integer grid.

    Every projection is scaled independently, and within a projection the x and
    y axes are scaled independently, so that each axis spans
    ``[0, 2**target_bits - 1]``.

    Args:
        projections: Mapping of key -> sequence of points. Only the first two
            columns of each point are used.
        target_bits: Number of bits of the integer grid (10 -> values 0..1023).

    Returns:
        A new dict with the same keys, each value a list of ``[x, y]`` integer
        pairs. An empty projection maps to an empty list. An axis on which all
        points share the same value maps to 0 for every point.
    """
    max_val = (2 ** target_bits) - 1  # 1023 for 10 bits
    scaled_projections = {}

    for key, coords in projections.items():
        coords_array = np.asarray(coords, dtype=float)

        # Nothing to scale; avoids indexing into / taking the min of an empty array.
        if coords_array.size == 0:
            scaled_projections[key] = []
            continue

        xy = coords_array[:, :2]

        # Per-axis minimum and extent, computed for x and y in one pass.
        axis_min = xy.min(axis=0)
        axis_span = xy.max(axis=0) - axis_min

        # A constant axis has zero extent; dividing by it would produce NaN and
        # undefined integers. Dividing by 1 instead leaves those values at 0.
        safe_span = np.where(axis_span > 0, axis_span, 1.0)

        # Scale to [0, max_val] range
        scaled = np.round((xy - axis_min) / safe_span * max_val).astype(int)

        scaled_projections[key] = scaled.tolist()

    return scaled_projections

# Quantize every projection in `projections` using the function's default
# target_bits; `scaled_projections` is what gets written to the JSON export.
scaled_projections = scale_coordinates_to_integers(projections)


In [None]:
# Recreate the same sorting logic that compress_mammoth.js uses:
# pair every label with its row position, then sort the pairs by label only.
# Python's sort is stable, so rows sharing a label keep their original order.
label_index_pairs = list(zip(labels, range(len(labels))))
sorted_pairs = sorted(label_index_pairs, key=lambda pair: pair[0])  # Sort by cluster label
sorted_indices = [pair[1] for pair in sorted_pairs]

# 1. Reorder projections
reordered_projections = {
    key: [projection[idx] for idx in sorted_indices]
    for key, projection in scaled_projections.items()
}

# 2. Reorder 3D data (rows of `df`, picked by position)
reordered_3d = [df.iloc[idx].tolist() for idx in sorted_indices]

# 3. Reorder labels
reordered_labels = [labels[idx] for idx in sorted_indices]

print("Data successfully reordered!")

In [None]:
# Assemble JSON structure and write to disk

# Keys are added in the order they should appear in the file.
out = {}
out["projections"] = scaled_projections   # quantized 2D coordinates per parameter key
out["labels"] = labels.tolist()           # one cluster label per point
out["3d"] = df.values.tolist()            # rows of the sampled frame

with open('mammoth_a_50k.json', "w") as f:
    json.dump(out, f)

In [None]:

# Load the JSON
with open('mammoth_a_50k.json', 'r') as f:
    data = json.load(f)

# Create random indices (distinct, no replacement). A seeded generator is used
# so that the 10k export contains the same points every time this cell runs.
rng = np.random.default_rng(42)
random_indices = rng.choice(len(data['labels']), size=10000, replace=False)

# Convert to numpy arrays for efficient indexing
labels_array = np.array(data['labels'])
points_3d_array = np.array(data['3d'])

# Subsample: the same row indices are applied to every projection, to the
# labels and to the 3D points, so the three stay aligned row-for-row.
subsampled_data = {
    "projections": {
        key: np.array(proj_list)[random_indices].tolist()
        for key, proj_list in data['projections'].items()
    },
    "labels": labels_array[random_indices].tolist(),
    "3d": points_3d_array[random_indices].tolist()
}

# Save
with open('mammoth_a_10k.json', 'w') as f:
    json.dump(subsampled_data, f)