# Mapper Algorithm applied to Sentence Embeddings 

Load Data:

In [None]:
# load data
from pathlib import Path
from pandas import read_parquet, concat, DataFrame

# select dataset
dataset = "aql" 
if dataset in ["aql", "aol"]:
    suffix = "special"
else:
    suffix = "all"

# path to embeddings
path = Path(f"/mnt/ceph/storage/data-in-progress/data-teaching/theses/thesis-schneg/analysis_data/analysis/{dataset}-get-embeddings-{suffix}")

# get number of files in path
files = len(list(path.glob("*.parquet"))) 
print(f"Number of files: {files}")

# set number of files to load
numfiles = 5
# files = None
# load embeddings
embeddings_data = DataFrame()
for cnt,path in enumerate(path.glob("*.parquet")):
    print(f"Loading {cnt+1}/{files} {path.name}")
    df = read_parquet(path)
    embeddings_data = concat([embeddings_data, df], ignore_index=True)
    # limit to files for testing
    if cnt+1 == numfiles:
        break
print(embeddings_data.shape)
print(embeddings_data.columns)

Number of files: 200
Loading 1/5 2_000084_000000.parquet
Loading 2/5 2_000049_000000.parquet
Loading 3/5 2_000144_000000.parquet
Loading 4/5 2_000099_000000.parquet
Loading 5/5 2_000054_000000.parquet
(25879, 2)
Index(['serp_query_text_url', 'embeddings'], dtype='object')


Preprocess Data:

In [2]:
import numpy as np
# change dtype of arrays in the embeddings column to float32
embeddings_data["embeddings"] = embeddings_data["embeddings"].apply(lambda x: np.array(x, dtype=np.float32))
# convert to numpy array, standardize data
embeddings = embeddings_data.to_numpy()

# Stack the arrays in the embeddings column into a 2D array
emb_array = np.stack(embeddings[:,1])
# Standardize each feature (column-wise)
emb_array = (emb_array - np.mean(emb_array, axis=0)) / np.std(emb_array, axis=0)

print(emb_array.shape)

(25879, 768)


Apply Mapper:

In [None]:
import kmapper as km
from kmapper.jupyter import display
import umap
import sklearn
import sklearn.manifold as manifold

# initialize Kepler Mapper
mapper = km.KeplerMapper(verbose=1)

# project data into 2D subsapce via 2 step transformation, 1)isomap 2)UMAP
projected_data = mapper.fit_transform(emb_array, projection=[manifold.Isomap(n_components=100, n_jobs=-1), umap.UMAP(n_components=2,random_state=1)])


KeplerMapper(verbose=1)
..Composing projection pipeline of length 2:
	Projections: Isomap(n_components=100, n_jobs=-1)
		UMAP(random_state=1)
	Distance matrices: False
False
	Scalers: MinMaxScaler()
MinMaxScaler()
..Projecting on data shaped (25879, 768)

..Projecting data using: 
	Isomap(n_components=100, n_jobs=-1)



  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])



..Scaling with: MinMaxScaler()

..Projecting on data shaped (25879, 100)

..Projecting data using: 
	UMAP(random_state=1, verbose=1)

UMAP(n_jobs=1, random_state=1, verbose=1)
Thu Jun  5 14:48:09 2025 Construct fuzzy simplicial set
Thu Jun  5 14:48:09 2025 Finding Nearest Neighbors
Thu Jun  5 14:48:09 2025 Building RP forest with 13 trees


  warn(


Thu Jun  5 14:48:14 2025 NN descent for 15 iterations
	 1  /  15
	 2  /  15
	 3  /  15
	 4  /  15
	Stopping threshold met -- exiting after 4 iterations
Thu Jun  5 14:48:26 2025 Finished Nearest Neighbor Search
Thu Jun  5 14:48:28 2025 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Thu Jun  5 14:48:38 2025 Finished embedding

..Scaling with: MinMaxScaler()



In [4]:
# cluster data using DBSCAN
G = mapper.map(projected_data, emb_array, clusterer=sklearn.cluster.DBSCAN(metric="cosine"))

Mapping on data shaped (25879, 768) using lens shaped (25879, 2)

Creating 100 hypercubes.

Created 150 edges and 194 nodes in 0:00:01.712144.


In [5]:
# define an excessively long filename (helpful if saving multiple Mapper variants for single dataset)
fileID = 'projection=' + G['meta_data']['projection'].split('(')[0] + '_' + \
'n_cubes=' + str(G['meta_data']['n_cubes']) + '_' + \
'perc_overlap=' + str(G['meta_data']['perc_overlap']) + '_' + \
'clusterer=' + G['meta_data']['clusterer'].split('(')[0] + '_' + \
'scaler=' + G['meta_data']['scaler'].split('(')[0]

In [None]:
# visualize graph
mapper.visualize(G, 
                path_html=f"../data/mapper_{dataset}_NumFiles_{str(files)}_{fileID}.html",
                title=fileID,
                custom_tooltips = embeddings_data.iloc[:,0].to_numpy(),
                color_function_name = 'Log Percent Returns',
                node_color_function = np.array(['average', 'std', 'sum', 'max', 'min']))

# display mapper in jupyter
# km.jupyter.display("../data/mapper_example_" + fileID + ".html")



Wrote visualization to: ../data/mapper_example_NumFiles_5_projection=UMAP_n_cubes=10_perc_overlap=0.1_clusterer=DBSCAN_scaler=MinMaxScaler.html


'<!DOCTYPE html>\n<html>\n\n<head>\n  <meta charset="utf-8">\n  <meta name="generator" content="KeplerMapper">\n  <title>projection=UMAP_n_cubes=10_perc_overlap=0.1_clusterer=DBSCAN_scaler=MinMaxScaler | KeplerMapper</title>\n\n  <link rel="icon" type="image/png" href="http://i.imgur.com/axOG6GJ.jpg" />\n\n  <link href=\'https://fonts.googleapis.com/css?family=Roboto+Mono:700,300\' rel=\'stylesheet\' type=\'text/css\'>\n  <style>* {\n  margin: 0;\n  padding: 0;\n}\n\nhtml, body {\n  height: 100%;\n}\n\nbody {\n  font-family: "Roboto Mono", "Helvetica", sans-serif;\n  font-size: 14px;\n}\n\n#logo {\n  width:  85px;\n  height: 85px;\n}\n\n#display {\n  color: #95A5A6;\n  background: #212121;\n}\n\n#header {\n  background: #111111;\n}\n\n#print {\n  color: #000;\n  background: #FFF;\n}\n\nh1 {\n  font-size: 21px;\n  font-weight: 300;\n  font-weight: 300;\n}\n\nh2 {\n  font-size: 18px;\n  padding-bottom: 20px;\n  font-weight: 300;\n}\n\nh3 {\n  font-size: 14px;\n  font-weight: 700;\n  text

In [None]:
import os 

# move html file to data folder
os.system(f"mv ../data/mapper_{dataset}_NumFiles_{str(files)}_{fileID}.html /mnt/c/Users/Benjamin/Desktop/mapper-results/")

0