In [1]:
from pathlib import Path

from llama_index.core import VectorStoreIndex, load_index_from_storage
from llama_index.core.storage import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Use this so don't need to bother with API key to default OPENAI model
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5", device="cpu"  # Use "cuda" for GPU acceleration
)

# Define the directory where the index is stored
persist_dir = "./storage"  # Change this to your actual storage directory

# Load the index from storage
storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
index = load_index_from_storage(storage_context)
retriever = index.as_retriever()
nodes = list(index.docstore.docs.values())


In [2]:
len(nodes)

2968

In [3]:
# Extract embeddings & data
import numpy as np
import pandas as pd

# Extract embeddings and metadata
data = []
for node in nodes:
    metadata = node.metadata  # Extract metadata
    data.append({
        "note_id": node.id_,
        "note_title": metadata.get("file_name", "Unknown").replace(".md", ""),
        "folder": metadata.get("folder_name", "Uncategorized"),
        "links": metadata.get("wikilinks", []),  # Store internal links
        "backlinks": metadata.get("backlinks", []),  # Store backlinks
        "embedding": index._vector_store._data.embedding_dict[node.id_]
    })

# Convert to DataFrame
df = pd.DataFrame(data)

# Convert embeddings into a NumPy array for clustering
embeddings = np.vstack(df["embedding"].values)


In [4]:
import umap

# Reduce dimensionality to 2D for plotting
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine")
embedding_2d = reducer.fit_transform(embeddings)

# Add 2D coordinates to DataFrame
df["x"] = embedding_2d[:, 0]
df["y"] = embedding_2d[:, 1]




In [6]:
from sklearn.cluster import MiniBatchKMeans

# Define number of clusters (can be tuned)
num_clusters = 50 
kmeans = MiniBatchKMeans(n_clusters=num_clusters, batch_size=100, random_state=42)
df["cluster"] = kmeans.fit_predict(embeddings)

print("Clustering complete.")


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



Clustering complete.


In [8]:
import plotly.express as px

# Create an interactive scatter plot
fig = px.scatter(
    df, x="x", y="y",
    color=df["cluster"].astype(str),  # Color by cluster
    hover_data=["note_title","folder"],
    title="Clustered Note Embeddings"
)

# Show the plot
fig.show()


In [10]:
df.head()

Unnamed: 0,note_id,note_title,folder,links,backlinks,embedding,x,y,cluster
0,dfa280e0-5f55-412e-b2f4-2a432835d714,"Home - Priorities, Next, Index",.,"[0 - Tasks this Week, AMOC - Meta - Obsidian -...","[AMOC - Priorities, Vision, Long Term, WK - 25...","[-0.01763683557510376, 0.007754608057439327, 0...",2.871727,6.164631,13
1,06203e5a-6c30-4e41-81e5-6e897f027889,"Home - Priorities, Next, Index",.,"[MOC Journal - State of Union - Ruvi, Homework...","[AMOC - Priorities, Vision, Long Term, WK - 25...","[-0.01486171130090952, 0.01706085354089737, -0...",2.80065,6.00842,13
2,39591306-197b-430d-9d80-902764ab8a64,"Home - Priorities, Next, Index",.,"[AMOC - Finances, AMOC - Self - Mental, AMOC -...","[AMOC - Priorities, Vision, Long Term, WK - 25...","[-0.016235508024692535, 0.01410036999732256, -...",2.836192,6.143332,13
3,1168cd8e-d311-4b09-beb7-93414aec5497,"Home - Priorities, Next, Index",.,"[Ideas - Someday Maybe List, Ideas - Content a...","[AMOC - Priorities, Vision, Long Term, WK - 25...","[-0.013082915917038918, 0.02079198881983757, -...",2.723908,5.905755,13
4,136c218f-f1c0-4e96-9267-91c41c3206c6,Time series RAG,0-Inbox,[AMOC - Trading],"[2025-02-01 - AI for distillation, for search,...","[-0.026576388627290726, -0.00799475982785225, ...",1.533731,9.943548,0


# Using PyVis now
- De-dupe the dataframe by note
- Create graph using the lnks
- plot interactive

In [11]:
def merge_links(series):
    merged = set()
    for item in series:
        if pd.isna(item):
            continue
        # If item is a list (or already a Python object), update directly.
        if isinstance(item, list):
            merged.update(item)
        # Otherwise, assume a comma separated string.
        elif isinstance(item, str):
            # Remove whitespace and split on comma.
            links = [s.strip() for s in item.split(',') if s.strip()]
            merged.update(links)
    return list(merged)

In [42]:
df_dedup = df.drop(columns=['embedding'])

In [43]:
df_dedup['links'] = df_dedup['links'].apply(merge_links)
df_dedup['backlinks'] = df_dedup['backlinks'].apply(merge_links)

In [44]:
df_dedup.dtypes

note_id        object
note_title     object
folder         object
links          object
backlinks      object
x             float32
y             float32
cluster         int32
dtype: object

In [45]:
df_dedup = df_dedup.groupby('note_title', as_index=False).agg({
    'note_id': 'first',    # Use the first note_id for this note_title
    'folder': 'first',     # Use the first folder value
    'x': 'mean',           # Average the x coordinates
    'y': 'mean',           # Average the y coordinates
    'links': 'first',      # Pick the first merged links list
    'backlinks': 'first',   # Pick the first merged backlinks list
    'cluster': 'first'
})

In [46]:
df_dedup.head()

Unnamed: 0,note_title,note_id,folder,x,y,links,backlinks,cluster
0,#137 Justin Su'a Peak Mental Performance,1d5940a9-9b62-4e30-a3d1-010d565a4d98,2-Reference/Readwise/Podcasts,0.803228,2.595648,[The Knowledge Project with Shane Parrish],[],7
1,"#648 James Clear, Atomic Habits — Simple Strat...",c4c75526-e5bf-4137-8aba-b327ef5dfade,2-Reference/Readwise/Podcasts,8.856791,5.156964,[The Tim Ferriss Show],[],32
2,0 - Quick Bits - 2024-08-30 - Learning,13b99377-8ab2-4f37-ae2a-3a9d89731165,3-MOCs and Projects/Projects/Podcast - BoC/Shows,1.03972,2.631599,[],[],7
3,0 - Quick Bits - 2024-09-13 - start your secon...,f1988c95-cafb-4a8e-af76-07f398be6891,3-MOCs and Projects/Projects/Podcast - BoC/Shows,2.970463,4.43839,"[Homework For Life, 1 - Recent Notes]",[],7
4,0 - Tasks this Week,2719bd02-92b3-4649-8f6d-dc641f696689,0-Current Focus,2.647129,5.63778,"[Home - Priorities, Index, Next]","[Home - Priorities, Index, Next]",13


### Build Network graph

In [34]:
from pyvis.network import Network


net = Network(height='800px', width='100%', notebook=True, directed=True)
net.toggle_physics(False)
# net.barnes_hut()

scale_factor = 500

for idx, row in df_dedup.iterrows():
    tooltip = f"<strong>{row['note_title']}</strong><br>Folder: {row['folder']}"
    # The node id can be the note_id (or note_title), and we pass x, y.
    # Setting physics=False fixes the position.
    net.add_node(
        n_id=row['note_title'], 
        label=row['note_title'], 
        title=tooltip, 
        x=float(row['x'] * scale_factor), 
        y=float(row['y'] * scale_factor),
        fixed={'x': True, 'y': True}
    )

valid_titles = set(df_dedup['note_title'])

for idx, row in df_dedup.iterrows():
    source_title = row['note_title']
    # Check if links exists and iterate over them.
    if row['links']:
        for target_title in row['links']:
            # Optional: clean up the target title string.
            target_title = target_title.strip()
            
            # Only add the edge if the target note exists in our data.
            if target_title in valid_titles:
                net.add_edge(source_title, target_title)
            else:
                print(f"Warning: Target note '{target_title}' not found for note '{source_title}'.")





In [57]:
import pandas as pd
from pyvis.network import Network


# --- Normalize the x and y coordinates ---

# Min-max normalization: new_value = (value - min) / (max - min)
df_dedup['x_norm'] = (df_dedup['x'] - df_dedup['x'].min()) / (df_dedup['x'].max() - df_dedup['x'].min())
df_dedup['y_norm'] = (df_dedup['y'] - df_dedup['y'].min()) / (df_dedup['y'].max() - df_dedup['y'].min())


# Example: assume df_dedup is your deduplicated DataFrame with these columns:
# note_title, folder, x, y, links, cluster
# (and links is a list of note titles)

# Define a simple color mapping based on cluster number.
def get_color(cluster):
    # A palette of colors. Add more if you have many clusters.
    palette = ['#e6194B', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
               '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe']
    try:
        idx = int(cluster) % len(palette)
        return palette[idx]
    except (ValueError, TypeError):
        # In case cluster is missing or not convertible to int
        return '#cccccc'  # Default gray

# Initialize the network.
net = Network(height='800px', width='100%', notebook=True, directed=True)

# Disable physics globally to force fixed positions.
net.toggle_physics(False)

# Choose a scaling factor for the UMAP coordinates.
scale_factor = 1000

# Add nodes to the network.
for idx, row in df_dedup.iterrows():
    # Build a tooltip using plain text with a newline.
    # The newline (\n) should create a line break in the tooltip.
    tooltip = f"{row['note_title']}\nFolder: {row['folder']}\nCluster: {row['cluster']}"

    x_pos = float(row['x_norm'] * scale_factor)
    y_pos = float(row['y_norm'] * scale_factor)
    
    # Add node. Notice label is set to an empty string so it won't show by default.
    net.add_node(
        n_id=row['note_title'],  # Using note_title as unique identifier.
        label='',                   # Empty label; title shows on hover instead.
        title=tooltip,
        x=x_pos,
        y=y_pos,
        fixed={'x': True, 'y': True},
        color=get_color(row['cluster']),
        # Optionally, you can set a size and some scaling options.
        size=10,
        shadow=True,  # Adds a shadow effect.
        scaling={"min": 10, "max": 20}  # This helps with interactive hover effects.
    )

# Build a set of valid note titles for edge lookups.
valid_titles = set(df_dedup['note_title'])

# Add edges: for each note, draw an edge from the source note to each linked note.
for idx, row in df_dedup.iterrows():
    source_title = row['note_title']
    if row['links']:
        for target_title in row['links']:
            target_title = target_title.strip()
            if target_title in valid_titles:
                net.add_edge(source_title, target_title)
            else:
                print(f"Warning: Target note '{target_title}' not found for note '{source_title}'.")

# (Optional) Customize additional options using vis.js configuration.
# For example, you can enable hover effects with additional configuration.
net.set_options("""
var options = {
  "nodes": {
    "font": {
      "size": 0,
      "face": "arial",
      "color": "rgba(0,0,0,0)"
    },
    "borderWidth": 2,
    "hover": {
      "enabled": true,
      "size": 20
    },
    "shadow": {
      "enabled": true
    }
  },
  "edges": {
    "color": {
      "inherit": true
    },
    "smooth": false
  },
  "interaction": {
    "hover": true,
    "tooltipDelay": 200
  }
}
""")

# Save the network to an HTML file.
net.show('notes_network.html')


notes_network.html


In [62]:
import pandas as pd
import numpy as np
from pyvis.network import Network

# --- Example DataFrame ---
# Assume df_dedup has columns: note_title, folder, x, y, links, cluster

# Compute the center (you could also use median if preferred)
center_x = df_dedup['x'].mean()
center_y = df_dedup['y'].mean()

# Define parameters for the radial transformation:
# threshold: radii beyond which points will be compressed.
# alpha: exponent (<1) that compresses distances beyond the threshold.
threshold = 50   # adjust based on your data's scale
alpha = 0.5      # 0.5 compresses distances (square-root compression)

def compress_point(x, y, center_x, center_y, threshold, alpha):
    """
    Convert (x, y) to polar coordinates relative to (center_x, center_y),
    compress the radius if above threshold, and convert back to Cartesian.
    """
    # Compute the displacement from center.
    dx = x - center_x
    dy = y - center_y
    r = np.sqrt(dx*dx + dy*dy)
    theta = np.arctan2(dy, dx)
    
    # Apply non-linear transformation only if r exceeds the threshold.
    if r > threshold:
        # Compress the extra radius: new_r = threshold + (r - threshold)^alpha
        new_r = threshold + (r - threshold) ** alpha
    else:
        new_r = r
    
    # Convert back to Cartesian coordinates.
    new_x = center_x + new_r * np.cos(theta)
    new_y = center_y + new_r * np.sin(theta)
    return new_x, new_y

# Apply the transformation to your DataFrame.
# Create new columns for the transformed positions.
transformed_x = []
transformed_y = []
for idx, row in df_dedup.iterrows():
    new_x, new_y = compress_point(row['x'], row['y'], center_x, center_y, threshold, alpha)
    transformed_x.append(new_x)
    transformed_y.append(new_y)

df_dedup['x_trans'] = transformed_x
df_dedup['y_trans'] = transformed_y

# (Optional) You might want to scale or shift these new coordinates further
# so that they fit well on your canvas. For example:
scale_factor = 150  # adjust as needed
df_dedup['x_final'] = df_dedup['x_trans'] * scale_factor
df_dedup['y_final'] = df_dedup['y_trans'] * scale_factor

df_dedup['x_rot'] = -df_dedup['y_final']
df_dedup['y_rot'] = df_dedup['x_final']

# --- Build the Pyvis network with the transformed coordinates ---

def get_color(cluster):
    palette = [
        "#e6194B", "#3cb44b", "#ffe119", "#4363d8", "#f58231",
        "#911eb4", "#46f0f0", "#f032e6", "#bcf60c", "#fabebe",
        "#008080", "#e6beff", "#9a6324", "#fffac8", "#800000",
        "#aaffc3", "#808000", "#ffd8b1", "#000075", "#808080",
        "#ff0000", "#00ff00", "#0000ff", "#ffff00", "#ff00ff",
        "#00ffff", "#800080", "#696969", "#c0c0c0", "#ffa500",
        "#f0e68c", "#dda0dd", "#98fb98", "#afeeee", "#db7093",
        "#ffefd5", "#ffdab9", "#cd853f", "#8b4513", "#d2691e",
        "#b8860b", "#d2b48c", "#deb887", "#f4a460", "#b22222",
        "#ff6347", "#e9967a", "#fa8072", "#ffa07a", "#7fffd4"
    ]
    try:
        idx = int(cluster) % len(palette)
        return palette[idx]
    except (ValueError, TypeError):
        return '#cccccc'  # default gray if cluster value is invalid


net = Network(height='800px', width='100%', notebook=True, directed=True)
net.toggle_physics(False)  # use fixed positions

# Add nodes using the new (x_final, y_final) coordinates.
for idx, row in df_dedup.iterrows():
    tooltip = f"{row['note_title']}\nFolder: {row['folder']}\nCluster: {row['cluster']}"
    net.add_node(
        n_id=row['note_title'],
        label='',                   # hide label by default
        title=tooltip,
        x=float(row['x_rot']),
        y=float(row['y_rot']),
        fixed={'x': True, 'y': True},
        color=get_color(row['cluster']),
        size=10,
        shadow=True,
        scaling={"min": 10, "max": 20}
    )

# Add edges.
valid_titles = set(df_dedup['note_title'])
for idx, row in df_dedup.iterrows():
    source_title = row['note_title']
    if row['links']:
        for target_title in row['links']:
            target_title = target_title.strip()
            if target_title in valid_titles:
                net.add_edge(source_title, target_title)
            else:
                print(f"Warning: Target note '{target_title}' not found for note '{source_title}'.")

# Override global node label settings to hide labels (they will show only on hover).
net.set_options("""
var options = {
  "nodes": {
    "font": {
      "size": 0,
      "face": "arial",
      "color": "rgba(0,0,0,0)"
    },
    "borderWidth": 2,
    "hover": {
      "enabled": true,
      "size": 20
    },
    "shadow": {
      "enabled": true
    }
  },
  "edges": {
    "color": {
      "inherit": true
    },
    "smooth": false
  },
  "interaction": {
    "hover": true,
    "tooltipDelay": 200
  }
}
""")

net.show('notes_network.html')


notes_network.html
