In [None]:
from pathlib import Path

from llama_index.core import VectorStoreIndex, load_index_from_storage
from llama_index.core.storage import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Embed with a local HuggingFace model so no API key is needed for the default OpenAI model
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    device="cpu",  # Use "cuda" for GPU acceleration
)

# Directory holding the persisted index; change this to your actual storage directory
persist_dir = "./storage"

# Rebuild the index from whatever was persisted in that directory
storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
index = load_index_from_storage(storage_context)

# Retriever with default settings, plus every node currently held in the docstore
retriever = index.as_retriever()
nodes = [*index.docstore.docs.values()]


In [None]:
# Sanity check: how many nodes were pulled out of the index's docstore.
len(nodes)

In [None]:
# Inspect the first node's attributes (vars() returns the object's __dict__)
# to see which fields and metadata keys are available.
vars(nodes[0])

In [None]:
# Look up the stored embedding for the first node, keyed by the node's id.
# NOTE(review): this reaches through private attributes (_vector_store, _data),
# so it presumably only works with a vector store exposing `embedding_dict` — confirm.
index._vector_store._data.embedding_dict[nodes[0].id_]

In [None]:
# Extract embeddings & data
import numpy as np
import pandas as pd

_MD_SUFFIX = ".md"


def _note_title(file_name):
    """Return the note title for ``file_name``.

    Only a single trailing ".md" is stripped; a ".md" occurring elsewhere in
    the name (e.g. "a.mdx-notes.md") is left untouched. A missing/empty
    ``file_name`` yields "Unknown".
    """
    name = file_name or "Unknown"
    if name.endswith(_MD_SUFFIX):
        return name[: -len(_MD_SUFFIX)]
    return name


# Fail early with a clear message: with no nodes the DataFrame would have no
# "embedding" column and np.vstack would have nothing to stack.
if not nodes:
    raise ValueError("No nodes found in the index docstore; nothing to extract.")

# Node-id -> embedding mapping, looked up once instead of on every iteration.
# NOTE(review): accessed through private attributes of the index/vector store.
embedding_by_id = index._vector_store._data.embedding_dict

# Build one record per node: identifiers, link metadata and its embedding
data = []
for node in nodes:
    metadata = node.metadata  # Per-node metadata dict
    data.append({
        "note_id": node.id_,
        "note_title": _note_title(metadata.get("file_name")),
        "folder": metadata.get("folder_name", "Uncategorized"),
        "links": metadata.get("wikilinks", []),  # Store internal links
        "backlinks": metadata.get("backlinks", []),  # Store backlinks
        "embedding": embedding_by_id[node.id_],
    })

# Convert to DataFrame (built once from the list of records)
df = pd.DataFrame(data)

# Stack the per-node embeddings into a 2-D NumPy array (one row per node) for clustering
embeddings = np.vstack(df["embedding"].tolist())


In [None]:
# Preview the first rows of the DataFrame to check the extracted columns.
df.head()