In [None]:
#| default_exp indexing

# Indexing


In [None]:
#| export
from nanorag.base import *
from nanorag.store import *
from nanorag.context import *
from nanorag.llm import *
from nanorag.loaders import *
from typing import Union, List, Dict, Tuple, Optional, Any
import numpy as np

In [None]:
#| export
#| eval: false
# Create the model context and call its set_default()
# (presumably selects/loads the default models — TODO confirm).
# NOTE(review): this cell carries `#| export`, so these statements presumably
# also run when the generated module is imported — confirm that is intended.
context = ModelContext()
context.set_default()
# Document store used by the example cells below (nodes are added to it and
# documents are fetched back from it by id).
store = DocumentStore()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

To test the index, I start with a document that I want to be able to retrieve information from.

My naive implementation is an index that holds the embeddings together with a mapping from each embedding row to its node. Let's try that.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
#| export
class VectorNodesIndex: #Compatible with TextNode right now. Storage of reference of certain nodes.
    """In-memory vector index over node embeddings.

    Keeps three pieces of state in step with each other:

    * `embeddings`  - 2-D array with one row per indexed node,
    * `idx`         - integer row references, parallel to `embeddings`,
    * `idx_to_node` - dict mapping a row reference to the id of its node.

    Only node ids are stored here, not the nodes themselves.
    """
    #Have a store that has the nodes corresponding to certain ids.Ex Node Store.
    def __init__(self, context): #May not be needed in postgres.
        """Create an empty index bound to `context`.

        `context.embedding` must support `[1].word_embedding_dimension`
        (read here) and `.encode(list_of_str)` (used by `query`).
        """
        self.idx_to_node = {}
        self.idx = np.array([], dtype=np.int64)
        self.context = context
        #This line below accepts huggingface embeddings format.
        self.embedding_dim = self.context.embedding[1].word_embedding_dimension
        if self.embedding_dim is not None:
            self.embeddings = np.empty((0, self.embedding_dim))
        else:
            # Dimension unknown: start with a flat empty array; the first
            # call to `add` replaces it with the real 2-D matrix.
            self.embeddings = np.array([])

    def add(self, nodes: "Union[TextNode, List[TextNode]]"): #Embed with non excluded content.
        """Append one node or a list of nodes to the index.

        Each node must expose `.embedding` and `.id`. As a side effect every
        added node gets `node.idx_ref` set to its row reference in the index.
        An empty list is a no-op; any other unsupported type is ignored.
        """
        if isinstance(nodes, TextNode):
            nodes = [nodes]
        # Previously a single TextNode was wrapped and then skipped by an
        # `elif`, so it was never indexed. Fall through to the list path.
        if not isinstance(nodes, list) or len(nodes) == 0:
            return  # nothing to add (np.vstack would raise on an empty list)

        new_embeddings = np.vstack([node.embedding for node in nodes])
        if self.embeddings.size == 0:
            self.embeddings = new_embeddings
        else:
            self.embeddings = np.append(self.embeddings, new_embeddings, axis=0)

        # New row references continue from the current end of the index.
        node_idx = np.arange(len(self.idx), self.embeddings.shape[0], dtype=np.int64)
        for node, idx in zip(nodes, node_idx):
            self.idx_to_node[idx] = node.id
            node.idx_ref = idx
        self.idx = np.concatenate((self.idx, node_idx))

    def get_nodes(self, idx_refs: List[int]):
        """Providing a list of idx_refs of the nodes, get the corresponding node ids"""
        return [self.idx_to_node[idx_ref] for idx_ref in idx_refs]

    def get_embedding(self, idx_ref: Union[List[int], int]):
        """Providing the idx_ref of the node, or nodes get the embedding

        Accepts a Python int, any NumPy integer scalar, a list of ints or an
        integer ndarray. Always returns a 2-D array (one row per reference).
        Returns None for any other input type.
        """
        # np.integer covers np.int32/np.int64/... (only np.int64 was accepted before).
        if isinstance(idx_ref, (int, np.integer)):
            idx_ref = np.array([idx_ref], dtype=np.int64)
        elif isinstance(idx_ref, list):
            idx_ref = np.array(idx_ref, dtype=np.int64)
        if isinstance(idx_ref, np.ndarray):
            return self.embeddings[idx_ref]
        return None

    def query(self, query_str: List[str], top_k: int = 10):
        """Query the index with a list of strings. Returns the top_k results.

        Returns a list of node ids ordered from most to least similar
        (cosine similarity between the encoded query and stored embeddings).
        A single string is treated as a one-element list, and an empty index
        yields an empty list.

        NOTE: when several query strings are given, the per-query rankings are
        flattened row by row before slicing, so the result is the ranking of
        the first query (spilling into the next one only if top_k exceeds the
        number of indexed nodes).
        """
        if isinstance(query_str, str):
            query_str = [query_str]
        if len(self.idx) == 0:
            return []  # cosine_similarity cannot be computed against zero rows
        query_embedding = self.context.embedding.encode(query_str)
        similarity = cosine_similarity(query_embedding, self.embeddings)
        distance = 1 - similarity  # Compute cosine distance as a score
        topk_idx = np.argsort(distance).flatten()
        topk_idx_ref = self.idx[topk_idx][:top_k]
        return self.get_nodes(topk_idx_ref)


In [None]:
#| hide
#| eval: false
# Vector index under test, bound to the shared model context.
index = VectorNodesIndex(context)
# PDF loader over the local 'datasets/papers_pdf' path, constructed with
# `store` as its document store.
loader = PDFLoader('datasets/papers_pdf', store = store)

In [None]:
# | hide
# | eval: false

# Fetch whatever the loader produces from the PDF folder.
documents = loader.get_documents()
# Bridge the loaded documents into a single document object, then bridge that
# document into nodes (presumably chunked and embedded using `context` —
# TODO confirm against DocumentBridge).
document = DocumentBridge(documents, context = context).to_doc()
nodes = DocumentBridge(document, context = context).to_nodes()
# Persist the document, register the nodes in the store, and index them.
# index.add reads node.embedding / node.id and sets node.idx_ref on each node.
document.save()
store.add(nodes)
index.add(nodes)

In [None]:
# | hide
# | eval: false

# Get the source text behind the best matches for a query.
# index.query returns node ids (not nodes), best match first.
idx_node_list = index.query(['End to end'], top_k = 3)
# Resolve the node ids through the store and collect each node's doc_id.
doc_id = [doc.doc_id for doc in store.get(idx_node_list)]
# Fetch the documents by id and display the text of the first one.
store.get(doc_id)[0].text