In [None]:
#| default_exp indexing

# Indexing


In [None]:
#| export
from nanorag.base import *
from nanorag.store import *
from nanorag.context import *
from nanorag.llm import *
from nanorag.loaders import *
from typing import Union, List, Dict, Tuple, Optional, Any
import numpy as np
from abc import ABC, abstractmethod

In [None]:
#| export
#| eval: false
# Shared objects used by the example cells further down in this notebook.
# NOTE(review): ModelContext / DocumentStore come from the nanorag star-imports above;
# set_default() presumably loads the default models (the recorded cell output shows
# checkpoint shards being loaded) — confirm against nanorag.context.
context = ModelContext()
context.set_default()
store = DocumentStore()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

For testing, I start with a document that I want to be able to retrieve information from.

My naive implementation is an index that stores the embeddings and maps each embedding row to the id of its node. Let's try that.

Todo: 

* Try out different scoring strategies that also weight other signals, such as metadata similarity or options that have been picked by an LLM.
* Docker container to rapidly deploy agents.
* Support custom models in addition to sentence-transformer models.

In [None]:
#| Export
# Not used yet. Intended as a dependency-injection point for the index, so that different retrieval strategies can be tried and combined.
# The retrieval strategies will be separated out in the future.
class Retriever(ABC):
    """Abstract interface for a pluggable retrieval strategy.

    The class cannot be instantiated directly; concrete subclasses must
    override :meth:`retrieve`.
    """

    @abstractmethod
    def retrieve(self, query_embedding, embeddings, top_k):
        """Hook to be overridden by subclasses.

        The base implementation does nothing and returns ``None``.
        NOTE(review): judging by the parameter names, implementations are
        expected to rank ``embeddings`` against ``query_embedding`` and keep
        the best ``top_k`` — confirm once a concrete strategy exists.
        """
        return None

In [72]:
#| export
class VectorNodesIndex: #Compatible with TextNode right now. Storage of reference of certain nodes.
    """In-memory vector index mapping embedding rows to node ids.

    Embeddings are kept in a single 2-D array (one row per node); ``idx_to_node``
    maps each row index to the id of the node it was built from.

    Stored embeddings are expected to be normalized: ``add`` stores
    ``node.embedding`` as given, without re-normalizing, and scoring is a plain
    dot product. Keep this in mind when doing operations with the vectors.
    """
    #TODO: Try out with SVM retrieval strategy and other ones.
    #Question. Treat docstore the
    #TODO: Proper context validation
    def __init__(self, context): #May not be needed in postgres.
        """Create an empty index.

        Args:
            context: object exposing ``embedding``; ``embedding[1].word_embedding_dimension``
                is read for the vector size and ``embedding.encode`` is used by ``retrieve``.
        """
        self.idx_to_node = {}
        self.idx = np.array([], dtype=np.int64)
        self.context = context
        #This line below accepts huggingface embeddings format.
        self.embedding_dim = self.context.embedding[1].word_embedding_dimension
        if self.embedding_dim is not None:
            self.embeddings = np.empty((0, self.embedding_dim))
        else:
            self.embeddings = np.array([])
        # Strategy name -> bound method taking (query_embedding, top_k).
        self.retrieval_strategies = {
            "dot_product": self.retrieve_dot_product,
        }

    def add(self, nodes: Union["TextNode", List["TextNode"]]): #Embed with non excluded content.
        """Add one node or a list of nodes to the index.

        Each node must expose ``embedding`` and ``id``; its ``idx_ref`` attribute
        is set to the row index assigned to it. A single node is wrapped in a
        list (previously it was silently ignored); an empty list is a no-op.
        """
        ##TODO: Add nodes to docstore that is referenced there by default.
        # And retrive them directly
        if isinstance(nodes, (list, tuple)):
            nodes = list(nodes)
        else:
            nodes = [nodes]
        if not nodes:
            # np.vstack raises on an empty sequence; nothing to index anyway.
            return
        new_embeddings = np.vstack([node.embedding for node in nodes])
        if self.embeddings.size == 0:
            self.embeddings = new_embeddings
        else:
            self.embeddings = np.append(self.embeddings, new_embeddings, axis=0)
        # New rows occupy the range [old row count, new row count).
        node_idx = np.arange(len(self.idx), self.embeddings.shape[0])
        for node, idx in zip(nodes, node_idx):
            self.idx_to_node[idx] = node.id
            node.idx_ref = idx
        self.idx = np.concatenate((self.idx, node_idx))

    def get_node_ids(self, idx_refs: List[int]):
        """Providing a list of idx_refs of embeddings of the nodes, get the corresponding node ids"""
        return [self.idx_to_node[idx_ref] for idx_ref in idx_refs]

    def get_embedding(self, idx_ref: Union[List[int], int]):
        """Providing the idx_ref of the node, or nodes get the embedding

        Accepts a single integer (Python or numpy), a list/tuple of integers or
        an integer ndarray. Always returns a 2-D array with one row per index.
        Returns ``None`` for any other input type.
        """
        if isinstance(idx_ref, (int, np.integer)):
            # The dtype must be a real dtype; the result of np.issubdtype is a bool.
            idx_ref = np.array([idx_ref], dtype=np.int64)
        elif isinstance(idx_ref, (list, tuple)):
            idx_ref = np.array(idx_ref, dtype=np.int64)
        if isinstance(idx_ref, np.ndarray):
            return self.embeddings[idx_ref]
        return None

    def retrieve(self, query_str: List[str], top_k: int = 10, strategy: str = 'dot_product'):
        """Embed the query and return the best matching node ids with their scores.

        Args:
            query_str: queries to embed. Only the FIRST query is scored, because
                the strategies return the ranking for row 0 of the query matrix.
            top_k: maximum number of matches to return.
            strategy: key into ``retrieval_strategies``; unknown names fall back
                to the dot-product strategy.

        Returns:
            ``(node_ids, scores)``: a list of node ids and an array of scores,
            both ordered from best to worst.
        """
        query_embedding = np.atleast_2d(np.asarray(self.context.embedding.encode(query_str)))
        # Normalize every query row on its own; a single global norm is only
        # correct when exactly one query is passed.
        norms = np.linalg.norm(query_embedding, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # leave all-zero vectors untouched instead of producing NaN
        query_embedding = query_embedding / norms
        retrieve_method = self.retrieval_strategies.get(strategy, self.retrieve_dot_product)
        top_k_idx_matches, top_k_scores = retrieve_method(query_embedding, top_k)
        nodes = self.get_node_ids(top_k_idx_matches)
        return nodes, top_k_scores

    def retrieve_dot_product(self, query_embedding, top_k):
        """Rank the stored embeddings against the first query row by dot product.

        Returns ``(row_indices, scores)`` for the ``top_k`` highest scores in
        descending order; both are empty arrays when the index is empty.
        """
        if self.embeddings.size == 0:
            return np.array([], dtype=np.int64), np.array([])
        # Accept a bare 1-D query vector as well as a (n_queries, dim) matrix.
        query_embedding = np.atleast_2d(query_embedding)
        similarity = self.cosine_similarity(query_embedding, self.embeddings.T)[0]
        top_matches_similarity = np.argsort(similarity)[::-1]
        top_k_idxs = top_matches_similarity[:top_k]
        top_k_scores = similarity[top_k_idxs]
        return top_k_idxs, top_k_scores

    def cosine_similarity(self,vecA, vecB):
        """Dot product of ``vecA`` and ``vecB``; equals cosine similarity only for unit-norm inputs."""
        return np.dot(vecA, vecB)

In [83]:
#| hide
#| eval: false
# Empty vector index bound to the shared model context defined earlier in the notebook.
index = VectorNodesIndex(context)
# Loader pointed at a relative directory: this cell assumes the working directory
# contains datasets/papers_pdf. The shared document store is handed to the loader.
loader = PDFLoader('datasets/papers_pdf', store = store)

In [84]:
# | hide
# | eval: false
# Ingestion pipeline: loader output -> document -> nodes -> store + index.
# NOTE(review): the exact types returned by get_documents / to_doc / to_nodes are defined
# in nanorag and are not visible here; to_nodes() is assumed to return nodes that already
# carry an `embedding`, since index.add() reads node.embedding — confirm.
documents = loader.get_documents()
document = DocumentBridge(documents, context = context).to_doc()
nodes = DocumentBridge(document, context = context).to_nodes()
document.save()
# The same nodes go to both places: the store keeps the nodes, the index keeps their vectors.
store.add(nodes)
index.add(nodes)

In [85]:
# | hide
# | eval: false

# Query the index: retrieve() returns the ids of the best matching nodes plus their scores
# (dot-product strategy by default), here limited to the 5 best matches.
node_ids, top_k_scores = index.retrieve(['What does deep learning do for mathematics in the industry of artificial intelligence?'], top_k = 5)
# Resolve the returned ids back to the stored nodes.
top_nodes = store.get(node_ids)