In [None]:
#| default_exp indexing

In [None]:
#| export
from nanorag.base import *
from nanorag.store import *
from nanorag.context import *
from nanorag.llm import *
from nanorag.loaders import *
from typing import Union, List, Dict, Tuple, Optional, Any
import numpy as np

In [None]:
#| export
#| eval: false
# Shared objects used by every cell below: the model context and the document store.
# NOTE(review): this cell is marked `#| export`, so these objects would presumably be
# created when the generated module is imported — confirm that is intended.
# NOTE(review): the "Loading checkpoint shards" output recorded for this cell suggests
# model weights are loaded here — confirm which call triggers it.
context = ModelContext()
context.set_default()
store = DocumentStore()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

For testing, I'll start with a document that I want to be able to retrieve information from.

My naive implementation is an index that stores the embeddings together with a mapping between each node's id and its position in the index. Let's try that.

In [109]:
# Show what the store currently returns; the recorded output is a list of Document objects.
store.get()

[Document(id = ade3b74e-033d-4ec3-85d4-a366607cdd4d, name = 1  , metadata = {'category': 'PDF', '/Author': 'Shesh', '/Creator': 'Microsoft® Word LTSC', '/CreationDate': "D:20220404224045-04'00'", '/ModDate': "D:20220404224045-04'00'", '/Producer': 'Microsoft® Word LTSC', 'pages': 11}, source_id = ccc614bc-cb09-47fb-a89c-cf8e5f5ef501),
 Document(id = 5b82252e-a44a-4221-9d43-360ca58acc67, name = Mindful Explanations: Prevalence and Impact of Mind, metadata = {'category': 'PDF', '/CreationDate': 'D:20231220024706Z', '/Creator': 'LaTeX with acmart 2022/10/24 v1.88 Typesetting articles for the Association for Computing Machinery and hyperref 2023-04-22 v7.00x Hypertext links for LaTeX', '/ModDate': 'D:20231220024706Z', '/PTEX.Fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', '/Producer': 'pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', '/Subject': '-  Human-centered computing  ->  HCI theory, concepts and 

In [None]:
class VectorIndex: #Compatible with TextNode right now
    """In-memory index that keeps node embeddings as rows of a NumPy matrix.

    Attributes:
        node_to_idx: dict mapping ``node.id`` to the row position of its embedding.
        idx_to_node: dict mapping a row position back to ``node.id``.
        idx: 1-D ``int64`` array holding every row position handed out so far.
        embeddings: 2-D array with one row per added embedding.
        embedding_dim: embedding width read from the context (may be ``None``).
    """

    def __init__(self, context): #May not be needed in postgres.
        """Create an empty index.

        Args:
            context: object whose ``embedding[1].word_embedding_dimension`` gives
                the embedding width (huggingface embeddings format).
        """
        self.node_to_idx = {}
        self.idx_to_node = {}
        self.idx = np.array([], dtype=np.int64)
        self.context = context
        #This line below accepts huggingface embeddings format.
        self.embedding_dim = self.context.embedding[1].word_embedding_dimension
        if self.embedding_dim is not None:
            # Start with zero rows of the right width so the matrix is 2-D from the start.
            self.embeddings = np.empty((0, self.embedding_dim))
        else:
            self.embeddings = np.array([])

    # The annotation is a string so the class can be defined without TextNode in scope.
    def add(self, nodes: "Union[TextNode, List[TextNode]]"):
        """Append the embeddings of one node or of several nodes to the index.

        Each node must expose ``.id`` and ``.embedding``; on success its ``.idx``
        attribute is set to the row position assigned to it. Adding a node whose
        id is already indexed appends a new row and re-points ``node_to_idx`` at
        that new row (no de-duplication is performed).

        Args:
            nodes: a single node, or an iterable (list, tuple, ...) of nodes.
                An empty iterable is a no-op.

        Raises:
            TypeError: if ``nodes`` is neither a node nor an iterable.
        """
        # A single node exposes `.embedding` itself; anything else is treated as
        # a collection. Previously a single node was wrapped in a list and then
        # skipped, so it was never actually added.
        if hasattr(nodes, "embedding"):
            nodes = [nodes]
        else:
            nodes = list(nodes)
        if not nodes:
            # np.vstack raises on an empty sequence; nothing to add anyway.
            return

        new_embeddings = np.vstack([node.embedding for node in nodes])
        if self.embeddings.size == 0:
            self.embeddings = new_embeddings
        else:
            self.embeddings = np.append(self.embeddings, new_embeddings, axis=0)

        # Row positions of the rows that were just appended.
        node_idx = np.arange(len(self.idx), self.embeddings.shape[0], dtype=np.int64)
        for node, idx in zip(nodes, node_idx):
            self.node_to_idx[node.id] = idx
            self.idx_to_node[idx] = node.id
            node.idx = idx
        self.idx = np.concatenate((self.idx, node_idx))

    def get_embedding(self, ids: Union[List[int], int]):
        """Return the embedding rows stored at the given row position(s).

        Args:
            ids: a single integer (Python or NumPy), or a list, tuple or
                ``np.ndarray`` of row positions.

        Returns:
            A 2-D array with one row per requested position. A single integer
            yields one row; an empty sequence yields zero rows.

        Raises:
            TypeError: if ``ids`` is of an unsupported type (previously this
                silently returned ``None``).
        """
        if isinstance(ids, (int, np.integer)):
            ids = [ids]
        if not isinstance(ids, (list, tuple, np.ndarray)):
            raise TypeError(
                f"ids must be an int or a list/tuple/ndarray of ints, got {type(ids).__name__}"
            )
        ids = np.asarray(ids)
        if ids.size == 0:
            # An empty list becomes a float64 array, which NumPy rejects as an index.
            ids = ids.astype(np.int64)
        return self.embeddings[ids]

#TODO: Integration with the DocumentBridge and Document Object. 
#TODO: Try querying the Index with some text converted to embeddings.

In [None]:
#| hide
#| eval: false
# Build an empty index; VectorIndex reads the embedding width from the context.
index = VectorIndex(context = context)

In [None]:
#| hide
#| eval: false
# Ingestion: read the PDFs under 'datasets/papers_pdf', pass them through DocumentBridge,
# and add the resulting nodes to the index.
loader = PDFLoader('datasets/papers_pdf', store = store)
documents = loader.get_documents()
# NOTE(review): to_doc() presumably combines the loaded documents into one object — confirm.
document = DocumentBridge(documents, context = context).to_doc()
# NOTE(review): save() presumably persists the document to the store — confirm.
document.save()
# NOTE(review): the nodes are assumed to already carry `.embedding`, since
# VectorIndex.add reads it from every node — confirm.
nodes = DocumentBridge(document, context = context).to_nodes()
index.add(nodes)

In [None]:
#| hide
#| eval: false
# Look the nodes up again via the row positions that index.add() stored on each node;
# the resulting shape is (number of nodes, embedding width).
nodes_idxs = [node.idx for node in nodes]
index.get_embedding(nodes_idxs).shape

(59, 384)

In [None]:
#| hide
#| eval: false
# Total number of rows held by the index.
# NOTE(review): the recorded output (124, 384) does not match the 59 rows returned for
# `nodes` earlier, which suggests the index was filled more than once or cells were run
# out of order — re-run from a fresh kernel to confirm.
index.embeddings.shape

(124, 384)

In [None]:
#| hide
#| eval: false
# NOTE(review): this cell repeats the earlier ingestion cell verbatim. VectorIndex.add
# appends without de-duplicating, so running it adds the same embeddings to the index a
# second time — confirm this is intentional (e.g. to exercise repeated adds).
loader = PDFLoader('datasets/papers_pdf', store = store)
documents = loader.get_documents()
document = DocumentBridge(documents, context = context).to_doc()
document.save()
nodes = DocumentBridge(document, context = context).to_nodes()
index.add(nodes)
