# Store

> Provides storage for documents

In [None]:
#| default_exp store

In [None]:
#| export
from nanorag.base import Document, abstractmethod, ABC
from nanorag.context import ModelContext
from nanorag.loaders import PDFLoader
from typing import Union, List, Dict, Optional
from uuid import UUID
from collections import defaultdict

In [None]:
#| export
class BaseDocumentStore(ABC):
    """Abstract interface for document storage backends.

    Concrete stores must implement ``__call__``, ``add``, ``ids``, ``delete``
    and ``get``; the class cannot be instantiated until all five are defined.
    """

    def __init__(self, documents: Optional[Dict[str, Document]] = None):
        """Accept optional initial documents; the base class itself stores nothing.

        The default is ``None`` rather than a dict literal so that no mutable
        object is shared between calls.
        """

    @abstractmethod
    def __call__(self, db):  # Connect to backend and specific collection parameters
        """Connect the store to the backend identified by ``db``."""

    @abstractmethod
    def add(self, document: Union[List[Document], Document]):
        """Store one document or a list of documents."""

    @abstractmethod
    def ids(self):
        """Return the ids of the stored documents."""

    @abstractmethod
    def delete(self, ids: Union[List, str, UUID]):
        """Remove the document(s) identified by a single id or a list of ids."""

    @abstractmethod
    def get(self, ids: Optional[Union[List[UUID], UUID]] = None):
        """Fetch the document(s) for a single id or a list of ids.

        ``ids`` defaults to ``None`` so that implementations can offer a
        "fetch everything" call, as the in-memory store does.
        """
        

In [None]:
#| export
class DocumentStore(BaseDocumentStore):
    """In-memory key-value document store, keyed by document id.

    Documents live in ``self.documents``, a plain dict mapping ``doc.id`` to
    the document object. ``add`` refuses any document whose ``hash`` equals the
    hash of one that is already stored. The store only reads the ``id``,
    ``hash`` and ``source_id`` attributes of what it holds, so lists of nodes
    can be stored as well as lists of documents.
    """

    def __init__(self, documents: Optional[Union[List[Document], Document]] = None):
        """Create the store, optionally seeded with one document or a list.

        Seed documents are indexed by id directly; the duplicate-hash check
        performed by ``add`` is not applied to them.

        Raises:
            TypeError: if ``documents`` is not None, a list or a Document.
                (Previously such input left the store without a ``documents``
                attribute and every later call failed.)
        """
        if documents is None:
            self.documents = {}
        elif isinstance(documents, list):
            self.documents = {doc.id: doc for doc in documents}
        elif isinstance(documents, Document):
            self.documents = {documents.id: documents}
        else:
            raise TypeError(
                "documents must be a Document or a list of Documents, "
                f"got {type(documents).__name__}"
            )

    def __call__(self, db):  # Connect to backend and specific collection parameters
        """No-op: the in-memory store has no backend to connect to."""
        pass

    def add(self, document: Union[List[Document], Document]) -> str:
        """Add one document or a list of documents and describe the outcome.

        For a single document the message says whether it was rejected as a
        duplicate, replaced an entry with the same id, or was newly added.
        For a list the message names only the ids that were actually stored
        (newly added or updated); rejected duplicates are left out.
        """
        if isinstance(document, list):
            stored_ids = []
            for doc in document:
                if self._store(doc) != "duplicate":
                    stored_ids.append(doc.id)
            return f"The following documents have been added: {stored_ids}"

        outcome = self._store(document)
        if outcome == "duplicate":
            return f"You tried to add a duplicate document: {document.hash}"
        if outcome == "updated":
            return f"Document with id {document.id} has been updated"
        return f"The following documents have been added: {[document.id]}"

    def _store(self, document: Document) -> str:
        """Insert or replace one document; return 'added', 'updated' or 'duplicate'."""
        # The hash check runs against every stored document before the id is
        # considered, so the result does not depend on dict iteration order.
        if any(stored.hash == document.hash for stored in self.documents.values()):
            return "duplicate"
        outcome = "updated" if document.id in self.documents else "added"
        self.documents[document.id] = document
        return outcome

    def ids(self) -> List:
        """Return the ids of all stored documents, in insertion order."""
        return list(self.documents)

    def delete(self, ids: Union[List, str, UUID]) -> str:
        """Remove documents by id and return a message listing the removed ones.

        ``ids`` is either a list of ids or a single id of any hashable type.
        Ids that are not in the store are skipped silently.
        """
        id_list = ids if isinstance(ids, list) else [ids]
        deleted_docs = []
        for doc_id in id_list:
            removed = self.documents.pop(doc_id, None)
            if removed is not None:
                deleted_docs.append(removed)
        return f"The following docs have been deleted {deleted_docs}"

    def get(self, ids: Optional[Union[List[UUID], UUID]] = None) -> Optional[Union[Document, List[Document]]]:
        """Fetch documents by id.

        With no argument, every stored document is returned. With a list, the
        documents for the ids that exist are returned as a list. With a single
        id, that document is returned. In every case ``None`` is returned when
        nothing matches.
        """
        if ids is None:
            ids = self.ids()
        if isinstance(ids, list):
            found = [self.documents[doc_id] for doc_id in ids if doc_id in self.documents]
            return found or None
        return self.documents.get(ids)

    def group_by_source_id(self, source_id=None):  # Other type of filters can be added
        """Group stored documents by their ``source_id``.

        Without ``source_id`` a dict mapping each source id to its list of
        documents is returned. With ``source_id`` only the list of documents
        whose ``source_id`` equals it is returned (empty when none match).
        """
        if source_id is not None:
            return [doc for doc in self.documents.values() if doc.source_id == source_id]
        grouped_documents = defaultdict(list)
        for doc in self.documents.values():
            grouped_documents[doc.source_id].append(doc)
        return dict(grouped_documents)
#NOTE: Could I store both nodes and docs in same store? 

In [None]:
class PostgresDocumentStore(BaseDocumentStore):
    """Stub for a store that is presumably meant to be Postgres-backed.

    Nothing is implemented yet: the class adds no members of its own, so it
    still carries the abstract methods of ``BaseDocumentStore``.
    """

In [None]:
#| hide
# Run nbdev's export step for this notebook project.
import nbdev

nbdev.nbdev_export()

Example: inserting documents into a `DocumentStore`

In [None]:
#| hide
#| eval: false
# Build a loader for 'datasets/papers_pdf', fetch its documents and seed a
# new store with them.
loader = PDFLoader('datasets/papers_pdf')
documents = loader.get_documents()
store = DocumentStore(documents)

In [None]:
#| hide
#| eval: false
# Fetch the documents from the same loader a second time and add them to the
# existing store.
# NOTE(review): presumably this exercises the duplicate handling in `add` —
# confirm that reloaded documents keep the same hash.
documents_2 = loader.get_documents()
store.add(documents_2)

"The following documents have been added: [UUID('1e31d61a-5232-4b59-90dc-d246fd050a00'), UUID('f83ecaa5-8940-4045-8d80-2e43e3561c32'), UUID('4990eebe-04e5-4038-980e-1a85bf7596ff'), UUID('c0fcb466-6a3d-4744-a76a-df0f1cc9c7db'), UUID('b890edd9-1970-4eb1-b5f9-57bac73727e3'), UUID('dbfa29dd-e849-4a27-b887-866211886329'), UUID('b9a9ad05-0724-45f7-bce9-9ed161563bee'), UUID('dc04e910-e573-41ac-bbdc-d9118d49c25c'), UUID('482d5a74-2989-4acc-8dc7-f6e69f49975b'), UUID('3607bfdd-897e-41d1-a61d-62645ba477b9'), UUID('57473132-e5ab-4549-a61d-79614da8272d'), UUID('f853176c-94ce-447e-99d1-1230fc0d10c1'), UUID('52efbad6-1365-45fa-b9ea-d349d8b76be8'), UUID('79d30bf1-a124-42b5-834a-8f8c69d4a228'), UUID('c7c47eea-6e8f-402c-a323-3f62a7662a9a'), UUID('7a4b11a7-c8f6-4e92-9523-d7eebe4a4f71'), UUID('7e368dc9-8a09-4a47-adf5-e3586d2cc5b6'), UUID('786eb7db-37c9-4892-8093-99fe61a7c396'), UUID('b34707b6-8137-4f2b-bb0a-f8d2ddbb5d76'), UUID('3719f574-74c9-4568-a9ba-5c3657dc5b90'), UUID('6f08bd39-ebd9-449d-934d-87d6aa5

In [None]:
#| hide
#| eval: false
# With a source_id given, group_by_source_id returns the list of stored
# documents whose source_id equals it (an empty list when none match).
document = store.group_by_source_id('6c3a0db6-631d-4dd4-8d12-f04382f633c5')

In [None]:
#| hide
#| eval: false
# Calling get() without ids returns every stored document (None if the store is empty).
documents = store.get()
# TODO: method in the docstore to group by source id, for example.
# TODO: add source id to nodes as optional too. Let's have more complex docs.