In [1]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.chroma import Chroma
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os

# Load environment variables from <cwd>/app/.env before any OpenAI client is
# constructed below (presumably this is where the OpenAI API key is kept —
# verify against your setup).
app_dir = os.path.join(os.getcwd(), "app")
load_dotenv(os.path.join(app_dir, ".env"))

# Load every .txt file found recursively under ./data.
loader = DirectoryLoader("./data", glob="**/*.txt")
docs = loader.load()

# Chat model created with library defaults.
model = ChatOpenAI()
# Chroma collection used as the vector store for the first retriever;
# no persist directory is passed here.
vectorstore = Chroma(
    collection_name="full_documents", embedding_function=OpenAIEmbeddings()
)

In [2]:
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever

In [3]:
# In-memory docstore paired with the Chroma vector store created earlier.
docstore = InMemoryStore()
# Two splitters with different chunk sizes: smaller child chunks (250) and
# larger parent chunks (600).
child_splitter = RecursiveCharacterTextSplitter(chunk_size=250)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=600)

# Wire both stores and both splitters into a ParentDocumentRetriever.
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=docstore,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
# Index the loaded documents; ids=None leaves id assignment to the retriever.
retriever.add_documents(docs, ids=None)

In [4]:
# Number of keys currently held by the docstore.
sum(1 for _ in docstore.yield_keys())

22

In [5]:
# Query the retriever built on the Chroma vector store and in-memory docstore.
retriever.invoke("who is the owner?")

[Document(page_content="One evening, as the sun cast a golden glow over the city, a renowned food critic, Elena Rossi, stepped into Chef Amico. Her mission was to uncover the secret behind the restaurant's growing fame. She was greeted by Amico himself, whose eyes sparkled with the joy of a man who loved his work.", metadata={'source': 'data\\restaurant.txt'}),
 Document(page_content='Creating Chef Amico’s Restaurant\n\nReturning to Palermo with a vision, Amico opened the doors to "Chef Amico," a restaurant that was a culmination of his travels and a tribute to his Sicilian roots. Nestled in a quaint corner of the city, the restaurant quickly gained fame for its authentic flavors and Amico’s innovative twists on traditional recipes.', metadata={'source': 'data\\founder.txt'}),
 Document(page_content='As he grew, so did his desire to explore beyond the shores of Sicily. Venturing through Italy, Amico worked alongside renowned chefs, each teaching him a new facet of Italian cuisine. From

### Create a custom Store with PostgreSQL

In [6]:
from pydantic import BaseModel, Field
from typing import Optional


class DocumentModel(BaseModel):
    """Pydantic schema describing a stored document record.

    Every field is optional: ``key`` and ``page_content`` default to ``None``
    and ``metadata`` defaults to a fresh empty dict for each instance.
    """

    key: Optional[str] = None
    page_content: Optional[str] = None
    metadata: dict = Field(default_factory=dict)

In [7]:
from sqlalchemy import Column, String, create_engine
from sqlalchemy.orm import declarative_base
from sqlalchemy.dialects.postgresql import JSONB

# Declarative base; its metadata collects the table definition below.
Base = declarative_base()


class SQLDocument(Base):
    """ORM mapping for one row of the ``docstore`` table."""

    __tablename__ = "docstore"
    # String key; primary key of the table.
    key = Column(String, primary_key=True)
    # Payload stored in a PostgreSQL JSONB column.
    value = Column(JSONB)

    def __repr__(self):
        return f"<SQLDocument(key='{self.key}', value='{self.value}')>"

In [8]:
import logging
from typing import Generic, Iterator, Optional, Sequence, TypeVar

from langchain.schema import Document
from langchain_core.stores import BaseStore
from sqlalchemy.orm import sessionmaker, scoped_session

# Module-level logger; PostgresStore reports database errors through it.
logger = logging.getLogger(__name__)

# Type variable bounded to Document; used as the Generic parameter of PostgresStore.
D = TypeVar("D", bound=Document)


class PostgresStore(BaseStore[str, Document], Generic[D]):
    """Key-value document store persisted in the PostgreSQL ``docstore`` table.

    Each ``Document`` is stored as one ``SQLDocument`` row: the key is the
    primary key and the value is a JSONB object holding ``page_content`` and
    ``metadata``.

    Database errors are logged and rolled back rather than raised, so every
    operation is best-effort; check the log output when results look empty.
    """

    def __init__(self, connection_string: str):
        """Connect to the database and create the ``docstore`` table if missing.

        Args:
            connection_string: SQLAlchemy URL of the PostgreSQL database.
        """
        self.engine = create_engine(connection_string)
        Base.metadata.create_all(self.engine)
        self.Session = scoped_session(sessionmaker(bind=self.engine))

    def serialize_document(self, doc: Document) -> dict:
        """Convert a ``Document`` into the dict stored in the JSONB column."""
        return {"page_content": doc.page_content, "metadata": doc.metadata}

    def deserialize_document(self, value: dict) -> Document:
        """Rebuild a ``Document`` from a stored dict.

        Missing fields fall back to an empty string / empty dict.
        """
        return Document(
            page_content=value.get("page_content", ""),
            metadata=value.get("metadata", {}),
        )

    def mget(self, keys: Sequence[str]) -> list[Optional[Document]]:
        """Fetch the documents stored under ``keys``.

        Args:
            keys: Keys to look up.

        Returns:
            One entry per requested key, in the same order as ``keys``; the
            entry is ``None`` when the key is not stored. An empty list is
            returned when ``keys`` is empty or when the query fails (the
            error is logged).
        """
        keys = list(keys)
        if not keys:
            # Nothing to look up; avoid issuing an empty IN () query.
            return []
        with self.Session() as session:
            try:
                rows = (
                    session.query(SQLDocument).filter(SQLDocument.key.in_(keys)).all()
                )
                # The database returns rows in arbitrary order, so index them
                # by key and rebuild the result in the caller's key order.
                found = {
                    row.key: self.deserialize_document(row.value) for row in rows
                }
                return [found.get(key) for key in keys]
            except Exception as e:
                logger.error(f"Error in mget: {e}")
                session.rollback()
                return []

    def mset(self, key_value_pairs: Sequence[tuple[str, Document]]) -> None:
        """Insert or overwrite the given ``(key, document)`` pairs.

        Existing keys are updated and new keys are inserted. When a key
        appears more than once in ``key_value_pairs`` the last value wins.
        The batch is committed as a whole; on failure it is rolled back and
        the error is logged.
        """
        # Collapse duplicate keys up front so each key is written once.
        latest = {
            key: self.serialize_document(document)
            for key, document in key_value_pairs
        }
        if not latest:
            return
        with self.Session() as session:
            try:
                for key, value in latest.items():
                    # merge() loads the row by primary key and updates it, or
                    # inserts a new row when none exists. A plain bulk insert
                    # would fail on keys that are already stored.
                    session.merge(SQLDocument(key=key, value=value))
                session.commit()
            except Exception as e:
                logger.error(f"Error in mset: {e}")
                session.rollback()

    def mdelete(self, keys: Sequence[str]) -> None:
        """Delete the rows stored under ``keys``; unknown keys are ignored."""
        keys = list(keys)
        if not keys:
            # Nothing to delete; avoid issuing an empty IN () query.
            return
        with self.Session() as session:
            try:
                session.query(SQLDocument).filter(SQLDocument.key.in_(keys)).delete(
                    synchronize_session=False
                )
                session.commit()
            except Exception as e:
                logger.error(f"Error in mdelete: {e}")
                session.rollback()

    def yield_keys(self, *, prefix: Optional[str] = None) -> Iterator[str]:
        """Iterate over the stored keys.

        Args:
            prefix: When given, only keys starting with this string are
                yielded. ``None`` (the default) yields every key.
        """
        with self.Session() as session:
            try:
                query = session.query(SQLDocument.key)
                if prefix is not None:
                    # autoescape keeps % and _ in the prefix literal.
                    query = query.filter(
                        SQLDocument.key.startswith(prefix, autoescape=True)
                    )
                for row in query:
                    yield row[0]
            except Exception as e:
                logger.error(f"Error in yield_keys: {e}")
                session.rollback()


In [11]:
from langchain_community.vectorstores.pgvector import PGVector
from langchain_openai import OpenAIEmbeddings


# Connection URL for the PostgreSQL/pgvector database. It is read from the
# DATABASE_URL environment variable so real credentials stay out of the
# notebook; the literal is only a fallback for a local development database.
DATABASE_URL = os.environ.get(
    "DATABASE_URL", "postgresql+psycopg2://admin:admin@localhost:5432/vectordb"
)

embeddings = OpenAIEmbeddings()

# pgvector-backed vector store used by the second retriever.
store = PGVector(
    collection_name="vectordb",
    connection_string=DATABASE_URL,
    embedding_function=embeddings,
)

Exception: Failed to create vector extension: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5432 failed: FATAL:  password authentication failed for user "admin"

(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [12]:
# Rebuild the retriever on PostgreSQL: the PGVector store replaces Chroma and
# PostgresStore replaces the in-memory docstore. Note that this rebinds the
# `retriever` name used earlier; the splitters are reused unchanged.
retriever = ParentDocumentRetriever(
    vectorstore=store,
    docstore=PostgresStore(connection_string=DATABASE_URL),
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
# Index the loaded documents; ids=None leaves id assignment to the retriever.
retriever.add_documents(docs, ids=None)

In [11]:
# Same query as before, now served by the PostgreSQL-backed retriever.
retriever.invoke("who is the owner?")

[Document(page_content='In the heart of the old quarter of Palermo, amidst the bustling market stalls and the echoes of lively street life, Amico was born into a family where food was more than sustenance—it was the language of love. Raised in the warmth of his Nonna Lucia\'s kitchen, young Amico was captivated by the symphony of flavors and aromas that danced in the air, a testament to his family’s Sicilian heritage.\n\nAmico\'s life was deeply entwined with the vibrant essence of Sicilian cuisine. In the rustic kitchen where his Nonna conjured culinary magic, Amico found his calling. These formative years, filled with the rhythmic chopping of fresh herbs and the sizzling of rich tomato sauces, laid the foundation of his passion for cooking.\n\nThe Journey to Chef Amico\n\nFrom a young age, Amico was immersed in the art of Sicilian cooking. His days were punctuated by visits to the bustling markets of Palermo, where he learned to choose the freshest fish from the Mediterranean and the