# Example RAG solution

Ingests PDFs into a persistent ChromaDB vector store, builds a retrieval chain on top of it, and runs some test prompts.

## Imports and set up

```
source .venv/bin/activate

pip install python-dotenv
pip install langchain
pip install langchain_openai
pip install langchain_ollama
pip install langchain_core
pip install langchain_community
pip install langchain_chroma
# operator is part of the Python standard library; no install is needed
```

In [1]:
import os, hashlib

from dotenv import load_dotenv

from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_ollama import OllamaLLM
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain_chroma import Chroma

from operator import itemgetter

# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file.
load_dotenv()

# Only required when an OpenAI ("gpt*") model is selected below.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Select a MODEL.
MODEL = "llama3.2"
# MODEL = "smollm:135m"
# MODEL = "gemma3"
# MODEL = "embeddinggemma"

# Set model/embeddings based on the MODEL selected.
if MODEL.startswith("gpt"):
    # Fail here with a clear message rather than deep inside the OpenAI client.
    if not OPENAI_API_KEY:
        raise ValueError(
            f"MODEL is '{MODEL}' but OPENAI_API_KEY is not set; "
            "add it to the environment or to a .env file."
        )
    # Hand the same key to both clients so they are configured consistently.
    model = ChatOpenAI(api_key=OPENAI_API_KEY, model=MODEL)
    embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
else:
    # Any non-"gpt" name is treated as an Ollama model; the same model name is
    # used for text generation and for embeddings.
    model = OllamaLLM(model=MODEL)
    embeddings = OllamaEmbeddings(model=MODEL)


## Create persistent vector store from PDF

In [None]:
def ingest_pdf_folder_safely(
    folder_path: str = "./pdfs",
    persist_dir: str = "./chroma_db",
    collection_name: str = "pdf_docs",
    embeddings=None
):
    """
    Ingest all PDFs in a folder into a persistent Chroma collection.

    - Opens the collection stored in ``persist_dir`` (Chroma creates it when
      it does not exist yet)
    - Skips PDFs whose file name is already stored as a ``source``
    - Skips PDFs that yield no text chunks
    - Gives every chunk a stable ID derived from file name, chunk index and
      chunk text

    Args:
        folder_path: Directory scanned (non-recursively) for ``*.pdf`` files.
        persist_dir: Directory where Chroma persists the collection.
        collection_name: Name of the Chroma collection to use.
        embeddings: Embedding function for the collection. Defaults to
            ``OllamaEmbeddings(model=MODEL)`` when omitted.

    Returns:
        The Chroma vector store. It is returned even when no PDF was added,
        so callers can always build a retriever from it.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
    """
    if embeddings is None:
        embeddings = OllamaEmbeddings(model=MODEL)

    # Remember whether the database was already on disk; this only decides
    # which status message the first successful ingest prints.
    db_exists = os.path.exists(persist_dir)

    # Always open the collection up front so the function never returns None.
    vectorstore = Chroma(
        persist_directory=persist_dir,
        embedding_function=embeddings,
        collection_name=collection_name,
    )

    # Scan folder for PDFs; sort so the processing order is deterministic.
    pdf_files = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")
    )

    for pdf_file in pdf_files:
        # Check the store BEFORE parsing the PDF so that files which are
        # skipped anyway are not loaded and split. One hit is enough.
        existing_docs = vectorstore.get(where={"source": pdf_file}, limit=1)
        if existing_docs["ids"]:
            print(f"📌 PDF '{pdf_file}' already exists — skipping.")
            continue

        # Load PDF and split it into chunks.
        filepath = os.path.join(folder_path, pdf_file)
        pages = PyPDFLoader(filepath).load_and_split()
        if not pages:
            # Nothing to embed; do not send an empty batch to the store.
            print(f"⚠️ PDF '{pdf_file}' produced no text chunks — skipping.")
            continue

        # Add metadata and stable IDs. MD5 is used purely as a content
        # fingerprint here, not for anything security-related.
        ids = []
        for i, page in enumerate(pages):
            # Store the bare file name; the "already ingested" check above
            # filters on exactly this value.
            page.metadata["source"] = pdf_file
            raw_id = f"{pdf_file}-{i}-{page.page_content}"
            ids.append(hashlib.md5(raw_id.encode("utf-8")).hexdigest())

        vectorstore.add_documents(pages, ids=ids)
        if db_exists:
            print(f"✅ PDF '{pdf_file}' added to existing collection.")
        else:
            print(f"✅ PDF '{pdf_file}' added — database initialized.")
            db_exists = True

    return vectorstore

# Ingest the PDFs with the embeddings selected for MODEL, so the vectors stored
# in Chroma come from the same embedding model that will embed the queries.
vectorstore = ingest_pdf_folder_safely(embeddings=embeddings)

# Without a vector store there is nothing to retrieve from; stop with a clear
# message instead of an AttributeError on None.
if vectorstore is None:
    raise RuntimeError("No vector store available: no PDFs were ingested.")

# Use retriever
retriever = vectorstore.as_retriever()


## Create and test a prompt template

In [3]:
# Create prompt template.
# {context} is filled with the retrieved documents, {question} with the
# user's question.
template = """
Answer the question based on the context below. If you can't
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)
# Quick manual check of the rendered prompt:
# prompt.format(context="Here is some context", question="Here is a question")

## Create chain, questions, and parse responses

In [None]:
# Create parser.
parser = StrOutputParser()

# Create chain.
# The input is a dict with a "question" key. That question is sent to the
# retriever to fill "context" and is also passed through unchanged; the
# result is piped through the prompt, the model and the output parser.
chain = (
    {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    | prompt
    | model
    | parser
)

# List questions to apply to prompt.
questions = [
    "Who recommended Emma?",
    "Who is Emma Rowland?",
    "How many practitioners has she helped?",
    "What is Reset Ready?",
    "What is Spark Change Revolution?",
    "Where should you start if you are new to business?",
]

for question in questions:
    print(f"Question: {question}")
    # Invoke outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    answer = chain.invoke({"question": question})
    print(f"Answer: {answer}")
    print()