## **PDF QnA with ChromaDB and Groq**

### Importing Libraries

In [9]:
import os
import yaspin
import chromadb
import argparse
import pypdfium2
from groq import Groq
from chromadb.utils import embedding_functions
from langchain.text_splitter import RecursiveCharacterTextSplitter

### Config Setup

In [2]:
# Read the Groq API key from the environment (None when the variable is unset)
# and build the module-level client that query_groq() sends requests through.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
GROQ_CLIENT = Groq(
    api_key=GROQ_API_KEY,
)

In [13]:
# Notebook sanity check: a bare name as the cell's last expression makes
# Jupyter display the client's repr, confirming the object was constructed.
GROQ_CLIENT

<groq.Groq at 0x1bc3ebfc440>

In [3]:
# Chroma client created with the library's default settings.
chromadb_client = chromadb.Client()

collection_name = "pdf_qa"
existing_collections = chromadb_client.list_collections()

# Depending on the installed chromadb release, list_collections() yields either
# Collection objects (exposing .name) or plain name strings. Normalise both
# shapes to names so the membership test below works with either.
existing_collection_names = {
    getattr(entry, "name", entry) for entry in existing_collections
}

if collection_name in existing_collection_names:
    collection = chromadb_client.get_collection(name=collection_name)
    print(f"✅ Using existing collection: {collection_name}")
else:
    collection = chromadb_client.create_collection(name=collection_name)
    print(f"✅ Created new collection: {collection_name}")

# By default, Chroma uses the Sentence Transformers all-MiniLM-L6-v2 model to
# compute embeddings for documents.
embedding_function = embedding_functions.DefaultEmbeddingFunction()

✅ Created new collection: pdf_qa


### Load PDF (using PyPdfium2)

In [4]:
def load_pdf_text(file_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        file_path: Path to the PDF document to open with pypdfium2.

    Returns:
        The text of all pages concatenated in page order. No separator is
        inserted between pages.
    """
    pdf = pypdfium2.PdfDocument(file_path)
    page_texts = []

    try:
        for page in pdf:
            try:
                textpage = page.get_textpage()
                try:
                    page_texts.append(textpage.get_text())
                finally:
                    # Release child handles before their parents.
                    textpage.close()
            finally:
                page.close()
    finally:
        # Always release the document handle, even if extraction fails.
        pdf.close()

    # Join once instead of growing a string with += on every page.
    return "".join(page_texts)

### Split Text (using LangChain)

In [5]:
def split_text(text, chunk_size=500, chunk_overlap=50):
    """Split text into chunks with LangChain's recursive character splitter.

    Args:
        text: The full text to split.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as chunk_size.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as
            chunk_overlap.

    Returns:
        Whatever the splitter's split_text() returns for ``text``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

### Store Embeddings

In [6]:
def store_embeddings(chunks, batch_size=100):
    """Embed text chunks and add them to the module-level Chroma collection.

    Each chunk is stored under the string form of its position in ``chunks``
    ("0", "1", ...), the same ids the per-chunk version produced.

    Args:
        chunks: Iterable of text chunks to store. An empty iterable is a no-op.
        batch_size: Number of chunks embedded and added per call. Must be a
            positive integer.

    Raises:
        ValueError: If ``batch_size`` is zero (raised by ``range``).
    """
    chunks = list(chunks)

    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        collection.add(
            ids=[str(start + offset) for offset in range(len(batch))],
            documents=batch,
            # The embedding function takes a list of documents and returns one
            # embedding per document, which is the shape `add` expects. Do not
            # pass a bare string or wrap the result in another list.
            embeddings=embedding_function(batch),
        )

### Search for Context

In [7]:
def search_context(query, top_k=3):
    """Return the stored chunks most similar to ``query`` as one string.

    Args:
        query: The user's question.
        top_k: Number of results requested from the collection.

    Returns:
        The matched documents joined with newlines, or an empty string when
        the collection returns no documents.
    """
    # The embedding function expects a list of documents, not a bare string.
    query_embeddings = embedding_function([query])
    results = collection.query(
        query_embeddings=query_embeddings,
        n_results=top_k,
    )

    # results["documents"] holds one list of matches per query. Only one query
    # was sent, so all matches live in the first (and only) inner list.
    documents_per_query = results["documents"]
    if not documents_per_query or not documents_per_query[0]:
        return ""

    return "\n".join(documents_per_query[0])

### Query to Groq

In [10]:
def query_groq(context, question, model="llama3-70b-4096"):
    """Ask a Groq chat model to answer ``question`` using ``context``.

    A terminal spinner is shown while the request is in flight.

    Args:
        context: Retrieved text placed in the prompt as supporting context.
        question: The user's question.
        model: Groq model id to query.
            NOTE(review): the default is kept unchanged from the original
            code; confirm it names a model currently served by Groq.

    Returns:
        The model's reply with surrounding whitespace stripped (an empty
        string if the reply has no content).

    Raises:
        Exception: Any error raised by the Groq client is re-raised after the
            spinner is marked as failed.
    """
    prompt = f"Use the following context to answer the question.\n\nContext:\n{context}\n\nQuestion: {question}"

    # The file does `import yaspin`, so `yaspin` is the module. The spinner
    # factory is its `yaspin` attribute; calling the module raises TypeError.
    with yaspin.yaspin(text="Thinking...", color="cyan") as spinner:
        try:
            response = GROQ_CLIENT.chat.completions.create(
                model=model,
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                temperature=0.1,
                max_tokens=512,
            )
        except Exception:
            spinner.fail("💥 ")
            # Bare raise keeps the original traceback intact.
            raise

        spinner.ok("✅ ")
        content = response.choices[0].message.content
        return (content or "").strip()


### Main function

In [11]:
if __name__ == "__main__":
    # Command-line entry point: index one PDF, then answer questions about it
    # in an interactive loop until the user types 'exit' or closes stdin.
    import sys  # local import: only the script entry point needs it

    parser = argparse.ArgumentParser(description="PDF-based Question Answering with Groq and ChromaDB")
    parser.add_argument(
        "--pdf",
        type=str,
        required=True,
        help="Path to the PDF document"
    )
    args = parser.parse_args()

    pdf_file_path = args.pdf

    # isfile() also rejects directories, which exists() would let through.
    if not os.path.isfile(pdf_file_path):
        print(f"File not found: {pdf_file_path}")
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is not guaranteed to exist.
        sys.exit(1)

    print("[1] Loading and processing PDF...")
    pdf_text = load_pdf_text(pdf_file_path)
    print("[2] Splitting text and storing embeddings in ChromaDB")
    chunks = split_text(pdf_text)
    if not chunks:
        # Nothing to index (e.g. a PDF with no extractable text), so there
        # would be no context to answer any question from.
        print(f"No extractable text found in: {pdf_file_path}")
        sys.exit(1)
    store_embeddings(chunks)

    while True:
        try:
            user_question = input("\nAsk a question (or type 'exit'): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session quietly instead of a traceback.
            print()
            break
        if not user_question:
            continue
        if user_question.lower() == 'exit':
            break
        context = search_context(user_question)
        if not context:
            print("No relevant context found.")
            continue
        try:
            answer = query_groq(context, user_question)
        except Exception as error:
            # Top-level boundary: report the failure and keep the session (and
            # the already-built index) alive for the next question.
            print(f"Failed to get an answer: {error}", file=sys.stderr)
            continue
        print("\nAnswer:", answer)


usage: ipykernel_launcher.py [-h] --pdf PDF
ipykernel_launcher.py: error: the following arguments are required: --pdf


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
