Load Libraries

In [None]:
from sentence_transformers import SentenceTransformer
from groq import Groq
import os
from pypdf import PdfReader
import chromadb
import numpy as np
from numpy import linalg

Load the embedding model

In [None]:
# Sentence-embedding model; used below to embed both the document chunks and the user's question.
model = SentenceTransformer('BAAI/bge-small-en-v1.5')

In [None]:
# Normalize Embeddings

def normalize_embeddings(embeddings):
  """Scale each row of a 2-D batch of embeddings to unit L2 norm.

  Args:
    embeddings: 2-D array-like of shape (n_vectors, dim); a NumPy array or a
      nested list.

  Returns:
    A list of lists of floats with the same shape as the input. Rows whose
    norm is zero are returned unchanged (all zeros) instead of becoming NaN.
  """
  vectors = np.asarray(embeddings)
  norms = np.linalg.norm(vectors, axis=1, keepdims=True)
  # Avoid 0/0 -> NaN for all-zero rows: dividing by 1 leaves them as zeros.
  norms[norms == 0] = 1
  return (vectors / norms).tolist()

Function to extract text from documents

In [None]:
def extract_text(path):
  """Return the text of every page of the PDF at `path`.

  Each page's extracted text is followed by a newline, and the pages are
  concatenated in document order.
  """
  pdf = PdfReader(path)
  return "".join(pg.extract_text() + "\n" for pg in pdf.pages)

Extract the text from the PDF

In [None]:
# Full text of the source PDF; the path is relative to the notebook's working directory.
text = extract_text('The Finance Bill 2025.pdf')

Turn into chunks

In [None]:
def chunk_text(text, size=300, overlap=0):
  """Split `text` into chunks of at most `size` whitespace-separated words.

  Args:
    text: The string to split; words are obtained with `str.split()`.
    size: Maximum number of words per chunk. Must be positive.
    overlap: Number of trailing words of each chunk repeated at the start of
      the next one. Must satisfy 0 <= overlap < size. The default of 0 gives
      disjoint, back-to-back chunks.

  Returns:
    A list of chunk strings, each with its words joined by single spaces.
    Empty or whitespace-only text yields an empty list.

  Raises:
    ValueError: If `size` is not positive or `overlap` is out of range.
  """
  if size <= 0:
    raise ValueError("size must be a positive integer")
  if not 0 <= overlap < size:
    raise ValueError("overlap must satisfy 0 <= overlap < size")

  words = text.split()
  step = size - overlap
  chunks = []
  for start in range(0, len(words), step):
    chunks.append(" ".join(words[start:start + size]))
    # Stop once a chunk reaches the end, so an overlapping tail that is
    # already fully contained in this chunk is not emitted again.
    if start + size >= len(words):
      break
  return chunks

In [None]:
# Split the extracted text using chunk_text's default chunk size.
chunks = chunk_text(text)

Turn chunks into embeddings

In [None]:
# Embed every chunk with the sentence-transformer model; normalization happens in the next cell.
const_embeddings = model.encode(chunks)

In [None]:
const_embeddings = normalize_embeddings(const_embeddings)
# Show only the dimensions (number of vectors, vector length) rather than
# dumping every float of every embedding into the cell output.
len(const_embeddings), (len(const_embeddings[0]) if const_embeddings else 0)

Load chromadb to store documents and embeddings

In [None]:
# Chroma client whose data is stored on disk under ./chroma_db (relative to the working directory).
client = chromadb.PersistentClient(path="./chroma_db")

In [None]:
# The client is persistent, so the collection outlives the kernel.
# get_or_create_collection keeps this cell re-runnable, whereas
# create_collection errors once the collection already exists on disk.
collection = client.get_or_create_collection(
    name="cons_documents", metadata={"description": "My document collection"}
)

print("collection ready:", collection.name)

In [None]:
# Store L2-normalized vectors: ask() normalizes the query embedding, so the
# stored document embeddings must be normalized the same way. Re-encoding
# without normalization here would silently discard the normalized values.
const_embeddings = normalize_embeddings(model.encode(chunks))

In [None]:
# One id per chunk, derived from the chunk's position: doc_0, doc_1, ...
ids = [f"doc_{i}" for i in range(len(chunks))]

Add to the collection

In [None]:
# upsert instead of add: the ids are positional (doc_0, doc_1, ...) and the
# store is persistent, so re-running the notebook would otherwise try to
# insert ids that already exist. upsert inserts new ids and overwrites
# existing ones, which keeps this cell idempotent.
collection.upsert(
    documents=chunks,
    embeddings=const_embeddings,
    ids=ids
)

Add inference - the brain

In [None]:
# Groq API client; the key is read from the GROQ_API_KEY environment variable (os.getenv returns None if it is unset).
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

In [None]:
# Identifier of the chat model passed to the Groq completion call in generate_answer.
ai_model = 'meta-llama/llama-4-scout-17b-16e-instruct'

In [None]:
def generate_answer(question, retrieved_docs):
  """Ask the chat model to answer `question` using only `retrieved_docs`.

  Args:
    question: The user's question as a string.
    retrieved_docs: Iterable of document strings; they are joined with blank
      lines and supplied to the model as its context.

  Returns:
    The content of the first choice in the chat completion response.
  """
  system_prompt = """
  You are an expert assistant.
  Answer ONLY using the provided context.
  If the answer is not in the context, say:
  "The document does not contain this information"
  """

  context_block = "\n\n".join(retrieved_docs)
  user_prompt = f"""
  context:
  {context_block}

  Question:
  {question}
  """

  chat_messages = [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": user_prompt},
  ]

  # temperature=0 is passed through unchanged, along with the 800-token cap.
  completion = groq_client.chat.completions.create(
      model=ai_model,
      messages=chat_messages,
      temperature=0,
      max_tokens=800,
  )
  first_choice = completion.choices[0]
  return first_choice.message.content

In [None]:
def ask(question, n_results=3):
  """Answer `question` from the indexed document via retrieval + generation.

  The question is embedded with the same model as the chunks, L2-normalized,
  and used to query the Chroma collection; the retrieved chunks are then
  passed to `generate_answer` as context.

  Args:
    question: The user's question as a string.
    n_results: How many chunks to retrieve as context. Defaults to 3.

  Returns:
    The answer string produced by `generate_answer`.
  """
  query_embedding = normalize_embeddings(model.encode([question]))

  results = collection.query(
      query_embeddings=query_embedding,
      n_results=n_results
  )

  # Results are grouped per query embedding; only one query was sent.
  retrieved_docs = results["documents"][0]

  return generate_answer(question, retrieved_docs)

In [None]:
# End-to-end check: retrieve the chunks matching the question and print the model's answer.
answer = ask("What does the document say about lands")
print(answer)