Load Libraries

In [1]:
from sentence_transformers import SentenceTransformer
from groq import Groq
import os
from pypdf import PdfReader
import chromadb
import numpy as np
from numpy import linalg

  from .autonotebook import tqdm as notebook_tqdm


Load embedder model

In [2]:
# Embedding model shared by the whole notebook: it encodes both the document
# chunks and the user question, so the two live in the same vector space.
model = SentenceTransformer('BAAI/bge-small-en-v1.5')

Loading weights: 100%|██████████| 199/199 [00:03<00:00, 54.72it/s, Materializing param=pooler.dense.weight]                               
[1mBertModel LOAD REPORT[0m from: BAAI/bge-small-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [3]:
# Normalize Embeddings

def normalize_embeddings(embeddings):
  """Scale each embedding vector to unit L2 length.

  Args:
    embeddings: 2-D array-like with one vector per row. A single 1-D vector
      is also accepted and is treated as one row.

  Returns:
    A list of lists of floats, one inner list per input vector. All-zero
    vectors are returned unchanged instead of turning into NaN.
  """
  vectors = np.atleast_2d(np.asarray(embeddings))
  norms = np.linalg.norm(vectors, axis=1, keepdims=True)
  # Avoid 0/0 -> NaN for all-zero rows: dividing by 1 leaves them as zeros.
  norms[norms == 0] = 1
  return (vectors / norms).tolist()

Function to extract text from documents

In [4]:
def extract_text(path):
  """Extract the text of every page of a PDF as one string.

  Args:
    path: Location of the PDF file, passed straight to `PdfReader`.

  Returns:
    The text of all pages in reading order, each page followed by a newline.
  """
  reader = PdfReader(path)
  # `or ""` keeps the extraction going should a page yield no text (None);
  # joining once avoids re-copying an ever-growing string on every page.
  return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

Load the PDF and extract its text

In [5]:
# Source document for the whole pipeline. The path is relative, so the PDF
# must sit in the notebook's working directory.
text = extract_text('The Finance Bill 2025.pdf')

Split the text into chunks

In [6]:
def chunk_text(text, size=300, overlap=0):
  """Split text into chunks of at most `size` whitespace-separated words.

  Args:
    text: The text to split. Words are delimited by any whitespace.
    size: Maximum number of words per chunk; must be at least 1.
    overlap: Number of words shared between consecutive chunks, so that a
      sentence cut at a boundary still appears whole in one of them. Must
      satisfy 0 <= overlap < size. The default of 0 yields disjoint chunks.

  Returns:
    A list of chunk strings with words re-joined by single spaces; an empty
    list when `text` contains no words.

  Raises:
    ValueError: If `size` or `overlap` is out of range.
  """
  if size < 1:
    raise ValueError("size must be a positive number of words")
  if not 0 <= overlap < size:
    raise ValueError("overlap must satisfy 0 <= overlap < size")

  words = text.split()
  step = size - overlap
  chunks = []
  for start in range(0, len(words), step):
    chunks.append(" ".join(words[start:start + size]))
    # Stop once a chunk reaches the last word; otherwise an overlapping
    # stride would emit a tail chunk fully contained in the previous one.
    if start + size >= len(words):
      break
  return chunks

In [7]:
# Split the extracted text with chunk_text's default chunk size; these
# chunks are what gets embedded and stored for retrieval.
chunks = chunk_text(text)

Turn chunks into embeddings

In [8]:
# Encode every chunk with the embedding model (one vector per chunk).
const_embeddings = model.encode(chunks)

In [9]:
# Rescale every chunk embedding to unit length and convert to nested lists.
# NOTE(review): this rebinds `const_embeddings` in place, and the bare
# expression below renders the entire embedding list as cell output;
# consider previewing a small slice instead.
const_embeddings = normalize_embeddings(const_embeddings)
const_embeddings

[[-0.09797094017267227,
  -0.020707568153738976,
  0.07623841613531113,
  -0.01181878987699747,
  0.0510750450193882,
  0.00539123872295022,
  0.012470796704292297,
  -0.006119597237557173,
  -0.023356573656201363,
  0.030947301536798477,
  0.04620823264122009,
  0.049239207059144974,
  0.013988767750561237,
  -0.002862711437046528,
  -0.0002702516212593764,
  -0.07529973238706589,
  -0.02943987213075161,
  -0.10582203418016434,
  -0.033844299614429474,
  0.10654786974191666,
  0.10271919518709183,
  -0.008671934716403484,
  0.045767344534397125,
  0.028683392331004143,
  0.06997165828943253,
  0.03884996846318245,
  -0.004838030319660902,
  -0.05340319499373436,
  0.016615573316812515,
  -0.20066888630390167,
  0.004712728317826986,
  -0.041345853358507156,
  0.018404416739940643,
  -0.010201681405305862,
  0.008146158419549465,
  -0.01979670487344265,
  -0.034120187163352966,
  0.008674985729157925,
  0.015539241954684258,
  0.019634241238236427,
  0.000697420327924192,
  -0.01239355

Load chromadb to store documents and embeddings

In [10]:
# Chroma client backed by the local ./chroma_db directory (relative to the
# notebook's working directory).
client = chromadb.PersistentClient(path="./chroma_db")

In [11]:
# get_or_create_collection keeps this cell re-runnable: the client stores its
# data on disk, so a plain create_collection fails once "cons_documents"
# already exists from an earlier run.
collection = client.get_or_create_collection(
    name="cons_documents", metadata={"description": "My document collection"}
)

print("collection created:", collection.name)

collection created: cons_documents


In [12]:
# Normalize the chunk embeddings the same way `ask` normalizes the query
# embedding, so stored vectors and query vectors are on the same scale.
# normalize_embeddings already returns plain nested lists.
const_embeddings = normalize_embeddings(model.encode(chunks))

In [14]:
# One id per chunk, derived from its position in `chunks`: doc_0, doc_1, ...
ids = ["doc_" + str(position) for position, _ in enumerate(chunks)]

Add the chunks and their embeddings to the collection

In [15]:
# upsert (rather than add) keeps this cell safe to re-run: entries whose ids
# are already stored in the collection are updated instead of duplicated.
collection.upsert(
    documents=chunks,
    embeddings=const_embeddings,
    ids=ids
)

Add inferencing - The brain

In [16]:
# The API key is read from the GROQ_API_KEY environment variable so it is
# never hardcoded in the notebook; os.getenv yields None when it is unset.
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

In [17]:
# Identifier of the chat model that generate_answer requests from Groq.
ai_model = 'meta-llama/llama-4-scout-17b-16e-instruct'

In [18]:
def generate_answer(question, retrieved_docs):
  """Ask the chat model to answer a question from retrieved passages only.

  Args:
    question: The user's question, inserted verbatim into the prompt.
    retrieved_docs: Iterable of passage strings used as the sole context.

  Returns:
    The message content of the first choice in the chat completion.
  """
  system_prompt = """
  You are an expert assistant.
  Answer ONLY using the provided context.
  If the answer is not in the context, say:
  "The document does not contain this information"
  """

  # Passages are separated by a blank line inside the prompt.
  joined_context = "\n\n".join(retrieved_docs)
  user_prompt = f"""
  context:
  {joined_context}

  Question:
  {question}
  """

  chat_messages = [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": user_prompt},
  ]
  completion = groq_client.chat.completions.create(
      model=ai_model,
      messages=chat_messages,
      temperature=0,
      max_tokens=800,
  )

  first_choice = completion.choices[0]
  return first_choice.message.content

In [19]:
def ask(question, n_results=3):
  """Answer a question from the most similar chunks in the collection.

  Args:
    question: The user's question as a string.
    n_results: How many chunks to retrieve as context. Defaults to 3.

  Returns:
    The answer text produced by `generate_answer`.
  """
  # Embed the question as a batch of one and normalize it with the same
  # helper used elsewhere in the notebook.
  query_embedding = normalize_embeddings(model.encode([question]))

  results = collection.query(
      query_embeddings=query_embedding,
      n_results=n_results
  )

  # A single query was sent, so its documents are at index 0.
  retrieved_docs = results["documents"][0]

  return generate_answer(question, retrieved_docs)

In [20]:
# End-to-end check: retrieve the closest chunks for the question and print
# the model's answer built from them.
answer = ask("What does the document say about lands")
print(answer)

The document mentions "land" or "lands" in the following contexts:

- It discusses gains or profits from the sale of standing timber by a person who has purchased the land on which the timber is situated, and allows for a deduction of the value of the standing timber at the time the owner acquired the land.

- It defines "agricultural land" and allows for an expenditure of a capital nature incurred in clearing such land or on clearing and planting thereon permanent or semi-permanent crops.

- It defines "land" situated in Kenya as including any right or interest in or over that land.

No further information seems to be provided about lands.
