In [1]:
import os
import glob
from dotenv import load_dotenv
from pathlib import Path
import fitz  # PyMuPDF
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
GROQ_KEY = os.environ.get("GROQ_API_KEY")

model_name="meta-llama/llama-4-scout-17b-16e-instruct"

In [3]:
client = Groq() # GROQ_API_KEY

def call_groq(model_name, user_message):  
  completion = client.chat.completions.create(
    model=model_name,
    messages=[
      {
        "role": "user",
        "content": f"{user_message}"
      }
    ],
    temperature=0,
    max_completion_tokens=8192,
    top_p=1,
    # reasoning_effort="medium", # not supported for llama 3.3 70B
    stream=False,
    stop=None
  )

  return completion.choices[0].message

NameError: name 'Groq' is not defined

In [4]:
PDF_DIR = Path("docs")
OUT_DIR = Path("extracted_docs")

OUT_DIR.mkdir(exist_ok=True)


def extract_text_from_pdf(pdf_path: Path) -> str:
    doc = fitz.open(pdf_path)
    pages = []

    for i, page in enumerate(doc):
        text = page.get_text("text")
        if text.strip():
            pages.append(f"\n--- Page {i + 1} ---\n{text}")

    return "\n".join(pages)


def process_pdfs(pdf_dir: Path, out_dir: Path):
    for pdf_file in pdf_dir.glob("*.pdf"):
        print(f"Extracting: {pdf_file.name}")
        text = extract_text_from_pdf(pdf_file)

        output_file = out_dir / f"{pdf_file.stem}.txt"
        output_file.write_text(text, encoding="utf-8")

process_pdfs(PDF_DIR, OUT_DIR)

Extracting: Axis_investments_oct-dec-2025.pdf
Extracting: VisionDecember2025.pdf


In [4]:
def make_documents(text: str, source: str):
    return [
        Document(
            page_content=text,
            metadata={"source": source}
        )
    ]


def chunk_documents(docs):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    return splitter.split_documents(docs)


def get_embeddings():
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )


def build_vectorstore(chunks, embeddings):
    return Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory="chroma_db",
        collection_name="docs_rag"
    )


def format_docs(docs):
    return "\n\n".join(d.page_content for d in docs)

In [5]:
# this takes time!

all_docs = []

for text in glob.glob("extracted_docs/*.txt"):
    with open(text, 'r') as txt:
        docs = make_documents(txt.read(), text)
        all_docs.extend(docs)

chunks = chunk_documents(all_docs)
embeddings = get_embeddings()
vectorstore = build_vectorstore(chunks, embeddings)

ImportError: Could not import chromadb python package. Please install it with `pip install chromadb`.

In [None]:
# vectorstore.save_local("faiss_index")

vectorstore = Chroma(
    persist_directory="chroma_db",
    embedding_function=embeddings,
    collection_name="docs_rag"
)

retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6}
)

In [None]:
llm = ChatGroq(
    model=model_name,
    temperature=0.0
)

In [7]:
prompt = ChatPromptTemplate.from_template("""
You are a factual assistant.
Answer ONLY using the provided context.
If the answer is not in the context, say "I don't know".

Context:
{context}

Question:
{question}
""")


In [8]:
# chaining
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
)

rag_question = "what is axis bank?"

response = rag_chain.invoke(rag_question)
print()
print(rag_question)
print("RAG reply: ", response.content)

NameError: name 'retriever' is not defined

In [None]:
message = call_groq(model_name=model_name, user_message=rag_question)

print("Model Response: ", message.content)

Model Response:  Simon Sinek is a British-American motivational speaker, author, and organizational consultant known for his work on leadership, communication, and inspiring others. He is best known for his TED Talk, "How Great Leaders Inspire Action," which has been viewed over 55 million times, making it one of the most-watched TED Talks of all time.

Simon Sinek was born on October 9, 1973, in London, England. He studied at City University London and later earned an MBA from Harvard Business School. After working in marketing and advertising, Sinek founded his own consulting firm, The Sinek Group, where he works with leaders and organizations to help them develop their communication skills and inspire their teams.

Sinek's main area of focus is on the concept of "why" – the purpose, cause, or belief that drives an individual or organization. He argues that most people and organizations focus on "what" they do and "how" they do it, but the most successful and inspiring leaders and or

Ultimate Test

In [None]:
ultimate_questions = ["Who is Ajay?", "Which shell does ajay use, and why?", "which is the programming lang used by ajay?"]

for ultimate_question in ultimate_questions:
    response = rag_chain.invoke(ultimate_question)

    print()

    print(ultimate_question)
    print("Model WITH RAG: ", response.content)

    print()

    ans = call_groq(model_name=model_name, user_message=ultimate_question)
    print("Model WITHOUT RAG: ", ans.content)
    print('-' * 40)


Who is Ajay?
Model WITH RAG:  Ajay is an AI Engineer with a BTech degree in Artificial Intelligence and Data Science, currently working at New Street Technologies, Bangalore.

Model WITHOUT RAG:  Ajay is a common name in several cultures, particularly in India and other parts of South Asia. There are many notable individuals with the name Ajay, so without more context, it's challenging to pinpoint a specific person. Here are a few examples:

1. **Ajay Bhardwaj**: Indian film music composer and singer.
2. **Ajay Devgn**: Indian actor, director, and producer who works in Hindi cinema.
3. **Ajay Jadeja**: Former Indian cricketer and current cricket commentator.
4. **Ajay Mago**: Indian businessman and politician.

If you could provide more context or information about the Ajay you're referring to, I might be able to give a more specific answer!
----------------------------------------

Which shell does ajay use, and why?
Model WITH RAG:  Ajay prefers using the fish shell in Linux. He eve

INVESTIGATION (for undrstanding)

In [None]:
rag_chain

{
  context: VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x76be678c4110>, search_kwargs={'k': 4})
           | RunnableLambda(format_docs),
  question: RunnablePassthrough()
}
| ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\nYou are a factual assistant.\nAnswer ONLY using the provided context.\nIf the answer is not in the context, say "I don\'t know".\n\nContext:\n{context}\n\nQuestion:\n{question}\n'), additional_kwargs={})])
| ChatGroq(profile={'max_input_tokens': 131072, 'max_output_tokens': 8192, 'image_inputs': True, 'audio_inputs': False, 'video_inputs': False, 'image_outputs': False, 'audio_outputs': False, 'video_outputs': False, 'reasoning_output': False, 'tool_calling': True}, client=<groq.reso

In [None]:
chunks

[Document(metadata={'source': 'extracted_text/StartWithWhy.txt'}, page_content='--- Page 2 ---\n  \nSTART WITH \nSTART WITH \nSTART WITH \nSTART WITH \nWHY\nWHY\nWHY\nWHY \nHOW GREAT LEADERS INSPIRE \nEVERYONE TO TAKE ACTION \nSIMON SINEK\nSIMON SINEK\nSIMON SINEK\nSIMON SINEK \nPORTFOLIO'),
 Document(metadata={'source': 'extracted_text/StartWithWhy.txt'}, page_content="--- Page 3 ---\n \n \n \n \n \nPORTFOLIO \nPublished by the Penguin Group \nPenguin Group (USA) Inc., 375 Hudson Street, New York, New York 10014, U.S.A. Penguin Group (Canada), 90 \nEglinton Avenue East, Suite 700, Toronto, Ontario, Canada M4P 2Y3 (a division of Pearson Penguin Canada Inc.) \nPenguin Books Ltd, 80 Strand, London WC2R ORL, England \nPenguin Ireland, 25 St. Stephen's Green, Dublin 2, Ireland (a division of Penguin Books Ltd) Penguin Books \nAustralia Ltd, 250 Camberwell Road, Camberwell, Victoria 3124, Australia (a division of Pearson Australia \nGroup Pty Ltd) \nPenguin Books India Pvt Ltd, 11 Community

In [None]:
print("Total chunks: ", len(chunks))
print()
print("First 10 chunk lengths:")
for i in range(10):
    print(f"Chunk {i+1}: {len(chunks[i].page_content)}")


Total chunks:  2214

First 10 chunk lengths:
Chunk 1: 193
Chunk 2: 781
Chunk 3: 787
Chunk 4: 743
Chunk 5: 731
Chunk 6: 418
Chunk 7: 520
Chunk 8: 670
Chunk 9: 276
Chunk 10: 769


In [None]:
print(all_docs[2])
print("\n\n")
print(all_docs[2].metadata)
print("\n\n")
print(all_docs[2].page_content)

page_content='Ajay is an AI Engineer with a BTech degree in Artificial Intelligence and Data Science, currently working at New Street Technologies, Bangalore.

His primary programming language is Python, where he trying to go deep, he says - "I should be able to program efficiently even if I was given with just a python interpreter (means, no external libs installed)"

Ajay prefers using the fish shell in Linux, he even wrote a blog on fish named "Bash to Fish" explaining features of fish shell.
' metadata={'source': 'extracted_text/Ajay.txt'}



{'source': 'extracted_text/Ajay.txt'}



Ajay is an AI Engineer with a BTech degree in Artificial Intelligence and Data Science, currently working at New Street Technologies, Bangalore.

His primary programming language is Python, where he trying to go deep, he says - "I should be able to program efficiently even if I was given with just a python interpreter (means, no external libs installed)"

Ajay prefers using the fish shell in Linux, he e

In [None]:
aj_qn_ret_txts = retriever.invoke(ultimate_questions[2])
aj_qn_ret_txts

[Document(id='339d1c21-6f22-44ce-ae1f-d56eb3879171', metadata={'source': 'extracted_text/Ajay.txt'}, page_content='Ajay is an AI Engineer with a BTech degree in Artificial Intelligence and Data Science, currently working at New Street Technologies, Bangalore.\n\nHis primary programming language is Python, where he trying to go deep, he says - "I should be able to program efficiently even if I was given with just a python interpreter (means, no external libs installed)"\n\nAjay prefers using the fish shell in Linux, he even wrote a blog on fish named "Bash to Fish" explaining features of fish shell.'),
 Document(id='3155fa80-533d-4298-9f66-0f61256d27f0', metadata={'source': 'extracted_text/Sapiens.txt'}, page_content='understood its usefulness, refined it, and spread it through the Middle East and then to\nEurope. When several other signs were later added to the Arab numerals (such as the signs for\naddition, subtraction and multiplication), the basis of modern mathematical notation cam

In [None]:
for qn in aj_qn_ret_txts:
    print(qn.page_content)

Ajay is an AI Engineer with a BTech degree in Artificial Intelligence and Data Science, currently working at New Street Technologies, Bangalore.

His primary programming language is Python, where he trying to go deep, he says - "I should be able to program efficiently even if I was given with just a python interpreter (means, no external libs installed)"

Ajay prefers using the fish shell in Linux, he even wrote a blog on fish named "Bash to Fish" explaining features of fish shell.
understood its usefulness, refined it, and spread it through the Middle East and then to
Europe. When several other signs were later added to the Arab numerals (such as the signs for
addition, subtraction and multiplication), the basis of modern mathematical notation came
into being.
Although this system of writing remains a partial script, it has become the world’s
dominant language. Almost all states, companies, organisations and institutions – whether
they speak Arabic, Hindi, English or Norwegian – use m

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

print(f"question: {ultimate_questions[2]}")

retrieved_docs = retriever.invoke(ultimate_questions[2])

query_embedding = embeddings.embed_query(ultimate_questions[2])
doc_embeddings = embeddings.embed_documents(
    [doc.page_content for doc in retrieved_docs]
)

sims = cosine_similarity(
    [query_embedding],
    doc_embeddings
)[0]

for i, (doc, sim) in enumerate(zip(retrieved_docs, sims), 1):
    print(f"\n\n--- Chunk {i} ---")
    print()
    print(f"Cosine similarity: {sim:.4f}")
    print()
    print(doc.page_content[:200], "...terminated")

question: which is the programming lang used by ajay?




--- Chunk 1 ---

Cosine similarity: 0.6142

Ajay is an AI Engineer with a BTech degree in Artificial Intelligence and Data Science, currently working at New Street Technologies, Bangalore.

His primary programming language is Python, where he t ...terminated


--- Chunk 2 ---

Cosine similarity: 0.3659

understood its usefulness, refined it, and spread it through the Middle East and then to
Europe. When several other signs were later added to the Arab numerals (such as the signs for
addition, subtrac ...terminated


--- Chunk 3 ---

Cosine similarity: 0.3464

--- Page 97 ---
* Even after Akkadian became the spoken language, Sumerian remained the language of administration and thus the
language recorded with writing. Aspiring scribes thus had to speak Sumer ...terminated


--- Chunk 4 ---

Cosine similarity: 0.3385

Indian language that became the sacred tongue of Hindu ritual, and the Greek and Latin
languages, as well as similarities between all these languages and Gothic, Celtic, O

In [None]:
from sentence_transformers import SentenceTransformer 

model = SentenceTransformer("all-MiniLM-L6-v2") 

sentences = ["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."] 

embeddings = model.encode(sentences) 

for sen, emb in zip(sentences, embeddings):
    print(f"{sen} => {emb[:20]}")
    print()

The weather is lovely today. => [ 0.01919573  0.12008536  0.15959828  0.0670659   0.0500748  -0.02591872
  0.0564682  -0.09285779 -0.03761145  0.00632382 -0.04288772  0.00402827
  0.00472777  0.03246762  0.04951977  0.05298184 -0.04044547 -0.02148373
 -0.03027608  0.02208583]

It's so sunny outside! => [-0.01869039  0.04151868  0.07431544  0.07843276  0.0755697  -0.01250757
  0.08835688 -0.06978878 -0.00632884  0.01998481  0.00445825 -0.02595664
  0.04095364  0.04328066  0.05310476  0.05139592  0.01447441 -0.02598937
  0.00961291 -0.00699315]

He drove to the stadium. => [ 0.136502    0.08227322 -0.02526165  0.03045136  0.0533121   0.0508092
  0.08616423  0.09999194  0.01759902  0.04281873 -0.02428658 -0.04200496
 -0.01194716  0.03269915  0.03689959  0.01280997 -0.02256223  0.02595218
 -0.04172948 -0.05341916]



Streamlit ChatUI