In [None]:
from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
DRIVE_BASE = Path('/content/drive/MyDrive/drhp_rag')
print("Drive base:", DRIVE_BASE)


def drive_path(*parts):
    """Return the path formed by appending each of ``parts``, in order, to DRIVE_BASE.

    Called with no arguments, this returns DRIVE_BASE itself.
    """
    target = DRIVE_BASE
    for part in parts:
        target = target / part
    return target


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive base: /content/drive/MyDrive/drhp_rag


In [None]:
!pip install -q \
  langchain langchain-community langchain-core \
  faiss-cpu sentence-transformers \
  rank_bm25 gradio PyMuPDF pypdf \
  tiktoken langchainhub chromadb \
  pandas numpy scikit-learn \
  openai duckduckgo-search tqdm langchain-groq


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/135.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer,CrossEncoder
from langchain_groq import ChatGroq
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from groq import Groq
import gradio as gr



In [None]:
#Read the files
def process_all_pdfs(pdf_directory):
    """
    Recursively load every ``*.pdf`` under ``pdf_directory`` using PyMuPDFLoader.

    Each loaded Document gets its ``source`` metadata set to the PDF's bare
    file name (e.g. ``report.pdf``) for traceability. Files are processed in
    sorted path order so the returned list has the same order on every run.
    A file that fails to load is reported on stdout and skipped.

    Args:
        pdf_directory: Directory (str or path-like) to search for PDFs.

    Returns:
        list of langchain Document objects, one per item yielded by the
        loader; empty if no PDF was found or none could be loaded.
    """
    pdf_dir = Path(pdf_directory)
    # glob() yields entries in filesystem-dependent order; sort so that the
    # document order (and everything indexed from it) is reproducible.
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files in {pdf_directory}")

    all_documents = []
    for pdf_file in pdf_files:
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            docs = loader.load()
            # Tag every loaded document with the file name it came from.
            source_name = pdf_file.name
            for d in docs:
                if hasattr(d, "metadata") and isinstance(d.metadata, dict):
                    d.metadata["source"] = source_name
                else:
                    # No usable metadata dict on this document: create one.
                    d.metadata = {"source": source_name}
            all_documents.extend(docs)
            print(f"Loaded {len(docs)} pages/chunks from {pdf_file.name}")
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"Error loading {pdf_file}: {e}")

    print(f"Total documents loaded: {len(all_documents)}")
    return all_documents


# Reuse the Drive folder configured in DRIVE_BASE instead of repeating the
# absolute path, so the location only has to be changed in one place.
pdf_directory = str(DRIVE_BASE)
all_pdf_documents = process_all_pdfs(pdf_directory)


Found 1 PDF files in /content/drive/MyDrive/drhp_rag
Loaded 761 pages/chunks from Lenskart Solutions Limited-DRHP-1753782641.pdf
Total documents loaded: 761


In [None]:
# Split documents
def split_documents(documents, chunk_size=1200, chunk_overlap=150):
    """
    Break the given Documents into smaller chunk Documents.

    A RecursiveCharacterTextSplitter is configured with ``chunk_size`` and
    ``chunk_overlap`` (both measured with ``len``) and tries the separators
    paragraph break, newline, period and space, in that order.

    Returns: the list produced by the splitter's ``split_documents``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", ".", " "],
    }
    text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} source docs into {len(pieces)} chunks.")
    return pieces

chunks = split_documents(all_pdf_documents)


Split 761 source docs into 3228 chunks.


In [None]:
# Dense index: embed every chunk and store the vectors in a FAISS vector store.

# Name of the sentence-transformers model used to compute the embeddings.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Build the FAISS store directly from the chunk Documents.
faiss_store = FAISS.from_documents(chunks, embedding_model)

print("FAISS vectorstore created with embeddings.")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS vectorstore created with embeddings.


In [None]:
# Hybrid retrieval: a BM25 retriever and the FAISS retriever combined in an
# EnsembleRetriever.

# Keyword side, built over the same chunks; its result count is left at the
# BM25Retriever default because no k is passed here.
bm25_retriever = BM25Retriever.from_documents(chunks)

# Dense side: ask FAISS for 10 results per query.
faiss_search_kwargs = {"k": 10}
faiss_retriever = faiss_store.as_retriever(search_kwargs=faiss_search_kwargs)

# Weights are listed in the same order as the retrievers (BM25, then FAISS);
# tweak them based on experiments.
hybrid_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.4, 0.6],
)

print("Hybrid retriever (BM25 + FAISS) ready.")


Hybrid retriever (BM25 + FAISS) ready.


In [None]:
# Reranker setup

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank_docs(query, docs, top_k=3, batch_size=16):
    """
    Score each Document against ``query`` with the module-level ``reranker``
    and return the ``top_k`` best-scoring Documents, highest score first.

    An empty ``docs`` yields an empty list without calling the reranker.
    Documents with equal scores keep their original relative order.
    """
    if len(docs) == 0:
        return []

    # One (query, text) pair per candidate; predict() returns one score each.
    query_text_pairs = []
    for candidate in docs:
        query_text_pairs.append((query, candidate.page_content))
    relevance = reranker.predict(query_text_pairs, batch_size=batch_size)

    # Order candidate positions by descending score (sorted() is stable).
    order = sorted(range(len(docs)), key=lambda idx: relevance[idx], reverse=True)
    return [docs[idx] for idx in order[:top_k]]

print("Reranker initialized.")


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Reranker initialized.


In [None]:
# Combined retrieval function
def get_relevant_docs(query, top_k_retrieve=10, top_k_rerank=3):
    """
    Retrieve candidate chunks for ``query`` and return the best ones after reranking.

    1) Query the hybrid retriever and keep at most ``top_k_retrieve`` of the
       documents it returns (in the order the retriever returned them).
    2) Rerank those candidates with ``rerank_docs`` and return the top
       ``top_k_rerank`` documents.

    How many documents the retriever itself produces is fixed where
    ``hybrid_retriever`` is built; ``top_k_retrieve`` only caps that list.
    Passing ``top_k_retrieve=None`` disables the cap.
    """
    # Step 1: hybrid retrieval. invoke() is used because
    # get_relevant_documents() is deprecated and emits a warning.
    candidates = hybrid_retriever.invoke(query)
    # Apply the retrieval budget, which was previously accepted but ignored.
    candidates = candidates[:top_k_retrieve]
    # Step 2: rerank and keep the best top_k_rerank.
    return rerank_docs(query, candidates, top_k=top_k_rerank)

# quick sanity test (uncomment to run)
#sample = "What is the shareholding pattern"
#print([d.metadata.get("source", "unknown") for d in get_relevant_docs(sample)])


  retrieved_docs = hybrid_retriever.get_relevant_documents(query)


['Lenskart Solutions Limited-DRHP-1753782641.pdf', 'Lenskart Solutions Limited-DRHP-1753782641.pdf', 'Lenskart Solutions Limited-DRHP-1753782641.pdf']


In [None]:
from langchain_groq import ChatGroq
import os
from getpass import getpass
from dotenv import load_dotenv

# Load variables from a local .env file, if one exists, into the environment.
load_dotenv()

### Initialize the Groq LLM (set your GROQ_API_KEY in environment)
# Never hardcode the key in the notebook: read it from the environment and,
# if it is not set, prompt for it without echoing the input.
groq_api_key = os.environ.get("GROQ_API_KEY") or getpass("Enter your GROQ_API_KEY: ")
if not groq_api_key:
    raise RuntimeError("GROQ_API_KEY is not set; cannot initialize the Groq LLM.")

llm=ChatGroq(groq_api_key=groq_api_key,model_name="llama-3.3-70b-versatile",temperature=0.1,max_tokens=1024)

print("Groq LLM initialized successfully.")

# Test it: one round trip confirms the key and the model name are accepted.
response = llm.invoke("Say hello!")
print(response.content)

Groq LLM initialized successfully.
Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?


In [None]:
# Query function (Groq + hybrid + rerank)
def query_prospecta(query, top_k_retrieve=10, top_k_rerank=3):
    """
    Full Groq-based RAG pipeline:
    1) Retrieve relevant docs via hybrid retriever
    2) Rerank with CrossEncoder
    3) Build contextual prompt
    4) Query Groq LLM and return concise answer + sources

    Args:
        query: The user's question. A blank or None query is answered
            immediately without running retrieval or calling the LLM.
        top_k_retrieve: Passed through to ``get_relevant_docs``.
        top_k_rerank: Passed through to ``get_relevant_docs``.

    Returns:
        dict with keys ``"answer"`` (str) and ``"sources"`` (list of dicts
        with keys ``"rank"``, ``"source"`` and ``"snippet"``; empty when
        nothing was retrieved or the query was blank).
    """
    # Guard against empty input (e.g. an empty UI textbox): there is nothing
    # to retrieve for, so skip the retriever and the LLM entirely.
    if not (query or "").strip():
        return {"answer": "Please enter a question.", "sources": []}

    docs = get_relevant_docs(query, top_k_retrieve, top_k_rerank)
    if not docs:
        return {"answer": "Sorry, no relevant information found.", "sources": []}

    # Build context + provenance
    context_parts, sources = [], []
    for i, d in enumerate(docs):
        src = d.metadata.get("source", "unknown_source")
        # Flatten newlines and cap each snippet at 800 characters.
        snippet = d.page_content.strip().replace("\n", " ")[:800]
        context_parts.append(f"Source {i+1} ({src}): {snippet}")
        sources.append({"rank": i+1, "source": src, "snippet": snippet})

    context = "\n\n".join(context_parts)

    system_prompt = (
       """ You are Prospecta, an intelligent assistant that helps users understand and analyze information from Draft Red Herring Prospectuses (DRHPs) and related financial documents.
Provide concise, accurate, and context-based answers using only the given context.
If the information is not available, clearly say 'The provided documents do not contain that information.'
Always reference your sources as (Source 1, Source 2, ...)."""
    )

    user_prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

    response = llm.invoke([("system", system_prompt), ("user", user_prompt)])
    # Fall back to str() if the response object has no .content attribute.
    answer_text = response.content.strip() if hasattr(response, "content") else str(response)

    return {"answer": answer_text, "sources": sources}


In [None]:
# Smoke test: run a few representative questions through the full pipeline and
# print each answer together with a short preview of every source.
sample_queries = [
    "Where is the registered office",
    "give the type of offer"
]

separator = "=" * 80
for question in sample_queries:
    print("\n" + separator)
    print(f"Query: {question}")
    result = query_prospecta(question)
    print("\nAnswer:\n", result["answer"])
    print("\nSources (top):")
    for entry in result["sources"]:
        # Only the first 200 characters of each snippet are shown.
        rank, origin, preview = entry["rank"], entry["source"], entry["snippet"][:200]
        print(f" - Rank {rank}: {origin} | Snippet: {preview}...")



Query: Where is the registered office

Answer:
 The registered office locations are as follows:

1. Baofeng Framekart Technology Limited: No. 1, Xingbao Road, Industrial Cluster Area, Bao Feng County, Pingdingshan City, China (Source 1)
2. Le Petit Lunetier Paris SAS: 155 rue de Charonne, 75011 Paris (Source 1)
3. NESO Brands: 30 Cecil Street, #19-08, Prudential Tower - 049712, Singapore (Source 2)
4. Lenskart Solutions Limited: Plot No. 151, Okhla Industrial Estate, Phase III, New Delhi – 110 020, Delhi, India (Source 3)

Sources (top):
 - Rank 1: Lenskart Solutions Limited-DRHP-1753782641.pdf | Snippet: 639  3.  Baofeng Framekart Technology Limited  Registered Office  The registered office of Baofeng is situated at No. 1, Xingbao Road, Industrial Cluster Area, Bao Feng County,  Pingdingshan City, Chi...
 - Rank 2: Lenskart Solutions Limited-DRHP-1753782641.pdf | Snippet: Singapore. Its unique entity number is 202139502H. Its registered office is situated at 30 Cecil Street, #19-08, 

In [None]:
import gradio as gr

def gradio_interface(query):
    """Wrapper for Gradio: call query_prospecta and format the result as Markdown.

    Args:
        query: The question text entered in the UI.

    Returns:
        A Markdown string containing the answer followed by a bulleted list
        of sources (file name plus the first 150 characters of each snippet),
        or "No sources found." when there are none.
    """
    result = query_prospecta(query)
    answer = result.get("answer", "No answer found.")
    sources = result.get("sources", [])

    # Format sources as a real Markdown list. gr.Markdown ignores single
    # newlines by default, so "•"-prefixed lines joined with "\n" would be
    # rendered run together as one paragraph.
    if sources:
        source_text = "\n".join(
            f"- {s['source']} – {s['snippet'][:150]}..." for s in sources
        )
    else:
        source_text = "No sources found."

    # Blank line before the list so it starts its own Markdown block.
    return f"**Prospecta:** {answer}\n\n📂 **Sources:**\n\n{source_text}"

# Create the Gradio UI: a single textbox in, a Markdown panel out, backed by
# gradio_interface.
iface = gr.Interface(
    fn=gradio_interface,
    # Label fixed to spell the acronym "DRHP" as it is spelled everywhere else.
    inputs=gr.Textbox(label="Ask your DRHP related question", placeholder="e.g. What is DRHP?"),
    outputs=gr.Markdown(label="Response"),
    title="Prospecta",
    description="Ask any question about IPO documents, filings, or DRHP details.",
    theme="soft",
    # Clickable example questions shown under the input box.
    examples=[
        ["What is DRHP?"],
        ["What is the type of issue?"],
        ["Where is the registered office?"]
    ],
)

# Launch the app
# NOTE(review): server_name="0.0.0.0" listens on all network interfaces;
# confirm this is intended outside of a hosted notebook.
iface.launch(server_name="0.0.0.0")


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://434dee6d7801d01936.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


