<a href="https://colab.research.google.com/github/aasthaBaid/hackRx/blob/main/hackrx_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install langchain langchain-google-genai python-dotenv pypdf python-docx faiss-cpu requests tiktoken




In [2]:
!pip install -U langchain-community


Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.1.0-py3-n

In [3]:
# --- Imports and API Key Setup ---
import requests
import json
import os
from io import BytesIO

# LangChain components
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import LLMChain

from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate

# Import Gemini classes
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

# Securely get the API key from Colab secrets
from google.colab import userdata
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

# Configure the genai library
import google.generativeai as genai
genai.configure(api_key=GOOGLE_API_KEY)

In [4]:
def create_vector_store(documents: list[Document]):
    """
    Creates a FAISS vector store from document chunks using Gemini embeddings.
    """
    if not documents:
        print("No documents to process.")
        return None

    try:
        # Use Google's embedding model
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
        vector_store = FAISS.from_documents(documents, embeddings)
        print("Vector store created successfully using Gemini embeddings.")
        return vector_store
    except Exception as e:
        print(f"An error occurred while creating the vector store: {e}")
        return None

In [5]:
def query_system(query: str, vector_store):
    """
    Queries the system using Gemini Pro to get a structured JSON response.
    """
    if not vector_store:
        return {"error": "Vector store is not available."}

    # 4. Clause Matching (Semantic Similarity Search)
    print(f"Searching for clauses relevant to: '{query}'")
    relevant_docs = vector_store.similarity_search(query, k=4)
    context_clauses = "\n\n".join([doc.page_content for doc in relevant_docs])

    # 5. Logic Evaluation & 6. JSON Output
    prompt_template = """
    You are an expert AI assistant for analyzing legal and insurance documents.
    Your task is to answer the user's query based ONLY on the provided context clauses from a document.
    Do not use any external knowledge. If the context doesn't contain the answer, say so.

    Respond in a valid JSON format with the following structure:
    {{
      "decision": "Yes", "No", or "Partial/Conditional",
      "explanation": "A clear, concise explanation for your decision, citing the reasoning from the context.",
      "confidence_score": A value between 0.0 and 1.0 indicating your confidence in the answer.,
      "relevant_clauses": [
        "A list of the exact text snippets from the context that support your decision."
      ]
    }}

    CONTEXT CLAUSES:
    {context}

    USER QUERY:
    {query}

    JSON RESPONSE:
    """
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "query"])

    # Initialize the Gemini LLM
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        google_api_key=GOOGLE_API_KEY,
        temperature=0.0, # Low temperature for factual output
        convert_system_message_to_human=True # Important for some LangChain chains
    )

    chain = LLMChain(llm=llm, prompt=prompt)
    print("Sending query and context to Gemini for evaluation...")
    response_text = chain.run(context=context_clauses, query=query)

    # Clean up the response text
    if "```json" in response_text:
        response_text = response_text.split("```json")[1].split("```")[0]

    try:
        return json.loads(response_text)
    except json.JSONDecodeError:
        return {"error": "Failed to parse Gemini response into JSON.", "raw_response": response_text}

In [7]:
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
def process_document(path: str) -> list[Document]:
    """
    Downloads, loads, and splits a document (PDF or DOCX) from a URL or local path.
    """
    try:
        if os.path.exists(path):
            print(f"Loading document from local path: {path}")
            file_extension = os.path.splitext(path)[1].lower()
            if file_extension == ".pdf":
                loader = PyPDFLoader(path)
            elif file_extension == ".docx":
                loader = Docx2txtLoader(path)
            else:
                print(f"Unsupported file type: {file_extension}")
                return None
        else:
            print(f"Downloading document from: {path}")
            response = requests.get(path)
            response.raise_for_status()  # Raise an exception for bad status codes
            file_content = BytesIO(response.content)
            file_extension = os.path.splitext(path)[1].lower()

            print(f"Processing a {file_extension.upper()} file...")
            if file_extension == ".pdf":
                # We need to save the PDF to a temporary file to load it
                with open("temp.pdf", "wb") as f:
                    f.write(file_content.getbuffer())
                loader = PyPDFLoader("temp.pdf")
            elif file_extension == ".docx":
                # Docx2txtLoader can handle the BytesIO object directly
                loader = Docx2txtLoader(file_content)
            else:
                print(f"Unsupported file type: {file_extension}")
                return None

        documents = loader.load()

        # 3. Chunking
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        doc_chunks = text_splitter.split_documents(documents)
        print(f"Document split into {len(doc_chunks)} chunks.")
        return doc_chunks

    except requests.exceptions.RequestException as e:
        print(f"Error downloading the document: {e}")
        return None
    except Exception as e:
        print(f"An error occurred during document processing: {e}")
        return None

In [8]:
# --- MAIN EXECUTION CELL ---

# Define the path to the document you want to query.
# This can be a local file path or a URL.
document_path = "sample_policy.pdf"

# Define the query you want to ask.
user_query = "Are there any sub-limits on room rent and ICU charges for Plan A?"

# --- Start the Workflow ---
print("🚀 Starting the intelligent query process...")

# 1. INGESTION PHASE: Process the document from the path.
doc_chunks = process_document(document_path)

# Proceed only if document processing was successful.
if doc_chunks:
    # 2. EMBEDDING PHASE: Create the vector store from the document chunks.
    # This step uses the Gemini embedding model.
    vector_store = create_vector_store(doc_chunks)

    if vector_store:
        # 3. QUERYING PHASE: Query the system with your question.
        # This step uses the Gemini Pro model for reasoning.
        final_response = query_system(user_query, vector_store)

        # Print the final structured JSON response.
        print("\n--- ✅ Final Structured Response ---")
        print(json.dumps(final_response, indent=2))
    else:
        print("Could not create the vector store. Halting process.")
else:
    print("Could not process the document. Halting process.")

🚀 Starting the intelligent query process...
Loading document from local path: sample_policy.pdf
Document split into 100 chunks.
Vector store created successfully using Gemini embeddings.
Searching for clauses relevant to: 'Are there any sub-limits on room rent and ICU charges for Plan A?'


  chain = LLMChain(llm=llm, prompt=prompt)
  response_text = chain.run(context=context_clauses, query=query)


Sending query and context to Gemini for evaluation...

--- ✅ Final Structured Response ---
{
  "decision": "Yes",
  "explanation": "Yes, there are sub-limits on room rent and ICU charges.  Room rent, boarding, and nursing expenses are limited to 2% of the sum insured, up to a maximum of Rs. 5,000 per day.  ICU charges are limited to 5% of the sum insured, up to a maximum of Rs. 10,000 per day.  These limits are explicitly stated in the provided text.",
  "confidence_score": 1.0,
  "relevant_clauses": [
    "Sublimit for\nroom/doctors fee\n1. Room Rent, Boarding, Nursing Expenses all inclusive as provided by the Hospital/ Nursing Home\nup to 2% of the Sum Insured subject to maximum of Rs. 5,000/- per day\n2. Intensive Care Unit (ICU) charges/ Intensive Cardiac Care Unit (ICCU) charges all-inclusive as\nprovided by the Hospital/ Nursing Home up to 5% of the Sum Insured subject to maximum of Rs.\n10,000/- per day"
  ]
}


In [12]:
questions = [
    "What is the grace period for premium payment under the National Parivar Mediclaim Plus Policy?",
    "What is the waiting period for pre-existing diseases (PED) to be covered?",
    "Does this policy cover maternity expenses, and what are the conditions?",
    "What is the waiting period for cataract surgery?",
    "Are the medical expenses for an organ donor covered under this policy?",
    "What is the No Claim Discount (NCD) offered in this policy?",
    "Is there a benefit for preventive health check-ups?",
    "How does the policy define a 'Hospital'?",
    "What is the extent of coverage for AYUSH treatments?",
    "Are there any sub-limits on room rent and ICU charges for Plan A?"
]

results = {}

for question in questions:
    #print(f"\n--- 🤔 Querying for: {question} ---")
    response = query_system(question, vector_store)
    results[question] = response
    print(json.dumps(response, indent=2))

print("\n\n--- 🎉 All questions processed! ---")

Searching for clauses relevant to: 'What is the grace period for premium payment under the National Parivar Mediclaim Plus Policy?'
Sending query and context to Gemini for evaluation...




{
  "decision": "Yes",
  "explanation": "The grace period for premium payment is thirty days, applicable when premiums are paid in quarterly, half-yearly, or annual installments.  This is explicitly stated in multiple sections of the provided text (clauses).",
  "confidence_score": 1.0,
  "relevant_clauses": [
    "Grace Period The grace period of thirty days (where premium is paid in quarterly/half yearly/annual instalments) is available on the premium due date, to pay the premium.",
    "3.22. Grace Period  means the specified period of time, immediately following the premium due date during which premium payment can be made to renew or continue a policy in force without loss of continuity benefits pertaining to Waiting Periods and coverage of Pre-Existing Diseases. The Grace Period for payment of the premium shall be thirty days.",
    "In case of Premium payment in instalments, if the due instalment premium is paid within Grace Period during the Policy Period, coverage shall be ava



{
  "decision": "Yes",
  "explanation": "The waiting period for pre-existing diseases (PED) is 36 months of continuous coverage after the date of inception of the first policy.  This waiting period can be reduced if the insured person has continuous coverage without a break, as defined by IRDAI regulations, to the extent of prior coverage.",
  "confidence_score": 1.0,
  "relevant_clauses": [
    "6.1. Pre-Existing Diseases (Excl 01)\na) Expenses related to the treatment of a Pre -Existing Disease (PED) and its direct complications shall be excluded until the expiry of 36 (thirty six) months of continuous coverage after the date of inception of the first policy with us.",
    "c) If the Insured Person is continuously covered without any break as defined under the portability norms of the extant IRDAI (Health Insurance) Regulations then waiting period for the same would be reduced to the extent of prior coverage."
  ]
}
Searching for clauses relevant to: 'Does this policy cover maternity



{
  "decision": "Yes",
  "explanation": "Yes, the policy covers maternity expenses.  Clause 7.15 details that it covers medical treatment expenses related to childbirth (including complicated deliveries and C-sections during hospitalization), excluding ectopic pregnancies. It also covers expenses for miscarriage (unless due to an accident) and lawful medical termination of pregnancy during the policy period.",
  "confidence_score": 1.0,
  "relevant_clauses": [
    "7.15. Maternity Expenses (Code \u2013 Excl 18)\ni. Medical treatment expenses traceable to childbirth (including complicated deliveries and caesarean sections incurred\nduring hospitalization) except ectopic pregnancy;\nii. Expenses towards miscarriage (unless due to an accident) and lawful medical termination of pregnancy during the policy\nperiod."
  ]
}
Searching for clauses relevant to: 'What is the waiting period for cataract surgery?'
Sending query and context to Gemini for evaluation...




{
  "decision": "Partial/Conditional",
  "explanation": "The provided text states that cataract surgery has a 24-month waiting period if it's considered a pre-existing condition (clause f, i, 9). However,  clause d mentions that after 36 months, coverage for pre-existing diseases is subject to declaration at application and acceptance by the insurer.  Therefore, the waiting period is dependent on whether the cataract is a pre-existing condition and when it was declared.",
  "confidence_score": 0.8,
  "relevant_clauses": [
    "f) List of specific diseases/procedures \ni. 24 Months waiting period \n9. Cataract and age related eye ailments",
    "d) Coverage under the policy after the expiry of 36 (thirty six) months for any pre-existing disease is subject to the same being declared at the time of application and accepted by us."
  ]
}
Searching for clauses relevant to: 'Are the medical expenses for an organ donor covered under this policy?'
Sending query and context to Gemini for evalua



{
  "decision": "No",
  "explanation": "The provided text focuses on coverage for hospitalization, pre- and post-hospitalization expenses, specific treatments like cataract surgery and AYUSH treatments, and modern treatments.  There is no mention of coverage for organ donor medical expenses.",
  "confidence_score": 1.0,
  "relevant_clauses": []
}
Searching for clauses relevant to: 'What is the No Claim Discount (NCD) offered in this policy?'
Sending query and context to Gemini for evaluation...




{
  "decision": "No",
  "explanation": "The provided text does not mention a No Claim Discount (NCD).  While it discusses cumulative bonus increases for claim-free years and reductions for claims, it does not refer to this as an NCD.",
  "confidence_score": 1.0,
  "relevant_clauses": []
}
Searching for clauses relevant to: 'Is there a benefit for preventive health check-ups?'
Sending query and context to Gemini for evaluation...




{
  "decision": "No",
  "explanation": "The provided text does not mention any benefits for preventive health check-ups.  The clauses describe exclusions, waiting periods for specific procedures, policy cancellation, withdrawal, revision of terms, and claim settlement procedures, but there is no information regarding coverage or benefits for preventive care.",
  "confidence_score": 1.0,
  "relevant_clauses": []
}
Searching for clauses relevant to: 'How does the policy define a 'Hospital'?'
Sending query and context to Gemini for evaluation...




{
  "decision": "Yes",
  "explanation": "The policy defines a 'Hospital' as \"any institution established for in-patient care and day care treatment of disease/ injuries and which has been registered as a hospital with the local authorities under the Clinical Establishments (Registration and Regulation) Act, 2010\".",
  "confidence_score": 1.0,
  "relevant_clauses": [
    "3.23. Hospital means any institution established for in-patient care and day care treatment of disease/ injuries and which has been registered as a hospital with the local authorities under the Clinical Establishments (Registration and Regulation) Act, 2010"
  ]
}
Searching for clauses relevant to: 'What is the extent of coverage for AYUSH treatments?'
Sending query and context to Gemini for evaluation...




{
  "decision": "Yes",
  "explanation": "The policy covers medical expenses incurred for inpatient AYUSH treatment up to the sum insured specified in the policy schedule.  This applies to Ayurveda, Yoga and Naturopathy, Unani, Sidha and Homeopathy systems of medicine, during each policy period, in any AYUSH hospital.",
  "confidence_score": 1.0,
  "relevant_clauses": [
    "4.2. AYUSH Treatment\nThe Company shall indemnify Medical Expenses incurred for Inpatient Care treatment under Ayurveda, Yoga and Naturopathy, Unani, Sidha and Homeopathy systems of medicines  during each Policy Period up to the limit of sum insured as specified in the policy schedule in any AYUSH Hospital."
  ]
}
Searching for clauses relevant to: 'Are there any sub-limits on room rent and ICU charges for Plan A?'
Sending query and context to Gemini for evaluation...




{
  "decision": "Yes",
  "explanation": "Yes, there are sub-limits on room rent and ICU charges.  Room rent, boarding, and nursing expenses are limited to 2% of the sum insured, up to a maximum of Rs. 5,000 per day.  ICU charges are limited to 5% of the sum insured, up to a maximum of Rs. 10,000 per day.",
  "confidence_score": 1.0,
  "relevant_clauses": [
    "Sublimit for\nroom/doctors fee\n1. Room Rent, Boarding, Nursing Expenses all inclusive as provided by the Hospital/ Nursing Home\nup to 2% of the Sum Insured subject to maximum of Rs. 5,000/- per day\n2. Intensive Care Unit (ICU) charges/ Intensive Cardiac Care Unit (ICCU) charges all-inclusive as\nprovided by the Hospital/ Nursing Home up to 5% of the Sum Insured subject to maximum of Rs.\n10,000/- per day"
  ]
}


--- 🎉 All questions processed! ---


In [13]:
for question, answer in results.items():
    print(f"--- Question: {question} ---")
    print(json.dumps(answer, indent=2))
    print("-" * 30)

--- Question: What is the grace period for premium payment under the National Parivar Mediclaim Plus Policy? ---
{
  "decision": "Yes",
  "explanation": "The grace period for premium payment is thirty days, applicable when premiums are paid in quarterly, half-yearly, or annual installments.  This is explicitly stated in multiple sections of the provided text (clauses).",
  "confidence_score": 1.0,
  "relevant_clauses": [
    "Grace Period The grace period of thirty days (where premium is paid in quarterly/half yearly/annual instalments) is available on the premium due date, to pay the premium.",
    "3.22. Grace Period  means the specified period of time, immediately following the premium due date during which premium payment can be made to renew or continue a policy in force without loss of continuity benefits pertaining to Waiting Periods and coverage of Pre-Existing Diseases. The Grace Period for payment of the premium shall be thirty days.",
    "In case of Premium payment in insta

In [15]:
import json

# Example input JSON structure
input_json = {
    "questions": [
        "What is the grace period for premium payment under the National Parivar Mediclaim Plus Policy?",
        "What is the waiting period for pre-existing diseases (PED) to be covered?",
        "Does this policy cover maternity expenses, and what are the conditions?",
        "What is the waiting period for cataract surgery?",
        "Are the medical expenses for an organ donor covered under this policy?",
        "What is the No Claim Discount (NCD) offered in this policy?",
        "Is there a benefit for preventive health check-ups?",
        "How does the policy define a 'Hospital'?",
        "What is the extent of coverage for AYUSH treatments?",
        "Are there any sub-limits on room rent and ICU charges for Plan A?"
    ]
}

# Process the questions from the input JSON
final_answers = {
    "answers": []
}

for question in input_json["questions"]:
    final_answers["answers"].append(results[question]["explanation"])

# Output the answers in JSON format
print(json.dumps(final_answers, indent=2))

{
  "answers": [
    "The grace period for premium payment is thirty days, applicable when premiums are paid in quarterly, half-yearly, or annual installments.  This is explicitly stated in multiple sections of the provided text (clauses).",
    "The waiting period for pre-existing diseases (PED) is 36 months of continuous coverage after the date of inception of the first policy.  This waiting period can be reduced if the insured person has continuous coverage without a break, as defined by IRDAI regulations, to the extent of prior coverage.",
    "Yes, the policy covers maternity expenses.  Clause 7.15 details that it covers medical treatment expenses related to childbirth (including complicated deliveries and C-sections during hospitalization), excluding ectopic pregnancies. It also covers expenses for miscarriage (unless due to an accident) and lawful medical termination of pregnancy during the policy period.",
    "The provided text states that cataract surgery has a 24-month waiti