In [None]:
# Banner marking the start of the notebook run.
print("---starting with the hackathon---")

In [None]:
from google.colab import drive
import os

# Mount Google Drive so the image stored under MyDrive is reachable from the Colab VM.
drive.mount('/content/drive')
from PIL import Image
from IPython.display import display

# Absolute Drive path of the image to preview.
img_path = '/content/drive/MyDrive/Colab Notebooks/BajajFinserv_A/IMG_6107.PNG'
img = Image.open(img_path)
# Resize to a fixed 500x500 box for display; the original aspect ratio is not preserved.
img_resized = img.resize((500, 500))

# Render the resized image inline in the notebook output.
display(img_resized)


# **File Cleaner v1.0**

In [None]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Files to delete: the aggregated chunk file written by the parser cell and the
# FAISS index + metadata written by the vectorizer cell.
files_to_delete = [
    "/content/drive/MyDrive/Colab Notebooks/BajajFinserv_A/Parsed_text/all_policy_chunks.jsonl",
    "/content/drive/MyDrive/Colab Notebooks/BajajFinserv_A/Faiss_Index/policy_chunks_faiss_index.bin",
    "/content/drive/MyDrive/Colab Notebooks/BajajFinserv_A/Faiss_Index/policy_chunks_metadata.json",
]

# Folder to clear (only regular files directly inside it are removed; the folder
# itself and any sub-folders are left in place).
enpdf_folder = "/content/drive/MyDrive/Colab Notebooks/BajajFinserv_A/EnPDF"

# Ask user for confirmation; anything other than "yes" (case-insensitive,
# surrounding whitespace ignored) cancels the deletion.
confirm = input("Type 'YES' to delete the files and clear EnPDF folder: ")

if confirm.strip().upper() == 'YES':
    # Delete individual files
    for file_path in files_to_delete:
        if os.path.isfile(file_path):
            try:
                os.remove(file_path)
                print(f"Deleted: {file_path}")
            except Exception as e:
                # Best effort: report the failure and continue with the next file.
                print(f"Error deleting {file_path}: {e}")
        else:
            print(f"Not found: {file_path}")

    # Delete files from EnPDF folder
    if os.path.exists(enpdf_folder):
        for fname in os.listdir(enpdf_folder):
            fpath = os.path.join(enpdf_folder, fname)
            # Skip sub-directories; only plain files are deleted.
            if os.path.isfile(fpath):
                try:
                    os.remove(fpath)
                    print(f"Deleted from EnPDF: {fpath}")
                except Exception as e:
                    print(f"Failed to delete {fpath}: {e}")
    else:
        print(f"EnPDF folder not found: {enpdf_folder}")
else:
    print("Deletion cancelled.")


# **API handling**

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install python-dotenv
import os
from dotenv import load_dotenv
# NOTE(review): this path points at "BajajFinserv", while every other path in this
# notebook uses "BajajFinserv_A" — confirm which folder actually holds the .env file.
env_path = r"/content/drive/MyDrive/Colab Notebooks/BajajFinserv/.env"

# Load the KEY=VALUE pairs from the .env file into the process environment.
load_dotenv(dotenv_path=env_path)

# NOTE(review): the query-handler cell reads "TOGETHER_API_KEY", not "API_KEY" —
# confirm this variable is still the one that is needed.
api_key = os.getenv("API_KEY")
print("API Key Loaded:", api_key is not None)  # Avoid printing actual key!


# **Parser**

# **Parser v1.0**


In [None]:
!pip -q install pypdf
import os
import pypdf
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List, Dict, Any
import json # For saving chunks as JSON Lines
from google.colab import drive, files # Import files for user uploads
import shutil # For moving files

# --- Function to parse and chunk a PDF ---
def parse_and_chunk_pdf(file_path: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[Dict[str, Any]]:
    """Extract the text of a PDF page by page and split it into overlapping chunks.

    Args:
        file_path: Path of the PDF file to read.
        chunk_size: Maximum chunk length in characters (measured with ``len``).
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        A list of ``{"content": str, "metadata": {"page_number": int,
        "source_file": str}}`` dictionaries, one per non-empty chunk. Page
        numbers are 1-based and ``source_file`` is the base name of
        ``file_path``. An empty list is returned (after printing a warning)
        when no page yields any text.

    Raises:
        Exception: Any error raised while opening or reading the PDF is
            printed and re-raised so the caller can decide to skip the file.
    """
    all_page_texts = []
    try:
        with open(file_path, "rb") as f:
            reader = pypdf.PdfReader(f)
            for page_number, page in enumerate(reader.pages, start=1):
                page_text = page.extract_text()
                if page_text:
                    all_page_texts.append({"text": page_text, "page_number": page_number})
    except Exception as e:
        print(f"Error reading PDF file {file_path}: {e}")
        raise

    if not all_page_texts:
        print(f"Warning: No text extracted from {file_path}. It might be an image-only PDF or empty.")
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    source_file = os.path.basename(file_path)
    processed_chunks = []
    for page_info in all_page_texts:
        # Pages are split independently, so a chunk never spans two pages and
        # its page number is always exact.
        for chunk_content in text_splitter.split_text(page_info["text"]):
            # Collapse every run of whitespace (including newlines) to one space.
            clean_chunk = ' '.join(chunk_content.split())
            if clean_chunk:
                processed_chunks.append({
                    "content": clean_chunk,
                    "metadata": {
                        "page_number": page_info["page_number"],
                        "source_file": source_file,
                    }
                })

    return processed_chunks

# --- Main execution block for handling multiple PDFs and storing chunks ---
# Flow: mount Drive -> optionally upload PDFs into the input folder -> parse and
# chunk every PDF found there -> write all chunks to one JSON Lines file ->
# print a short sample.
if __name__ == "__main__":
    print("Mounting Google Drive...")
    drive.mount('/content/drive')
    print("Google Drive mounted.")

    # Define the base directory for your PDF files on Google Drive
    PDF_INPUT_FOLDER = '/content/drive/MyDrive/Colab Notebooks/BajajFinserv_A/EnPDF/'
    # Define the output directory for storing the single aggregated JSONL file
    PARSED_TEXT_OUTPUT_FOLDER = '/content/drive/MyDrive/Colab Notebooks/BajajFinserv_A/Parsed_text/'
    # Define the filename for the single aggregated chunks file
    AGGREGATED_CHUNKS_FILENAME = 'all_policy_chunks.jsonl'
    full_output_jsonl_path = os.path.join(PARSED_TEXT_OUTPUT_FOLDER, AGGREGATED_CHUNKS_FILENAME)

    # Create the input and output folders if they don't exist
    os.makedirs(PDF_INPUT_FOLDER, exist_ok=True)
    os.makedirs(PARSED_TEXT_OUTPUT_FOLDER, exist_ok=True)
    print(f"PDF Input Folder: {PDF_INPUT_FOLDER}")
    print(f"Parsed Text Output Folder: {PARSED_TEXT_OUTPUT_FOLDER}")

    # --- User PDF Upload Section ---
    print("\n--- Upload your PDF files ---")
    print("You can upload multiple PDF files.")
    print("After selecting files, click 'Done' or simply close the upload dialog.")

    uploaded = files.upload() # This opens a file picker dialog

    if uploaded:
        print("\nMoving uploaded files to Google Drive...")
        # `uploaded` is indexed by file name and each value is written out as the
        # file's raw bytes; an existing file with the same name is overwritten.
        for filename in uploaded.keys():
            destination_path = os.path.join(PDF_INPUT_FOLDER, filename)
            with open(destination_path, 'wb') as f:
                f.write(uploaded[filename])
            print(f"Moved '{filename}' to '{PDF_INPUT_FOLDER}'")
    else:
        print("No files were uploaded. Proceeding with existing PDFs in the folder (if any).")
    # --- End of User PDF Upload Section ---


    # Chunks from every processed PDF, in processing order.
    all_document_chunks_aggregated: List[Dict[str, Any]] = []

    if os.path.isdir(PDF_INPUT_FOLDER):
        print(f"\n--- Processing PDFs from: {PDF_INPUT_FOLDER} ---")
        # Case-insensitive extension match; includes PDFs already in the folder
        # before this run, not only the files uploaded above.
        pdf_files = [f for f in os.listdir(PDF_INPUT_FOLDER) if f.lower().endswith('.pdf')]

        if not pdf_files:
            print(f"No PDF files found in {PDF_INPUT_FOLDER}. Please upload some or place them manually.")
        else:
            for pdf_file_name in pdf_files:
                full_pdf_path = os.path.join(PDF_INPUT_FOLDER, pdf_file_name)

                print(f"\n--- Processing {pdf_file_name} ---")
                try:
                    chunks_from_current_pdf = parse_and_chunk_pdf(full_pdf_path)

                    if chunks_from_current_pdf:
                        print(f"  Extracted {len(chunks_from_current_pdf)} chunks from {pdf_file_name}.")
                        all_document_chunks_aggregated.extend(chunks_from_current_pdf)
                    else:
                        print(f"  No chunks extracted from {pdf_file_name}.")

                except Exception as e:
                    # A failing PDF is skipped so the remaining files are still processed.
                    print(f"  Skipping {pdf_file_name} due to error: {e}")

            print(f"\n--- Finished processing all PDFs ---")
            print(f"Total chunks aggregated from all documents: {len(all_document_chunks_aggregated)}")

            # Save all aggregated chunks to a single JSON Lines file
            # (one JSON object per line; the file is rewritten from scratch).
            # Note: when nothing was aggregated the file is not touched, so a
            # file left over from a previous run stays on disk.
            if all_document_chunks_aggregated:
                with open(full_output_jsonl_path, 'w', encoding='utf-8') as outfile:
                    for chunk in all_document_chunks_aggregated:
                        outfile.write(json.dumps(chunk, ensure_ascii=False) + '\n')
                print(f"All aggregated chunks saved to: {full_output_jsonl_path}")
            else:
                print("No chunks were aggregated to save.")


            print("\n--- Sample of aggregated chunks (first 5 across all docs) ---")
            for i, chunk in enumerate(all_document_chunks_aggregated):
                if i < 5:
                    print(f"--- Chunk {i+1} (Source: {chunk['metadata']['source_file']}, Page {chunk['metadata']['page_number']}) ---")
                    print(chunk['content'])
                    print("-" * 20)
                else:
                    break
            if len(all_document_chunks_aggregated) > 5:
                print("...")
                print("Only first 5 combined chunks printed for brevity. The 'all_document_chunks_aggregated' list holds all of them.")

    else:
        print(f"Error: Directory not found at {PDF_INPUT_FOLDER}.")
        print("Please check the path and ensure your Google Drive is mounted correctly.")

# **Vectorizer v1.0**

In [None]:
!pip install -q transformers sentence-transformers faiss-cpu # Install necessary libraries

import os
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any

# --- Configuration ---
# Path to your aggregated chunks file (written by the parser cell)
PARSED_TEXT_OUTPUT_FOLDER = '/content/drive/MyDrive/Colab Notebooks/BajajFinserv_A/Parsed_text/'
AGGREGATED_CHUNKS_FILENAME = 'all_policy_chunks.jsonl'
full_chunks_jsonl_path = os.path.join(PARSED_TEXT_OUTPUT_FOLDER, AGGREGATED_CHUNKS_FILENAME)

# Path to save the FAISS index
FAISS_INDEX_OUTPUT_FOLDER = '/content/drive/MyDrive/Colab Notebooks/BajajFinserv_A/Faiss_Index/'
FAISS_INDEX_FILENAME = 'policy_chunks_faiss_index.bin' # Binary file for FAISS index
FAISS_METADATA_FILENAME = 'policy_chunks_metadata.json' # JSON for metadata (chunk content, page, source)

full_faiss_index_path = os.path.join(FAISS_INDEX_OUTPUT_FOLDER, FAISS_INDEX_FILENAME)
full_faiss_metadata_path = os.path.join(FAISS_INDEX_OUTPUT_FOLDER, FAISS_METADATA_FILENAME)

# Create the FAISS output folder if it doesn't exist
os.makedirs(FAISS_INDEX_OUTPUT_FOLDER, exist_ok=True)

# --- Load Embedding Model ---
print("Loading Sentence Transformer model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded.")

# --- Step 1: Load Chunks ---
print(f"Loading chunks from {full_chunks_jsonl_path}...")
if not os.path.exists(full_chunks_jsonl_path):
    # Raise instead of calling exit(): in a notebook kernel exit() does not give
    # a clean, visible abort of the cell, whereas an exception stops execution
    # right here and shows the reason.
    raise FileNotFoundError(
        f"Chunks file not found at {full_chunks_jsonl_path}. Please run the PDF parsing script first."
    )

all_chunks: List[Dict[str, Any]] = []
with open(full_chunks_jsonl_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline) in the JSONL file.
        if line.strip():
            all_chunks.append(json.loads(line))
print(f"Loaded {len(all_chunks)} chunks.")

if not all_chunks:
    # Without this guard the failure would surface later as an opaque
    # IndexError when reading the embedding dimension of an empty array.
    raise ValueError(
        f"Chunks file {full_chunks_jsonl_path} contains no chunks. Please run the PDF parsing script first."
    )

# Extract content for embedding
chunk_contents: List[str] = [chunk['content'] for chunk in all_chunks]
# Row i of the FAISS index corresponds to entry i of this list, so the order
# must stay identical to `chunk_contents`.
chunk_metadatas: List[Dict[str, Any]] = [
    {
        "source_file": chunk['metadata']['source_file'],
        "page_number": chunk['metadata']['page_number'],
        "content": chunk['content'] # Store content here for easy retrieval
    }
    for chunk in all_chunks
]

# --- Step 2 & 3: Generate Embeddings ---
print(f"Generating embeddings for {len(chunk_contents)} chunks...")
chunk_embeddings = embedding_model.encode(chunk_contents, show_progress_bar=True)
print("Embeddings generated.")

# Ensure embeddings are in float32 for FAISS
chunk_embeddings = np.array(chunk_embeddings).astype('float32')

# --- Step 4 & 5: Initialize and Populate FAISS Index ---
embedding_dimension = chunk_embeddings.shape[1]
print(f"Embedding dimension: {embedding_dimension}")

print("Initializing FAISS index...")
index = faiss.IndexFlatL2(embedding_dimension)

print("Adding embeddings to FAISS index...")
index.add(chunk_embeddings)
print(f"FAISS index contains {index.ntotal} vectors.")

# --- Step 6: Save the FAISS Index and Metadata ---
print(f"Saving FAISS index to {full_faiss_index_path}...")
faiss.write_index(index, full_faiss_index_path)
print("FAISS index saved.")

print(f"Saving chunk metadata to {full_faiss_metadata_path}...")
with open(full_faiss_metadata_path, 'w', encoding='utf-8') as f:
    json.dump(chunk_metadatas, f, ensure_ascii=False, indent=2)
print("Metadata saved.")

print("\n--- FAISS Indexing Complete ---")
print(f"Total chunks indexed: {index.ntotal}")
print("The FAISS index and its corresponding metadata have been saved. They are now ready for retrieval when a user query is provided.")

# **Final query handler**
# **The query block is divided into two blocks**
1.    **Spell checker and grammatical mistake correction**
2.    **The evaluator**


In [None]:
# Install required packages
!pip install openai python-dotenv faiss-cpu sentence-transformers --upgrade -q

# Imports
import os
import json
import faiss
import numpy as np
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any
from openai import OpenAI
import re
import asyncio
import nest_asyncio

# Apply nest_asyncio for async functions in Jupyter/Colab environments
# (the notebook already runs an event loop; this allows asyncio.run() inside it).
nest_asyncio.apply()

# --- Google Drive Mount and .env loading ---
from google.colab import drive
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Google Drive mounted.")

DOTENV_PATH = '/content/drive/MyDrive/Colab Notebooks/BajajFinserv_A/.env'
if not os.path.exists(DOTENV_PATH):
    # Critical dependency. Raise instead of calling exit("..."): in a notebook
    # kernel exit() does not give a clean, visible abort of the cell, whereas an
    # exception stops execution right here and shows the reason.
    raise FileNotFoundError(
        f".env file not found at {DOTENV_PATH}. Please check the path and ensure it's accessible."
    )
load_dotenv(dotenv_path=DOTENV_PATH)
print(f"Loaded .env file from: {DOTENV_PATH}")

# --- Configure for Together.ai using OpenAI client ---
TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY")

if not TOGETHER_API_KEY:
    # Critical dependency: fail before a client is built with an empty key.
    raise RuntimeError(
        "TOGETHER_API_KEY not found in environment variables. Please set it in your .env file."
    )

# The OpenAI client is pointed at Together.ai through `base_url`.
client = OpenAI(
    api_key=TOGETHER_API_KEY,
    base_url="https://api.together.xyz/v1/"
)

LLM_MODEL_TOGETHER = "mistralai/Mistral-7B-Instruct-v0.2" # Using the model you specified

# --- FAISS & Embedding Model Configuration ---
FAISS_INDEX_PATH = '/content/drive/MyDrive/Colab Notebooks/BajajFinserv_A/Faiss_Index/policy_chunks_faiss_index.bin'
METADATA_PATH = '/content/drive/MyDrive/Colab Notebooks/BajajFinserv_A/Faiss_Index/policy_chunks_metadata.json'
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
TOP_K_RETRIEVAL = 5 # Number of chunks to retrieve for RAG

# --- Load FAISS Index ---
print("Loading FAISS index...")
try:
    faiss_index = faiss.read_index(FAISS_INDEX_PATH)
    # Queries are L2-normalised and scored by inner product, so the stored
    # vectors must be unit-length too for the score to be a cosine similarity.
    if not isinstance(faiss_index, faiss.IndexFlatIP):
        ip_index = faiss.IndexFlatIP(faiss_index.d)
        if faiss_index.ntotal > 0:
            stored_vectors = np.ascontiguousarray(
                faiss_index.reconstruct_n(0, faiss_index.ntotal), dtype="float32"
            )
            vector_norms = np.linalg.norm(stored_vectors, axis=1, keepdims=True)
            vector_norms[vector_norms == 0] = 1e-12 # Prevent division by zero
            ip_index.add(np.ascontiguousarray(stored_vectors / vector_norms, dtype="float32"))
        faiss_index = ip_index
        print(f"FAISS index loaded with {faiss_index.ntotal} vectors and converted to IndexFlatIP.")
    else:
        print(f"FAISS index loaded with {faiss_index.ntotal} vectors.")
except Exception as e:
    print(f"Error loading FAISS index: {e}")
    # Raise instead of calling exit("..."): in a notebook kernel exit() does not
    # give a clean, visible abort of the cell, whereas an exception stops
    # execution right here and shows the reason.
    raise RuntimeError("FAISS index failed to load. Please check the path and file.") from e

# --- Load Metadata ---
print("Loading metadata...")
try:
    with open(METADATA_PATH, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
    print(f"Metadata loaded with {len(metadata)} entries.")
    # --- IMPORTANT: VERIFY YOUR METADATA KEYS HERE ---
    if metadata:
        sample_chunk = metadata[0]
        print("\n--- Sample Metadata Chunk Keys (VERIFY THESE!) ---")
        for key in sample_chunk.keys():
            print(f"- {key}: {sample_chunk[key]}")
        print("--------------------------------------------------\n")
        print("Expected keys for display: 'source_document' (or similar), 'page_number' (or 'page'), 'id' (or 'chunk_id'), 'content' (or 'text')")
    else:
        print("Metadata is empty. No chunks to retrieve.")

except Exception as e:
    print(f"Error loading metadata: {e}")
    raise RuntimeError("Metadata failed to load. Please check the path and file.") from e

# --- Load Embedding Model ---
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
print(f"Embedding model '{EMBEDDING_MODEL_NAME}' loaded.")

# --- Normalize function for embeddings (important for IP similarity) ---
def normalize_vectors(vecs: np.ndarray) -> np.ndarray:
    """Scale each row of ``vecs`` to unit L2 length.

    Rows whose norm is exactly zero are divided by 1e-12 instead of 0, so
    they come back as zeros rather than NaN. The input array is not modified.
    """
    row_lengths = np.linalg.norm(vecs, axis=1, keepdims=True)
    # Swap exact-zero lengths for a tiny constant to keep the division finite.
    np.putmask(row_lengths, row_lengths == 0, 1e-12)
    return vecs / row_lengths

# --- Semantic Cleaning ---
def clean_query(query: str) -> str:
    """Reduce a query to alphanumeric words separated by single spaces.

    Every character that is neither alphanumeric nor whitespace is replaced
    by a space (rather than deleted), so words joined only by punctuation
    stay separate tokens: ``"3-month"`` becomes ``"3 month"`` and
    ``"46M,knee"`` becomes ``"46M knee"``. Runs of whitespace are then
    collapsed and the result is stripped.

    Args:
        query: Raw query text.

    Returns:
        The cleaned query; an empty string if nothing alphanumeric remains.
    """
    spaced = ''.join(char if char.isalnum() or char.isspace() else ' ' for char in query)
    return re.sub(r'\s+', ' ', spaced).strip()

# --- Top-k Retriever ---
def retrieve_top_k(query_for_embedding: str, k: int = TOP_K_RETRIEVAL) -> List[Dict[str, Any]]:
    """Return the metadata of the ``k`` chunks most similar to the query.

    The query is embedded, L2-normalised and searched against ``faiss_index``.

    Args:
        query_for_embedding: Text to embed and search with.
        k: Maximum number of chunks to return; it is capped at the number of
            vectors in the index.

    Returns:
        Copies of the matching ``metadata`` entries, best match first, each
        with an added ``'retrieval_score'`` float. Empty when the index is
        empty or ``k`` is not positive.
    """
    if faiss_index.ntotal == 0:
        print("Warning: FAISS index is empty. Cannot perform search.")
        return []

    # Asking FAISS for more neighbours than it holds only yields -1 padding.
    k = min(k, faiss_index.ntotal)
    if k <= 0:
        return []

    query_vector = embedder.encode([query_for_embedding]).astype("float32")
    query_vector = normalize_vectors(query_vector) # Normalize query vector

    D, I = faiss_index.search(query_vector, k)

    results = []
    # D[0] and I[0] are aligned position by position, so pair them directly
    # instead of searching I[0] again for every hit.
    for score, idx in zip(D[0], I[0]):
        idx = int(idx)
        if idx == -1:
            # FAISS uses -1 to mark "no result" slots; not an error.
            continue
        if 0 <= idx < len(metadata):
            chunk = metadata[idx].copy() # Use .copy() to avoid modifying original metadata
            chunk['retrieval_score'] = float(score)
            results.append(chunk)
        else:
            print(f"Warning: Retrieved index {idx} is out of bounds for metadata list (size {len(metadata)}).")
    return results

# --- LLM API Call Function ---
async def call_llm_api(prompt_messages: List[Dict[str, str]], json_output: bool = True) -> str:
    """Send chat messages to the configured LLM and return the reply text.

    Args:
        prompt_messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        json_output: When true, request a JSON-object response format.

    Returns:
        The content of the first choice in the completion.

    Raises:
        ValueError: If ``TOGETHER_API_KEY`` is empty.
        Exception: Any error from the API call is printed and re-raised.
    """
    if not TOGETHER_API_KEY:
        raise ValueError("Together.ai API key is not set. Cannot call LLM API.")

    request_kwargs = dict(
        model=LLM_MODEL_TOGETHER,
        messages=prompt_messages,
        temperature=0.2,
        max_tokens=1024,
    )
    if json_output:
        request_kwargs["response_format"] = {"type": "json_object"}

    try:
        completion = client.chat.completions.create(**request_kwargs)
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as api_error:
        print(f"Error during LLM API call: {api_error}")
        raise

# --- Entity Extraction Function ---
async def enhance_and_extract_query_with_llm(raw_query: str) -> Dict[str, Any]:
    """Ask the LLM to correct/rephrase a query and extract its key entities.

    Args:
        raw_query: The user's query exactly as typed; it is embedded verbatim
            in the prompt.

    Returns:
        The parsed JSON reply of the LLM with ``"extracted_entities"`` forced
        to be a dict without ``None`` values. If the LLM call or the JSON
        parsing fails, a fallback dict is returned whose
        ``"corrected_and_rephrased_query"`` is ``clean_query(raw_query)`` and
        whose ``"extracted_entities"`` is empty.
    """
    # Doubled braces ({{ }}) in the template are literal braces in the f-string.
    prompt = f"""
You are an intelligent query processing agent for an information retrieval system, especially good at understanding details relevant to policies, contracts, or general inquiries. Your task is to refine a user's natural language query and extract key information.

Perform the following steps:
1.  **Correct Spelling and Grammar:** Fix any spelling mistakes or grammatical errors in the query.
2.  **Rephrase for Semantic Search:** Rephrase the query to be concise, grammatically correct, and semantically rich. This rephrasing should inherently consider synonyms and different ways of expressing the same intent, making it suitable for a vector-based semantic search system. Remove any informal greetings or conversational filler words.
3.  **Extract Key Entities:** Identify and extract specific factual entities from the query. If the query implies an insurance or policy context, prioritize extracting relevant details. If not, extract general entities. **Only include entities that are explicitly mentioned or clearly implied.** If an entity type is not mentioned or found, it should be `null` and omitted from the final JSON.

Expected Entity Types (use these keys if applicable, otherwise generalize):
-   `person_name`: Name of a person.
-   `age`: Age (e.g., "46", "30 years old").
-   `gender`: Gender (e.g., "Male", "Female", "M", "F").
-   `problem_description` / `procedure`: Medical condition, surgery, or incident (e.g., "knee surgery", "warehouse fire").
-   `location`: Geographic location (e.g., "Pune", "Delhi").
-   `policy_duration`: Duration or age of a policy (e.g., "3-month policy", "1 year").
-   `document_type`: Type of document (e.g., "policy", "contract", "email").
-   `date` / `time`: Specific date or duration (e.g., "last week", "July 15, 2024").
-   `organization`: Name of a company or institution.
-   `numerical_value`: Any other number with its context/unit (e.g., "100 dollars", "5000 units").
-   `specific_term`: Any other important noun phrase or concept not fitting above (e.g., "grace period", "instalments", "premium").

Here is the user's raw query:
"{raw_query}"

Provide your output as a JSON object with the following structure. Only include the "extracted_entities" fields that have a non-null value. The keys for extracted entities should match the `Expected Entity Types` where applicable, or be a descriptive general term.

Example Output Structure (adapt keys based on query):
{{
    "corrected_and_rephrased_query": "The grammatically correct and semantically enhanced query.",
    "extracted_entities": {{
        "age": "46",
        "gender": "Male",
        "procedure": "knee surgery",
        "location": "Pune",
        "policy_duration": "3-month policy"
    }}
}}
"""
    messages = [{"role": "user", "content": prompt}]

    try:
        llm_output_str = await call_llm_api(messages)
        parsed_output = json.loads(llm_output_str)

        # Guarantee the key exists and is a dict, whatever the model returned.
        if "extracted_entities" not in parsed_output or not isinstance(parsed_output["extracted_entities"], dict):
            parsed_output["extracted_entities"] = {}

        # Drop entities the model reported as null.
        parsed_output["extracted_entities"] = {k: v for k, v in parsed_output["extracted_entities"].items() if v is not None}

        return parsed_output
    except Exception as e:
        # Best effort: any failure (API error, invalid JSON, unexpected JSON
        # type) degrades to the locally cleaned query with no entities.
        print(f"Error during LLM query enhancement. Returning basic fallback: {e}")
        return {
            "corrected_and_rephrased_query": clean_query(raw_query),
            "extracted_entities": {}
        }

# --- Query Processing Pipeline (orchestrates entity extraction and rephrasing) ---
async def process_user_query_pipeline(raw_query: str) -> Dict[str, Any]:
    """Run LLM query enhancement and derive the text used for embedding.

    Args:
        raw_query: The user's query exactly as typed.

    Returns:
        A dict with ``"original_query"``, ``"corrected_and_rephrased_query"``,
        ``"query_for_embedding"`` (the rephrased query passed through
        ``clean_query``) and ``"extracted_entities"``.
    """
    llm_enhanced_output = await enhance_and_extract_query_with_llm(raw_query)

    # The LLM may omit the key or return null / a non-string / a blank string;
    # fall back to the raw query so clean_query() always receives usable text.
    rephrased_query = llm_enhanced_output.get("corrected_and_rephrased_query")
    if not isinstance(rephrased_query, str) or not rephrased_query.strip():
        rephrased_query = raw_query

    extracted_entities = llm_enhanced_output.get("extracted_entities")
    if not isinstance(extracted_entities, dict):
        extracted_entities = {}

    return {
        "original_query": raw_query,
        "corrected_and_rephrased_query": rephrased_query,
        "query_for_embedding": clean_query(rephrased_query),
        "extracted_entities": extracted_entities
    }

# --- Construct Prompt for LLM (Full RAG Response) ---
def construct_rag_prompt(user_query: str, context_chunks: List[Dict[str, Any]], extracted_entities: Dict[str, Any]) -> str:
    """Build the prompt for the final answer-generation LLM call.

    Args:
        user_query: The query to answer.
        context_chunks: Retrieved chunk dicts. The document name is read from
            ``'source_document'`` or, failing that, ``'source_file'`` (the key
            the vectorizer cell of this notebook writes); the text is read
            from ``'content'`` or ``'text'``.
        extracted_entities: Entity name -> value pairs listed in the prompt.

    Returns:
        The prompt text. When ``context_chunks`` is empty, a shorter prompt
        without document context is returned instead.
    """
    if not context_chunks:
        return f"""You are a helpful assistant. No specific document context was found for your query. Please answer based on your general knowledge or state that you cannot find relevant information.
USER QUERY: {user_query}
Provide a comprehensive answer in clear, human-understandable language. If you cannot answer, state why.
"""

    context_sections = []
    for chunk in context_chunks:
        # Accept both key spellings so the real file name reaches the prompt
        # instead of "Unknown" for metadata that stores 'source_file'.
        source_name = chunk.get('source_document') or chunk.get('source_file') or 'Unknown'
        chunk_text = chunk.get('content') or chunk.get('text', '')
        context_sections.append(
            f"--- Document: {source_name}, Page: {chunk.get('page_number', 'N/A')}, Chunk ID: {chunk.get('id', 'N/A')} ---\n{chunk_text}"
        )
    context_text = "\n\n".join(context_sections)

    entities_str = "\n".join([f"- {k.replace('_', ' ').title()}: {v}" for k, v in extracted_entities.items()]) if extracted_entities else "No specific entities extracted."

    # Doubled braces ({{ }}) in the template are literal braces in the f-string.
    prompt = f"""You are an expert information retrieval and analysis assistant.
Your task is to answer user queries truthfully and comprehensively using ONLY the provided "DOCUMENT CONTEXT".
If the answer cannot be found in the context, clearly state "I cannot find a direct answer in the provided documents." and then, if appropriate, provide general information.

Strictly follow these instructions:
1.  **Analyze**: Carefully read the "USER QUERY", "EXTRACTED ENTITIES", and "DOCUMENT CONTEXT".
2.  **Formulate Answer**: Construct a detailed, clear, and human-understandable answer.
3.  **Directness**: Your answer MUST be derived *directly* from the provided DOCUMENT CONTEXT. Do not invent information.
4.  **Completeness**: Address all aspects of the user's query.
5.  **Referencing**: For *every* piece of information you provide that comes from the context, explicitly reference the source using: "[See Document: [Name], Page: [Page Number], Section: [ID]]".
6.  **Conditions/Exceptions**: Ensure to cover any relevant conditions, exceptions, or specific terms in the context that apply.
7.  **Output Format**: Provide the final answer as a JSON object with this structure:
    ```json
    {{
        "Decision": "Information Provided" | "Cannot Determine" | "Specific Decision (e.g., Approved, Rejected, Policy Covered, Not Covered)",
        "Amount": "N/A" | "$[amount]" | "Full Coverage" | "See Justification",
        "Justification": "Your comprehensive answer derived from context, with source references. State if information is not in documents."
    }}
    ```

USER QUERY:
{user_query}

EXTRACTED ENTITIES:
{entities_str}

DOCUMENT CONTEXT:
{context_text}

Your JSON response:
"""
    return prompt

# --- Main Orchestration Function ---
async def run_full_rag_pipeline() -> Dict[str, Any]:
    """Run one interactive query through the whole RAG pipeline.

    Steps: read a query from the user, enhance it and extract entities with
    the LLM, retrieve the top chunks from the FAISS index, print per-entity
    matches for inspection, then ask the LLM for the final JSON decision.

    Returns:
        The final decision dict (``"Decision"``, ``"Amount"``,
        ``"Justification"``), or ``{"error": ...}`` when the query is empty.
        LLM failures are reported inside the decision dict, not raised.
    """

    def _source_name(chunk: Dict[str, Any]) -> str:
        # The vectorizer cell stores the file name under 'source_file';
        # 'source_document' is still honoured for metadata that uses it.
        return chunk.get('source_document') or chunk.get('source_file') or 'Unknown'

    def _score_text(chunk: Dict[str, Any]) -> str:
        # Formatting the 'N/A' default with ':.4f' would raise, so only
        # numeric scores get the numeric format.
        score = chunk.get('retrieval_score')
        return f"{score:.4f}" if isinstance(score, (int, float)) else "N/A"

    user_query = input("Enter your query (e.g., '46M, knee surgery, Pune, 3-month policy' or 'what happens if premium is paid in instalments and not received within the grace period?'): ").strip()
    if not user_query:
        return {"error": "Query cannot be empty."}

    # Step 1: Query Processing and Entity Extraction
    print(f"\n--- Step 1: Processing Query and Extracting Entities ---")
    processed_query_info = await process_user_query_pipeline(user_query)

    original_query = processed_query_info["original_query"]
    rephrased_query = processed_query_info["corrected_and_rephrased_query"]
    query_for_embedding = processed_query_info["query_for_embedding"]
    extracted_entities = processed_query_info["extracted_entities"]

    print(f"Original Query: '{original_query}'")
    print(f"Rephrased for Semantic Search: '{rephrased_query}'")
    print("Processed Query (Extracted Entities):")
    if extracted_entities:
        for key, value in extracted_entities.items():
            print(f"- {key.replace('_', ' ').title()}: {value}")
    else:
        print("No specific entities extracted.")
    print("-" * 50)

    # Step 2: Retrieve Relevant Documents
    print(f"\n--- Step 2: Retrieving Top {TOP_K_RETRIEVAL} Relevant Documents ---")
    # First, retrieve based on the rephrased query for overall context
    top_chunks = retrieve_top_k(query_for_embedding)

    # Optional: Also retrieve based on individual entities if present.
    # These matches are only printed; they are not added to the LLM context.
    if extracted_entities:
        print("\n🔎 Searching based on individual extracted entities (for detailed view):")
        for i, (entity_type, entity_value) in enumerate(extracted_entities.items()):
            search_term = f"{entity_value}" # Use the entity value as the search term
            entity_chunks = retrieve_top_k(search_term, k=TOP_K_RETRIEVAL)
            print(f"\n🔎 Entity {i+1}: '{entity_type.replace('_', ' ').title()}: {entity_value}'")
            if entity_chunks:
                for rank, chunk in enumerate(entity_chunks, 1):
                    content_preview = chunk.get('content', chunk.get('text', ''))
                    print(f"  {rank}. [Score: {_score_text(chunk)}] Page {chunk.get('page_number', 'N/A')} - {_source_name(chunk)}")
                    print(f"     Content: {content_preview[:120]}...\n") # Truncate for display
                # top_chunks.extend(entity_chunks) # Uncomment this if you want to merge these for LLM context
            else:
                print(f"  No specific matches found for entity '{entity_value}'.")
        print("-" * 50)

    if not top_chunks:
        print("No relevant chunks found in the FAISS index for overall query.")
        final_response = {
            "Decision": "Cannot Determine",
            "Amount": "N/A",
            "Justification": "No relevant documents found for your query in the knowledge base."
        }
    else:
        print("\n--- Context for LLM Reasoning (from overall query retrieval) ---")
        for i, chunk in enumerate(top_chunks):
            content_preview = chunk.get('content', chunk.get('text', ''))
            print(f"  {i+1}. Doc: {_source_name(chunk)}, Page: {chunk.get('page_number', 'N/A')}, ID: {chunk.get('id', 'N/A')}, Score: {_score_text(chunk)}, Content: {content_preview[:150]}...")
        print("-" * 50)

        # Step 3: LLM Reasoning and Decision Making (RAG)
        print(f"\n--- Step 3: Generating Final Answer with LLM (RAG) ---")
        rag_prompt = construct_rag_prompt(original_query, top_chunks, extracted_entities)
        try:
            response_content = await call_llm_api([
                {"role": "system", "content": "You are an expert information retrieval and analysis assistant."},
                {"role": "user", "content": rag_prompt}
            ], json_output=True)

            final_response = json.loads(response_content)
        except json.JSONDecodeError as e:
            # Only json.loads raises this, so response_content is always bound here.
            print(f"Error decoding JSON from final LLM response: {e}")
            print(f"Raw LLM response: {response_content}")
            final_response = {
                "Decision": "Error",
                "Amount": "N/A",
                "Justification": f"LLM returned malformed JSON: {response_content}"
            }
        except Exception as e:
            print(f"An error occurred during final RAG response generation: {e}")
            final_response = {
                "Decision": "Error",
                "Amount": "N/A",
                "Justification": f"An unexpected error occurred during final answer generation: {e}"
            }

    print("\n--- Final RAG Decision ---")
    print(json.dumps(final_response, indent=2))
    print("\n" + "="*80 + "\n")
    return final_response # Return the final RAG response, not just entities

# --- Run the Combined Pipeline ---
if __name__ == "__main__":
    # Startup banner showing which models and .env file are in use.
    print("Starting the full RAG system pipeline.")
    print(f"Using Together.ai LLM: {LLM_MODEL_TOGETHER}")
    print(f"Using Embedding Model: {EMBEDDING_MODEL_NAME}")
    print(f"API Key path: {DOTENV_PATH}")
    print("\n" + "="*80 + "\n")

    async def main_interactive_loop():
        """Answer queries until the user declines another one or an error occurs."""
        keep_going = True
        while keep_going:
            try:
                await run_full_rag_pipeline()

                answer = input("Run another query? (yes/no): ")
                # Only an explicit "yes" (any case, surrounding spaces ignored) continues.
                keep_going = answer.strip().lower() == 'yes'
            except Exception as loop_error:
                print(f"An unhandled error occurred in the main execution loop: {loop_error}")
                keep_going = False

    asyncio.run(main_interactive_loop())