In [10]:
# Full LLM Document Processing System (Integrated Tarun, Aahil, and Bharat's tasks)
# This script performs document extraction, cleaning, chunking, embedding,
# FAISS indexing, and uses a generative LLM for structured decision-making.

import os
import numpy as np
import pickle
import json
import re
import sys
import requests
import io
import glob
import email # For EML parsing

# --- Unified Library Installation and Imports ---
print("--- Initializing: Installing necessary libraries ---")
try:
    # Use !{sys.executable} -m pip to ensure installation into the correct Python environment
    !{sys.executable} -m pip install requests python-docx PyMuPDF faiss-cpu transformers sentence-transformers -qq --upgrade

    # Import all libraries after ensuring they are installed
    import fitz # PyMuPDF
    import docx # python-docx
    from urllib.parse import urlparse
    from sentence_transformers import SentenceTransformer
    import faiss
    import torch
    from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, T5ForConditionalGeneration

    print("✅ All required libraries checked/installed successfully.")
except Exception as e:
    print(f"❌ Error during library installation: {e}")
    print("Please ensure your Colab environment has internet access and try again.")
    sys.exit(1) # Exit the script if critical libraries cannot be installed

print("\n--- Starting Document Processing Pipeline ---")

# --- 1. Tarun's Task: Data Extraction and Cleaning ---
print("\n--- Tarun's Task: Data Extraction and Cleaning ---")

def download_file(url: str) -> bytes:
    """Downloads content from a given URL."""
    print(f"Attempting to download from URL: {url}")
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
        return r.content
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return b""

def extract_text_from_pdf_bytes(bts: bytes) -> str:
    """Extracts text from PDF bytes."""
    text = []
    try:
        with fitz.open(stream=bts, filetype="pdf") as doc:
            for page in doc:
                text.append(page.get_text())
    except Exception as e:
        print(f"Error extracting text from PDF bytes: {e}")
        return ""
    return "\n".join(text)

def extract_text_from_docx_bytes(bts: bytes) -> str:
    """Extracts text from DOCX bytes."""
    bio = io.BytesIO(bts)
    text = []
    try:
        doc = docx.Document(bio)
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        text = paragraphs
    except Exception as e:
        print(f"Error extracting text from DOCX bytes: {e}")
        return ""
    return "\n".join(text)

def extract_text_from_eml_bytes(bts: bytes) -> str:
    """Extracts text from EML bytes (email files)."""
    msg = email.message_from_bytes(bts)
    parts = []
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == "text/plain":
            payload = part.get_payload(decode=True)
            try:
                parts.append(payload.decode(part.get_content_charset() or 'utf-8', errors="ignore"))
            except:
                parts.append(payload.decode(errors="ignore"))
        elif content_type == "text/html":
            payload = part.get_payload(decode=True)
            try:
                parts.append(payload.decode(part.get_content_charset() or 'utf-8', errors="ignore"))
            except:
                parts.append(payload.decode(errors="ignore"))
    return "\n".join(parts)

def extract_text_from_file(path: str) -> str:
    """Extracts text from a local file (PDF, DOCX, or EML)."""
    print(f"Extracting text from local file: {path}")
    with open(path, "rb") as f:
        content = f.read()
    if path.lower().endswith(".pdf"):
        return extract_text_from_pdf_bytes(content)
    elif path.lower().endswith(".docx"):
        return extract_text_from_docx_bytes(content)
    elif path.lower().endswith(".eml"):
        return extract_text_from_eml_bytes(content)
    else:
        print(f"Warning: Unsupported file type for local file {path}. Attempting to decode as plain text.")
        return content.decode(errors="ignore")

def clean_text(text: str) -> str:
    """
    Cleans the extracted text by removing excessive newlines/spaces
    and retaining logical paragraph breaks.
    """
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = re.sub(r'(?<!\n)\n(?![\n\s])', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

# Tarun's main execution part
TARUN_OUTPUT_DIR = "cleaned_texts"
os.makedirs(TARUN_OUTPUT_DIR, exist_ok=True)
print(f"Cleaned texts (in JSON) will be saved in: {TARUN_OUTPUT_DIR}")

processed_documents = []
local_files = glob.glob("*.pdf") + glob.glob("*.docx") + glob.glob("*.eml")

if not local_files:
    print("No local PDF, DOCX, or EML files found. Please upload your documents to Colab.")
    print("Example: Drag and drop 'policy.pdf' into the file browser on the left.")
    sys.exit("No documents found to process. Please upload files and run again.")

for filepath in local_files:
    print(f"\n--- Processing {filepath} ---")
    raw_text = ""
    if filepath.lower().endswith((".pdf", ".docx", ".eml")):
        raw_text = extract_text_from_file(filepath)
    else:
        print(f"Skipping {filepath}: Unsupported file type.")
        continue

    if raw_text:
        extracted_text = clean_text(raw_text)
        processed_documents.append({
            "filename": os.path.basename(filepath),
            "cleaned_text": extracted_text
        })
        print(f"✅ Processed {os.path.basename(filepath)}")
        # Optional: Keep this preview if you still want to see it, otherwise comment out
        # print(f"Preview:\n{extracted_text[:min(len(extracted_text), 2000)]}{'...' if len(extracted_text) > 2000 else ''}\n")
    else:
        print(f"Skipping {filepath} due to extraction errors or empty content.")

TARUN_OUTPUT_JSON_FILENAME = os.path.join(TARUN_OUTPUT_DIR, "all_cleaned_documents.json")
with open(TARUN_OUTPUT_JSON_FILENAME, "w", encoding="utf-8") as f:
    json.dump(processed_documents, f, indent=4)
print(f"\n🎉 Tarun's Task Completed: All cleaned texts saved to {TARUN_OUTPUT_JSON_FILENAME} in JSON format.")

# --- 2. Aahil's Task: Text Chunking and Embedding Generation ---
print("\n--- Aahil's Task: Text Chunking and Embedding Generation ---")

# Configuration for Aahil's Task
AAHIL_OUTPUT_DIR = "chunks_and_embeddings"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

os.makedirs(AAHIL_OUTPUT_DIR, exist_ok=True)
print(f"Chunks and embeddings will be saved in: {AAHIL_OUTPUT_DIR}")

print(f"Loading sentence-transformer model '{EMBEDDING_MODEL_NAME}'...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("✅ Embedding model loaded successfully.")

def chunk_text(text: str, filename: str, chunk_size: int, chunk_overlap: int) -> tuple[list[str], list[str]]:
    chunks = []
    chunk_sources = []
    current_pos = 0

    while current_pos < len(text):
        end_pos = min(current_pos + chunk_size, len(text))
        chunk = text[current_pos:end_pos]
        chunks.append(chunk)
        chunk_sources.append(filename)
        current_pos += chunk_size - chunk_overlap
        if current_pos < 0: # Ensure no negative index
            current_pos = 0
    return chunks, chunk_sources

all_chunks_list = []
all_embeddings_list = []
all_source_files_list = []

if not os.path.exists(TARUN_OUTPUT_JSON_FILENAME) or not processed_documents:
    print(f"❌ Error: No cleaned documents found from Tarun's task at '{TARUN_OUTPUT_JSON_FILENAME}'. Exiting Aahil's task.")
else:
    try:
        with open(TARUN_OUTPUT_JSON_FILENAME, "r", encoding="utf-8") as f:
            documents_for_aahil = json.load(f)
        print(f"Loaded {len(documents_for_aahil)} documents for chunking and embedding.")
    except json.JSONDecodeError as e:
        print(f"❌ Error decoding JSON for Aahil: {e}. Exiting Aahil's task.")
        documents_for_aahil = []

    if documents_for_aahil:
        for doc in documents_for_aahil:
            filename = doc.get("filename", "unknown_file")
            content = doc.get("cleaned_text", "")

            if content.strip():
                chunks, chunk_sources = chunk_text(content, filename, CHUNK_SIZE, CHUNK_OVERLAP)
                print(f"Generated {len(chunks)} chunks from '{filename}'")

                if chunks:
                    with torch.no_grad():
                        embeddings = embedding_model.encode(chunks, convert_to_tensor=True, show_progress_bar=True)

                    all_chunks_list.extend(chunks)
                    all_embeddings_list.append(embeddings.cpu().numpy())
                    all_source_files_list.extend(chunk_sources)
                else:
                    print(f"No chunks generated for '{filename}'. Skipping embeddings.")
            else:
                print(f"Skipping '{filename}' as its cleaned text content is empty.")

    if all_chunks_list and all_embeddings_list:
        final_embeddings = np.vstack(all_embeddings_list)

        with open(os.path.join(AAHIL_OUTPUT_DIR, "all_chunks.pkl"), "wb") as f:
            pickle.dump(all_chunks_list, f)
        with open(os.path.join(AAHIL_OUTPUT_DIR, "all_embeddings.pkl"), "wb") as f:
            pickle.dump(final_embeddings, f)
        with open(os.path.join(AAHIL_OUTPUT_DIR, "source_files.pkl"), "wb") as f:
            pickle.dump(all_source_files_list, f)
        print(f"\n🎉 Aahil's Task Completed: Chunks, embeddings, and source files ready for Bharat.")
    else:
        print("\n❌ Aahil's Task Failed: No chunks or embeddings were generated. Cannot proceed to Bharat's task.")
        sys.exit(1) # Exit if Aahil's task failed

# --- 3. Bharat's Task: LLM Document Processing System ---
print("\n--- Bharat's Task: LLM Document Processing System ---")

# Configuration for Bharat's Task
LLM_MODEL_NAME = "google/flan-t5-small"

def load_aahil_data(directory: str):
    chunks = None
    embeddings = None
    source_files = None
    try:
        with open(os.path.join(directory, "all_chunks.pkl"), "rb") as f:
            chunks = pickle.load(f)
        with open(os.path.join(directory, "all_embeddings.pkl"), "rb") as f:
            embeddings = pickle.load(f)
        with open(os.path.join(directory, "source_files.pkl"), "rb") as f:
            source_files = pickle.load(f)
        print(f"✅ Successfully loaded {len(chunks)} chunks and embeddings of shape {embeddings.shape}")
        return chunks, embeddings, source_files
    except FileNotFoundError as e:
        print(f"Error: Missing data file in '{directory}'. Please ensure Aahil's script ran successfully and saved all .pkl files.")
        print(f"Details: {e}")
    except Exception as e:
        print(f"An unexpected error occurred while loading Aahil's data: {e}")
    return None, None, None

chunks, embeddings, source_files = load_aahil_data(AAHIL_OUTPUT_DIR)

if chunks is None or embeddings is None or source_files is None:
    print("Exiting. Cannot proceed with Bharat's task without all data loaded from Aahil's task.")
    sys.exit(1)

# Initialize FAISS Index
embeddings = embeddings.astype('float32')
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print(f"✅ FAISS index created and populated with {index.ntotal} vectors.")

# Load Query Embedding Model (same as Aahil's)
print(f"Loading query embedding model '{EMBEDDING_MODEL_NAME}'...")
query_embedding_model_bharat = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("✅ Query embedding model loaded.")

# Load Generative LLM for Decision and Structured Output
print(f"Loading Generative LLM pipeline: '{LLM_MODEL_NAME}'...")
device = 0 if torch.cuda.is_available() else -1
text_generation_pipeline = None
try:
    text_generation_pipeline = pipeline(
        "text2text-generation",
        model=LLM_MODEL_NAME,
        tokenizer=LLM_MODEL_NAME,
        device=device,
        max_new_tokens=512,
        truncation=True
    )
    print(f"✅ Generative LLM pipeline loaded successfully on {'GPU' if device == 0 else 'CPU'}.")
except Exception as e:
    print(f"❌ Error loading Generative LLM pipeline '{LLM_MODEL_NAME}'. This might be due to network issues or model availability.")
    print(f"Details: {e}")
    print("The system will proceed, but will only return relevant chunks, not generate structured answers.")
    text_generation_pipeline = None

# --- LLM Document Processing System Interaction Loop ---
print("\n--- LLM Document Processing System Ready! Type 'exit' to quit. ---")
while True:
    user_query = input("\nYour Query: ")
    if user_query.lower() == 'exit':
        print("Exiting Document Processing System. Goodbye! 👋")
        break

    print("\nSearching for relevant clauses... 🤔")
    query_embedding = query_embedding_model_bharat.encode([user_query])
    query_embedding = np.array(query_embedding).astype('float32')

    k_retrieve = 15 # Increased k for more context
    distances, indices = index.search(query_embedding, k_retrieve)

    relevant_chunks = []
    relevant_sources_info = []

    for i, idx in enumerate(indices[0]):
        if 0 <= idx < len(chunks):
            relevant_chunks.append(chunks[idx])
            source_info = f"Source: {source_files[idx]}"
            relevant_sources_info.append(source_info)
        else:
            print(f"Warning: FAISS returned an out-of-bounds index: {idx}")

    if not relevant_chunks:
        print("No highly relevant clauses found for your query. Please try rephrasing.")
        continue

    # --- Print statements for debugging (uncomment if you need to see raw chunks) ---
    # print(f"Found {len(relevant_chunks)} relevant clauses. ✨")
    # print("\n--- Relevant Clauses (Full Content for Debugging) ---")
    # for i, chunk in enumerate(relevant_chunks):
    #     print(f"Clause {i+1} ({relevant_sources_info[i]}):\n{chunk}\n")

    if text_generation_pipeline:
        context_str = "\n".join([f"Clause {i+1}: {c}" for i, c in enumerate(relevant_chunks)])

        prompt = f"""
        You are an expert document processing system. Your task is to analyze a user query and provided relevant policy clauses to determine a decision, an amount (if applicable), and provide a justification.

        **User Query Details:**
        The user query is: "{user_query}"
        From this query, identify these specific details:
        - Age of individual:
        - Gender of individual:
        - Procedure/Condition:
        - Location of procedure:
        - Policy duration:

        **Relevant Policy Clauses:**
        {context_str}

        **Instructions:**
        1.  First, **extract all the specific facts directly from the 'User Query Details' section**. For example, for "46-year-old male, knee surgery in Pune, 3-month-old insurance policy", the facts are: Age 46, Gender Male, Procedure Knee Surgery, Location Pune, Policy Duration 3 months.
        2.  **Strictly evaluate these extracted facts against the 'Relevant Policy Clauses'**.
        3.  Determine the **"Decision"** (e.g., "Approved", "Rejected", "Pending Further Review"). Base this solely on whether the extracted query facts meet the conditions in the clauses.
        4.  Determine the **"Amount"** if an exact value is specified or can be clearly inferred from the clauses based on the user query's details. If no specific amount is mentioned for the given conditions, state "N/A" or "To be determined" with a brief reason.
        5.  Provide a **"Justification"** that clearly explains your decision. You **MUST explicitly reference the clause numbers** (e.g., "Clause 1 states...") and quote or paraphrase the specific parts of the clauses that support your decision for each extracted fact. Make sure to link each query detail to the clause that governs it, especially regarding the location mentioned in the query.
        6.  Your output MUST be a **well-formed JSON object** with the following keys: "Decision", "Amount", "Justification".

        **Example Response Format:**
        ```json
        {{
          "Decision": "Approved",
          "Amount": "N/A",
          "Justification": "Based on Clause 1: 'Cataract surgery is covered for individuals above 50 years of age...' (patient is 55). Clause 1 also states: '...provided the policy has been active for a minimum of 90 days (3 months)...' (policy is 6 months old). Clause 2 states: 'All surgical procedures performed at network hospitals within major metropolitan cities, including Mumbai, are eligible for 100% coverage...' (procedure is in Mumbai)."
        }}
        ```
        """
        print("\nGenerating structured response... 🤖")
        try:
            llm_output = text_generation_pipeline(prompt)[0]['generated_text']

            # --- ALWAYS PRINT RAW LLM OUTPUT FOR DEBUGGING THIS ERROR ---
            print("\n--- Raw LLM Output (for debugging) ---")
            print(llm_output)

            json_str = ""
            # Try to extract content within ```json ... ``` markdown block first
            json_match = re.search(r'```json\n(.*?)```', llm_output, re.DOTALL)
            if json_match:
                json_str = json_match.group(1).strip()
            else:
                # If no markdown block, try to find the JSON object directly
                # Find the first '{' and the last '}'
                start_idx = llm_output.find('{')
                end_idx = llm_output.rfind('}')
                if start_idx != -1 and end_idx != -1 and start_idx < end_idx:
                    json_str = llm_output[start_idx : end_idx + 1].strip()
                else:
                    # Fallback to the entire output if JSON delimiters not found
                    # This case will likely lead to JSONDecodeError if not pure JSON
                    json_str = llm_output.strip()

            # Check if extracted json_str is empty before attempting to load
            if not json_str:
                print("⚠️ Warning: Extracted JSON string is empty. LLM might not have generated valid JSON.")
                # You might want to skip json.loads and indicate a failure here
                # For now, we proceed to let json.loads raise the error, which gives "char 0"
                # This explicitly confirms if the issue is an empty string.

            parsed_response = json.loads(json_str)

            print("\n--- Structured JSON Response ---")
            print(json.dumps(parsed_response, indent=4))


        except Exception as e:
            print(f"❌ An unexpected error occurred during LLM response generation: {e}")
            print("Please inspect the 'Raw LLM Output' above for debugging LLM's raw generation.")
    else:
        print("\nGenerative LLM not available. Cannot generate structured answer.")


--- Initializing: Installing necessary libraries ---
✅ All required libraries checked/installed successfully.

--- Starting Document Processing Pipeline ---

--- Tarun's Task: Data Extraction and Cleaning ---
Cleaned texts (in JSON) will be saved in: cleaned_texts

--- Processing policy.pdf ---
Extracting text from local file: policy.pdf
✅ Processed policy.pdf

🎉 Tarun's Task Completed: All cleaned texts saved to cleaned_texts/all_cleaned_documents.json in JSON format.

--- Aahil's Task: Text Chunking and Embedding Generation ---
Chunks and embeddings will be saved in: chunks_and_embeddings
Loading sentence-transformer model 'all-MiniLM-L6-v2'...
✅ Embedding model loaded successfully.
Loaded 1 documents for chunking and embedding.
Generated 242 chunks from 'policy.pdf'


Batches:   0%|          | 0/8 [00:00<?, ?it/s]


🎉 Aahil's Task Completed: Chunks, embeddings, and source files ready for Bharat.

--- Bharat's Task: LLM Document Processing System ---
✅ Successfully loaded 242 chunks and embeddings of shape (242, 384)
✅ FAISS index created and populated with 242 vectors.
Loading query embedding model 'all-MiniLM-L6-v2'...
✅ Query embedding model loaded.
Loading Generative LLM pipeline: 'google/flan-t5-small'...


Device set to use cpu


✅ Generative LLM pipeline loaded successfully on CPU.

--- LLM Document Processing System Ready! Type 'exit' to quit. ---


KeyboardInterrupt: Interrupted by user

"""
        print("\nGenerating structured response... 🤖")
        try:
            llm_output = text_generation_pipeline(prompt)[0]['generated_text']
            print("\n--- Raw LLM Output (for debugging) ---")
            print(llm_output)

            json_match = re.search(r'```json\n(.*?)```', llm_output, re.DOTALL)
            if json_match:
                json_str = json_match.group(1).strip()
            else:
                json_str = llm_output.strip()

            parsed_response = json.loads(json_str)

            print("\n--- Structured JSON Response ---")
            print(json.dumps(parsed_response, indent=4))

        except json.JSONDecodeError as e:
            print(f"❌ Error parsing LLM's JSON output: {e}")
            print("LLM output was not valid JSON. Please inspect the 'Raw LLM Output' above.")
            print("Raw LLM output was:\n", llm_output)
        except Exception as e:
            print(f"❌ An unexpected error occurred during LLM response generation: {e}")
            print("Raw relevant clauses were displayed above.")
    else:
        print("\nGenerative LLM not available. Displaying raw relevant clauses instead of a structured answer.")


  ```