In [3]:
# %% [markdown]
# # Part 1: The Colab Foundry - Building the Cognitive Core
# This notebook handles the heavy lifting: ingesting documents and building the vector/sparse search indexes.
# The output of this notebook will be the `policies.lancedb` folder and the `bm25_index.pkl` file, which our live API will use.

# %%
# Step 1: Install all necessary dependencies
!pip install -q "unstructured[pdf,docx]" pymupdf "camelot-py[cv]" pytesseract
!pip install -q sentence-transformers rank_bm25 lancedb spacy
!python -m spacy download en_core_web_lg

# %%
# Step 2: Import libraries and mount Google Drive
import os
import pickle
import shutil
from typing import List

import lancedb
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import CompositeElement, Text
from unstructured.partition.auto import partition
import camelot

from google.colab import drive
# Expose the user's Google Drive inside the runtime at /content/drive; the folder
# paths configured later in this notebook all live under this mount point.
drive.mount('/content/drive')

# Printed unconditionally once the mount call returns; it does not verify the installs.
print("All dependencies installed and Google Drive mounted.")

# %%
# Step 3: Configuration - project paths on the mounted Google Drive
# PDF_FOLDER holds the source documents; OUTPUT_ARTIFACTS_PATH receives the built indexes.
PDF_FOLDER = "/content/drive/MyDrive/pdfs"
OUTPUT_ARTIFACTS_PATH = "/content/drive/MyDrive/artifacts"

# Create both folders up front (no-op when they already exist).
for _required_dir in (PDF_FOLDER, OUTPUT_ARTIFACTS_PATH):
    os.makedirs(_required_dir, exist_ok=True)

# Tell the user where to drop documents and where the results will land.
print(
    f"Please upload your initial set of policy documents to: {PDF_FOLDER}",
    f"The final database artifacts will be saved to: {OUTPUT_ARTIFACTS_PATH}",
    sep="\n",
)


# %%
# Step 4: The Invincible Ingestion Pipeline
def run_ingestion_pipeline(folder_path: str) -> List[CompositeElement]:
    """
    Processes all documents within a specified folder into a clean,
    unified list of text chunks.

    Args:
        folder_path: Directory whose regular, non-hidden files are ingested.
            Sub-directories are not descended into.

    Returns:
        The chunks produced by ``chunk_by_title`` for every document that could
        be partitioned, visited in sorted file-name order. Each chunk has
        ``metadata.source`` set to the base name of its originating file.
        An empty list is returned when the folder is missing or holds no files.
    """
    print(f"🚀 Starting the Invincible Ingestion Pipeline on folder: '{folder_path}'")

    # Only regular, non-hidden files are documents: os.listdir() also yields
    # sub-directories and dot-files, which partition() cannot process.
    # Sorting makes the chunk order reproducible between runs.
    doc_files = []
    if os.path.isdir(folder_path):
        for name in sorted(os.listdir(folder_path)):
            candidate = os.path.join(folder_path, name)
            if name.startswith(".") or not os.path.isfile(candidate):
                continue
            doc_files.append(candidate)

    if not doc_files:
        print(f"⚠️ WARNING: No document files found in '{folder_path}'.")
        return []

    print(f"Found {len(doc_files)} documents to process.")
    final_chunks = []

    for doc_path in doc_files:
        source_name = os.path.basename(doc_path)
        print(f"\nProcessing document: '{source_name}'")

        # 1. High-fidelity partitioning with unstructured.
        # A single unreadable/unsupported file must not abort the whole batch.
        try:
            elements = partition(filename=doc_path)
        except Exception as e:
            print(f"  - ⚠️ Partitioning failed, skipping this document: {e}")
            continue

        # 2. Surgical table extraction with Camelot for PDFs (best effort:
        #    a Camelot failure keeps the text elements already extracted).
        if doc_path.lower().endswith('.pdf'):
            try:
                tables = camelot.read_pdf(doc_path, pages='all', flavor='lattice')
                if tables.n > 0:
                    print(f"  - ✅ Found {tables.n} tables. Converting to Markdown.")
                    for table in tables:
                        elements.append(Text(f"\n--- TABLE START ---\n{table.df.to_markdown()}\n--- TABLE END ---\n"))
            except Exception as e:
                print(f"  - ⚠️ Camelot table extraction failed: {e}")

        # 3. Semantic chunking
        chunks = chunk_by_title(elements, max_characters=1024, combine_text_under_n_chars=256)

        # Add source metadata to each chunk
        for chunk in chunks:
            chunk.metadata.source = source_name
            final_chunks.append(chunk)

        print(f"  - ✅ Document chunked into {len(chunks)} semantic blocks.")

    print("\n✅ Invincible Ingestion Pipeline Complete.")
    return final_chunks


# %%
# Step 5: The Cognitive Core Construction
def build_and_save_cognitive_core(chunks: List[CompositeElement], output_path: str):
    """
    Builds the Vector Store (LanceDB) and Sparse Index (BM25) and saves them
    to the specified output path for deployment.

    Args:
        chunks: Chunks to index; each must expose ``.text`` and ``.metadata.source``.
        output_path: Folder that receives ``policies.lancedb`` and ``bm25_index.pkl``
            (created if missing).

    Returns:
        None. When ``chunks`` is empty an error is printed and nothing is written.

    Notes:
        Row ``i`` of the LanceDB table and document ``i`` of the BM25 index both
        come from ``chunks[i]``, so the two artifacts must always be rebuilt together.
    """
    if not chunks:
        print("❌ ERROR: No chunks provided to build the cognitive core.")
        return

    print(f"🚀 Building Cognitive Core from {len(chunks)} chunks...")
    corpus_texts = [chunk.text for chunk in chunks]
    sources = [chunk.metadata.source for chunk in chunks]
    os.makedirs(output_path, exist_ok=True)

    # --- Part 1: Build Vector Store (Dense Embeddings) LOCALLY ---
    print("\n  - Part 1: Building Vector Store on local filesystem for speed...")

    # Define a temporary local path for the DB build
    local_db_path = "/content/policies_local.lancedb"
    if os.path.exists(local_db_path):
        shutil.rmtree(local_db_path)

    # No explicit device: sentence-transformers selects a GPU when one is
    # available and otherwise falls back to CPU, so CPU-only runtimes still work.
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = embedding_model.encode(corpus_texts, show_progress_bar=True)

    # Connect to the LOCAL path
    db = lancedb.connect(local_db_path)
    data_for_db = [
        {"vector": vector, "text": text, "source": source}
        for vector, text, source in zip(vectors, corpus_texts, sources)
    ]

    print("  - Creating LanceDB table locally...")
    db.create_table("policies", data=data_for_db)
    print(f"  - ✅ LanceDB table 'policies' created successfully at: '{local_db_path}'")

    # --- Part 2: Build Sparse Index (Keyword Precision) with BM25 ---
    print("\n  - Part 2: Building Sparse Index with BM25...")
    # split() with no argument splits on ANY whitespace run. split(" ") kept
    # newlines/tabs glued inside tokens (e.g. "shall\nbe") and produced empty
    # tokens for repeated spaces, so many words could never match a query term.
    tokenized_corpus = [doc.lower().split() for doc in corpus_texts]
    bm25 = BM25Okapi(tokenized_corpus)

    # Write to a sibling temp file and swap it in, so an interrupted write never
    # leaves a truncated pickle under the final name.
    bm25_path = os.path.join(output_path, "bm25_index.pkl")
    bm25_tmp_path = bm25_path + ".tmp"
    with open(bm25_tmp_path, "wb") as f:
        pickle.dump(bm25, f)
    os.replace(bm25_tmp_path, bm25_path)
    print(f"  - ✅ BM25 index saved to: '{bm25_path}'")

    # --- Part 3: Copy the COMPLETED local DB to Google Drive ---
    print("\n  - Part 3: Moving completed database to Google Drive for persistence...")
    final_db_path_drive = os.path.join(output_path, "policies.lancedb")

    # Stage the new database next to the old one first; the previous artifact is
    # removed only once the (slow, cross-filesystem) copy has fully completed.
    staging_db_path = final_db_path_drive + ".incoming"
    if os.path.exists(staging_db_path):
        shutil.rmtree(staging_db_path)
    shutil.move(local_db_path, staging_db_path)

    if os.path.exists(final_db_path_drive):
        shutil.rmtree(final_db_path_drive)
    os.rename(staging_db_path, final_db_path_drive)
    print(f"  - ✅ LanceDB artifact successfully moved to: '{final_db_path_drive}'")

    print("\n✅ Cognitive Core is built and saved!")


# %%
# Step 6: Execute the Full Pipeline
# Upload your documents to PDF_FOLDER before running this cell.
all_chunks = run_ingestion_pipeline(PDF_FOLDER)

# Only build the indexes when ingestion actually produced something.
if not all_chunks:
    print("\n--- Pipeline finished with no chunks produced. Please check your document folder. ---")
else:
    build_and_save_cognitive_core(all_chunks, OUTPUT_ARTIFACTS_PATH)
    print("\n--- 🏁 ALL DONE! Your artifacts are ready in Google Drive. ---")

[0mCollecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
All dependencies installed and Google Drive mounted.
Please upload your initial set of policy documents to: /content/drive/MyDrive/pdfs
The final database artifacts will be saved to: /content/drive/MyD



  - ✅ Found 17 tables. Converting to Markdown.
  - ✅ Document chunked into 398 semantic blocks.

Processing document: 'hack3.pdf'
  - ✅ Document chunked into 9 semantic blocks.

Processing document: 'hack4.pdf'
  - ✅ Found 37 tables. Converting to Markdown.
  - ✅ Document chunked into 283 semantic blocks.

Processing document: 'hack5.pdf'
  - ✅ Found 22 tables. Converting to Markdown.
  - ✅ Document chunked into 269 semantic blocks.

Processing document: 'hack6.pdf'
  - ✅ Found 7 tables. Converting to Markdown.
  - ✅ Document chunked into 139 semantic blocks.

✅ Invincible Ingestion Pipeline Complete.
🚀 Building Cognitive Core from 1674 chunks...

  - Part 1: Building Vector Store on local filesystem for speed...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


  - Creating LanceDB table locally...
  - ✅ LanceDB table 'policies' created successfully at: '/content/policies_local.lancedb'

  - Part 2: Building Sparse Index with BM25...
  - ✅ BM25 index saved to: '/content/drive/MyDrive/artifacts/bm25_index.pkl'

  - Part 3: Moving completed database to Google Drive for persistence...
  - ✅ LanceDB artifact successfully moved to: '/content/drive/MyDrive/artifacts/policies.lancedb'

✅ Cognitive Core is built and saved!

--- 🏁 ALL DONE! Your artifacts are ready in Google Drive. ---


In [5]:
!python -m spacy download en_core_web_lg
!pip install -q "crawl4ai>=0.6.0" together nest_asyncio
!playwright install

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.7/40.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m392.6/392.6 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.1/96.1 kB[0m [31m9.4 MB/s

In [10]:
# %%
# Step 7.1: Import additional libraries for testing
import os
import pickle
import json
import asyncio
import nest_asyncio

import lancedb
from sentence_transformers import SentenceTransformer, CrossEncoder
from google.colab import userdata
from pydantic import BaseModel, Field
from typing import List, Optional

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMExtractionStrategy, LLMConfig

# Apply asyncio patch for Colab
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop and the crawler code below is async — confirm it is still needed.
nest_asyncio.apply()

print("Testing libraries loaded.")

# %%
# Step 7.2 (Definitive Version): AdjudicatorTester with GPT-4o-mini upgrade

class AdjudicatorTester:
    """Two-stage, LLM-backed claim adjudicator over the prebuilt Cognitive Core.

    Retrieval combines dense (LanceDB) and sparse (BM25) search, reranks the
    union with a cross-encoder, lets the LLM narrow the candidates down to the
    relevant clauses (stage 1) and then asks it for a verdict (stage 2).
    """

    def __init__(self, artifacts_path: str):
        """Load the search artifacts and initialise every model.

        Args:
            artifacts_path: Folder containing ``policies.lancedb`` and ``bm25_index.pkl``.

        Raises:
            FileNotFoundError: If either artifact is missing.
            ValueError: If the two artifacts index a different number of chunks,
                or ``OPENAI_API_KEY`` cannot be read from Colab Secrets.
        """
        print("🚀 Initializing the Definitive Adjudicator Tester with GPT-4o-mini...")

        # Load Artifacts
        db_path = os.path.join(artifacts_path, "policies.lancedb")
        bm25_path = os.path.join(artifacts_path, "bm25_index.pkl")
        if not os.path.exists(db_path) or not os.path.exists(bm25_path):
            raise FileNotFoundError("Cognitive Core artifacts not found!")

        db = lancedb.connect(db_path)
        self.table = db.open_table("policies")
        # SECURITY: unpickling executes arbitrary code. Only load an index that
        # was produced by this project's own build step, never an untrusted file.
        with open(bm25_path, "rb") as f:
            self.bm25 = pickle.load(f)
        self.corpus_texts = self.table.to_pandas()['text'].tolist()

        # BM25 scores are positional: score i is looked up as corpus_texts[i].
        # If the two artifacts come from different builds the lookups would be
        # silently wrong, so refuse to start instead.
        indexed_docs = getattr(self.bm25, "corpus_size", None)
        if indexed_docs is not None and indexed_docs != len(self.corpus_texts):
            raise ValueError(
                f"Cognitive Core artifacts are out of sync: BM25 indexes {indexed_docs} chunks "
                f"but the LanceDB table holds {len(self.corpus_texts)}. Rebuild both together."
            )
        print("  - ✅ Cognitive Core loaded.")

        # Initialize Models with new LLM
        try:
            # IMPORTANT: Make sure you have 'OPENAI_API_KEY' in your Colab Secrets
            openai_api_key = userdata.get('OPENAI_API_KEY')
        except Exception as exc:
            # Chain the original error so the real cause (missing secret vs.
            # notebook access not granted) stays visible in the traceback.
            raise ValueError("OPENAI_API_KEY not found in Colab Secrets.") from exc
        if not openai_api_key:
            raise ValueError("OPENAI_API_KEY not found in Colab Secrets.")

        # No explicit device: both model classes select a GPU when one is
        # available and otherwise fall back to CPU.
        self.dense_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

        # --- MODEL UPGRADE ---
        self.llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=openai_api_key)

        print("  - ✅ All models initialized. Using openai/gpt-4o-mini for adjudication.")
        print("\n✅ Adjudicator Tester is ready!")

    @staticmethod
    def _first_extracted_block(raw_content: str) -> dict:
        """Parse an extraction payload and return its first JSON object.

        Args:
            raw_content: JSON text holding either one object or a list of objects.

        Returns:
            The object itself, or the first element when the payload is a list.

        Raises:
            ValueError: If the text is not valid JSON, the list is empty, the
                selected element is not an object, or it carries a truthy
                ``"error"`` flag.
        """
        payload = json.loads(raw_content)  # JSONDecodeError is a ValueError
        if isinstance(payload, list):
            if not payload:
                raise ValueError("extraction returned an empty list")
            payload = payload[0]
        if not isinstance(payload, dict):
            raise ValueError(f"expected a JSON object, got {type(payload).__name__}")
        # NOTE(review): crawl4ai appears to report failed LLM calls as blocks
        # flagged with "error": true — confirm against the installed version.
        # Such a block must never be mistaken for a real answer.
        if payload.get("error"):
            raise ValueError(f"LLM extraction reported an error: {payload.get('content', payload)}")
        return payload

    @staticmethod
    def _clean_clauses(raw_clauses) -> List[str]:
        """Keep only the non-blank string items of an LLM-provided clause list."""
        if not isinstance(raw_clauses, list):
            return []
        return [clause for clause in raw_clauses if isinstance(clause, str) and clause.strip()]

    async def _run_llm_extraction(self, instruction: str, schema: dict):
        """Run one schema-constrained LLM call through crawl4ai and return its result object.

        The whole prompt travels in ``instruction``; the ``raw://placeholder``
        URL only gives the crawler a dummy document to run the strategy on.
        """
        strategy = LLMExtractionStrategy(
            llm_config=self.llm_config,
            schema=schema,
            instruction=instruction,
            extraction_type="schema"
        )
        async with AsyncWebCrawler() as crawler:
            return await crawler.arun(url="raw://placeholder", config=CrawlerRunConfig(extraction_strategy=strategy))

    def _retrieve_and_rerank(self, query: str, top_k: int = 15, candidate_pool: int = 20) -> List[str]:
        """Hybrid retrieval followed by cross-encoder reranking.

        Args:
            query: The user's claim/question.
            top_k: Number of reranked clauses to return.
            candidate_pool: How many hits to take from each retriever (dense
                and BM25) before deduplication and reranking.

        Returns:
            Up to ``top_k`` clause texts, best first; empty when nothing was retrieved.
        """
        print(f"\n  - Performing Hybrid Retrieval for query: '{query}'")

        # Dense leg: nearest neighbours of the query embedding.
        query_vector = self.dense_model.encode(query)
        vector_results_df = self.table.search(query_vector).limit(candidate_pool).to_pandas()
        dense_texts = vector_results_df['text'].tolist()

        # Sparse leg: split on any whitespace run so repeated spaces do not
        # produce empty-string query terms.
        tokenized_query = query.lower().split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        top_bm25_indices = sorted(
            range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True
        )[:candidate_pool]
        sparse_texts = [self.corpus_texts[i] for i in top_bm25_indices]

        # dict.fromkeys removes duplicates while keeping first-seen order.
        combined_texts = list(dict.fromkeys(dense_texts + sparse_texts))
        if not combined_texts:
            print("  - ⚠️ No candidate clauses were retrieved.")
            return []

        print("  - Reranking retrieved clauses...")
        sentence_pairs = [[query, text] for text in combined_texts]
        scores = self.cross_encoder.predict(sentence_pairs, show_progress_bar=True)
        reranked_results = sorted(zip(scores, combined_texts), key=lambda pair: pair[0], reverse=True)
        top_chunks = [text for _score, text in reranked_results[:top_k]]
        print(f"  - ✅ Retrieved and reranked top {len(top_chunks)} clauses.")
        return top_chunks

    async def adjudicate(self, query: str):
        """Runs the full TWO-STAGE adjudication process with text-based filtering.

        Args:
            query: The user's claim in natural language.

        Returns:
            The verdict dict produced by the LLM with an added ``"clauses"`` key
            listing the clauses it was shown; an ``{"decision": "Error", ...}``
            dict when retrieval found nothing; or ``None`` when stage 2 fails.
        """

        candidate_clauses = self._retrieve_and_rerank(query)
        if not candidate_clauses:
            return {"decision": "Error", "justification": "Could not retrieve any clauses."}

        # --- STAGE 1: LLM-POWERED CLAUSE SELECTION (TEXT-BASED) ---
        print("\n  - Adjudication Stage 1: Filtering for hyper-relevant clauses...")

        class ClauseSelectionResponse(BaseModel):
            relevant_clauses: List[str] = Field(description="A list containing the full, verbatim text of the most relevant policy clauses.")

        numbered_candidates = "\n\n".join([f"--- Clause {i+1} ---\n{chunk}" for i, chunk in enumerate(candidate_clauses)])
        selection_prompt = f"""From the numbered policy clauses below, extract the full, verbatim text of ALL clauses that are directly relevant to making a decision on the user's claim. Consider coverage, waiting periods, and exclusions.
        **User's Claim:** "{query}"
        **Numbered Policy Clauses:**
        {numbered_candidates}
        Respond ONLY with a JSON object containing a list of strings, where each string is the verbatim text of a relevant clause."""

        selection_result = await self._run_llm_extraction(
            selection_prompt, ClauseSelectionResponse.model_json_schema()
        )

        # Stage 1 is an optimisation only: any failure falls back to all candidates.
        final_clauses = candidate_clauses
        if selection_result.success and selection_result.extracted_content:
            try:
                selection_json = self._first_extracted_block(selection_result.extracted_content)
                # Drop non-string/blank items so the join below cannot fail.
                extracted_texts = self._clean_clauses(selection_json.get("relevant_clauses", []))
                if extracted_texts:
                    final_clauses = extracted_texts
                    print(f"  - ✅ Stage 1 complete. Refined context to {len(final_clauses)} essential clauses.")
                else:
                    print("  - ⚠️ Stage 1 returned no selection. Using all clauses as fallback.")
            except Exception as e:
                print(f"  - ⚠️ Stage 1 parsing failed: {e}. Using all clauses as fallback.")
        else:
            print("  - ⚠️ Stage 1 got no response from the LLM. Using all clauses as fallback.")

        # --- STAGE 2: FINAL VERDICT WITH CLEANED CONTEXT ---
        print("\n  - Adjudication Stage 2: Making final decision with refined context...")

        class AdjudicationResponse(BaseModel):
            decision: str
            amount: Optional[str]
            justification: str

        final_context = "\n\n---\n\n".join(final_clauses)
        verdict_prompt = f"""You are a hyper-vigilant insurance claim adjudicator. You must follow a strict reasoning process.
        **User's Claim:** "{query}"
        **Relevant Policy Clauses ONLY:** {final_context}
        **Your Reasoning Process:**
        1.  **Fact Extraction:** From the user's claim, identify the key facts (policy duration, medical procedure).
        2.  **Rule Identification:** From the provided clauses, what are the specific rules and waiting periods?
        3.  **Comparison & Decision:** Compare the facts against the rules. State if the claim meets all conditions.
        4.  **Final Output:** Based on your comparison, generate the final JSON object. The justification must summarize your reasoning.
        Respond with nothing but the required JSON object."""

        final_result = await self._run_llm_extraction(
            verdict_prompt, AdjudicationResponse.model_json_schema()
        )

        if not (final_result.success and final_result.extracted_content):
            print("❌ Stage 2 failed to get a response from the LLM.")
            return None

        try:
            final_json = self._first_extracted_block(final_result.extracted_content)
        except Exception as e:
            print(f"❌ Stage 2 parsing failed: {e}")
            return None

        # Attach the evidence so the caller can audit the decision.
        final_json['clauses'] = final_clauses
        return final_json

# NOTE: the execution cell (Step 7.3) below does not need to change when this class is edited.
# After updating the AdjudicatorTester definition above, re-run it first and then re-run Step 7.3.

# %%
# Step 7.3: Run the Test
async def run_test(
    artifacts_path: str = "/content/drive/MyDrive/artifacts",
    test_query: str = "I have a policy for 1 year and need a joint replacement surgery. Is this covered?",
):
    """Run one end-to-end adjudication and print the outcome.

    Args:
        artifacts_path: Folder holding the built artifacts. This must match the
            output path used by the build step.
        test_query: The claim to adjudicate.

    Returns:
        None. The verdict (or a failure notice) is printed.
    """
    if not os.path.exists(artifacts_path):
        print("❌ ERROR: Artifacts path not found. Please run the build pipeline first.")
        return

    # Initialize our testing engine
    tester = AdjudicatorTester(artifacts_path)

    print("\n--- Running Test Adjudication ---")
    print(f"Test Query: \"{test_query}\"")

    # Get the final decision
    final_decision = await tester.adjudicate(test_query)

    if final_decision:
        print("\n🏆======= FINAL ADJUDICATION =======🏆\n")
        print(json.dumps(final_decision, indent=2))
        print("\n======================================")
    else:
        print("\n--- TEST FAILED ---")

# Execute the async test function.
# Top-level `await` works here because notebook cells support it; in a plain
# Python script this call would need asyncio.run(run_test()) instead.
await run_test()

Testing libraries loaded.
🚀 Initializing the Definitive Adjudicator Tester with GPT-4o-mini...
  - ✅ Cognitive Core loaded.
  - ✅ All models initialized. Using openai/gpt-4o-mini for adjudication.

✅ Adjudicator Tester is ready!

--- Running Test Adjudication ---
Test Query: "I have a policy for 1 year and need a joint replacement surgery. Is this covered?"

  - Performing Hybrid Retrieval for query: 'I have a policy for 1 year and need a joint replacement surgery. Is this covered?'
  - Reranking retrieved clauses...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  - ✅ Retrieved and reranked top 15 clauses.

  - Adjudication Stage 1: Filtering for hyper-relevant clauses...


  - ✅ Stage 1 complete. Refined context to 3 essential clauses.

  - Adjudication Stage 2: Making final decision with refined context...




{
  "decision": "Denied",
  "amount": null,
  "justification": "The user's claim for joint replacement surgery is denied due to the 36-month waiting period for treatment of joint replacement unless arising from an accident. The user has a policy for only 1 year, which does not meet the required waiting period.",
  "clauses": [
    "27. Joint replacement surgery",
    "3) 30-day waiting period (Code - Excl03)\na. Expenses related to the treatment of any Illness within 30 days from the first Policy commencement date shall\nbe excluded except claims arising due to an Accident, provided the same are covered.\nb. This exclusion shall not, however apply if the Insured has Continuous Coverage for more than twelve months.",
    "36 Months waiting period 1. Treatment for joint replacement unless arising from accident"
  ]
}

