In [6]:
%pip install python-docx openai pydantic colorama

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: python-docx, colorama
Successfully installed colorama-0.4.6 python-docx-1.2.0


In [1]:
import os

# ------------------------------------------------------------------
# CONFIGURATION
# ------------------------------------------------------------------
# Never store the key in the notebook: reuse one already exported to the kernel's
# environment, otherwise ask for it at a hidden prompt. The scripts launched later
# with `!python ...` read OPENAI_API_KEY from this environment.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenRouter API key: ").strip()
os.environ["OPENAI_BASE_URL"] = "https://openrouter.ai/api/v1"

# Confirm the key is present without echoing the secret into the saved cell output.
print("OPENAI_API_KEY is set" if os.environ["OPENAI_API_KEY"] else "OPENAI_API_KEY is empty")
# Choose your model. For hackathons, Claude 3.5 Sonnet is amazing at code/structure.
# OpenRouter Model ID: "anthropic/claude-3.5-sonnet" or "openai/gpt-4o"
MODEL_NAME = "tngtech/deepseek-r1t2-chimera:free"

sk-or-v1-


In [10]:
!pip install -q -U google-generativeai python-docx

In [None]:
import os
import google.generativeai as genai

# Do not paste the key into the notebook (and do not overwrite an exported key with an
# empty string): reuse GOOGLE_API_KEY from the environment or ask at a hidden prompt.
if not os.environ.get("GOOGLE_API_KEY"):
    from getpass import getpass
    os.environ["GOOGLE_API_KEY"] = getpass("Google API key: ").strip()

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [32]:
# Runs analyze_v2.py on suitability_report.docx.
# NOTE(review): this cell sits above the %%writefile cells that create analyze_v2.py,
# so on a fresh kernel it has to be run after one of them.
!python analyze_v2.py suitability_report.docx

Scanning deep structure of suitability_report.docx...
✅ Found 0 variables (including headers/footers).


In [17]:
%%writefile analyze.py
import os
import json
import asyncio
import re
from docx import Document
from openai import AsyncOpenAI

# -----------------------------------------------------------------------------
# 1. CONFIGURATION
# -----------------------------------------------------------------------------
# Read once at import time; None when OPENAI_API_KEY is not set in the environment.
API_KEY = os.getenv("OPENAI_API_KEY")
# Endpoint handed to AsyncOpenAI as base_url.
# NOTE(review): this is a literal; an OPENAI_BASE_URL environment variable is not consulted here.
BASE_URL = "https://openrouter.ai/api/v1"
# Model ID sent with every chat-completion request.
MODEL = "google/gemini-2.5-flash"

# Concurrency Limit
# Size of the asyncio.Semaphore that analyze_document() shares between all requests.
MAX_CONCURRENT_REQUESTS = 10

# System message for every request. It defines the reply contract that analyze_segment()
# parses: a JSON object whose "variables" key holds a list of objects.
ANALYSIS_SYSTEM_PROMPT = """
You are a Financial Document Expert. Analyze the text segment to identify "Template Variables".

LOOK FOR:
1. **Client Details:** Name (e.g., Sarah), Age.
2. **Financial Goals:** specific amounts (£500k), dates (3 years), rates (8%).
3. **Line Items:** If the text is a row in a table (e.g. "Business Capital | £750,000"), extract the values.

OUTPUT JSON FORMAT:
{
  "variables": [
    { "original_text": "Sarah", "suggested_tag": "client_first_name", "type": "text" },
    { "original_text": "£750,000", "suggested_tag": "asset_business_value", "type": "money" }
  ]
}
If no variables found, return { "variables": [] }
"""

# -----------------------------------------------------------------------------
# 2. HELPER FUNCTIONS
# -----------------------------------------------------------------------------
def clean_json_response(content):
    """
    Strip model decoration from a reply so that only the JSON payload remains.

    Removes <think>...</think> reasoning blocks and Markdown code fences
    (```json and bare ```), then trims surrounding whitespace.

    Args:
        content: Raw message content from the model; may be None or empty
            when the model returned no text.

    Returns:
        The cleaned string, or "" when there was no content.
    """
    # A completion can come back without text; treat that as an empty reply
    # instead of letting re.sub() raise TypeError on None.
    if not content:
        return ""
    content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL)
    content = re.sub(r'```json\s*', '', content)
    content = re.sub(r'```\s*', '', content)
    return content.strip()

def _extract_variables(cleaned):
    """Parse a cleaned model reply and return its list of variable dicts ([] when unusable)."""
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        # The reply may wrap the JSON object in prose: retry on the outermost {...} span.
        start = cleaned.find('{')
        end = cleaned.rfind('}')
        if start == -1 or end <= start:
            return []
        try:
            data = json.loads(cleaned[start:end + 1])
        except json.JSONDecodeError:
            return []

    if not isinstance(data, dict):
        return []
    variables = data.get("variables")
    if not isinstance(variables, list):
        return []
    # Keep only well-formed entries so one stray string does not discard the rest.
    return [v for v in variables if isinstance(v, dict)]


async def analyze_segment(sem, client, text, context_label, metadata):
    """
    Analyzes a text segment (paragraph or table row).

    Args:
        sem: Semaphore bounding the number of concurrent requests.
        client: Object exposing ``chat.completions.create(...)`` as a coroutine.
        text: The segment text sent to the model.
        context_label: Short description of where the text came from; included
            in the prompt and in error messages.
        metadata: Dict of location fields merged into every returned variable.

    Returns:
        A list of variable dicts, each updated with ``metadata``. Returns []
        when the request fails or the reply holds no usable variables; the
        failure is reported on stderr instead of being dropped silently.
    """
    import sys

    async with sem:
        try:
            # We provide context to the LLM ("This is a Table Row")
            prompt = f"Context: {context_label}\nAnalyze this text: '{text}'"

            completion = await client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": ANALYSIS_SYSTEM_PROMPT},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1
            )

            cleaned = clean_json_response(completion.choices[0].message.content)
            variables = _extract_variables(cleaned)

            # Enrich with location metadata
            for v in variables:
                v.update(metadata)

            return variables

        except Exception as e:
            # Best effort per segment, but make the failure visible: a bad key or
            # model name would otherwise just look like "0 variables found".
            print(f"[analyze_segment] {context_label} failed: {type(e).__name__}: {e}", file=sys.stderr)
            return []

# -----------------------------------------------------------------------------
# 3. MAIN LOGIC
# -----------------------------------------------------------------------------
async def analyze_document(file_path):
    """
    Send every paragraph and table row of a .docx to the model and save the detections.

    Paragraphs are sent together with the previous paragraph as context; table
    rows are flattened to "cell | cell | ..." strings. Results from all segments
    are concatenated and written as JSON next to the input file, with the
    extension replaced by "_analysis.json".

    Args:
        file_path: Path to the document to analyze.

    Returns:
        None. Prints progress and the output path; returns early when the file
        does not exist.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    doc = Document(file_path)
    client = AsyncOpenAI(
        api_key=API_KEY,
        base_url=BASE_URL,
        default_headers={"HTTP-Referer": "https://colab.research.google.com"}
    )

    print(f"Analyzing {file_path}...")
    tasks = []
    sem = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

    # --- A. SCAN PARAGRAPHS (With Context Window) ---
    print("Scanning Paragraphs...")
    prev_text = ""
    for i, para in enumerate(doc.paragraphs):
        text = para.text.strip()

        # Lower threshold to catch "Sarah" or "Age 55"
        if len(text) < 3:
            continue

        # Create a "Context Window" (Previous Para + Current Para)
        # This helps the AI know that "Sarah" is a Name.
        combined_input = f"Previous Line: {prev_text}\nCurrent Line: {text}"

        tasks.append(analyze_segment(
            sem, client, combined_input, "Paragraph",
            {"location_type": "paragraph", "index": i}
        ))

        # Very short lines are poor context, so keep the last substantial one.
        prev_text = text if len(text) > 5 else prev_text

    # --- B. SCAN TABLES (Crucial for Financial Docs) ---
    print(f"Scanning {len(doc.tables)} Tables...")
    for t_idx, table in enumerate(doc.tables):
        for r_idx, row in enumerate(table.rows):
            # Convert row to a CSV-like string for the LLM
            # e.g., "Business Capital | Sarah | £750,000 | 15%"
            cells = [c.text.strip() for c in row.cells if c.text.strip()]
            if not cells:
                continue

            row_text = " | ".join(cells)

            # Header rows (row 0) are scanned like any other row.
            tasks.append(analyze_segment(
                sem, client, row_text, f"Table {t_idx} Row {r_idx}",
                {"location_type": "table", "table_index": t_idx, "row_index": r_idx}
            ))

    # --- C. EXECUTE & SAVE ---
    print(f"Processing {len(tasks)} segments...")
    results = await asyncio.gather(*tasks)
    all_detections = [item for sublist in results for item in sublist]

    # Swap only the extension. str.replace() would return the input path unchanged
    # for e.g. "report.DOCX", and the JSON would then overwrite the document itself.
    base_path, _ = os.path.splitext(file_path)
    out_file = f"{base_path}_analysis.json"
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(all_detections, f, indent=2)

    print(f"\n✅ Found {len(all_detections)} variables (Clients, Assets, Liabilities).")
    print(f"Saved to {out_file}")

if __name__ == "__main__":
    import sys
    # Print usage instead of failing with IndexError when no path is given.
    if len(sys.argv) < 2:
        print("Usage: python analyze.py <doc.docx>")
    else:
        asyncio.run(analyze_document(sys.argv[1]))

Writing analyze.py


In [25]:
%%writefile injection.py
import json
import sys
import os
from docx import Document

# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
# ANSI escape sequences used to colour console messages; RESET ends the colour.
GREEN = "\033[92m"
YELLOW = "\033[93m"
RED = "\033[91m"
RESET = "\033[0m"

def normalize_text(text):
    """
    Normalizes text to handle smart quotes and non-breaking spaces.

    Every substitution is one character for one character; surrounding
    whitespace is then stripped.

    Args:
        text: The string to normalize; None or "" is accepted.

    Returns:
        The normalized string, or "" for falsy input.
    """
    if not text:
        return ""
    # Both single curly quotes are folded, not only the closing one, so that
    # quoted phrases compare equal to their straight-quote form.
    folded = text.translate(str.maketrans({
        '\u00a0': ' ',   # non-breaking space
        '\u2018': "'",   # left single quote
        '\u2019': "'",   # right single quote / apostrophe
        '\u201c': '"',   # left double quote
        '\u201d': '"',   # right double quote
    }))
    return folded.strip()

def inject_tags(docx_path, json_path):
    """
    Replace detected values in a .docx with ``{{ tag }}`` placeholders.

    Reads a list of variable dicts from ``json_path`` (keys used:
    ``original_text``, ``suggested_tag``, ``location_type``, ``index``,
    ``table_index``, ``row_index``), replaces each ``original_text`` at its
    recorded location and saves the result next to the input with a
    "_tagged.docx" suffix.

    Args:
        docx_path: Path to the source document.
        json_path: Path to the analysis JSON.

    Returns:
        None. Progress, failures and the final count are printed.
    """
    if not os.path.exists(docx_path):
        print(f"{RED}Error: Document not found at {docx_path}{RESET}")
        return
    if not os.path.exists(json_path):
        print(f"{RED}Error: JSON analysis not found at {json_path}{RESET}")
        return

    doc = Document(docx_path)
    with open(json_path, "r") as f:
        variables = json.load(f)

    print(f"Injecting {len(variables)} tags into {docx_path}...")
    success_count = 0

    # Snapshot the paragraph list once instead of re-reading doc.paragraphs per variable.
    paragraphs = doc.paragraphs

    for var in variables:
        original_text = var.get('original_text')
        tag_name = var.get('suggested_tag')
        loc_type = var.get('location_type', 'paragraph')
        target_index = var.get('index')

        if not original_text or not tag_name:
            continue

        # Create the Tag
        new_tag = f"{{{{ {tag_name} }}}}"

        # -------------------------------------------------------
        # STRATEGY 1: PARAGRAPH (With Search Window)
        # -------------------------------------------------------
        if loc_type == 'paragraph':
            # An entry without a usable index would otherwise crash on `None < int`.
            if not isinstance(target_index, int) or target_index < 0:
                print(f"{RED}Skipping '{original_text}': missing or invalid paragraph index{RESET}")
                continue

            found = False

            # Check the exact index first
            if target_index < len(paragraphs):
                para = paragraphs[target_index]
                if smart_replace(para, original_text, new_tag):
                    success_count += 1
                    found = True

            # If not found, check neighbors (Fuzzy Search)
            # This handles cases where paragraph counts shift slightly
            if not found:
                start = max(0, target_index - 2)
                end = min(len(paragraphs), target_index + 3)
                for i in range(start, end):
                    if i == target_index:
                        continue  # Already checked
                    if smart_replace(paragraphs[i], original_text, new_tag):
                        print(f"{YELLOW}  -> Auto-corrected index from {target_index} to {i}{RESET}")
                        success_count += 1
                        found = True
                        break

            if not found:
                # DEBUG OUTPUT: Show user what went wrong
                print(f"{RED}Failed to find '{original_text}' near Para {target_index}{RESET}")
                if target_index < len(paragraphs):
                    print(f"   Context in Doc: '{paragraphs[target_index].text[:50]}...'")

        # -------------------------------------------------------
        # STRATEGY 2: TABLE INJECTION
        # -------------------------------------------------------
        elif loc_type == 'table':
            try:
                table_idx = var.get('table_index')
                row_idx = var.get('row_index')
                if table_idx is not None and row_idx is not None:
                    row = doc.tables[table_idx].rows[row_idx]
                    # Replace in every cell that holds the value, but count the
                    # variable once so the total can never exceed len(variables).
                    replaced = False
                    for cell in row.cells:
                        for p in cell.paragraphs:
                            if smart_replace(p, original_text, new_tag):
                                replaced = True
                    if replaced:
                        success_count += 1
                    else:
                        print(f"{RED}Failed to find '{original_text}' in Table {table_idx} Row {row_idx}{RESET}")
            except Exception as e:
                print(f"{RED}Table error: {e}{RESET}")

    # Swap only the extension; str.replace() would return the input path unchanged
    # for e.g. "report.DOCX" and the save would then overwrite the source document.
    base_path, _ = os.path.splitext(docx_path)
    output_path = f"{base_path}_tagged.docx"
    doc.save(output_path)
    print(f"\n{GREEN}Success! Injected {success_count}/{len(variables)} tags.{RESET}")
    print(f"Saved to: {output_path}")

def smart_replace(paragraph, search_text, replace_text):
    """
    Replace every occurrence of search_text in a paragraph with replace_text.

    Matching is done on normalized text (see normalize_text), so a search for
    "3 years" also finds "3<nbsp>years". The replacement is applied to the
    paragraph's runs: text before and after a match keeps the run it lived in,
    and the replacement is written into the run where the match starts. A match
    that spans several runs therefore no longer collapses the paragraph into a
    single run.

    Args:
        paragraph: Object exposing ``runs`` (each with a writable ``text``)
            and a readable/writable ``text``.
        search_text: Text to look for.
        replace_text: Text to insert in its place.

    Returns:
        True when at least one replacement was made, otherwise False.
    """
    needle = normalize_text(search_text)
    if not needle:
        # Nothing meaningful to search for (empty or whitespace-only).
        return False

    def comparable(raw):
        # normalize_text() strips edge whitespace, which would shift offsets. The NUL
        # sentinels shield the edges, so the result lines up character for character
        # with `raw` (this relies on normalize_text's substitutions being 1:1).
        return normalize_text("\x00" + raw + "\x00")[1:-1]

    # 1. Run-level replacement (preserves bold/colour of the surrounding text)
    runs = list(paragraph.runs)
    replaced = False
    cursor = 0
    while True:
        texts = [run.text for run in runs]
        hit = comparable("".join(texts)).find(needle, cursor)
        if hit == -1:
            break
        stop = hit + len(needle)

        run_start = 0
        for run, text in zip(runs, texts):
            run_end = run_start + len(text)
            if run_start < stop and run_end > hit:
                # This run overlaps the match: cut the matched slice out of it.
                lo = max(hit, run_start) - run_start
                hi = min(stop, run_end) - run_start
                # Only the run in which the match begins receives the replacement.
                insert = replace_text if run_start <= hit else ""
                run.text = text[:lo] + insert + text[hi:]
            run_start = run_end

        # Resume after the inserted text so a tag containing the search text
        # (e.g. "name" -> "{{ client_name }}") is not matched again.
        cursor = hit + len(replace_text)
        replaced = True

    if replaced:
        return True

    # 2. Fallback: the text is visible in paragraph.text but not through its runs
    # (presumably content held outside direct runs - TODO confirm which cases).
    # Assigning paragraph.text may lose some styling, but ensures the tag is inserted.
    raw = paragraph.text
    folded = comparable(raw)
    hit = folded.find(needle)
    if hit == -1:
        return False

    pieces = []
    cursor = 0
    while hit != -1:
        pieces.append(raw[cursor:hit])
        pieces.append(replace_text)
        cursor = hit + len(needle)
        hit = folded.find(needle, cursor)
    pieces.append(raw[cursor:])
    paragraph.text = "".join(pieces)
    return True

if __name__ == "__main__":
    # Both the document and its analysis JSON must be given on the command line.
    if len(sys.argv) >= 3:
        inject_tags(sys.argv[1], sys.argv[2])
    else:
        print("Usage: python injection.py <doc.docx> <analysis.json>")

Overwriting injection.py


In [26]:
# Feeds suitability_report_analysis.json (the filename analyze.py derives for this
# document) into injection.py.
# NOTE(review): no visible cell runs analyze.py - confirm the JSON exists before running.
!python injection.py suitability_report.docx suitability_report_analysis.json

Injecting 45 tags into suitability_report.docx...
[91mFailed to find '£500,000' near Para 29[0m
   Context in Doc: '   				     3 years through strategic growth initi...'
[91mFailed to find '3 years' near Para 31[0m
   Context in Doc: 'Investment Diversification...'
[91mFailed to find '8%' near Para 34[0m
   Context in Doc: 'Early Retirement...'
[91mFailed to find '£60,000' near Para 40[0m
   Context in Doc: 'aai!	www.advisoryai.com	...'
[91mFailed to find '55' near Para 40[0m
   Context in Doc: 'aai!	www.advisoryai.com	...'

[92mSuccess! Injected 40/45 tags.[0m
Saved to: suitability_report_tagged.docx


In [28]:
%pip install docxtpl

Collecting docxtpl
  Downloading docxtpl-0.20.2-py3-none-any.whl.metadata (2.8 kB)
Downloading docxtpl-0.20.2-py3-none-any.whl (17 kB)
Installing collected packages: docxtpl
Successfully installed docxtpl-0.20.2


In [29]:
%%writefile render.py
import os
import json
import sys
from docxtpl import DocxTemplate
from datetime import datetime

# -----------------------------------------------------------------------------
# MOCK DATA (In a real app, this comes from your Qdrant/Postgres DB)
# -----------------------------------------------------------------------------
# Flat mapping of template tag name -> replacement string. render_report() passes this
# dict to DocxTemplate.render() as the context and reads "client_name" for its banner.
MOCK_CLIENT_DATA = {
    # 1. Simple Text Variables
    "client_first_name": "Bruce",
    "client_name": "Bruce Wayne",
    "client_full_name": "Mr. Bruce Wayne",

    # 2. Financial Goals (The text you tagged)
    "business_revenue_target_amount": "£10,000,000",
    "business_revenue_target_timeline": "5 years",
    "investment_target_annual_return": "12%",
    "retirement_annual_income_goal": "£500,000",
    "retirement_age_goal": "60",

    # 3. Table Data (Matching tags in your table)
    "asset_type": "Wayne Enterprises Stock",
    "asset_business_value": "£5,000,000",
    "asset_monthly_income": "£25,000",
    "asset_current_debt": "£0",
    "asset_growth_rate": "8%",
    "asset_value": "£1,200,000",
    "asset_monthly_contribution": "£5,000",
    "asset_withdrawal": "£0",

    # 4. Investment Specifics
    "investment_type": "Global Tech Fund",
    "investment_amount": "£2,000,000",
    "current_value": "£2,400,000",
    "expected_return_rate": "10%",
    "asset_income": "£12,000",

    # 5. Emergency Fund
    "emergency_fund_name": "High Yield Cash",
    "emergency_fund_current_value": "£100,000",
    "emergency_fund_monthly_contribution": "£1,000",
    "emergency_fund_target_value": "£500,000",
    "emergency_fund_interest_rate": "4.5%",
    "asset_current_value": "£50,000"
}

def render_report(template_path, output_path):
    """
    Render a tagged .docx template with MOCK_CLIENT_DATA and save the result.

    Args:
        template_path: Path to the template containing ``{{ tag }}`` placeholders.
        output_path: Where the rendered document is written.

    Returns:
        None. Prints the outcome; a render failure is reported, not raised.
    """
    if not os.path.exists(template_path):
        print(f"Error: Template not found at {template_path}")
        return

    print(f"Rendering report for {MOCK_CLIENT_DATA['client_name']}...")

    try:
        doc = DocxTemplate(template_path)

        # With Jinja2's default (non-strict) undefined handling, a tag that has no
        # value renders as empty text rather than failing, so list such tags up
        # front instead of silently producing a report with blanks.
        missing = sorted(set(doc.get_undeclared_template_variables()) - set(MOCK_CLIENT_DATA))
        if missing:
            print(f"Warning: tags with no value in MOCK_CLIENT_DATA (will render blank): {', '.join(missing)}")

        # The Magic: Jinja2 merges the data into the tags
        doc.render(MOCK_CLIENT_DATA)

        doc.save(output_path)
        print(f"✅ Success! Report generated: {output_path}")

    except Exception as e:
        print(f"❌ Render failed: {e}")
        # Hint for debugging Jinja errors. Jinja2 words this as "... is undefined";
        # the older phrase is kept so any message that does contain it still matches.
        message = str(e)
        if "is undefined" in message or "undeclared variable" in message:
            print("Tip: A tag in the Docx doesn't match a key in MOCK_CLIENT_DATA.")

if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if cli_args:
        # The output name is derived from the template name.
        template_file = cli_args[0]
        render_report(template_file, template_file.replace(".docx", "_FINAL.docx"))
    else:
        print("Usage: python render.py <tagged_template.docx>")

Overwriting render.py


In [30]:
# Renders the "_tagged.docx" file that injection.py saved for suitability_report.docx.
!python render.py suitability_report_tagged.docx

Rendering report for Bruce Wayne...
✅ Success! Report generated: suitability_report_tagged_FINAL.docx


In [5]:
%%writefile analyze_v2.py
import os
import json
import asyncio
import re
from docx import Document
from openai import AsyncOpenAI

# CONFIG
# Read once at import time; None when OPENAI_API_KEY is not set in the environment.
API_KEY = os.getenv("OPENAI_API_KEY")
# Endpoint handed to AsyncOpenAI as base_url.
BASE_URL = "https://openrouter.ai/api/v1"
# Model ID sent with every chat-completion request.
MODEL = "google/gemini-2.0-flash-exp:free"
# Size of the asyncio.Semaphore shared by all requests in main().
MAX_CONCURRENT = 10

# System message for every request; analyze_item() reads the "variables" key of the reply.
SYSTEM_PROMPT = """
You are a Financial Template Architect.
Identify dynamic variables (Names, Dates, Amounts) in the text.
Return JSON: { "variables": [{ "original_text": "...", "suggested_tag": "...", "type": "..." }] }
"""

def clean_json(content):
    """Return the reply with <think> blocks and Markdown code fences removed, trimmed."""
    think_block = re.compile(r'<think>.*?</think>', re.DOTALL)
    without_reasoning = think_block.sub('', content)
    # Drop the opening ```json marker (and the whitespace after it), then any bare fence.
    without_open_fence = re.sub(r'```json\s*', '', without_reasoning)
    without_fences = without_open_fence.replace('```', '')
    return without_fences.strip()

def iter_all_content(doc):
    """
    Yields every piece of text in the document as ``(content, label)`` pairs.

    ``content`` is a paragraph object for body, header and footer paragraphs
    and a plain "cell | cell" string for table rows; callers must handle both.

    Headers and footers that are linked to the previous section are skipped:
    they carry no content of their own, so visiting them would scan the same
    text again for every section (python-docx resolves a linked header/footer
    to the earlier definition - see its header/footer documentation).
    """
    # Body
    for i, p in enumerate(doc.paragraphs):
        yield p, f"Body Para {i}"

    # Tables
    for t_i, table in enumerate(doc.tables):
        for r_i, row in enumerate(table.rows):
            # Combine row text for context
            row_text = " | ".join([c.text.strip() for c in row.cells if c.text.strip()])
            if row_text:
                yield row_text, f"Table {t_i} Row {r_i}"

    # Headers/Footers
    for section in doc.sections:
        groups = (
            ("Header", (section.header, section.first_page_header)),
            ("Footer", (section.footer, section.first_page_footer)),
        )
        for kind, parts in groups:
            for part in parts:
                if part is None or part.is_linked_to_previous:
                    continue
                for i, p in enumerate(part.paragraphs):
                    yield p, f"{kind} Para {i}"

async def analyze_item(sem, client, text, source_label):
    """
    Ask the model for the dynamic variables in one piece of text.

    Args:
        sem: Semaphore bounding the number of concurrent requests.
        client: Object exposing ``chat.completions.create(...)`` as a coroutine.
        text: The text to analyze; anything shorter than 3 characters is skipped.
        source_label: Where the text came from; sent as context and stored on
            every returned variable as ``source_label``.

    Returns:
        The list under the reply's "variables" key, or [] when the text is too
        short or the request/parsing fails (the failure is reported on stderr).
    """
    # Skip empty or very short text before taking a semaphore slot.
    if len(text) < 3:
        return []

    async with sem:
        try:
            completion = await client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": f"Context: {source_label}\nText: '{text}'"}
                ],
                temperature=0.1
            )
            raw = completion.choices[0].message.content
            data = json.loads(clean_json(raw))

            # Add metadata
            found = data.get("variables", [])
            for v in found:
                v['source_label'] = source_label
            return found
        except Exception as e:
            # Best effort per item, but say why it failed: a bad key or model name
            # would otherwise be indistinguishable from "0 variables found".
            import sys
            print(f"[analyze_item] {source_label} failed: {type(e).__name__}: {e}", file=sys.stderr)
            return []

async def main(file_path):
    """
    Analyze every item yielded by iter_all_content() and save the detections.

    The flattened list of variables is written as JSON to the input path with
    ".docx" replaced by "_analysis_v2.json".
    """
    doc = Document(file_path)
    client = AsyncOpenAI(
        api_key=API_KEY,
        base_url=BASE_URL,
        default_headers={"HTTP-Referer": "https://colab.research.google.com"},
    )

    limiter = asyncio.Semaphore(MAX_CONCURRENT)

    print(f"Scanning deep structure of {file_path}...")

    def as_text(item):
        # Paragraph-like objects expose .text; table rows arrive as plain strings.
        if hasattr(item, 'text'):
            return item.text.strip()
        return str(item)

    pending = [
        analyze_item(limiter, client, as_text(item), label)
        for item, label in iter_all_content(doc)
    ]

    per_item = await asyncio.gather(*pending)
    flat_results = []
    for found in per_item:
        flat_results.extend(found)

    out_file = file_path.replace(".docx", "_analysis_v2.json")
    with open(out_file, "w") as f:
        json.dump(flat_results, f, indent=2)

    print(f"✅ Found {len(flat_results)} variables (including headers/footers).")

if __name__ == "__main__":
    import sys
    # Print usage instead of failing with IndexError when no path is given.
    if len(sys.argv) < 2:
        print("Usage: python analyze_v2.py <doc.docx>")
    else:
        asyncio.run(main(sys.argv[1]))

Writing analyze_v2.py


In [1]:
!pip install lxml



In [2]:
%%writefile normalizer.py
import zipfile
import os
import shutil
from lxml import etree
from copy import deepcopy

# Namespaces are critical in WordML
# Prefix -> URI map. The URIs are used to build "{uri}tag" names below, and the
# whole dict is passed as the namespace argument of find()/findall().
NS = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'xml': 'http://www.w3.org/XML/1998/namespace'
}

def clean_and_normalize_docx(input_path, output_path):
    """
    Write a copy of a .docx whose word/document.xml has been cleaned.

    1. Reads every member of the docx archive into memory
    2. Parses word/document.xml
    3. Removes "noise" nodes (proofErr, etc)
    4. Merges adjacent runs with identical formatting
    5. Writes all members, in their original order, into a new archive

    Nothing is extracted to disk, so no scratch directory is created in the
    working directory and nothing is left behind when the input is invalid or
    processing fails. ``output_path`` may be the same as ``input_path``.

    Args:
        input_path: Path of the source .docx.
        output_path: Path of the normalized .docx to create.

    Returns:
        None. Prints progress; prints an error and writes nothing when the
        archive has no word/document.xml.
    """
    import io

    doc_member = "word/document.xml"

    # 1. Load the archive (directory entries carry no data and are skipped)
    with zipfile.ZipFile(input_path, 'r') as z:
        members = [(info.filename, z.read(info)) for info in z.infolist() if not info.is_dir()]

    if not any(name == doc_member for name, _ in members):
        print("Error: Invalid docx (no document.xml found)")
        return

    # 2.-4. Process the main XML content; every other member is copied through untouched
    rewritten = []
    for name, data in members:
        if name == doc_member:
            tree = etree.parse(io.BytesIO(data))
            root = tree.getroot()

            # --- STEP A: SANITIZE (Remove Noise) ---
            print("  - Sanitizing XML nodes...")
            remove_noise_nodes(root)

            # --- STEP B: NORMALIZE (Merge Runs) ---
            print("  - Merging broken runs...")
            merge_adjacent_runs(root)

            buffer = io.BytesIO()
            tree.write(buffer, xml_declaration=True, encoding='UTF-8', standalone=True)
            data = buffer.getvalue()
        rewritten.append((name, data))

    # 5. Re-zip. The output is opened only after all processing succeeded, so a
    # failure above never leaves a truncated file behind.
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as z:
        for name, data in rewritten:
            z.writestr(name, data)

    print(f"✅ Normalized document saved to: {output_path}")

def remove_noise_nodes(root):
    """
    Strip markup that fragments text without changing what is displayed.

    Deletes every <w:proofErr>, <w:lastRenderedPageBreak>, <w:noProof> and
    <w:lang> element below ``root`` and removes RSID attributes from all
    elements. The tree is modified in place; nothing is returned.
    """
    w_uri = NS['w']

    # 1. Elements to drop, by local name:
    #    proofErr              - spell check markers
    #    lastRenderedPageBreak - page break markers
    #    noProof               - grammar skip markers
    #    lang                  - language tags (often cause splits)
    noise_names = ("proofErr", "lastRenderedPageBreak", "noProof", "lang")
    for name in noise_names:
        for node in root.findall(f".//{{{w_uri}}}{name}", NS):
            holder = node.getparent()
            if holder is None:
                continue
            # The element is removed as a whole; no tail text is re-attached here.
            holder.remove(node)

    # 2. Revision Save IDs change with every edit session and carry no content.
    rsid_suffixes = ('rsidR', 'rsidRPr', 'rsidRDefault')
    for node in root.iter():
        stale_keys = [key for key in node.attrib if key.endswith(rsid_suffixes)]
        for key in stale_keys:
            del node.attrib[key]

def merge_adjacent_runs(root):
    """
    The Core Logic: Greedy Merge.

    Iterates through paragraphs. If Run A and Run B are neighbors, have
    IDENTICAL properties (w:rPr) and are both plain text runs, B's text is
    appended to A and B is removed. The tree is modified in place.

    A run is merged only when it contains nothing but an optional <w:rPr> and
    exactly one <w:t>. Runs holding anything else (tabs, breaks, drawings,
    several text nodes, ...) are left untouched, because removing such a run
    would delete that content along with it.

    A missing <w:rPr> and an empty <w:rPr/> are treated as the same formatting.
    """
    w_uri = NS['w']
    run_tag = f"{{{w_uri}}}r"
    props_tag = f"{{{w_uri}}}rPr"
    text_tag = f"{{{w_uri}}}t"
    space_attr = f"{{{NS['xml']}}}space"

    def props_key(run):
        # Canonical XML so that equal properties compare equal regardless of
        # serialization details; "no rPr" and "empty rPr" both map to b"".
        props = run.find(props_tag, NS)
        if props is None or (len(props) == 0 and len(props.attrib) == 0):
            return b""
        return etree.tostring(props, method="c14n")

    def sole_text(run):
        # The run's <w:t> when the run is plain text, else None.
        texts = [child for child in run if child.tag == text_tag]
        others = [child for child in run if child.tag not in (text_tag, props_tag)]
        if len(texts) == 1 and not others:
            return texts[0]
        return None

    for p in root.findall(f".//{{{w_uri}}}p", NS):
        children = list(p)

        i = 0
        while i < len(children) - 1:
            current_node = children[i]
            next_node = children[i + 1]

            both_runs = current_node.tag == run_tag and next_node.tag == run_tag
            if both_runs and props_key(current_node) == props_key(next_node):
                curr_t = sole_text(current_node)
                next_t = sole_text(next_node)

                if curr_t is not None and next_t is not None:
                    # MERGE: append the text ...
                    curr_t.text = (curr_t.text or "") + (next_t.text or "")

                    # ... keep significant whitespace carried by the consumed run ...
                    if next_t.get(space_attr) == 'preserve':
                        curr_t.set(space_attr, 'preserve')

                    # ... and drop the consumed run. Stay at index i so the merged
                    # run is compared with its new neighbour next.
                    p.remove(next_node)
                    del children[i + 1]
                    continue

            # Move to next node
            i += 1

if __name__ == "__main__":
    import sys
    cli_args = sys.argv[1:]
    if cli_args:
        # The cleaned copy is written next to the input with a "_clean" suffix.
        source_docx = cli_args[0]
        clean_and_normalize_docx(source_docx, source_docx.replace(".docx", "_clean.docx"))
    else:
        print("Usage: python normalizer.py input.docx")

Writing normalizer.py


In [4]:
# Normalizes the original report; normalizer.py writes the result next to it
# with a "_clean.docx" suffix.
!python normalizer.py suitability_report.docx

  - Sanitizing XML nodes...
  - Merging broken runs...
✅ Normalized document saved to: suitability_report_clean.docx


In [21]:
# NOTE(review): the output recorded for this cell ("logic points") matches the
# analyze_v2.py written by the %%writefile cell further down, not the earlier one -
# run that cell first on a fresh kernel.
!python analyze_v2.py suitability_report.docx

Scanning deep structure of suitability_report.docx...
✅ Found 27 logic points.
Saved to suitability_report_analysis_v2.json


In [13]:
%%writefile analyze_v2.py
import os
import json
import asyncio
from docx import Document
from openai import AsyncOpenAI
from pydantic import BaseModel, Field
from typing import List, Optional

# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
# Read once at import time; None when OPENAI_API_KEY is not set in the environment.
API_KEY = os.getenv("OPENAI_API_KEY")
# Endpoint handed to AsyncOpenAI as base_url.
BASE_URL = "https://openrouter.ai/api/v1"
# Model ID sent with every chat-completion request.
MODEL = "google/gemini-2.0-flash-exp:free"
# Size of the asyncio.Semaphore shared by all requests in main().
MAX_CONCURRENT_REQUESTS = 10

# -----------------------------------------------------------------------------
# PYDANTIC SCHEMAS (STRUCTURED OUTPUT)
# -----------------------------------------------------------------------------
# NOTE(review): none of these models is referenced by the code visible in this cell;
# analyze_segment() stores the raw json.loads() result without validating it against
# AnalysisResult. Confirm whether validation was intended.
class Variable(BaseModel):
    """A single value from the text together with the template tag proposed for it."""
    original_text: str
    suggested_tag: str
    type: str = Field(..., description="text, money, date, or percentage")

class TableLoop(BaseModel):
    """Describes whether a table row should become a template loop, and over what."""
    is_loop: bool
    list_variable: Optional[str] = Field(None, description="e.g. 'pensions', 'assets'")
    item_variable: Optional[str] = Field(None, description="e.g. 'item'")
    columns_mapping: Optional[dict] = Field(None, description="{ '£50,000': 'item.value' }")

class Conditional(BaseModel):
    """Describes whether a segment is conditional and the expression guarding it."""
    is_conditional: bool
    condition_expression: Optional[str] = Field(None, description="Jinja2 expression e.g. 'client.has_mortgage'")

class AnalysisResult(BaseModel):
    """Top-level shape for one analyzed segment: variables plus optional loop/conditional info."""
    variables: List[Variable] = []
    table_loop: Optional[TableLoop] = None
    conditional: Optional[Conditional] = None

# System message for every request.
# NOTE(review): the prompt asks for JSON "matching the schema", but no schema text is
# included in it - the model never sees the field names. Confirm this is intended.
SYSTEM_PROMPT = """
You are a Document Architect. Analyze the text context.
1. Identify DYNAMIC VARIABLES (Names, Dates).
2. Detect LOOPS in tables (e.g., if a row looks like a list item).
3. Detect CONDITIONAL paragraphs (e.g., advice that only applies to specific clients).

Output valid JSON matching the schema.
"""

# -----------------------------------------------------------------------------
# ANALYSIS LOGIC
# -----------------------------------------------------------------------------
async def analyze_segment(sem, client, text, context_label, metadata):
    """
    Ask the model to analyze one text segment and return its JSON reply.

    Args:
        sem: Semaphore bounding the number of concurrent requests.
        client: Object exposing ``chat.completions.create(...)`` as a coroutine.
        text: The segment text; anything shorter than 3 characters is skipped.
        context_label: Where the text came from; sent as context and used in
            error messages.
        metadata: Location info stored on the result under the "metadata" key.

    Returns:
        The parsed JSON object with "metadata" added, or {} when the text is
        too short or the request/parsing fails (the failure is reported on
        stderr rather than dropped silently).
    """
    async with sem:
        if len(text) < 3:
            return {}
        try:
            completion = await client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": f"Context: {context_label}\nText: '{text}'"}
                ],
                temperature=0.1,
                response_format={"type": "json_object"}
            )
            # Parse JSON
            raw = completion.choices[0].message.content
            # Clean generic markdown if present
            raw = raw.replace("```json", "").replace("```", "")
            data = json.loads(raw)

            # Anything but a JSON object cannot carry the metadata key.
            if not isinstance(data, dict):
                raise ValueError(f"expected a JSON object, got {type(data).__name__}")

            # Merge Metadata
            data['metadata'] = metadata
            return data
        except Exception as e:
            # Best effort per segment, but make the cause visible: a bad key or
            # model name would otherwise just shrink the result list unnoticed.
            import sys
            print(f"[analyze_segment] {context_label} failed: {type(e).__name__}: {e}", file=sys.stderr)
            return {}

def iter_all_content(doc):
    """
    Yield ``(text, label, metadata)`` for body paragraphs and table rows.

    Every body paragraph is yielded (stripped, possibly empty). For tables the
    first row of each table is skipped, and rows without any non-empty cell are
    omitted. Headers and footers are not visited by this function.
    """
    # Body Paragraphs
    for para_no, paragraph in enumerate(doc.paragraphs):
        location = {"type": "paragraph", "index": para_no}
        yield paragraph.text.strip(), f"Body Para {para_no}", location

    # Body Tables
    for table_no, table in enumerate(doc.tables):
        for row_no, row in enumerate(table.rows):
            if row_no == 0:
                # Treated as the header row.
                continue
            stripped_cells = (cell.text.strip() for cell in row.cells)
            text = " | ".join([value for value in stripped_cells if value])
            if not text:
                continue
            location = {"type": "table_row", "table_index": table_no, "row_index": row_no}
            yield text, f"Table {table_no} Row {row_no}", location

async def main(file_path):
    """
    Analyze every segment yielded by iter_all_content() and save the non-empty results.

    The list is written as JSON to the input path with ".docx" replaced by
    "_analysis_v2.json".
    """
    doc = Document(file_path)
    client = AsyncOpenAI(
        api_key=API_KEY,
        base_url=BASE_URL,
        default_headers={"HTTP-Referer": "https://colab.research.google.com"},
    )

    limiter = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

    print(f"Scanning deep structure of {file_path}...")

    pending = [
        analyze_segment(limiter, client, text, label, meta)
        for text, label, meta in iter_all_content(doc)
    ]
    results = await asyncio.gather(*pending)

    # Skipped or failed segments come back as {}; keep only real results.
    clean_results = list(filter(None, results))

    out_file = file_path.replace(".docx", "_analysis_v2.json")
    with open(out_file, "w") as f:
        json.dump(clean_results, f, indent=2)

    print(f"✅ Found {len(clean_results)} logic points.")
    print(f"Saved to {out_file}")

if __name__ == "__main__":
    import sys
    # Print usage instead of failing with IndexError when no path is given.
    if len(sys.argv) < 2:
        print("Usage: python analyze_v2.py <doc.docx>")
    else:
        asyncio.run(main(sys.argv[1]))

Overwriting analyze_v2.py


In [9]:
%%writefile render_v2.py
import sys
from docxtpl import DocxTemplate

# -------------------------------------------------------
# MOCK DATA - This is what your "Database" would return
# -------------------------------------------------------
# Render context passed to DocxTemplate.render() by render(): scalar values, a nested
# "client" dict of boolean flags, and two lists of row dicts.
CONTEXT = {
    # 1. Basic Info
    "client_name": "Bruce Wayne",
    "report_date": "February 3, 2026",

    # 2. Conditionals (Try changing these to False to see sections vanish!)
    "client": {
        "has_mortgage": True,
        "is_high_risk": False,
        "is_retiring_soon": True
    },

    # 3. LOOPS (The Table Data)
    # The template expects 'pensions' or 'assets'.
    # Ensure this key matches what the Analyzer found in step 2.
    "pensions": [
        {"provider": "Wayne Corp Life", "value": "£1,500,000", "fee": "0.1%"},
        {"provider": "Gotham City Fund", "value": "£50,000", "fee": "0.5%"},
        {"provider": "Alfred Savings", "value": "£10,000", "fee": "0.0%"}
    ],

    "assets": [
        {"name": "Batcave", "value": "£100m"},
        {"name": "Manor", "value": "£50m"}
    ]
}

def render(template_path):
    """
    Render the template with CONTEXT and save it with a "_FINAL_RENDER.docx" suffix.

    Errors raised while rendering or saving are printed together with a hint;
    opening the template happens before the guarded section.
    """
    print(f"Rendering {template_path} with rich data...")
    template = DocxTemplate(template_path)

    try:
        template.render(CONTEXT)
        destination = template_path.replace(".docx", "_FINAL_RENDER.docx")
        template.save(destination)
        print(f"✅ Success! Generated: {destination}")
    except Exception as err:
        failure_lines = (
            f"❌ Render Error: {err}",
            "Tip: Check if the variable names in CONTEXT match the tags in the doc.",
        )
        for line in failure_lines:
            print(line)

if __name__ == "__main__":
    # Print usage instead of failing with IndexError when no template is given.
    if len(sys.argv) < 2:
        print("Usage: python render_v2.py <tagged_template.docx>")
    else:
        render(sys.argv[1])

Writing render_v2.py
