# Advanced Financial Document Analysis Assistant (v13 - Conceptual)

**Goal:** Extract, analyze, and report on financial information from various documents with user-selectable strategies and AI enhancements.

**Structure:** This notebook is divided into logical cells:
1.  **Setup & Configuration:** Imports, constants, global state.
2.  **Text Extraction Helpers:** Functions to get text from PDF, DOCX, URL, TXT.
3.  **Parsing Strategy Helpers:** Functions for Regex, Standard QA, Chunked QA, Table (Placeholder), Hybrid (Placeholder), NER (Placeholder).
4.  **Estimation & AI Analysis Helpers:** Functions for estimating missing metrics, AI text analysis (summarization, zero-shot theme classification, sentiment), and placeholders for advanced analysis.
5.  **Report Generation Helpers:** Functions for HTML report, NLG (Placeholder), Charting (Placeholder), Export (Placeholders).
6.  **Model Loading Trigger:** Button to explicitly load AI models.
7.  **UI Widget Setup:** Definition of all `ipywidgets`.
8.  **UI Event Handlers:** Functions triggered by button clicks, uploads, etc.
9.  **UI Layout & Display:** Assembling and displaying the final UI.

**Instructions:**
1. Run Cell 1 first to set up imports and configurations.
2. Run Cells 2-5 to define helper functions.
3. Run Cell 6 and click the button to load AI models (requires internet, takes time).
4. Run Cell 7 to define UI widgets.
5. Run Cell 8 to define UI event handlers.
6. Run Cell 9 to display the application UI.
7. Interact with the UI (load doc, parse, verify, generate report).

In [None]:
# %%-- Cell 1: Setup & Configuration --%%
# --- Imports ---
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML
import math, re, time, warnings, html, os, io
from collections import defaultdict
import requests 
import pandas as pd 

# --- Optional Libs (handle import errors) ---
# Availability flags for optional third-party libraries. Each flag stays False
# unless the matching import below succeeds, so later cells can degrade
# gracefully instead of raising NameError/ImportError.
PDF_AVAILABLE, DOCX_AVAILABLE, HTML_AVAILABLE, CAMELOT_AVAILABLE, PLT_AVAILABLE, FPDF_AVAILABLE, TOKENIZER_AVAILABLE = False, False, False, False, False, False, False

# PDF text. Module names are case-sensitive: the package is importable as
# `PyPDF2` (or its successor `pypdf`), never as `pypdf2`. It is aliased to
# `pypdf2` because the rest of the notebook refers to it under that name.
try:
    try:
        import PyPDF2 as pypdf2
    except ImportError:
        import pypdf as pypdf2
    PDF_AVAILABLE = True
    print("INFO: pypdf2 found for PDF text.")
except ImportError:
    print("WARN: pypdf2 not found, PDF upload disabled.")

# DOCX text.
try:
    import docx
    DOCX_AVAILABLE = True
    print("INFO: python-docx found for DOCX text.")
except ImportError:
    print("WARN: python-docx not found, DOCX upload disabled.")

# HTML parsing for URL sources.
try:
    from bs4 import BeautifulSoup
    HTML_AVAILABLE = True
    print("INFO: BeautifulSoup found for HTML parsing.")
except ImportError:
    print("WARN: BeautifulSoup not found, HTML/URL parsing limited.")

# PDF table extraction.
try:
    import camelot
    CAMELOT_AVAILABLE = True
    print("INFO: camelot-py found for PDF tables (experimental).")
except ImportError:
    print("WARN: camelot-py not found, PDF table extraction disabled.")

# Charting (BytesIO/base64 are imported alongside for embedding images).
try:
    import matplotlib.pyplot as plt
    from io import BytesIO
    import base64
    PLT_AVAILABLE = True
    print("INFO: Matplotlib found.")
except ImportError:
    print("WARN: Matplotlib not found, charts disabled.")

# PDF export.
try:
    from fpdf import FPDF
    FPDF_AVAILABLE = True
    print("INFO: fpdf2 found for PDF export.")
except ImportError:
    print("WARN: fpdf2 not found, PDF export disabled.")

# Tokenizer used for token-aware chunking.
try:
    from transformers import AutoTokenizer
    TOKENIZER_AVAILABLE = True
    print("INFO: AutoTokenizer found.")
except ImportError:
    print("WARN: AutoTokenizer not found.")
    
# --- Config Constants ---
# Model names 
# NOTE(review): these look like Hugging Face Hub model ids consumed by the
# model-loading cell, which is not part of this section — confirm there.
MODEL_QA = "deepset/roberta-base-squad2" 
MODEL_SUMMARIZER = "sshleifer/distilbart-cnn-12-6"
MODEL_ZERO_SHOT = "facebook/bart-large-mnli"
MODEL_SENTIMENT = "ProsusAI/finbert"
MODEL_NLG = "gpt2" # Example, choose appropriate model

# Parsing Strategy Options & Default
# Human-readable labels for the selectable parsing strategies; several of the
# parsing helpers return the matching label together with their results.
PARSE_STRATEGY_STANDARD = "Standard AI QA (Fastest AI)"
PARSE_STRATEGY_CHUNKED = "Full Document AI QA (Slow, Experimental)"
PARSE_STRATEGY_HYBRID = "Hybrid (AI Text + Regex/Table Numbers)"
PARSE_STRATEGY_TABLE = "Table Extraction First (Experimental)" 
PARSE_STRATEGY_REGEX = "Regex Only (Fastest, Numbers Only)"
PARSE_STRATEGY_NER = "Financial NER/RE (Future)" # Placeholder
DEFAULT_PARSE_STRATEGY = PARSE_STRATEGY_STANDARD

# QA Model Configuration
# _parse_standard_qa only passes the first QA_CONTEXT_TRUNCATION_CHARS
# characters of the document to the QA model.
QA_CONTEXT_TRUNCATION_CHARS = 4000 
# Chunk size and overlap (in characters) handed to _create_text_chunks by
# _parse_chunked_qa; they drive its character-based chunking path.
QA_CHUNK_SIZE_CHARS = 2500       
QA_CHUNK_OVERLAP_CHARS = 500      
# QA answers are kept only when their score is strictly above this value.
QA_SCORE_THRESHOLD = 0.03 # Lowered threshold        

# Zero-Shot Classification Config
# Candidate labels passed to the zero-shot pipeline when classifying driver text.
ZERO_SHOT_LABELS = ["Volume/Demand", "Pricing/Mix", "Cost Control", "M&A", "FX/Rates", "Capex", "WC", "Debt/Financing", "Product/Service", "Market/Comp.", "Inflation", "Supply Chain", "Restructuring"]
# Only labels scoring strictly above this value are reported as themes.
ZERO_SHOT_CONFIDENCE_THRESHOLD = 0.40 

# Text Analysis Config
# Driver text is cut to this many characters before zero-shot classification.
TEXT_ANALYSIS_MAX_CHARS_ZS = 500 
# Driver text is split into pieces of this many characters for sentiment scoring.
TEXT_ANALYSIS_MAX_CHARS_SENTIMENT = 450 

# --- Dependency Check for Transformers ---
# Probes for a deep-learning backend (PyTorch first, TensorFlow as the
# fallback) and for the `transformers` pipeline API. TRANSFORMERS_AVAILABLE
# only becomes True when every step in the try-body succeeds; any failure is
# reported with a WARNING print and leaves the flag False.
TRANSFORMERS_AVAILABLE = False
try:
    try: import torch; print("INFO: PyTorch backend found.")
    except ImportError: import tensorflow; print("INFO: TensorFlow backend found.")
    from transformers import pipeline
    # NOTE(review): building a pipeline without an explicit model presumably
    # loads (and may download) the library's default sentiment model every time
    # this cell runs — confirm that cost is acceptable for a "basic check".
    _ = pipeline('sentiment-analysis') # Basic check
    TRANSFORMERS_AVAILABLE = True
    print("INFO: 'transformers' library seems functional.")
# Missing backend or missing `transformers` package.
except ImportError as e: print(f"WARNING: Dependency issue ({e}). AI features disabled.")
# Anything else raised while constructing the probe pipeline.
except Exception as e: print(f"WARNING: Transformer init failed: {e}. AI features disabled.")

# --- Global State --- Use a class for cleaner state management
class AppState:
    """Mutable container for everything the notebook shares between cells.

    One instance holds the loaded document, the parsed/estimated figures, the
    analysis output, the rendered report and the AI pipeline handles.
    """

    def __init__(self):
        # --- Loaded document ---
        self.document_name = None
        self.document_text = None
        self.document_tables = []  # List of pandas DataFrames

        # --- Parsing / estimation / analysis results ---
        # Unknown metrics read as None instead of raising KeyError.
        self.parsed_data = defaultdict(lambda: None)
        self.estimation_log = []
        self.analysis_results = {}
        self.current_report_html = ""

        # --- AI models ---
        # One slot per pipeline; every slot starts out empty (None).
        pipeline_slots = ("qa", "summarizer", "zero_shot", "sentiment", "text-generation")
        self.pipelines = dict.fromkeys(pipeline_slots)
        self.models_loaded = False
        self.model_load_error = False
        self.qa_tokenizer = None

# Single shared state object. Helper functions in the later cells read
# app_state.pipelines and app_state.qa_tokenizer from it.
app_state = AppState() # Instantiate the state object

# --- Status Widgets --- (Should be defined in Cell 7, but declare here for early use if needed)
# HTML widgets used as status lines. The parsing/analysis helpers write HTML
# snippets to the `.value` of whatever status widget they are given.
# NOTE(review): which of these labels each helper receives is decided by the
# UI cells, which are not part of this section — confirm there.
load_status_label = widgets.HTML(value="Status: Models not loaded.")
parse_status_label = widgets.HTML(value="Status: Ready.")
analysis_status_label = widgets.HTML(value="Status: Ready.")
custom_qa_status_label = widgets.HTML(value="")

# Suppress warnings
# UserWarnings raised from the 'transformers' module and all FutureWarnings are hidden.
warnings.filterwarnings("ignore", category=UserWarning, module='transformers') 
warnings.filterwarnings("ignore", category=FutureWarning) 

print("-" * 20, "Cell 1: Setup Complete", "-" * 20)

In [None]:
# %%-- Cell 2: Helper Functions - Text Extraction & Formatting --%%

def extract_text_from_pdf(file_content):
    """Extract the text of every page of a PDF.

    Args:
        file_content: Raw PDF bytes.

    Returns:
        The page texts, each followed by a newline, joined into one string.
        On failure a human-readable string starting with "Error" or
        "Warning" is returned instead; this function never raises.
    """
    if not PDF_AVAILABLE:
        return "Error: pypdf2 library not installed."
    try:
        pdf_reader = pypdf2.PdfReader(io.BytesIO(file_content))
        # `or ""` keeps the join safe should a page yield no text object.
        page_texts = [(page.extract_text() or "") for page in pdf_reader.pages]
    except Exception as e:
        return f"Error extracting PDF text: {e}"
    text = "".join(page_text + "\n" for page_text in page_texts)
    # Every page contributes at least "\n", so the joined string is non-empty
    # for any PDF with pages; test for real content, not for emptiness.
    if not text.strip():
        return "Warning: No text extracted from PDF (maybe image-based?)."
    return text

def extract_text_from_docx(file_content):
    """Return the paragraph texts of a DOCX file joined by newlines.

    `file_content` is the raw file as bytes. Failures are reported as a string
    starting with "Error" rather than raised.
    """
    if not DOCX_AVAILABLE:
        return "Error: python-docx library not installed."
    try:
        document = docx.Document(io.BytesIO(file_content))
        paragraph_texts = []
        for paragraph in document.paragraphs:
            paragraph_texts.append(paragraph.text)
        return "\n".join(paragraph_texts)
    except Exception as e:
        return f"Error extracting DOCX text: {e}"

def fetch_and_extract_html(url):
    """Download `url` and return `(visible_text, raw_html)`.

    Script and style elements are removed before the text is collected. When
    something goes wrong the first element is an "Error"/"Warning" message;
    the second element is whatever HTML had been fetched by then (None when
    BeautifulSoup is unavailable, "" when the request itself failed).
    """
    if not HTML_AVAILABLE:
        return "Error: BeautifulSoup library not installed.", None
    raw_html = ""
    try:
        # Basic user agent
        reply = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        reply.raise_for_status()  # bad status codes become exceptions
        raw_html = reply.text  # kept so tables can be extracted later
        parsed = BeautifulSoup(reply.content, 'html.parser')
        for unwanted_tag in parsed(["script", "style"]):
            unwanted_tag.decompose()
        page_text = parsed.get_text(separator='\n', strip=True)
        if page_text:
            return page_text, raw_html
        return "Warning: No text extracted from URL.", raw_html
    except Exception as e:
        return f"Error fetching/parsing URL: {e}", raw_html

def extract_text_from_txt(file_content):
    """Decode the bytes of a plain-text file, guessing the encoding.

    Encodings are tried from strictest to most permissive:

    * ``utf-8-sig`` - UTF-8, dropping a leading byte-order mark if present;
    * ``cp1252``    - Windows Western (maps 0x80-0x9F to smart quotes etc.);
    * ``latin-1``   - accepts every byte, so it must come last; placed any
      earlier it would make the encodings after it unreachable.

    Args:
        file_content: File content as bytes, bytearray or memoryview.

    Returns:
        The decoded text, or a string starting with "Error" when the input
        cannot be decoded. This function never raises.
    """
    try:
        # memoryview has no .decode(); normalise it to bytes first.
        if isinstance(file_content, memoryview):
            file_content = file_content.tobytes()
        for encoding in ('utf-8-sig', 'cp1252', 'latin-1'):
            try:
                return file_content.decode(encoding)
            except UnicodeDecodeError:
                continue
        return "Error: Could not decode TXT file with common encodings."
    except Exception as e:
        return f"Error reading TXT file: {e}"

# --- Formatting Functions ---
def format_currency(v):
    """Format a raw amount as millions, e.g. 1_500_000 -> "$1.5mm".

    Amounts whose magnitude is below 0.05mm are shown as "$0.0mm". Anything
    that is not a finite int/float (None, strings, NaN, infinity, or an int
    too large to convert to float) yields "$N/A".
    """
    try:
        # The finiteness checks sit inside the try because math.isnan/isinf
        # raise OverflowError for ints too large to convert to float.
        if not isinstance(v, (int, float)) or math.isnan(v) or math.isinf(v):
            return "$N/A"
        v_mm = v / 1e6
    except (OverflowError, ValueError, TypeError):
        return "$N/A"
    return f"${v_mm:,.1f}mm" if abs(v_mm) >= 0.05 else "$0.0mm"
def format_percentage(v):
    """Format a number as a percentage with one decimal, e.g. 12.34 -> "12.3%".

    Positive or negative infinity is shown as "Infinite %". Anything that is
    not an int/float, is NaN, or is an int too large to convert to float
    yields "N/A%".
    """
    try:
        # math.isnan/isinf raise OverflowError for ints too large to convert
        # to float, so the checks live inside the try.
        if not isinstance(v, (int, float)) or math.isnan(v):
            return "N/A%"
        if math.isinf(v):
            return "Infinite %"
        return f"{v:.1f}%"
    except (OverflowError, ValueError, TypeError):
        return "N/A%"
def format_leverage(v):
    """Format a leverage multiple with one decimal, e.g. 2.56 -> "2.6x".

    Anything that is not a finite int/float (None, strings, NaN, infinity, or
    an int too large to convert to float) yields "N/Ax".
    """
    try:
        # math.isnan/isinf raise OverflowError for ints too large to convert
        # to float, so the checks live inside the try.
        if not isinstance(v, (int, float)) or math.isnan(v) or math.isinf(v):
            return "N/Ax"
        return f"{v:.1f}x"
    except (OverflowError, ValueError, TypeError):
        return "N/Ax"

print("-" * 20, "Cell 2: Text Extraction & Formatting Helpers Defined", "-" * 20)

In [None]:
# %%-- Cell 3: Helper Functions - Parsing Strategies --%%

# --- Regex Parsing Functions ---
def extract_financial_figure_regex(text, keywords):
    """Fallback: use regexes to find the figure most likely tied to `keywords`.

    Every whole-word, case-insensitive occurrence of every keyword is located
    and all numbers within 250 characters on either side are collected. The
    number closest to the *anchor* is returned, preferring numbers that start
    at or after it. The anchor is the first occurrence of the first keyword
    (in list order) that appears in the text.

    Number handling:
        * thousands separators are optional ("2500" and "2,500" both work);
        * a '-' directly before the match or an opening parenthesis before the
          digits marks the value as negative;
        * "billion"/"bn" multiply by 1e3 and "thousand"/"k" by 1e-3, so figures
          carrying a unit are expressed in millions; figures without a unit
          are returned as written.

    Args:
        text: Document text to search.
        keywords: Keyword strings in priority order.

    Returns:
        The selected value as a float, or None when `text`/`keywords` are
        empty or no number is found near any keyword.
    """
    if not isinstance(text, str) or not text:
        return None
    keywords = [kw for kw in (keywords or []) if isinstance(kw, str) and kw]
    if not keywords:
        return None

    # `(?:\d{1,3}(?:,\d{3})+|\d+)` accepts comma-grouped and plain digit runs;
    # a bare `\d{1,3}` would clip "2500" down to its last three digits.
    # The named `paren` group records an accounting-style "(" even when the
    # match itself starts with whitespace or a currency symbol.
    num_pattern = re.compile(
        r'[\$€£]?\s?(?P<paren>\()?'
        r'(?P<num>(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?)'
        r'\)?\s*(?P<unit>million|billion|thousand|mn|bn|k)?\b',
        re.IGNORECASE,
    )
    window = 250
    figures = []   # (absolute position of the digits, value)
    anchor = None  # position that proximity is measured from

    for kw in keywords:
        kw_pattern = re.compile(r'\b' + re.escape(kw) + r'\b', re.IGNORECASE)
        for kw_match in kw_pattern.finditer(text):
            kw_start = kw_match.start()
            if anchor is None:
                anchor = kw_start
            seg_start = max(0, kw_start - window)
            seg = text[seg_start:kw_start + window]
            for nm in num_pattern.finditer(seg):
                match_start = nm.start()
                negative = (
                    (match_start > 0 and seg[match_start - 1] == '-')
                    or nm.group('paren') is not None
                )
                value = float(nm.group('num').replace(',', ''))
                if negative:
                    value = -value
                unit = (nm.group('unit') or '').lower()
                if 'bil' in unit or unit == 'bn':
                    value *= 1e3
                elif 'tho' in unit or unit == 'k':
                    value *= 1e-3
                figures.append((seg_start + nm.start('num'), value))

    if not figures:
        return None
    # Figures can only come from a keyword hit, so `anchor` is set here.
    following = [fig for fig in figures if fig[0] >= anchor]
    candidates = following or figures
    return min(candidates, key=lambda fig: abs(fig[0] - anchor))[1]

def _parse_regex_only(text, status_widget):
    """Run the Regex Only strategy: one keyword-based lookup per metric.

    Progress is reported through `status_widget.value`. Returns a tuple of
    (defaultdict mapping metric name to value or None, "Regex Only").
    """
    status_widget.value = "<span style='color: blue;'>Status: Running Regex...</span>"
    print("Regex Only Strategy...")

    # Search terms per metric, in the order the metrics are looked up.
    metric_terms = (
        ('revenue_current', ["revenue", "total revenue", "net sales"]),
        ('ebitda_current', ["ebitda", "adjusted ebitda"]),
        ('fcf_current', ["free cash flow", "fcf"]),
        ('cogs_current', ["cost of goods", "cogs"]),
        ('opex_current', ["operating expenses", "opex", "sg&a"]),
        ('da_current', ["depreciation", "amortization"]),
        ('op_income_current', ["operating income", "ebit"]),
        ('ocf_current', ["operating activities", "cash from operations"]),
        ('capex_current', ["capital expenditure", "capex"]),
        ('debt_total_current', ["total debt", "borrowings"]),
        ('leverage_current', ["leverage ratio", "net debt to ebitda"]),
    )

    found = defaultdict(lambda: None)
    for metric, terms in metric_terms:
        found[metric] = extract_financial_figure_regex(text, terms)

    print("Regex Only Notice: Qualitative & prior periods need manual input.")
    status_widget.value = "<span style='color: green;'>Status: Regex Complete.</span>"
    return found, "Regex Only"

# --- AI QA Parsing Functions ---
def parse_financial_answer(ans, is_qual=False, is_qualitative=None):
    """Turn a QA model answer into text (qualitative) or a number.

    Args:
        ans: Answer string returned by the QA model.
        is_qual: When True, the answer is returned as stripped text.
        is_qualitative: Alias for `is_qual`. Callers in this notebook pass the
            flag under this name; when given (not None) it takes precedence.

    Returns:
        * None for an empty answer or when no number can be found;
        * the stripped answer text for qualitative answers;
        * otherwise a float. "billion"/"bn" multiply by 1e3 and
          "thousand"/"k" by 1e-3, so answers carrying a unit are expressed in
          millions; answers without a unit are returned as written.
          Parenthesised amounts such as "(5.0)" are treated as negative.
    """
    if is_qualitative is not None:
        is_qual = is_qualitative
    if not ans:
        return None
    if is_qual:
        return ans.strip()

    # Normalise: drop "approx.", currency symbols and currency words, and
    # turn accounting-style "(x)" into "-x".
    txt = ans.lower().replace('approx.', '').strip()
    txt = re.sub(r'[\$€£]', '', txt).strip()
    txt = re.sub(r'\s+(dollars|euros|pounds)\b', '', txt)
    txt = txt.replace('(', '-').replace(')', '')

    # `(?:\d{1,3}(?:,\d{3})+|\d+)` accepts comma-grouped and plain digit runs;
    # a bare `\d{1,3}` would clip "2500" to "250".
    m = re.search(
        r'([-+]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?)\s*(million|billion|thousand|mn|bn|k)?',
        txt,
    )
    if not m:
        return None
    v_str, unit = m.group(1), (m.group(2) or "")
    try:
        v = float(v_str.replace(',', ''))
    except ValueError:
        return None
    if 'bil' in unit or unit == 'bn':
        v *= 1e3
    elif 'tho' in unit or unit == 'k':
        v *= 1e-3
    return v

def _run_qa_on_context(qa_pipe, questions, context, status_widget, context_label="context"):
    """Ask every question variant against one context and keep the best answers.

    Args:
        qa_pipe: Callable QA pipeline; called with `question`, `context` and
            `handle_impossible_answer=True`, expected to return a mapping with
            'answer' and 'score'.
        questions: Mapping of metric key -> list of question phrasings.
        context: Text the questions are asked against.
        status_widget: Object whose `.value` receives HTML progress updates.
        context_label: Name of the context used in progress messages.

    Returns:
        defaultdict mapping each metric key to its highest-scoring parsed
        answer, or None when no answer scored above QA_SCORE_THRESHOLD.
        Keys containing "text" are parsed as qualitative text, all others as
        numbers.
    """
    extracted = defaultdict(lambda: None)
    q_total = sum(len(variants) for variants in questions.values())
    q_ran = 0
    print(f"Running {q_total} QA variations on {context_label}...")
    for key, q_list in questions.items():
        best_ans = None
        best_score = -1
        is_qual = "text" in key
        for q in q_list:
            q_ran += 1
            if q_ran % 10 == 0:
                status_widget.value = f"<span style='color: blue;'>Status: QA ({q_ran}/{q_total} in {context_label})...</span>"
            try:
                res = qa_pipe(question=q, context=context, handle_impossible_answer=True)
                ans, score = res.get('answer'), res.get('score', 0)
                if ans and score > QA_SCORE_THRESHOLD:
                    # Passed positionally: parse_financial_answer names this
                    # parameter `is_qual`; an unknown keyword raises TypeError,
                    # which the handler below would swallow for every answer.
                    parsed = parse_financial_answer(ans, is_qual)
                    if parsed is not None and score > best_score:
                        best_score = score
                        best_ans = parsed
            except Exception as e:
                # Best effort: one failing question must not abort the run.
                print(f"QA Error q='{q}': {e}")
            time.sleep(0.01)
        extracted[key] = best_ans
    return extracted

def _get_qa_questions():
    """Returns the dictionary of questions for the QA model."""
    # ... (Implementation from previous version with refined questions) ...
    q = defaultdict(list)
    q_fin = ['revenue_current', 'revenue_prior', 'ebitda_current', 'ebitda_prior', 'fcf_current', 'fcf_prior', 'cogs_current', 'cogs_prior', 'opex_current', 'opex_prior', 'da_current', 'da_prior', 'op_income_current', 'op_income_prior', 'ocf_current', 'ocf_prior', 'capex_current', 'capex_prior', 'debt_total_current', 'leverage_current']
    q_qual = ['revenue_drivers_text', 'ebitda_drivers_text', 'fcf_drivers_text', 'leverage_drivers_text']
    for k in q_fin + q_qual: q[k].append(f"What was the {k.replace('_', ' ').replace('text','explanation').replace('current','current period').replace('prior','prior year period')}?")
    q['revenue_current'].extend(["Net sales current period?", "Latest revenue figure?", "Total revenue?"]); q['revenue_prior'].extend(["Net sales prior year?", "Comparable prior revenue?", "Revenue last year?"])
    q['ebitda_current'].extend(["Adjusted EBITDA current period?", "Latest EBITDA?", "How much was EBITDA?"]); q['ebitda_prior'].extend(["Adjusted EBITDA prior year?", "Prior EBITDA?", "EBITDA last year?"])
    q['fcf_current'].extend(["Free cash flow current period?", "Latest FCF?"]); q['fcf_prior'].extend(["Free cash flow prior year?", "Prior FCF?"])
    q['revenue_drivers_text'].extend(["Explain revenue change.", "Factors impacting sales?", "What drove revenue?", "Describe revenue performance."])
    q['ebitda_drivers_text'].extend(["Explain EBITDA change.", "What affected margins?", "Drivers for profitability?", "Describe EBITDA performance."])
    q['fcf_drivers_text'].extend(["Explain FCF change.", "Reasons for cash flow?", "What impacted FCF?", "Describe free cash flow drivers."])
    q['leverage_drivers_text'].extend(["Explain leverage change.", "Reasons for net debt change?", "Factors affecting debt?", "Describe leverage drivers."])
    q['opex_current'].append("SG&A expense current period?"); q['opex_prior'].append("SG&A expense prior year?")
    q['ocf_current'].append("Cash flow from operations current period?"); q['ocf_prior'].append("Cash flow from operations prior year?")
    q['capex_current'].append("Capital expenditures current period?"); q['capex_prior'].append("Capital expenditures prior year?")
    q['leverage_current'].append("Net debt to EBITDA ratio end of period?")
    return q

def _parse_standard_qa(text, status_widget):
    """Run the Standard AI QA strategy on the beginning of the document.

    Only the first QA_CONTEXT_TRUNCATION_CHARS characters of `text` are shown
    to the QA model. Progress goes to `status_widget.value`. Returns
    (extracted data, PARSE_STRATEGY_STANDARD). Raises AssertionError when the
    QA pipeline has not been loaded.
    """
    status_widget.value = "<span style='color: blue;'>Status: Running Standard AI QA...</span>"
    print("Standard AI QA Strategy...")

    qa_pipe = app_state.pipelines.get("qa")
    assert qa_pipe, "QA model not loaded."

    leading_text = text[:QA_CONTEXT_TRUNCATION_CHARS]
    question_bank = _get_qa_questions()
    answers = _run_qa_on_context(
        qa_pipe,
        question_bank,
        leading_text,
        status_widget,
        context_label="truncated context",
    )

    answered = 0
    for value in answers.values():
        if value is not None:
            answered += 1
    print(f"Standard AI QA Finished. Found {answered} potential answers.")
    status_widget.value = "<span style='color: green;'>Status: Standard AI QA Complete.</span>"
    return answers, PARSE_STRATEGY_STANDARD

def _create_text_chunks(text, chunk_chars, overlap_chars):
    """Split `text` into overlapping chunks.

    When `app_state.qa_tokenizer` is set, chunks are cut on token boundaries:
    each holds up to `model_max_length - 50` tokens and consecutive chunks
    overlap by a quarter of that. If the tokenizer cannot provide offsets (or
    produces no usable chunk), the character-based path is used instead.

    Args:
        text: Text to split.
        chunk_chars: Chunk length in characters (character path only).
        overlap_chars: Characters shared by consecutive chunks (character
            path only).

    Returns:
        List of chunk strings; an empty list for empty `text`.
    """
    if not text:
        return []

    tokenizer = app_state.qa_tokenizer
    if tokenizer:  # Use tokenizer if available
        print("Using tokenizer for chunking...")
        try:
            max_tokens = max(1, tokenizer.model_max_length - 50)  # Buffer
            # add_special_tokens=False: special tokens carry a (0, 0) offset,
            # which would make the last chunk end at character 0 (i.e. empty).
            encoded = tokenizer(
                text,
                return_offsets_mapping=True,
                add_special_tokens=False,
                truncation=False,
            )
            offsets = encoded['offset_mapping']
            # Advance by three quarters of a window, but always by >= 1 token.
            stride = max(1, max_tokens - max_tokens // 4)
            token_chunks = []
            first = 0
            while first < len(offsets):
                last = min(first + max_tokens - 1, len(offsets) - 1)
                piece = text[offsets[first][0]:offsets[last][1]]
                if piece:
                    token_chunks.append(piece)
                if last >= len(offsets) - 1:
                    break  # the window already reached the end of the text
                first += stride
            if token_chunks:
                return token_chunks
        except Exception as e:
            # e.g. a tokenizer that cannot return offset mappings
            print(f"Tokenizer chunking failed ({e}); using character-based chunking instead.")

    # Fallback character chunking
    print("Using character-based chunking...")
    size = max(1, chunk_chars)
    # A non-positive step (overlap >= chunk size) would never terminate.
    step = max(1, chunk_chars - overlap_chars)
    chunks = []
    start = 0
    while True:
        chunks.append(text[start:start + size])
        if start + size >= len(text):
            break
        start += step
    return chunks
    
def _parse_chunked_qa(text, status_widget):
    """Run the Full Document AI QA strategy over overlapping chunks.

    Every question variant is asked against every chunk of `text`; for each
    metric the parsed answer with the highest score across all chunks wins.

    Args:
        text: Full document text.
        status_widget: Object whose `.value` receives HTML progress updates.

    Returns:
        (defaultdict of metric key -> best answer, PARSE_STRATEGY_CHUNKED).
        Only metrics with an answer scoring above QA_SCORE_THRESHOLD are
        stored; other keys read as None.

    Raises:
        AssertionError: If the QA pipeline has not been loaded.
    """
    status_widget.value = "<span style='color: blue;'>Status: Running Full Doc QA (Chunking)... SLOW!</span>"
    print("Full Doc AI QA Strategy...")
    qa_pipe = app_state.pipelines.get("qa")
    assert qa_pipe, "QA model not loaded."

    text_chunks = _create_text_chunks(text, QA_CHUNK_SIZE_CHARS, QA_CHUNK_OVERLAP_CHARS)
    print(f"Created {len(text_chunks)} chunks. Processing...")
    questions = _get_qa_questions()
    q_total_chunk = sum(len(variants) for variants in questions.values())
    print(f"Est. total QA calls: ~{q_total_chunk * len(text_chunks)} (SLOW!)")

    # Best (answer, score) seen so far per metric, across all chunks.
    agg_res = defaultdict(lambda: {'answer': None, 'score': -1})
    for i, chunk_ctx in enumerate(text_chunks):
        print(f"\nProcessing Chunk {i+1}/{len(text_chunks)}...")
        status_widget.value = f"<span style='color: blue;'>Status: Chunk {i+1}/{len(text_chunks)}... (SLOW)</span>"
        q_ran_chunk = 0
        for key, q_list in questions.items():
            is_qual = "text" in key
            for q in q_list:
                q_ran_chunk += 1
                if q_ran_chunk % 10 == 0:
                    status_widget.value = f"<span style='color: blue;'>Status: Chunk {i+1} QA ({q_ran_chunk}/{q_total_chunk})...</span>"
                try:
                    res = qa_pipe(question=q, context=chunk_ctx, handle_impossible_answer=True)
                    ans, score = res.get('answer'), res.get('score', 0)
                    if ans and score > QA_SCORE_THRESHOLD:
                        # Passed positionally: parse_financial_answer names
                        # this parameter `is_qual`; an unknown keyword raises
                        # TypeError, which the handler below would swallow.
                        parsed = parse_financial_answer(ans, is_qual)
                        if parsed is not None and score > agg_res[key]['score']:
                            agg_res[key]['score'] = score
                            agg_res[key]['answer'] = parsed
                except Exception as e:
                    # Best effort: one failing question must not abort the run.
                    print(f"QA Error q='{q}', chunk={i+1}: {e}")
                time.sleep(0.01)

    final_extracted = defaultdict(lambda: None)
    print("\n--- Aggregated Best Answers --- ")
    found_count = 0
    for key, res_data in agg_res.items():
        if res_data['answer'] is not None:
            final_extracted[key] = res_data['answer']
            found_count += 1
            # Long qualitative answers are abbreviated to "(text)" in the log.
            print(f"  - {key}: {'(text)' if 'text' in key and len(str(res_data['answer']))>50 else res_data['answer']} (Best Score: {res_data['score']:.3f})")
    print("--- End Aggregation ---")
    print(f"\nFull Doc QA Finished. Aggregated {found_count} values.")
    status_widget.value = "<span style='color: green;'>Status: Full Doc AI QA Complete.</span>"
    return final_extracted, PARSE_STRATEGY_CHUNKED

# --- Table Parsing (Placeholders) ---
def _extract_tables_pdf(filepath):
    """Extract every table from a PDF file with Camelot.

    Args:
        filepath: Path of the PDF file on disk.

    Returns:
        (list of DataFrames, status). Status is "OK" on success,
        "Camelot N/A" when Camelot is not installed, or "Camelot Error: ..."
        when extraction fails (the list is empty in both failure cases).
    """
    if not CAMELOT_AVAILABLE:
        return [], "Camelot N/A"
    try:
        tables = camelot.read_pdf(filepath, pages='all')
        # The parsing report belongs to each extracted table, so it is printed
        # per table rather than read from the table list.
        for idx, tbl in enumerate(tables, start=1):
            print(f"Camelot Report (table {idx}): {tbl.parsing_report}")
        return [tbl.df for tbl in tables], "OK"
    except Exception as e:
        return [], f"Camelot Error: {e}"
def _extract_tables_html(html_content): 
    try: tables = pd.read_html(io.StringIO(html_content)); print(f"Pandas found {len(tables)} HTML tables."); return tables, "OK"
    except Exception as e: return [], f"Pandas HTML Table Error: {e}"
def _parse_tables(tables):
    """Placeholder: Logic to analyze extracted DataFrames."""
    print("WARN: Table parsing logic is basic placeholder."); parsed = defaultdict(lambda: None)
    # TODO: Implement robust table identification and data extraction
    return parsed

# --- Hybrid Parsing (Placeholder) ---
def _parse_hybrid(text, tables, status_widget):
    """Placeholder for the hybrid strategy (QA for text, regex/tables for numbers).

    Nothing is extracted yet: the status widget is updated, and an empty
    defaultdict (missing keys read as None) is returned together with
    PARSE_STRATEGY_HYBRID.
    """
    print("WARN: Hybrid parsing logic is basic placeholder.")
    status_widget.value = "<span style='color: orange;'>Status: Hybrid (Experimental)...</span>"

    # Holders for the three sources the hybrid logic is meant to merge.
    qualitative_hits, regex_numbers, table_numbers = {}, {}, {}
    # TODO: Implement hybrid logic
    merged = defaultdict(lambda: None)

    status_widget.value = "<span style='color: green;'>Status: Hybrid Parse Attempt Done.</span>"
    return merged, PARSE_STRATEGY_HYBRID

# --- NER/RE Placeholder ---
def _parse_ner_re(text, status_widget):
    """Placeholder for NER/relation-extraction parsing; not implemented.

    Flags the strategy as unavailable on the status widget and returns an
    empty defaultdict (missing keys read as None) with PARSE_STRATEGY_NER.
    """
    print("WARN: NER/RE parsing not implemented.")
    status_widget.value = "<span style='color: orange;'>Status: NER/RE N/A.</span>"
    nothing_extracted = defaultdict(lambda: None)
    return nothing_extracted, PARSE_STRATEGY_NER

print("-" * 20, "Cell 3: Parsing Strategy Helpers Defined", "-" * 20)

In [None]:
# %%-- Cell 4: Helper Functions - Estimation & AI Analysis --%%

def _perform_estimations(data_dict):
    """Attempts to estimate key metrics if components are available."""
    final = dict(data_dict); log = []; est_done = False
    print("Attempting estimations...") 
    def _valid(*a): return all(isinstance(x,(int,float)) and x is not None and not math.isnan(x) for x in a)
    
    # Est EBITDA(C)
    if final.get('ebitda_current') is None:
        opinc, da, rev, cogs, opex = (final.get(k) for k in ['op_income_current', 'da_current', 'revenue_current', 'cogs_current', 'opex_current'])
        if _valid(opinc, da): 
            est = opinc + da; final['ebitda_current'] = est; log.append(f"Est. EBITDA(C)=OpInc({opinc:.1f})+D&A({da:.1f})={est:.1f}"); est_done=True
        elif _valid(rev, cogs, opex): 
            base = rev - cogs - opex; da_val = da if _valid(da) else 0; est = base + da_val; final['ebitda_current'] = est; log.append(f"Est. EBITDA(C)=Rev-Exp+(D&A:{da_val:.1f})={est:.1f}"); est_done=True
            
    # Est EBITDA(P) 
    if final.get('ebitda_prior') is None:
         opinc_p, da_p = (final.get(k) for k in ['op_income_prior', 'da_prior'])
         if _valid(opinc_p, da_p): 
             est = opinc_p + da_p; final['ebitda_prior'] = est; log.append(f"Est. EBITDA(P)=OpInc_p({opinc_p:.1f})+D&A_p({da_p:.1f})={est:.1f}"); est_done=True
             
    # Est FCF(C) 
    if final.get('fcf_current') is None:
         ocf, capex = (final.get(k) for k in ['ocf_current', 'capex_current'])
         if _valid(ocf, capex): 
              est = ocf - abs(capex); final['fcf_current'] = est; log.append(f"Est. FCF(C)=OCF({ocf:.1f})-Capex({abs(capex):.1f})={est:.1f}"); est_done=True
              
    # Est FCF(P) 
    if final.get('fcf_prior') is None:
         ocf_p, capex_p = (final.get(k) for k in ['ocf_prior', 'capex_prior'])
         if _valid(ocf_p, capex_p): 
              est = ocf_p - abs(capex_p); final['fcf_prior'] = est; log.append(f"Est. FCF(P)=OCF_p({ocf_p:.1f})-Capex_p({abs(capex_p):.1f})={est:.1f}"); est_done=True
              
    # Est Lev(C) 
    if final.get('leverage_current') is None:
        debt, ebitda = (final.get(k) for k in ['debt_total_current', 'ebitda_current']) 
        if _valid(debt, ebitda) and abs(ebitda or 0) > 1e-6: 
            est = debt / ebitda 
            if 0 < est < 25: # Sanity check
                final['leverage_current'] = est
                log.append(f"Est. Lev(C)=Debt({debt:.1f})/EBITDA({ebitda:.1f})={est:.1f}x") 
                est_done = True
            else: 
                log.append(f"Est. Lev(C) skipped (ratio {est:.1f}x unreal.)")
                
    # Log estimation results
    if est_done: print("\n--- Estimations Performed ---"); [print(f"- {l}") for l in log]; print("---")
    else: print("\nNo estimations performed.")
    return final, log # Return final data and log

# --- AI Text Analysis Function ---
# (Include analyze_drivers_transformer function from previous version)
def analyze_drivers_transformer(text, status_widget):
    """Summarise, theme-classify and sentiment-score a piece of driver text.

    Uses the "summarizer", "zero_shot" and "sentiment" pipelines stored on
    `app_state`. Returns an HTML <ul> fragment with Summary, Themes and
    Sentiment items, or a short italic note when the models are not loaded,
    the text is empty/"n/a", or the analysis fails. A failing step shows
    "<i>Error</i>" for that item only.
    """
    # Use app_state.pipelines instead of global pipelines
    for needed in ("summarizer", "zero_shot", "sentiment"):
        if not app_state.pipelines.get(needed):
            return "<i>(AI analysis models not loaded)</i>"
    if not text or text.strip().lower() == 'n/a':
        return "<i>(No text)</i>"

    findings = {}
    status_widget.value = "<span style='color: blue;'>Status: Analyzing text...</span>"
    try:  # Wrap entire analysis in try/except
        # Summarization
        try:
            summary_out = app_state.pipelines['summarizer'](text, max_length=80, min_length=15, do_sample=False)
            findings["Summary"] = f"<i>{html.escape(summary_out[0]['summary_text'])}</i>"
        except Exception as e:
            print(f"Summary fail: {e}")
            findings["Summary"] = "<i>Error</i>"

        # Zero-Shot (on the leading part of the text only)
        try:
            zs_input = text[:TEXT_ANALYSIS_MAX_CHARS_ZS]
            if not zs_input.strip():
                raise ValueError("Empty ZS text")
            zs_out = app_state.pipelines['zero_shot'](zs_input, candidate_labels=ZERO_SHOT_LABELS, multi_label=True)
            confident = [
                (label, score)
                for label, score in zip(zs_out['labels'], zs_out['scores'])
                if score > ZERO_SHOT_CONFIDENCE_THRESHOLD
            ]
            confident = sorted(confident, key=lambda pair: pair[1], reverse=True)
            if confident:
                findings["Themes"] = html.escape(", ".join([f"{label} ({score:.0%})" for label, score in confident]))
            else:
                findings["Themes"] = "<i>None detected</i>"
        except Exception as e:
            print(f"Zero-shot fail: {e}")
            findings["Themes"] = "<i>Error</i>"

        # Sentiment (scored piece by piece, then majority label wins)
        try:
            piece_len = TEXT_ANALYSIS_MAX_CHARS_SENTIMENT
            labels = []
            confidences = []
            for offset in range(0, len(text), piece_len):
                piece = text[offset:offset + piece_len]
                if not piece.strip():
                    continue
                verdict = app_state.pipelines['sentiment'](piece)[0]
                labels.append(verdict['label'])
                confidences.append(verdict['score'])
            if labels:
                top_label = max(set(labels), key=labels.count)
                # Average confidence over the pieces that voted for the winner.
                avg_conf = sum(c for c, lab in zip(confidences, labels) if lab == top_label) / labels.count(top_label)
                findings["Sentiment"] = f"{html.escape(top_label.capitalize())} ({avg_conf:.1%})"
            else:
                findings["Sentiment"] = "<i>N/A</i>"
        except Exception as e:
            print(f"Sentiment fail: {e}")
            findings["Sentiment"] = "<i>Error</i>"
    except Exception as e:
        print(f"Error during text analysis: {e}")
        return "<i>(Error during AI text analysis)</i>"

    # Format
    fragment = "<ul style='margin:0; padding-left:20px; font-size:0.9em;'>"
    if "Summary" in findings:
        fragment += f"<li><b>Summary:</b> {findings['Summary']}</li>"
    if "Themes" in findings:
        fragment += f"<li><b>Themes:</b> {findings['Themes']}</li>"
    if "Sentiment" in findings:
        fragment += f"<li><b>Sentiment:</b> {findings['Sentiment']}</li></ul>"
    if findings:
        return fragment
    return "<i>(Analysis failed)</i>"

# --- Advanced Analysis Placeholders ---
def _link_drivers_to_metrics(data_dict):
     """Placeholder: Advanced NLP to link qualitative sentences to metric changes."""
     print("WARN: Driver linking not implemented."); return {}

def _perform_consistency_checks(data_dict):
     """Placeholder: Check consistency between text/table/estimates."""
     print("WARN: Consistency checks not implemented."); return []

print("-" * 20, "Cell 4: Estimation & Analysis Helpers Defined", "-" * 20)

In [None]:
# %%-- Cell 5: Helper Functions - Report Generation & Visualization --%%

def _generate_report_html(data_dict, qual_analysis_results, analysis_method):
    """Generate the final HTML report string from verified/final data.

    Args:
        data_dict: Mapping read via ``.get`` for ``revenue_*``, ``ebitda_*``,
            ``fcf_*`` and ``leverage_*`` (``_current`` / ``_prior``) plus the
            four ``*_drivers_text`` entries.
        qual_analysis_results: Mapping from "Rev" / "EBITDA" / "FCF" / "Lev"
            to an HTML snippet holding the text analysis for that section.
            May be None or empty.
        analysis_method: Label of the text-analysis method, shown under the
            report title.

    Returns:
        The report as an HTML string, or a short "Report Error" fragment if
        the numeric calculations raise.
    """
    print("Generating HTML report...")
    qual_analysis_results = qual_analysis_results or {}

    # Get values safely from data_dict (which should hold the verified/final data)
    rev_c, rev_p = data_dict.get('revenue_current'), data_dict.get('revenue_prior')
    ebitda_c, ebitda_p = data_dict.get('ebitda_current'), data_dict.get('ebitda_prior')
    fcf_c, fcf_p = data_dict.get('fcf_current'), data_dict.get('fcf_prior')
    lev_c, lev_p = data_dict.get('leverage_current'), data_dict.get('leverage_prior')
    # Qualitative text comes straight from data_dict (should be verified text)
    qual_in = {
        "Rev": data_dict.get('revenue_drivers_text', "N/A"),
        "EBITDA": data_dict.get('ebitda_drivers_text', "N/A"),
        "FCF": data_dict.get('fcf_drivers_text', "N/A"),
        "Lev": data_dict.get('leverage_drivers_text', "N/A"),
    }

    def is_num(x):
        """True for a real int/float that is not NaN."""
        return isinstance(x, (int, float)) and not math.isnan(x)

    # Calculations
    try:
        def calc_chg(c, p):
            """Return (percent change vs. prior, absolute delta), or (None, None)."""
            if not (is_num(c) and is_num(p)):
                return None, None
            delta = c - p
            if abs(p) < 1e-9:
                # A zero prior gives no base for a percentage: report +/-inf (or 0).
                pct = float('inf') if c > 1e-9 else (float('-inf') if c < -1e-9 else 0.0)
            else:
                pct = delta / abs(p) * 100
            return pct, delta

        rev_pc, _ = calc_chg(rev_c, rev_p)
        ebitda_pc, _ = calc_chg(ebitda_c, ebitda_p)
        fcf_pc, _ = calc_chg(fcf_c, fcf_p)
        lev_d = (lev_c - lev_p) if is_num(lev_c) and is_num(lev_p) else None
        margin_c = (ebitda_c / rev_c * 100) if rev_c and abs(rev_c) > 1e-9 and ebitda_c is not None else None
        margin_p = (ebitda_p / rev_p * 100) if rev_p and abs(rev_p) > 1e-9 and ebitda_p is not None else None
    except Exception as e:
        return f"<h2>Report Error</h2><p>Calc error: {html.escape(str(e))}</p>"

    def yoy_item(label, pct, cur, prior):
        """List item describing a year-over-year percentage move."""
        if abs(pct) < 0.05:
            direction = "~flat"
        else:
            direction = "increased" if pct >= 0 else "decreased"
        return (f"<li>{label} {direction} <strong>{format_percentage(abs(pct))}</strong> YoY to "
                f"<strong>{format_currency(cur)}</strong> (from {format_currency(prior)}).</li>")

    def fmt_qual(lbl, k):
        """List item with the escaped driver text plus any AI analysis block."""
        txt = qual_in.get(k, "N/A")
        analysis = qual_analysis_results.get(k) or ""
        if analysis and "<i>(" not in analysis and "N/A" not in analysis:
            block = f"<div style='margin-left:15px; margin-top:3px; font-size:0.9em; color:#333; border-left: 2px solid #eee; padding-left: 5px;'><b>AI Analysis:</b> {analysis}</div>"
        elif "<i>(" in analysis:
            # Placeholder/error notes (italic, parenthesised) are shown without the box.
            block = analysis
        else:
            block = ""
        esc_txt = html.escape(txt) if txt else "N/A"
        txt_disp = f"<p style='margin:2px 0; white-space:pre-wrap;'>{esc_txt}</p>"
        return f"<li><b>{lbl}:</b> {txt_disp}{block}</li>"

    # HTML Generation
    out = f"<h2>Financial Performance Summary</h2><p style='font-size:0.9em; color:grey;'><i>(Text analysis: {html.escape(str(analysis_method))})</i></p>"

    # Revenue
    out += "<h3>Revenue</h3><ul>"
    s1 = "<li>Data incomplete.</li>"
    if rev_pc is not None:
        if abs(rev_p) < 1e-9:
            s1 = f"<li>Rev: <strong>{format_currency(rev_c)}</strong>.</li>"
        else:
            s1 = yoy_item("Rev", rev_pc, rev_c, rev_p)
    elif rev_c is not None:
        s1 = f"<li>Current Rev: <strong>{format_currency(rev_c)}</strong>.</li>"
    out += s1 + fmt_qual("Drivers/Factors", "Rev") + "</ul>"

    # EBITDA
    out += "<h3>EBITDA & Margin</h3><ul>"
    s1 = "<li>Data incomplete.</li>"
    s2 = "<li>Margin N/A.</li>"
    if ebitda_pc is not None:
        if abs(ebitda_p) < 1e-9:
            s1 = f"<li>EBITDA: <strong>{format_currency(ebitda_c)}</strong>.</li>"
        else:
            s1 = yoy_item("EBITDA", ebitda_pc, ebitda_c, ebitda_p)
        if margin_c is not None and margin_p is not None:
            margin_d = margin_c - margin_p
            bps = margin_d * 100  # percentage points -> basis points
            if abs(margin_d) < 0.05:
                s2 = f"<li>Margin stable ~{format_percentage(margin_c)}.</li>"
            else:
                s2 = f"<li>Margin {'expanded' if margin_d>0 else 'contracted'} {abs(bps):.0f} bps YoY to <strong>{format_percentage(margin_c)}</strong>.</li>"
        elif margin_c is not None:
            s2 = f"<li>Current margin: {format_percentage(margin_c)}.</li>"
    elif ebitda_c is not None:
        # Only the current period is known: report it, plus the margin when available.
        s1 = f"<li>Current EBITDA: <strong>{format_currency(ebitda_c)}</strong>.</li>"
        if margin_c is not None:
            s2 = f"<li>Current margin: {format_percentage(margin_c)}.</li>"
    out += s1 + s2 + fmt_qual("Drivers/Factors", "EBITDA") + "</ul>"

    # FCF
    out += "<h3>Free Cash Flow</h3><ul>"
    s1 = "<li>Data incomplete.</li>"
    if fcf_pc is not None:
        neg_p = fcf_p < -1e-9
        neg_c = fcf_c < -1e-9
        zero_p = abs(fcf_p) < 1e-9
        # Sign flips and negative bases make a percentage misleading, so
        # those cases are described in words instead.
        if neg_p and not neg_c:
            s1 = f"<li>FCF improved to <strong>{format_currency(fcf_c)}</strong> (from {format_currency(fcf_p)}).</li>"
        elif not neg_p and neg_c:
            s1 = f"<li>FCF declined to <strong>{format_currency(fcf_c)}</strong> (from {format_currency(fcf_p)}).</li>"
        elif zero_p:
            s1 = f"<li>FCF: <strong>{format_currency(fcf_c)}</strong>.</li>"
        elif neg_p and neg_c:
            d = "improved" if fcf_c > fcf_p else "worsened"
            s1 = f"<li>FCF {d} to <strong>{format_currency(fcf_c)}</strong> (from {format_currency(fcf_p)}).</li>"
        else:
            s1 = yoy_item("FCF", fcf_pc, fcf_c, fcf_p)
    elif fcf_c is not None:
        s1 = f"<li>Current FCF: <strong>{format_currency(fcf_c)}</strong>.</li>"
    out += s1 + fmt_qual("Drivers/Factors", "FCF") + "</ul>"

    # Leverage
    out += "<h3>Leverage</h3><ul>"
    s1 = "<li>Data incomplete.</li>"
    if lev_d is not None:
        delta = abs(lev_d)
        if delta < 0.05:
            s1 = f"<li>Leverage stable ~<strong>{format_leverage(lev_c)}</strong>.</li>"
        elif lev_d > 0:
            s1 = f"<li>Leverage increased {delta:.1f}x YoY to <strong>{format_leverage(lev_c)}</strong>.</li>"
        else:
            s1 = f"<li>Leverage decreased {delta:.1f}x YoY to <strong>{format_leverage(lev_c)}</strong>.</li>"
    elif lev_c is not None:
        s1 = f"<li>Current Leverage: <strong>{format_leverage(lev_c)}</strong>.</li>"
    out += s1 + fmt_qual("Drivers/Factors", "Lev") + "</ul>"

    # --- Add Visualization --- (Basic Example)
    if PLT_AVAILABLE:
        out += "<br><h4>YoY Visualizations (Basic)</h4>"
        out += _generate_yoy_chart("Revenue", rev_c, rev_p)
        out += _generate_yoy_chart("EBITDA", ebitda_c, ebitda_p)
        # Add more charts as needed

    return out

def _generate_report_nlg(data_dict, analysis_results, status_widget):
    """Placeholder: use the text-generation pipeline for a narrative summary.

    Args:
        data_dict: Mapping read via ``.get`` for the ``*_current`` /
            ``*_prior`` revenue, EBITDA, FCF and leverage values.
        analysis_results: Mapping whose "Rev" and "EBITDA" entries are added
            to the prompt; may be None.
        status_widget: Widget whose ``value`` is updated with progress HTML.

    Returns:
        An HTML fragment with the generated narrative, or an "NLG Error"
        fragment when the model is not loaded or generation fails.
    """
    status_widget.value = "<span style='color: blue;'>Status: Generating NLG Summary...</span>"
    nlg_pipe = app_state.pipelines.get("text-generation") # Use state
    if not nlg_pipe:
        # Do not leave the status stuck on "Generating..." when nothing will run.
        status_widget.value = "<span style='color: red;'>Status: NLG Error.</span>"
        return "<h2>NLG Error</h2><p>Text Generation model not loaded. Load one (e.g., 'gpt2') via `load_models`.</p>"
    analysis_results = analysis_results or {}
    try:
        # Simple prompt construction - needs refinement!
        prompt = f"Summarize financial performance:\nRevenue: {format_currency(data_dict.get('revenue_current'))} vs {format_currency(data_dict.get('revenue_prior'))}\nEBITDA: {format_currency(data_dict.get('ebitda_current'))} vs {format_currency(data_dict.get('ebitda_prior'))}\nFCF: {format_currency(data_dict.get('fcf_current'))} vs {format_currency(data_dict.get('fcf_prior'))}\nLeverage: {format_leverage(data_dict.get('leverage_current'))} vs {format_leverage(data_dict.get('leverage_prior'))}\nRevenue Drivers: {analysis_results.get('Rev', '')}\nEBITDA Drivers: {analysis_results.get('EBITDA', '')}\nSummary:"
        # Truncate prompt if too long for model
        max_prompt_len = 512 # Example limit (characters, not tokens)
        truncated_prompt = prompt[:max_prompt_len]

        result = nlg_pipe(truncated_prompt, max_new_tokens=150, num_return_sequences=1, temperature=0.7, do_sample=True)
        generated = result[0]['generated_text']
        # Strip only the echoed prompt at the start; a blanket replace() would
        # also delete any later repetition of the prompt inside the narrative.
        if generated.startswith(truncated_prompt):
            generated = generated[len(truncated_prompt):]
        narrative = generated.strip()
        status_widget.value = "<span style='color: green;'>Status: NLG Summary Generated.</span>"
        return f"<h2>AI Generated Narrative (Experimental)</h2><p>{html.escape(narrative)}</p>"
    except Exception as e:
        print(f"NLG Error: {e}")
        status_widget.value = "<span style='color: red;'>Status: NLG Error.</span>"
        # Escape the message: exception text may contain '<' or '&'.
        return f"<h2>NLG Error</h2><p>{html.escape(str(e))}</p>"

def _generate_yoy_chart(metric_name, current_val, prior_val):
    """Generate a simple prior-vs-current bar chart as an inline base64 ``<img>``.

    Args:
        metric_name: Label used in the chart title and the image alt text.
        current_val: Current-period value; plotted divided by 1e6.
        prior_val: Prior-period value; plotted divided by 1e6.

    Returns:
        An ``<img>`` tag with the PNG embedded as a data URI, or "" when
        plotting is unavailable, an input is not a real number, or drawing
        fails.
    """
    # Local imports keep this helper independent of what the setup cell imported.
    import base64
    from io import BytesIO

    if not PLT_AVAILABLE or not all(isinstance(x, (int, float)) and not math.isnan(x) for x in (current_val, prior_val)):
        return ""
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(3, 2.5))
        labels = ['Prior', 'Current']
        values = [prior_val / 1e6 if prior_val else 0, current_val / 1e6 if current_val else 0] # Plot in millions
        colors = ['#B0B0B0', '#4682B4'] # Grey, SteelBlue
        ax.bar(labels, values, color=colors, width=0.6)
        ax.set_ylabel('$ Millions', fontsize=9)
        ax.set_title(f'{metric_name} YoY', fontsize=10)
        ax.tick_params(axis='both', which='major', labelsize=8)
        ax.spines['top'].set_visible(False); ax.spines['right'].set_visible(False) # Cleaner look
        fig.tight_layout()
        buf = BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight')
        img_64 = base64.b64encode(buf.getvalue()).decode('utf-8')
        # Escape the label because it is placed inside an HTML attribute.
        alt_text = html.escape(str(metric_name))
        return f'<img src="data:image/png;base64,{img_64}" alt="{alt_text} Chart" style="margin: 5px;"/>'
    except Exception as e:
        print(f"Chart Error {metric_name}: {e}")
        return ""
    finally:
        # Close the figure on the error path too, so failed charts do not
        # accumulate open figures.
        if fig is not None:
            plt.close(fig)

# --- Export Functions ---
def _export_report_markdown(html_content):
    """Basic conversion of HTML report to Markdown (requires html2text)."""
    try: 
        import html2text
        h = html2text.HTML2Text(); h.ignore_links = True; h.ignore_images = True # Configure
        md = h.handle(html_content)
        md_path = "financial_report.md"
        with open(md_path, 'w', encoding='utf-8') as f: f.write(md)
        return f"Report saved to {md_path}"
    except ImportError: return "Markdown export requires `html2text`. Install it (`pip install html2text`)."
    except Exception as e: return f"Markdown conversion error: {e}"
    
def _export_report_pdf(html_content):
    """Export the report to ``financial_report.pdf`` using FPDF (basic HTML only).

    The HTML is first passed to ``write_html``; if that raises, the text is
    stripped of markup and written as plain text instead.

    Returns:
        A status message: the saved path on success, a notice when FPDF is
        not installed, or the error text when the export fails.
    """
    if not FPDF_AVAILABLE:
        return "FPDF library not installed."
    try:
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Arial", size=10)
        # Attempt to write basic HTML (fpdf2 has limited support)
        # For better results, convert HTML->MD first, then MD->PDF or use Weasyprint
        try:
            pdf.write_html(html_content) # May fail on complex HTML
        except Exception as html_err:
            # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
            print(f"PDF HTML rendering failed, falling back to plain text: {html_err}")
            if HTML_AVAILABLE:
                plain_text = BeautifulSoup(html_content, 'html.parser').get_text()
            else:
                # BeautifulSoup is optional; strip tags crudely without it.
                plain_text = html.unescape(re.sub(r"<[^>]+>", "", html_content))
            pdf.multi_cell(0, 5, "HTML Conversion Failed - Raw Text:\n\n" + plain_text) # Fallback
        pdf_path = "financial_report.pdf"
        pdf.output(pdf_path, "F")
        return f"Basic PDF report saved to {pdf_path}."
    except Exception as e:
        return f"Error exporting PDF: {e}"

# Banner printed when this cell finishes running, confirming the Cell 5 helpers are defined.
print("-" * 20, "Cell 5: Report Generation Helpers Defined", "-" * 20)

In [None]:
# %%-- Cell 6: Model Loading Trigger --%%

# Provides a button in the notebook output to explicitly load models after setup.
load_models_button_cell6 = widgets.Button(description="Load AI Models (if not loaded)", button_style='info', icon='download')
# Output area in which the click handler below prints its messages.
load_models_output_cell6 = widgets.Output()

def on_load_click_cell6(b):
    """Click handler for the Cell 6 button: start model loading.

    Everything printed while loading is captured by
    ``load_models_output_cell6``; ``b`` (the clicked button) is unused.
    """
    with load_models_output_cell6:
        # Discard the messages of any previous load attempt first.
        clear_output(wait=True)
        print("Model loading requested...")
        # load_models reports progress through the shared status label.
        load_models(load_status_label)

# Register the handler so that a click triggers model loading.
load_models_button_cell6.on_click(on_load_click_cell6)

# Display the button for this cell's output, with its message area stacked below it
display(widgets.VBox([load_models_button_cell6, load_models_output_cell6]))

# Banner printed when this cell finishes running; loading itself only starts on click.
print("-" * 20, "Cell 6: Model Loading Cell Complete (Click button above)", "-" * 20)

In [None]:
# %%-- Cell 7: UI Widget Setup --%%

# Define all widgets used in the UI layout
# Progress message shown while this cell runs.
print("Defining UI widgets...")

# --- Styles & Layouts ---
# Shared objects passed as `style=` / `layout=` to the widgets defined below.
style = {'description_width': 'initial'}; 
# Used by the current/prior FloatText inputs, which are placed pairwise in an HBox.
layout_half = widgets.Layout(width='48%') 
# Used by the drivers/notes Textarea inputs.
layout_text = widgets.Layout(width='98%', height='100px') 

# --- Section 1 Widgets: Input & Parsing ---
# Document sources: a single-file upload, or a URL plus a fetch button.
file_upload = widgets.FileUpload(accept='.txt,.pdf,.docx,.html', multiple=False, description="Upload Doc") 
url_input = widgets.Text(description="Or Enter URL:", placeholder="http://...", layout=widgets.Layout(width='60%'))
fetch_url_btn = widgets.Button(description="Fetch URL", icon='download', button_style='info', tooltip="Fetch content from URL")
# HTML label that the load handlers update with the name/size of the loaded document.
data_source_label = widgets.HTML(value="<i>No document loaded yet.</i>")
# Strategy choices come from the PARSE_STRATEGY_* constants (defined outside this cell).
parse_strategy_selector = widgets.RadioButtons(options=[PARSE_STRATEGY_STANDARD, PARSE_STRATEGY_CHUNKED, PARSE_STRATEGY_HYBRID, PARSE_STRATEGY_TABLE, PARSE_STRATEGY_REGEX, PARSE_STRATEGY_NER], value=DEFAULT_PARSE_STRATEGY, description='Parsing Strategy:', style=style, layout=widgets.Layout(width='max-content'))
parse_btn = widgets.Button(description="Parse Document", button_style='info', icon='cogs', tooltip="Extract data using selected strategy")
parse_out = widgets.Output() # Output for parsing status messages
# Interactive follow-up question: text box plus "Ask" button.
custom_q_in = widgets.Text(description="Follow-up Q:", placeholder="Ask AI about the loaded text...", layout=widgets.Layout(width='70%'), style=style)
custom_q_btn = widgets.Button(description="Ask", button_style='primary', tooltip="Run custom question")
# ***** Definition for custom_q_out (custom QA output area) *****
custom_q_out = widgets.Output() # Output for custom QA results
# custom_qa_status_label defined globally in Cell 1

# --- Section 2 Widgets: Data Input/Verification ---
# Per metric: a current (C) and prior (P) numeric input plus a free-text
# drivers/notes area. Per the descriptions, amounts are in $mm and leverage in x.
rev_c_in = widgets.FloatText(description="Rev (C, $mm):", style=style, layout=layout_half)
rev_p_in = widgets.FloatText(description="Rev (P, $mm):", style=style, layout=layout_half)
rev_drv_in = widgets.Textarea(description="Rev Drivers/Notes:", placeholder="(Auto-populated/Manual)", style=style, layout=layout_text)
ebitda_c_in = widgets.FloatText(description="EBITDA (C, $mm):", style=style, layout=layout_half)
ebitda_p_in = widgets.FloatText(description="EBITDA (P, $mm):", style=style, layout=layout_half)
ebitda_drv_in = widgets.Textarea(description="EBITDA Drivers/Notes:", placeholder="(Auto/Manual)", style=style, layout=layout_text)
fcf_c_in = widgets.FloatText(description="FCF (C, $mm):", style=style, layout=layout_half)
fcf_p_in = widgets.FloatText(description="FCF (P, $mm):", style=style, layout=layout_half)
fcf_drv_in = widgets.Textarea(description="FCF Drivers/Notes:", placeholder="(Auto/Manual)", style=style, layout=layout_text)
lev_c_in = widgets.FloatText(description="Leverage (C, x):", style=style, layout=layout_half)
lev_p_in = widgets.FloatText(description="Leverage (P, x):", style=style, layout=layout_half)
lev_drv_in = widgets.Textarea(description="Leverage Drivers/Notes:", placeholder="(Auto/Manual)", style=style, layout=layout_text)
# Optional free-text benchmark data.
benchmark_input_area = widgets.Textarea(description="Benchmark Data (Optional):", placeholder="Enter industry or historical data...", layout=widgets.Layout(width='98%', height='60px'), style=style)

# --- Section 3 Widgets: Report Generation & Export ---
# Action buttons; their click handlers are defined in Cell 8.
gen_report_btn = widgets.Button(description="Generate Report", button_style='success', icon='file-text', tooltip="Generate analysis report") # Updated icon
nlg_report_btn = widgets.Button(description="Generate NLG Summary (Experimental)", icon='magic', tooltip="Use AI to generate narrative text")
export_md_btn = widgets.Button(description="Export MD", icon='file-code', tooltip="Export report as Markdown") # Updated icon
export_pdf_btn = widgets.Button(description="Export PDF", icon='file-pdf', tooltip="Export basic PDF report") # Updated icon
report_out = widgets.Output() # Output area for the final report & export messages

# Banner printed when this cell finishes running.
print("-" * 20, "Cell 7: UI Widgets Defined", "-" * 20)

In [None]:
# %%-- Cell 8: UI Event Handlers --%%

# --- Input Handlers ---
def on_upload_change(change):
    """React to a new file upload: extract its text and, where supported, tables.

    The extractor is chosen from the file extension (.pdf, .docx, .html/.htm,
    .txt). Results are stored on ``app_state`` and the preview and status
    widgets are refreshed. An empty ``change['new']`` is ignored.
    """
    uploads = change['new']
    if not uploads:
        return

    uploaded_file = uploads[0]
    filename = uploaded_file['name']
    content = uploaded_file['content']
    print(f"Processing uploaded file: {filename} ({len(content)} bytes)")
    data_source_label.value = f"<i>Processing: {filename}</i>"

    # Reset the state of any previously loaded document.
    app_state.document_name = filename
    app_state.document_text = None
    app_state.document_tables = []

    text = f"Error processing file: {filename}"
    tables = []
    lowered = filename.lower()

    if lowered.endswith('.pdf'):
        text = extract_text_from_pdf(content)
        # The table extractor takes a path, so the bytes go through a temp file.
        temp_pdf_path = "temp_uploaded_file.pdf"
        try:
            with open(temp_pdf_path, 'wb') as pdf_file:
                pdf_file.write(content)
            tables, msg = _extract_tables_pdf(temp_pdf_path)
            print(msg)
            app_state.document_tables = tables
        except Exception as e:
            print(f"PDF Table extraction failed: {e}")
        finally:
            if os.path.exists(temp_pdf_path):
                os.remove(temp_pdf_path)
    elif lowered.endswith('.docx'):
        text = extract_text_from_docx(content)
    elif lowered.endswith(('.html', '.htm')):
        # Decode with the plain-text extractor first, then strip the markup.
        html_str = extract_text_from_txt(content)
        if html_str.startswith("Error:"):
            text = html_str
        else:
            try:
                text = BeautifulSoup(html_str, 'html.parser').get_text(separator='\n', strip=True)
            except Exception as e:
                text = f"HTML text parse error: {e}"
            tables, msg = _extract_tables_html(html_str)
            print(msg)
            app_state.document_tables = tables
    elif lowered.endswith('.txt'):
        text = extract_text_from_txt(content)
    else:
        text = f"Error: Unsupported file type '{filename}'."

    app_state.document_text = text
    pr_input.value = text[:20000] # Update preview
    data_source_label.value = f"<i>Loaded: {filename} ({len(text)} chars, {len(tables)} tables)</i>"
    parse_status_label.value = "Status: Ready to parse."
    clear_output(wait=True)

def on_fetch_url_click(b):
    """Fetch the URL typed into ``url_input`` and load it as the document.

    Only http(s) URLs are accepted. The page text (scripts and styles removed)
    and any HTML tables are stored on ``app_state``, then the preview and
    status widgets are refreshed. ``b`` (the clicked button) is unused.
    """
    url = url_input.value
    if not (url and url.startswith(('http://', 'https://'))):
        data_source_label.value = "<i>Error: Invalid URL.</i>"
        return

    print(f"Fetching URL: {url}")
    data_source_label.value = "<i>Fetching...</i>"

    # Reset the state of any previously loaded document.
    app_state.document_name = url
    app_state.document_text = None
    app_state.document_tables = []

    html_content = ""
    text = "Error fetching/parsing URL."
    try:
        request_headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=request_headers, timeout=15)
        response.raise_for_status()
        html_content = response.text
        if HTML_AVAILABLE and html_content:
            try:
                soup = BeautifulSoup(html_content, 'html.parser')
                # Remove non-visible content before extracting the text.
                for tag in soup(["script", "style"]):
                    tag.decompose()
                text = soup.get_text(separator='\n', strip=True)
            except Exception as e:
                text = f"HTML text parse error: {e}"
    except Exception as e:
        text = f"Error fetching URL: {e}"
        print(text)

    tables = []
    if html_content:
        tables, msg = _extract_tables_html(html_content)
        print(msg)
        app_state.document_tables = tables

    app_state.document_text = text
    pr_input.value = text[:20000]
    data_source_label.value = f"<i>Loaded: {url} ({len(text)} chars, {len(tables)} tables)</i>"
    parse_status_label.value = "Status: Ready to parse."
    clear_output(wait=True)

# --- Parsing Handler ---
def on_parse_button_click(b):
    """Handle the 'Parse Document' button using the selected strategy.

    Runs the chosen parsing strategy on the loaded document, applies the
    estimation step on success, stores the result on ``app_state`` and copies
    the extracted values into the verification widgets. Progress is printed
    into ``parse_out`` and shown in ``parse_status_label``. ``b`` (the clicked
    button) is unused.
    """
    # NOTE(review): analysis_out is not among the widgets defined in Cell 7 — confirm it exists.
    parse_out.clear_output(wait=True); analysis_out.clear_output()
    app_state.parsed_data = defaultdict(lambda: None); app_state.estimation_log = []
    with parse_out:
        parse_strategy = parse_strategy_selector.value
        parse_status_label.value = f"<span style='color: blue;'>Status: Parsing ({parse_strategy})...</span>"
        print(f"Initiating parse: {parse_strategy}")
        # Attribute access, as in the other handlers (app_state.document_text etc.).
        doc_text = getattr(app_state, "document_text", None)
        doc_tables = getattr(app_state, "document_tables", None) or []
        if not doc_text or not doc_text.strip():
            print("Error: No text loaded.")
            parse_status_label.value = "<span style='color: red;'>Status: Error - No text.</span>"
            return

        def require_qa(label):
            """Raise if the QA pipeline needed by an AI strategy is not loaded."""
            if not (TRANSFORMERS_AVAILABLE and app_state.models_loaded and app_state.pipelines.get("qa")):
                raise RuntimeError(f"{label} requires loaded QA model.")

        extracted = defaultdict(lambda: None); method = "Unknown"; success = False
        try:
            start_time = time.time()
            if parse_strategy == PARSE_STRATEGY_STANDARD:
                require_qa("Standard AI QA")
                extracted, method = _parse_standard_qa(doc_text, parse_status_label)
            elif parse_strategy == PARSE_STRATEGY_CHUNKED:
                require_qa("Full Document AI QA")
                extracted, method = _parse_chunked_qa(doc_text, parse_status_label)
            elif parse_strategy == PARSE_STRATEGY_REGEX:
                extracted, method = _parse_regex_only(doc_text, parse_status_label)
            elif parse_strategy == PARSE_STRATEGY_TABLE:
                table_data = _parse_tables(doc_tables)
                regex_data, _ = _parse_regex_only(doc_text, parse_status_label)
                # Table values are applied last so they override regex matches.
                extracted.update(regex_data); extracted.update(table_data)
                method = PARSE_STRATEGY_TABLE
                print("Table Extraction Finished (Basic).")
            elif parse_strategy == PARSE_STRATEGY_HYBRID:
                require_qa("Hybrid parsing")
                extracted, method = _parse_hybrid(doc_text, doc_tables, parse_status_label)
            elif parse_strategy == PARSE_STRATEGY_NER:
                extracted, method = _parse_ner_re(doc_text, parse_status_label)
            else:
                raise ValueError(f"Unknown strategy: {parse_strategy}")
            success = True
            print(f"Parsing strategy '{method}' completed in {time.time() - start_time:.1f}s.")
        except Exception as e:
            print(f"ERROR during parsing '{parse_strategy}': {e}")
            parse_status_label.value = "<span style='color: red;'>Status: Error during parsing.</span>"
            success = False; method = f"Failed ({parse_strategy})"

        final_data, estimation_log = _perform_estimations(extracted) if success else (extracted, [])
        app_state.parsed_data = final_data; app_state.estimation_log = estimation_log

        if success:
            w_map = {'revenue_current': rev_c_in, 'revenue_prior': rev_p_in, 'ebitda_current': ebitda_c_in, 'ebitda_prior': ebitda_p_in, 'fcf_current': fcf_c_in, 'fcf_prior': fcf_p_in, 'leverage_current': lev_c_in, 'leverage_prior': lev_p_in, 'revenue_drivers_text': rev_drv_in, 'ebitda_drivers_text': ebitda_drv_in, 'fcf_drivers_text': fcf_drv_in, 'leverage_drivers_text': lev_drv_in}
            print("\nPopulating UI:"); pop_count = 0
            for k, w in w_map.items():
                val = final_data.get(k)
                if val is None:
                    continue
                # A field counts as estimated when its key appears in an estimation-log entry.
                est = any(k.lower() in entry.lower() for entry in estimation_log)
                try:
                    w.value = float(f"{val:.2f}") if isinstance(w, widgets.FloatText) and isinstance(val, float) else val
                except Exception as e:
                    # Leave the widget untouched: assigning None to it could raise again.
                    print(f"  - Could not set {k}: {e}")
                    continue
                pop_count += 1
                print(f"  - Set {k}{' (Est.)' if est else ''}: {'(text)' if isinstance(val, str) and len(val)>50 else val}")
            print(f"\nPopulated {pop_count} fields. Method: {method}"); print("\n>>> !! CRITICAL: Review ALL fields. Edit as needed. !! <<<")
            if pop_count:
                parse_status_label.value = f"<span style='color: green;'>Status: Parse OK ({method}). VERIFY VALUES.</span>"
            else:
                # Parsing ran without error but produced nothing to populate.
                parse_status_label.value = f"<span style='color: orange;'>Status: Parsed ({method}), no data.</span>"
        analysis_status_label.value = "Status: Ready."

# --- Interactive QA Handler ---
def on_custom_qa_click(b):
    """Handle the 'Ask' button for interactive QA over the loaded document.

    Validates that a question, a document and the QA pipeline are available,
    runs the pipeline and prints the answer with its confidence into
    ``custom_q_out``. ``b`` (the clicked button) is unused.
    """
    custom_q_out.clear_output(wait=True)
    custom_qa_status_label.value = "<span style='color: blue;'>Processing...</span>"
    with custom_q_out:
        q = custom_q_in.value; txt = app_state.document_text; qa_pipe = app_state.pipelines.get("qa")
        if not q or not q.strip():
            print("ERROR: Enter question.")
            custom_qa_status_label.value = "<span style='color: red;'>Status: Enter question.</span>"
            return
        if not txt or not txt.strip():
            print("ERROR: No document loaded.")
            custom_qa_status_label.value = "<span style='color: red;'>Status: Load document first.</span>"
            return
        if not qa_pipe:
            print("ERROR: QA model not loaded.")
            custom_qa_status_label.value = "<span style='color: red;'>Status: Load QA model.</span>"
            return
        # Printed text is shown as plain text, so it must not be HTML-escaped
        # (escaping would display entities such as "&amp;" literally).
        print(f"Asking AI: '{q}'")
        try:
            res = qa_pipe(question=q, context=txt, handle_impossible_answer=True)
            ans, score = res.get('answer'), res.get('score', 0)
            print(f"\nAnswer: {ans if ans else '(No answer found)'}")
            print(f"Confidence: {score:.3f}")
            confident = bool(ans) and score >= QA_SCORE_THRESHOLD
            if not confident:
                print("(Low confidence or no answer found)")
            custom_qa_status_label.value = f"<span style='color: {'green' if confident else 'orange'};'>Status: Custom QA complete.</span>"
        except Exception as e:
            print(f"\nERROR: {e}")
            custom_qa_status_label.value = "<span style='color: red;'>Status: Custom QA Error.</span>"

# --- Report Generation Handler ---
def on_generate_click(b):
    """Handle the 'Generate Report' button.

    Reads the values currently in the input widgets, runs the qualitative
    text analysis (transformer pipelines when all are loaded, the basic
    fallback otherwise), builds the HTML report, stores it on ``app_state``
    and displays it in ``report_out``. ``b`` (the clicked button) is unused.
    """
    report_out.clear_output(wait=True)
    with report_out:
        analysis_status_label.value = "<span style='color: blue;'>Status: Generating...</span>"
        print("Generating Report...")

        # Get verified data from UI widgets directly for final report
        verified_data = defaultdict(lambda: None)
        try:
            def get_units(w):
                # Scale the widget value by 1e6; anything non-numeric or NaN becomes None.
                raw = w.value
                if isinstance(raw, (int, float)) and raw is not None and not math.isnan(raw):
                    return raw * 1e6
                return None

            currency_fields = (
                ('revenue_current', rev_c_in), ('revenue_prior', rev_p_in),
                ('ebitda_current', ebitda_c_in), ('ebitda_prior', ebitda_p_in),
                ('fcf_current', fcf_c_in), ('fcf_prior', fcf_p_in),
            )
            for key, widget in currency_fields:
                verified_data[key] = get_units(widget)

            # Leverage is a multiple, so it is taken unscaled.
            for key, widget in (('leverage_current', lev_c_in), ('leverage_prior', lev_p_in)):
                raw = widget.value
                verified_data[key] = raw if isinstance(raw, (int, float)) else None

            text_fields = (
                ('revenue_drivers_text', rev_drv_in), ('ebitda_drivers_text', ebitda_drv_in),
                ('fcf_drivers_text', fcf_drv_in), ('leverage_drivers_text', lev_drv_in),
            )
            for key, widget in text_fields:
                verified_data[key] = widget.value or "N/A"
        except Exception as e:
            print(f"Input Error: {e}")
            analysis_status_label.value = "<span style='color: red;'>Status: Input Error</span>"
            return

        # Run AI Text Analysis
        analysis_status_label.value = "<span style='color: blue;'>Status: Analyzing text...</span>"
        qual_an_res = {}
        method = "Basic Fallback"
        analysis_ok = TRANSFORMERS_AVAILABLE and app_state.models_loaded and all(app_state.pipelines.get(k) for k in ["summarizer", "zero_shot", "sentiment"])
        qual_map = {"Rev": "revenue_drivers_text", "EBITDA": "ebitda_drivers_text", "FCF": "fcf_drivers_text", "Lev": "leverage_drivers_text"}
        if not analysis_ok:
            reason = "(Lib N/A)" if not TRANSFORMERS_AVAILABLE else "(Models not loaded)"
            print(f"Using basic analysis {reason}.")
            analysis_status_label.value = f"<span style='color: orange;'>Status: Basic analysis {reason}...</span>"
            for report_key, data_key in qual_map.items():
                qual_an_res[report_key] = analyze_drivers_fallback(verified_data.get(data_key, "N/A"))
        else:
            print("Attempting AI analysis...")
            method = "Transformer (AI)"
            for report_key, data_key in qual_map.items():
                txt = verified_data.get(data_key, "N/A")
                analysis_status_label.value = f"<span style='color: blue;'>Status: Analyzing '{report_key}'...</span>"
                try:
                    qual_an_res[report_key] = analyze_drivers_transformer(txt, analysis_status_label)
                except Exception as e:
                    # One failing section falls back to the basic analysis only.
                    print(f"AI analysis fail {report_key}: {e}")
                    qual_an_res[report_key] = analyze_drivers_fallback(txt)
                    method = "Mixed"
        analysis_status_label.value = f"<span style='color: blue;'>Status: Text analysis done ({method}).</span>"
        app_state.analysis_results = qual_an_res # Store results

        # Generate HTML Report
        analysis_status_label.value = "<span style='color: blue;'>Status: Generating HTML...</span>"
        report_html = _generate_report_html(verified_data, qual_an_res, method)
        app_state.current_report_html = report_html

        # Display Report
        clear_output(wait=True)
        display(HTML(report_html))
        analysis_status_label.value = "<span style='color: green;'>Status: Report Generated.</span>"
        print("\nReport finished.")

# --- Optional: NLG Handler ---
def on_nlg_click(b):
    """Handle the 'Generate NLG Summary' button.

    Re-reads the figures from the input widgets, asks ``_generate_report_nlg``
    for a narrative and shows it in ``report_out`` below the last generated
    report, if there is one. ``b`` (the clicked button) is unused.
    """
    report_out.clear_output(wait=True)
    with report_out:
        if not (TRANSFORMERS_AVAILABLE and app_state.models_loaded and app_state.pipelines.get("text-generation")):
            print("NLG requires text-generation model (e.g., 'gpt2'). Load it via `load_models` if needed.")
            return
        print("Generating NLG Summary (Experimental)...")

        def to_dollars(widget):
            # Same scaling as on_generate_click: the widgets are labelled in
            # $mm and are multiplied by 1e6 before the values are formatted.
            raw = widget.value
            return raw * 1e6 if isinstance(raw, (int, float)) and not math.isnan(raw) else None

        def as_multiple(widget):
            # Leverage is a ratio and is passed through unscaled.
            raw = widget.value
            return raw if isinstance(raw, (int, float)) else None

        # Get data from UI widgets for NLG prompt
        verified_data = { # Re-read UI data for NLG
            'revenue_current': to_dollars(rev_c_in), 'revenue_prior': to_dollars(rev_p_in),
            'ebitda_current': to_dollars(ebitda_c_in), 'ebitda_prior': to_dollars(ebitda_p_in),
            'fcf_current': to_dollars(fcf_c_in), 'fcf_prior': to_dollars(fcf_p_in),
            'leverage_current': as_multiple(lev_c_in), 'leverage_prior': as_multiple(lev_p_in),
        }
        # Either attribute may be unset/None if no report has been generated yet.
        analysis_results = getattr(app_state, "analysis_results", None) or {}
        base_html = getattr(app_state, "current_report_html", None) or ""
        narrative_html = _generate_report_nlg(verified_data, analysis_results, analysis_status_label)
        if base_html:
            display(HTML(base_html + "<hr>" + narrative_html)) # Show below main report
        else:
            display(HTML(narrative_html))

# --- Optional: Export Handlers ---
def on_export_md_click(b):
    """Handle the 'Export MD' button: export the last generated report as Markdown.

    Prints the exporter's status message into ``report_out``, or a reminder
    when no report has been generated yet. ``b`` (the clicked button) is unused.
    """
    report_out.clear_output(wait=True)
    with report_out:
        report_html = app_state.current_report_html
        if report_html:
            export_message = _export_report_markdown(report_html)
            print("--- Markdown Output (Basic) ---")
            print(export_message)
            # Consider adding download link here
        else:
            print("Generate report first.")
          
def on_export_pdf_click(b):
    """Run the PDF export helper on the current report and print its message.

    Args:
        b: The button instance passed in by ``on_click`` (unused).
    """
    report_out.clear_output(wait=True)
    with report_out:
        report_markup = app_state.current_report_html
        if report_markup:
            # The helper returns a message, which is echoed to the user.
            print(_export_report_pdf(report_markup))
        else:
            # Nothing to export until a report has been generated.
            print("Generate report first.")

# Banner confirming that Cell 8 ran to completion.
print("-" * 20 + " Cell 8: Event Handlers Defined " + "-" * 20)

In [None]:
# %%-- Cell 9: UI Layout & Display --%%

print("Assembling UI Layout...")

# --- UI Layout Construction ---
# Model-loading controls: a bold caption stacked above a row holding the
# load button (load_btn comes from Cell 7) and its status label.
model_load_section = widgets.VBox(
    children=[
        widgets.HTML(value="<b>AI Model Control:</b>"),
        widgets.HBox(children=[load_btn, load_status_label]),
    ]
)

# Section 1 Layout: Parsing & Interactive QA
# Stacked top to bottom: document loading, strategy selection and parsing,
# then the optional follow-up question area.
pr_section = widgets.VBox(
    children=[
        # Step 1: upload a file or fetch a URL
        widgets.HTML(value="<b>1. Load Document</b>"),
        widgets.HBox(children=[file_upload, url_input, fetch_url_btn]),
        data_source_label,
        # Step 2: choose a parsing strategy and run it
        widgets.HTML(value="<hr style='margin: 10px 0;'><b>2. Select Parsing Strategy & Parse</b>"),
        parse_strategy_selector,
        parse_btn,
        parse_status_label,
        parse_out,
        # Step 3: interactive QA sub-section
        widgets.HTML(value="<hr style='margin: 10px 0;'><b>3. Interactive Follow-up QA (Optional)</b>"),
        widgets.HTML(value="<p style='font-size:0.9em;'>Ask specific questions about the loaded text (AI models must be loaded).</p>"),
        widgets.HBox(children=[custom_q_in, custom_q_btn]),
        custom_qa_status_label,
        custom_q_out,
    ]
)

# Section 2 Layout: Data Input/Verification
# Each metric gets a heading, a row with its current/prior inputs, and a
# driver input underneath; the benchmark area closes the section.
manual_section = widgets.VBox(
    children=[
        widgets.HTML(value="<b>Review / Enter Financial Data:</b><p style='font-size:0.9em'>(Populated by parsing - verify and edit. Units: $ millions unless 'x'.)</p>"),
        # Revenue
        widgets.HTML(value="<b>Revenue</b>"),
        widgets.HBox(children=[rev_c_in, rev_p_in]),
        rev_drv_in,
        # EBITDA
        widgets.HTML(value="<hr style='margin:5px 0;'><b>EBITDA</b>"),
        widgets.HBox(children=[ebitda_c_in, ebitda_p_in]),
        ebitda_drv_in,
        # Free cash flow
        widgets.HTML(value="<hr style='margin:5px 0;'><b>FCF</b>"),
        widgets.HBox(children=[fcf_c_in, fcf_p_in]),
        fcf_drv_in,
        # Leverage
        widgets.HTML(value="<hr style='margin:5px 0;'><b>Leverage (x)</b>"),
        widgets.HBox(children=[lev_c_in, lev_p_in]),
        lev_drv_in,
        # Optional benchmark data
        widgets.HTML(value="<hr style='margin:5px 0;'><b>Benchmark Data (Optional)</b>"),
        benchmark_input_area,
    ]
)

# Section 3 Layout: Report Generation
# Generate/NLG buttons first, export buttons next, then the status label
# and the output area the handlers render into.
report_section = widgets.VBox(
    children=[
        widgets.HTML(value="<b>Generate & Export Report:</b>"),
        widgets.HBox(children=[gen_report_btn, nlg_report_btn]),
        widgets.HTML(value="<br><b>Export Options:</b>"),
        widgets.HBox(children=[export_md_btn, export_pdf_btn]),
        analysis_status_label,
        report_out,
    ]
)

# Main Tabbed Interface
# One tab per section, in the order the sections are used; each title
# index matches the position of the section in `children`.
main_tabs = widgets.Tab()
main_tabs.children = (pr_section, manual_section, report_section)
main_tabs.set_title(0, "1. Load & Parse")
main_tabs.set_title(1, "2. Verify Data")
main_tabs.set_title(2, "3. Generate Report")

# --- Final App Layout ---
# Top-level container: title banner, model-loading controls, a divider,
# then the tabbed workflow.
app_layout = widgets.VBox([
    # Version in the title matches the notebook header (v13); it previously
    # showed a stale "v12".
    widgets.HTML("<h1>Financial Document Analysis Assistant v13</h1>"),
    model_load_section,
    widgets.HTML("<hr>"),
    main_tabs,
    # Global status labels can go here if needed, but most are in sections now
])

# --- Link Buttons to Handlers ---
# (Ensure handlers from Cell 8 are linked correctly to widgets from Cell 7)
# Each button gets exactly one click callback. The upload widget is wired
# differently: it has no click event here, so its handler is registered as an
# observer of the widget's `value` trait instead.
load_btn.on_click(on_load_models_click)  # model loading
file_upload.observe(on_upload_change, names='value')  # restricted to changes of `value`
fetch_url_btn.on_click(on_fetch_url_click)
parse_btn.on_click(on_parse_button_click)
custom_q_btn.on_click(on_custom_qa_click)  # follow-up QA
gen_report_btn.on_click(on_generate_click)  # main HTML report
nlg_report_btn.on_click(on_nlg_click)  # experimental NLG narrative
export_md_btn.on_click(on_export_md_click)
export_pdf_btn.on_click(on_export_pdf_click)

# --- Display the UI ---
print("Initializing UI Display...")
display(app_layout)

# --- Post-Display Info ---
# Remind the user when the optional `transformers` dependency is unusable.
if not TRANSFORMERS_AVAILABLE:
    print("\nREMINDER: `transformers` missing/broken. AI features disabled.")

# Banner confirming that Cell 9 ran to completion.
print("-" * 20 + " Cell 9: UI Displayed - Ready for Interaction " + "-" * 20)