In [None]:
# Cell 0: GitHub Setup and Clean Auto-Logging

import os
import sys
import importlib
import importlib.util
import psycopg2

# GitHub credentials - use Kaggle secrets for security
# The token is read from the Kaggle secret named GITHUB_TOKEN and spliced into
# the HTTPS clone URL below.
# NOTE(review): because the token is part of repo_url, git will presumably
# record it in the clone's .git/config under /kaggle/working - confirm that
# directory is never published together with the notebook output.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
github_token = user_secrets.get_secret("GITHUB_TOKEN")
repo_url = f"https://{github_token}@github.com/amiralpert/SmartReach.git"
local_path = "/kaggle/working/SmartReach"

print("📦 Setting up GitHub repository...")

# Clone or update repo with force pull
# When the checkout already exists it is hard-reset to origin/main, which
# discards any local edits made inside local_path.
if os.path.exists(local_path):
    print(f"📂 Repository exists at {local_path}")
    print("🔄 Force updating from GitHub...")
    !cd {local_path} && git fetch origin
    !cd {local_path} && git reset --hard origin/main
    !cd {local_path} && git pull origin main
    print("✅ Repository updated")
    
    # Show current commit
    !cd {local_path} && echo "Current commit:" && git log --oneline -1
else:
    print(f"📥 Cloning repository to {local_path}")
    !git clone {repo_url} {local_path}
    print("✅ Repository cloned")

# Clear any cached modules from previous runs
# Every sys.modules entry whose lowercased name contains 'auto_logger' or
# 'clean' is dropped.
# NOTE(review): the 'clean' substring also matches unrelated modules that
# happen to contain it - confirm this breadth is intended.
modules_to_clear = [key for key in sys.modules.keys() if 'auto_logger' in key.lower() or 'clean' in key.lower()]
for mod in modules_to_clear:
    del sys.modules[mod]
    print(f"  🧹 Cleared cached module: {mod}")

# Add to Python path for regular imports
# An existing entry is removed first so the path appears once, at position 0,
# even when this cell is re-run.
if f'{local_path}/BizIntel' in sys.path:
    sys.path.remove(f'{local_path}/BizIntel')
sys.path.insert(0, f'{local_path}/BizIntel')

print("✓ Python path configured for SEC entity extraction!")

# Set up database configuration
# The password is resolved at run time - NEON_PASSWORD environment variable
# first, then the Kaggle secret of the same name - so the credential is never
# stored in the notebook source. The secret must be added to the notebook's
# Kaggle secrets before this cell runs.
NEON_CONFIG = {
    'host': 'ep-royal-star-ad1gn0d4-pooler.c-2.us-east-1.aws.neon.tech',
    'database': 'BizIntelSmartReach',
    'user': 'neondb_owner',
    'password': os.environ.get('NEON_PASSWORD') or user_secrets.get_secret("NEON_PASSWORD"),
    'sslmode': 'require'
}

# Set up the new clean auto-logger
# `logger` ends up None whenever setup does not complete. In that case the
# connection opened for the logger is closed again so a failed setup does not
# leave an idle database session behind.
# NOTE(review): assumes no later cell reuses logger_conn when logging is off.
logger = None
logger_conn = None
logger_setup_ok = False
try:
    # Create connection for logger
    logger_conn = psycopg2.connect(**NEON_CONFIG)
    print("✓ Database connected for clean logger")

    # Import the redesigned clean auto-logger
    logger_module_path = f"{local_path}/BizIntel/Scripts/KaggleLogger/auto_logger.py"
    if os.path.exists(logger_module_path):
        # Load the module directly from the checkout and register it under a
        # fixed name in sys.modules
        spec = importlib.util.spec_from_file_location("auto_logger", logger_module_path)
        auto_logger_module = importlib.util.module_from_spec(spec)
        sys.modules["auto_logger"] = auto_logger_module
        spec.loader.exec_module(auto_logger_module)

        # Use the new clean logging setup
        setup_clean_logging = auto_logger_module.setup_clean_logging
        logger = setup_clean_logging(logger_conn, "SEC_EntityExtraction")
        # Tracked separately from `logger` because the return value of
        # setup_clean_logging is not known here
        logger_setup_ok = True

        print("✨ Clean auto-logging enabled!")
        print("📋 Features:")
        print("   • One row per cell execution")
        print("   • Complete output capture")
        print("   • Proper cell numbers from # Cell N: comments")
        print("   • Full error tracebacks")
        print("   • Execution timing")

    else:
        print(f"✗ Clean auto-logger not found at {logger_module_path}")
        logger = None

except Exception as e:
    print(f"⚠️ Clean logger setup failed: {e}")
    print("  Continuing without auto-logging...")
    logger = None
finally:
    # Release the connection when the logger never took ownership of it
    if not logger_setup_ok and logger_conn is not None:
        try:
            logger_conn.close()
        except Exception as close_error:
            print(f"⚠️ Could not close logger connection: {close_error}")
        logger_conn = None

print("\n🚀 Setup complete! SEC Entity Extraction Engine with Clean Logging ready.")
print("💡 Run cells with proper # Cell N: comments for best logging.")

In [None]:
# Cell 1: Neon Database Configuration
# The password is resolved at run time - NEON_PASSWORD environment variable
# first, then the Kaggle secret of the same name - so the credential is never
# stored in the notebook source. Relies on `os` and `user_secrets` from Cell 0.
NEON_CONFIG = {
    'host': 'ep-royal-star-ad1gn0d4-pooler.c-2.us-east-1.aws.neon.tech',
    'database': 'BizIntelSmartReach',
    'user': 'neondb_owner',
    'password': os.environ.get('NEON_PASSWORD') or user_secrets.get_secret("NEON_PASSWORD"),
    'sslmode': 'require'
}

# Test database connection
def test_database_connection():
    """Connect to the database, print table counts and a filing breakdown.

    Runs one query that counts rows in the SEC-related tables and a second one
    that lists filing counts per company and filing type (first 10 rows).

    Returns:
        True when both queries succeed, False when connecting or querying
        raises. The error is printed rather than propagated.
    """
    conn = None
    try:
        conn = psycopg2.connect(**NEON_CONFIG)
        cursor = conn.cursor()

        # Check SEC-related tables
        cursor.execute('''
            SELECT 
                (SELECT COUNT(*) FROM raw_data.sec_filings) as sec_filings,
                (SELECT COUNT(*) FROM core.companies) as companies,
                (SELECT COUNT(*) FROM system_uno.sec_entities_raw) as sec_entities_extracted,
                (SELECT COUNT(DISTINCT company_domain) FROM raw_data.sec_filings) as companies_with_filings,
                (SELECT COUNT(*) FROM raw_data.sec_filings WHERE url IS NOT NULL) as filings_with_urls
        ''')

        counts = cursor.fetchone()
        print("✓ Database connected successfully!")
        print(f"  SEC Filings: {counts[0]}")
        print(f"  Companies: {counts[1]}")
        print(f"  Extracted SEC Entities: {counts[2]}")
        print(f"  Companies with SEC Filings: {counts[3]}")
        print(f"  SEC Filings with URLs: {counts[4]}")

        # Show sample SEC filing data
        cursor.execute('''
            SELECT company_domain, filing_type, COUNT(*) as count
            FROM raw_data.sec_filings 
            GROUP BY company_domain, filing_type 
            ORDER BY company_domain, count DESC
            LIMIT 10
        ''')

        filing_stats = cursor.fetchall()
        print("\n📊 SEC Filing Distribution:")
        for stat in filing_stats:
            print(f"  {stat[0]}: {stat[1]} ({stat[2]} filings)")

        cursor.close()
        return True

    except Exception as e:
        print(f"✗ Database connection failed: {e}")
        return False

    finally:
        # Close on every path: a failing query must not leave the session open
        if conn is not None:
            try:
                conn.close()
            except Exception as close_error:
                print(f"⚠️ Failed to close database connection: {close_error}")

# Test connection
test_database_connection()

In [None]:
# Cell 2: Load All 4 NER Models (BioBERT, BERT-base, FinBERT, RoBERTa)

# Install required packages for all models
!pip install transformers torch requests beautifulsoup4 'lxml[html_clean]' uuid numpy edgartools newspaper3k

import requests
from bs4 import BeautifulSoup
import uuid
import json
import time
from datetime import datetime
from typing import List, Dict, Any, Optional
import re
import torch
import numpy as np
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor, as_completed

# Hugging Face Transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

print("📦 Loading all 4 NER models for multi-model extraction...")

# Each section below loads one Hugging Face checkpoint and wraps it in a "ner"
# pipeline on GPU 0 when CUDA is available, otherwise on CPU. A failed load is
# reported and leaves that pipeline variable set to None (FinBERT falls back to
# the BERT-base pipeline instead).

# ========== 1. BioBERT (Biomedical Entities) ==========
try:
    biobert_model_name = "alvaroalon2/biobert_diseases_ner"
    print(f"🧬 Loading BioBERT: {biobert_model_name}")
    biobert_tokenizer = AutoTokenizer.from_pretrained(biobert_model_name)
    biobert_model = AutoModelForTokenClassification.from_pretrained(biobert_model_name)
    biobert_pipeline = pipeline("ner", model=biobert_model, tokenizer=biobert_tokenizer, 
                                aggregation_strategy="average", device=0 if torch.cuda.is_available() else -1)
    print("   ✓ BioBERT loaded (medical/disease entities)")
except Exception as e:
    print(f"   ❌ BioBERT failed to load: {e}")
    biobert_pipeline = None

# ========== 2. BERT-base-NER (General Entities) ==========
try:
    bert_model_name = "dslim/bert-base-NER"
    print(f"📝 Loading BERT-base-NER: {bert_model_name}")
    bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
    bert_model = AutoModelForTokenClassification.from_pretrained(bert_model_name)
    bert_pipeline = pipeline("ner", model=bert_model, tokenizer=bert_tokenizer,
                            aggregation_strategy="average", device=0 if torch.cuda.is_available() else -1)
    print("   ✓ BERT-base loaded (general entities: PER, ORG, LOC, MISC)")
except Exception as e:
    print(f"   ❌ BERT-base failed to load: {e}")
    bert_pipeline = None

# ========== 3. FinBERT (Financial Entities) ==========
try:
    finbert_model_name = "ProsusAI/finbert"
    print(f"💰 Loading FinBERT: {finbert_model_name}")
    finbert_tokenizer = AutoTokenizer.from_pretrained(finbert_model_name)
    finbert_model = AutoModelForTokenClassification.from_pretrained(finbert_model_name)
    # Guard against checkpoints that were not trained for token classification.
    # ProsusAI/finbert is published as a sentiment (sequence-classification)
    # model; loading it through AutoModelForTokenClassification is expected to
    # succeed with an untrained classification head, which would emit
    # meaningless "entities". Raising here routes to the BERT-base fallback.
    finbert_architectures = getattr(finbert_model.config, 'architectures', None) or []
    if finbert_architectures and not any('TokenClassification' in arch for arch in finbert_architectures):
        raise ValueError(
            f"{finbert_model_name} is not a token-classification checkpoint "
            f"(architectures: {finbert_architectures})"
        )
    finbert_pipeline = pipeline("ner", model=finbert_model, tokenizer=finbert_tokenizer,
                               aggregation_strategy="average", device=0 if torch.cuda.is_available() else -1)
    print("   ✓ FinBERT loaded (financial entities)")
except Exception as e:
    print(f"   ❌ FinBERT failed to load: {e}")
    # Fallback: use BERT-base for financial content
    finbert_pipeline = bert_pipeline if 'bert_pipeline' in locals() else None
    if finbert_pipeline:
        print("   🔄 Using BERT-base as FinBERT fallback")

# ========== 4. RoBERTa-large-NER (High Precision) ==========
try:
    roberta_model_name = "Jean-Baptiste/roberta-large-ner-english"
    print(f"🎯 Loading RoBERTa-large-NER: {roberta_model_name}")
    roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)
    roberta_model = AutoModelForTokenClassification.from_pretrained(roberta_model_name)
    roberta_pipeline = pipeline("ner", model=roberta_model, tokenizer=roberta_tokenizer,
                                aggregation_strategy="average", device=0 if torch.cuda.is_available() else -1)
    print("   ✓ RoBERTa loaded (high-precision entities)")
except Exception as e:
    print(f"   ❌ RoBERTa failed to load: {e}")
    roberta_pipeline = None

# ========== Multi-Model Configuration ==========
# Tuning values for chunking, batching and per-model score cut-offs, plus the
# list of normalized entity type names.
EXTRACTION_CONFIG = dict(
    chunk_size=400,        # Words per chunk
    overlap_size=120,      # 30% overlap (120/400)
    context_window=500,    # Characters before/after entity for context
    batch_size=10,         # Process 10 documents at a time
    confidence_thresholds=dict(
        biobert=0.5,
        bert_base=0.5,
        finbert=0.5,
        roberta=0.6,       # Higher threshold for high-precision model
    ),
    supported_entities=(
        'DISEASE CHEMICAL MEDICATION MEDICAL_CONDITION '
        'DRUG COMPOUND THERAPY TREATMENT '
        'PERSON ORGANIZATION LOCATION MISCELLANEOUS '
        'FINANCIAL REVENUE COST METRIC'
    ).split(),
)

# ========== Model Registry ==========
# Model key -> pipeline object. A value is None when that model did not load.
LOADED_MODELS = dict(
    biobert=biobert_pipeline,
    bert_base=bert_pipeline,
    finbert=finbert_pipeline,
    roberta=roberta_pipeline,
)

# Document-to-model routing configuration
# doc_type -> content kind ('markdown' text or 'xbrl' financial data) -> model keys
DOCUMENT_MODEL_ROUTING = dict(
    sec_filing=dict(
        markdown=['biobert', 'bert_base', 'roberta'],  # Text sections
        xbrl=['finbert', 'bert_base'],                 # Financial data
    ),
    press_release=dict(
        markdown=['biobert', 'bert_base', 'roberta'],  # All text models
    ),
    stock_data=dict(
        xbrl=['finbert', 'bert_base'],                 # Financial focus
    ),
)

# Entity type mapping and normalization
# Raw model label -> normalized entity type; labels not listed here are left
# for the caller to handle.
ENTITY_TYPE_MAPPING = dict([
    # BioBERT mappings
    ('Disease', 'MEDICAL_CONDITION'),
    ('Chemical', 'MEDICATION'),
    ('CHEMICAL', 'MEDICATION'),
    ('DISEASE', 'MEDICAL_CONDITION'),
    ('DRUG', 'MEDICATION'),
    ('Drug', 'MEDICATION'),
    ('Compound', 'MEDICATION'),
    ('Treatment', 'THERAPY'),
    ('Therapy', 'THERAPY'),

    # BERT-base mappings
    ('PER', 'PERSON'),
    ('ORG', 'ORGANIZATION'),
    ('LOC', 'LOCATION'),
    ('MISC', 'MISCELLANEOUS'),

    # Financial mappings
    ('MONEY', 'FINANCIAL'),
    ('PERCENT', 'FINANCIAL'),
    ('NUMBER', 'METRIC'),
])

# Count successfully loaded models
loaded_count = len([entry for entry in LOADED_MODELS.values() if entry is not None])

# Summary of what was loaded and the chunking settings in effect
print(f"\n✅ Model Loading Complete!")
print(f"   📊 Successfully loaded: {loaded_count}/4 models")
print(f"   🖥️ Device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
print(f"   🔧 Chunk size: {EXTRACTION_CONFIG['chunk_size']} words")
print(f"   🔄 Overlap: {EXTRACTION_CONFIG['overlap_size']} words (30%)")
print(f"   📋 Total entity types: {len(EXTRACTION_CONFIG['supported_entities'])}")

# Test each model with sample biotech text
if loaded_count:
    test_text = "Pfizer's COVID-19 vaccine generated $37 billion in revenue. The FDA approved treatment for Alzheimer's disease in Boston."
    print(f"\n🧪 Testing models with: '{test_text[:50]}...'")

    for model_name, pipeline_obj in LOADED_MODELS.items():
        # Skip registry slots that hold no pipeline
        if not pipeline_obj:
            continue
        try:
            test_entities = pipeline_obj(test_text)
            print(f"   {model_name}: Found {len(test_entities)} entities")
            # Show first 2 entities
            for entity in test_entities[:2]:
                print(f"      • {entity['entity_group']}: '{entity['word']}' ({entity['score']:.3f})")
        except Exception as e:
            print(f"   {model_name}: Test failed - {e}")

print("\n✅ Multi-Model NER Setup Complete! Ready for parallel processing.")

In [None]:
# Cell 3: Universal Document Parser with EdgarTools Integration

import edgar
from edgar import Filing
from newspaper import Article
import yfinance as yf
import re

# ========== EXTENSIBLE PARSER REGISTRY ==========
# Document type name -> parser class, filled in by @register_parser
PARSER_REGISTRY = {}

def register_parser(doc_type: str):
    """Class decorator factory that files a parser class under `doc_type`.

    The decorated class is stored in PARSER_REGISTRY (replacing any class
    already registered for that type) and returned unchanged.
    """
    def _bind(parser_cls):
        PARSER_REGISTRY.update({doc_type: parser_cls})
        return parser_cls
    return _bind

# ========== BASE PARSER INTERFACE ==========
class BaseDocumentParser(ABC):
    """Base interface that all parsers must implement

    Subclasses supply can_parse() and extract_content(); chunk_text() is a
    shared helper that splits text into overlapping word windows.
    """

    @abstractmethod
    def can_parse(self, document: Dict) -> bool:
        """Check if this parser can handle the document"""
        pass

    @abstractmethod
    def extract_content(self, document: Dict) -> Dict[str, Any]:
        """Extract and structure content from document"""
        pass

    def chunk_text(self, text: str, chunk_size: int = 400, overlap: int = 120) -> List[Dict]:
        """Universal chunking strategy - sliding window over whitespace-split words.

        Args:
            text: Text to split. Empty or whitespace-only text yields [].
            chunk_size: Maximum number of words per chunk.
            overlap: Number of words shared by consecutive chunks
                (the defaults give a 30% overlap).

        Returns:
            A list of dicts with keys 'text', 'word_start', 'word_end'
            (exclusive), 'chunk_index' and 'total_words' (words in that chunk).
            Text that fits in one chunk is returned unmodified as a single
            entry; otherwise each chunk's text is its words joined by spaces.

        Raises:
            ValueError: if the text needs windowing and `overlap` is not in
                the range 0 <= overlap < chunk_size. Such values would give a
                zero or negative step (no chunks at all) or a step larger
                than the window (words skipped between chunks).
        """
        if not text or not text.strip():
            return []

        words = text.split()
        if len(words) <= chunk_size:
            return [{
                'text': text,
                'word_start': 0,
                'word_end': len(words),
                'chunk_index': 0,
                'total_words': len(words)
            }]

        if not 0 <= overlap < chunk_size:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size "
                f"(got chunk_size={chunk_size}, overlap={overlap})"
            )

        step = chunk_size - overlap
        chunks = []
        for start in range(0, len(words), step):
            end = min(start + chunk_size, len(words))
            chunk_words = words[start:end]

            chunks.append({
                'text': ' '.join(chunk_words),
                'word_start': start,
                'word_end': end,
                'chunk_index': len(chunks),
                'total_words': len(chunk_words)
            })

            # Stop once a window reaches the last word; later windows would
            # only repeat the tail
            if end == len(words):
                break

        return chunks

# ========== SEC FILING PARSER (EdgarTools) ==========
@register_parser('sec_filing')
class EdgarToolsParser(BaseDocumentParser):
    """Parse SEC filings using EdgarTools - eliminates XBRL noise

    The accession number is derived from the filing URL and used to build an
    EdgarTools Filing, from which text chunks, XBRL/financial data and named
    sections are collected.
    """

    def can_parse(self, document: Dict) -> bool:
        """Accept documents typed 'sec_filing' that carry a 'url' key."""
        return document.get('type') == 'sec_filing' and 'url' in document

    def _extract_accession_number_from_url(self, url: str) -> Optional[str]:
        """Extract accession number from SEC filing URL

        Returns the accession number in dashed form (0001234567-23-000001),
        or None when the URL contains neither the dashed nor the 18-digit
        compressed form.
        """
        # SEC filing URLs typically have format:
        # https://www.sec.gov/Archives/edgar/data/CIK/ACCESSION_NUMBER/filename
        # or contain accession number in various formats

        # The dashed pattern is tried first, then the compressed one
        accession_patterns = [
            r'(\d{10}-\d{2}-\d{6})',  # Standard format: 0001234567-23-000001
            r'(\d{18})',              # Compressed format: 000123456723000001
        ]

        for pattern in accession_patterns:
            match = re.search(pattern, url)
            if match:
                accession = match.group(1)
                # Convert compressed format to standard format if needed
                if len(accession) == 18 and '-' not in accession:
                    # Convert 000123456723000001 to 0001234567-23-000001
                    accession = f"{accession[:10]}-{accession[10:12]}-{accession[12:]}"
                return accession

        return None

    def _store_markdown(self, content: Dict[str, Any], raw_text: str) -> None:
        """Chunk non-blank filing text into content['markdown_chunks'] and flag it."""
        markdown_text = raw_text.strip()
        if markdown_text:
            content['markdown_chunks'] = self.chunk_text(markdown_text)
            content['has_markdown'] = True
            print(f"   📄 Extracted {len(content['markdown_chunks'])} text chunks ({len(markdown_text):,} chars)")

    def extract_content(self, document: Dict) -> Dict[str, Any]:
        """Fetch and structure one SEC filing.

        Args:
            document: Dict describing the filing; 'url' is required, while
                'id', 'company_domain', 'filing_type' and 'filing_date' are
                copied into the result when present.

        Returns:
            On success, a dict with the document metadata, 'markdown_chunks',
            'xbrl_data', 'sections' and the 'has_markdown' / 'has_xbrl' flags.
            On any failure, a record with 'error' and 'parsing_failed': True.
            This method does not raise.
        """
        # Read with .get() so the failure record can always be built, even for
        # a document that has no 'url' key at all
        url = document.get('url')
        try:
            print(f"🏢 Processing SEC filing: {document.get('company_domain', 'Unknown')}")

            if not url:
                raise ValueError("Document has no 'url' to extract an accession number from")

            # Extract accession number from URL
            accession_number = self._extract_accession_number_from_url(url)
            if not accession_number:
                raise ValueError(f"Could not extract accession number from URL: {url}")

            print(f"   📄 Using accession number: {accession_number}")

            # Create Filing object using accession number
            # NOTE(review): this constructor call and the attribute-style
            # access to `filing.text` below are kept as written - confirm
            # both against the installed edgartools version.
            filing = Filing(accession_no=accession_number)

            content = {
                'doc_id': document.get('id'),
                'doc_type': 'sec_filing',
                'filing_type': document.get('filing_type', ''),
                'company': document.get('company_domain', ''),
                'url': url,
                'filing_date': document.get('filing_date'),
                'accession_number': accession_number,
                'sections': {},
                'has_markdown': False,
                'has_xbrl': False,
                'markdown_chunks': [],
                'xbrl_data': {}
            }

            # Extract clean text sections (no XBRL noise)
            if hasattr(filing, 'text') and filing.text:
                self._store_markdown(content, filing.text)
            elif hasattr(filing, 'document') and filing.document and hasattr(filing.document, 'text'):
                # Try alternative text access method
                self._store_markdown(content, filing.document.text)

            # Extract XBRL financial data if available
            try:
                if hasattr(filing, 'xbrl') and filing.xbrl:
                    content['xbrl_data'] = filing.xbrl.to_dict() if hasattr(filing.xbrl, 'to_dict') else {}
                    content['has_xbrl'] = bool(content['xbrl_data'])
                    if content['has_xbrl']:
                        print(f"   💰 Extracted XBRL financial data")
                elif hasattr(filing, 'financials') and filing.financials:
                    # Alternative access to financial data
                    content['xbrl_data'] = {'financials': str(filing.financials)}
                    content['has_xbrl'] = True
                    print(f"   💰 Extracted financial data")
            except Exception as xbrl_e:
                # XBRL is optional: report and continue with whatever text was found
                print(f"   ⚠️ XBRL extraction failed: {xbrl_e}")

            # Extract specific sections for detailed analysis
            try:
                if hasattr(filing, 'sections') and filing.sections:
                    for section_name, section_content in filing.sections.items():
                        if section_content and section_content.strip():
                            content['sections'][section_name] = {
                                'text': section_content,
                                'chunks': self.chunk_text(section_content)
                            }
                            print(f"   📑 Section '{section_name}': {len(content['sections'][section_name]['chunks'])} chunks")
            except Exception as sections_e:
                # Sections are optional as well
                print(f"   ⚠️ Section extraction failed: {sections_e}")

            # Verify we got some usable content
            if not content['has_markdown'] and not content['has_xbrl'] and not content['sections']:
                raise ValueError("No usable content extracted from filing")

            return content

        except Exception as e:
            print(f"   ❌ EdgarTools parsing failed: {e}")
            # NO FALLBACK - report the failure as a record instead of parsing another way
            return {
                'doc_id': document.get('id'),
                'doc_type': 'sec_filing',
                'company': document.get('company_domain', ''),
                'url': url,
                'error': f'EdgarTools parsing failed: {e}',
                'has_markdown': False,
                'has_xbrl': False,
                'parsing_failed': True
            }

# ========== PRESS RELEASE PARSER ==========
@register_parser('press_release')
class PressReleaseParser(BaseDocumentParser):
    """Parse press releases and news articles"""

    def can_parse(self, document: Dict) -> bool:
        """Accept documents typed 'press_release' that carry a 'url' key."""
        return document.get('type') == 'press_release' and 'url' in document

    def extract_content(self, document: Dict) -> Dict[str, Any]:
        """Download and parse one article, returning its chunked text.

        Args:
            document: Dict with the article 'url'; 'id', 'title' and
                'company_domain' are used when present.

        Returns:
            On success, a dict with the article metadata, 'markdown_chunks'
            and 'has_markdown' (True only when at least one chunk was
            produced). On failure, a record with 'error' and
            'parsing_failed': True, in the same shape the SEC filing parser
            uses. This method does not raise.
        """
        try:
            print(f"📰 Processing press release: {document.get('title', 'Unknown')}")

            article = Article(document['url'])
            article.download()
            article.parse()

            chunks = self.chunk_text(article.text)

            return {
                'doc_id': document.get('id'),
                'doc_type': 'press_release',
                'title': article.title,
                'company': document.get('company_domain', ''),
                'url': document['url'],
                'markdown_chunks': chunks,
                # Whitespace-only article text yields no chunks and must not
                # be reported as usable content
                'has_markdown': bool(chunks),
                'has_xbrl': False,
                'publish_date': article.publish_date,
                'authors': article.authors
            }

        except Exception as e:
            print(f"   ❌ Press release parsing failed: {e}")
            # Carry doc_type and parsing_failed so the orchestrator reports
            # this as a failure instead of an empty success
            return {
                'doc_id': document.get('id'),
                'doc_type': 'press_release',
                'company': document.get('company_domain', ''),
                'url': document.get('url'),
                'error': str(e),
                'has_markdown': False,
                'has_xbrl': False,
                'parsing_failed': True
            }

# ========== STOCK DATA PARSER (Future Extension Example) ==========
@register_parser('stock_data')
class StockDataParser(BaseDocumentParser):
    """Parse stock market data - demonstrates extensibility"""

    def can_parse(self, document: Dict) -> bool:
        """Accept documents typed 'stock_data' that carry a 'symbol' key."""
        return document.get('type') == 'stock_data' and 'symbol' in document

    def extract_content(self, document: Dict) -> Dict[str, Any]:
        """Look up a ticker and turn its key metrics into text plus raw values.

        Args:
            document: Dict with the ticker 'symbol'; 'id' is used when present.

        Returns:
            On success, a dict with 'markdown_chunks' built from a short
            textual summary and 'xbrl_data' holding the raw metric values.
            On failure, a record with 'error' and 'parsing_failed': True.
            This method does not raise.
        """
        try:
            print(f"📈 Processing stock data: {document['symbol']}")

            ticker = yf.Ticker(document['symbol'])
            info = ticker.info

            # A metric may be present with a None value; None cannot take the
            # ',' format spec, so fall back to 0 for the text rendering only
            market_cap = info.get('marketCap') or 0
            revenue = info.get('totalRevenue') or 0
            employees = info.get('fullTimeEmployees') or 0

            # Convert financial metrics to text for NER processing
            financial_text = f"""
            Company: {info.get('longName', 'Unknown')}
            Market Cap: ${market_cap:,}
            Revenue: ${revenue:,}
            Employees: {employees:,}
            Industry: {info.get('industry', 'Unknown')}
            Sector: {info.get('sector', 'Unknown')}
            """

            return {
                'doc_id': document.get('id'),
                'doc_type': 'stock_data',
                'symbol': document['symbol'],
                'company': info.get('longName', document['symbol']),
                'markdown_chunks': self.chunk_text(financial_text),
                'has_markdown': True,
                'has_xbrl': True,  # Financial data for FinBERT
                # Raw values are kept as reported, including None
                'xbrl_data': {
                    'market_cap': info.get('marketCap'),
                    'revenue': info.get('totalRevenue'),
                    'employees': info.get('fullTimeEmployees'),
                    'industry': info.get('industry'),
                    'sector': info.get('sector')
                }
            }

        except Exception as e:
            print(f"   ❌ Stock data parsing failed: {e}")
            # Carry doc_type and parsing_failed so the orchestrator reports
            # this as a failure instead of an empty success
            return {
                'doc_id': document.get('id'),
                'doc_type': 'stock_data',
                'symbol': document.get('symbol'),
                'error': str(e),
                'has_markdown': False,
                'has_xbrl': False,
                'parsing_failed': True
            }

# ========== MAIN PARSER ORCHESTRATOR ==========
class UniversalDocumentParser:
    """Main parser that routes documents to appropriate parsers

    One instance of every class in PARSER_REGISTRY is created at construction
    time and selected by the document's 'type' value.
    """

    def __init__(self):
        """Instantiate every registered parser and print the resulting list."""
        self.parsers = {name: parser() for name, parser in PARSER_REGISTRY.items()}
        print(f"📚 Universal Document Parser initialized!")
        print(f"   🔧 Loaded {len(self.parsers)} document parsers:")
        for parser_name in self.parsers:
            print(f"      • {parser_name}")

    def parse_batch(self, documents: List[Dict]) -> List[Dict]:
        """Parse a batch of mixed document types

        Returns one result dict per input document, in input order. Documents
        with no matching parser, and parsers that raise, produce a failure
        record ('error' plus 'parsing_failed': True) instead of an exception.
        """
        parsed_results = []

        print(f"\n🚀 Parsing {len(documents)} documents...")

        for i, doc in enumerate(documents, 1):
            doc_type = doc.get('type', 'unknown')
            # Fall back to a positional id so every result can be identified
            doc_id = doc.get('id', f'doc_{i}')

            print(f"📄 [{i}/{len(documents)}] Processing {doc_type}: {doc_id}")

            if doc_type in self.parsers:
                parser = self.parsers[doc_type]
                try:
                    parsed = parser.extract_content(doc)
                    parsed_results.append(parsed)

                    # Summary - only show success if we actually got content
                    if parsed.get('parsing_failed'):
                        print(f"   ❌ Parsing failed: {parsed.get('error', 'Unknown error')}")
                    else:
                        chunks = parsed.get('markdown_chunks', [])
                        has_content = parsed.get('has_markdown') or parsed.get('has_xbrl')
                        status = "✓" if has_content else "⚠️"
                        print(f"   {status} Parsed: {len(chunks)} chunks extracted")

                except Exception as e:
                    print(f"   ❌ Parser exception: {e}")
                    parsed_results.append({
                        'doc_id': doc_id,
                        'doc_type': doc_type,
                        'error': str(e),
                        'has_markdown': False,
                        'has_xbrl': False,
                        'parsing_failed': True
                    })
            else:
                print(f"   ⚠️ No parser available for document type: {doc_type}")
                parsed_results.append({
                    'doc_id': doc_id,
                    'doc_type': doc_type,
                    'error': f'No parser for type: {doc_type}',
                    'has_markdown': False,
                    'has_xbrl': False,
                    'parsing_failed': True
                })

        return parsed_results

    def add_parser(self, doc_type: str, parser_class: type):
        """Dynamically add new parser at runtime

        `parser_class` is the parser class itself (not an instance); it is
        instantiated here and replaces any parser already stored for `doc_type`.
        """
        self.parsers[doc_type] = parser_class()
        print(f"✓ Added new parser for: {doc_type}")

    def get_documents_from_database(self, limit: int = 5) -> List[Dict]:
        """Get documents from database that need processing

        Selects up to `limit` SEC filings that have a URL and no matching row
        in system_uno.sec_entities_raw, newest filing date first.

        Returns:
            A list of document dicts typed 'sec_filing', or [] when the
            connection or query fails (the error is printed, not raised).
        """
        conn = None
        try:
            conn = psycopg2.connect(**NEON_CONFIG)
            cursor = conn.cursor()

            # Get SEC filings that haven't been processed yet
            cursor.execute('''
                SELECT sf.id, sf.company_domain, sf.filing_type, sf.url, sf.filing_date, sf.title
                FROM raw_data.sec_filings sf
                LEFT JOIN system_uno.sec_entities_raw ser ON ser.sec_filing_ref = CONCAT('SEC_', sf.id)
                WHERE sf.url IS NOT NULL 
                AND ser.sec_filing_ref IS NULL
                ORDER BY sf.filing_date DESC
                LIMIT %s
            ''', (limit,))

            filings = cursor.fetchall()
            cursor.close()

            return [{
                'id': filing[0],
                'type': 'sec_filing',
                'company_domain': filing[1],
                'filing_type': filing[2],
                'url': filing[3],
                'filing_date': filing[4],
                'title': filing[5]
            } for filing in filings]

        except Exception as e:
            print(f"❌ Database query failed: {e}")
            return []

        finally:
            # Close on every path: a failing query must not leave the session open
            if conn is not None:
                try:
                    conn.close()
                except Exception as close_error:
                    print(f"⚠️ Failed to close database connection: {close_error}")

# ========== INITIALIZE UNIVERSAL PARSER ==========
universal_parser = UniversalDocumentParser()

# ========== TEST PARSER ==========
# Pull a single pending filing from the database and run it through the parser
print(f"\n🧪 Testing parser with database documents...")
test_documents = universal_parser.get_documents_from_database(limit=1)

if not test_documents:
    print("📭 No documents found for testing")
else:
    print(f"📄 Found {len(test_documents)} documents to test")
    print(f"   Sample URL: {test_documents[0].get('url', 'No URL')}")

    test_results = universal_parser.parse_batch(test_documents)

    for result in test_results:
        if result.get('parsing_failed'):
            print(f"   ❌ {result.get('doc_type')}: FAILED - {result.get('error')}")
        elif result.get('error'):
            # Error reported without the parsing_failed flag
            print(f"   ❌ {result.get('doc_type')}: {result.get('error')}")
        else:
            chunks = len(result.get('markdown_chunks', []))
            has_xbrl = result.get('has_xbrl', False)
            print(f"   ✓ {result.get('doc_type')}: {chunks} chunks{'+ XBRL' if has_xbrl else ''}")

print(f"\n✅ Universal Document Parser ready!")
print(f"🔧 Supported types: {list(PARSER_REGISTRY.keys())}")
print(f"📊 Chunking: {EXTRACTION_CONFIG['chunk_size']} words, {EXTRACTION_CONFIG['overlap_size']} overlap")
print(f"⚠️ SEC filings will FAIL if EdgarTools cannot parse them (no BeautifulSoup fallback)")

# Future: Easily add new parsers
# @register_parser('clinical_trial')
# class ClinicalTrialParser(BaseDocumentParser):
#     ...

# @register_parser('patent')
# class PatentParser(BaseDocumentParser):
#     ...

In [None]:
# Cell 4: Parallel Multi-Model NER Pipeline

# ========== BASE NER MODEL INTERFACE ==========
class BaseNERModel(ABC):
    """Wrapper around one NER pipeline with score filtering and running stats."""

    def __init__(self, model_name: str, pipeline_obj, confidence_threshold: float = 0.5):
        """Store the pipeline, its name and score cut-off; zero the counters."""
        self.model_name = model_name
        self.pipeline = pipeline_obj
        self.confidence_threshold = confidence_threshold
        self.stats = dict(chunks_processed=0, entities_found=0, processing_time=0)

    def extract_entities_from_text(self, text: str, chunk_metadata: Dict = None) -> List[Dict]:
        """Run the pipeline on one chunk and return the entities that pass the cut-off.

        Each returned dict holds the stripped entity text, the normalized
        type (via ENTITY_TYPE_MAPPING, falling back to the raw label), score,
        character offsets, model name and raw label, followed by any
        `chunk_metadata` entries. Returns [] when there is no pipeline, the
        text is blank, or the pipeline raises (the error is printed).
        """
        if not self.pipeline or not text.strip():
            return []

        try:
            started_at = time.time()

            kept = []
            for hit in self.pipeline(text):
                # Drop anything scoring below this model's threshold
                if not (hit['score'] >= self.confidence_threshold):
                    continue

                # Normalize the label; unknown labels pass through unchanged
                entity_type = ENTITY_TYPE_MAPPING.get(hit['entity_group'], hit['entity_group'])

                record = {
                    'entity_text': hit['word'].strip(),
                    'entity_type': entity_type,
                    'confidence_score': float(hit['score']),
                    'char_start': hit['start'],
                    'char_end': hit['end'],
                    'model_source': self.model_name,
                    'original_label': hit['entity_group']
                }

                # Attach chunk/document context when the caller supplied it
                if chunk_metadata:
                    record.update(chunk_metadata)

                kept.append(record)

            # Counters are only touched for chunks that completed
            elapsed = time.time() - started_at
            self.stats['chunks_processed'] += 1
            self.stats['entities_found'] += len(kept)
            self.stats['processing_time'] += elapsed

            return kept

        except Exception as e:
            print(f"   ❌ {self.model_name} extraction failed: {e}")
            return []

    def process_document_chunks(self, chunks: List[Dict], doc_metadata: Dict) -> List[Dict]:
        """Extract entities from every chunk of one document.

        Chunk position fields and document fields are merged into the
        metadata attached to each entity. Results are returned in chunk order.
        """
        collected = []

        for piece in chunks:
            position = {
                'chunk_index': piece.get('chunk_index', 0),
                'word_start': piece.get('word_start', 0),
                'word_end': piece.get('word_end', 0),
            }
            provenance = {
                'doc_id': doc_metadata.get('doc_id'),
                'doc_type': doc_metadata.get('doc_type'),
                'company': doc_metadata.get('company', ''),
                'filing_type': doc_metadata.get('filing_type', ''),
                'filing_date': doc_metadata.get('filing_date'),
            }

            collected += self.extract_entities_from_text(piece['text'], {**position, **provenance})

            # Small delay to prevent overwhelming the model
            time.sleep(0.01)

        return collected

# ========== NER MODEL IMPLEMENTATIONS ==========
class MultiModelNERPipeline:
    """Routes parsed documents to NER models and merges their detections.

    Note: despite the "parallel" wording in some method names, documents and
    models are processed sequentially. ``max_workers`` is stored on the
    instance but no worker pool is created by this class.
    """

    def __init__(self, loaded_models: Dict, max_workers: int = 4):
        """Wrap the loaded pipelines and initialise run statistics.

        Args:
            loaded_models: Mapping of model name -> pipeline object; entries
                whose value is None are ignored.
            max_workers: Stored for reference only (see class docstring).
        """
        self.models = self._initialize_model_objects(loaded_models)
        self.max_workers = max_workers
        self.processing_stats = {
            'documents_processed': 0,
            'total_entities_found': 0,
            'total_processing_time': 0,
            'models_used': list(self.models.keys())
        }

        print(f"🤖 Multi-Model NER Pipeline initialized!")
        print(f"   📊 Active models: {list(self.models.keys())}")
        print(f"   🧵 Max workers: {max_workers}")

    def _initialize_model_objects(self, loaded_models: Dict) -> Dict:
        """Build one BaseNERModel wrapper per non-None pipeline.

        The confidence threshold for each model comes from
        EXTRACTION_CONFIG['confidence_thresholds'], defaulting to 0.5.
        """
        models = {}

        for model_name, pipeline_obj in loaded_models.items():
            if pipeline_obj is not None:
                confidence_threshold = EXTRACTION_CONFIG['confidence_thresholds'].get(model_name, 0.5)
                models[model_name] = BaseNERModel(model_name, pipeline_obj, confidence_threshold)
                print(f"   ✓ Initialized {model_name} (threshold: {confidence_threshold})")

        return models

    def route_document_to_models(self, parsed_doc: Dict) -> List[str]:
        """Determine which loaded models should process this document.

        Uses DOCUMENT_MODEL_ROUTING for known document types; unknown types
        with markdown content fall back to the general text models.

        Returns:
            Names of applicable models that are actually loaded, sorted so
            that the processing order is reproducible between runs.
        """
        doc_type = parsed_doc.get('doc_type', 'unknown')
        applicable_models = set()

        if doc_type in DOCUMENT_MODEL_ROUTING:
            routing = DOCUMENT_MODEL_ROUTING[doc_type]

            # Route text content to appropriate models
            if parsed_doc.get('has_markdown') and 'markdown' in routing:
                applicable_models.update(routing['markdown'])

            # Route financial data to FinBERT
            if parsed_doc.get('has_xbrl') and 'xbrl' in routing:
                applicable_models.update(routing['xbrl'])
        else:
            # Default: use all text models for unknown document types
            if parsed_doc.get('has_markdown'):
                applicable_models.update(['biobert', 'bert_base', 'roberta'])

        # Filter to loaded models only; sort because set iteration order of
        # strings is not stable across interpreter runs.
        return sorted(model for model in applicable_models if model in self.models)

    def process_single_document(self, parsed_doc: Dict) -> List[Dict]:
        """Run every applicable model over one parsed document.

        Non-FinBERT models consume ``parsed_doc['markdown_chunks']``; FinBERT
        consumes the JSON dump of ``parsed_doc['xbrl_data']`` as a single
        pseudo-chunk. Documents flagged with a truthy 'error' are skipped.

        Returns:
            All entities found by all models (unmerged), or [] when the
            document is in error or no model applies.
        """
        if parsed_doc.get('error'):
            return []

        doc_id = parsed_doc.get('doc_id', 'unknown')
        doc_type = parsed_doc.get('doc_type', 'unknown')

        print(f"🔍 Processing {doc_type} document: {doc_id}")

        # Determine applicable models
        applicable_models = self.route_document_to_models(parsed_doc)

        if not applicable_models:
            print(f"   ⚠️ No applicable models for {doc_type}")
            return []

        print(f"   🎯 Using models: {applicable_models}")

        all_entities = []

        # Process with each applicable model, one after another
        for model_name in applicable_models:
            if model_name not in self.models:
                continue

            model = self.models[model_name]
            model_entities = []

            # Process markdown content
            if model_name != 'finbert' and parsed_doc.get('markdown_chunks'):
                chunks = parsed_doc['markdown_chunks']
                print(f"   📝 {model_name}: processing {len(chunks)} text chunks")
                model_entities.extend(model.process_document_chunks(chunks, parsed_doc))

            # Process XBRL content (FinBERT only)
            elif model_name == 'finbert' and parsed_doc.get('has_xbrl'):
                xbrl_text = json.dumps(parsed_doc.get('xbrl_data', {}))
                if xbrl_text and xbrl_text != '{}':
                    print(f"   💰 {model_name}: processing XBRL data")
                    # Create pseudo-chunk for XBRL data
                    xbrl_chunks = [{'text': xbrl_text, 'chunk_index': 0, 'word_start': 0, 'word_end': len(xbrl_text.split())}]
                    model_entities.extend(model.process_document_chunks(xbrl_chunks, parsed_doc))

            print(f"      → Found {len(model_entities)} entities")
            all_entities.extend(model_entities)

        return all_entities

    def process_batch_parallel(self, parsed_documents: List[Dict]) -> List[Dict]:
        """Process a batch of parsed documents and merge overlapping entities.

        The name is historical: documents are processed sequentially. Each
        entity is stamped with 'extraction_id', 'sec_filing_ref' and
        'extraction_timestamp' before merging.

        Returns:
            The merged entity list for the whole batch.
        """
        start_time = time.time()
        all_entities = []

        print(f"\n🚀 Processing {len(parsed_documents)} documents with multi-model pipeline...")

        # Process each document in turn
        for i, doc in enumerate(parsed_documents, 1):
            print(f"\n📄 [{i}/{len(parsed_documents)}] {doc.get('doc_type', 'unknown')}: {doc.get('doc_id', 'unknown')}")

            doc_entities = self.process_single_document(doc)

            # A missing/None doc_type must not crash on .upper(); such docs
            # are still routed (as unknown type) and can yield entities.
            doc_type = doc.get('doc_type') or 'unknown'
            if doc_type == 'sec_filing':
                filing_ref = f"SEC_{doc.get('doc_id')}"
            else:
                filing_ref = f"{str(doc_type).upper()}_{doc.get('doc_id')}"

            # Add unique extraction IDs and additional metadata
            for entity in doc_entities:
                entity['extraction_id'] = str(uuid.uuid4())
                entity['sec_filing_ref'] = filing_ref
                entity['extraction_timestamp'] = datetime.now().isoformat()

            all_entities.extend(doc_entities)
            print(f"   ✓ Total entities from document: {len(doc_entities)}")

        # Merge entities at same positions
        merged_entities = self.merge_position_overlaps(all_entities)

        # Update statistics
        processing_time = time.time() - start_time
        self.processing_stats['documents_processed'] += len(parsed_documents)
        self.processing_stats['total_entities_found'] += len(merged_entities)
        self.processing_stats['total_processing_time'] += processing_time

        print(f"\n✅ Batch processing complete!")
        print(f"   📊 Documents: {len(parsed_documents)}")
        print(f"   🔍 Raw entities: {len(all_entities)}")
        print(f"   🎯 Merged entities: {len(merged_entities)}")
        print(f"   ⏱️ Processing time: {processing_time:.2f} seconds")

        return merged_entities

    def merge_position_overlaps(self, entities: List[Dict]) -> List[Dict]:
        """Merge entities that different models found at the same position.

        Character offsets are produced per chunk text, so the chunk identity
        (chunk_index plus its word range) is part of the grouping key;
        otherwise unrelated entities from different chunks of one document
        that happen to share offsets would be collapsed into one.

        Returns:
            One entity per distinct position, in order of first appearance.
            Groups with several detections are combined by
            ``_merge_entity_group``; singletons are returned unchanged.
        """
        if not entities:
            return []

        # Group entities by document, chunk and character range
        position_groups = {}

        for entity in entities:
            pos_key = (
                entity.get('doc_id'),
                entity.get('chunk_index'),
                entity.get('word_start'),
                entity.get('word_end'),
                entity.get('char_start'),
                entity.get('char_end'),
            )
            position_groups.setdefault(pos_key, []).append(entity)

        merged_entities = []
        merge_stats = {'single_model': 0, 'multi_model_merged': 0}

        for group in position_groups.values():
            if len(group) == 1:
                # Single model detection - keep as is
                merged_entities.append(group[0])
                merge_stats['single_model'] += 1
            else:
                # Multiple models detected same position - merge
                merged_entities.append(self._merge_entity_group(group))
                merge_stats['multi_model_merged'] += 1

        print(f"   🔗 Merge stats: {merge_stats['single_model']} single, {merge_stats['multi_model_merged']} merged")
        return merged_entities

    def _merge_entity_group(self, entities: List[Dict]) -> Dict:
        """Combine detections of one position into a single entity dict.

        The detection from the highest-priority model (ties broken by
        confidence) is used as the base; the result additionally records all
        contributing models, their confidences and text variations, and
        carries the highest confidence of the group. The input list and its
        dicts are left unmodified.
        """
        # Priority: BioBERT > FinBERT > RoBERTa > BERT-base for biotech domain
        priority = {'biobert': 4, 'finbert': 3, 'roberta': 2, 'bert_base': 1}

        # Sort a copy by priority, then by confidence (do not reorder the
        # caller's list).
        ranked = sorted(
            entities,
            key=lambda x: (priority.get(x['model_source'], 0), x['confidence_score']),
            reverse=True,
        )

        # Take best version but track all models
        best = ranked[0].copy()

        # Add multi-model metadata
        best['models_detected'] = [e['model_source'] for e in ranked]
        best['all_confidences'] = {e['model_source']: e['confidence_score'] for e in ranked}
        best['primary_model'] = best['model_source']
        best['entity_variations'] = {e['model_source']: e['entity_text'] for e in ranked}
        best['is_merged'] = True

        # Use highest confidence score
        best['confidence_score'] = max(e['confidence_score'] for e in ranked)

        return best

    def get_processing_summary(self) -> Dict:
        """Return pipeline statistics, per-model statistics and active config.

        Per-model stats are copied; 'pipeline_stats' is the live dict held by
        this instance.
        """
        model_stats = {}
        for model_name, model in self.models.items():
            model_stats[model_name] = model.stats.copy()

        return {
            'pipeline_stats': self.processing_stats,
            'model_stats': model_stats,
            'routing_config': DOCUMENT_MODEL_ROUTING,
            'confidence_thresholds': EXTRACTION_CONFIG['confidence_thresholds']
        }

# ========== INITIALIZE MULTI-MODEL PIPELINE ==========
if 'LOADED_MODELS' in locals():
    ner_pipeline = MultiModelNERPipeline(LOADED_MODELS, max_workers=4)

    # ========== PROCESSING FUNCTION ==========
    def process_documents_end_to_end(documents: List[Dict]) -> List[Dict]:
        """Parse (if needed) and extract entities from a list of documents.

        Args:
            documents: Either raw documents accepted by
                ``universal_parser.parse_batch`` or already-parsed dicts
                (detected by a truthy 'doc_type' on the first element).

        Returns:
            The merged entity list from ``ner_pipeline``; [] for empty input.
        """
        print(f"\n🔄 Starting end-to-end processing of {len(documents)} documents...")

        # Nothing to do; also avoids indexing documents[0] on an empty list.
        if not documents:
            print("⚠️ No documents to process")
            return []

        # Step 1: Parse documents
        if hasattr(documents[0], 'get') and documents[0].get('doc_type'):
            # Already parsed
            parsed_docs = documents
            print(f"✓ Using pre-parsed documents")
        else:
            # Parse documents
            parsed_docs = universal_parser.parse_batch(documents)
            print(f"✓ Parsed {len(parsed_docs)} documents")

        # Step 2: Extract entities with all applicable models
        entities = ner_pipeline.process_batch_parallel(parsed_docs)

        return entities

    # ========== TEST PROCESSING ==========
    print(f"\n🧪 Testing multi-model processing...")

    # Get test documents from database
    test_docs = universal_parser.get_documents_from_database(limit=1)

    if test_docs:
        print(f"📄 Testing with {len(test_docs)} documents")
        test_entities = process_documents_end_to_end(test_docs)

        # Summary by model. Only merged entities carry 'primary_model';
        # unmerged ones are attributed to their 'model_source'.
        model_breakdown = {}
        for entity in test_entities:
            model = entity.get('primary_model') or entity.get('model_source') or 'unknown'
            model_breakdown[model] = model_breakdown.get(model, 0) + 1

        print(f"\n📊 Test Results:")
        print(f"   Total entities: {len(test_entities)}")
        for model, count in model_breakdown.items():
            print(f"   {model}: {count} entities")

        # Show sample entities
        print(f"\n🔍 Sample entities:")
        for entity in test_entities[:3]:
            models = entity.get('models_detected') or [entity.get('model_source') or 'unknown']
            # str() guards the join against non-string model names
            print(f"   • {entity['entity_type']}: '{entity['entity_text']}' ({'+'.join(str(m) for m in models)}, {entity['confidence_score']:.3f})")

    else:
        print("📭 No test documents available")

    print(f"\n✅ Multi-Model NER Pipeline ready for batch processing!")
    print(f"🎯 Usage: entities = process_documents_end_to_end(documents)")

else:
    # NOTE: process_documents_end_to_end is not defined on this path.
    print("❌ LOADED_MODELS not found - ensure Cell 2 was run successfully")
    ner_pipeline = None

In [None]:
# Cell 5: Multi-Model Entity Storage with Position-Based Merging

class MultiModelEntityStorage:
    """Persist multi-model NER entities to ``system_uno.sec_entities_raw``.

    Every database method opens its own psycopg2 connection from
    ``db_config`` and closes it again, including on failure.
    """

    def __init__(self, db_config):
        """Store connection settings and zero the storage counters.

        Args:
            db_config: Keyword arguments passed to ``psycopg2.connect``.
        """
        self.db_config = db_config
        self.storage_stats = {
            'total_entities_stored': 0,
            'successful_inserts': 0,
            'failed_inserts': 0,
            'duplicate_entities': 0,
            'merged_entities': 0,
            'single_model_entities': 0
        }

    def prepare_entity_for_storage(self, entity: Dict[str, Any]) -> tuple:
        """Convert one entity dict into the 17-value row used by the INSERT.

        Text fields are truncated to fixed lengths, numeric fields are
        coerced to plain Python types, and the multi-model fields fall back
        to single-model values derived from 'model_source'.

        Returns:
            Tuple ordered exactly like the column list in ``store_entities``.
        """
        source_model = entity.get('model_source') or 'unknown'

        # Core entity data with length limits
        entity_text = str(entity.get('entity_text', '')).strip()[:1000]
        company_domain = str(entity.get('company', ''))[:255]
        entity_category = str(entity.get('entity_type', ''))[:100]
        sec_filing_ref = str(entity.get('sec_filing_ref', ''))[:255]

        # Handle surrounding text (fall back to a chunk-position label)
        surrounding_text = str(entity.get('surrounding_text', ''))
        if not surrounding_text and entity.get('chunk_index') is not None:
            surrounding_text = f"Chunk {entity.get('chunk_index', 0)}: {entity_text}"

        confidence = float(entity.get('confidence_score') or 0.0)

        # Multi-model specific fields
        models_detected = entity.get('models_detected', [source_model])
        if not isinstance(models_detected, list):
            models_detected = [str(models_detected)]
        # Keep the TEXT[] column free of NULL elements
        models_detected = [str(m) if m is not None else 'unknown' for m in models_detected]

        all_confidences = entity.get('all_confidences', {source_model: confidence})
        if not isinstance(all_confidences, dict):
            all_confidences = {source_model: confidence}
        # Coerce to plain floats so json.dumps cannot fail on other numeric
        # types (the scores are converted with float() elsewhere as well).
        all_confidences = {str(name): float(score) for name, score in all_confidences.items()}

        primary_model = str(entity.get('primary_model', source_model))

        # Entity variations from different models
        entity_variations = entity.get('entity_variations', {source_model: entity_text})
        if not isinstance(entity_variations, dict):
            entity_variations = {source_model: entity_text}

        return (
            entity.get('extraction_id'),
            company_domain,
            entity_text,
            entity_category,
            confidence,
            int(entity.get('char_start') or 0),
            int(entity.get('char_end') or 0),
            surrounding_text,
            sec_filing_ref,
            models_detected,               # ARRAY field
            json.dumps(all_confidences),   # JSONB field
            primary_model,
            json.dumps(entity_variations), # JSONB field
            entity.get('is_merged', False),
            entity.get('chunk_index', 0),
            entity.get('extraction_timestamp'),
            entity.get('original_label', '')
        )

    def create_enhanced_table_if_needed(self):
        """Add the multi-model columns and indexes if they are missing.

        Failures are reported with a printed warning and otherwise ignored
        (best effort); uncommitted changes are rolled back when the
        connection is closed.
        """
        conn = None
        try:
            conn = psycopg2.connect(**self.db_config)
            with conn.cursor() as cursor:
                # Check if enhanced columns exist
                cursor.execute("""
                    SELECT column_name
                    FROM information_schema.columns
                    WHERE table_schema = 'system_uno'
                    AND table_name = 'sec_entities_raw'
                    AND column_name IN ('models_detected', 'all_confidences', 'primary_model')
                """)

                existing_columns = [row[0] for row in cursor.fetchall()]

                # Add missing columns for multi-model support
                if 'models_detected' not in existing_columns:
                    cursor.execute('ALTER TABLE system_uno.sec_entities_raw ADD COLUMN models_detected TEXT[]')
                    print("   ✓ Added models_detected column")

                if 'all_confidences' not in existing_columns:
                    cursor.execute('ALTER TABLE system_uno.sec_entities_raw ADD COLUMN all_confidences JSONB')
                    print("   ✓ Added all_confidences column")

                if 'primary_model' not in existing_columns:
                    cursor.execute('ALTER TABLE system_uno.sec_entities_raw ADD COLUMN primary_model TEXT')
                    print("   ✓ Added primary_model column")

                # Add additional helpful columns
                cursor.execute("""
                    ALTER TABLE system_uno.sec_entities_raw
                    ADD COLUMN IF NOT EXISTS entity_variations JSONB,
                    ADD COLUMN IF NOT EXISTS is_merged BOOLEAN DEFAULT FALSE,
                    ADD COLUMN IF NOT EXISTS chunk_index INTEGER,
                    ADD COLUMN IF NOT EXISTS extraction_timestamp TIMESTAMP,
                    ADD COLUMN IF NOT EXISTS original_label TEXT
                """)

                # Create indexes for efficient querying
                cursor.execute("""
                    CREATE INDEX IF NOT EXISTS idx_sec_entities_position
                    ON system_uno.sec_entities_raw (sec_filing_ref, character_start, character_end)
                """)

                cursor.execute("""
                    CREATE INDEX IF NOT EXISTS idx_sec_entities_models
                    ON system_uno.sec_entities_raw USING GIN (models_detected)
                """)

            conn.commit()
            print("   ✓ Enhanced table structure ready")

        except Exception as e:
            print(f"   ⚠️ Table enhancement failed: {e}")
        finally:
            # Always release the connection, even when a statement failed
            if conn is not None:
                conn.close()

    def store_entities(self, entities: List[Dict[str, Any]]) -> bool:
        """Insert (or update on extraction_id conflict) a batch of entities.

        Args:
            entities: Entity dicts as produced by the NER pipeline.

        Returns:
            True when the batch was committed (or there was nothing to
            store), False when preparation or the database write failed.
        """
        if not entities:
            print("⚠️ No entities to store")
            return True

        # Ensure enhanced table structure
        self.create_enhanced_table_if_needed()

        conn = None
        try:
            conn = psycopg2.connect(**self.db_config)

            print(f"💾 Storing {len(entities)} multi-model entities...")

            # Prepare batch insert data
            insert_data = [self.prepare_entity_for_storage(entity) for entity in entities]
            merged_count = sum(1 for entity in entities if entity.get('is_merged', False))
            single_count = len(entities) - merged_count

            # Enhanced insert query with multi-model fields
            insert_query = """
                INSERT INTO system_uno.sec_entities_raw
                (extraction_id, company_domain, entity_text, entity_category,
                 confidence_score, character_start, character_end, surrounding_text,
                 sec_filing_ref, models_detected, all_confidences, primary_model,
                 entity_variations, is_merged, chunk_index, extraction_timestamp, original_label)
                VALUES %s
                ON CONFLICT (extraction_id) DO UPDATE SET
                    models_detected = EXCLUDED.models_detected,
                    all_confidences = EXCLUDED.all_confidences,
                    primary_model = EXCLUDED.primary_model,
                    entity_variations = EXCLUDED.entity_variations,
                    is_merged = EXCLUDED.is_merged
            """

            from psycopg2.extras import execute_values
            with conn.cursor() as cursor:
                execute_values(cursor, insert_query, insert_data, page_size=100)

            conn.commit()

            # execute_values runs one statement per page, so cursor.rowcount
            # only describes the last page. The upsert either inserts or
            # updates every row, or raises; after a successful commit the
            # whole batch was written.
            rows_affected = len(insert_data)

            # Update statistics
            self.storage_stats['total_entities_stored'] += len(entities)
            self.storage_stats['successful_inserts'] += rows_affected
            self.storage_stats['merged_entities'] += merged_count
            self.storage_stats['single_model_entities'] += single_count

            print(f"   ✓ Stored {rows_affected} entities")
            print(f"   🔗 Merged entities: {merged_count}")
            print(f"   🎯 Single-model entities: {single_count}")

            return True

        except Exception as e:
            print(f"   ❌ Storage failed: {e}")
            self.storage_stats['failed_inserts'] += len(entities)
            return False
        finally:
            # Closing without commit rolls back a failed batch
            if conn is not None:
                conn.close()

    def verify_multi_model_storage(self, sec_filing_ref: str) -> Dict[str, Any]:
        """Summarise what is stored for one filing reference.

        Returns:
            Dict with 'total_entities', 'entity_breakdown' (per category),
            'model_usage' (per primary model) and 'multi_model_detection'
            (rows per number of detecting models); {} if the queries fail.
        """
        def _as_float(value):
            # AVG() yields NULL when every confidence_score is NULL
            return float(value) if value is not None else 0.0

        conn = None
        try:
            conn = psycopg2.connect(**self.db_config)
            with conn.cursor() as cursor:
                # Enhanced verification query with multi-model stats
                cursor.execute("""
                    SELECT
                        entity_category,
                        COUNT(*) as count,
                        AVG(confidence_score) as avg_confidence,
                        COUNT(*) FILTER (WHERE is_merged = true) as merged_entities,
                        COUNT(*) FILTER (WHERE is_merged = false) as single_model_entities,
                        COUNT(DISTINCT primary_model) as unique_models_used
                    FROM system_uno.sec_entities_raw
                    WHERE sec_filing_ref = %s
                    GROUP BY entity_category
                    ORDER BY count DESC
                """, (sec_filing_ref,))

                category_results = cursor.fetchall()

                # Model usage statistics
                cursor.execute("""
                    SELECT
                        primary_model,
                        COUNT(*) as entities_count,
                        AVG(confidence_score) as avg_confidence
                    FROM system_uno.sec_entities_raw
                    WHERE sec_filing_ref = %s
                    GROUP BY primary_model
                    ORDER BY entities_count DESC
                """, (sec_filing_ref,))

                model_results = cursor.fetchall()

                # Multi-model detection stats
                cursor.execute("""
                    SELECT
                        array_length(models_detected, 1) as num_models,
                        COUNT(*) as count
                    FROM system_uno.sec_entities_raw
                    WHERE sec_filing_ref = %s AND models_detected IS NOT NULL
                    GROUP BY array_length(models_detected, 1)
                    ORDER BY num_models
                """, (sec_filing_ref,))

                multi_model_stats = cursor.fetchall()

            verification = {
                'total_entities': sum(result[1] for result in category_results),
                'entity_breakdown': [{
                    'category': result[0],
                    'count': result[1],
                    'avg_confidence': _as_float(result[2]),
                    'merged_entities': result[3],
                    'single_model_entities': result[4],
                    'unique_models': result[5]
                } for result in category_results],
                'model_usage': [{
                    'model': result[0],
                    'entities': result[1],
                    'avg_confidence': _as_float(result[2])
                } for result in model_results],
                'multi_model_detection': [{
                    'num_models': result[0] or 1,
                    'entities': result[1]
                } for result in multi_model_stats]
            }

            return verification

        except Exception as e:
            print(f"   ❌ Verification failed: {e}")
            return {}
        finally:
            if conn is not None:
                conn.close()

    def get_storage_summary(self) -> Dict[str, Any]:
        """Return a copy of the cumulative storage counters."""
        return self.storage_stats.copy()

    def query_entities_by_models(self, models: List[str], limit: int = 10) -> List[Dict]:
        """Fetch the highest-confidence entities detected by any given model.

        Args:
            models: Model names; a row matches when its ``models_detected``
                array overlaps this list.
            limit: Maximum number of rows returned.

        Returns:
            List of entity dicts, or [] if the query fails.
        """
        def _decode_json(value):
            # psycopg2 normally hands JSONB back already decoded; only parse
            # when the driver returned raw text.
            if not value:
                return {}
            if isinstance(value, (str, bytes, bytearray)):
                return json.loads(value)
            return value

        conn = None
        try:
            conn = psycopg2.connect(**self.db_config)
            with conn.cursor() as cursor:
                # Query entities detected by any of the specified models
                cursor.execute("""
                    SELECT
                        entity_text, entity_category, confidence_score,
                        models_detected, all_confidences, primary_model,
                        is_merged, sec_filing_ref
                    FROM system_uno.sec_entities_raw
                    WHERE models_detected && %s
                    ORDER BY confidence_score DESC
                    LIMIT %s
                """, (models, limit))

                results = cursor.fetchall()

            return [{
                'entity_text': result[0],
                'entity_category': result[1],
                'confidence_score': result[2],
                'models_detected': result[3],
                'all_confidences': _decode_json(result[4]),
                'primary_model': result[5],
                'is_merged': result[6],
                'sec_filing_ref': result[7]
            } for result in results]

        except Exception as e:
            print(f"❌ Query failed: {e}")
            return []
        finally:
            if conn is not None:
                conn.close()

# ========== INITIALIZE ENHANCED STORAGE ==========
storage = MultiModelEntityStorage(NEON_CONFIG)
print("✓ Multi-Model Entity Storage initialized!")

# ========== COMPLETE PROCESSING PIPELINE ==========
def process_and_store_documents(documents: List[Dict], store_results: bool = True) -> Dict[str, Any]:
    """Complete pipeline: Parse → Extract → Store.

    Args:
        documents: Documents accepted by ``process_documents_end_to_end``.
        store_results: When True, entities are written through ``storage``
            and a verification summary for the first entity's filing
            reference is attached.

    Returns:
        Result dict with 'success', 'documents_processed',
        'entities_extracted', 'pipeline_time', 'verification' and
        'entities_sample'; when nothing was extracted 'success' is False and
        a 'message' is included.
    """
    print(f"\n🚀 Complete processing pipeline for {len(documents)} documents...")

    pipeline_start = time.time()

    # Step 1: Process documents through multi-model pipeline
    # (an empty batch is short-circuited rather than handed downstream)
    entities = process_documents_end_to_end(documents) if documents else []

    if not entities:
        # Same keys as the success result so callers can read them blindly
        return {
            'success': False,
            'message': 'No entities extracted',
            'documents_processed': len(documents),
            'entities_extracted': 0,
            'pipeline_time': round(time.time() - pipeline_start, 2),
            'verification': {},
            'entities_sample': []
        }

    # Step 2: Store entities if requested
    storage_success = True
    if store_results:
        storage_success = storage.store_entities(entities)

    # Step 3: Verification and summary
    pipeline_time = time.time() - pipeline_start

    # Get verification for first document
    verification = {}
    if store_results:
        first_filing_ref = entities[0].get('sec_filing_ref')
        if first_filing_ref:
            verification = storage.verify_multi_model_storage(first_filing_ref)

    # Compile results
    result = {
        'success': storage_success,
        'documents_processed': len(documents),
        'entities_extracted': len(entities),
        'pipeline_time': round(pipeline_time, 2),
        'verification': verification,
        'entities_sample': entities[:5]  # First 5 entities as sample
    }

    print(f"\n✅ Pipeline Complete!")
    print(f"   📊 Documents: {result['documents_processed']}")
    print(f"   🔍 Entities: {result['entities_extracted']}")
    print(f"   ⏱️ Time: {result['pipeline_time']} seconds")
    print(f"   💾 Storage: {'✓' if storage_success else '❌'}")

    return result

# ========== TEST COMPLETE PIPELINE ==========
print(f"\n🧪 Testing complete pipeline...")

# Get test documents
test_documents = universal_parser.get_documents_from_database(limit=1)

if test_documents:
    print(f"📄 Testing with {len(test_documents)} SEC filings")

    # Run complete pipeline
    test_results = process_and_store_documents(test_documents, store_results=True)

    if test_results['success']:
        print(f"\n📊 Test Results Summary:")
        print(f"   Entities extracted: {test_results['entities_extracted']}")

        # Show verification details
        if test_results['verification']:
            print(f"   Entity categories:")
            for cat in test_results['verification']['entity_breakdown'][:3]:
                print(f"      • {cat['category']}: {cat['count']} entities ({cat['merged_entities']} merged)")

            print(f"   Model usage:")
            for model in test_results['verification']['model_usage']:
                print(f"      • {model['model']}: {model['entities']} entities (avg conf: {model['avg_confidence']:.3f})")

        # Show sample entities
        print(f"\n🔍 Sample entities:")
        for entity in test_results['entities_sample']:
            detected = entity.get('models_detected') or [entity.get('model_source') or 'unknown']
            models = '+'.join(str(m) for m in detected)
            is_merged = " (merged)" if entity.get('is_merged') else ""
            print(f"   • {entity['entity_type']}: '{entity['entity_text']}' ({models}{is_merged}, {entity['confidence_score']:.3f})")

    else:
        print(f"   ❌ Pipeline test failed")

else:
    print("📭 No test documents available")

print(f"\n✅ Multi-Model Entity Storage ready!")
print(f"🎯 Usage: result = process_and_store_documents(documents)")
print(f"📊 Query: entities = storage.query_entities_by_models(['biobert', 'roberta'])")

In [None]:
# Cell 6: Multi-Model Batch Processing and Advanced Analytics

def run_multi_model_batch_extraction(batch_size: int = 3, max_filings: int = None) -> Dict[str, Any]:
    """Run the extract-and-store pipeline over SEC filings in fixed-size batches.

    Filings are fetched with ``universal_parser.get_documents_from_database``
    and each slice of ``batch_size`` filings is passed to
    ``process_and_store_documents(..., store_results=True)``. A failing batch
    (either ``success`` false or an exception) is recorded and processing
    continues with the next batch.

    Args:
        batch_size: Number of filings handed to the pipeline per call; must be
            at least 1.
        max_filings: Upper bound on the number of filings fetched. A falsy
            value (``None`` or ``0``) falls back to a default cap of 50.

    Returns:
        When no filings are found: ``{'success': False, 'message': ...}``.
        Otherwise a dict with the keys ``total_filings``,
        ``successful_filings``, ``failed_filings``,
        ``total_entities_extracted``, ``merged_entities``,
        ``single_model_entities``, ``model_usage``, ``entity_categories``,
        ``processing_start_time``, ``processing_end_time``,
        ``total_processing_time`` (string form of the elapsed timedelta),
        ``success_rate`` and ``results`` (one entry per batch).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Note:
        ``merged_entities``, ``single_model_entities``, ``model_usage`` and
        ``entity_categories`` are tallied from each batch result's
        ``'entities_sample'`` list only, so they describe a sample of the
        extracted entities rather than all of them.
    """
    # Fail fast, before touching the database: a zero step would otherwise
    # surface as an opaque range() error and a negative one would silently
    # process nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    default_filing_cap = 50
    filing_limit = max_filings or default_filing_cap
    # Report the limit that is really applied (the fallback is a cap, not "unlimited").
    limit_label = str(filing_limit) if max_filings else f"{filing_limit} (default cap)"

    print("🚀 Starting multi-model batch entity extraction...")
    print(f"   📦 Batch size: {batch_size}")
    print(f"   📊 Max filings: {limit_label}")
    print(f"   🤖 Active models: {list(ner_pipeline.models.keys()) if 'ner_pipeline' in globals() else 'Not loaded'}")

    # Get filings to process
    filings_to_process = universal_parser.get_documents_from_database(limit=filing_limit)

    if not filings_to_process:
        print("📭 No unprocessed filings found")
        return {'success': False, 'message': 'No filings to process'}

    total_filings = len(filings_to_process)
    print(f"   📄 Found {total_filings} filings to process")

    # Initialize batch tracking with multi-model stats
    batch_results = {
        'total_filings': total_filings,
        'successful_filings': 0,
        'failed_filings': 0,
        'total_entities_extracted': 0,
        'merged_entities': 0,
        'single_model_entities': 0,
        'model_usage': {},
        'entity_categories': {},
        'processing_start_time': datetime.now(),
        'results': []
    }

    # Ceiling division; loop-invariant, so computed once.
    total_batches = (total_filings + batch_size - 1) // batch_size

    # Process filings in batches
    for batch_number, start in enumerate(range(0, total_filings, batch_size), start=1):
        batch_filings = filings_to_process[start:start + batch_size]

        print(f"\n📦 Processing batch {batch_number}/{total_batches} ({len(batch_filings)} filings)")

        try:
            # Process batch through complete pipeline
            batch_result = process_and_store_documents(batch_filings, store_results=True)

            if batch_result['success']:
                batch_results['successful_filings'] += batch_result['documents_processed']
                batch_results['total_entities_extracted'] += batch_result['entities_extracted']

                # Track multi-model statistics (sample-based, see docstring note)
                model_usage = batch_results['model_usage']
                entity_categories = batch_results['entity_categories']
                for entity in batch_result.get('entities_sample', []):
                    primary_model = entity.get('primary_model', 'unknown')
                    model_usage[primary_model] = model_usage.get(primary_model, 0) + 1

                    entity_type = entity.get('entity_type', 'unknown')
                    entity_categories[entity_type] = entity_categories.get(entity_type, 0) + 1

                    if entity.get('is_merged', False):
                        batch_results['merged_entities'] += 1
                    else:
                        batch_results['single_model_entities'] += 1

                print(f"   ✅ Batch {batch_number}: {batch_result['entities_extracted']} entities extracted")
            else:
                batch_results['failed_filings'] += len(batch_filings)
                print(f"   ❌ Batch {batch_number}: Processing failed")

            batch_results['results'].append(batch_result)

        except Exception as e:
            # Best effort: one broken batch must not abort the whole run.
            batch_results['failed_filings'] += len(batch_filings)
            batch_results['results'].append({
                'success': False,
                'error': str(e),
                'documents_processed': len(batch_filings)
            })
            print(f"   ❌ Batch {batch_number}: Exception: {e}")

        # Brief pause between batches (skipped after the last one)
        if start + batch_size < total_filings:
            print("   ⏸️ Brief pause between batches...")
            time.sleep(3)

    # Finalize results
    batch_results['processing_end_time'] = datetime.now()
    batch_results['total_processing_time'] = str(
        batch_results['processing_end_time'] - batch_results['processing_start_time']
    )
    batch_results['success_rate'] = (
        batch_results['successful_filings'] / batch_results['total_filings']
        if batch_results['total_filings'] > 0 else 0
    )

    return batch_results

def _format_confidence(value) -> str:
    """Render an average confidence with three decimals, or 'n/a' when it is None.

    SQL ``AVG`` yields NULL (``None`` in Python) when there are no non-null
    scores to average, which a bare ``:.3f`` format cannot handle.
    """
    return "n/a" if value is None else f"{value:.3f}"


def _parse_confidences(raw) -> dict:
    """Normalise an ``all_confidences`` column value into a dict.

    Accepts an already-decoded dict or a JSON string. Anything empty,
    undecodable or not a JSON object becomes ``{}``.
    """
    if not raw:
        return {}
    if isinstance(raw, dict):
        return raw
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        return {}
    return parsed if isinstance(parsed, dict) else {}


def _fetch_report_statistics():
    """Run the four report queries against ``system_uno.sec_entities_raw``.

    Returns:
        ``(db_stats, model_breakdown, entity_breakdown, multi_model_stats)``.

    The cursor and the connection are closed even when a query raises.
    """
    conn = psycopg2.connect(**NEON_CONFIG)
    try:
        cursor = conn.cursor()
        try:
            # Enhanced database statistics
            cursor.execute('''
                SELECT
                    COUNT(*) as total_entities,
                    COUNT(DISTINCT company_domain) as companies_processed,
                    COUNT(DISTINCT sec_filing_ref) as filings_processed,
                    COUNT(DISTINCT entity_category) as entity_types,
                    AVG(confidence_score) as avg_confidence,
                    COUNT(*) FILTER (WHERE is_merged = true) as merged_entities,
                    COUNT(*) FILTER (WHERE is_merged = false) as single_model_entities
                FROM system_uno.sec_entities_raw
                WHERE models_detected IS NOT NULL
            ''')
            db_stats = cursor.fetchone()

            # Model usage breakdown
            cursor.execute('''
                SELECT primary_model, COUNT(*) as count, AVG(confidence_score) as avg_conf
                FROM system_uno.sec_entities_raw
                WHERE primary_model IS NOT NULL
                GROUP BY primary_model
                ORDER BY count DESC
            ''')
            model_breakdown = cursor.fetchall()

            # Entity category breakdown
            cursor.execute('''
                SELECT entity_category, COUNT(*) as count, AVG(confidence_score) as avg_conf,
                       COUNT(*) FILTER (WHERE is_merged = true) as merged_count
                FROM system_uno.sec_entities_raw
                GROUP BY entity_category
                ORDER BY count DESC
                LIMIT 10
            ''')
            entity_breakdown = cursor.fetchall()

            # Multi-model detection statistics
            cursor.execute('''
                SELECT
                    array_length(models_detected, 1) as num_models,
                    COUNT(*) as count
                FROM system_uno.sec_entities_raw
                WHERE models_detected IS NOT NULL
                GROUP BY array_length(models_detected, 1)
                ORDER BY num_models
            ''')
            multi_model_stats = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()

    return db_stats, model_breakdown, entity_breakdown, multi_model_stats


def generate_multi_model_extraction_report(batch_results: Dict[str, Any] = None) -> None:
    """Print a multi-section report on multi-model entity extraction.

    Sections, in order: database statistics (queried live), batch processing
    results (only when ``batch_results`` is given), pipeline statistics (only
    when a truthy global ``ner_pipeline`` exists) and storage statistics (only
    when a global ``storage`` exists).

    Args:
        batch_results: Optional dict as returned by
            ``run_multi_model_batch_extraction``. The short
            ``{'success': False, 'message': ...}`` form returned when no
            filings were found is reported as a notice instead of raising.

    Returns:
        None. All output goes to stdout. Database errors are printed, not raised.
    """
    print("\n" + "="*70)
    print("📊 MULTI-MODEL SEC ENTITY EXTRACTION REPORT")
    print("="*70)

    # Current database status with multi-model analysis
    try:
        db_stats, model_breakdown, entity_breakdown, multi_model_stats = _fetch_report_statistics()

        print("\n📈 DATABASE STATISTICS:")
        if db_stats and db_stats[0]:
            total = db_stats[0]
            print(f"   Total Entities Extracted: {total:,}")
            print(f"   Companies Processed: {db_stats[1]}")
            print(f"   SEC Filings Processed: {db_stats[2]}")
            print(f"   Entity Types Found: {db_stats[3]}")
            print(f"   Average Confidence: {_format_confidence(db_stats[4])}")
            print(f"   Merged Entities: {db_stats[5]:,} ({(db_stats[5]/total*100):.1f}%)")
            print(f"   Single-Model Entities: {db_stats[6]:,} ({(db_stats[6]/total*100):.1f}%)")

        if model_breakdown:
            print("\n🤖 MODEL USAGE BREAKDOWN:")
            for model_data in model_breakdown:
                print(f"   {model_data[0]}: {model_data[1]:,} entities (avg conf: {_format_confidence(model_data[2])})")

        if entity_breakdown:
            print("\n🏷️ ENTITY CATEGORY BREAKDOWN:")
            for entity_data in entity_breakdown:
                merged_pct = (entity_data[3] / entity_data[1] * 100) if entity_data[1] > 0 else 0
                print(f"   {entity_data[0]}: {entity_data[1]:,} entities (avg conf: {_format_confidence(entity_data[2])}, {merged_pct:.1f}% merged)")

        if multi_model_stats:
            print("\n🔗 MULTI-MODEL DETECTION STATS:")
            for stat in multi_model_stats:
                num_models = stat[0] or 1  # array_length is NULL for an empty array
                print(f"   {num_models} model(s): {stat[1]:,} entities")

    except Exception as e:
        print(f"   ❌ Could not retrieve database statistics: {e}")

    # Batch processing results
    if batch_results:
        print("\n⚡ BATCH PROCESSING RESULTS:")
        if 'total_filings' not in batch_results:
            # The batch runner returns only {'success', 'message'} when it
            # found nothing to process; there are no counters to print.
            print(f"   ⚠️ No batch statistics available: {batch_results.get('message', 'no details provided')}")
        else:
            print(f"   Total Filings Processed: {batch_results['total_filings']}")
            print(f"   Successful: {batch_results['successful_filings']} ({batch_results['success_rate']*100:.1f}%)")
            print(f"   Failed: {batch_results['failed_filings']}")
            print(f"   Total Entities Extracted: {batch_results['total_entities_extracted']:,}")
            print(f"   Merged Entities: {batch_results.get('merged_entities', 0):,}")
            print(f"   Single-Model Entities: {batch_results.get('single_model_entities', 0):,}")
            print(f"   Processing Time: {batch_results['total_processing_time']}")

            if batch_results.get('model_usage'):
                print("\n🎯 BATCH MODEL USAGE:")
                for model, count in batch_results['model_usage'].items():
                    print(f"   {model}: {count} entities")

            if batch_results.get('entity_categories'):
                print("\n📋 BATCH ENTITY CATEGORIES:")
                sorted_categories = sorted(batch_results['entity_categories'].items(), key=lambda x: x[1], reverse=True)
                for category, count in sorted_categories[:5]:
                    print(f"   {category}: {count} entities")

    # Pipeline statistics
    if 'ner_pipeline' in globals() and ner_pipeline:
        summary = ner_pipeline.get_processing_summary()
        print("\n🔍 PIPELINE STATISTICS:")
        pipeline_stats = summary['pipeline_stats']
        print(f"   Documents Processed: {pipeline_stats['documents_processed']:,}")
        print(f"   Total Entities Found: {pipeline_stats['total_entities_found']:,}")
        print(f"   Total Processing Time: {pipeline_stats['total_processing_time']:.2f} seconds")
        print(f"   Active Models: {', '.join(pipeline_stats['models_used'])}")

        print("\n📊 INDIVIDUAL MODEL STATISTICS:")
        for model_name, stats in summary['model_stats'].items():
            print(f"   {model_name}:")
            print(f"      Chunks processed: {stats['chunks_processed']:,}")
            print(f"      Entities found: {stats['entities_found']:,}")
            print(f"      Processing time: {stats['processing_time']:.2f}s")

    # Storage statistics
    if 'storage' in globals():
        storage_stats = storage.get_storage_summary()
        print("\n💾 STORAGE STATISTICS:")
        print(f"   Entities Stored: {storage_stats['successful_inserts']:,}")
        print(f"   Merged Entities: {storage_stats['merged_entities']:,}")
        print(f"   Single-Model Entities: {storage_stats['single_model_entities']:,}")
        print(f"   Failed Inserts: {storage_stats['failed_inserts']:,}")

    print("\n" + "="*70)
    print("✅ Multi-Model Extraction Report Complete!")
    print("="*70)

# ========== ADVANCED ANALYTICS FUNCTIONS ==========

def analyze_model_agreement(filing_ref: str = None, limit: int = 100) -> Dict:
    """Analyze agreement between different models on entity detection.

    Reads up to ``limit`` rows (highest ``confidence_score`` first) from
    ``system_uno.sec_entities_raw`` where ``models_detected`` is set, optionally
    restricted to one filing, and classifies every multi-model entity by the
    standard deviation of its per-model confidences.

    Args:
        filing_ref: When truthy, only rows with this ``sec_filing_ref`` are read.
        limit: Maximum number of rows to read; sent as a bound query parameter.

    Returns:
        A dict with ``total_entities``, ``agreement_stats`` (number of
        detecting models -> entity count), ``high_agreement_entities``
        (std dev < 0.1), ``disagreement_entities`` (std dev >= 0.1) and
        ``model_pairs`` (always returned empty by this function).
        Returns ``{}`` if anything fails; the error is printed.
    """
    try:
        query = """
            SELECT entity_text, models_detected, all_confidences, is_merged
            FROM system_uno.sec_entities_raw
            WHERE models_detected IS NOT NULL
        """

        params = []
        if filing_ref:
            query += " AND sec_filing_ref = %s"
            params.append(filing_ref)

        # Bind LIMIT as a parameter instead of formatting it into the SQL text.
        query += " ORDER BY confidence_score DESC LIMIT %s"
        params.append(int(limit))

        conn = psycopg2.connect(**NEON_CONFIG)
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(query, params)
                results = cursor.fetchall()
            finally:
                cursor.close()
        finally:
            conn.close()

        analysis = {
            'total_entities': len(results),
            'agreement_stats': {},
            'model_pairs': {},
            'high_agreement_entities': [],
            'disagreement_entities': []
        }

        for entity_text, models, raw_confidences, _is_merged in results:
            num_models = len(models) if models else 1
            analysis['agreement_stats'][num_models] = analysis['agreement_stats'].get(num_models, 0) + 1

            confidences = _parse_confidences(raw_confidences)

            # Only entities seen by several models with several scores can agree or disagree.
            if num_models > 1 and len(confidences) > 1:
                conf_std = np.std(list(confidences.values()))
                record = {
                    'entity': entity_text,
                    'models': models,
                    'confidences': confidences,
                    'std_dev': conf_std
                }
                # Low standard deviation = high agreement
                bucket = 'high_agreement_entities' if conf_std < 0.1 else 'disagreement_entities'
                analysis[bucket].append(record)

        return analysis

    except Exception as e:
        print(f"❌ Model agreement analysis failed: {e}")
        return {}

def get_top_entities_by_model(model_name: str, limit: int = 10) -> List[Dict]:
    """Get top entities detected by a specific model.

    Delegates to ``storage.query_entities_by_models([model_name], limit)`` when
    a global ``storage`` object exists; otherwise returns an empty list.
    """
    if 'storage' in globals():
        return storage.query_entities_by_models([model_name], limit)
    return []

# ========== GENERATE INITIAL REPORT ==========
generate_multi_model_extraction_report()

# ========== READY FOR BATCH PROCESSING ==========
print("\n🎯 MULTI-MODEL BATCH PROCESSING COMMANDS:")
print("   • Small batch: batch_results = run_multi_model_batch_extraction(batch_size=2, max_filings=5)")
print("   • Medium batch: batch_results = run_multi_model_batch_extraction(batch_size=3, max_filings=10)")
print("   • Analysis: agreement = analyze_model_agreement(limit=50)")
print("   • Model query: entities = get_top_entities_by_model('biobert', 10)")
print("   • New report: generate_multi_model_extraction_report(batch_results)")

print("\n✅ Multi-Model SEC Entity Extraction Engine fully operational!")
print("🚀 Ready to process biotech SEC filings with 4 parallel NER models!")
print("🔧 Features: EdgarTools parsing, position-based merging, multi-model analytics")