# SEC Entity & Relationship Extraction Pipeline - Clean Architecture
## Streamlined implementation with zero technical debt

In [None]:
# Cell 1: Configuration Management

import os
import re
import time
import uuid
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Dict, List, Optional, Any

class RelationshipType(Enum):
    """Standard biotech relationship types.

    Each member's value is the same uppercase string as its name, so a member
    can be looked up from a stored label with ``RelationshipType(label)``.
    """
    COMPANY_ENTITY = "COMPANY_ENTITY"
    PARTNERSHIP = "PARTNERSHIP"
    REGULATORY = "REGULATORY"
    CLINICAL_TRIAL = "CLINICAL_TRIAL"
    FINANCIAL = "FINANCIAL"
    LICENSING = "LICENSING"
    COMPETITIVE = "COMPETITIVE"
    SUPPLY_CHAIN = "SUPPLY_CHAIN"
    RESEARCH = "RESEARCH"
    ACQUISITION = "ACQUISITION"

@dataclass
class DatabaseConfig:
    """Connection and pool settings for the PostgreSQL database.

    Each connection field is resolved when an instance is created: the
    matching environment variable (DB_HOST, DB_NAME, DB_USER, DB_PASSWORD,
    DB_SSLMODE) wins when it is set to a non-empty value, otherwise the
    built-in default is used. Explicit constructor arguments override both.
    """
    # SECURITY: the fallback values below (in particular the password) are
    # committed in source. They are kept only so existing notebooks keep
    # working; rotate the credential, supply it via DB_PASSWORD, and then
    # remove the literal fallback.
    host: str = field(
        default_factory=lambda: os.getenv('DB_HOST')
        or 'ep-royal-star-ad1gn0d4-pooler.c-2.us-east-1.aws.neon.tech'
    )
    database: str = field(default_factory=lambda: os.getenv('DB_NAME') or 'BizIntelSmartReach')
    user: str = field(default_factory=lambda: os.getenv('DB_USER') or 'neondb_owner')
    password: str = field(default_factory=lambda: os.getenv('DB_PASSWORD') or 'npg_aTFt6Pug3Kpy')
    sslmode: str = field(default_factory=lambda: os.getenv('DB_SSLMODE') or 'require')
    # Lower / upper bound on the number of pooled connections.
    pool_min_connections: int = 2
    pool_max_connections: int = 10

@dataclass
class ModelConfig:
    """Model identifiers and filtering settings for the NER stage."""
    # Hugging Face model ids, one per NER pipeline loaded by the entity extractor.
    biobert_model: str = 'alvaroalon2/biobert_diseases_ner'
    bert_model: str = 'dslim/bert-base-NER'
    finbert_model: str = 'ProsusAI/finbert'
    roberta_model: str = 'Jean-Baptiste/roberta-large-ner-english'
    # Entities whose model score is below this value are discarded.
    confidence_threshold: float = 0.5
    device: str = 'auto'  # 'auto', 'cpu', or 'cuda'
    
    # Essential filtering configuration
    # Entity groups to drop from biobert output.
    biobert_skip_categories: List[str] = field(default_factory=lambda: ['0'])
    # Entity texts (compared case-insensitively) to drop from finbert output.
    finbert_common_words: List[str] = field(default_factory=lambda: ['the', 'and', 'or', 'but', 'company', 'inc'])
    # When True, entities in the 'MISC' group are dropped from bert output.
    bert_skip_misc: bool = True

@dataclass
class LlamaConfig:
    """Settings for the Llama relationship-analysis calls."""
    # Model id handed to the Hugging Face InferenceClient.
    model_id: str = 'meta-llama/Meta-Llama-3.1-8B-Instruct'
    # Generation parameters passed to text_generation.
    max_new_tokens: int = 500
    temperature: float = 0.7
    # Seconds to wait for each per-entity analysis result.
    timeout_seconds: int = 30
    # Number of entities analyzed per batch.
    batch_size: int = 10
    # Version label stored alongside events, validations and sessions.
    prompt_version: str = '1.0'

@dataclass
class SemanticConfig:
    """Settings for semantic summaries and entity context extraction."""
    # Presumably the character cap for semantic summaries - TODO confirm against callers.
    max_summary_length: int = 200
    # Characters of section text taken on each side of an entity as context.
    context_window_chars: int = 500
    # NOTE(review): consumers of the two fields below are not visible in this file.
    confidence_threshold: float = 0.5
    cache_ttl_seconds: int = 900

@dataclass
class ProcessingConfig:
    """Batching, retry and parallelism settings for the pipeline."""
    batch_size: int = 100
    # NOTE(review): retry settings have no visible consumer in this file - confirm usage.
    max_retries: int = 3
    retry_delay_seconds: int = 1
    parallel_workers: int = 4
    # Default number of filings fetched per orchestrator batch.
    filing_batch_size: int = 5

@dataclass
class PipelineConfig:
    """Master configuration for entire pipeline.

    Aggregates the per-area config objects plus feature flags. Use
    ``from_env()`` to build an instance with environment overrides applied.
    """
    database: DatabaseConfig = field(default_factory=DatabaseConfig)
    models: ModelConfig = field(default_factory=ModelConfig)
    llama: LlamaConfig = field(default_factory=LlamaConfig)
    semantic: SemanticConfig = field(default_factory=SemanticConfig)
    processing: ProcessingConfig = field(default_factory=ProcessingConfig)

    # Feature flags
    enable_llama: bool = True
    enable_validation: bool = True
    enable_caching: bool = True

    # Identity string handed to set_identity() before EDGAR access.
    edgar_identity: str = "SmartReach BizIntel amir@leanbio.consulting"

    @classmethod
    def from_env(cls):
        """Build a config with defaults, then apply environment overrides.

        Recognized variables (ignored when unset or empty):
            LLAMA_ENABLED: '1', 'true', 'yes' or 'on' (case-insensitive,
                surrounding whitespace ignored) enables Llama; any other
                non-empty value disables it.
            DB_HOST: replaces ``database.host``.
            LLAMA_MODEL: replaces ``llama.model_id``.

        Returns:
            A new PipelineConfig instance.
        """
        config = cls()

        # Read each variable once so the check and the use cannot disagree.
        llama_enabled = os.getenv('LLAMA_ENABLED')
        if llama_enabled:
            # Accept the usual truthy spellings, not only the literal 'true'.
            config.enable_llama = llama_enabled.strip().lower() in ('1', 'true', 'yes', 'on')

        db_host = os.getenv('DB_HOST')
        if db_host:
            config.database.host = db_host

        llama_model = os.getenv('LLAMA_MODEL')
        if llama_model:
            config.llama.model_id = llama_model

        return config

# Initialize configuration
# CONFIG is the module-level PipelineConfig instance read by the later cells;
# it is built once here, with environment overrides applied by from_env().
CONFIG = PipelineConfig.from_env()

# Echo the effective settings so a notebook run shows what it is using.
print("✅ Configuration loaded")
print(f"   Database: {CONFIG.database.host}")
print(f"   Llama: {'Enabled' if CONFIG.enable_llama else 'Disabled'}")
print(f"   Batch size: {CONFIG.processing.batch_size}")

In [None]:
# Cell 2: Database Connection Manager

import psycopg2
from psycopg2 import pool
from psycopg2.extras import execute_values, RealDictCursor
from contextlib import contextmanager
import logging
from typing import Generator, List, Dict, Any, Optional

# Configure root logging at INFO level and create the module-level logger
# that the classes and functions in the following cells write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DatabaseManager:
    """Centralized database connection management with pooling.

    Owns a psycopg2 ``ThreadedConnectionPool`` built from a DatabaseConfig and
    exposes context managers that borrow a connection/cursor, commit on
    success, roll back on error, and always return the connection to the pool.
    """

    def __init__(self, config: DatabaseConfig):
        """Store the config and open the connection pool immediately.

        Raises:
            Exception: whatever psycopg2 raises if the pool cannot be created.
        """
        self.config = config
        self._pool = None
        self._initialize_pool()

    def _initialize_pool(self):
        """Create the connection pool; log and re-raise on failure."""
        try:
            self._pool = psycopg2.pool.ThreadedConnectionPool(
                minconn=self.config.pool_min_connections,
                maxconn=self.config.pool_max_connections,
                host=self.config.host,
                database=self.config.database,
                user=self.config.user,
                password=self.config.password,
                sslmode=self.config.sslmode
            )
            logger.info(f"Database pool initialized with {self.config.pool_max_connections} connections")
        except Exception as e:
            logger.error(f"Failed to initialize database pool: {e}")
            raise

    @contextmanager
    def get_connection(self) -> Generator:
        """Yield a pooled connection; commit on success, roll back on error.

        The connection is returned to the pool in every case.

        Raises:
            psycopg2.pool.PoolError: if the pool has already been closed.
        """
        if self._pool is None:
            # Same exception type psycopg2 itself raises for a closed pool.
            raise psycopg2.pool.PoolError("connection pool is closed")

        conn = None
        try:
            conn = self._pool.getconn()
            yield conn
            conn.commit()
        except Exception:
            if conn:
                conn.rollback()
            # Bare raise keeps the original traceback intact.
            raise
        finally:
            if conn:
                self._pool.putconn(conn)

    @contextmanager
    def get_cursor(self, dict_cursor=False) -> Generator:
        """Yield a cursor on a pooled connection and close it afterwards.

        Args:
            dict_cursor: when True the cursor is a RealDictCursor, so rows
                come back as dict-like objects instead of tuples.
        """
        with self.get_connection() as conn:
            cursor_factory = RealDictCursor if dict_cursor else None
            cursor = conn.cursor(cursor_factory=cursor_factory)
            try:
                yield cursor
            finally:
                cursor.close()

    def batch_insert(self, table: str, columns: List[str], data: List[tuple],
                    on_conflict: Optional[str] = None, page_size: int = 1000) -> int:
        """Insert many rows in one transaction and return the affected-row count.

        Rows are sent in chunks of ``page_size``; each chunk is exactly one
        INSERT statement, so ``cursor.rowcount`` can be summed per chunk.
        (Letting execute_values paginate internally would leave rowcount
        reflecting only the final page.)

        Args:
            table: target table name. Interpolated into the SQL text, so it
                must come from trusted code, never from user input.
            columns: column names, same trust requirement as ``table``.
            data: row tuples matching ``columns``.
            on_conflict: optional trailing clause such as
                ``"ON CONFLICT (id) DO NOTHING"`` (trusted text).
            page_size: maximum rows per INSERT statement (minimum 1).

        Returns:
            Total number of rows reported as inserted across all chunks.
        """
        if not data:
            return 0

        rows = data if isinstance(data, (list, tuple)) else list(data)
        chunk_size = max(1, page_size)

        columns_str = ', '.join(columns)
        query = f"INSERT INTO {table} ({columns_str}) VALUES %s"
        if on_conflict:
            query += f" {on_conflict}"

        inserted = 0
        with self.get_cursor() as cursor:
            for start in range(0, len(rows), chunk_size):
                chunk = rows[start:start + chunk_size]
                # page_size == len(chunk) guarantees a single statement per call.
                execute_values(cursor, query, chunk, page_size=len(chunk))
                inserted += cursor.rowcount
        return inserted

    def execute_query(self, query: str, params: Optional[tuple] = None) -> List[Dict]:
        """Execute a query and return its rows as a list of dict-like rows.

        Statements that produce no result set (e.g. UPDATE without
        RETURNING) return an empty list instead of raising on fetch.
        """
        with self.get_cursor(dict_cursor=True) as cursor:
            cursor.execute(query, params)
            if cursor.description is None:
                return []
            return cursor.fetchall()

    def close(self):
        """Close all pooled connections; safe to call more than once."""
        if self._pool:
            self._pool.closeall()
            # Drop the reference so a second close() is a no-op.
            self._pool = None
            logger.info("Database pool closed")

# Initialize database manager
# Constructing DatabaseManager opens the connection pool immediately, so this
# line raises if the database described by CONFIG.database is unreachable.
db_manager = DatabaseManager(CONFIG.database)
print("✅ Database manager initialized with connection pooling")

In [None]:
# Cell 3: Entity Extraction Module - Optimized to Use Cell 2's Output

# NOTE(review): set_identity is not imported anywhere in the visible source -
# presumably edgartools' `edgar.set_identity`; confirm the import exists in
# an earlier cell before running this one.
set_identity(CONFIG.edgar_identity)

class EntityExtractionPipeline:
    """Run NER models over pre-split filing sections and persist the hits.

    Section splitting and model routing come from
    ``process_sec_filing_with_sections`` (defined outside this class); this
    class loads the NER pipelines, filters and normalizes their output, and
    writes the resulting rows through the shared DatabaseManager.
    """

    def __init__(self, config: PipelineConfig, db_manager: DatabaseManager):
        """Cache filter settings from ``config`` and load every NER model."""
        self.config = config
        self.db = db_manager
        self.models = {}

        # Per-model filter settings, precomputed as sets for fast membership tests.
        model_settings = config.models
        self._biobert_skip = set(model_settings.biobert_skip_categories)
        self._finbert_common = {word.lower() for word in model_settings.finbert_common_words}
        self._bert_skip_misc = model_settings.bert_skip_misc

        # Routing names used by the section extractor -> keys of self.models.
        # Only 'bert_base' differs from our own naming.
        self._routing_to_model_map = {
            'biobert': 'biobert',
            'bert_base': 'bert',
            'roberta': 'roberta',
            'finbert': 'finbert',
        }

        self._load_models()

    def _load_models(self):
        """Instantiate one NER pipeline per configured model.

        A model that fails to load is logged and left out of ``self.models``;
        the remaining models are still loaded.
        """
        settings = self.config.models
        checkpoints = {
            'biobert': settings.biobert_model,
            'bert': settings.bert_model,
            'finbert': settings.finbert_model,
            'roberta': settings.roberta_model,
        }

        # GPU 0 when CUDA is present and the config does not force CPU; -1 means CPU.
        use_gpu = torch.cuda.is_available() and self.config.models.device != 'cpu'
        device = 0 if use_gpu else -1

        for name, model_id in checkpoints.items():
            try:
                self.models[name] = pipeline(
                    "ner",
                    model=model_id,
                    aggregation_strategy="average",
                    device=device,
                )
                logger.info(f"Loaded {name} model")
            except Exception as e:
                logger.error(f"Failed to load {name}: {e}")

    def _normalize_entity_type(self, entity_type: str) -> str:
        """Map a model-specific label to the shared category name.

        Labels without an explicit mapping are returned upper-cased.
        """
        canonical = {
            'Disease': 'MEDICAL_CONDITION',
            'Chemical': 'MEDICATION',
            'Drug': 'MEDICATION',
            'PER': 'PERSON',
            'ORG': 'ORGANIZATION',
            'LOC': 'LOCATION',
            'MONEY': 'FINANCIAL',
            'PERCENT': 'FINANCIAL',
        }
        fallback = entity_type.upper()
        if entity_type in canonical:
            return canonical[entity_type]
        return fallback

    def _passes_essential_filters(self, model_name: str, entity_text: str, entity_category: str) -> bool:
        """Return True when the entity survives the filter for ``model_name``.

        biobert drops configured categories, finbert drops configured common
        words (case-insensitive), bert optionally drops 'MISC'; any other
        model keeps everything.
        """
        lowered = entity_text.lower()

        if model_name == 'biobert':
            return entity_category not in self._biobert_skip
        if model_name == 'finbert':
            return lowered not in self._finbert_common
        if model_name == 'bert':
            return not self._bert_skip_misc or entity_category != 'MISC'
        return True

    def _extract_from_single_section(self, section_text: str, model_name: str, 
                                   section_name: str, section_result: Dict) -> List[Dict]:
        """Run one model over one section and build entity records.

        Returns an empty list (after logging) if anything in the model call
        or record construction raises.
        """
        try:
            kept = []
            for hit in self.models[model_name](section_text):
                # Drop low-confidence predictions first.
                if hit['score'] < self.config.models.confidence_threshold:
                    continue

                text = hit['word'].strip()
                category = hit['entity_group']

                # Then apply the model-specific filter.
                if not self._passes_essential_filters(model_name, text, category):
                    continue

                record = {
                    'extraction_id': str(uuid.uuid4()),
                    'company_domain': section_result['company_domain'],
                    'entity_text': text,
                    'entity_category': self._normalize_entity_type(category),
                    'confidence_score': float(hit['score']),
                    'character_start': hit['start'],
                    'character_end': hit['end'],
                    'section_name': section_name,
                    'sec_filing_ref': f"SEC_{section_result['filing_id']}",
                    'primary_model': model_name,
                    'filing_type': section_result['filing_type'],
                    'filing_date': section_result.get('filing_date'),
                    'accession_number': section_result['accession_number'],
                }
                kept.append(record)

            return kept

        except Exception as e:
            logger.error(f"Entity extraction failed with {model_name} on {section_name}: {e}")
            return []

    def _merge_overlapping_entities(self, entities: List[Dict]) -> List[Dict]:
        """Collapse entities that share filing, section and exact span.

        For each identical (filing, section, start, end) key only the
        highest-confidence entity is kept; output follows first-seen key order.
        """
        if not entities:
            return []

        by_span = {}
        for item in entities:
            span_key = (item['sec_filing_ref'], item['section_name'],
                        item['character_start'], item['character_end'])
            if span_key in by_span:
                by_span[span_key].append(item)
            else:
                by_span[span_key] = [item]

        return [
            max(candidates, key=lambda candidate: candidate['confidence_score'])
            for candidates in by_span.values()
        ]

    def _extract_entities_from_sections(self, section_result: Dict) -> List[Dict]:
        """Apply each routed model to its assigned sections and merge the hits."""
        sections = section_result['sections']
        model_routing = section_result['model_routing']

        collected = []

        for routed_name, section_names in model_routing.items():
            # Translate the router's model name into our own key.
            model_key = self._routing_to_model_map.get(routed_name)

            if not model_key or model_key not in self.models:
                logger.warning(f"Model '{routed_name}' -> '{model_key}' not available")
                continue

            logger.info(f"Processing {len(section_names)} sections with {model_key}")

            for section_name in section_names:
                text = sections.get(section_name)
                if not text:
                    logger.warning(f"Section '{section_name}' has no text")
                    continue

                collected.extend(
                    self._extract_from_single_section(text, model_key, section_name, section_result)
                )

        # Several models may tag the same span; keep the best one per span.
        return self._merge_overlapping_entities(collected)

    def process_sec_filing_entities(self, filing_data: Dict) -> List[Dict]:
        """Extract entities for one filing.

        Delegates section splitting to ``process_sec_filing_with_sections``
        and returns an empty list when that step does not report 'success'.
        """
        section_result = process_sec_filing_with_sections(filing_data)

        if section_result['processing_status'] == 'success':
            entities = self._extract_entities_from_sections(section_result)
            logger.info(f"Extracted {len(entities)} entities from {section_result['total_sections']} sections")
            return entities

        logger.warning(f"Cell 2 section extraction failed: {section_result.get('error', 'Unknown')}")
        return []

    def store_entities_in_database(self, entities: List[Dict]) -> int:
        """Insert entity records into ``system_uno.sec_entities_raw``.

        Entity text is truncated to 1000 characters; rows whose
        ``extraction_id`` already exists are skipped. Returns the count
        reported by ``batch_insert``.
        """
        if not entities:
            return 0

        columns = [
            'extraction_id', 'company_domain', 'entity_text', 'entity_category',
            'confidence_score', 'character_start', 'character_end', 'section_name',
            'sec_filing_ref', 'primary_model', 'data_source', 'extraction_timestamp',
            'filing_type', 'accession_number',
        ]

        rows = [
            (
                item['extraction_id'], item['company_domain'], item['entity_text'][:1000],
                item['entity_category'], item['confidence_score'], item['character_start'],
                item['character_end'], item['section_name'], item['sec_filing_ref'],
                item['primary_model'], 'sec_filings', datetime.now(),
                item['filing_type'], item['accession_number'],
            )
            for item in entities
        ]

        stored = self.db.batch_insert(
            'system_uno.sec_entities_raw', columns, rows,
            "ON CONFLICT (extraction_id) DO NOTHING",
        )
        logger.info(f"Stored {stored} entities")
        return stored

# Initialize entity pipeline
# Construction loads every configured NER model, so this line can be slow
# on first run; models that fail to load are logged and skipped.
entity_pipeline = EntityExtractionPipeline(CONFIG, db_manager)

def process_filing_with_entity_extraction(filing_data: Dict) -> Dict:
    """Extract and store entities for one filing, returning a summary dict.

    Uses the module-level ``entity_pipeline``. The summary carries the
    filing's id, company domain and type, the number of entities extracted
    and stored, and ``success`` (True when at least one entity was extracted).
    """
    entities = entity_pipeline.process_sec_filing_entities(filing_data)

    # Only touch the database when there is something to write.
    stored_count = entity_pipeline.store_entities_in_database(entities) if entities else 0

    summary = {
        'filing_id': filing_data.get('id'),
        'company_domain': filing_data.get('company_domain'),
        'filing_type': filing_data.get('filing_type'),
        'entities_extracted': len(entities),
        'entities_stored': stored_count,
        'success': len(entities) > 0,
    }
    return summary

def process_multiple_filings_with_entities(limit: int = 5) -> List[Dict]:
    """Run entity extraction over up to ``limit`` unprocessed filings.

    Filings come from ``get_unprocessed_filings`` (defined outside this
    block). Returns one summary dict per filing, in processing order, and
    logs a success or warning line for each.
    """
    summaries = []

    for filing in get_unprocessed_filings(limit):
        summary = process_filing_with_entity_extraction(filing)
        summaries.append(summary)

        if not summary['success']:
            logger.warning(f"❌ {summary['company_domain']}: No entities extracted")
            continue

        logger.info(f"✅ {summary['company_domain']}: {summary['entities_extracted']} entities")

    return summaries

# Report how many NER models actually loaded (failed loads are excluded
# from entity_pipeline.models) and mark the cell as finished.
logger.info(f"✅ Entity extraction pipeline initialized with {len(entity_pipeline.models)} models")
print(f"✅ Cell 3 complete - Entity pipeline ready to process Cell 2's sections")

In [None]:
# Cell 4: Relationship Analysis Module

from huggingface_hub import InferenceClient
import json
import asyncio
from concurrent.futures import ThreadPoolExecutor

class RelationshipAnalyzer:
    """Relationship analysis with batched, threaded Llama calls.

    For each entity the analyzer pulls a context window from the entity
    extractor, asks the Llama endpoint for a JSON description of the
    relationship, normalizes the answer, and can persist the results as
    semantic events grouped into relationship buckets.

    Annotations are written as strings so the class can be defined before
    (or without) the referenced project types being in scope.
    """

    def __init__(self, config: "PipelineConfig", db_manager: "DatabaseManager"):
        """Store dependencies and, if Llama is enabled, build the client.

        If the client cannot be created (missing Kaggle secret, import
        failure, ...), the error is logged and ``config.enable_llama`` is set
        to False so later stages skip relationship analysis.
        """
        self.config = config
        self.db = db_manager
        self.client = None

        if config.enable_llama:
            # Get HuggingFace token from Kaggle secrets
            try:
                from kaggle_secrets import UserSecretsClient
                user_secrets = UserSecretsClient()
                hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")

                self.client = InferenceClient(
                    model=config.llama.model_id,
                    token=hf_token
                )
                logger.info(f"Llama client initialized with {config.llama.model_id}")
            except Exception as e:
                logger.error(f"Failed to initialize Llama client: {e}")
                self.config.enable_llama = False

    def analyze_entity_batch(self, entities: List[Dict],
                            entity_extractor: "EntityExtractor") -> List[Dict]:
        """Analyze entities for relationships and return the successful results.

        Args:
            entities: entity records (as produced by the extraction stage).
            entity_extractor: object providing
                ``get_section_content(sec_filing_ref, section_name)``.

        Returns:
            A list of ``{'entity', 'analysis', 'context'}`` dicts. Entities
            without context, without a parseable answer, or whose analysis
            failed or timed out are omitted. Empty when Llama is disabled.
        """
        if not self.config.enable_llama or not entities:
            return []

        relationships = []
        # Guard against a zero/negative config value, which would break range().
        batch_size = max(1, self.config.llama.batch_size)
        max_workers = max(1, self.config.processing.parallel_workers)
        timeout = self.config.llama.timeout_seconds

        # One executor for the whole call; work is still submitted batch by batch.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for i in range(0, len(entities), batch_size):
                batch = entities[i:i + batch_size]

                futures = []
                for entity in batch:
                    context = self._get_entity_context(entity, entity_extractor)
                    if context:
                        futures.append(
                            executor.submit(self._analyze_single_entity, entity, context)
                        )

                for future in futures:
                    try:
                        # The timeout only bounds how long we wait here; the
                        # worker thread itself is not cancelled.
                        result = future.result(timeout=timeout)
                    except Exception as e:
                        logger.error(f"Relationship analysis failed: {e}")
                    else:
                        if result:
                            relationships.append(result)

        return relationships

    def _get_entity_context(self, entity: Dict, entity_extractor: "EntityExtractor") -> Optional[str]:
        """Return the section text surrounding the entity, or None if unavailable.

        The window extends ``semantic.context_window_chars`` characters on
        each side of the entity span, clipped to the section bounds.
        """
        section_content = entity_extractor.get_section_content(
            entity['sec_filing_ref'],
            entity['section_name']
        )

        if not section_content:
            return None

        window = self.config.semantic.context_window_chars
        start = max(0, entity['character_start'] - window)
        end = min(len(section_content), entity['character_end'] + window)

        return section_content[start:end]

    def _analyze_single_entity(self, entity: Dict, context: str) -> Optional[Dict]:
        """Query Llama for one entity; return the result dict or None.

        Any exception from the request or parsing is logged and swallowed so
        one bad entity does not abort the batch.
        """
        try:
            prompt = self._build_prompt(entity, context)

            response = self.client.text_generation(
                prompt,
                max_new_tokens=self.config.llama.max_new_tokens,
                temperature=self.config.llama.temperature,
                return_full_text=False
            )

            analysis = self._parse_response(response)

            if analysis:
                return {
                    'entity': entity,
                    'analysis': analysis,
                    'context': context
                }

        except Exception as e:
            logger.error(f"Failed to analyze entity {entity.get('entity_text')}: {e}")

        return None

    def _build_prompt(self, entity: Dict, context: str) -> str:
        """Build the prompt sent to Llama for one entity.

        The summary length quoted in the prompt comes from
        ``semantic.max_summary_length`` so it matches the truncation applied
        in ``_parse_response``.
        """
        return f"""Analyze this entity from an SEC filing for business relationships.

Company: {entity['company_domain']}
Entity: {entity['entity_text']}
Section: {entity['section_name']}
Context: {context}

Return JSON with:
- relationship_type: PARTNERSHIP|REGULATORY|CLINICAL_TRIAL|FINANCIAL|LICENSING|COMPETITIVE|RESEARCH|NONE
- semantic_summary: max {self.config.semantic.max_summary_length} char description
- semantic_action: verb (initiated|expanded|terminated|announced)
- confidence: 0.0-1.0
- monetary_value: number or null
- temporal_info: when mentioned to occur
- tags: relevant keywords

JSON:"""

    @staticmethod
    def _extract_json_object(text) -> Optional[Dict]:
        """Return the first JSON object embedded in ``text``, or None.

        Tries each '{' in order and lets the JSON decoder find the matching
        end, so prose (even prose containing braces) before or after the
        object does not break parsing.
        """
        if not isinstance(text, str):
            return None

        decoder = json.JSONDecoder()
        start = text.find('{')
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                pass
            else:
                if isinstance(candidate, dict):
                    return candidate
            start = text.find('{', start + 1)
        return None

    @staticmethod
    def _to_number(value):
        """Coerce a model-supplied monetary value to a number, else None.

        The prompt asks for "number or null"; numeric strings (optionally
        with '$' and thousands separators) are converted, anything else is
        dropped rather than passed on to the INSERT.
        """
        if isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            return value
        if isinstance(value, str):
            cleaned = value.replace(',', '').replace('$', '').strip()
            try:
                return float(cleaned)
            except ValueError:
                return None
        return None

    def _parse_response(self, response: str) -> Optional[Dict]:
        """Parse a Llama response into a normalized analysis dict.

        Missing or null fields fall back to defaults instead of discarding
        the whole analysis: relationship_type -> 'COMPANY_ENTITY',
        semantic_summary -> '', semantic_action -> 'mentioned',
        confidence -> 0.5 (clamped to [0, 1]), tags -> [].

        Returns:
            The normalized dict, or None when no JSON object can be found.
        """
        data = self._extract_json_object(response)
        if data is None:
            logger.warning("No JSON object found in Llama response")
            return None

        try:
            confidence = float(data.get('confidence'))
        except (TypeError, ValueError):
            confidence = 0.5
        confidence = min(1.0, max(0.0, confidence))

        tags = data.get('tags')
        if isinstance(tags, str):
            tags = [tags]
        elif isinstance(tags, (list, tuple)):
            tags = [str(tag) for tag in tags]
        else:
            tags = []

        relationship_type = str(data.get('relationship_type') or 'COMPANY_ENTITY').strip().upper()
        summary_limit = self.config.semantic.max_summary_length

        return {
            'relationship_type': relationship_type,
            'semantic_summary': str(data.get('semantic_summary') or '')[:summary_limit],
            'semantic_action': data.get('semantic_action') or 'mentioned',
            'confidence_score': confidence,
            'monetary_value': self._to_number(data.get('monetary_value')),
            'temporal_info': data.get('temporal_info') or '',
            'semantic_tags': tags
        }

    def store_relationships_batch(self, relationships: List[Dict], session_id: str) -> int:
        """Store all relationships in a single transaction.

        For each relationship a bucket is found or created, a semantic event
        is inserted, and (when validation is enabled) a PENDING_REVIEW
        validation row is added. Finally the session counters are updated
        with the number of events and of distinct buckets touched.

        Returns:
            Number of events stored, or 0 if anything failed (the whole
            batch is rolled back and the error is logged).
        """
        if not relationships:
            return 0

        with self.db.get_connection() as conn:
            try:
                events_stored = 0
                # Distinct buckets, so several events on one bucket count once.
                touched_buckets = set()

                # psycopg2 cursors close themselves when used as context managers.
                with conn.cursor() as cursor:
                    for rel in relationships:
                        entity = rel['entity']
                        analysis = rel['analysis']

                        bucket_id = self._find_or_create_bucket(
                            cursor,
                            entity['company_domain'],
                            entity['entity_text'],
                            analysis['relationship_type']
                        )

                        if bucket_id:
                            touched_buckets.add(bucket_id)

                        event_id = str(uuid.uuid4())
                        cursor.execute("""
                            INSERT INTO system_uno.relationship_semantic_events
                            (event_id, bucket_id, source_entity_id, sec_filing_ref,
                             filing_date, filing_type, section_name,
                             semantic_summary, semantic_action, confidence_score,
                             monetary_value, semantic_tags, original_context_snippet,
                             llama_prompt_version)
                            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                        """, (
                            event_id, bucket_id, entity['extraction_id'],
                            entity['sec_filing_ref'], entity.get('filing_date'),
                            entity['filing_type'], entity['section_name'],
                            analysis['semantic_summary'], analysis['semantic_action'],
                            analysis['confidence_score'], analysis.get('monetary_value'),
                            analysis.get('semantic_tags', []), rel['context'][:500],
                            self.config.llama.prompt_version
                        ))

                        events_stored += 1

                        # Create validation placeholder
                        if self.config.enable_validation:
                            cursor.execute("""
                                INSERT INTO system_uno.relationship_validation
                                (event_id, bucket_id, validation_method, llama_prompt_version_tested)
                                VALUES (%s, %s, 'PENDING_REVIEW', %s)
                            """, (event_id, bucket_id, self.config.llama.prompt_version))

                    # Update session
                    cursor.execute("""
                        UPDATE system_uno.semantic_analysis_sessions
                        SET events_created = events_created + %s,
                            buckets_updated = buckets_updated + %s
                        WHERE session_id = %s
                    """, (events_stored, len(touched_buckets), session_id))

                conn.commit()
                logger.info(f"Stored {events_stored} relationships")
                return events_stored

            except Exception as e:
                conn.rollback()
                logger.error(f"Failed to store relationships: {e}")
                return 0

    def _find_or_create_bucket(self, cursor, company: str, entity: str, rel_type: str) -> str:
        """Return the bucket id for (company, entity, rel_type), creating it if needed.

        NOTE(review): the select-then-insert is not atomic; concurrent
        writers could create duplicate buckets unless the table has a unique
        constraint - confirm against the schema.
        """
        cursor.execute("""
            SELECT bucket_id FROM system_uno.relationship_buckets
            WHERE company_domain = %s AND entity_name = %s AND relationship_type = %s
        """, (company, entity, rel_type))

        result = cursor.fetchone()
        if result:
            return result[0]

        bucket_id = str(uuid.uuid4())
        cursor.execute("""
            INSERT INTO system_uno.relationship_buckets
            (bucket_id, company_domain, entity_name, relationship_type,
             first_mentioned_date, is_active)
            VALUES (%s, %s, %s, %s, CURRENT_DATE, true)
            RETURNING bucket_id
        """, (bucket_id, company, entity, rel_type))

        return cursor.fetchone()[0]

# Initialize relationship analyzer
# If the Llama client cannot be created, the constructor sets
# CONFIG.enable_llama to False, which is why the flag is re-read below.
relationship_analyzer = RelationshipAnalyzer(CONFIG, db_manager)
print(f"✅ Relationship analyzer initialized (Llama: {'Enabled' if CONFIG.enable_llama else 'Disabled'})")

In [None]:
# Cell 5: Pipeline Orchestrator

class PipelineOrchestrator:
    """Main orchestrator for the complete pipeline.

    Fetches unprocessed filings, records an analysis session, runs entity
    extraction, optionally runs relationship analysis per filing, and marks
    the session completed or failed.

    Annotations are written as strings so the class can be defined before
    (or without) the referenced project types being in scope.
    """

    def __init__(self, config: "PipelineConfig", db_manager: "DatabaseManager",
                 entity_extractor: "EntityExtractor",
                 relationship_analyzer: "RelationshipAnalyzer"):
        """Store the collaborating components; no I/O happens here."""
        self.config = config
        self.db = db_manager
        self.entity_extractor = entity_extractor
        self.relationship_analyzer = relationship_analyzer

    def process_filing_batch(self, limit: int = None) -> Dict:
        """Process a batch of filings through the complete pipeline.

        Args:
            limit: maximum number of filings; falls back to
                ``processing.filing_batch_size`` when None or 0.

        Returns:
            ``{'success': False, 'message': ...}`` when there is nothing to
            process; otherwise a dict with ``filings_processed``,
            ``entities_extracted``, ``relationships_found``, ``errors``,
            ``processing_time_seconds`` and ``success``.
        """
        limit = limit or self.config.processing.filing_batch_size
        start_time = time.time()

        filings = self._get_unprocessed_filings(limit)
        if not filings:
            return {'success': False, 'message': 'No filings to process'}

        logger.info(f"Processing {len(filings)} filings")

        session_id = self._create_analysis_session(filings)

        results = {
            'filings_processed': len(filings),
            'entities_extracted': 0,
            'relationships_found': 0,
            'errors': []
        }

        try:
            # Step 1: Extract all entities
            entities = self.entity_extractor.process_filing_batch(filings)
            results['entities_extracted'] = len(entities)
            logger.info(f"Extracted {len(entities)} entities")

            # Step 2: Analyze relationships (if Llama enabled)
            if self.config.enable_llama and entities:
                entities_by_filing = self._group_entities_by_filing(entities)

                for filing_ref, filing_entities in entities_by_filing.items():
                    relationships = self.relationship_analyzer.analyze_entity_batch(
                        filing_entities, self.entity_extractor
                    )

                    if relationships:
                        count = self.relationship_analyzer.store_relationships_batch(
                            relationships, session_id
                        )
                        results['relationships_found'] += count

                    # NOTE(review): assumes clear_cache() only drops data that
                    # later filings in this loop no longer need - confirm
                    # against the extractor's implementation.
                    self.entity_extractor.clear_cache()

            self._complete_analysis_session(session_id, True)

        except Exception as e:
            logger.error(f"Pipeline error: {e}")
            results['errors'].append(str(e))
            self._complete_analysis_session(session_id, False)

        duration = time.time() - start_time
        results['processing_time_seconds'] = round(duration, 2)
        results['success'] = len(results['errors']) == 0

        return results

    def _get_unprocessed_filings(self, limit: int) -> List[Dict]:
        """Return up to ``limit`` filings with a URL and no extracted entities yet.

        A filing counts as processed once any row in
        ``system_uno.sec_entities_raw`` references it as ``SEC_<id>``.
        Newest filings come first.
        """
        query = """
            SELECT sf.id, sf.company_domain, sf.filing_type, sf.url, sf.filing_date
            FROM raw_data.sec_filings sf
            LEFT JOIN system_uno.sec_entities_raw ser ON ser.sec_filing_ref = CONCAT('SEC_', sf.id)
            WHERE sf.url IS NOT NULL
              AND ser.sec_filing_ref IS NULL
            ORDER BY sf.filing_date DESC
            LIMIT %s
        """
        return self.db.execute_query(query, (limit,))

    def _group_entities_by_filing(self, entities: List[Dict]) -> Dict[str, List[Dict]]:
        """Group entities by ``sec_filing_ref``, preserving first-seen order."""
        grouped = {}
        for entity in entities:
            grouped.setdefault(entity['sec_filing_ref'], []).append(entity)
        return grouped

    @staticmethod
    def _session_company_label(filings: List[Dict]) -> str:
        """Return the session's company label.

        The company domain when every filing belongs to the same company,
        otherwise the literal 'multiple' (also used for an empty batch).
        """
        companies = {f['company_domain'] for f in filings}
        if len(companies) == 1:
            return next(iter(companies))
        return 'multiple'

    def _create_analysis_session(self, filings: List[Dict]) -> str:
        """Insert a RUNNING session row for this batch and return its id."""
        session_id = str(uuid.uuid4())

        company_label = self._session_company_label(filings)
        filing_refs = [f"SEC_{f['id']}" for f in filings]

        with self.db.get_cursor() as cursor:
            cursor.execute("""
                INSERT INTO system_uno.semantic_analysis_sessions
                (session_id, company_domain, filing_batch, primary_prompt_version,
                 session_start, session_status)
                VALUES (%s, %s, %s, %s, NOW(), 'RUNNING')
            """, (session_id, company_label,
                  filing_refs, self.config.llama.prompt_version))

        return session_id

    def _complete_analysis_session(self, session_id: str, success: bool):
        """Mark the session COMPLETED or FAILED and record its duration in ms."""
        with self.db.get_cursor() as cursor:
            cursor.execute("""
                UPDATE system_uno.semantic_analysis_sessions
                SET session_end = NOW(),
                    session_status = %s,
                    total_processing_ms = EXTRACT(EPOCH FROM (NOW() - session_start)) * 1000
                WHERE session_id = %s
            """, ('COMPLETED' if success else 'FAILED', session_id))

# Initialize orchestrator
# NOTE(review): `entity_extractor` is not defined in the visible cells (Cell 3
# creates `entity_pipeline`). The orchestrator calls process_filing_batch,
# get_section_content and clear_cache on it - confirm which object provides
# those methods before running this cell.
orchestrator = PipelineOrchestrator(
    CONFIG, db_manager, entity_extractor, relationship_analyzer
)
print("✅ Pipeline orchestrator initialized")

In [None]:
# Cell 6: Analytics Module

class PipelineAnalytics:
    """Streamlined analytics and reporting.

    Reads aggregate metrics through the injected database manager's
    ``execute_query`` method and prints them as a plain-text report.
    """

    def __init__(self, db_manager: DatabaseManager):
        """Keep the database manager used for every analytics query."""
        self.db = db_manager

    def _fetch_single_row(self, query: str) -> Dict:
        """Run ``query`` exactly once and return its first row, or {} if none."""
        rows = self.db.execute_query(query)
        return rows[0] if rows else {}

    @staticmethod
    def _metric(row: Dict, key: str):
        """Return ``row[key]``, substituting 0 when it is missing or None.

        SQL aggregates such as AVG yield NULL when no rows match. The key is
        then present with a None value, so a ``dict.get`` default would not
        apply and numeric format specs would raise TypeError.
        """
        value = row.get(key)
        return 0 if value is None else value

    def generate_report(self) -> Dict:
        """Generate comprehensive analytics report.

        Returns:
            Dict with the keys 'entities', 'relationships', 'sessions' and
            'validation' (in that order). Each value is the first row returned
            for that section's query, or {} when the query returned no rows.
        """
        section_queries = {
            # Entity metrics
            'entities': """
            SELECT
                COUNT(*) as total_entities,
                COUNT(DISTINCT company_domain) as companies,
                COUNT(DISTINCT sec_filing_ref) as filings,
                AVG(confidence_score) as avg_confidence,
                COUNT(*) FILTER (WHERE section_name IS NOT NULL) as with_sections
            FROM system_uno.sec_entities_raw
            WHERE data_source = 'sec_filings'
        """,
            # Relationship metrics
            'relationships': """
            SELECT
                COUNT(DISTINCT b.bucket_id) as buckets,
                COUNT(e.event_id) as events,
                AVG(e.confidence_score) as avg_confidence
            FROM system_uno.relationship_buckets b
            LEFT JOIN system_uno.relationship_semantic_events e ON b.bucket_id = e.bucket_id
        """,
            # Session metrics
            'sessions': """
            SELECT
                COUNT(*) as total_sessions,
                COUNT(*) FILTER (WHERE session_status = 'COMPLETED') as completed,
                AVG(total_processing_ms/1000.0) as avg_duration_seconds
            FROM system_uno.semantic_analysis_sessions
            WHERE session_start > NOW() - INTERVAL '7 days'
        """,
            # Validation metrics
            'validation': """
            SELECT
                COUNT(*) as total,
                COUNT(*) FILTER (WHERE validation_method != 'PENDING_REVIEW') as reviewed
            FROM system_uno.relationship_validation
        """,
        }

        # One round trip per section; dict order fixes both the execution
        # order and the key order of the returned report.
        return {
            section: self._fetch_single_row(sql)
            for section, sql in section_queries.items()
        }

    def print_report(self):
        """Print formatted analytics report.

        Sections whose query returned no row are skipped. Metrics that are
        missing or None are printed as 0.
        """
        report = self.generate_report()
        metric = self._metric
        rule = "=" * 60

        print("\n" + rule)
        print("PIPELINE ANALYTICS REPORT")
        print(rule)

        entities = report.get('entities')
        if entities:
            print("\nENTITIES:")
            print(f"  Total: {metric(entities, 'total_entities'):,}")
            print(f"  Companies: {metric(entities, 'companies'):,}")
            print(f"  Filings: {metric(entities, 'filings'):,}")
            print(f"  Avg Confidence: {metric(entities, 'avg_confidence'):.3f}")

        relationships = report.get('relationships')
        if relationships:
            print("\nRELATIONSHIPS:")
            print(f"  Buckets: {metric(relationships, 'buckets'):,}")
            print(f"  Events: {metric(relationships, 'events'):,}")
            print(f"  Avg Confidence: {metric(relationships, 'avg_confidence'):.3f}")

        sessions = report.get('sessions')
        if sessions:
            print("\nSESSIONS (Last 7 days):")
            print(f"  Total: {metric(sessions, 'total_sessions')}")
            print(f"  Completed: {metric(sessions, 'completed')}")
            print(f"  Avg Duration: {metric(sessions, 'avg_duration_seconds'):.1f}s")

        validation = report.get('validation')
        if validation:
            total = metric(validation, 'total')
            reviewed = metric(validation, 'reviewed')
            print("\nVALIDATION:")
            print(f"  Total: {total:,}")
            print(f"  Reviewed: {reviewed:,}")
            print(f"  Pending: {total - reviewed:,}")

        print(rule)

# Create the reporting helper on top of the shared database manager
analytics = PipelineAnalytics(db_manager=db_manager)
print("✅ Analytics module initialized")

In [None]:
# Cell 7: Main Execution

def run_pipeline(filing_limit: int = None):
    """Run one filing batch end to end, print a summary, and return the results.

    Args:
        filing_limit: Forwarded unchanged to
            ``orchestrator.process_filing_batch`` (None by default).

    Returns:
        The results mapping returned by ``orchestrator.process_filing_batch``.
    """
    print("\n🚀 Starting SEC Entity & Relationship Extraction Pipeline")
    llama_state = 'ON' if CONFIG.enable_llama else 'OFF'
    print(f"   Configuration: Llama={llama_state}, Batch={CONFIG.processing.batch_size}")

    # Hand the batch to the orchestrator
    results = orchestrator.process_filing_batch(filing_limit)

    # Summarise the run
    print("\n📊 RESULTS:")
    print(f"   Filings Processed: {results['filings_processed']}")
    print(f"   Entities Extracted: {results['entities_extracted']:,}")
    print(f"   Relationships Found: {results['relationships_found']:,}")
    print(f"   Processing Time: {results['processing_time_seconds']:.1f}s")

    errors = results['errors']
    if errors:
        print(f"   ⚠️ Errors: {len(errors)}")
        # Only the first three errors are listed to keep the summary short
        for message in errors[:3]:
            print(f"      - {message}")

    # Follow the summary with the database-wide analytics report
    analytics.print_report()

    return results

# Show how to drive the pipeline once every cell has been run
print(
    "\n📌 USAGE:",
    "   results = run_pipeline(filing_limit=5)  # Process 5 filings",
    "   analytics.print_report()  # View analytics",
    "\n✅ Pipeline ready to execute!",
    sep="\n",
)