In [None]:
# Cell -1: Minimal Logging Setup (BEFORE PACKAGE INSTALLATION)
#
# Purpose: Initialize basic logging system to capture Cell 0 package installation
# This cell must run BEFORE Cell 0 to enable console log visibility

import sys
import psycopg2
from kaggle_secrets import UserSecretsClient

print("🔧 Setting up minimal logging system...")

# ============================================================================
# MINIMAL DATABASE CONNECTION FOR LOGGING
# ============================================================================

# Get database credentials
user_secrets = UserSecretsClient()
NEON_CONFIG = {
    'host': user_secrets.get_secret("NEON_HOST"),
    'database': user_secrets.get_secret("NEON_DATABASE"), 
    'user': user_secrets.get_secret("NEON_USER"),
    'password': user_secrets.get_secret("NEON_PASSWORD"),
    'port': 5432,
    'sslmode': 'require'
}

print("✅ Database credentials loaded for logging")

# ============================================================================
# GLOBAL LOGGER STATE MANAGEMENT
# ============================================================================

# Track current active logger globally to prevent overlaps
_current_logger = None

def stop_current_logging():
    """Detach the active cell logger, if any, and restore the saved streams.

    Puts back the ``sys.stdout`` / ``sys.stderr`` objects the logger recorded
    when it was created and clears the module-level ``_current_logger``.
    Does nothing when no logger is active.
    """
    global _current_logger
    active = _current_logger
    if not active:
        return
    # Hand the console back to whatever was installed before this logger.
    sys.stdout = active.original_stdout
    sys.stderr = active.original_stderr
    _current_logger = None

# ============================================================================
# MINIMAL REAL-TIME CONSOLE LOGGING SYSTEM (COMPATIBLE WITH FLAIR/GLIREL)
# ============================================================================

class RealTimeKaggleLogger:
    """Tee-style stand-in for ``sys.stdout`` / ``sys.stderr`` used per notebook cell.

    Every ``write`` is echoed to the console stream that was active when the
    logger was created, and each non-blank chunk is also inserted into
    ``core.console_logs`` tagged with ``cell_number``.  Database problems are
    never allowed to interrupt the notebook.

    Attributes:
        cell_number: Value stored in the ``cell_number`` column for every row.
        original_stdout: Stream that console output is forwarded to.
        original_stderr: Stream that was ``sys.stderr`` at construction time;
            kept so it can be restored later (this class never writes to it).
        last_db_error: Most recent exception raised while saving a row, or
            ``None`` if no insert has failed.
    """

    def __init__(self, cell_number):
        self.cell_number = cell_number
        # If the defining cell is re-run, sys.stdout may still be a logger from
        # the previous run; wrapping it would echo and store every line twice,
        # so peel such wrappers off first.
        self.original_stdout = self._unwrap(sys.stdout, 'original_stdout')
        self.original_stderr = self._unwrap(sys.stderr, 'original_stderr')
        self.last_db_error = None
        self._closed = False

    @staticmethod
    def _unwrap(stream, attr):
        """Follow ``attr`` through stacked loggers and return the underlying stream."""
        seen = set()
        # Compare by class name: re-running the cell creates a new class object,
        # so isinstance() would not recognise loggers from the earlier run.
        while type(stream).__name__ == 'RealTimeKaggleLogger' and id(stream) not in seen:
            seen.add(id(stream))
            stream = getattr(stream, attr, stream)
        return stream

    def write(self, text):
        """Echo ``text`` to the console and persist it.

        Returns:
            The number of characters accepted (``0`` once the logger is
            closed), mirroring the text-stream ``write`` contract.
        """
        if self._closed:
            return 0

        # Write to original console immediately
        try:
            self.original_stdout.write(text)
            self.original_stdout.flush()
        except (ValueError, AttributeError):
            # Handle case where original stdout is closed
            pass

        # Save to database (synchronous, best effort); skip blank chunks such
        # as the bare newline that print() emits separately.
        line = text.strip()
        if line:
            self._save(line)
        return len(text)

    def _save(self, line):
        """Insert one row into ``core.console_logs``, swallowing any failure."""
        conn = None
        try:
            conn = psycopg2.connect(**NEON_CONFIG)
            # `with conn` commits on success / rolls back on error but does NOT
            # close the connection, hence the explicit close in `finally`.
            with conn:
                with conn.cursor() as cursor:
                    cursor.execute("""
                        INSERT INTO core.console_logs (cell_number, console_output)
                        VALUES (%s, %s)
                    """, (self.cell_number, line))
        except Exception as exc:
            # Don't let logging errors break execution. The error is recorded
            # rather than printed because printing would re-enter write().
            self.last_db_error = exc
        finally:
            if conn is not None:
                try:
                    conn.close()
                except Exception:
                    pass

    def flush(self):
        """Flush the underlying console stream; a no-op once closed."""
        if self._closed:
            return
        try:
            self.original_stdout.flush()
        except (ValueError, AttributeError):
            pass

    def close(self):
        """Close method required by logging system (Flair/GLiREL compatibility)"""
        if not self._closed:
            self._closed = True
            # Don't actually close stdout/stderr as they may be needed elsewhere

    def isatty(self):
        """Check if this is a terminal (required by some logging systems)"""
        try:
            return self.original_stdout.isatty()
        except (ValueError, AttributeError):
            return False

def start_cell_logging(cell_number):
    """Route console output for one notebook cell through a fresh logger.

    Any logger left over from a previous cell is detached first, so at most
    one logger is installed at a time and output is not recorded twice.
    """
    global _current_logger

    # Restore the saved streams before the new logger captures them.
    stop_current_logging()

    cell_logger = RealTimeKaggleLogger(cell_number)
    _current_logger = cell_logger
    sys.stdout = cell_logger
    sys.stderr = cell_logger

    # First line recorded for the cell marks where its output begins.
    print(f"=== CELL {cell_number} START ===")

# Make functions globally available
# (Both functions are defined at the top level of this cell, so each line
# rebinds an existing global name to the same object.)
globals()['start_cell_logging'] = start_cell_logging
globals()['stop_current_logging'] = stop_current_logging

print("✅ Minimal logging system initialized with Flair/GLiREL compatibility")

# ============================================================================
# CLEAR EXISTING CONSOLE LOGS
# ============================================================================

# Wipe rows left by earlier runs so the table only holds this session's output.
# psycopg2's connection context manager only ends the transaction (commit on
# success, rollback on error); it does not close the connection, so it is
# closed explicitly in `finally`.
conn = None
try:
    conn = psycopg2.connect(**NEON_CONFIG)
    with conn:
        with conn.cursor() as cursor:
            cursor.execute("DELETE FROM core.console_logs")
    print("🧹 Console logs cleared for fresh debugging session")
except Exception as e:
    # Best effort: a failed cleanup must not stop the notebook.
    print(f"⚠️ Could not clear console logs: {e}")
finally:
    if conn is not None:
        conn.close()

# ============================================================================
# START LOGGING FOR THIS CELL
# ============================================================================

start_cell_logging(-1)
print("✅ Cell -1 complete - Logging system ready for Cell 0 package installation")
print("   📊 Console logs will now capture Cell 0 execution details")
print("   🔧 Logger management: Prevents duplicates across cells")
print("   🔗 Flair/GLiREL compatibility: Added close() and isatty() methods")

In [None]:
# Cell 0: Package Installation and Consolidated Imports
# 
# Purpose: Install packages and consolidate all imports following PEP 8 order
# Initialize basic logging - all other setup in Cell 1

# ============================================================================
# START LOGGING FOR THIS CELL (using Cell -1 setup)
# ============================================================================

start_cell_logging(0)
print("🔧 Starting Cell 0 - Package installation and imports with logging enabled")

# ============================================================================
# PACKAGE INSTALLATION
# ============================================================================

print("🔧 Installing required packages with compatible versions...")
!pip install edgartools torch==2.6.0 transformers==4.41.0 accelerate==0.24.1 huggingface_hub requests beautifulsoup4 'lxml[html_clean]' uuid numpy newspaper3k --quiet
!pip install -U bitsandbytes --quiet
!pip install psycopg2-binary --quiet
!pip install gliner==0.2.5 glirel==0.1.0 --quiet

print("✅ All packages installed successfully with resolved dependencies")

# ============================================================================
# CONSOLIDATED IMPORTS - ALL IMPORTS FOR THE NOTEBOOK
# ============================================================================

print("📦 Starting consolidated imports...")

# Standard library imports (alphabetical order)
import importlib
import importlib.util
import json
import os
import pickle
import re
import signal
import sys
import time
import traceback
import uuid
import warnings

# Standard library from imports (alphabetical order)
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import contextmanager
from datetime import datetime
from functools import wraps
from pathlib import Path
from typing import Dict, List, Optional, Any, Set, Tuple

print("✅ Standard library imports complete")

# Third-party imports (alphabetical order)
import edgar
import numpy as np
import psycopg2
import requests
import torch

# Third-party from imports (alphabetical order by module)
from bs4 import BeautifulSoup
from edgar import Filing, find, set_identity, Company
from edgar.documents import parse_html
from edgar.documents.extractors.section_extractor import SectionExtractor
from huggingface_hub import login
from psycopg2 import pool
from psycopg2.extras import execute_values
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    AutoModelForTokenClassification, 
    pipeline,
    BitsAndBytesConfig
)

print("✅ Third-party imports complete")

# Environment imports
from kaggle_secrets import UserSecretsClient

print("✅ Environment imports complete")

# ============================================================================
# GITHUB REPOSITORY SETUP
# ============================================================================

print("🔧 Setting up GitHub repository access...")

# Get GitHub token and setup repository
user_secrets = UserSecretsClient()
GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")

# Clone/update repo for module access
REPO_URL = f"https://{GITHUB_TOKEN}@github.com/amiralpert/SmartReach.git"
LOCAL_PATH = "/kaggle/working/SmartReach"

if os.path.exists(LOCAL_PATH):
    !cd {LOCAL_PATH} && git pull origin main > /dev/null 2>&1
    print("✅ Repository updated")
else:
    !git clone {REPO_URL} {LOCAL_PATH} > /dev/null 2>&1
    print("✅ Repository cloned")

# Add to Python path
if f'{LOCAL_PATH}/BizIntel' not in sys.path:
    sys.path.insert(0, f'{LOCAL_PATH}/BizIntel')

print("✅ GitHub repository setup complete")

print("="*80)
print("🎉 CELL 0 COMPLETE - Package installation + imports + GitHub setup")
print("="*80)
print("   📦 Standard library: importlib, json, os, pickle, re, signal, sys, time, etc.")
print("   🔗 Third-party: edgar, numpy, psycopg2, requests, torch, transformers, bs4")
print("   🌐 Environment: kaggle_secrets")
print("   📂 GitHub: Repository cloned/updated and added to Python path")
print("   🧪 GLiNER: Installed (gliner==0.2.5, glirel==0.1.0) with compatible versions")
print("   📊 Console logging: Active and capturing all output")
print("   ✅ Dependency conflicts resolved - torch 2.6.0, transformers 4.41.0")

In [None]:
# Cell 1: GitHub Setup and Simplified Configuration

# ============================================================================
# START LOGGING FOR THIS CELL
# ============================================================================

start_cell_logging(1)

# ============================================================================
# GITHUB SETUP AND PATH CONFIGURATION
# ============================================================================

print("🔄 Setting up GitHub repository...")

# GitHub configuration
user_secrets = UserSecretsClient()
GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")
REPO_URL = f"https://{GITHUB_TOKEN}@github.com/amiralpert/SmartReach.git"
LOCAL_PATH = "/kaggle/working/SmartReach"

# Clone or update the repository
if os.path.exists(LOCAL_PATH):
    print("   📂 Repository exists, pulling latest changes...")
    !cd {LOCAL_PATH} && git pull origin main > /dev/null 2>&1
    print("   ✅ Repository updated")
else:
    print("   📥 Cloning repository...")
    !git clone {REPO_URL} {LOCAL_PATH} > /dev/null 2>&1
    print("   ✅ Repository cloned")

# Add paths for module imports
bizintel_path = f'{LOCAL_PATH}/BizIntel'
scripts_path = f'{LOCAL_PATH}/BizIntel/Scripts'

if bizintel_path not in sys.path:
    sys.path.insert(0, bizintel_path)
if scripts_path not in sys.path:
    sys.path.insert(0, scripts_path)

print(f"   ✅ Added {bizintel_path} to Python path")
print(f"   ✅ Added {scripts_path} to Python path")

# ============================================================================
# IMPORT MODULAR COMPONENTS
# ============================================================================

# Import from our modular EntityExtractionEngine
from EntityExtractionEngine import (
    SEC_FILINGS_PROMPT,
    SizeLimitedLRUCache,
    log_error,
    log_warning, 
    log_info,
    get_db_connection
)

print("✅ Imported modular EntityExtractionEngine components")

# ============================================================================
# CENTRALIZED CONFIGURATION
# ============================================================================

# Validate required secrets BEFORE the configuration below reads them, so a
# missing secret produces the actionable message instead of a raw lookup error.
# NOTE(review): GITHUB_TOKEN is also read earlier in this cell for the clone
# step, so that particular lookup can still fail before reaching this check.
required_secrets = ['NEON_HOST', 'NEON_DATABASE', 'NEON_USER', 'NEON_PASSWORD', 'GITHUB_TOKEN']
secret_values = {}
missing_secrets = []

for secret in required_secrets:
    try:
        value = user_secrets.get_secret(secret)
    except Exception:
        # Any lookup failure is reported the same way as an absent secret.
        value = None
    if value:
        secret_values[secret] = value
    else:
        missing_secrets.append(secret)

if missing_secrets:
    print(f"❌ Missing required secrets: {missing_secrets}")
    print("   Please add these secrets in Kaggle's Settings > Secrets")
    raise ValueError("Missing required secrets")

print("✅ All required secrets validated")

# Neon database configuration (from the validated secrets)
NEON_CONFIG = {
    'host': secret_values['NEON_HOST'],
    'database': secret_values['NEON_DATABASE'],
    'user': secret_values['NEON_USER'],
    'password': secret_values['NEON_PASSWORD'],
    'port': 5432,
    'sslmode': 'require'
}

# Complete centralized configuration
CONFIG = {
    'github': {
        'token': secret_values['GITHUB_TOKEN'],
        'repo_url': 'https://github.com/amiralpert/SmartReach.git',
        'local_path': '/kaggle/working/SmartReach',
        'branch': 'main'
    },
    'database': {
        'connection_pool_size': 5,
        'max_connections': 10,
        'connection_timeout': 30,
        'query_timeout': 60,
        'retry_attempts': 3,
        'batch_size': 100
    },
    'models': {
        'confidence_threshold': 0.75,
        'warm_up_enabled': True,
        'warm_up_text': 'Test entity extraction with biotechnology company.',
        'device_preference': 'auto',  # 'auto', 'cuda', 'cpu'
        'model_timeout': 30
    },
    'cache': {
        'enabled': True,
        'max_size_mb': 512,
        'ttl_hours': 24,
        'cleanup_interval': 3600
    },
    'processing': {
        'filing_batch_size': 3,
        'entity_batch_size': 50,
        'max_section_length': 50000,
        'enable_parallel': True,
        'max_workers': 4,
        'section_validation': True,
        'filing_query_limit': 10,
        'enable_relationships': True,
        'relationship_batch_size': 15,
        'context_window_chars': 400
    },
    'entity_extraction': {
        'max_chunk_size': 2000,  # Conservative limit for transformer models (512 tokens)
        'chunk_overlap': 200,     # Overlap between chunks to catch entities at boundaries
        'max_chunks_per_section': 50,  # Limit chunks to prevent excessive processing
        'enable_chunking': True   # Enable/disable text chunking for large documents
    },
    'gliner': {
        'enabled': True,
        'entity_model': 'urchade/gliner_medium-v2.1',
        'relation_model': 'jackboyla/glirel-base',
        'model_size': 'medium',  # 'small', 'medium', 'large'
        'entity_labels': [
            'Person', 'Filing Company', 'Private Company', 'Public Company',
            'Government Agency', 'Date', 'Money', 'Location', 'Product',
            'Technology', 'Financial Instrument', 'Law', 'Patent',
            'Drug', 'Disease', 'Regulatory Body'
        ],
        'confidence_threshold': 0.7,
        'relation_threshold': 0.6,
        'enable_relationships': True,
        'relation_types': [
            'employed_by', 'subsidiary_of', 'owns', 'part_of',
            'located_in', 'affiliated_with', 'contracts_with',
            'acquired_by', 'merged_with', 'partner_of'
        ],
        'max_text_length': 50000,
        'normalization': {
            'enable_coreference': True,
            'similarity_threshold': 0.85,
            'company_normalization': True
        },
        'output': {
            'verbose': False,
            'save_full_text': True,
            'include_context': True
        }
    },
    'llama': {
        'enabled': True,
        'model_name': 'meta-llama/Llama-3.1-8B-Instruct',
        'batch_size': 15,
        'max_new_tokens': 50,
        'context_window': 400,
        'temperature': 0.3,
        'entity_context_window': 400,
        'test_max_tokens': 50,
        'min_confidence_filter': 0.8,
        'timeout_seconds': 30,
        'SEC_FilingsPrompt': SEC_FILINGS_PROMPT,  # Now imported from module
    },
    'edgar': {
        'identity': 'SmartReach BizIntel amir.alpert@gmail.com',
        'rate_limit_delay': 0.1,
        'max_retries': 3,
        'timeout_seconds': 30
    }
}

# Configuration validation and display
print("\n🔧 Configuration Summary:")
print(f"   • Database: {NEON_CONFIG['host']} / {NEON_CONFIG['database']}")
print(f"   • Models: {len(['biobert', 'bert', 'roberta', 'finbert'])} NER models + Llama 3.1-8B")
print(f"   • GLiNER: {'Enabled' if CONFIG['gliner']['enabled'] else 'Disabled'} - {CONFIG['gliner']['model_size']} model")
print(f"   • Processing: {CONFIG['processing']['filing_batch_size']} filings/batch")
print(f"   • Cache: {CONFIG['cache']['max_size_mb']}MB limit")
print(f"   • Relationships: {'Enabled' if CONFIG['processing']['enable_relationships'] else 'Disabled'}")
print(f"   • Text Chunking: {CONFIG['entity_extraction']['max_chunk_size']} chars/chunk with {CONFIG['entity_extraction']['chunk_overlap']} overlap")

# ============================================================================
# INITIALIZE COMPONENTS
# ============================================================================

# Initialize global cache for section extraction using imported class
SECTION_CACHE = SizeLimitedLRUCache(max_size_mb=CONFIG['cache']['max_size_mb'])

# Create database connection function with NEON_CONFIG
def get_db_connection_configured():
    """Call ``get_db_connection`` with the notebook's ``NEON_CONFIG`` and return its result."""
    connection = get_db_connection(NEON_CONFIG)
    return connection

# ============================================================================
# MODULE CLEARING AND EDGARTOOLS SETUP
# ============================================================================

print("\n🧹 Clearing modules and setting up EdgarTools...")

# Clear any existing modules to ensure fresh imports
# NOTE(review): sys.modules is keyed by import name, and the project code in
# this cell is imported as `EntityExtractionEngine`, so the 'SmartReach'
# substring filter may match nothing - confirm which modules should be dropped.
modules_to_clear = [mod for mod in sys.modules.keys() if 'SmartReach' in mod]
for module in modules_to_clear:
    del sys.modules[module]

# Configure EdgarTools identity
set_identity(CONFIG['edgar']['identity'])
print(f"   ✅ EdgarTools identity set: {CONFIG['edgar']['identity']}")

# ============================================================================
# FINAL INITIALIZATION MESSAGES
# ============================================================================

print("\n" + "="*80)
print("🎉 CELL 1 INITIALIZATION COMPLETE")
print("="*80)

print(f"✅ GitHub repository ready at: {LOCAL_PATH}")
print(f"✅ Database connection configured: {NEON_CONFIG['host']}")
print(f"✅ Configuration loaded with {len(CONFIG)} main sections")
print(f"✅ Modular components imported from EntityExtractionEngine")
print(f"✅ Size-limited cache initialized: {CONFIG['cache']['max_size_mb']}MB limit")
print(f"✅ EdgarTools identity configured")
print(f"✅ Logging functions available: log_error, log_warning, log_info")
print(f"✅ Database context manager available: get_db_connection_configured()")
print(f"✅ Llama 3.1-8B relationship extraction prompt configured")
print(f"✅ GLiNER configuration: {CONFIG['gliner']['model_size']} model with {len(CONFIG['gliner']['entity_labels'])} entity types")
print(f"✅ Entity extraction chunking: {CONFIG['entity_extraction']['max_chunks_per_section']} chunks max")
print(f"✅ Console logging active from Cell -1 - using core.console_logs table")

print(f"\n🚀 Ready to proceed to Cell 2 for EdgarTools section extraction!")

In [None]:
# Cell 2: Database Functions and ORM-like Models with Batching - SIMPLIFIED

# Start real-time console logging for this cell
start_cell_logging(2)

# Basic startup check - restart kernel if issues persist
print("Starting Cell 2 - EdgarTools section extraction")

# Ensure identity is set
set_identity(CONFIG['edgar']['identity'])

# ============================================================================
# IMPORT MODULAR COMPONENTS
# ============================================================================

# Import from our modular EntityExtractionEngine
from EntityExtractionEngine import (
    TimeoutError,
    get_filing_sections,
    route_sections_to_models, 
    process_sec_filing_with_sections,
    get_unprocessed_filings
)

print("✅ Imported EdgarTools processing components from EntityExtractionEngine")

# ============================================================================
# WRAPPER FUNCTIONS FOR CONFIGURED COMPONENTS
# ============================================================================

def get_filing_sections_configured(accession_number: str, filing_type: str = None) -> Dict[str, str]:
    """Fetch a filing's sections via ``get_filing_sections`` with the shared cache and config."""
    sections = get_filing_sections(accession_number, filing_type, SECTION_CACHE, CONFIG)
    return sections

def process_sec_filing_configured(filing_data: Dict) -> Dict:
    """Run ``process_sec_filing_with_sections`` on one filing with the shared cache and config."""
    outcome = process_sec_filing_with_sections(filing_data, SECTION_CACHE, CONFIG)
    return outcome

def get_unprocessed_filings_configured(limit: int = 5) -> List[Dict]:
    """Return up to ``limit`` filings from ``get_unprocessed_filings``.

    The connection factory ``get_db_connection_configured`` is passed through
    uncalled; the callee decides when to open a connection.
    """
    pending = get_unprocessed_filings(get_db_connection_configured, limit)
    return pending

# ============================================================================
# EDGAR FILING EXTRACTION
# ============================================================================

# Extract sections from unprocessed SEC filings
log_info("EdgarExtraction", "Starting section extraction with timeout protection")

# Only a single filing is requested here; it is processed as a smoke test of
# the section-extraction path.
unprocessed_filings = get_unprocessed_filings_configured(limit=1)

if unprocessed_filings:
    print(f"\n📄 Processing filing: {unprocessed_filings[0]['company_domain']} - {unprocessed_filings[0]['filing_type']}")
    print(f"   Accession: {unprocessed_filings[0]['accession_number']}")
    
    # `filing_result` is a notebook-level variable that Cell 3 reads.
    # NOTE(review): it is only assigned in this branch, so when no filing is
    # returned a value from an earlier run may still be present - confirm.
    filing_result = process_sec_filing_configured(unprocessed_filings[0])
    
    # Report the outcome according to the status string in the result dict.
    if filing_result['processing_status'] == 'success':
        log_info("EdgarExtraction", f"✅ Successfully extracted {filing_result['total_sections']} sections")
    elif filing_result['processing_status'] == 'timeout':
        log_warning("EdgarExtraction", f"⏱️ Processing timed out - filing may be too large or slow")
    elif filing_result['processing_status'] == 'skipped':
        log_info("EdgarExtraction", f"⏭️ Skipped problematic filing")
    else:
        # Any other status is treated as a failure.
        log_error("EdgarExtraction", f"❌ Section extraction failed: {filing_result.get('error')}")
else:
    log_info("EdgarExtraction", "No unprocessed filings available (all may be processed or problematic)")

print("✅ Cell 2 complete - EdgarTools section extraction with timeout protection ready")

In [None]:
# Cell 3: Entity Extraction Pipeline with GLiNER

# Start real-time console logging for this cell
start_cell_logging(3)

print("🔬 Initializing GLiNER Entity Extraction Pipeline")

# ============================================================================
# IMPORT GLINER COMPONENTS
# ============================================================================

from EntityExtractionEngine import (
    GLiNEREntityExtractor,
    GLiNEREntity,
    GLiNERRelationship,
    GLiNEREntityStorage,
    GLiNERLlamaBridge,
    create_gliner_storage,
    create_gliner_llama_bridge,
    GLINER_AVAILABLE
)

if not GLINER_AVAILABLE:
    print("❌ GLiNER components not available")
    print("   Ensure Cell 0 ran successfully with GLiNER package installation")
    raise ImportError("GLiNER not available")

print("✅ GLiNER components imported successfully")

# ============================================================================
# INITIALIZE GLINER EXTRACTOR
# ============================================================================

print("\n🔧 Initializing GLiNER extractor...")

# Build the extractor; a failure is reported but does not stop the cell.
try:
    # Initialize GLiNER with configuration from Cell 1
    gliner_extractor = GLiNEREntityExtractor(
        model_size=CONFIG['gliner']['model_size'],
        labels=CONFIG['gliner']['entity_labels'],
        threshold=CONFIG['gliner']['confidence_threshold'],
        enable_relationships=CONFIG['gliner']['enable_relationships'],
        debug=CONFIG['gliner']['output']['verbose']
    )
    
    print("🔬 Extractor: Initialized")
    
except Exception as e:
    print(f"❌ Failed to initialize GLiNER: {e}")
    print("   Note: GLiNER models will be downloaded on first use (may take a moment)")
    # No lazy-loading fallback is built here: the extractor is left as None,
    # and GLiNERPipelineWrapper.extract_entities raises RuntimeError if it is
    # used in that state.
    gliner_extractor = None

# ============================================================================
# INITIALIZE STORAGE COMPONENTS
# ============================================================================

print("🔧 Initializing storage components...")

# Initialize GLiNER-specific storage
gliner_storage = create_gliner_storage(NEON_CONFIG)
gliner_bridge = create_gliner_llama_bridge(CONFIG)

print("💾 Storage: Ready for database and memory operations")

# ============================================================================
# CREATE WRAPPER FOR COMPATIBILITY
# ============================================================================

class GLiNERPipelineWrapper:
    """Adapter exposing the GLiNER extractor through the pipeline's entity interface.

    Attributes:
        extractor: Object providing ``extract_with_relationships``; may be
            ``None``, in which case ``extract_entities`` raises.
        storage: Storage handle kept for callers; not used by the methods here.
        bridge: Object providing ``store_gliner_results``.
    """

    # Sections whose stripped text is shorter than this are not sent to the model.
    MIN_SECTION_CHARS = 100

    def __init__(self, extractor, storage, bridge):
        self.extractor = extractor
        self.storage = storage
        self.bridge = bridge

    def extract_entities(self, text, section_name, filing_context=None):
        """Extract entities from one section of text.

        Args:
            text: Section text handed to the extractor.
            section_name: Label written into every returned record under
                the ``'section_name'`` key.
            filing_context: Optional dict passed to the extractor; when it
                has an ``'accession'`` key the raw result is also handed to
                the bridge under that accession.

        Returns:
            The list found under ``'entity_records'`` in the extractor result
            (empty if absent), with ``section_name`` added to each record.

        Raises:
            RuntimeError: If no extractor was supplied.
        """
        print(f"🐛 Cell3 Debug - Processing section: '{section_name}'")
        print(f"🐛 Cell3 Debug - Text length: {len(text)} chars")

        if not self.extractor:
            raise RuntimeError("GLiNER extractor not initialized")

        # Use GLiNER extraction
        result = self.extractor.extract_with_relationships(
            text,
            filing_context or {},
            include_full_text=True
        )

        # Store in memory bridge
        if filing_context and 'accession' in filing_context:
            self.bridge.store_gliner_results(filing_context['accession'], result)

        # Add section name to each entity record before returning
        entity_records = result.get('entity_records', [])
        print(f"🐛 Cell3 Debug - Found {len(entity_records)} entities BEFORE adding section names")

        for i, record in enumerate(entity_records):
            record['section_name'] = section_name  # Add section name to entity record
            print(f"🐛 Cell3 Debug - Added section_name '{section_name}' to entity {i}: '{record.get('entity_text', 'NO_TEXT')}'")

        print(f"🐛 Cell3 Debug - Returning {len(entity_records)} entities WITH section names")
        return entity_records

    def process_filing(self, filing_data, sections):
        """Run entity extraction over every usable section of a filing.

        Args:
            filing_data: Dict describing the filing; ``accession_number``,
                ``company_domain``, ``filing_type`` and ``filing_date`` are
                read with ``''`` as the fallback.
            sections: Mapping of section name to section text.  Entries that
                are ``None``, empty, or shorter than ``MIN_SECTION_CHARS``
                after stripping are skipped.

        Returns:
            One flat list with the entity records of all processed sections.
        """
        print(f"🐛 Cell3 Debug - process_filing called with {len(sections)} sections: {list(sections.keys())}")

        all_entities = []

        for section_name, section_text in sections.items():
            # Measure defensively: the text may be None, and the guard below
            # must get the chance to skip it instead of len() raising here.
            section_len = len(section_text) if section_text else 0
            print(f"🐛 Cell3 Debug - Processing section '{section_name}' ({section_len} chars)")

            if not section_text or len(section_text.strip()) < self.MIN_SECTION_CHARS:
                print(f"🐛 Cell3 Debug - Skipping section '{section_name}' - too short")
                continue

            filing_context = {
                'accession': filing_data.get('accession_number', ''),
                'company': filing_data.get('company_domain', ''),
                'section': section_name,
                'filing_type': filing_data.get('filing_type', ''),
                'filing_date': filing_data.get('filing_date', '')
            }

            print(f"🐛 Cell3 Debug - About to call extract_entities for '{section_name}'")
            entities = self.extract_entities(section_text, section_name, filing_context)
            print(f"🐛 Cell3 Debug - Got {len(entities)} entities back from extract_entities")

            all_entities.extend(entities)

        print(f"🐛 Cell3 Debug - Total entities collected: {len(all_entities)}")
        return all_entities

# Create the wrapper for pipeline compatibility
entity_pipeline = GLiNERPipelineWrapper(gliner_extractor, gliner_storage, gliner_bridge)

print("🔗 Pipeline wrapper: Compatible with existing architecture")

# ============================================================================
# PROCESS SEC FILING SECTIONS FROM CELL 2
# ============================================================================

# Use actual variables from Cell 2
# Run only when Cell 2 left a successful `filing_result` in the notebook
# namespace; the globals() check avoids a NameError if Cell 2 never set it.
if 'filing_result' in globals() and filing_result.get('processing_status') == 'success':
    
    # Get actual SEC sections extracted by EdgarTools
    sections = filing_result['sections']
    # Metadata of the filing processed in Cell 2 (the first list entry).
    filing_data = unprocessed_filings[0] if unprocessed_filings else {}
    
    print(f"\n🔍 Processing {len(sections)} SEC filing sections with GLiNER+GLiREL...")
    
    # Process sections with GLiNER entity extraction + GLiREL relationship extraction  
    all_entities = entity_pipeline.process_filing(filing_data, sections)
    
    # Store results in database
    # NOTE(review): the filing metadata is passed both inside the payload
    # ('filing') and as the second argument - confirm the storage API needs both.
    gliner_storage.store_gliner_entities({
        'entity_records': all_entities,
        'filing': filing_data  
    }, filing_data)
    
    print(f"✅ Processing complete: {len(all_entities)} entities stored")

else:
    print("⚠️ No SEC filing sections available from Cell 2")

print("\n✅ Cell 3 complete - GLiNER entity extraction pipeline ready")
print("📝 Usage: entity_pipeline.extract_entities(text, section, context)")

In [None]:
# Cell 4: Relationship Extractor with Local Llama 3.1-8B - MODULARIZED

# Start real-time console logging for this cell
# This is Cell 4, so log under 4 (the call previously used 5, which filed this
# cell's console rows under the wrong cell number).
start_cell_logging(4)

print("🦙 Loading Relationship Extractor with Local Llama 3.1-8B...")

# ============================================================================
# IMPORT MODULAR COMPONENTS
# ============================================================================

# Import from our modular EntityExtractionEngine
from EntityExtractionEngine import (
    RelationshipExtractor,
    SemanticRelationshipStorage,
    PipelineEntityStorage,
    process_filings_batch,
    generate_pipeline_analytics_report
)

print("✅ Imported relationship processing components from EntityExtractionEngine")

# ============================================================================
# INITIALIZE GLOBAL OBJECTS
# ============================================================================

print("🔧 Initializing pipeline components...")

# Initialize relationship extraction and storage components
relationship_extractor = RelationshipExtractor(CONFIG)
# NOTE(review): CONFIG['database'] only holds pool/timeout/batch settings; the
# connection credentials live in NEON_CONFIG - confirm these classes expect
# this dict rather than NEON_CONFIG.
semantic_storage = SemanticRelationshipStorage(CONFIG['database'])
pipeline_storage = PipelineEntityStorage(CONFIG['database'])

print("✅ Pipeline components initialized:")
print(f"   🦙 Llama model status: {'✅ Loaded' if relationship_extractor.model else '❌ Failed'}")
print(f"   💾 Storage systems: ✅ Entity & ✅ Relationship storage initialized")

# ============================================================================
# WRAPPER FUNCTIONS FOR CONFIGURED PROCESSING
# ============================================================================

def process_filings_batch_configured(limit: int = None) -> Dict:
    """Run ``process_filings_batch`` with the notebook's pipeline objects.

    Args:
        limit: Passed through unchanged as the last argument; ``None`` leaves
            the choice to ``process_filings_batch``.

    Returns:
        Whatever ``process_filings_batch`` returns.
    """
    return process_filings_batch(
        entity_pipeline, relationship_extractor, pipeline_storage, 
        semantic_storage, CONFIG, limit
    )

print("✅ Cell 4 complete - Relationship extraction and storage ready")
print(f"   🎯 Batch processing: process_filings_batch_configured() function ready")
print(f"   📊 Analytics: generate_pipeline_analytics_report() function ready")

In [None]:
# Cell 5: Main Processing Pipeline with Relationship Extraction - MODULARIZED

# Start real-time console logging for this cell
# This is Cell 5, so log under 5 (the call previously used 6, which filed this
# cell's console rows under the wrong cell number).
start_cell_logging(5)

# ============================================================================
# IMPORT MODULAR COMPONENTS
# ============================================================================

# Import from our modular EntityExtractionEngine
from EntityExtractionEngine import execute_main_pipeline

print("✅ Imported main pipeline orchestrator from EntityExtractionEngine")

# ============================================================================
# EXECUTE MAIN PIPELINE
# ============================================================================

# Execute the complete SEC filing processing pipeline using the objects
# created in Cells 3 and 4; the return value is kept in `results`.
results = execute_main_pipeline(
    entity_pipeline, 
    relationship_extractor, 
    pipeline_storage, 
    semantic_storage, 
    CONFIG
)

print("✅ Cell 5 complete - Main pipeline execution finished")