In [None]:
# Cell 0: Consolidated Imports and Auto-Logger Bootstrap
# 
# Purpose: Consolidate every import used by the notebook in one place, grouped in
# PEP 8 order (standard library, third-party, environment), then load the
# auto-logger module. All other setup happens in Cell 1.

# ============================================================================
# CONSOLIDATED IMPORTS - ALL IMPORTS FOR THE NOTEBOOK
# ============================================================================

# Standard library imports (alphabetical order)
import importlib
import importlib.util
import json
import os
import pickle
import re
import signal
import sys
import time
import traceback
import uuid
import warnings

# Standard library from imports (alphabetical order)
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import contextmanager
from datetime import datetime
from functools import wraps
from pathlib import Path
from typing import Dict, List, Optional, Any, Set, Tuple

# Third-party imports (alphabetical order)
import edgar
import numpy as np
import psycopg2
import requests
import torch

# Third-party from imports (alphabetical order by module)
from bs4 import BeautifulSoup
from edgar import Filing, find, set_identity, Company
from edgar.documents import parse_html
from edgar.documents.extractors.section_extractor import SectionExtractor
from huggingface_hub import login
from IPython import get_ipython
from ipykernel.iostream import OutStream
from psycopg2 import pool
from psycopg2.extras import execute_values
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    AutoModelForTokenClassification, 
    pipeline,
    BitsAndBytesConfig
)

# Environment imports
from kaggle_secrets import UserSecretsClient

# ============================================================================
# AUTO-LOGGER BOOTSTRAP (USING CONSOLIDATED IMPORTS)
# ============================================================================

# Get GitHub token for logger access
user_secrets = UserSecretsClient()
GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")

print("🔧 Setting up consolidated imports and logger bootstrap...")

# Clone/update repo for logger access
REPO_URL = f"https://{GITHUB_TOKEN}@github.com/amiralpert/SmartReach.git"
LOCAL_PATH = "/kaggle/working/SmartReach"

if os.path.exists(LOCAL_PATH):
    !cd {LOCAL_PATH} && git pull origin main > /dev/null 2>&1
else:
    !git clone {REPO_URL} {LOCAL_PATH} > /dev/null 2>&1

# Add to path
if f'{LOCAL_PATH}/BizIntel' not in sys.path:
    sys.path.insert(0, f'{LOCAL_PATH}/BizIntel')

# Initialize logger with minimal setup
logger_path = f"{LOCAL_PATH}/BizIntel/Scripts/KaggleLogger/auto_logger.py"
if os.path.exists(logger_path):
    spec = importlib.util.spec_from_file_location("auto_logger", logger_path)
    auto_logger = importlib.util.module_from_spec(spec)
    sys.modules["auto_logger"] = auto_logger
    spec.loader.exec_module(auto_logger)
    
    # Simple logger setup - database manager will be provided by Cell 1
    logger = None  # Will be properly initialized after Cell 1 runs
    print("✅ Auto-logger module loaded")
else:
    logger = None
    print("⚠️  Logger module not found - continuing without logging")

print("✅ Cell 0: All imports consolidated (33+ imports) + bootstrap complete")
print("   📦 Standard library: importlib, json, os, pickle, re, signal, sys, time, etc.")
print("   🔗 Third-party: edgar, numpy, psycopg2, requests, torch, transformers, bs4")
print("   🌐 Environment: kaggle_secrets")

In [None]:
# Cell 1: GitHub Setup and Simplified Configuration

# Install required packages first
!pip install edgartools transformers torch accelerate huggingface_hub requests beautifulsoup4 'lxml[html_clean]' uuid numpy newspaper3k --quiet
!pip install -U bitsandbytes --quiet

print("🔧 Installing additional packages...")
!pip install psycopg2-binary --quiet
!pip install accelerate --quiet

print("✅ All packages installed successfully")

# ============================================================================
# GITHUB SETUP AND PATH CONFIGURATION
# ============================================================================

print("\n🔄 Setting up GitHub repository...")

# GitHub configuration
user_secrets = UserSecretsClient()
GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")
REPO_URL = f"https://{GITHUB_TOKEN}@github.com/amiralpert/SmartReach.git"
LOCAL_PATH = "/kaggle/working/SmartReach"

# Clone or update the repository
if os.path.exists(LOCAL_PATH):
    print("   📂 Repository exists, pulling latest changes...")
    !cd {LOCAL_PATH} && git pull origin main > /dev/null 2>&1
    print("   ✅ Repository updated")
else:
    print("   📥 Cloning repository...")
    !git clone {REPO_URL} {LOCAL_PATH} > /dev/null 2>&1
    print("   ✅ Repository cloned")

# Add paths for module imports
bizintel_path = f'{LOCAL_PATH}/BizIntel'
scripts_path = f'{LOCAL_PATH}/BizIntel/Scripts'

if bizintel_path not in sys.path:
    sys.path.insert(0, bizintel_path)
if scripts_path not in sys.path:
    sys.path.insert(0, scripts_path)

print(f"   ✅ Added {bizintel_path} to Python path")
print(f"   ✅ Added {scripts_path} to Python path")

# ============================================================================
# IMPORT MODULAR COMPONENTS
# ============================================================================

# Import from our modular EntityExtractionEngine
from EntityExtractionEngine import (
    SEC_FILINGS_PROMPT,
    SizeLimitedLRUCache,
    log_error,
    log_warning, 
    log_info,
    get_db_connection
)

print("✅ Imported modular EntityExtractionEngine components")

# ============================================================================
# CENTRALIZED CONFIGURATION
# ============================================================================

# Required secrets are validated FIRST, before any configuration reads them, so a
# missing or empty secret is reported with the message below instead of surfacing
# from the middle of the config construction. Each secret is fetched once and the
# fetched value is reused when building NEON_CONFIG and CONFIG.
required_secrets = ['NEON_HOST', 'NEON_DATABASE', 'NEON_USER', 'NEON_PASSWORD', 'GITHUB_TOKEN']
missing_secrets = []
secret_values = {}

for secret in required_secrets:
    try:
        value = user_secrets.get_secret(secret)
    except Exception:
        # Any lookup failure counts as "missing"; all gaps are reported together
        value = None
    if value:
        secret_values[secret] = value
    else:
        missing_secrets.append(secret)

if missing_secrets:
    print(f"❌ Missing required secrets: {missing_secrets}")
    print("   Please add these secrets in Kaggle's Settings > Secrets")
    raise ValueError("Missing required secrets")

print("✅ All required secrets validated")

# Neon database configuration (from the validated secrets)
NEON_CONFIG = {
    'host': secret_values['NEON_HOST'],
    'database': secret_values['NEON_DATABASE'],
    'user': secret_values['NEON_USER'],
    'password': secret_values['NEON_PASSWORD'],
    'port': 5432,
    'sslmode': 'require'
}

# Complete centralized configuration.
# NOTE(review): CONFIG carries the GitHub token; avoid printing or logging CONFIG whole.
CONFIG = {
    'github': {
        'token': secret_values['GITHUB_TOKEN'],
        'repo_url': 'https://github.com/amiralpert/SmartReach.git',
        'local_path': '/kaggle/working/SmartReach',
        'branch': 'main'
    },
    'database': {
        'connection_pool_size': 5,
        'max_connections': 10,
        'connection_timeout': 30,
        'query_timeout': 60,
        'retry_attempts': 3,
        'batch_size': 100
    },
    'models': {
        'confidence_threshold': 0.75,
        'warm_up_enabled': True,
        'warm_up_text': 'Test entity extraction with biotechnology company.',
        'device_preference': 'auto',  # 'auto', 'cuda', 'cpu'
        'model_timeout': 30
    },
    'cache': {
        'enabled': True,
        'max_size_mb': 512,
        'ttl_hours': 24,
        'cleanup_interval': 3600
    },
    'processing': {
        'filing_batch_size': 3,
        'entity_batch_size': 50,
        'max_section_length': 50000,
        'enable_parallel': True,
        'max_workers': 4,
        'section_validation': True,
        'filing_query_limit': 10,
        'enable_relationships': True,
        'relationship_batch_size': 15,
        'context_window_chars': 400
    },
    'llama': {
        'enabled': True,
        'model_name': 'meta-llama/Llama-3.1-8B-Instruct',
        'batch_size': 15,
        'max_new_tokens': 50,
        'context_window': 400,
        'temperature': 0.3,
        'entity_context_window': 400,
        'test_max_tokens': 50,
        'min_confidence_filter': 0.8,
        'timeout_seconds': 30,
        'SEC_FilingsPrompt': SEC_FILINGS_PROMPT,  # Now imported from module
    },
    'edgar': {
        'identity': 'SmartReach BizIntel amir.alpert@gmail.com',
        'rate_limit_delay': 0.1,
        'max_retries': 3,
        'timeout_seconds': 30
    }
}

# Human-readable recap of the key configuration values.
# NOTE(review): the database host and name end up in the saved cell output.
print(
    "\n🔧 Configuration Summary:",
    f"   • Database: {NEON_CONFIG['host']} / {NEON_CONFIG['database']}",
    f"   • Models: {len(['biobert', 'bert', 'roberta', 'finbert'])} NER models + Llama 3.1-8B",
    f"   • Processing: {CONFIG['processing']['filing_batch_size']} filings/batch",
    f"   • Cache: {CONFIG['cache']['max_size_mb']}MB limit",
    f"   • Relationships: {'Enabled' if CONFIG['processing']['enable_relationships'] else 'Disabled'}",
    sep="\n",
)

# ============================================================================
# INITIALIZE COMPONENTS
# ============================================================================

# Notebook-wide section cache, sized from the cache settings in CONFIG
SECTION_CACHE = SizeLimitedLRUCache(
    max_size_mb=CONFIG['cache']['max_size_mb'],
)

# Create database connection function with NEON_CONFIG
def get_db_connection_configured():
    """Call ``get_db_connection`` with the notebook's ``NEON_CONFIG``.

    Returns:
        Whatever ``get_db_connection(NEON_CONFIG)`` returns, unchanged.
    """
    connection = get_db_connection(NEON_CONFIG)
    return connection

# ============================================================================
# MODULE CLEARING AND EDGARTOOLS SETUP
# ============================================================================

print("\n🧹 Clearing modules and setting up EdgarTools...")

# Evict every cached module whose name contains 'SmartReach' so later imports reload it.
# NOTE(review): the project modules imported in this notebook are named
# 'EntityExtractionEngine' / 'auto_logger', so this filter may match nothing - confirm
# the intended pattern before relying on it.
modules_to_clear = [name for name in sys.modules.keys() if 'SmartReach' in name]
for module in modules_to_clear:
    sys.modules.pop(module)

# Register the EdgarTools identity string taken from CONFIG
set_identity(CONFIG['edgar']['identity'])
print(f"   ✅ EdgarTools identity set: {CONFIG['edgar']['identity']}")

# ============================================================================
# FINAL INITIALIZATION MESSAGES
# ============================================================================

# Banner announcing the end of Cell 1
print(
    "\n" + "=" * 80,
    "🎉 CELL 1 INITIALIZATION COMPLETE",
    "=" * 80,
    sep="\n",
)

# Checklist of what this cell made available to the following cells
print(
    f"✅ GitHub repository ready at: {LOCAL_PATH}",
    f"✅ Database connection configured: {NEON_CONFIG['host']}",
    f"✅ Configuration loaded with {len(CONFIG)} main sections",
    "✅ Modular components imported from EntityExtractionEngine",
    f"✅ Size-limited cache initialized: {CONFIG['cache']['max_size_mb']}MB limit",
    "✅ EdgarTools identity configured",
    "✅ Logging functions available: log_error, log_warning, log_info",
    "✅ Database context manager available: get_db_connection_configured()",
    "✅ Llama 3.1-8B relationship extraction prompt configured",
    sep="\n",
)

print("\n🚀 Ready to proceed to Cell 2 for EdgarTools section extraction!")

In [None]:
# Cell 2: EdgarTools Section Extraction - Configured Wrappers and Smoke Test

# Basic startup check - restart kernel if issues persist
print("Starting Cell 2 - EdgarTools section extraction")

# Re-apply the EdgarTools identity so this cell does not depend on Cell 1's call
set_identity(CONFIG['edgar']['identity'])

# ============================================================================
# IMPORT MODULAR COMPONENTS
# ============================================================================

# Section-extraction helpers from the modular EntityExtractionEngine.
# NOTE(review): importing `TimeoutError` rebinds the name and shadows Python's
# built-in TimeoutError for the rest of the notebook - confirm that is intended.
from EntityExtractionEngine import TimeoutError
from EntityExtractionEngine import get_filing_sections, route_sections_to_models
from EntityExtractionEngine import process_sec_filing_with_sections, get_unprocessed_filings

print("✅ Imported EdgarTools processing components from EntityExtractionEngine")

# ============================================================================
# WRAPPER FUNCTIONS FOR CONFIGURED COMPONENTS
# ============================================================================

def get_filing_sections_configured(accession_number: str, filing_type: Optional[str] = None) -> Dict[str, str]:
    """Get filing sections using the notebook-wide cache and configuration.

    Thin wrapper that forwards to ``get_filing_sections`` together with the global
    ``SECTION_CACHE`` and ``CONFIG``.

    Args:
        accession_number: Accession number identifying the filing.
        filing_type: Optional filing type; ``None`` (the default) is passed through
            unchanged to ``get_filing_sections``.

    Returns:
        The value returned by ``get_filing_sections`` (annotated as a mapping of
        section name to section text).
    """
    return get_filing_sections(accession_number, filing_type, SECTION_CACHE, CONFIG)

def process_sec_filing_configured(filing_data: Dict) -> Dict:
    """Run ``process_sec_filing_with_sections`` on one filing.

    The notebook-wide ``SECTION_CACHE`` and ``CONFIG`` are supplied automatically.

    Args:
        filing_data: Filing record to process; passed through unchanged.

    Returns:
        The result produced by ``process_sec_filing_with_sections``.
    """
    outcome = process_sec_filing_with_sections(filing_data, SECTION_CACHE, CONFIG)
    return outcome

def get_unprocessed_filings_configured(limit: int = 5) -> List[Dict]:
    """Fetch unprocessed filings through the configured database connection.

    Args:
        limit: Forwarded to ``get_unprocessed_filings``; defaults to 5.

    Returns:
        The value returned by ``get_unprocessed_filings``.
    """
    # The connection factory itself (not an open connection) is handed over
    pending = get_unprocessed_filings(get_db_connection_configured, limit)
    return pending

# ============================================================================
# TESTING AND VALIDATION
# ============================================================================

# Smoke test: run the configured section extraction on a single unprocessed filing
log_info("Test", "Starting section extraction test with timeout protection")

test_filings = get_unprocessed_filings_configured(limit=1)

if not test_filings:
    # Nothing came back to test against
    log_info("Test", "No test filings available (all may be processed or problematic)")
else:
    sample_filing = test_filings[0]
    print(f"\n🧪 Testing with filing: {sample_filing['company_domain']} - {sample_filing['filing_type']}")
    print(f"   Accession: {sample_filing['accession_number']}")

    test_result = process_sec_filing_configured(sample_filing)
    status = test_result['processing_status']

    # Report the outcome at a log level matching the processing status
    if status == 'success':
        log_info("Test", f"✅ Successfully extracted {test_result['total_sections']} sections")
    elif status == 'timeout':
        log_warning("Test", "⏱️ Processing timed out - filing may be too large or slow")
    elif status == 'skipped':
        log_info("Test", "⏭️ Skipped problematic filing")
    else:
        log_error("Test", f"❌ Section extraction failed: {test_result.get('error')}")

print("✅ Cell 2 complete - EdgarTools section extraction with timeout protection ready")

In [None]:
# Cell 3: Optimized Entity Extraction Pipeline - Uses Cell 2's Pre-Extracted Sections

print("🚀 Loading Optimized EntityExtractionPipeline (Handler Classes Eliminated)...")

# ============================================================================
# IMPORT MODULAR COMPONENTS
# ============================================================================

# Pipeline class from the modular EntityExtractionEngine
from EntityExtractionEngine import EntityExtractionPipeline

print("✅ Imported EntityExtractionPipeline from EntityExtractionEngine")

# ============================================================================
# INITIALIZE PIPELINE
# ============================================================================

# Build the pipeline from the centralized CONFIG
entity_pipeline = EntityExtractionPipeline(CONFIG)

# Echo the pipeline's self-reported statistics, one bullet per entry
print("✅ EntityExtractionPipeline initialized:")
stats = entity_pipeline.get_extraction_stats()
for stat_name, stat_value in stats.items():
    print(f"   • {stat_name}: {stat_value}")

# ============================================================================
# WRAPPER FUNCTION FOR CONFIGURED PROCESSING
# ============================================================================

def process_filing_entities_configured(filing_data: Dict) -> List[Dict]:
    """Extract entities from one filing with the notebook's configured pipeline.

    Delegates to ``entity_pipeline.process_filing_entities`` and hands it
    ``process_sec_filing_configured`` (defined in Cell 2) as the second argument.

    Args:
        filing_data: Filing record to process; passed through unchanged.

    Returns:
        The value returned by ``entity_pipeline.process_filing_entities``.
    """
    extracted = entity_pipeline.process_filing_entities(filing_data, process_sec_filing_configured)
    return extracted

print("✅ Cell 3 complete - Optimized entity extraction ready (handler classes eliminated)")

In [None]:
# Cell 4: Relationship Extractor with Local Llama 3.1-8B - MODULARIZED

print("🦙 Loading Relationship Extractor with Local Llama 3.1-8B...")

# ============================================================================
# IMPORT MODULAR COMPONENTS
# ============================================================================

# Relationship extraction, storage and batch helpers from EntityExtractionEngine
from EntityExtractionEngine import RelationshipExtractor
from EntityExtractionEngine import SemanticRelationshipStorage, PipelineEntityStorage
from EntityExtractionEngine import process_filings_batch, generate_pipeline_analytics_report

print("✅ Imported relationship processing components from EntityExtractionEngine")

# ============================================================================
# INITIALIZE GLOBAL OBJECTS
# ============================================================================

print("🔧 Initializing pipeline components...")

# The extractor receives the whole CONFIG; both storages receive only its
# 'database' section.
# NOTE(review): CONFIG['database'] holds pool/timeout/batch settings, not the
# connection credentials kept in NEON_CONFIG - confirm the storages obtain
# credentials some other way.
storage_settings = CONFIG['database']
relationship_extractor = RelationshipExtractor(CONFIG)
semantic_storage = SemanticRelationshipStorage(storage_settings)
pipeline_storage = PipelineEntityStorage(storage_settings)

# A truthy `model` attribute on the extractor is reported as a successful load
if relationship_extractor.model:
    llama_status = '✅ Loaded'
else:
    llama_status = '❌ Failed'

print("✅ Pipeline components initialized:")
print(f"   🦙 Llama model status: {llama_status}")
print("   💾 Storage systems: ✅ Entity & ✅ Relationship storage initialized")

# ============================================================================
# WRAPPER FUNCTIONS FOR CONFIGURED PROCESSING
# ============================================================================

def process_filings_batch_configured(limit: Optional[int] = None) -> Dict:
    """Process multiple SEC filings using the configured pipeline components.

    Thin wrapper around ``process_filings_batch`` that supplies the notebook-level
    ``entity_pipeline``, ``relationship_extractor``, ``pipeline_storage``,
    ``semantic_storage`` and ``CONFIG``.

    Args:
        limit: Optional limit forwarded as the last argument; ``None`` (the default)
            is passed through unchanged, so ``process_filings_batch`` decides what
            it means.

    Returns:
        The value returned by ``process_filings_batch``.
    """
    return process_filings_batch(
        entity_pipeline, relationship_extractor, pipeline_storage, 
        semantic_storage, CONFIG, limit
    )

print("✅ Cell 4 complete - Relationship extraction and storage ready")
print(f"   🎯 Batch processing: process_filings_batch_configured() function ready")
print(f"   📊 Analytics: generate_pipeline_analytics_report() function ready")

In [None]:
# Cell 5: Main Processing Pipeline with Relationship Extraction - MODULARIZED

# ============================================================================
# IMPORT MODULAR COMPONENTS
# ============================================================================

# Import from our modular EntityExtractionEngine
from EntityExtractionEngine import execute_main_pipeline

print("✅ Imported main pipeline orchestrator from EntityExtractionEngine")

# ============================================================================
# EXECUTE MAIN PIPELINE
# ============================================================================

# Run the orchestrator with the components built in Cells 3 and 4 plus CONFIG.
# Arguments are positional, in the order: entity pipeline, relationship extractor,
# entity storage, relationship storage, configuration.
results = execute_main_pipeline(
    entity_pipeline, relationship_extractor,
    pipeline_storage, semantic_storage,
    CONFIG,
)

print("✅ Cell 5 complete - Main pipeline execution finished")