In [None]:
# Cell 6: Batch Processing and Monitoring Dashboard

def run_batch_extraction(batch_size: int = 5, max_filings: int = None) -> Dict[str, Any]:
    """Run entity extraction over pending SEC filings in fixed-size batches.

    Args:
        batch_size: Number of filings processed between pauses; must be >= 1.
        max_filings: Maximum number of filings to fetch for this run. A falsy
            value (None or 0) falls back to a cap of 100 filings.

    Returns:
        A summary dict with the keys 'total_filings', 'successful_filings',
        'failed_filings', 'total_entities_extracted', 'processing_start_time',
        'processing_end_time', 'total_processing_time', 'success_rate' and
        'results' (one entry per filing). When nothing is pending the dict is
        ``{'success': False, 'message': 'No filings to process'}`` instead.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A zero step makes range() raise and a negative one silently skips all work.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # The fetch is always capped, so report the real cap instead of "unlimited".
    filing_limit = max_filings or 100
    limit_note = '' if max_filings else ' (default cap)'

    print("🚀 Starting batch entity extraction...")
    print(f"   📦 Batch size: {batch_size}")
    print(f"   📊 Max filings: {filing_limit}{limit_note}")

    # Get filings to process
    filings_to_process = fetcher.get_filings_to_process(limit=filing_limit)

    if not filings_to_process:
        print("📭 No unprocessed filings found")
        return {'success': False, 'message': 'No filings to process'}

    total_filings = len(filings_to_process)
    print(f"   📄 Found {total_filings} filings to process")

    # Initialize batch tracking
    batch_results = {
        'total_filings': total_filings,
        'successful_filings': 0,
        'failed_filings': 0,
        'total_entities_extracted': 0,
        'processing_start_time': datetime.now(),
        'results': []
    }

    # Ceiling division; constant for the whole run, so computed once.
    total_batches = (total_filings + batch_size - 1) // batch_size

    # Process filings in batches
    for batch_number, offset in enumerate(range(0, total_filings, batch_size), start=1):
        batch_filings = filings_to_process[offset:offset + batch_size]

        print(f"\n📦 Processing batch {batch_number}/{total_batches} ({len(batch_filings)} filings)")

        for filing in batch_filings:
            try:
                result = process_single_filing(filing)

                if result['success']:
                    batch_results['successful_filings'] += 1
                    batch_results['total_entities_extracted'] += result['entities_extracted']
                    print(f"   ✅ {filing['company_domain']} - {filing['filing_type']}: {result['entities_extracted']} entities")
                else:
                    batch_results['failed_filings'] += 1
                    print(f"   ❌ {filing['company_domain']} - {filing['filing_type']}: {result.get('error', 'Unknown error')}")

                batch_results['results'].append(result)

            except Exception as e:
                # One bad filing must not abort the batch; record it and continue.
                batch_results['failed_filings'] += 1
                error_result = {
                    'success': False,
                    'filing_id': filing['id'],
                    'company_domain': filing['company_domain'],
                    'filing_type': filing['filing_type'],
                    'error': str(e)
                }
                batch_results['results'].append(error_result)
                print(f"   ❌ {filing['company_domain']} - {filing['filing_type']}: Exception: {e}")

        # Short delay between batches (skipped after the last one)
        if offset + batch_size < total_filings:
            print("   ⏸️ Brief pause between batches...")
            time.sleep(2)

    # Finalize results
    batch_results['processing_end_time'] = datetime.now()
    batch_results['total_processing_time'] = str(batch_results['processing_end_time'] - batch_results['processing_start_time'])
    batch_results['success_rate'] = batch_results['successful_filings'] / batch_results['total_filings'] if batch_results['total_filings'] > 0 else 0

    return batch_results


def _format_confidence(value) -> str:
    """Format an average confidence, tolerating the NULL that AVG() yields for no rows."""
    return 'n/a' if value is None else f"{value:.3f}"


def generate_extraction_report(batch_results: Dict[str, Any] = None) -> None:
    """Print a report of database, batch, extractor and storage statistics.

    Args:
        batch_results: Optional dict returned by run_batch_extraction(). Both
            the full summary and the short "no filings" result are accepted.
    """
    print("\n" + "="*60)
    print("📊 SEC ENTITY EXTRACTION REPORT")
    print("="*60)

    # Current database status
    conn = None
    try:
        conn = psycopg2.connect(**NEON_CONFIG)
        cursor = conn.cursor()

        cursor.execute('''
            SELECT 
                COUNT(*) as total_entities,
                COUNT(DISTINCT company_domain) as companies_processed,
                COUNT(DISTINCT sec_filing_ref) as filings_processed,
                COUNT(DISTINCT entity_category) as entity_types,
                AVG(confidence_score) as avg_confidence
            FROM system_uno.sec_entities_raw
        ''')

        db_stats = cursor.fetchone()

        cursor.execute('''
            SELECT entity_category, COUNT(*) as count, AVG(confidence_score) as avg_conf
            FROM system_uno.sec_entities_raw
            GROUP BY entity_category
            ORDER BY count DESC
            LIMIT 10
        ''')

        entity_breakdown = cursor.fetchall()
        cursor.close()

        print("\n📈 DATABASE STATISTICS:")
        print(f"   Total Entities Extracted: {db_stats[0]:,}")
        print(f"   Companies Processed: {db_stats[1]}")
        print(f"   SEC Filings Processed: {db_stats[2]}")
        print(f"   Entity Types Found: {db_stats[3]}")
        # AVG() is NULL on an empty table; formatting None with :.3f would raise.
        print(f"   Average Confidence: {_format_confidence(db_stats[4])}")

        print("\n🏷️ ENTITY TYPE BREAKDOWN:")
        for entity_type in entity_breakdown:
            print(f"   {entity_type[0]}: {entity_type[1]:,} entities (avg conf: {_format_confidence(entity_type[2])})")

    except Exception as e:
        print(f"   ❌ Could not retrieve database statistics: {e}")
    finally:
        # Release the connection even when a query or the formatting fails.
        if conn is not None:
            conn.close()

    # Batch processing results
    if batch_results:
        print("\n⚡ BATCH PROCESSING RESULTS:")
        if 'total_filings' in batch_results:
            print(f"   Total Filings Processed: {batch_results['total_filings']}")
            print(f"   Successful: {batch_results['successful_filings']} ({batch_results['success_rate']*100:.1f}%)")
            print(f"   Failed: {batch_results['failed_filings']}")
            print(f"   Total Entities Extracted: {batch_results['total_entities_extracted']:,}")
            print(f"   Processing Time: {batch_results['total_processing_time']}")
        else:
            # run_batch_extraction() returns only 'success'/'message' when no
            # filings were pending; show that instead of raising KeyError.
            print(f"   {batch_results.get('message', 'No batch statistics available')}")

    # Extractor statistics
    if extractor:
        extractor_stats = extractor.get_extraction_summary()
        print("\n🔍 EXTRACTION STATISTICS:")
        print(f"   API Calls Made: {extractor_stats['extraction_stats']['api_calls_made']:,}")
        print(f"   Chunks Processed: {extractor_stats['extraction_stats']['total_chunks_processed']:,}")
        print(f"   Entities Found: {extractor_stats['extraction_stats']['total_entities_found']:,}")
        print(f"   Errors: {extractor_stats['extraction_stats']['errors']}")

    # Storage statistics
    storage_stats = storage.get_storage_summary()
    print("\n💾 STORAGE STATISTICS:")
    print(f"   Entities Stored: {storage_stats['successful_inserts']:,}")
    print(f"   Duplicates Skipped: {storage_stats['duplicate_entities']:,}")
    print(f"   Failed Inserts: {storage_stats['failed_inserts']:,}")

    print("\n" + "="*60)
    print("✅ Report generated successfully!")
    print("="*60)


# Generate initial report
generate_extraction_report()

# Ready for batch processing
print("\n🎯 BATCH PROCESSING COMMANDS:")
print("   • To process 5 filings: batch_results = run_batch_extraction(batch_size=5, max_filings=5)")
print("   • To process 10 filings: batch_results = run_batch_extraction(batch_size=3, max_filings=10)")
print("   • To generate new report: generate_extraction_report(batch_results)")
print("\n✅ SEC Entity Extraction Engine fully operational!")

In [None]:
# Cell 5: Store Extraction Results in system_uno.sec_entities_raw

class SECEntityStorage:
    """Store extracted entities in system_uno.sec_entities_raw and track counters.

    Every operation opens its own short-lived connection from ``db_config``
    and closes it again, including when the operation fails.
    """

    # Entity dict keys, in the column order used by the INSERT statement.
    _COLUMNS = (
        'extraction_id',
        'company_domain',
        'entity_text',
        'entity_category',
        'confidence_score',
        'character_start',
        'character_end',
        'surrounding_text',
        'sec_filing_ref',
    )

    # Rows sent per INSERT statement.
    _PAGE_SIZE = 100

    def __init__(self, db_config):
        """Remember the connection keyword arguments and zero the counters."""
        self.db_config = db_config
        self.storage_stats = {
            'total_entities_stored': 0,
            'successful_inserts': 0,
            'failed_inserts': 0,
            'duplicate_entities': 0
        }

    def store_entities(self, entities: List[Dict[str, Any]]) -> bool:
        """Insert entities, skipping rows whose extraction_id already exists.

        Args:
            entities: Entity dicts providing every key listed in ``_COLUMNS``.

        Returns:
            True when the insert committed (or there was nothing to store),
            False when any step failed; in that case nothing is committed and
            ``failed_inserts`` grows by ``len(entities)``.
        """
        if not entities:
            print("⚠️ No entities to store")
            return True

        conn = None
        try:
            conn = psycopg2.connect(**self.db_config)
            cursor = conn.cursor()

            print(f"💾 Storing {len(entities)} entities in database...")

            # Prepare batch insert data
            insert_data = [
                tuple(entity[column] for column in self._COLUMNS)
                for entity in entities
            ]

            insert_query = '''
                INSERT INTO system_uno.sec_entities_raw 
                (extraction_id, company_domain, entity_text, entity_category, 
                 confidence_score, character_start, character_end, surrounding_text, sec_filing_ref)
                VALUES %s
                ON CONFLICT (extraction_id) DO NOTHING
            '''

            from psycopg2.extras import execute_values

            # execute_values() issues one statement per page and cursor.rowcount
            # only describes the most recent statement, so page manually and
            # add the counts up to get the true number of inserted rows.
            rows_inserted = 0
            for offset in range(0, len(insert_data), self._PAGE_SIZE):
                page = insert_data[offset:offset + self._PAGE_SIZE]
                execute_values(cursor, insert_query, page, page_size=len(page))
                rows_inserted += cursor.rowcount

            conn.commit()
            cursor.close()

            # Update statistics
            duplicates = len(entities) - rows_inserted
            self.storage_stats['total_entities_stored'] += len(entities)
            self.storage_stats['successful_inserts'] += rows_inserted
            self.storage_stats['duplicate_entities'] += duplicates

            print(f"   ✓ Stored {rows_inserted} entities ({duplicates} duplicates skipped)")
            return True

        except Exception as e:
            print(f"   ✗ Storage failed: {e}")
            self.storage_stats['failed_inserts'] += len(entities)
            return False
        finally:
            # Closing without a commit discards any partially executed pages.
            if conn is not None:
                conn.close()

    def get_storage_summary(self) -> Dict[str, Any]:
        """Return the live counters dict (not a copy)."""
        return self.storage_stats

    def verify_storage(self, sec_filing_ref: str) -> Dict[str, Any]:
        """Summarize the rows stored for one filing, grouped by category.

        Args:
            sec_filing_ref: Value of the ``sec_filing_ref`` column to look up.

        Returns:
            ``{'total_entities': int, 'entity_breakdown': [...]}`` where each
            breakdown item has 'category', 'count' and 'avg_confidence', or an
            empty dict when the lookup fails.
        """
        conn = None
        try:
            conn = psycopg2.connect(**self.db_config)
            cursor = conn.cursor()

            # Count entities by category for this filing
            cursor.execute('''
                SELECT entity_category, COUNT(*) as count, AVG(confidence_score) as avg_confidence
                FROM system_uno.sec_entities_raw
                WHERE sec_filing_ref = %s
                GROUP BY entity_category
                ORDER BY count DESC
            ''', (sec_filing_ref,))

            results = cursor.fetchall()
            cursor.close()

            return {
                'total_entities': sum(row[1] for row in results),
                'entity_breakdown': [
                    {
                        'category': row[0],
                        'count': row[1],
                        'avg_confidence': float(row[2])
                    } for row in results
                ]
            }

        except Exception as e:
            print(f"   ✗ Verification failed: {e}")
            return {}
        finally:
            if conn is not None:
                conn.close()

# Initialize storage handler
# Module-level instance used by process_single_filing() and the report cell.
# Construction only records NEON_CONFIG and zeroes the counters; connections
# are opened later, per operation.
storage = SECEntityStorage(NEON_CONFIG)
print("✓ SEC Entity Storage initialized!")

# Function to process a single filing end-to-end
def process_single_filing(filing_data: Dict[str, Any]) -> Dict[str, Any]:
    """Fetch, extract, store and verify the entities of one SEC filing.

    Args:
        filing_data: Filing record providing the keys 'id', 'company_domain',
            'filing_type', 'filing_date' and 'url'.

    Returns:
        A dict that always carries 'success', 'filing_id', 'company_domain'
        and 'filing_type'. On success it also holds 'content_length',
        'entities_extracted' and 'verification'; on failure it holds 'error'.
    """
    print(f"\n🚀 Processing filing: {filing_data['company_domain']} - {filing_data['filing_type']}")
    print(f"   📅 Date: {filing_data['filing_date']}")
    print(f"   🔗 URL: {filing_data['url']}")

    def failure(message: str) -> Dict[str, Any]:
        # Identify the filing on failures too, matching the error records that
        # the batch runner builds when this function raises.
        return {
            'success': False,
            'filing_id': filing_data['id'],
            'company_domain': filing_data['company_domain'],
            'filing_type': filing_data['filing_type'],
            'error': message
        }

    # Step 1: Make sure an extractor exists before downloading anything
    if not extractor:
        return failure('Entity extractor not available')

    # Step 2: Fetch content
    content = fetcher.fetch_filing_content(filing_data['url'])
    if not content:
        return failure('Failed to fetch content')

    # Step 3: Extract entities
    entities = extractor.extract_entities_from_filing(filing_data, content)
    if not entities:
        return failure('No entities extracted')

    # Step 4: Store entities
    if not storage.store_entities(entities):
        return failure('Failed to store entities')

    # Step 5: Verify storage
    sec_filing_ref = f"SEC_{filing_data['id']}"
    verification = storage.verify_storage(sec_filing_ref)

    result = {
        'success': True,
        'filing_id': filing_data['id'],
        'company_domain': filing_data['company_domain'],
        'filing_type': filing_data['filing_type'],
        'content_length': len(content),
        'entities_extracted': len(entities),
        'verification': verification
    }

    print(f"   ✅ Successfully processed {len(entities)} entities")
    return result

# Smoke-test the full pipeline when an earlier cell left a sample filing behind
if locals().get('test_filing'):
    print("\n🧪 Testing complete processing pipeline...")
    test_result = process_single_filing(test_filing)

    if not test_result['success']:
        print(f"   ❌ Test failed: {test_result.get('error', 'Unknown error')}")
    else:
        print(f"   📊 Results: {test_result['entities_extracted']} entities extracted")
        # An empty verification dict means the lookup failed; skip the breakdown.
        if test_result['verification']:
            for entity_type in test_result['verification']['entity_breakdown']:
                print(f"      {entity_type['category']}: {entity_type['count']} (avg confidence: {entity_type['avg_confidence']:.3f})")

print("\n✅ Entity Storage Pipeline ready!")

In [None]:
# Cell 4: Entity Extraction Pipeline Using Microsoft Biomed NLP

class SECEntityExtractor:
    """Extract biomedical entities from SEC filings using Azure Text Analytics."""

    def __init__(self, text_analytics_client):
        """Keep the client (may be None) and zero the running counters."""
        self.client = text_analytics_client
        self.extraction_stats = {
            'total_chunks_processed': 0,
            'total_entities_found': 0,
            'api_calls_made': 0,
            'errors': 0
        }

    def extract_entities_from_text(self, text: str, chunk_offset: int = 0) -> List[Dict[str, Any]]:
        """Extract entities from a single text chunk.

        Args:
            text: Chunk text sent to the service as one document.
            chunk_offset: Position of the chunk inside the full filing; added
                to the entity offsets so they refer to the whole document.

        Returns:
            One dict per entity with 'entity_text', 'entity_category',
            'confidence_score', 'character_start', 'character_end',
            'surrounding_text', 'subcategory' and 'assertion'. An empty list
            is returned when no client is configured or the call fails.
        """
        if not self.client:
            print("⚠️ Azure Text Analytics client not available")
            return []

        try:
            # Call Azure Text Analytics for Health
            documents = [text]

            # The SDK exposes healthcare analysis as a long-running operation:
            # begin_analyze_healthcare_entities() returns a poller whose
            # result() yields the per-document results. Clients that offer the
            # direct method name used previously are still supported.
            begin_analysis = getattr(self.client, 'begin_analyze_healthcare_entities', None)
            if begin_analysis is not None:
                poller = begin_analysis(documents)
                self.extraction_stats['api_calls_made'] += 1
                result = poller.result()
            else:
                result = self.client.analyze_healthcare_entities(documents)
                self.extraction_stats['api_calls_made'] += 1

            context_window = EXTRACTION_CONFIG['context_window']
            entities = []

            # Process the results
            for doc_result in result:
                if doc_result.is_error:
                    print(f"✗ API Error: {doc_result.error}")
                    self.extraction_stats['errors'] += 1
                    continue

                for entity in doc_result.entities:
                    # Extract surrounding context, clamped to the chunk bounds
                    start_pos = max(0, entity.offset - context_window)
                    end_pos = min(len(text), entity.offset + entity.length + context_window)

                    entities.append({
                        'entity_text': entity.text,
                        'entity_category': entity.category,
                        'confidence_score': entity.confidence_score,
                        'character_start': chunk_offset + entity.offset,
                        'character_end': chunk_offset + entity.offset + entity.length,
                        'surrounding_text': text[start_pos:end_pos],
                        'subcategory': getattr(entity, 'subcategory', None),
                        'assertion': getattr(entity, 'assertion', None)
                    })
                    self.extraction_stats['total_entities_found'] += 1

            self.extraction_stats['total_chunks_processed'] += 1

            # Rate limiting
            time.sleep(EXTRACTION_CONFIG['rate_limit_delay'])

            return entities

        except Exception as e:
            print(f"✗ Entity extraction failed: {e}")
            self.extraction_stats['errors'] += 1
            return []

    def extract_entities_from_filing(self, filing_data: Dict[str, Any], filing_content: str) -> List[Dict[str, Any]]:
        """Extract entities from a complete SEC filing, chunk by chunk.

        Args:
            filing_data: Filing record providing 'id', 'company_domain',
                'filing_type' and 'filing_date'.
            filing_content: Full text of the filing.

        Returns:
            Entity dicts enriched with filing metadata, a fresh
            'extraction_id', 'chunk_number' and 'total_chunks'. An entity
            reported again at the same document position by an overlapping
            chunk is kept only once.
        """
        print(f"🔍 Extracting entities from {filing_data['company_domain']} - {filing_data['filing_type']}")

        # Split content into chunks
        chunks = fetcher.chunk_text(filing_content, EXTRACTION_CONFIG['max_text_length'])
        print(f"   📄 Processing {len(chunks)} chunks...")

        all_entities = []
        # Consecutive chunks overlap, so the same mention can come back twice.
        # Every entity gets a new UUID, which means the database conflict check
        # cannot catch these repeats; filter them here by document position.
        seen_spans = set()

        for i, chunk in enumerate(chunks):
            print(f"   🔢 Chunk {i+1}/{len(chunks)} ({len(chunk['text'])} chars)")

            entities = self.extract_entities_from_text(chunk['text'], chunk['start'])

            new_entities = []
            for entity in entities:
                span_key = (
                    entity['character_start'],
                    entity['character_end'],
                    entity['entity_category'],
                    entity['entity_text'],
                )
                if span_key in seen_spans:
                    continue
                seen_spans.add(span_key)

                # Add filing metadata to each entity
                entity.update({
                    'filing_id': filing_data['id'],
                    'company_domain': filing_data['company_domain'],
                    'filing_type': filing_data['filing_type'],
                    'filing_date': filing_data['filing_date'],
                    'sec_filing_ref': f"SEC_{filing_data['id']}",
                    'extraction_id': str(uuid.uuid4()),
                    'chunk_number': i,
                    'total_chunks': len(chunks)
                })
                new_entities.append(entity)

            all_entities.extend(new_entities)

            print(f"      Found {len(entities)} entities")
            repeated = len(entities) - len(new_entities)
            if repeated:
                print(f"      Skipped {repeated} repeated in chunk overlap")

        print(f"   ✓ Total entities found: {len(all_entities)}")
        return all_entities

    def get_extraction_summary(self) -> Dict[str, Any]:
        """Return the running counters together with the extraction config."""
        return {
            'extraction_stats': self.extraction_stats,
            'supported_entity_types': EXTRACTION_CONFIG['supported_entities'],
            'config': EXTRACTION_CONFIG
        }

# Build the shared extractor, or leave it as None when Azure is not configured
if not text_analytics_client:
    print("❌ Cannot initialize entity extractor - Azure client not available")
    extractor = None
else:
    extractor = SECEntityExtractor(text_analytics_client)
    print("✓ SEC Entity Extractor initialized!")

    # Optional smoke test on filing text left behind by an earlier cell
    if locals().get('content'):
        print("\n🧪 Testing entity extraction...")
        # Slicing already returns the whole string when it is shorter
        sample_text = content[:1000]
        test_entities = extractor.extract_entities_from_text(sample_text)

        print(f"   📊 Found {len(test_entities)} entities in sample text")

        # Preview at most three entities
        for entity in test_entities[:3]:
            print(f"   🔍 {entity['entity_category']}: '{entity['entity_text']}' (confidence: {entity['confidence_score']:.3f})")

print("\n✅ Entity Extraction Pipeline ready!")

In [None]:
# Cell 3: SEC Filing URL Fetching and Text Extraction

class SECFilingFetcher:
    """Fetches and processes SEC filing content from EDGAR URLs."""

    def __init__(self):
        """Create an HTTP session carrying the identification headers."""
        self.session = requests.Session()
        # NOTE(review): the fixed Host header is sent with every request made
        # through this session; confirm all filing URLs live on www.sec.gov.
        self.session.headers.update({
            'User-Agent': 'SmartReach BizIntel Entity Extraction (contact@smartreach.com)',
            'Accept-Encoding': 'gzip, deflate',
            'Host': 'www.sec.gov'
        })

    def fetch_filing_content(self, url: str) -> Optional[str]:
        """Download a filing and return its visible text.

        Args:
            url: Address of the filing document.

        Returns:
            The page text with script/style content removed and whitespace
            collapsed, or None when the request or the parsing fails.
        """
        try:
            print(f"📥 Fetching: {url}")

            # Add delay to respect SEC rate limits
            time.sleep(0.1)

            response = self.session.get(url, timeout=30)
            response.raise_for_status()

            # Parse HTML content
            soup = BeautifulSoup(response.content, 'lxml')

            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.extract()

            # Extract text content
            text = soup.get_text()

            # Clean up whitespace: trim each line, split it on double spaces
            # and re-join the non-empty pieces with single spaces.
            lines = (line.strip() for line in text.splitlines())
            phrases = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = ' '.join(phrase for phrase in phrases if phrase)

            print(f"  ✓ Extracted {len(text):,} characters")
            return text

        except requests.RequestException as e:
            print(f"  ✗ Request failed: {e}")
            return None
        except Exception as e:
            print(f"  ✗ Processing failed: {e}")
            return None

    def get_filings_to_process(self, limit: int = 5) -> List[Dict[str, Any]]:
        """Return filings that have a URL but no extracted entities yet.

        Args:
            limit: Maximum number of filings to return, newest first.

        Returns:
            Dicts with 'id', 'company_domain', 'filing_type', 'url',
            'filing_date' and 'title'; an empty list when the query fails.
        """
        conn = None
        try:
            conn = psycopg2.connect(**NEON_CONFIG)
            cursor = conn.cursor()

            # Anti-join: keep filings with no entity row referencing them
            cursor.execute('''
                SELECT sf.id, sf.company_domain, sf.filing_type, sf.url, sf.filing_date, sf.title
                FROM raw_data.sec_filings sf
                LEFT JOIN system_uno.sec_entities_raw ser ON ser.sec_filing_ref = CONCAT('SEC_', sf.id)
                WHERE sf.url IS NOT NULL 
                AND ser.sec_filing_ref IS NULL
                ORDER BY sf.filing_date DESC
                LIMIT %s
            ''', (limit,))

            filings = cursor.fetchall()
            cursor.close()

            return [{
                'id': filing[0],
                'company_domain': filing[1],
                'filing_type': filing[2],
                'url': filing[3],
                'filing_date': filing[4],
                'title': filing[5]
            } for filing in filings]

        except Exception as e:
            print(f"✗ Database query failed: {e}")
            return []
        finally:
            # Release the connection even when the query fails.
            if conn is not None:
                conn.close()

    def chunk_text(self, text: str, max_length: int = 5120, overlap: int = 200) -> List[Dict[str, Any]]:
        """Split text into overlapping chunks of at most ``max_length`` characters.

        Args:
            text: Full document text.
            max_length: Maximum characters per chunk.
            overlap: Characters shared between consecutive chunks.

        Returns:
            Dicts with 'text', 'start' and 'end'. For multi-chunk results
            ``text[start:end]`` equals the chunk's 'text'. Text that already
            fits is returned unchanged as a single chunk.
        """
        text_length = len(text)
        if text_length <= max_length:
            return [{'text': text, 'start': 0, 'end': text_length}]

        chunks = []
        start = 0

        while start < text_length:
            end = min(start + max_length, text_length)

            # Try to break at sentence boundary, but only in the second half
            # of the window so chunks do not become too short.
            if end < text_length:
                last_period = text.rfind('.', start, end)
                if last_period > start + max_length // 2:
                    end = last_period + 1

            raw = text[start:end]
            stripped = raw.strip()
            if stripped:
                # Account for stripped leading whitespace so the reported
                # offsets point at the text that is actually returned.
                chunk_start = start + (len(raw) - len(raw.lstrip()))
                chunks.append({
                    'text': stripped,
                    'start': chunk_start,
                    'end': chunk_start + len(stripped)
                })

            # The final window reached the end of the text. Stepping back by
            # the overlap here would re-emit the tail one character at a time.
            if end >= text_length:
                break

            start = max(end - overlap, start + 1)

        return chunks

# Shared fetcher instance used by the later cells
fetcher = SECFilingFetcher()

# Try the fetcher on one pending filing; later cells reuse the
# test_filing / content names that this leaves behind
test_filings = fetcher.get_filings_to_process(limit=1)
if not test_filings:
    print("📭 No unprocessed filings found")
else:
    test_filing = test_filings[0]
    print(f"🧪 Testing with filing: {test_filing['company_domain']} - {test_filing['filing_type']}")
    print(f"   URL: {test_filing['url']}")

    content = fetcher.fetch_filing_content(test_filing['url'])
    if not content:
        print("   ❌ Failed to fetch content")
    else:
        chunks = fetcher.chunk_text(content)
        print(f"   📄 Content: {len(content):,} characters")
        print(f"   🔢 Chunks: {len(chunks)} pieces")
        print(f"   📝 First 200 chars: {content[:200]}...")

print("\n✅ SEC Filing Fetcher ready!")

In [None]:
# Cell 0: GitHub Setup and Auto-Logging

import os
import sys
import importlib
import importlib.util
import psycopg2

# Setup cell: clone or refresh the SmartReach repository, put its BizIntel
# folder on sys.path, define the database settings and try to enable the
# optional auto-logger.

# GitHub credentials - use Kaggle secrets for security
from kaggle_secrets import UserSecretsClient
# user_secrets is reused by the Azure setup cell to read its own keys.
user_secrets = UserSecretsClient()
github_token = user_secrets.get_secret("GITHUB_TOKEN")
# NOTE(review): the token is embedded in the clone URL, so it is part of the
# command line passed to git below — confirm it cannot end up in cell output.
repo_url = f"https://{github_token}@github.com/amiralpert/SmartReach.git"
local_path = "/kaggle/working/SmartReach"

print("📦 Setting up GitHub repository...")

# Clone or update repo with force pull
if os.path.exists(local_path):
    print(f"📂 Repository exists at {local_path}")
    print("🔄 Force updating from GitHub...")
    # The hard reset discards any local changes in the checkout.
    !cd {local_path} && git fetch origin
    !cd {local_path} && git reset --hard origin/main
    # NOTE(review): after fetch + hard reset this pull looks redundant — confirm.
    !cd {local_path} && git pull origin main
    print("✅ Repository updated")
    
    # Show current commit
    !cd {local_path} && echo "Current commit:" && git log --oneline -1
else:
    print(f"📥 Cloning repository to {local_path}")
    !git clone {repo_url} {local_path}
    print("✅ Repository cloned")

# Clear any cached modules from previous runs
# The match is a plain substring test on the lower-cased module name, so any
# loaded module containing 'sec_' or 'entity' is dropped, not only repo code.
modules_to_clear = [key for key in sys.modules.keys() if 'sec_' in key.lower() or 'entity' in key.lower()]
for mod in modules_to_clear:
    del sys.modules[mod]
    print(f"  Cleared cached module: {mod}")

# Add to Python path for regular imports
# Remove an existing entry first so the path ends up at the front exactly once.
if f'{local_path}/BizIntel' in sys.path:
    sys.path.remove(f'{local_path}/BizIntel')
sys.path.insert(0, f'{local_path}/BizIntel')

print("✓ Python path configured for SEC entity extraction!")

# Set up database configuration
# NOTE(review): the database password is stored in plain text here, unlike
# the GitHub token above which is read from Kaggle secrets. Consider moving
# it to a secret and rotating the exposed credential.
NEON_CONFIG = {
    'host': 'ep-royal-star-ad1gn0d4-pooler.c-2.us-east-1.aws.neon.tech',
    'database': 'BizIntelSmartReach',
    'user': 'neondb_owner',
    'password': 'npg_aTFt6Pug3Kpy',
    'sslmode': 'require'
}

# Try to set up logger, but don't fail if there are issues
# Every failure path below leaves logger set to None.
try:
    # Create separate connection for logger
    # This connection is handed to the logger and is not closed in this cell.
    logger_conn = psycopg2.connect(**NEON_CONFIG)
    print("✓ Database connected for logger")

    # Import auto-logger using direct file import
    logger_module_path = f"{local_path}/BizIntel/Scripts/KaggleLogger/auto_logger.py"
    if os.path.exists(logger_module_path):
        spec = importlib.util.spec_from_file_location("auto_logger", logger_module_path)
        auto_logger_module = importlib.util.module_from_spec(spec)
        # Register the module before executing it so it is importable by name.
        sys.modules["auto_logger"] = auto_logger_module
        spec.loader.exec_module(auto_logger_module)

        setup_auto_logging = auto_logger_module.setup_auto_logging
        logger = setup_auto_logging(logger_conn, "SEC_EntityExtraction")
        print("✓ Auto-logging enabled!")
    else:
        print(f"✗ Auto-logger not found at {logger_module_path}")
        logger = None
except Exception as e:
    print(f"⚠️ Logger setup failed: {e}")
    print("  Continuing without auto-logging...")
    logger = None

print("\n✅ Setup complete. SEC Entity Extraction Engine ready to use.")

In [None]:
# Cell 1: Neon Database Configuration
# Keyword arguments passed to psycopg2.connect() throughout this notebook.
# The same dict is also assigned in the GitHub setup cell.
# NOTE(review): the password is stored in plain text; consider reading it
# from Kaggle secrets and rotating the exposed credential.
NEON_CONFIG = {
    'host': 'ep-royal-star-ad1gn0d4-pooler.c-2.us-east-1.aws.neon.tech',
    'database': 'BizIntelSmartReach',
    'user': 'neondb_owner',
    'password': 'npg_aTFt6Pug3Kpy',
    'sslmode': 'require'
}

# Test database connection
def test_database_connection():
    """Connect to the database and print row counts for the SEC tables.

    Returns:
        True when the connection and both queries succeed, False otherwise.
        Errors are printed rather than raised.
    """
    conn = None
    try:
        conn = psycopg2.connect(**NEON_CONFIG)
        cursor = conn.cursor()

        # Check SEC-related tables
        cursor.execute('''
            SELECT 
                (SELECT COUNT(*) FROM raw_data.sec_filings) as sec_filings,
                (SELECT COUNT(*) FROM core.companies) as companies,
                (SELECT COUNT(*) FROM system_uno.sec_entities_raw) as sec_entities_extracted,
                (SELECT COUNT(DISTINCT company_domain) FROM raw_data.sec_filings) as companies_with_filings,
                (SELECT COUNT(*) FROM raw_data.sec_filings WHERE url IS NOT NULL) as filings_with_urls
        ''')

        counts = cursor.fetchone()
        print("✓ Database connected successfully!")
        print(f"  SEC Filings: {counts[0]}")
        print(f"  Companies: {counts[1]}")
        print(f"  Extracted SEC Entities: {counts[2]}")
        print(f"  Companies with SEC Filings: {counts[3]}")
        print(f"  SEC Filings with URLs: {counts[4]}")

        # Show sample SEC filing data
        cursor.execute('''
            SELECT company_domain, filing_type, COUNT(*) as count
            FROM raw_data.sec_filings 
            GROUP BY company_domain, filing_type 
            ORDER BY company_domain, count DESC
            LIMIT 10
        ''')

        filing_stats = cursor.fetchall()
        print("\n📊 SEC Filing Distribution:")
        for stat in filing_stats:
            print(f"  {stat[0]}: {stat[1]} ({stat[2]} filings)")

        cursor.close()
        return True

    except Exception as e:
        print(f"✗ Database connection failed: {e}")
        return False
    finally:
        # Close the connection on the failure paths as well, e.g. when one of
        # the queried tables does not exist.
        if conn is not None:
            conn.close()

# Test connection
# Runs when the cell executes; the True/False result is not stored.
test_database_connection()

In [None]:
# Cell 2: Microsoft Biomed NLP Setup and Dependencies

# Install required packages
!pip install azure-ai-textanalytics==5.3.0 requests beautifulsoup4 lxml uuid

import requests
from bs4 import BeautifulSoup
import uuid
import json
import time
from datetime import datetime
from typing import List, Dict, Any, Optional
import re

# Azure Text Analytics for Health (Microsoft Biomed NLP)
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

print("📦 Packages installed successfully!")

# Azure credentials - use Kaggle secrets for security
# user_secrets is not created in this cell; it comes from the GitHub setup
# cell. Any failure here, including a missing user_secrets, leaves
# text_analytics_client set to None instead of raising.
try:
    azure_key = user_secrets.get_secret("AZURE_TEXT_ANALYTICS_KEY")
    azure_endpoint = user_secrets.get_secret("AZURE_TEXT_ANALYTICS_ENDPOINT")
    
    # Initialize Text Analytics client
    credential = AzureKeyCredential(azure_key)
    text_analytics_client = TextAnalyticsClient(
        endpoint=azure_endpoint,
        credential=credential
    )
    
    print("✓ Azure Text Analytics client initialized!")
    
except Exception as e:
    print(f"⚠️ Azure setup failed: {e}")
    print("  Please ensure AZURE_TEXT_ANALYTICS_KEY and AZURE_TEXT_ANALYTICS_ENDPOINT are set in Kaggle secrets")
    text_analytics_client = None

# Configuration for entity extraction
# Read by SECEntityExtractor: max_text_length sets the chunk size,
# context_window the surrounding text kept per entity, and rate_limit_delay
# the pause after each processed chunk.
# NOTE(review): in the code visible in this file, batch_size is only printed
# below and supported_entities is only echoed by get_extraction_summary().
EXTRACTION_CONFIG = {
    'max_text_length': 5120,  # Azure Text Analytics limit
    'context_window': 500,    # Characters before/after entity for context
    'batch_size': 10,         # Process 10 documents at a time
    'rate_limit_delay': 1.0,  # Seconds between API calls
    'supported_entities': [
        'MEDICATION', 'MEDICAL_CONDITION', 'TREATMENT', 
        'EXAMINATION', 'BODY_STRUCTURE', 'HEALTHCARE_PROFESSION',
        'DOSAGE', 'ROUTE_OR_MODE', 'FREQUENCY'
    ]
}

print(f"📋 Configuration loaded:")
print(f"  Max text length: {EXTRACTION_CONFIG['max_text_length']} characters")
print(f"  Context window: {EXTRACTION_CONFIG['context_window']} characters")
print(f"  Batch size: {EXTRACTION_CONFIG['batch_size']} documents")
print(f"  Supported entities: {len(EXTRACTION_CONFIG['supported_entities'])} types")

print("\n✅ Microsoft Biomed NLP setup complete!")