In [None]:
import requests
import time
import random
import json
import csv
import logging
from urllib.parse import urlparse
from dataclasses import dataclass, asdict
from typing import Optional, List, Dict, Any
from datetime import datetime
import hashlib
import re

# Configure logging once for the whole process: INFO and above, each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the collectors and main() in this file.
logger = logging.getLogger(__name__)

@dataclass
class RequestConfig:
    """Tunable knobs for pacing, retrying and timing out HTTP requests.

    All delays and the timeout are in seconds. Field order is part of the
    positional constructor signature and must not change.
    """

    # Lower / upper bound of the randomly drawn base delay between requests.
    min_delay: float = 1.0
    max_delay: float = 3.0

    # Number of attempts made for a single URL before giving up.
    max_retries: int = 3

    # Per-request timeout handed to the HTTP client.
    timeout: int = 30

    # Growth factor for backoff after a rate-limit response.
    backoff_base: float = 2.0

    # Half-width of the uniform jitter added to the base delay.
    jitter: float = 0.3

class AdvancedDataCollector:
    """
    HTTP GET helper with per-domain throttling, retries and header rotation.

    One instance owns a ``requests.Session`` and keeps a small tracker per
    domain (scheduled time of the last request, consecutive errors, total
    requests, and a backoff multiplier that stretches the delay after
    rate-limit responses).
    """

    def __init__(self, config: "Optional[RequestConfig]" = None):
        """Create the session and pick an initial header set.

        Args:
            config: Delay / retry settings. A default ``RequestConfig()`` is
                used when omitted.
        """
        self.config = config or RequestConfig()
        self.session = requests.Session()
        self.domain_trackers = {}
        self.request_count = 0
        # Short run identifier derived from the start time; not a security token.
        self.session_id = hashlib.md5(str(time.time()).encode()).hexdigest()[:8]

        # Pool of User-Agent strings that _rotate_headers() picks from.
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
        ]

        self._rotate_headers()

    def _rotate_headers(self):
        """Install a randomly chosen User-Agent and the default header set on the session."""
        self.session.headers.update({
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            # "br" is deliberately not advertised: requests can only decode
            # brotli when an optional brotli package is installed, and an
            # undecoded body would silently corrupt response.text.
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0'
        })

    def _reset_session(self):
        """Replace the HTTP session, closing the old one so its connections are released."""
        self.session.close()
        self.session = requests.Session()
        self._rotate_headers()

    def _get_domain_tracker(self, domain: str) -> Dict:
        """Return the tracker dict for ``domain``, creating it on first use."""
        if domain not in self.domain_trackers:
            self.domain_trackers[domain] = {
                'last_request': 0,
                'consecutive_errors': 0,
                'total_requests': 0,
                'backoff_multiplier': 1.0
            }
        return self.domain_trackers[domain]

    def _calculate_intelligent_delay(self, domain: str) -> float:
        """Return how many seconds to wait before the next request to ``domain``.

        A minimum gap is drawn at random (0.8-1.2s for sec.gov hosts,
        otherwise ``min_delay``..``max_delay``), scaled by the domain's
        backoff multiplier, jittered, and floored at 0.1s. Only the part of
        that gap that has not already elapsed since the previous request is
        returned, so a domain that has been idle long enough (or was never
        contacted) yields ``0.0``.

        The tracker's ``last_request`` is set to the time the caller is
        expected to send the request (now + returned wait).
        """
        tracker = self._get_domain_tracker(domain)

        if 'sec.gov' in domain:
            base_delay = random.uniform(0.8, 1.2)
        else:
            base_delay = random.uniform(self.config.min_delay, self.config.max_delay)

        base_delay *= tracker['backoff_multiplier']
        jitter = random.uniform(-self.config.jitter, self.config.jitter)
        min_gap = max(0.1, base_delay + jitter)

        now = time.time()
        elapsed = now - tracker['last_request']
        # Wait only for the remainder of the gap; never a negative amount.
        wait = max(0.0, min_gap - elapsed)

        tracker['last_request'] = now + wait
        return wait

    def _handle_rate_limit(self, domain: str, attempt: int):
        """React to an HTTP 429: grow the domain's backoff, sleep, rotate headers.

        Args:
            domain: Host that returned the rate-limit response.
            attempt: Zero-based attempt index; the sleep is
                ``backoff_base ** attempt`` plus 1-5 random seconds.
        """
        tracker = self._get_domain_tracker(domain)
        tracker['consecutive_errors'] += 1
        tracker['backoff_multiplier'] *= self.config.backoff_base

        backoff_time = (self.config.backoff_base ** attempt) + random.uniform(1, 5)
        logger.warning(f"Rate limit hit for {domain}. Backing off for {backoff_time:.1f}s")
        time.sleep(backoff_time)
        self._rotate_headers()

    def _handle_blocked_request(self, domain: str, attempt: int):
        """React to a 403/503 with escalating cool-off periods.

        Args:
            domain: Host that rejected the request.
            attempt: Zero-based attempt index as supplied by ``make_request``.
                First block: 10-20s and new headers. Second block: 30-60s and
                a brand-new session. Later blocks: 120-300s.
        """
        tracker = self._get_domain_tracker(domain)
        tracker['consecutive_errors'] += 1

        # ``attempt`` starts at 0, so the tiers are keyed on 0 and 1.
        if attempt <= 0:
            wait_time = random.uniform(10, 20)
            self._rotate_headers()
        elif attempt == 1:
            wait_time = random.uniform(30, 60)
            self._reset_session()
        else:
            wait_time = random.uniform(120, 300)

        logger.warning(f"Request blocked for {domain}. Cooling off for {wait_time:.1f}s")
        time.sleep(wait_time)

    def get_session_stats(self) -> Dict[str, Any]:
        """Return a snapshot of request counters for this collector.

        Returns:
            A dict with ``session_id``, ``total_requests`` (responses received
            so far) and ``domains`` (a copy of each per-domain tracker).
        """
        return {
            'session_id': self.session_id,
            'total_requests': self.request_count,
            'domains': {domain: dict(tracker) for domain, tracker in self.domain_trackers.items()},
        }

    def make_request(self, url: str, custom_headers: Optional[Dict] = None) -> "Optional[requests.Response]":
        """GET ``url`` with throttling and up to ``config.max_retries`` attempts.

        Args:
            url: Absolute URL to fetch.
            custom_headers: Extra headers for this request; they override the
                session headers of the same name.

        Returns:
            The response when a 200 is received, otherwise ``None`` once all
            attempts are used up (or a non-retryable status is seen on the
            last attempt).
        """
        domain = urlparse(url).netloc
        tracker = self._get_domain_tracker(domain)
        last_attempt = self.config.max_retries - 1

        for attempt in range(self.config.max_retries):
            try:
                delay = self._calculate_intelligent_delay(domain)
                if delay > 0:
                    time.sleep(delay)

                logger.info(f"Request {self.request_count + 1} to {domain} (attempt {attempt + 1})")
                # requests merges per-request headers over the session headers,
                # so the custom ones can be passed straight through.
                response = self.session.get(
                    url,
                    headers=custom_headers,
                    timeout=self.config.timeout,
                    allow_redirects=True
                )

                self.request_count += 1
                tracker['total_requests'] += 1

                if response.status_code == 200:
                    tracker['consecutive_errors'] = 0
                    # Relax the backoff gradually after each success.
                    tracker['backoff_multiplier'] = max(1.0, tracker['backoff_multiplier'] * 0.9)
                    return response

                elif response.status_code == 429:
                    self._handle_rate_limit(domain, attempt)
                    continue

                elif response.status_code in [403, 503]:
                    self._handle_blocked_request(domain, attempt)
                    continue

                elif 500 <= response.status_code < 600:
                    logger.warning(f"Server error {response.status_code} for {url}")
                    time.sleep((attempt + 1) * 5)
                    continue

                else:
                    logger.warning(f"HTTP {response.status_code} for {url}")
                    if attempt < last_attempt:
                        time.sleep((attempt + 1) * 3)
                        continue
                    else:
                        return None

            except requests.exceptions.Timeout:
                logger.warning(f"Timeout on attempt {attempt + 1} for {url}")
                if attempt < last_attempt:
                    time.sleep((attempt + 1) * 5)
                    continue

            except requests.exceptions.ConnectionError:
                logger.warning(f"Connection error on attempt {attempt + 1} for {url}")
                if attempt < last_attempt:
                    time.sleep((attempt + 1) * 10)
                    # Start from a clean connection pool for the next try.
                    self._reset_session()
                    continue

            except requests.exceptions.RequestException as e:
                logger.error(f"Request exception on attempt {attempt + 1}: {e}")
                if attempt < last_attempt:
                    time.sleep((attempt + 1) * 8)
                    continue

        logger.error(f"All {self.config.max_retries} attempts failed for {url}")
        return None

class SECDataProcessor:
    """Scan SEC filing text for cybersecurity and data-breach related language."""

    # One regex per finding category. The position of each pattern is
    # significant: _get_match_type() maps the index to a category name.
    BREACH_PATTERNS = [
        # Data breach mentions
        r'(data breach|cybersecurity incident|security incident|unauthorized access|data security)',
        # Financial impact patterns
        r'(\$[\d,]+(?:\.\d{2})?)\s*(?:million|billion|thousand)?\s*(?:fine|penalty|settlement|charge|loss)',
        # Regulatory mentions
        r'(FTC|SEC|GDPR|CCPA|regulatory|compliance)\s*(?:action|fine|investigation)',
        # Incident descriptions
        r'(compromised|exposed|leaked|breached)\s*(?:data|information|records)',
        # Customer impact
        r'(customer data|personal information|PII|records)\s*(?:exposed|compromised)'
    ]

    def __init__(self):
        """Pre-compile every pattern, case-insensitively."""
        compiled = []
        for raw_pattern in self.BREACH_PATTERNS:
            compiled.append(re.compile(raw_pattern, re.IGNORECASE))
        self.compiled_patterns = compiled

    def extract_breach_mentions(self, text: str, company: str, filing_date: str) -> List[Dict[str, Any]]:
        """Collect every pattern hit in ``text`` together with surrounding context.

        Whitespace runs are collapsed to single spaces before matching. Each
        hit becomes a dict carrying the company, filing date, category,
        matched text, up to 200 characters of context on either side, a
        relevance score and the extraction time. Hits are grouped by pattern,
        in pattern order. Empty or missing text yields an empty list.
        """
        results: List[Dict[str, Any]] = []
        if not text:
            return results

        normalized = re.sub(r'\s+', ' ', text)
        text_length = len(normalized)

        for index, regex in enumerate(self.compiled_patterns):
            category = self._get_match_type(index)
            for hit in regex.finditer(normalized):
                snippet = hit.group(0)
                window_start = hit.start() - 200
                if window_start < 0:
                    window_start = 0
                window_end = hit.end() + 200
                if window_end > text_length:
                    window_end = text_length
                surrounding = normalized[window_start:window_end]

                results.append({
                    'company': company,
                    'filing_date': filing_date,
                    'match_type': category,
                    'matched_text': snippet,
                    'context': surrounding,
                    'relevance_score': self._calculate_relevance(snippet, surrounding),
                    'extraction_timestamp': datetime.now().isoformat()
                })

        return results

    def _get_match_type(self, pattern_index: int) -> str:
        """Translate a pattern index into its category label."""
        labels = (
            'breach_incident',
            'financial_impact',
            'regulatory_action',
            'incident_description',
            'customer_impact',
        )
        if pattern_index >= len(labels):
            return 'general_mention'
        return labels[pattern_index]

    def _calculate_relevance(self, matched_text: str, context: str) -> float:
        """Score a hit between 0.1 and 1.0 from simple textual signals."""
        score = 0.0
        matched_lower = matched_text.lower()
        context_lower = context.lower()

        # A dollar amount in the match itself.
        if '$' in matched_text:
            score += 0.3

        # Explicit breach vocabulary in the match.
        for term in ('breach', 'compromised', 'unauthorized', 'exposed'):
            if term in matched_lower:
                score += 0.4
                break

        # Customer / data wording anywhere in the context window.
        if 'customer' in context_lower or 'data' in context_lower:
            score += 0.2

        # Very short matches are less trustworthy.
        if len(matched_text.strip()) < 10:
            score -= 0.1

        # Clamp to the [0.1, 1.0] range.
        if score < 0.1:
            return 0.1
        if score > 1.0:
            return 1.0
        return score

class SECDataCollector:
    """Fetch recent SEC filings for a fixed company list and record breach-related hits in a CSV."""

    # Company name -> 10-digit, zero-padded CIK.
    # NOTE(review): these CIKs are not validated anywhere in this file; confirm
    # them against EDGAR company search before relying on the output. Foreign
    # issuers typically report on Form 6-K rather than 8-K, so some entries may
    # legitimately yield no 8-K filings - verify.
    COMPANY_DATABASE = {
        # Technology Companies
        'Apple Inc': '0000320193',
        'Microsoft Corp': '0000789019',
        'Google (Alphabet Inc)': '0001652044',
        'Amazon.com Inc': '0001018724',
        'Meta Platforms Inc': '0001326801',
        'Tesla Inc': '0001318605',
        'NVIDIA Corp': '0001045810',
        'Intel Corp': '0000050863',
        'Adobe Inc': '0000796343',
        'Salesforce Inc': '0001108524',
        'Cisco Systems Inc': '0000858877',
        'Oracle Corp': '0001341439',
        'IBM Corp': '0000051143',
        'Qualcomm Inc': '0000804328',
        'Broadcom Inc': '0001060492',

        # Automotive Technology Companies
        'Ford Motor Co': '0000037996',
        'General Motors Co': '0001467858',
        'Toyota Motor Corp': '0001094517',
        'Honda Motor Co Ltd': '0001014156',
        'BMW AG': '0001396269',
        'Mercedes-Benz Group AG': '0001562634',
        'Volkswagen AG': '0001446864',
        'Rivian Automotive Inc': '0001874178',
        'Lucid Group Inc': '0001428762',
        'NIO Inc': '0001736544',
        'XPeng Inc': '0001789020',
        'Li Auto Inc': '0001791706',

        # Automotive Tech Suppliers
        'Aptiv PLC': '0001521332',
        'Magna International Inc': '0001236572',
        'Continental AG': '0001444686',
        'DENSO Corp': '0001446597',
        'Mobileye Global Inc': '0001910138'
    }

    def __init__(self):
        """Wire up the HTTP collector, the text processor and a timestamped CSV name."""
        self.config = RequestConfig(min_delay=0.8, max_delay=1.5, max_retries=3)
        self.collector = AdvancedDataCollector(self.config)
        self.processor = SECDataProcessor()
        self.csv_filename = f"sec_cybersecurity_research_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"

    def get_sec_headers(self) -> Dict[str, str]:
        """Return the headers sent to the SEC JSON API (declared User-Agent, JSON Accept)."""
        return {
            'User-Agent': 'Research Organization research@organization.com',
            'Accept': 'application/json'
        }

    def get_company_filings(self, cik: str) -> Optional[Dict]:
        """Download the submissions index for one company.

        Args:
            cik: Zero-padded CIK as stored in ``COMPANY_DATABASE``.

        Returns:
            The decoded JSON document, or ``None`` when the request failed or
            the body was not valid JSON.
        """
        url = f"https://data.sec.gov/submissions/CIK{cik}.json"
        response = self.collector.make_request(url, self.get_sec_headers())

        if response is None or response.status_code != 200:
            return None
        try:
            return response.json()
        except ValueError as e:
            # A 200 with a non-JSON body (e.g. an HTML error page) is treated as "no data".
            logger.warning(f"Invalid JSON in submissions response for CIK {cik}: {e}")
            return None

    def get_filing_content(self, accession_number: str, cik: str) -> Optional[str]:
        """Download the full-text submission file of one filing.

        Args:
            accession_number: Dashed accession number, e.g. ``0000320193-24-000001``.
            cik: CIK of the filer; leading zeros are dropped for the archive path.

        Returns:
            The response body as text, or ``None`` when the request failed.
        """
        acc_no_clean = accession_number.replace('-', '')
        # EDGAR archive paths conventionally use the CIK without zero padding.
        cik_path = cik.lstrip('0') or '0'
        url = f"https://www.sec.gov/Archives/edgar/data/{cik_path}/{acc_no_clean}/{accession_number}.txt"

        # Identify ourselves on sec.gov as well, instead of falling back to the
        # collector's rotated browser User-Agent.
        headers = {'User-Agent': self.get_sec_headers()['User-Agent']}
        response = self.collector.make_request(url, headers)
        if response is not None and response.status_code == 200:
            return response.text
        return None

    def extract_recent_filings(self, filings_data: Dict, filing_type: str = '8-K', limit: int = 5) -> List[Dict]:
        """Pick the first ``limit`` filings of ``filing_type`` from a submissions document.

        The ``filings.recent`` section is read as parallel lists; entries are
        taken in the order the document lists them (presumably newest first -
        verify against the API). Rows for which the form, accession number or
        filing date is missing are ignored rather than aborting the whole
        extraction; a missing primary document becomes ``''``.

        Args:
            filings_data: Decoded submissions JSON.
            filing_type: Form type to keep.
            limit: Maximum number of filings to return.

        Returns:
            A list of dicts with ``accession_number``, ``filing_date``,
            ``form_type`` and ``primary_document``.
        """
        if not filings_data:
            return []

        recent = (filings_data.get('filings') or {}).get('recent') or {}
        forms = recent.get('form') or []
        accession_numbers = recent.get('accessionNumber') or []
        filing_dates = recent.get('filingDate') or []
        primary_documents = recent.get('primaryDocument') or []

        selected: List[Dict] = []
        # zip() stops at the shortest list, so ragged input cannot raise.
        rows = zip(forms, accession_numbers, filing_dates)
        for i, (form, accession_number, filing_date) in enumerate(rows):
            if len(selected) >= limit:
                break
            if form != filing_type:
                continue
            selected.append({
                'accession_number': accession_number,
                'filing_date': filing_date,
                'form_type': form,
                'primary_document': primary_documents[i] if i < len(primary_documents) else ''
            })
        return selected

    def initialize_csv(self) -> None:
        """Create (or truncate) the output CSV and write its header row."""
        with open(self.csv_filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([
                'session_id',
                'company_name',
                'cik',
                'filing_date',
                'form_type',
                'accession_number',
                'match_type',
                'matched_text',
                'context',
                'relevance_score',
                'extraction_timestamp',
                'research_timestamp'
            ])

    def save_finding_to_csv(self, finding: Dict[str, Any], cik: str, form_type: str, accession_number: str) -> None:
        """Append one finding to the output CSV.

        The column order must match the header written by ``initialize_csv``.
        The file is reopened per row, so rows already written survive an
        interrupted run.
        """
        with open(self.csv_filename, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([
                self.collector.session_id,
                finding['company'],
                cik,
                finding['filing_date'],
                form_type,
                accession_number,
                finding['match_type'],
                finding['matched_text'],
                finding['context'],
                finding['relevance_score'],
                finding['extraction_timestamp'],
                datetime.now().isoformat()
            ])

    def _session_stats(self) -> Dict[str, Any]:
        """Summarise the HTTP collector's counters for the final log line.

        Uses the collector's own ``get_session_stats()`` when it provides one
        and otherwise derives the summary from its public attributes.
        """
        getter = getattr(self.collector, 'get_session_stats', None)
        if callable(getter):
            return getter()
        trackers = getattr(self.collector, 'domain_trackers', {})
        return {
            'session_id': self.collector.session_id,
            'total_requests': getattr(self.collector, 'request_count', 0),
            'domains': {domain: dict(tracker) for domain, tracker in trackers.items()},
        }

    def conduct_research(self) -> int:
        """Process every company in ``COMPANY_DATABASE`` and write findings to the CSV.

        For each company the submissions index is fetched, the most recent
        8-K filings are downloaded and scanned, and every finding is appended
        to the CSV. A failure for one company is logged and does not stop the
        run.

        Returns:
            Total number of findings written.
        """
        logger.info(f"Starting SEC cybersecurity research session: {self.collector.session_id}")
        logger.info(f"Output file: {self.csv_filename}")

        self.initialize_csv()
        total_findings = 0
        companies_processed = 0
        company_total = len(self.COMPANY_DATABASE)

        for position, (company_name, cik) in enumerate(self.COMPANY_DATABASE.items(), start=1):
            try:
                logger.info(f"Processing {company_name} (CIK: {cik})")

                # Get company filings
                filings_data = self.get_company_filings(cik)
                if not filings_data:
                    logger.warning(f"No filings data for {company_name}")
                    continue

                # Extract recent 8-K filings (current reports)
                recent_filings = self.extract_recent_filings(filings_data, '8-K')
                logger.info(f"Found {len(recent_filings)} recent 8-K filings for {company_name}")

                company_findings = 0

                for filing in recent_filings:
                    content = self.get_filing_content(filing['accession_number'], cik)
                    if not content:
                        continue

                    findings = self.processor.extract_breach_mentions(
                        content, company_name, filing['filing_date']
                    )

                    for finding in findings:
                        self.save_finding_to_csv(
                            finding, cik, filing['form_type'], filing['accession_number']
                        )
                    company_findings += len(findings)
                    total_findings += len(findings)

                    # Brief pause between filings
                    time.sleep(random.uniform(0.5, 1.5))

                logger.info(f"Completed {company_name}: {company_findings} findings")
                companies_processed += 1

                # Pause before the next company. Keyed on the position in the
                # list (not the success counter) so the last company never
                # triggers a pointless sleep.
                if position < company_total:
                    pause = random.uniform(2, 5)
                    logger.info(f"Pausing for {pause:.1f}s before next company")
                    time.sleep(pause)

            except Exception as e:
                # Per-company boundary: record the traceback and move on.
                logger.exception(f"Error processing {company_name}: {e}")
                continue

        # Final summary
        stats = self._session_stats()
        logger.info(f"Research complete. Processed {companies_processed} companies, found {total_findings} total findings")
        logger.info(f"Session stats: {stats}")
        logger.info(f"Data saved to: {self.csv_filename}")

        return total_findings

def main():
    """Run one research session and print a short summary.

    Ctrl-C is reported as an interruption. Any other failure is logged
    together with its traceback instead of propagating, so the caller never
    sees an exception from this function.
    """
    try:
        collector = SECDataCollector()
        findings_count = collector.conduct_research()
    except KeyboardInterrupt:
        logger.info("Research interrupted by user")
    except Exception as e:
        # Top-level boundary: keep the traceback so the failure can be diagnosed.
        logger.exception(f"Research failed: {e}")
    else:
        print("\n=== RESEARCH COMPLETE ===")
        print(f"Total findings: {findings_count}")
        print(f"Output file: {collector.csv_filename}")
        print(f"Companies analyzed: {len(collector.COMPANY_DATABASE)}")

# Start a research session only when this module is the entry point,
# not when it is imported.
if __name__ == "__main__":
    main()

2025-10-30 10:26:03,391 - INFO - Starting SEC cybersecurity research session: 1d5de285
2025-10-30 10:26:03,395 - INFO - Output file: sec_cybersecurity_research_20251030_102603.csv
2025-10-30 10:26:03,402 - INFO - Processing Apple Inc (CIK: 0000320193)
2025-10-30 10:26:04,119 - INFO - Request 1 to data.sec.gov (attempt 1)
2025-10-30 10:26:04,651 - INFO - Found 5 recent 8-K filings for Apple Inc
2025-10-30 10:26:06,003 - INFO - Request 2 to www.sec.gov (attempt 1)
