<a href="https://colab.research.google.com/github/atharvavyas1/Finance-N8N-project/blob/dev-text-extraction/SEC_EDGAR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
"""
SEC EDGAR Insider Trading Tracker - Official API
==========================================================================

A complete implementation for tracking insider trading using the free SEC EDGAR API.
Includes Form 4 parsing, rate limiting, and data analysis tools.

Requirements:
pip install requests
"""

import requests
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional
from datetime import datetime, timedelta
import time
import json
from functools import wraps

# ============================================================================
# CONFIGURATION
# ============================================================================

# REQUIRED: Replace with your information
# Sent as the User-Agent header on every request this module makes.
USER_AGENT = "MyCompany myemail@company.com"  # CHANGE THIS!

# Base URL of the JSON submissions endpoint (used by get_company_submissions)
SEC_BASE_URL = "https://data.sec.gov"
# Root of the filing archive tree that fetch_form4_xml downloads documents from
EDGAR_ARCHIVES_BASE = "https://www.sec.gov/Archives/edgar/data"

# Headers for data.sec.gov requests only: the Host entry is pinned to that
# server, so the functions that call www.sec.gov build their own header dict
# without it.
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept-Encoding": "gzip, deflate",
    "Host": "data.sec.gov"
}

# Rate limiting: SEC allows 10 requests per second
# Minimum spacing, in seconds, that rate_limit() enforces between requests.
REQUEST_DELAY = 0.11  # 110ms between requests (slightly over 100ms to be safe)

# ============================================================================
# RATE LIMITING
# ============================================================================

# Reading of time.monotonic() taken when the most recent request was released.
# Starts at 0, so the first call normally goes through without waiting.
last_request_time = 0

def rate_limit():
    """
    Ensure we don't exceed SEC's rate limit of 10 requests per second.

    Sleeps just long enough that at least REQUEST_DELAY seconds separate this
    call from the previous one, then records the new release time in the
    module-level ``last_request_time``.

    The monotonic clock is used rather than wall-clock time: if the system
    clock were stepped backwards, a wall-clock difference would go negative
    and the computed sleep would be as long as the adjustment itself.
    """
    global last_request_time
    elapsed = time.monotonic() - last_request_time

    if elapsed < REQUEST_DELAY:
        time.sleep(REQUEST_DELAY - elapsed)

    # Re-read the clock after sleeping so the next gap is measured from release
    last_request_time = time.monotonic()

def with_rate_limit(func):
    """
    Decorator that throttles a function through rate_limit().

    Each call to the decorated function first waits in rate_limit() and then
    forwards all positional and keyword arguments unchanged, returning
    whatever the wrapped function returns.
    """
    @wraps(func)
    def throttled(*call_args, **call_kwargs):
        # Wait for our slot before touching the network
        rate_limit()
        outcome = func(*call_args, **call_kwargs)
        return outcome

    return throttled

# ============================================================================
# CIK LOOKUP
# ============================================================================

@with_rate_limit
def get_cik_by_ticker(ticker: str) -> Optional[str]:
    """
    Look up a company's CIK by ticker symbol using SEC's company tickers JSON.

    The full ticker table is downloaded on every call and scanned linearly.

    Args:
        ticker: Stock ticker symbol; matched case-insensitively, ignoring
            surrounding whitespace.

    Returns:
        CIK as a 10-digit string with leading zeros, or None when the ticker
        is not in the table or the download/parse fails. A message is printed
        in both failure cases.
    """
    # SEC provides a JSON file mapping all tickers to CIKs
    url = "https://www.sec.gov/files/company_tickers.json"

    # Use modified headers for www.sec.gov (not data.sec.gov): the shared
    # HEADERS dict pins "Host" to data.sec.gov and must not be sent here.
    headers = {
        "User-Agent": USER_AGENT,
        "Accept-Encoding": "gzip, deflate"
    }

    try:
        # Without a timeout a stalled connection would block the caller forever
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()

        # The JSON format is: {0: {"cik_str": 320193, "ticker": "AAPL", "title": "Apple Inc."}, ...}
        ticker_upper = ticker.strip().upper()

        for entry in data.values():
            if entry.get('ticker', '').upper() == ticker_upper:
                # Convert CIK to 10-digit string with leading zeros
                return str(entry['cik_str']).zfill(10)

        print(f"Could not find CIK for ticker {ticker}")
        print("Please look up the CIK manually at: https://www.sec.gov/edgar/searchedgar/companysearch.html")
        return None

    except Exception as e:
        # Deliberately broad: the lookup is best-effort and callers only test
        # the result for None, so every failure is reported and mapped to None.
        print(f"Error fetching company tickers: {e}")
        return None

# ============================================================================
# FILINGS RETRIEVAL
# ============================================================================

@with_rate_limit
def get_company_submissions(cik: str) -> Dict:
    """
    Get all submission history for a company.

    Args:
        cik: Company CIK, e.g. "0000320193" for Apple. Shorter values (or an
            int) are accepted and left-padded with zeros to 10 digits.

    Returns:
        Complete submissions data including company info and filing history

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    cik = str(cik).zfill(10)  # Ensure 10 digits with leading zeros
    url = f"{SEC_BASE_URL}/submissions/CIK{cik}.json"

    # Without a timeout a stalled connection would block the caller forever
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    return response.json()

def filter_insider_filings(submissions_data: Dict,
                          form_types: Optional[List[str]] = None,
                          days_back: Optional[int] = None) -> List[Dict]:
    """
    Filter submissions to get only insider trading forms (3, 4, 5).

    Args:
        submissions_data: Data from get_company_submissions()
        form_types: Base form types to include; None (the default) means
            ['3', '4', '5']. Amendments such as "4/A" match their base form.
        days_back: Only include filings from the last N days (optional).
            None or 0 disables the date filter.

    Returns:
        List of insider trading filings with metadata, in the order they
        appear in the submissions data. Optional columns that are absent
        from the data are reported as None.
    """
    # None sentinel instead of a list literal default, which would be a
    # single list object shared by every call.
    if form_types is None:
        form_types = ['3', '4', '5']

    recent = submissions_data.get('filings', {}).get('recent', {})
    forms = recent.get('form', [])

    # Dates are ISO "YYYY-MM-DD" strings, so plain string comparison orders
    # them chronologically.
    cutoff_date = None
    if days_back:
        cutoff_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')

    # Resolve the optional columns once, rather than rebuilding a placeholder
    # list for every field of every matching filing.
    placeholder = [None] * len(forms)
    report_dates = recent.get('reportDate', placeholder)
    acceptance_times = recent.get('acceptanceDateTime', placeholder)
    primary_docs = recent.get('primaryDocument', placeholder)
    descriptions = recent.get('primaryDocDescription', placeholder)

    insider_filings = []

    for i, form in enumerate(forms):
        # Check if it's an insider form (including amended versions like 4/A)
        if form.split('/')[0] not in form_types:
            continue

        filing_date = recent['filingDate'][i]

        # Apply date filter if specified
        if cutoff_date and filing_date < cutoff_date:
            continue

        insider_filings.append({
            'accessionNumber': recent['accessionNumber'][i],
            'filingDate': filing_date,
            'reportDate': report_dates[i],
            'acceptanceDateTime': acceptance_times[i],
            'form': form,
            'primaryDocument': primary_docs[i],
            'description': descriptions[i],
            'isAmendment': '/A' in form
        })

    return insider_filings

# ============================================================================
# URL CONSTRUCTION
# ============================================================================

def get_form4_url(cik: str, accession_number: str, primary_doc: str) -> str:
    """
    Build the archive URL for a filing's primary document.

    Args:
        cik: Company CIK (with or without leading zeros)
        accession_number: Accession number with dashes
        primary_doc: Primary document filename; any XSLT prefix such as
            "xslF345X05/" is kept, so the URL points at the rendered view

    Returns:
        Complete URL to the filing
    """
    # Archive paths use the CIK without zero padding; an all-zero CIK still
    # needs one digit left.
    short_cik = cik.lstrip('0')
    if not short_cik:
        short_cik = '0'

    # The folder name is the accession number with its dashes removed
    folder = accession_number.replace('-', '')

    return "https://www.sec.gov/Archives/edgar/data/{}/{}/{}".format(
        short_cik, folder, primary_doc
    )

# ============================================================================
# FORM 4 XML PARSING
# ============================================================================

@with_rate_limit
def fetch_form4_xml(cik: str, accession_number: str, primary_doc: str) -> str:
    """
    Download the raw XML content of a Form 4 filing.

    CRITICAL: Must remove XSLT stylesheet path to get raw XML instead of HTML.
    The primaryDocument field includes paths like "xslF345X05/" which causes
    the server to return HTML-rendered content instead of raw XML.

    Args:
        cik: Company CIK (with or without leading zeros)
        accession_number: Accession number, with or without dashes
        primary_doc: Primary document name, optionally prefixed by an XSLT
            directory

    Returns:
        The response body as text.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    # For URL, remove leading zeros from CIK but keep at least one digit
    cik_for_url = cik.lstrip('0') or '0'
    acc_no = accession_number.replace('-', '')

    # CRITICAL: Remove XSLT stylesheet path to get raw XML
    # If we keep "xslF345X05/" in the path, we get HTML instead of XML
    if '/' in primary_doc:
        primary_doc = primary_doc.split('/')[-1]  # Take only the filename

    url = f"{EDGAR_ARCHIVES_BASE}/{cik_for_url}/{acc_no}/{primary_doc}"

    # Use modified headers for archive server (not data.sec.gov): the shared
    # HEADERS dict pins "Host" to data.sec.gov and must not be sent here.
    archive_headers = {
        "User-Agent": USER_AGENT,
        "Accept-Encoding": "gzip, deflate"
    }

    # Without a timeout a stalled connection would block the caller forever
    response = requests.get(url, headers=archive_headers, timeout=30)
    response.raise_for_status()
    return response.text

def parse_form4(xml_content: str) -> Dict:
    """
    Turn Form 4 XML text into a plain dictionary.

    Returns:
        On success, a dict with the keys 'issuer', 'reportingOwner',
        'transactions', 'derivativeTransactions' and 'holdings'. 'issuer'
        and 'reportingOwner' are None when the matching element is absent;
        only the first <reportingOwner> in the document is read.
        If the text is not well-formed XML, a dict with the single key
        'error' is returned instead.
    """
    try:
        document = ET.fromstring(xml_content)
    except ET.ParseError as exc:
        return {'error': f'XML parsing error: {str(exc)}'}

    # Issuer: the company whose stock is being traded
    issuer_info = None
    issuer_node = document.find('.//issuer')
    if issuer_node is not None:
        issuer_info = {
            'name': _get_text(issuer_node, 'issuerName'),
            'cik': _get_text(issuer_node, 'issuerCik'),
            'ticker': _get_text(issuer_node, 'issuerTradingSymbol')
        }

    # Reporting owner: the insider doing the trading
    owner_info = None
    owner_node = document.find('.//reportingOwner')
    if owner_node is not None:
        id_node = owner_node.find('.//reportingOwnerId')
        relationship_node = owner_node.find('.//reportingOwnerRelationship')
        owner_info = {
            'name': _get_text(id_node, 'rptOwnerName'),
            'cik': _get_text(id_node, 'rptOwnerCik'),
            'address': _parse_address(id_node),
            'relationship': _parse_relationship(relationship_node)
        }

    # Regular stock transactions; falsy parse results are dropped
    stock_candidates = (
        _parse_transaction(node, derivative=False)
        for node in document.findall('.//nonDerivativeTransaction')
    )
    stock_transactions = [item for item in stock_candidates if item]

    # Options, warrants and other derivative transactions
    derivative_candidates = (
        _parse_transaction(node, derivative=True)
        for node in document.findall('.//derivativeTransaction')
    )
    derivative_transactions = [item for item in derivative_candidates if item]

    return {
        'issuer': issuer_info,
        'reportingOwner': owner_info,
        'transactions': stock_transactions,
        'derivativeTransactions': derivative_transactions,
        # Holdings reported without an accompanying transaction
        'holdings': _parse_holdings(document)
    }

def _get_text(element, tag_name: str) -> Optional[str]:
    """
    Read the text of the first descendant named ``tag_name``.

    If that descendant has a direct <value> child, the child's text is
    returned; otherwise the descendant's own text is. Returns None when
    ``element`` is None or no such descendant exists.
    """
    node = None if element is None else element.find(f'.//{tag_name}')
    if node is None:
        return None

    # Prefer the nested <value> wrapper when there is one
    wrapped = node.find('value')
    return node.text if wrapped is None else wrapped.text

def _parse_address(owner_id) -> Dict:
    """
    Extract the reporting owner's mailing address.

    Returns an empty dict when ``owner_id`` is None or contains no
    <reportingOwnerAddress>; otherwise a dict with the keys street1,
    street2, city, state and zipCode (each possibly None).
    """
    address = None if owner_id is None else owner_id.find('.//reportingOwnerAddress')
    if address is None:
        return {}

    # Output key -> XML tag, in the order the keys appear in the result
    field_tags = (
        ('street1', 'rptOwnerStreet1'),
        ('street2', 'rptOwnerStreet2'),
        ('city', 'rptOwnerCity'),
        ('state', 'rptOwnerState'),
        ('zipCode', 'rptOwnerZipCode'),
    )
    return {key: _get_text(address, tag) for key, tag in field_tags}

def _parse_relationship(relationship) -> Dict:
    """
    Parse the relationship of the reporting owner to the company.

    Args:
        relationship: The <reportingOwnerRelationship> element, or None.

    Returns:
        An empty dict when ``relationship`` is None; otherwise a dict with
        the boolean keys isDirector, isOfficer, isTenPercentOwner and
        isOther, plus officerTitle (text or None).
    """
    if relationship is None:
        return {}

    def flag(tag_name: str) -> bool:
        # Boolean fields may be written as "1" or as "true", possibly padded
        # with whitespace; comparing against "1" alone reads those as False.
        raw = _get_text(relationship, tag_name)
        return raw is not None and raw.strip().lower() in ('1', 'true')

    return {
        'isDirector': flag('isDirector'),
        'isOfficer': flag('isOfficer'),
        'isTenPercentOwner': flag('isTenPercentOwner'),
        'isOther': flag('isOther'),
        'officerTitle': _get_text(relationship, 'officerTitle')
    }

def _parse_transaction(trans_element, derivative: bool = False) -> Optional[Dict]:
    """
    Parse a single transaction from Form 4.

    Args:
        trans_element: A <nonDerivativeTransaction> or <derivativeTransaction>
            element, or None.
        derivative: Accepted so callers can mark the transaction kind; the
            parsed fields are currently the same either way.

    Returns:
        None when ``trans_element`` is None; otherwise a dict with the keys
        securityTitle, transactionDate, deemedExecutionDate, transactionCode,
        equitySwapInvolved, shares, pricePerShare, acquiredDisposed,
        totalValue, sharesOwnedAfter and directIndirect. Numeric fields are
        floats, or None when missing, blank, or not a number. totalValue is
        only computed when both shares and price are non-zero.
    """
    if trans_element is None:
        return None

    # Helper to get value from nested structure
    def get_value(parent, tag):
        elem = parent.find(f'.//{tag}')
        if elem is None:
            return None
        value_elem = elem.find('value')
        return value_elem.text if value_elem is not None else elem.text

    def get_number(tag):
        # An element holding only a <footnoteId> child has whitespace as its
        # text; float() would raise on that and the caller would lose the
        # whole filing, so blank or malformed numbers become None instead.
        raw = get_value(trans_element, tag)
        if raw is None or not raw.strip():
            return None
        try:
            return float(raw)
        except ValueError:
            return None

    def get_flag(tag):
        # Boolean fields may be written as "1" or as "true"
        raw = get_value(trans_element, tag)
        return raw is not None and raw.strip().lower() in ('1', 'true')

    transaction = {
        'securityTitle': get_value(trans_element, 'securityTitle'),
        'transactionDate': get_value(trans_element, 'transactionDate'),
        'deemedExecutionDate': get_value(trans_element, 'deemedExecutionDate'),
        'transactionCode': get_value(trans_element, 'transactionCode'),
        'equitySwapInvolved': get_flag('equitySwapInvolved'),
    }

    # Parse transaction amounts
    transaction.update({
        'shares': get_number('transactionShares'),
        'pricePerShare': get_number('transactionPricePerShare'),
        'acquiredDisposed': get_value(trans_element, 'transactionAcquiredDisposedCode'),
        'totalValue': None
    })

    # Calculate total value
    if transaction['shares'] and transaction['pricePerShare']:
        transaction['totalValue'] = transaction['shares'] * transaction['pricePerShare']

    # Parse post-transaction ownership
    transaction['sharesOwnedAfter'] = get_number('sharesOwnedFollowingTransaction')

    # Parse ownership nature
    transaction['directIndirect'] = get_value(trans_element, 'directOrIndirectOwnership')

    return transaction

def _parse_holdings(root) -> List[Dict]:
    """
    Collect every <nonDerivativeHolding> under ``root``.

    Each holding becomes a dict with the keys securityTitle, shares,
    directIndirect and natureOfOwnership. Values are the raw element text
    (strings, not numbers) or None when the element is absent.
    """
    def read(node, tag):
        # Text of the first matching descendant, preferring a <value> child
        match = node.find(f'.//{tag}')
        if match is None:
            return None
        inner = match.find('value')
        return match.text if inner is None else inner.text

    # Output key -> XML tag, in the order the keys appear in each result dict
    columns = (
        ('securityTitle', 'securityTitle'),
        ('shares', 'sharesOwnedFollowingTransaction'),
        ('directIndirect', 'directOrIndirectOwnership'),
        ('natureOfOwnership', 'natureOfOwnership'),
    )

    return [
        {key: read(holding, tag) for key, tag in columns}
        for holding in root.findall('.//nonDerivativeHolding')
    ]

# ============================================================================
# TRANSACTION CODE MAPPING
# ============================================================================

# Short labels for the transaction codes used on Forms 4 and 5.
# H, X and U previously carried labels that did not match the form
# instructions (X was labelled out-of-the-money, which is code O), and
# O and V were missing.
TRANSACTION_CODES = {
    'P': 'Open Market Purchase',
    'S': 'Open Market Sale',
    'V': 'Voluntarily Reported Early',
    'A': 'Grant/Award',
    'D': 'Sale to Issuer',
    'F': 'Payment of Exercise Price or Tax Liability',
    'G': 'Gift',
    'M': 'Exercise of Options',
    'C': 'Conversion',
    'E': 'Expiration',
    'H': 'Expiration or Cancellation of Long Derivative Position',
    'I': 'Discretionary Transaction',
    'J': 'Other',
    'K': 'Equity Swap',
    'L': 'Small Acquisition',
    'O': 'Exercise of Out-of-the-Money Derivative',
    'U': 'Disposition via Tender in Change of Control',
    'W': 'Acquisition or Disposition by Will',
    'X': 'Exercise of In-the-Money or At-the-Money Derivative',
    'Z': 'Deposit into or Withdrawal from Voting Trust'
}

def get_transaction_description(code: str) -> str:
    """
    Get human-readable description of a transaction code.

    Args:
        code: Single-letter transaction code as it appears in the filing.

    Returns:
        The label from TRANSACTION_CODES, or "Unknown (<code>)" for any value
        not in the table (including None).
    """
    return TRANSACTION_CODES.get(code, f'Unknown ({code})')

# ============================================================================
# CONVENIENCE FUNCTIONS
# ============================================================================

def get_recent_insider_trades(cik: str, days: int = 30) -> List[Dict]:
    """
    Get all insider trades for a company in the last N days with parsed details.

    Args:
        cik: Company CIK (10 digits with leading zeros)
        days: Number of days to look back

    Returns:
        List of parsed Form 4 filings with transaction details. Each entry is
        the parse_form4() result with the filing's metadata attached under
        'filing_metadata'. Filings that cannot be downloaded are reported on
        stdout and left out of the list.

    Raises:
        Whatever get_company_submissions() raises if the filing index itself
        cannot be fetched; that call is not guarded.
    """
    # Get company submissions
    submissions = get_company_submissions(cik)

    # Filter for Form 4 in the specified time period
    form4s = filter_insider_filings(submissions, form_types=['4'], days_back=days)

    # Parse each Form 4
    parsed_filings = []
    for filing in form4s:
        try:
            # Fetch XML
            xml_content = fetch_form4_xml(
                cik,
                filing['accessionNumber'],
                filing['primaryDocument']
            )

            # Parse XML
            parsed = parse_form4(xml_content)
            parsed['filing_metadata'] = filing

            parsed_filings.append(parsed)
        except Exception as e:
            # Best-effort: one bad filing must not abort the batch, but say
            # which one was dropped so a short result list can be explained.
            print(f"Skipping filing {filing.get('accessionNumber')}: {e}")
            continue

    return parsed_filings

def get_recent_insider_trades_by_ticker(ticker: str, days: int = 30) -> List[Dict]:
    """
    Ticker-based front end to get_recent_insider_trades().

    Args:
        ticker: Stock ticker symbol (e.g., "AAPL", "MSFT")
        days: Number of days to look back

    Returns:
        List of parsed Form 4 filings with transaction details

    Raises:
        ValueError: if the ticker cannot be resolved to a CIK.
    """
    # Resolve the ticker first; the filing lookup itself is keyed by CIK
    resolved_cik = get_cik_by_ticker(ticker)

    if resolved_cik is not None:
        return get_recent_insider_trades(resolved_cik, days)

    raise ValueError(f"Could not find CIK for ticker {ticker}")

def print_insider_summary(parsed_filings: List[Dict], verbose: bool = True):
    """
    Print a human-readable summary of insider trades.

    Tolerates incomplete filings: a missing issuer or reporting owner, a
    missing share count (shown as "N/A"), an officer with no title (shown as
    "Officer"), and filings that only carry a parse error.

    Args:
        parsed_filings: List of parsed Form 4 filings
        verbose: If True, print detailed summary. If False, print minimal info.
    """
    if not verbose:
        print(f"Processed {len(parsed_filings)} Form 4 filings")
        return

    def format_count(value) -> str:
        # Share counts are None when the filing had no usable number;
        # applying a numeric format spec to None would raise TypeError.
        return f"{value:,.0f}" if value is not None else "N/A"

    print(f"\n{'='*80}")
    print(f"INSIDER TRADING SUMMARY - {len(parsed_filings)} Filings")
    print(f"{'='*80}\n")

    for filing in parsed_filings:
        # parse_form4 stores None (not a missing key) for absent sections, so
        # a .get() default alone would still yield None; `or {}` covers both.
        metadata = filing.get('filing_metadata') or {}
        owner = filing.get('reportingOwner') or {}
        issuer = filing.get('issuer') or {}

        print(f"Filing Date: {metadata.get('filingDate')}")
        if filing.get('error'):
            # Filings whose XML could not be parsed carry only this message
            print(f"Parse Error: {filing['error']}")
        print(f"Company: {issuer.get('name')} ({issuer.get('ticker')})")
        print(f"Insider: {owner.get('name')}")

        rel = owner.get('relationship') or {}
        roles = []
        if rel.get('isDirector'):
            roles.append('Director')
        if rel.get('isOfficer'):
            # The title key is always present but may be None, so a .get()
            # default would not apply and join() would fail on the None.
            roles.append(rel.get('officerTitle') or 'Officer')
        if rel.get('isTenPercentOwner'):
            roles.append('10% Owner')

        if roles:
            print(f"Role: {', '.join(roles)}")

        print(f"\nTransactions:")
        for trans in filing.get('transactions') or []:
            code = trans.get('transactionCode')
            action = get_transaction_description(code)
            acquired = trans.get('acquiredDisposed') == 'A'

            shares_text = format_count(trans.get('shares'))
            price = trans.get('pricePerShare')
            total = trans.get('totalValue')

            action_verb = "Acquired" if acquired else "Disposed"
            if price and total:
                print(f"  - {action_verb} {shares_text} shares @ ${price:.2f} = ${total:,.2f}")
            else:
                print(f"  - {action_verb} {shares_text} shares")
            print(f"    Type: {action} ({code})")
            print(f"    Date: {trans.get('transactionDate')}")

            # Compare against None so a balance of zero (position fully
            # closed out) is still shown.
            owned_after = trans.get('sharesOwnedAfter')
            if owned_after is not None:
                print(f"    Shares Owned After: {owned_after:,.0f}")

        # Print derivative transactions if any
        deriv_trans = filing.get('derivativeTransactions') or []
        if deriv_trans:
            print(f"\nDerivative Transactions:")
            for trans in deriv_trans:
                code = trans.get('transactionCode')
                action = get_transaction_description(code)
                acquired = trans.get('acquiredDisposed') == 'A'

                shares_text = format_count(trans.get('shares'))
                action_verb = "Acquired" if acquired else "Disposed"

                print(f"  - {action_verb} {shares_text} derivative securities")
                print(f"    Security: {trans.get('securityTitle')}")
                print(f"    Type: {action} ({code})")
                print(f"    Date: {trans.get('transactionDate')}")

        print(f"\n{'-'*80}\n")

# ============================================================================
# EXAMPLE USAGE
# ============================================================================

if __name__ == "__main__":
    # Demo run: pull recent Form 4 activity for one ticker (NVIDIA here)
    # and dump the parsed filings to a JSON file in the working directory.
    TICKER = "NVDA"

    print(f"Fetching insider trades for {TICKER}...")

    try:
        # Resolve the ticker to a CIK, then collect the last 90 days
        trades = get_recent_insider_trades_by_ticker(TICKER, days=90)

        # Persist the parsed filings as indented JSON
        with open('insider_trades.json', 'w') as out_file:
            json.dump(trades, out_file, indent=2)

        print(f"✓ Successfully saved {len(trades)} filings to insider_trades.json")

    except Exception as err:
        # Any failure ends the demo with a hint about the most common cause
        print(f"✗ Error: {err}")
        print("\nMake sure you've updated the USER_AGENT at the top of the file!")

Fetching insider trades for NVDA...
✓ Successfully saved 29 filings to insider_trades.json
