# SEC Filing Download, Parsing, and Chunking

This notebook demonstrates how to:
1. Download SEC filings using the EDGAR client
2. Parse XML/HTML documents from SEC filings
3. Create intelligent chunks for RAG systems
4. Prepare chunks (with metadata) for a vector database and save them to JSON

Based on techniques from `week1/docs.py` and `week1/intelligent-chunking.ipynb`.


In [None]:
# Import required libraries
from sec_edgar_client import SECEdgarClient
from sec_xml_parser import parse_sec_filing, chunk_sec_documents, SECXMLParser
import json
from pathlib import Path


## Initialize the SEC EDGAR Client

The client will automatically read your `SEC_USER_AGENT` from the `.env` file.


In [None]:
# Create the EDGAR client that the fetch and download cells below call into.
# Per the notebook text, it reads SEC_USER_AGENT from .env automatically.
# NOTE(review): that behavior lives in sec_edgar_client — confirm there.
client = SECEdgarClient()


## Step 1: Fetch Filings

Fetch a company's recent filings and filter them down to 10-K annual reports.


In [None]:
# Ask the client for Apple's filings (CIK 320193), passing years=3
apple_cik = "320193"
filings = client.fetch_filings(apple_cik, years=3)

# Keep only the entries whose form type is exactly 10-K (annual report)
ten_k_filings = list(filter(lambda entry: entry['form'] == '10-K', filings))

print(f"Found {len(ten_k_filings)} 10-K filings")

# Summarize the first 10-K in the list, which this notebook treats as the latest
if len(ten_k_filings) > 0:
    latest = ten_k_filings[0]
    print(f"\nMost recent 10-K:")
    print(f"  Date: {latest['filing_date']}")
    print(f"  Accession: {latest['accession_number']}")
    print(f"  Document: {latest['primary_document']}")


## Step 2: Download and Parse a Filing


In [None]:
# Download the most recent 10-K.
#
# file_path is reset on every run so the parsing cell's `if file_path:` check
# sees None when there is nothing to download, instead of raising NameError
# on a fresh kernel or silently reusing a path left over from an earlier run.
file_path = None

if ten_k_filings:
    # First entry of the filtered list is treated as the most recent filing
    filing = ten_k_filings[0]

    # Download to a directory (created on first use)
    output_dir = "sec_downloads"
    Path(output_dir).mkdir(exist_ok=True)

    file_path = client.download_filing_document(
        accession_number=filing['accession_number'],
        primary_document=filing['primary_document'],
        cik=apple_cik,
        output_dir=output_dir
    )

    print(f"Downloaded to: {file_path}")
else:
    print("No 10-K filings found; nothing to download")


In [None]:
# Parse the file fetched by the download cell and report what was extracted
if file_path:
    parser = SECXMLParser()
    parsed_doc = parse_sec_filing(file_path, document_name=filing['primary_document'])

    print(f"Parsed document: {parsed_doc['document_name']}")
    print(f"Filing type: {parsed_doc.get('filing_type', 'Unknown')}")

    # Missing 'sections' is treated as an empty list
    doc_sections = parsed_doc.get('sections', [])
    print(f"Number of sections: {len(doc_sections)}")

    # Preview at most the first three sections, numbered from 1
    for number, section in enumerate(doc_sections[:3], start=1):
        print(f"\nSection {number}: {section.get('title', 'Untitled')}")
        body = section.get('content', '')
        print(f"  Length: {len(body)} chars")
        print(f"  Preview: {body[:100]}...")


## Step 3: Create Intelligent Chunks

Use `chunk_sec_documents` to split the parsed document into manageable pieces for RAG.


In [None]:
# Split the parsed filing into chunks.
# Tune these two values for your use case:
# - CHUNK_SIZE: forwarded as `size`, described as the max characters per chunk
# - CHUNK_STEP: forwarded as `step`, described as the overlap between chunks
#   NOTE(review): confirm in chunk_sec_documents whether `step` is the overlap
#   itself or the distance between consecutive chunk starts.
CHUNK_SIZE = 2000
CHUNK_STEP = 1000

chunks = chunk_sec_documents(
    [parsed_doc],
    size=CHUNK_SIZE,
    step=CHUNK_STEP,
    chunk_by_section=True,  # Preserve section boundaries
)

print(f"Created {len(chunks)} chunks")
print(f"\nFirst 3 chunks:")

# Preview at most the first three chunks, numbered from 1
for number, chunk in enumerate(chunks[:3], start=1):
    print(f"\nChunk {number}:")
    print(f"  Section: {chunk.get('section_title', 'N/A')}")
    text = chunk.get('content', '')
    print(f"  Content length: {len(text)}")
    print(f"  Start position: {chunk.get('start', 0)}")
    print(f"  Preview: {text[:150]}...")


## Step 4: Prepare Chunks for Vector Database

Add metadata and prepare for embedding


In [None]:
# Enhance chunks with metadata for RAG.
#
# Each record carries:
#   - 'id':       "<document_name>_chunk_<index>"
#   - 'content':  the chunk text ('' when the chunk has no 'content' key)
#   - 'metadata': document/filing fields plus the chunk's section and position

# Document-level values are the same for every chunk, so look them up once
document_name = parsed_doc['document_name']
document_filing_type = parsed_doc.get('filing_type', 'Unknown')

enhanced_chunks = [
    {
        'id': f"{document_name}_chunk_{i}",
        'content': chunk.get('content', ''),
        'metadata': {
            'document_name': document_name,
            'filing_type': document_filing_type,
            'section_title': chunk.get('section_title', 'Untitled'),
            'filing_date': filing['filing_date'],
            'cik': apple_cik,
            'form': filing['form'],
            'start_pos': chunk.get('start', 0),
            'chunk_index': i
        }
    }
    for i, chunk in enumerate(chunks)
]

print(f"Prepared {len(enhanced_chunks)} enhanced chunks")

# Show example; guarded so an empty chunk list does not raise IndexError
if enhanced_chunks:
    example = enhanced_chunks[0]
    print(f"\nExample chunk metadata: {json.dumps(example['metadata'], indent=2)}")
else:
    print("\nNo chunks were produced; nothing to preview")


## Step 5: Save Chunks (Optional)

Save for later use or batch processing


In [None]:
# Write the enhanced chunks as indented UTF-8 JSON.
# The file name is relative, so it resolves against the current working directory.
output_file = f"chunks_{parsed_doc['document_name']}.json"

# ensure_ascii=False keeps non-ASCII characters as-is instead of \u escapes
serialized = json.dumps(enhanced_chunks, indent=2, ensure_ascii=False)
with open(output_file, 'w', encoding='utf-8') as handle:
    handle.write(serialized)

print(f"Saved {len(enhanced_chunks)} chunks to {output_file}")
