In [1]:
# 10-K Report Pre-Processing
# This notebook reduces the number of chunks in 10-K reports to a more manageable size
# while maintaining the most relevant information across all available reports.

import os
import json
import glob
import re
import tqdm
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from concurrent.futures import ProcessPoolExecutor, as_completed
import time
import gc
from typing import List, Dict, Any, Tuple, Optional

# Mount Google Drive. These two lines require Google Colab; comment them out
# (and point BASE_DIR at a local folder) when running anywhere else.
from google.colab import drive
drive.mount('/content/drive')

# Configuration
BASE_DIR = "/content/drive/MyDrive/MSDS490_Project"
# Input root: scanned for one sub-directory per company containing *_rag.json files.
TENK_DIR = os.path.join(BASE_DIR, "10-K")
# Output root: reduced files and the summary JSON are written here.
OUTPUT_DIR = os.path.join(BASE_DIR, "10-K_Cleaned")
TARGET_CHUNKS = 50000  # Target number of chunks to keep across all reports
MAX_WORKERS = 4  # Worker processes for the process pool; adjust to available resources

# Create output directory (no error if it already exists)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Helper functions

def count_chunks_in_file(file_path):
    """Return how many chunks a JSON report file holds.

    A top-level JSON list counts one chunk per element; any other top-level
    value counts as a single chunk. If the file cannot be read or parsed,
    the error is printed and 0 is returned.
    """
    try:
        with open(file_path, 'r') as handle:
            payload = json.load(handle)
    except Exception as e:
        print(f"Error counting chunks in {file_path}: {e}")
        return 0

    return len(payload) if isinstance(payload, list) else 1

def get_report_metadata(file_path):
    """Build a metadata record for one report file.

    The company is the name of the file's parent directory. The year
    defaults to the first four-digit run in the filename ("unknown" if there
    is none) and the filing number defaults to 0; both are overridden by the
    'metadata' dict of the first chunk when the file is a non-empty list
    whose first element is a dict.

    Returns a dict with keys 'company', 'filename', 'year' and
    'filing_number'. On any error the problem is printed and a fallback
    record with year "unknown" and filing number 0 is returned.
    """
    try:
        report_name = os.path.basename(file_path)
        company_name = os.path.basename(os.path.dirname(file_path))

        # Defaults derived from the filename alone.
        digits = re.search(r'(\d{4})', report_name)
        report_year = "unknown" if digits is None else digits.group(1)
        filing_no = 0

        with open(file_path, 'r') as handle:
            payload = json.load(handle)

        # Only the first chunk is consulted for embedded metadata.
        first_is_record = (
            isinstance(payload, list)
            and len(payload) > 0
            and isinstance(payload[0], dict)
        )
        if first_is_record:
            embedded = payload[0].get('metadata', {})
            report_year = embedded.get('year', report_year)
            filing_no = embedded.get('filing_number', 0)

        return {
            'company': company_name,
            'filename': report_name,
            'year': report_year,
            'filing_number': filing_no
        }
    except Exception as e:
        print(f"Error extracting metadata from {file_path}: {e}")
        fallback = {
            'company': os.path.basename(os.path.dirname(file_path)),
            'filename': os.path.basename(file_path),
            'year': "unknown",
            'filing_number': 0
        }
        return fallback

def collect_report_info():
    """Scan TENK_DIR and describe every '*_rag.json' report found.

    Each immediate sub-directory of TENK_DIR is treated as one company.

    Returns:
        A tuple (report_files, total_chunks). report_files is a list with
        one dict per file that contains at least one chunk, holding 'path',
        'num_chunks' and the fields from get_report_metadata. total_chunks
        is the chunk count summed over all scanned files.
    """
    print("Scanning directories for 10-K reports...")

    company_names = []
    for entry in os.listdir(TENK_DIR):
        if os.path.isdir(os.path.join(TENK_DIR, entry)):
            company_names.append(entry)

    reports = []
    chunk_total = 0
    for name in tqdm.tqdm(company_names):
        folder = os.path.join(TENK_DIR, name)
        for report_path in glob.glob(os.path.join(folder, "*_rag.json")):
            chunk_count = count_chunks_in_file(report_path)
            chunk_total += chunk_count

            # Unreadable or empty files are counted (as 0) but not listed.
            if chunk_count <= 0:
                continue

            record = {'path': report_path, 'num_chunks': chunk_count}
            record.update(get_report_metadata(report_path))
            reports.append(record)

    print(f"Found {len(reports)} report files with a total of {chunk_total} chunks")
    return reports, chunk_total

# Phrases that mark a chunk as coming from a high-value part of a 10-K.
# Every entry MUST be lower-case: matching is done against lower-cased text.
_IMPORTANT_KEYWORDS = (
    "risk factors", "management discussion", "financial statements",
    "md&a", "business overview", "executive summary", "results of operations",
    "liquidity", "capital resources", "critical accounting", "market risk",
    "revenue", "profit", "income", "expense", "cash flow", "assets",
    "liabilities", "debt", "equity", "dividend", "acquisition", "strategy",
    "outlook", "guidance", "forecast",
)


def is_important_section(text):
    """Return True if a chunk's text mentions any important 10-K keyword.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside longer words (e.g. "income" matches "incoming").

    Args:
        text: Chunk text. Empty or None text is never important.

    Returns:
        bool: True when at least one keyword occurs in the text.
    """
    if not text:
        return False

    text_lower = text.lower()
    return any(keyword in text_lower for keyword in _IMPORTANT_KEYWORDS)

def _min_max_scale(values):
    """Rescale a 1-D array to [0, 1]; return all zeros if every value is equal."""
    low = values.min()
    span = values.max() - low
    # A zero (or NaN) span would turn the division into NaN/inf scores.
    if not span > 0:
        return np.zeros(len(values))
    return (values - low) / span


def calculate_chunk_importance(chunks):
    """Score every chunk by how worthwhile it is to keep.

    The score is the sum of three parts:
      1. +2.0 if the text mentions an important-section keyword.
      2. 0..1  min-max normalised mean TF-IDF weight of the chunk.
      3. 0..0.5 min-max normalised text length (scaled by 0.5).

    A part whose values are identical for all chunks contributes nothing,
    since it cannot help rank them.

    Args:
        chunks: Sequence of dicts; the chunk text is read from the 'text'
            key. A missing or None text is treated as an empty string.

    Returns:
        numpy.ndarray of floats, one score per chunk, in input order
        (empty when `chunks` is empty).
    """
    if len(chunks) == 0:
        return np.zeros(0)

    # Extract text content; coerce missing/None text so len() and the
    # vectorizer always receive a string.
    texts = [chunk.get('text') or '' for chunk in chunks]

    # 1. Score based on important section keywords
    keyword_hits = np.array([is_important_section(text) for text in texts], dtype=float)
    importance_scores = 2.0 * keyword_hits

    # 2. Score based on TF-IDF (identifies unique/distinctive chunks).
    # Best effort: fitting can fail (e.g. no terms survive min_df/max_df
    # pruning), in which case this component is skipped.
    try:
        vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            min_df=2,
            max_df=0.95
        )
        tfidf_matrix = vectorizer.fit_transform(texts)

        # Mean TF-IDF weight per document, computed in one sparse operation
        # instead of slicing the matrix row by row.
        tfidf_scores = np.asarray(tfidf_matrix.mean(axis=1)).ravel()

        importance_scores += _min_max_scale(tfidf_scores)
    except Exception as e:
        print(f"Error calculating TF-IDF: {e}")

    # 3. Score based on chunk length (longer chunks may contain more information)
    text_lengths = np.array([len(text) for text in texts])
    importance_scores += 0.5 * _min_max_scale(text_lengths)

    return importance_scores

def process_file(file_path, reduction_factor):
    """Load one report file and return only its highest-scoring chunks.

    A top-level JSON value that is not a list is wrapped in a one-element
    list. Files with 10 or fewer chunks are returned unchanged. Otherwise
    the top `max(10, int(n * reduction_factor))` chunks by importance score
    are returned in their original document order.

    On any error the problem is printed and an empty list is returned.
    """
    try:
        with open(file_path, 'r') as handle:
            records = json.load(handle)

        if not isinstance(records, list):
            records = [records]

        total = len(records)

        # Small files are kept whole.
        if total <= 10:
            return records

        scores = calculate_chunk_importance(records)
        keep_count = max(10, int(total * reduction_factor))

        # Rank descending by score, cut to the quota, then restore the
        # original ordering of the surviving chunks.
        ranked = np.argsort(-scores)
        chosen = np.sort(ranked[:keep_count])

        return [records[idx] for idx in chosen]
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return []

def process_all_files(report_files, total_chunks):
    """Reduce every report file and write the results under OUTPUT_DIR.

    A global reduction factor (TARGET_CHUNKS / total_chunks) is applied to
    each file, scaled down for older filings within a company: the two
    newest filings (by 'filing_number') use the full factor, the next two
    use 0.8x and all older ones 0.6x. Each company's files are processed in
    a process pool and written to
    OUTPUT_DIR/<company>/<original name>_cleaned.json.

    Args:
        report_files: List of report dicts as produced by
            collect_report_info ('path', 'company', 'filename', 'year',
            'filing_number', ...).
        total_chunks: Total number of chunks across all reports.

    Returns:
        int: Number of chunks written across all output files. 0 when
        total_chunks is not positive (nothing is processed).
    """
    # Nothing to reduce; this also avoids dividing by zero below.
    if total_chunks <= 0:
        print("No chunks to process.")
        return 0

    # Calculate global reduction factor
    global_reduction_factor = TARGET_CHUNKS / total_chunks
    print(f"Global reduction factor: {global_reduction_factor:.4f}")

    total_chunks_kept = 0

    # Group files by company for better organization
    companies = {}
    for report in report_files:
        companies.setdefault(report['company'], []).append(report)

    # Process each company's files
    print(f"Processing {len(companies)} companies...")
    for company_name, company_reports in tqdm.tqdm(companies.items()):
        company_dir = os.path.join(OUTPUT_DIR, company_name)
        os.makedirs(company_dir, exist_ok=True)

        # Sort reports by filing number (latest reports first) so the list
        # position can serve as the report's "age" below.
        company_reports.sort(key=lambda x: x['filing_number'], reverse=True)

        # Process files in parallel. A fresh pool per company means a
        # crashed worker only affects that company's files.
        with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {}

            for i, report in enumerate(company_reports):
                # Keep more chunks from recent reports, fewer from older ones
                age_factor = 1.0 if i < 2 else (0.8 if i < 4 else 0.6)
                adjusted_factor = global_reduction_factor * age_factor

                future = executor.submit(process_file, report['path'], adjusted_factor)
                futures[future] = report

            # Collect results as they finish (not in submission order)
            for future in as_completed(futures):
                report = futures[future]
                try:
                    selected_chunks = future.result()

                    # An empty result (failed or empty file) produces no output file.
                    if selected_chunks:
                        # Save to new file
                        output_path = os.path.join(company_dir, f"{os.path.splitext(report['filename'])[0]}_cleaned.json")
                        with open(output_path, 'w') as f:
                            json.dump(selected_chunks, f)

                        total_chunks_kept += len(selected_chunks)
                        print(f"Kept {len(selected_chunks)} chunks from {report['filename']} ({report['company']}, {report['year']})")
                except Exception as e:
                    print(f"Error processing {report['path']}: {e}")

    print(f"Total chunks kept: {total_chunks_kept}")
    return total_chunks_kept

# Main execution
def main():
    """Run the full reduction pipeline and write a summary.

    Steps: scan the input reports, reduce them, then print the totals and
    save them to OUTPUT_DIR/reduction_summary.json. If no chunks are found,
    a message is printed and no reduction or summary is produced.
    """
    start_time = time.time()

    # Step 1: Collect information about all report files
    report_files, total_chunks = collect_report_info()

    # Without any chunks there is nothing to reduce, and the reduction
    # percentage below would divide by zero.
    if total_chunks <= 0:
        print("No chunks found; nothing to reduce.")
        return

    # Step 2: Process and reduce all files
    total_chunks_kept = process_all_files(report_files, total_chunks)

    # Step 3: Generate summary report
    reduction_percent = (1 - (total_chunks_kept / total_chunks)) * 100
    # Measure once so the printed and the saved durations agree.
    elapsed_minutes = (time.time() - start_time) / 60

    print("\n===== Summary Report =====")
    print(f"Original chunks: {total_chunks}")
    print(f"Chunks after reduction: {total_chunks_kept}")
    print(f"Reduction: {reduction_percent:.2f}%")
    print(f"Processing time: {elapsed_minutes:.2f} minutes")

    # Create summary file
    summary = {
        "original_chunks": total_chunks,
        "reduced_chunks": total_chunks_kept,
        "reduction_percent": reduction_percent,
        "processing_time_minutes": elapsed_minutes
    }

    summary_path = os.path.join(OUTPUT_DIR, "reduction_summary.json")
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2)

    print(f"Summary saved to {summary_path}")

if __name__ == "__main__":
    main()

Mounted at /content/drive
Scanning directories for 10-K reports...


100%|██████████| 11/11 [01:05<00:00,  5.91s/it]


Found 11 report files with a total of 2520606 chunks
Global reduction factor: 0.0198
Processing 11 companies...


  9%|▉         | 1/11 [00:49<08:10, 49.00s/it]

Kept 3489 chunks from aapl_10k_rag.json (AAPL, 2024)


 18%|█▊        | 2/11 [01:36<07:15, 48.36s/it]

Kept 3327 chunks from amzn_10k_rag.json (AMZN, 2024)


 27%|██▋       | 3/11 [02:34<07:00, 52.55s/it]

Kept 4141 chunks from googl_10k_rag.json (GOOGL, 2024)


 36%|███▋      | 4/11 [05:10<10:53, 93.30s/it]

Kept 10985 chunks from jpm_10k_rag.json (JPM, 2024)


 45%|████▌     | 5/11 [05:56<07:37, 76.23s/it]

Kept 3215 chunks from meta_10k_rag.json (META, 2024)


 55%|█████▍    | 6/11 [06:45<05:35, 67.18s/it]

Kept 3417 chunks from msft_10k_rag.json (MSFT, 2024)


 64%|██████▎   | 7/11 [07:44<04:17, 64.30s/it]

Kept 4148 chunks from nvda_10k_rag.json (NVDA, 2025)


 73%|███████▎  | 8/11 [08:39<03:04, 61.49s/it]

Kept 3864 chunks from rtx_10k_rag.json (RTX, 2019)


 82%|████████▏ | 9/11 [09:45<02:05, 62.80s/it]

Kept 4605 chunks from tsla_10k_rag.json (TSLA, 2024)


 91%|█████████ | 10/11 [11:00<01:06, 66.65s/it]

Kept 5380 chunks from v_10k_rag.json (V, 2024)


100%|██████████| 11/11 [11:49<00:00, 64.48s/it]

Kept 3425 chunks from wmt_10k_rag.json (WMT, 2025)
Total chunks kept: 49996

===== Summary Report =====
Original chunks: 2520606
Chunks after reduction: 49996
Reduction: 98.02%
Processing time: 12.91 minutes
Summary saved to /content/drive/MyDrive/MSDS490_Project/10-K_Cleaned/reduction_summary.json



