In [None]:
import os
import pandas as pd
import re
from PyPDF2 import PdfReader
from concurrent.futures import ThreadPoolExecutor

# Convert date string to quarter format
def date_to_quarter(date_str):
    """Turn a ``YYYY-MM-DD`` string into a ``YYYYQn`` quarter label.

    Args:
        date_str: Dash-separated date string with exactly three parts
            (year, month, day). A falsy value (``None`` or ``""``) is allowed.

    Returns:
        The year part followed by ``Q1``..``Q4``, or ``None`` when
        ``date_str`` is falsy.

    Raises:
        ValueError: If the string does not split into exactly three parts or
            the month part is not an integer.
    """
    if not date_str:
        return None

    year, month_text, _day = date_str.split('-')

    # Months 1-9 are listed explicitly; every other month number (10-12, but
    # also anything out of range) falls through to the "Q4" default.
    quarter_by_month = {
        1: "Q1", 2: "Q1", 3: "Q1",
        4: "Q2", 5: "Q2", 6: "Q2",
        7: "Q3", 8: "Q3", 9: "Q3",
    }
    label = quarter_by_month.get(int(month_text), "Q4")
    return f"{year}{label}"

# Extract MD&A and Risk Factors sections based on Item headings
def extract_sections(text):
    """Extract the Risk Factors and MD&A sections from flattened filing text.

    The section headings normally appear more than once in the text: first in
    the table of contents (where the "body" is just a page number) and later
    as the real section heading. Every occurrence is therefore examined and
    the longest captured body is kept, instead of the first one.

    Args:
        text: Full text of the filing, as extracted from the PDF.

    Returns:
        dict with two keys:
            "Risk Factors": text between an "Item 1A. ... Risk Factors"
                heading and the following "Item 2." (or end of text).
            "MD&A": text between an "Item 2. ... Management's Discussion"
                heading and the following "Item 3." (or end of text).
        A value is "" when its heading is not found.
    """
    # Flatten newlines so headings split across extracted lines still match.
    flat_text = text.replace("\n", " ")
    flags = re.IGNORECASE | re.DOTALL

    def longest_body(pattern):
        # Collect the captured body of every heading occurrence and keep the
        # longest; table-of-contents hits only capture a page number.
        bodies = (m.group(1).strip() for m in re.finditer(pattern, flat_text, flags))
        return max(bodies, key=len, default="")

    sections = {}

    # Risk Factors: between Item 1A and Item 2
    sections["Risk Factors"] = longest_body(
        r"Item\s*1A\..*?Risk\s+Factors(.*?)(?=Item\s*2\.|$)"
    )

    # MD&A: between Item 2 and Item 3. Accept both the typographic (’) and the
    # straight (') apostrophe, since PDF text extraction may yield either.
    sections["MD&A"] = longest_body(
        r"Item\s*2\..*?Management[’']s\s+Discussion(.*?)(?=Item\s*3\.|$)"
    )

    return sections

# Process a single PDF file
def process_pdf(company, file_path, filename):
    """Read one PDF and build a record with its quarter and extracted sections.

    Args:
        company: Label stored unchanged under the "company" key.
        file_path: Path of the PDF handed to ``PdfReader``.
        filename: File name; a leading ``YYYY-MM-DD`` prefix is converted to a
            quarter label via ``date_to_quarter``.

    Returns:
        dict with keys "company", "quarter", "MD&A" and "Risk Factors".
        "quarter" is ``None`` when the file name has no leading date. Both
        section values are "" when reading or parsing the PDF raised.
    """
    # The quarter comes from the date prefix of the file name, not the PDF.
    date_prefix = re.match(r"(\d{4}-\d{2}-\d{2})", filename)
    quarter = None if date_prefix is None else date_to_quarter(date_prefix.group(1))

    try:
        page_texts = (page.extract_text() for page in PdfReader(file_path).pages)
        # Pages with no extracted text are skipped; each kept page is followed
        # by a single space.
        full_text = "".join(chunk + " " for chunk in page_texts if chunk)
        sections_text = extract_sections(full_text)
    except Exception as e:
        # Best effort: report the failure and still emit a row with empty sections.
        print(f"Error reading {file_path}: {e}")
        sections_text = {"MD&A": "", "Risk Factors": ""}

    record = {"company": company, "quarter": quarter}
    record["MD&A"] = sections_text["MD&A"]
    record["Risk Factors"] = sections_text["Risk Factors"]
    return record

# Collect all PDF file paths: every sub-directory of the working directory is
# treated as one company, and only PDFs with "10-q" in their name are kept.
tasks = []
base_path = os.getcwd()
for company in os.listdir(base_path):
    company_path = os.path.join(base_path, company)
    if not os.path.isdir(company_path):
        continue
    for filename in os.listdir(company_path):
        lowered = filename.lower()
        if lowered.endswith(".pdf") and "10-q" in lowered:
            tasks.append((company, os.path.join(company_path, filename), filename))

# Process PDFs in parallel; executor.map yields results in the order of `tasks`.
data = []
with ThreadPoolExecutor(max_workers=8) as executor:
    results = executor.map(lambda args: process_pdf(*args), tasks)
    data.extend(results)

# Create DataFrame
df = pd.DataFrame(data, columns=["company", "quarter", "MD&A", "Risk Factors"])

# Sort by company and quarter. "YYYYQn" labels order chronologically as plain
# strings, so no helper year/quarter-number columns are needed. Sorting on the
# label directly also avoids the crash that casting to int caused when a file
# name had no leading date (quarter is None); such rows are placed last.
df = df.sort_values(by=["company", "quarter"], na_position="last").reset_index(drop=True)

# Save CSV
df.to_csv("10Q_sections_Item1A_Item2_clean.csv", index=False)

print("DataFrame created and saved. Total rows:", len(df))


DataFrame created and saved. Total rows: 153


In [None]:
# Row count of `df`: one row per 10-Q PDF collected in `tasks`.
len(df)

Unnamed: 0,company,quarter,MD&A,Risk Factors
0,AAPL,2017Q1,,33
1,AAPL,2017Q2,,33
2,AAPL,2017Q3,,34
3,AAPL,2018Q1,,33
4,AAPL,2018Q2,,35
...,...,...,...,...
148,WMT,2022Q4,,32
149,WMT,2023Q2,,In addition to the other information set forth...
150,WMT,2023Q3,,In addition to the other information set forth...
151,WMT,2023Q4,,In addition to the other information set forth...
