In [None]:
import os
import re
import json
from typing import List, Dict
import pdfplumber
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

# Download the 'punkt' and 'stopwords' NLTK resources.
# NOTE(review): no cell visible in this notebook uses `nltk` or `stopwords`
# after this point (clean_text does not filter stopwords) -- confirm whether
# these downloads are still needed before keeping them on every run.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Reads every page of a PDF with pdfplumber and returns the combined text.

    Each page that yields text contributes that text followed by a newline;
    pages for which ``extract_text()`` returns nothing are skipped. Any
    exception raised while opening or reading the file is reported with
    ``print`` and the text gathered before the failure is returned (an empty
    string when nothing was gathered).

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Extracted text.
    """
    collected = []
    try:
        with pdfplumber.open(pdf_path) as document:
            for current_page in document.pages:
                content = current_page.extract_text()
                if not content:
                    # Nothing extractable on this page; move on.
                    continue
                collected.append(content + "\n")
    except Exception as e:
        # Best effort: report the problem and keep whatever was read so far.
        print(f"Error reading {pdf_path}: {e}")
    return "".join(collected)

def clean_text(text: str) -> str:
    """
    Cleans extracted text by normalizing whitespace and removing page-number
    markers, non-ASCII characters, and unsupported special characters.

    Stopwords are not removed. Only letters, digits, spaces and the
    punctuation ``. , ; : ? ! ( ) [ ]`` survive the character filter.

    NOTE: a later cell in this notebook defines ``clean_text`` again; when the
    cells run in order, that later definition is the one callers get.

    Args:
        text (str): Raw extracted text.

    Returns:
        str: Cleaned text with single spaces between tokens and no leading or
        trailing whitespace.
    """
    # Collapse every whitespace run (line breaks, tabs, ...) into one space
    # first: the character filter below keeps only the plain space, so without
    # this step words separated by a newline would be glued together.
    text = re.sub(r'\s+', ' ', text)

    # Remove headers and footers (simple heuristic based on page numbers)
    text = re.sub(r'\bPage\s+\d+\b', '', text, flags=re.IGNORECASE)

    # Remove non-ASCII characters
    text = text.encode('ascii', errors='ignore').decode()

    # Remove special characters except for basic punctuation
    text = re.sub(r'[^A-Za-z0-9.,;:?!()\[\] ]+', '', text)

    # The removals above can leave two or more adjacent spaces behind
    # (e.g. "Intro Page 3 Next" -> "Intro  Next"); collapse them again.
    text = re.sub(r' {2,}', ' ', text)

    return text.strip()

In [21]:
def clean_text(text: str) -> str:
    """
    Cleans extracted text by normalizing whitespace and removing page-number
    markers, non-ASCII characters, and unsupported special characters.

    Stopwords are not removed. Only letters, digits, spaces and the
    punctuation ``. , ; : ? ! ( ) [ ]`` survive the character filter.

    NOTE: an earlier cell in this notebook also defines ``clean_text``; this
    definition replaces it when the cells run in order.

    Args:
        text (str): Raw extracted text.

    Returns:
        str: Cleaned text with single spaces between tokens and no leading or
        trailing whitespace.
    """
    # Collapse every whitespace run (line breaks, tabs, ...) into one space
    # first: the character filter below keeps only the plain space, so without
    # this step words separated by a newline would be glued together.
    text = re.sub(r'\s+', ' ', text)

    # Remove headers and footers (simple heuristic based on page numbers)
    text = re.sub(r'\bPage\s+\d+\b', '', text, flags=re.IGNORECASE)

    # Remove non-ASCII characters
    text = text.encode('ascii', errors='ignore').decode()

    # Remove special characters except for basic punctuation
    text = re.sub(r'[^A-Za-z0-9.,;:?!()\[\] ]+', '', text)

    # The removals above can leave two or more adjacent spaces behind
    # (e.g. "Intro Page 3 Next" -> "Intro  Next"); collapse them again.
    text = re.sub(r' {2,}', ' ', text)

    return text.strip()

def get_company_files(companies: List[str], dir_path: str, file_extension: str = '.pdf') -> Dict[str, str]:
    """
    Retrieves file paths for each company.

    A file matches a company when its name ends with ``file_extension`` and
    contains the company identifier; both comparisons ignore case. The
    directory is listed once and scanned in sorted order, so when several
    files match the same company the alphabetically first name is chosen on
    every run. Companies without a match are reported with a printed warning
    and left out of the result.

    Note that matching is by substring, so a short identifier can match an
    unrelated file whose name happens to contain it.

    Args:
        companies (List[str]): List of company identifiers.
        dir_path (str): Directory path where the files are stored.
        file_extension (str): File extension to look for.

    Returns:
        Dict[str, str]: Mapping from company to file path.

    Raises:
        OSError: If ``dir_path`` cannot be listed (only when ``companies`` is
            non-empty, as the directory is not touched otherwise).
    """
    company_files = {}
    companies = list(companies)
    if not companies:
        # Nothing to look up; do not touch the directory at all.
        return company_files

    extension = file_extension.lower()
    # os.listdir returns entries in arbitrary order; sort so the chosen file
    # is deterministic, and pre-lowercase names once instead of per company.
    candidates = [
        (name.lower(), name)
        for name in sorted(os.listdir(dir_path))
        if name.lower().endswith(extension)
    ]

    for company in companies:
        needle = company.lower()
        for lowered_name, name in candidates:
            if needle in lowered_name:
                company_files[company] = os.path.join(dir_path, name)
                break
        else:
            print(f"Warning: No file found for {company} in {dir_path}")
    return company_files

def split_into_chunks(text: str, max_length: int) -> List[str]:
    """
    Splits text into consecutive chunks of at most max_length characters.

    Chunks are cut at fixed character offsets, so a chunk boundary may fall in
    the middle of a word. Every chunk except possibly the last has exactly
    ``max_length`` characters; an empty ``text`` yields an empty list.

    Args:
        text (str): The text to split.
        max_length (int): Maximum number of characters per chunk; must be
            greater than zero.

    Returns:
        List[str]: List of text chunks.

    Raises:
        ValueError: If ``max_length`` is not positive. Without this check a
            negative value would silently produce no chunks at all.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be a positive integer, got {max_length}")
    return [text[start:start + max_length] for start in range(0, len(text), max_length)]

def _build_data_point(company: str, input_body: str, output_body: str) -> Dict[str, str]:
    """Builds one JSONL record pairing 10-K text with Morningstar report text."""
    return {
        'company': company,
        'input_text': f"Company: {company}\n[10-K Filing]\n{input_body}",
        'output_text': f"Company: {company}\n[Morningstar Report]\n{output_body}"
    }


def process_data(config: Dict) -> None:
    """
    Processes PDF files and generates a JSONL file for fine-tuning.

    For every company, the 10-K PDF and the Morningstar report PDF are located,
    their text is extracted and cleaned, and one or more JSON records are
    appended to the output file (one record per line). The output file is
    opened in write mode, so any existing content is replaced.

    A company is skipped, with a printed message, when either of its files is
    missing or when either document yields no text after cleaning (for
    example because extraction failed); this keeps empty training examples
    out of the output.

    Each record has three keys:
        company:     the company identifier
        input_text:  "Company: <id>", "[10-K Filing]" and the cleaned 10-K
                     text, separated by newlines
        output_text: "Company: <id>", "[Morningstar Report]" and the cleaned
                     report text, separated by newlines

    Args:
        config (Dict): Settings for the run. Required keys: 'companies',
            '10k_dir', 'reports_dir', 'output_file'. Optional keys:
            'enable_chunking' (defaults to False) and 'chunk_size' (required
            only when chunking is enabled).

    Returns:
        None
    """
    companies = config['companies']
    enable_chunking = config.get('enable_chunking', False)

    # Retrieve file mappings
    tenk_files = get_company_files(companies, config['10k_dir'])
    report_files = get_company_files(companies, config['reports_dir'])

    # Open output file
    with open(config['output_file'], 'w', encoding='utf-8') as outfile:
        for company in tqdm(companies, desc="Processing Companies"):
            tenk_path = tenk_files.get(company)
            report_path = report_files.get(company)

            if not tenk_path or not report_path:
                print(f"Skipping {company} due to missing files.")
                continue

            # Extract and clean both documents
            tenk_clean = clean_text(extract_text_from_pdf(tenk_path))
            report_clean = clean_text(extract_text_from_pdf(report_path))

            # extract_text_from_pdf returns "" when a PDF cannot be read;
            # do not emit a record with an empty input or output.
            if not tenk_clean or not report_clean:
                print(f"Skipping {company} due to empty extracted text.")
                continue

            if enable_chunking:
                chunk_size = config['chunk_size']
                tenk_chunks = split_into_chunks(tenk_clean, chunk_size)
                report_chunks = split_into_chunks(report_clean, chunk_size)

                # Chunks are paired by position (one-to-one mapping assumed).
                # zip stops at the shorter list, so say so instead of dropping
                # the remaining chunks silently.
                if len(tenk_chunks) != len(report_chunks):
                    print(
                        f"Warning: {company} has {len(tenk_chunks)} 10-K chunks but "
                        f"{len(report_chunks)} report chunks; unpaired chunks are dropped."
                    )

                for input_chunk, output_chunk in zip(tenk_chunks, report_chunks):
                    data_point = _build_data_point(company, input_chunk, output_chunk)
                    outfile.write(json.dumps(data_point) + '\n')
            else:
                # Use the full texts without chunking
                data_point = _build_data_point(company, tenk_clean, report_clean)
                outfile.write(json.dumps(data_point) + '\n')

    print(f"Data processing complete. Output saved to {config['output_file']}")


In [22]:
# Root data directory; every path in CONFIG is derived from it so that moving
# the data only requires changing this one value.
DATA_DIR = '../data'

# Settings consumed by process_data().
CONFIG = {
    'data_dir': DATA_DIR,  # Root directory containing the '10Ks' and 'reports' subdirectories
    '10k_dir': f'{DATA_DIR}/10Ks',  # Directory containing 10-K PDFs
    'reports_dir': f'{DATA_DIR}/reports',  # Directory containing Morningstar reports PDFs
    'output_file': f'{DATA_DIR}/processed_data.jsonl',  # Output file path (overwritten on each run)
    'chunk_size': 1000,  # Characters per chunk; only used when 'enable_chunking' is True
    'enable_chunking': False,
    # Company identifiers matched against file names in both directories.
    # For a quick single-company run, use ['axp'] instead.
    'companies': ['axp', 'bac', 'cb', 'cvx', 'itochu',
                  'khc', 'ko', 'mco', 'mitsubishi', 'oxy']
}

In [20]:
# Run the full pipeline with the settings in CONFIG. The output file is opened
# in write mode, so CONFIG['output_file'] is rebuilt from scratch on each run.
# The recorded run below took about 5m47s for the 10 configured companies.
process_data(CONFIG)

Processing Companies: 100%|██████████| 10/10 [05:47<00:00, 34.71s/it]

Data processing complete. Output saved to ../data/processed_data.jsonl



