In [1]:
import os
import json
from typing import List, Dict, Tuple
import spacy
from bs4 import BeautifulSoup
import PyPDF2
from openai import OpenAI, Client
from dataclasses import dataclass
from pathlib import Path

In [2]:
@dataclass
class ChunkConfig:
    max_chunk_size: int = 1500  # Default chunk size
    min_chunk_size: int = 500   # Minimum chunk size to consider
    overlap_size: int = 200     # Number of characters to overlap between chunks

In [3]:
# Main DocumentProcessor class
class DocumentProcessor:
    def __init__(self, client: Client, config: ChunkConfig):
        self.nlp = spacy.load('en_core_web_sm',disable=['tagger','ner'])
        self.nlp.max_length = 10000000
        self.client = client
        self.config = config
        
    def process_pdf(self, pdf_path: str) -> List[str]:
        """Extract text from PDF and split into chunks."""
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text() + "\n"
        return self._chunk_text(text)

    def process_html(self, html_path: str) -> List[str]:
        """Extract text from HTML and split into chunks."""
        with open(html_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.decompose()
            text = soup.get_text()
            # Normalize whitespace
            lines = (line.strip() for line in text.splitlines())
            text = ' '.join(line for line in lines if line)
        return self._chunk_text(text)

    def _chunk_text(self, text: str) -> List[str]:
        """Split text into chunks based on sentence boundaries."""
        doc = self.nlp(text)
        chunks = []
        current_chunk = []
        current_length = 0
        
        for sent in doc.sents:
            sent_text = sent.text.strip()
            sent_length = len(sent_text)
            
            if current_length + sent_length > self.config.max_chunk_size and current_length >= self.config.min_chunk_size:
                # Store current chunk if it's long enough
                chunks.append(' '.join(current_chunk))
                # Start new chunk with overlap from previous chunk
                overlap_text = ' '.join(current_chunk[-3:])  # Keep last 3 sentences for context
                current_chunk = [overlap_text, sent_text]
                current_length = len(overlap_text) + sent_length
            else:
                current_chunk.append(sent_text)
                current_length += sent_length
        
        # Add the last chunk if it's long enough
        if current_length >= self.config.min_chunk_size:
            chunks.append(' '.join(current_chunk))
            
        return chunks

    def generate_conversation(self, chunk: str) -> Dict:
        """Generate a conversation-style QA pair using GPT-4."""
        # First pass: Check if content is suitable for conversation
        validation_prompt = f"""Analyze this text and determine if it contains meaningful Warren Buffett insights, commentary, or narrative content.

Approve the text only if:
- It discusses business philosophy or investment thinking that applies across industries and time.
- It provides views on markets, financial practices, or economic principles that are broadly applicable.
- Buffett shares personal reflections or general lessons learned that are useful beyond a single event.

Reject the text if:
- It primarily describes a specific investment, acquisition, deal, or financial transaction.
- It focuses on a single company's business decision without a clearly stated general principle.
- It discusses short-term market conditions, quarterly earnings, or economic events without broader insights.
- It contains only financial data, figures, or statistics without meaningful explanation.
- Buffett does not explicitly state a broad lesson. The text must include a clear, stated takeaway that can apply to other cases.

If the text mentions a business decision, investment, or deal, it must contain a stated general principle that applies beyond that case. If there is no such general lesson, reject it.

Text: {chunk}

Return only "yes" if the text contains meaningful, wide-scope content, or "no" otherwise.

DO NOT RETURN ANY OTHER TEXT APART FROM yes or NO!!!
"""
        
        validation_response = self.client.chat.completions.create(
            model="/model",
            messages=[{"role": "user", "content": validation_prompt}],
            max_tokens=100,
            temperature=0
        )
        
        if validation_response.choices[0].message.content.strip().lower() != "yes":
            return None

        # Second pass: Generate conversation if content is suitable
        conversation_prompt = f"""Below is a text excerpt from me (Warren Buffett). Generate 1-2 questions that could be asked about this specific content, but ONLY if the text contains clear, direct information to answer them. Then provide my answers in first person, as if I am directly responding to these questions. Use my communication style—plain-spoken, using analogies when helpful, and occasionally humorous.

Text: {chunk}

Important guidelines:
- Only generate questions about topics that are explicitly discussed in this text excerpt
- Write answers in first person ("I think...", "In my experience...", "At Berkshire, we...")
- Use my direct, plain-spoken style
- Keep answers focused on what's actually in the text
- Return as a JSON string in ShareGPT format:
[{{
    "conversations": [
        {{"role": "human", "content": "question here"}},
        {{"role": "assistant", "content": "answer here"}},
        {{"role": "human", "content": "second question"}},
        {{"role": "assistant", "content": "second answer"}}
    ]
}}]
"""
        try:
            conversation_response = self.client.chat.completions.create(
                model="/model",
                messages=[{"role": "user", "content": conversation_prompt}],
                max_tokens=1000,
                temperature=0.2
            )
            
            # Get the response content and parse it as JSON
            response_text = conversation_response.choices[0].message.content
            conversation_data = json.loads(response_text)
            return conversation_data
        except (json.JSONDecodeError, AttributeError, IndexError) as e:
            print(f"Error parsing response for chunk: {chunk[:100]}...")
            print(f"Error details: {str(e)}")
            return None

def process_directory(input_dir: str, output_dir: str, client: Client, config: ChunkConfig):
    """Process all PDF and HTML files in a directory and generate training data."""
    processor = DocumentProcessor(client, config)
    
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    for file_path in Path(input_dir).rglob('*'):
        if file_path.suffix.lower() in ['.pdf', '.html']:
            print(f"Processing {file_path}")
            output_file = Path(output_dir) / (file_path.stem + '.json')
            
            chunks = (processor.process_pdf(str(file_path)) if file_path.suffix.lower() == '.pdf' 
                     else processor.process_html(str(file_path)))
            
            file_conversations = []
            for chunk in chunks:
                conversation = processor.generate_conversation(chunk)
                if conversation:
                    file_conversations.append(conversation)
            
            # Save conversations for this file
            if file_conversations:
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump({"conversations": file_conversations}, f, indent=2)

In [4]:
# Set up configuration
config = ChunkConfig(
    max_chunk_size=1500,
    min_chunk_size=500,
    overlap_size=200
)

In [5]:
# Initialize Client
client = OpenAI(
    api_key="EMPTY",
    base_url="http://82.150.117.181:8000/v1",
)

In [10]:
# Test with a single PDF file
processor = DocumentProcessor(client, config)
test_pdf_path = "Dataset/Unprocessed/Shareholder Letters/2004ltr.pdf"
chunks = processor.process_pdf(test_pdf_path)
print(f"Generated {len(chunks)} chunks from the PDF")

Generated 749 chunks from the PDF


In [18]:
chunks[103]

'288 261 \nHom eServices .....................................................................................................  130 113 \nOthe r (net)..........................................................................................................  172 190 \nLoss from zinc project ........................................................................................       (579)        (46) Earni ngs before corporate i nterest and ta xes......................................................  605 1,076 \nIntere st, other  than to Berkshi re.........................................................................  (212) (225) Intere st on Be rkshire junior debt ........................................................................  (170) (184) \nIncome tax..........................................................................................................         (53)      (251) Net ear nings................................................................................

In [10]:
if chunks:
    conversation = processor.generate_conversation(chunks[16])
    print("Sample conversation:")
    print(json.dumps(conversation, indent=2))

Sample conversation:
null


In [11]:
# Process entire directory
input_directory = "Dataset/Unprocessed/Shareholder Letters/"
output_directory = "Dataset/Processed/Shareholder Letters/"
process_directory(input_directory, output_directory, client, config)

Processing Dataset/Unprocessed/Shareholder Letters/2001pdf.pdf
Processing Dataset/Unprocessed/Shareholder Letters/2002pdf.pdf
Processing Dataset/Unprocessed/Shareholder Letters/2003ltr.pdf
Processing Dataset/Unprocessed/Shareholder Letters/2004ltr.pdf
Processing Dataset/Unprocessed/Shareholder Letters/2005ltr.pdf
Processing Dataset/Unprocessed/Shareholder Letters/2006ltr.pdf
Processing Dataset/Unprocessed/Shareholder Letters/2007ltr.pdf
Error parsing response for chunk: 2 Berkshire’s Corporate Performance vs. the S&P 500 
   Annual Percentage Change  
  in Per-Share in...
Error details: Expecting ',' delimiter: line 6 column 591 (char 1745)
Processing Dataset/Unprocessed/Shareholder Letters/2008ltr.pdf
Processing Dataset/Unprocessed/Shareholder Letters/2009ltr.pdf
Error parsing response for chunk: 2 5 1 8 6
Operating earnings before corporate interest and taxes ........................... 1,846 2...
Error details: Expecting ',' delimiter: line 6 column 586 (char 1365)
Processing Datase