In [43]:
import os
import json
from typing import List, Dict, Optional
import spacy
import lxml.html
import re
import fitz
from openai import OpenAI, Client
from dataclasses import dataclass
from pathlib import Path
from bs4 import BeautifulSoup

In [44]:
@dataclass
class Sentence:
    """One sentence of a document together with its cached character count.

    Attributes:
        text: The sentence text.
        length: The size recorded for ``text``; the chunker sums this field
            to decide when a chunk is full.
    """

    text: str
    length: int

@dataclass
class ChunkConfig:
    """Tunable limits that control how text is split into chunks.

    Attributes:
        max_chunk_size: Accumulated sentence length at which a chunk is
            considered full.
        min_chunk_size: Smallest accumulated length a chunk must reach to
            be kept.
        overlap_sentences: How many trailing sentences of one chunk are
            repeated at the start of the next.
    """

    max_chunk_size: int = 1500
    min_chunk_size: int = 500
    overlap_sentences: int = 2
    
class DocumentProcessor:
    """Split PDF/TXT documents into sentence-aligned chunks and turn chunks
    into question/answer conversations through a chat-completions client.
    """

    def __init__(self, client: "Client", config: "ChunkConfig"):
        """Load the spaCy pipeline and store the client and chunking config.

        Args:
            client: Object exposing ``chat.completions.create``; used by
                ``generate_conversation``.
            config: Chunking limits (``max_chunk_size``, ``min_chunk_size``,
                ``overlap_sentences``).
        """
        self.nlp = spacy.load('en_core_web_sm', disable=['tagger', 'ner'])
        # Whole documents are parsed in a single call (see _chunk_text), so
        # raise the pipeline's input-length limit.
        self.nlp.max_length = 10000000
        self.client = client
        self.config = config

    def preprocess_text(self, text: str) -> str:
        """Collapse every run of whitespace (including newlines) to one space.

        Args:
            text: Raw extracted text.

        Returns:
            The text with leading/trailing whitespace removed and all inner
            whitespace runs replaced by a single space.
        """
        # str.split() with no argument splits on any whitespace, so newlines
        # are already normalized here; no separate replace step is needed.
        return ' '.join(text.split())

    def process_pdf(self, pdf_path: str) -> List[str]:
        """Extract the text of every page of a PDF and split it into chunks.

        Errors raised while opening or reading the PDF are not caught here
        and propagate to the caller.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The list of text chunks produced by ``_chunk_text``.
        """
        with fitz.open(pdf_path) as doc:
            # Join once instead of growing a string page by page.
            text = "\n".join(page.get_text() for page in doc)

        clean_text = self.preprocess_text(text)
        return self._chunk_text(clean_text)

    def process_txt(self, txt_path: str) -> List[str]:
        """Read a UTF-8 text file and split it into chunks.

        Args:
            txt_path: Path of the text file to read.

        Returns:
            The list of text chunks, or an empty list if anything fails
            (the error is printed rather than raised).
        """
        try:
            with open(txt_path, 'r', encoding='utf-8') as file:
                text = file.read()

            clean_text = self.preprocess_text(text)
            return self._chunk_text(clean_text)

        except Exception as e:
            # Best-effort: report the failure and let the caller continue.
            print(f"Error processing text file {txt_path}: {str(e)}")
            return []

    def create_sentence_objects(self, doc) -> "List[Sentence]":
        """Convert a spaCy doc into ``Sentence`` objects, skipping blank ones.

        Args:
            doc: Parsed document exposing ``.sents``.

        Returns:
            One ``Sentence`` per non-empty sentence, with ``length`` set to
            the character count of the stripped text.
        """
        stripped_texts = (sent.text.strip() for sent in doc.sents)
        return [Sentence(text=t, length=len(t)) for t in stripped_texts if t]

    def _chunk_sentences(self, sentences: "List[Sentence]") -> "List[List[Sentence]]":
        """Group sentences into chunks, repeating trailing sentences as overlap.

        A chunk is closed once its accumulated length reaches
        ``max_chunk_size`` (and ``min_chunk_size``) and it holds more
        sentences than the overlap. The next chunk starts with the last
        ``overlap_sentences`` sentences of the closed one.

        Args:
            sentences: Sentences in document order; only ``.length`` is read.

        Returns:
            A list of chunks, each a list of sentences. A trailing remainder
            shorter than ``min_chunk_size`` is dropped, as is a remainder
            that contains nothing but carried-over overlap.
        """
        max_size = self.config.max_chunk_size
        min_size = self.config.min_chunk_size
        # A negative overlap has no sensible meaning; treat it as "none".
        overlap = max(self.config.overlap_sentences, 0)

        chunks = []
        current_chunk = []
        current_length = 0
        # Sentences appended since the last flush, i.e. content that is not
        # merely overlap copied from the previous chunk.
        new_since_flush = 0

        for sentence in sentences:
            current_chunk.append(sentence)
            current_length += sentence.length
            new_since_flush += 1

            is_full = current_length >= max_size and current_length >= min_size
            if is_full and len(current_chunk) > overlap:
                chunks.append(current_chunk)

                # NOTE: lst[-0:] is the whole list, so zero overlap must be
                # handled explicitly or the chunk would never reset.
                current_chunk = current_chunk[-overlap:] if overlap else []
                current_length = sum(s.length for s in current_chunk)
                new_since_flush = 0

        # Keep the remainder only if it adds new text and is large enough;
        # otherwise it would duplicate the tail of the previous chunk.
        if new_since_flush and current_length >= min_size:
            chunks.append(current_chunk)

        return chunks

    def _chunk_text(self, text: str) -> List[str]:
        """Split text into chunk strings along sentence boundaries.

        Args:
            text: Preprocessed text to segment.

        Returns:
            One string per chunk, with the chunk's sentences joined by a
            single space.
        """
        doc = self.nlp(text)
        sentences = self.create_sentence_objects(doc)
        sentence_chunks = self._chunk_sentences(sentences)
        return [
            ' '.join(sentence.text for sentence in chunk)
            for chunk in sentence_chunks
        ]

    @staticmethod
    def _is_affirmative(reply) -> bool:
        """Return True if a validation reply is a bare "yes".

        Surrounding whitespace, quotes, backticks and closing punctuation are
        ignored so that replies such as ``"Yes."`` are accepted; anything
        longer (for example "yes, because ...") is still rejected.
        """
        if not reply:
            return False
        return reply.strip().strip("\"'`.! ").lower() == "yes"

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove a surrounding Markdown code fence (``` or ```json) if present.

        Text without a leading fence is returned unchanged apart from
        surrounding whitespace.
        """
        body = text.strip()
        if body.startswith("```"):
            # Drop the opening fence line, including any language tag.
            first_newline = body.find("\n")
            body = body[first_newline + 1:] if first_newline != -1 else body[3:]
            body = body.rstrip()
            if body.endswith("```"):
                body = body[:-3]
        return body.strip()

    def generate_conversation(self, chunk: str) -> Optional[List[Dict]]:
        """Validate a chunk with the model and, if approved, generate Q&A.

        Two chat-completion requests are made: the first asks for a yes/no
        verdict on whether the chunk is worth using, the second asks for a
        ShareGPT-style JSON conversation about it.

        Args:
            chunk: Text excerpt to evaluate.

        Returns:
            The decoded JSON reply (a list holding one ``{"conversations":
            [...]}`` object when the model follows the requested format), or
            ``None`` if the chunk is rejected or any step fails. Failures
            are printed, not raised.
        """
        validation_prompt = f"""Analyze this text and determine if it contains meaningful Warren Buffett insights, commentary, or narrative content.

Approve the text only if:
- It discusses business philosophy or investment thinking that applies across industries and time.
- It provides views on markets, financial practices, or economic principles that are broadly applicable.
- Buffett shares personal reflections or general lessons learned that are useful beyond a single event.

Reject the text if:
- It primarily describes a specific investment, acquisition, deal, or financial transaction.
- It focuses on a single company's business decision without a clearly stated general principle.
- It discusses short-term market conditions, quarterly earnings, or economic events without broader insights.
- It contains only financial data, figures, or statistics without meaningful explanation.
- Buffett does not explicitly state a broad lesson. The text must include a clear, stated takeaway that can apply to other cases.

Text: {chunk}

Return only "yes" if the text contains meaningful, wide-scope content, or "no" otherwise.
"""
        try:
            validation_response = self.client.chat.completions.create(
                model="/model",
                messages=[{"role": "user", "content": validation_prompt}],
                max_tokens=100,
                temperature=0
            )

            verdict = validation_response.choices[0].message.content
            if not self._is_affirmative(verdict):
                return None

            conversation_prompt = f"""Below is a text excerpt from me (Warren Buffett). Generate 1-2 questions that could be asked about this specific content, but ONLY if the text contains clear, direct information to answer them. Then provide my answers in first person, as if I am directly responding to these questions. Use my communication style—plain-spoken, using analogies when helpful, and occasionally humorous.

Text: {chunk}

Important guidelines:
- Only generate questions about topics that are explicitly discussed in this text excerpt
- Write answers in first person
- Use my direct, plain-spoken style, occasional humor and occasional metaphor
- Keep answers focused on what's actually in the text
- Return as a JSON string in ShareGPT format:
[{{"conversations": [
    {{"role": "human", "content": "question here"}},
    {{"role": "assistant", "content": "answer here"}},
    {{"role": "human", "content": "second question"}},
    {{"role": "assistant", "content": "second answer"}}
]}}]"""

            conversation_response = self.client.chat.completions.create(
                model="/model",
                messages=[{"role": "user", "content": conversation_prompt}],
                max_tokens=1000,
                temperature=0.2
            )

            response_text = conversation_response.choices[0].message.content
            # Chat models often wrap JSON in a Markdown fence; unwrap it so
            # an otherwise valid reply is not discarded.
            return json.loads(self._strip_code_fence(response_text))

        except Exception as e:
            # Best-effort per chunk: report and skip so a batch can continue.
            print(f"Error processing chunk: {chunk[:100]}...")
            print(f"Error details: {str(e)}")
            return None

def process_directory(input_dir: str, output_dir: str, client: Client, config: ChunkConfig):
    """Process all PDF and TXT files in a directory and generate training data.

    Every ``.pdf``/``.txt`` file found recursively under ``input_dir`` is
    chunked, each chunk is passed to ``generate_conversation``, and the
    resulting conversations are written to ``<output_dir>/<file stem>.json``.
    Files that yield no conversations produce no output file.

    Args:
        input_dir: Root directory to scan.
        output_dir: Directory that receives the JSON files; created if
            missing.
        client: Chat-completions client handed to ``DocumentProcessor``.
        config: Chunking configuration handed to ``DocumentProcessor``.
    """
    processor = DocumentProcessor(client, config)

    input_root = Path(input_dir)
    output_root = Path(output_dir)
    output_root.mkdir(parents=True, exist_ok=True)

    for file_path in input_root.rglob('*'):
        # Judge "hidden" only by components below input_dir. Looking at the
        # full path would skip everything when input_dir itself contains a
        # dot-prefixed component such as '..' or '.cache'.
        relative_parts = file_path.relative_to(input_root).parts
        if any(part.startswith('.') for part in relative_parts):
            continue

        suffix = file_path.suffix.lower()
        if suffix not in ('.pdf', '.txt'):
            continue
        # A directory may carry a matching suffix; only real files qualify.
        if not file_path.is_file():
            continue

        print(f"Processing {file_path}")
        # NOTE: outputs are flat, so two inputs with the same stem write to
        # the same JSON file and the later one wins.
        output_file = output_root / (file_path.stem + '.json')

        try:
            if suffix == '.pdf':
                chunks = processor.process_pdf(str(file_path))
            else:  # .txt file
                chunks = processor.process_txt(str(file_path))
        except Exception as e:
            # One unreadable file should not abort the rest of the batch.
            print(f"Error processing file {file_path}: {str(e)}")
            continue

        file_conversations = []
        for chunk in chunks:
            conversation = processor.generate_conversation(chunk)
            if conversation:
                file_conversations.append(conversation)

        # Save conversations for this file
        if file_conversations:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump({"conversations": file_conversations}, f, indent=2)

In [45]:
# Chunking limits for this run: chunk sizes are in characters, and the
# overlap is a number of sentences shared between consecutive chunks.
config = ChunkConfig(max_chunk_size=2000, min_chunk_size=500, overlap_sentences=2)

In [46]:
# Create the chat client against the configured endpoint.
# NOTE(review): the "EMPTY" key suggests the server does not check
# credentials - confirm.
client = OpenAI(base_url="http://82.150.117.181:8000/v1", api_key="EMPTY")

In [47]:
# Try the chunker on one PDF and report how many chunks it produced
test_pdf_path = "Dataset/Unprocessed/Meeting Transcripts/Splits/split_1.pdf"
processor = DocumentProcessor(client, config)
chunks = processor.process_pdf(test_pdf_path)
print(f"Generated {len(chunks)} chunks from the PDF")

Generated 222 chunks from the PDF


In [48]:
# Print one chunk (index 14) to inspect the chunking output by eye.
print(chunks[14])

and it’s a little hard to pick out where they do fall. But, I think you can usually figure out — I mean, it’s not hard to figure out that, say, Bill Gates, or Tom Murphy, or Don Keough, or people like that, are really outstanding managers. And it’s not hard to figure out who they’re working for. And I can give you some cases on the other end of the spectrum, too. It’s interesting how often the ones that, in my view, are the poor managers also turn out to be the ones that really don’t think that much about the shareholders, too. The two often go hand in hand. But, I think reading of reports — reading of competitors’ reports — I think you’ll get a fix on that in some cases. You don’t have to — you know, you don’t have to make a hundred correct judgments in this business or 50 correct judgments. You only have to make a few. And that’s all we try to do. And, generally speaking, the conclusions I’ve come to about managers have really come about the same way you can make yours. I mean they c

In [49]:
# Generate a conversation from one sample chunk. The guard checks the actual
# index used, since a non-empty list can still be shorter than that index.
sample_index = 28
if len(chunks) > sample_index:
    conversation = processor.generate_conversation(chunks[sample_index])
    print("Sample conversation:")
    print(json.dumps(conversation, indent=2))
else:
    print(f"Only {len(chunks)} chunks available; no chunk at index {sample_index}")

Sample conversation:
[
  {
    "conversations": [
      {
        "role": "human",
        "content": "What do you consider when evaluating the use of leverage in a business?"
      },
      {
        "role": "assistant",
        "content": "When I'm evaluating the use of leverage in a business, I consider two main things: whether the management team can control the business in a way that the leverage doesn't become dangerous, and what kind of returns on equity they can earn while using it. It's like driving a car - if you're a good driver, you can handle a more powerful vehicle, but if you're a reckless driver, you're more likely to end up in a ditch."
      },
      {
        "role": "human",
        "content": "How did Deryck and Bob Denham get involved with Salomon Brothers?"
      },
      {
        "role": "assistant",
        "content": "Deryck took on the job of operating head of Salomon Brothers on August 18th, 1991, without knowing exactly what he was getting into, and withou

In [50]:
# Generate conversations for every file under the meeting-transcript splits
input_directory = "Dataset/Unprocessed/Meeting Transcripts/Splits"
output_directory = "Dataset/Processed/Meeting Transcripts/"
process_directory(
    input_dir=input_directory,
    output_dir=output_directory,
    client=client,
    config=config,
)

Processing Dataset/Unprocessed/Meeting Transcripts/Splits/split_1.pdf
Processing Dataset/Unprocessed/Meeting Transcripts/Splits/split_2.pdf
Processing Dataset/Unprocessed/Meeting Transcripts/Splits/split_3.pdf
Processing Dataset/Unprocessed/Meeting Transcripts/Splits/split_4.pdf
Error processing chunk: We would never — there won’t be a deal ever made for Berkshire anyway — but if there would be we wou...
Error details: Expecting value: line 6 column 1 (char 1067)
Processing Dataset/Unprocessed/Meeting Transcripts/Splits/split_5.pdf
Processing Dataset/Unprocessed/Meeting Transcripts/Splits/split_6.pdf
Processing Dataset/Unprocessed/Meeting Transcripts/Splits/split_7.pdf
Processing Dataset/Unprocessed/Meeting Transcripts/Splits/split_8.pdf
Processing Dataset/Unprocessed/Meeting Transcripts/Splits/split_9.pdf


KeyboardInterrupt: 

In [None]:
# Chunking limits for the next run: chunk sizes are in characters, and the
# overlap is a number of sentences shared between consecutive chunks.
config = ChunkConfig(max_chunk_size=1500, min_chunk_size=500, overlap_sentences=2)

In [None]:
# Generate conversations for every file under the shareholder-letters folder
input_directory = "Dataset/Unprocessed/Shareholder Letters/"
output_directory = "Dataset/Processed/Shareholder Letters/"
process_directory(
    input_dir=input_directory,
    output_dir=output_directory,
    client=client,
    config=config,
)

In [None]:
# Chunking limits for the next run: chunk sizes are in characters, and the
# overlap is a number of sentences shared between consecutive chunks.
config = ChunkConfig(max_chunk_size=1500, min_chunk_size=500, overlap_sentences=2)

In [None]:
# Generate conversations for the "Lessons for Corporate America" folder
input_directory = "Dataset/Unprocessed/Shareholder Letters/Lessons for Corporate America/"
output_directory = "Dataset/Processed/Shareholder Letters/Lessons for Corporate America/"
process_directory(
    input_dir=input_directory,
    output_dir=output_directory,
    client=client,
    config=config,
)