In [43]:
import os
import json
from typing import List, Dict, Optional
import spacy
import lxml.html
import re
import fitz
from openai import OpenAI, Client
from dataclasses import dataclass
from pathlib import Path
from bs4 import BeautifulSoup

In [123]:
@dataclass
class Sentence:
    """One sentence of a document together with a precomputed size."""

    # The sentence's text.
    text: str
    # Size recorded for the sentence; the chunker adds these values up to
    # measure how large a chunk has become.
    length: int

@dataclass
class ChunkConfig:
    """Tunable limits used when grouping sentences into chunks."""

    # A chunk is closed once its accumulated sentence length reaches this value.
    max_chunk_size: int = 1500
    # Chunks whose accumulated sentence length is below this value are not emitted.
    min_chunk_size: int = 500
    # How many trailing sentences of a closed chunk are repeated at the start
    # of the next one.
    overlap_sentences: int = 2
    
class DocumentProcessor:
    def __init__(self, client: Client, config: ChunkConfig):
        """Keep the API client and chunking config and load the spaCy pipeline.

        Args:
            client: Client object used later for chat-completion requests.
            config: Chunking limits read by the chunking methods.
        """
        self.client = client
        self.config = config

        # Load the small English model with the 'tagger' and 'ner' components
        # disabled, and raise its input-length cap so long documents can be
        # passed to it in one call.
        pipeline = spacy.load('en_core_web_sm', disable=['tagger', 'ner'])
        pipeline.max_length = 10_000_000
        self.nlp = pipeline
        
    def preprocess_text(self, text: str) -> str:
        """Collapse every run of whitespace in *text* into a single space.

        ``str.split()`` without arguments splits on any whitespace (newlines
        and tabs included) and drops leading/trailing whitespace, so joining
        the pieces with one space already flattens all line breaks. No
        separate newline replacement is needed afterwards.

        Args:
            text: Raw text to normalise.

        Returns:
            The text on a single line with single spaces between words; an
            empty string if *text* contains only whitespace.
        """
        return ' '.join(text.split())
        
    def process_pdf(self, pdf_path: str) -> List[str]:
        """Read all page text from a PDF and return it as a list of chunks.

        The text of every page is taken with ``page.get_text()``, a newline is
        appended after each page, and the concatenation is run through
        ``preprocess_text`` and then ``_chunk_text``.

        Args:
            pdf_path: Path of the PDF to open with ``fitz``.

        Returns:
            The chunks produced by ``_chunk_text``.
        """
        with fitz.open(pdf_path) as pdf:
            page_texts = [page.get_text() + "\n" for page in pdf]

        raw_text = "".join(page_texts)
        return self._chunk_text(self.preprocess_text(raw_text))


    def process_txt(self, txt_path: str) -> List[str]:
        """Read text file and split into chunks."""
        try:
            with open(txt_path, 'r', encoding='utf-8') as file:
                text = file.read()
            
            # Preprocess the text
            clean_text = self.preprocess_text(text)
            return self._chunk_text(clean_text)
            
        except Exception as e:
            print(f"Error processing text file {txt_path}: {str(e)}")
            return []
    
    def create_sentence_objects(self, doc) -> List[Sentence]:
        """Build a ``Sentence`` for each non-blank sentence span of *doc*.

        Each span from ``doc.sents`` has its text stripped of surrounding
        whitespace; spans that are empty after stripping are skipped. The
        ``length`` stored on each ``Sentence`` is the character count of the
        stripped text.

        Args:
            doc: Parsed document exposing a ``sents`` iterable whose items
                have a ``text`` attribute.

        Returns:
            The sentences in document order.
        """
        trimmed = (span.text.strip() for span in doc.sents)
        return [Sentence(text=body, length=len(body)) for body in trimmed if body]

    def _chunk_sentences(self, sentences: List[Sentence]) -> List[List[Sentence]]:
        """Split sentences into chunks while maintaining proper overlap."""
        chunks = []
        current_chunk = []
        current_length = 0
        
        for i, sentence in enumerate(sentences):
            # Always add the current sentence
            current_chunk.append(sentence)
            current_length += sentence.length
            
            # Check if we should create a new chunk
            if current_length >= self.config.max_chunk_size and len(current_chunk) > self.config.overlap_sentences:
                # Only create chunk if it meets minimum size
                if current_length >= self.config.min_chunk_size:
                    chunks.append(current_chunk)
                    
                    # Start new chunk with overlap
                    overlap_sentences = current_chunk[-self.config.overlap_sentences:]
                    current_chunk = overlap_sentences.copy()
                    current_length = sum(s.length for s in current_chunk)

        # Add the last chunk if it meets minimum size
        if current_length >= self.config.min_chunk_size:
            chunks.append(current_chunk)
        
        return chunks

    def _chunk_text(self, text: str) -> List[str]:
        """Turn *text* into chunk strings made of whole sentences.

        The text is parsed with ``self.nlp``, converted to ``Sentence``
        objects, grouped by ``_chunk_sentences``, and each group is rendered
        by joining its sentence texts with single spaces.

        Args:
            text: Preprocessed document text.

        Returns:
            One string per sentence group, in order.
        """
        parsed = self.nlp(text)
        grouped = self._chunk_sentences(self.create_sentence_objects(parsed))
        return [' '.join(item.text for item in group) for group in grouped]

    def generate_conversation(self, chunk: str) -> Optional[Dict]:
        """Generate conversation from chunk with validation."""
        # Validation prompt remains unchanged
        validation_prompt = f"""Analyze this text and determine if it contains meaningful Warren Buffett insights, commentary, or narrative content.

Approve the text only if:
- It discusses business philosophy or investment thinking that applies across industries and time.
- It provides views on markets, financial practices, or economic principles that are broadly applicable.
- Buffett shares personal reflections or general lessons learned that are useful beyond a single event.

Reject the text if:
- It primarily describes a specific investment, acquisition, deal, or financial transaction.
- It focuses on a single company's business decision without a clearly stated general principle.
- It discusses short-term market conditions, quarterly earnings, or economic events without broader insights.
- It contains only financial data, figures, or statistics without meaningful explanation.
- Buffett does not explicitly state a broad lesson. The text must include a clear, stated takeaway that can apply to other cases.

Text: {chunk}

Return only "yes" if the text contains meaningful, wide-scope content, or "no" otherwise.
"""
        try:
            validation_response = self.client.chat.completions.create(
                model="/model",
                messages=[{"role": "user", "content": validation_prompt}],
                max_tokens=100,
                temperature=0
            )
            
            if validation_response.choices[0].message.content.strip().lower() != "yes":
                return None

            # Generate conversation prompt remains unchanged
            conversation_prompt = f"""Below is a text excerpt from me (Warren Buffett). Generate 1-2 questions that could be asked about this specific content, but ONLY if the text contains clear, direct information to answer them. Then provide my answers in first person, as if I am directly responding to these questions. Use my communication style—plain-spoken, using analogies when helpful, and occasionally humorous.

Text: {chunk}

Important guidelines:
- Only generate questions about topics that are explicitly discussed in this text excerpt
- Write answers in first person
- Use my direct, plain-spoken style.
- Keep answers focused on what's actually in the text
- Return as a JSON string in ShareGPT format:
[{{"conversations": [
    {{"role": "human", "content": "question here"}},
    {{"role": "assistant", "content": "answer here"}},
    {{"role": "human", "content": "second question"}},
    {{"role": "assistant", "content": "second answer"}}
]}}]"""

            conversation_response = self.client.chat.completions.create(
                model="/model",
                messages=[{"role": "user", "content": conversation_prompt}],
                max_tokens=2000,
                temperature=0.4
            )
            
            response_text = conversation_response.choices[0].message.content
            return json.loads(response_text)
            
        except Exception as e:
            print(f"Error processing chunk: {chunk[:100]}...")
            print(f"Error details: {str(e)}")
            return None

def process_directory(input_dir: str, output_dir: str, client: Client, config: ChunkConfig):
    """Generate conversation JSON for every PDF and TXT file under *input_dir*.

    The tree below *input_dir* is walked recursively. Each regular file whose
    suffix is ``.pdf`` or ``.txt`` (compared case-insensitively) is chunked
    with a ``DocumentProcessor``; every chunk is passed to
    ``generate_conversation`` and the non-empty results are written to
    ``<output_dir>/<stem>.json`` as ``{"conversations": [...]}``. A file that
    yields no conversations produces no output file.

    Hidden files and directories (names starting with ``.``) below
    *input_dir* are skipped. Only the path relative to *input_dir* is
    inspected, so an input root such as ``../data`` or one located inside a
    hidden directory is still processed.

    Args:
        input_dir: Root directory to scan.
        output_dir: Directory that receives the JSON files; created if missing.
        client: API client handed to ``DocumentProcessor``.
        config: Chunking configuration handed to ``DocumentProcessor``.
    """
    processor = DocumentProcessor(client, config)

    input_root = Path(input_dir)
    output_root = Path(output_dir)
    output_root.mkdir(parents=True, exist_ok=True)

    for file_path in input_root.rglob('*'):
        # Judge "hidden" by the components below the input root only; the
        # root's own components (e.g. '..') must not disqualify every file.
        relative_parts = file_path.relative_to(input_root).parts
        if any(part.startswith('.') for part in relative_parts):
            continue

        suffix = file_path.suffix.lower()
        # rglob also yields directories; one named like "x.pdf" is not a document.
        if suffix not in ('.pdf', '.txt') or not file_path.is_file():
            continue

        print(f"Processing {file_path}")
        # NOTE(review): outputs are keyed by stem only, so "a.pdf" and "a.txt",
        # or same-named files in different subfolders, overwrite each other.
        output_file = output_root / (file_path.stem + '.json')

        if suffix == '.pdf':
            chunks = processor.process_pdf(str(file_path))
        else:  # .txt file
            chunks = processor.process_txt(str(file_path))

        file_conversations = []
        for chunk in chunks:
            conversation = processor.generate_conversation(chunk)
            if conversation:
                file_conversations.append(conversation)

        # Save conversations for this file (nothing is written when empty)
        if file_conversations:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump({"conversations": file_conversations}, f, indent=2)

In [138]:
# Chunking limits for this run:
#   max_chunk_size    - accumulated sentence length at which a chunk is closed
#   min_chunk_size    - chunks below this accumulated length are not emitted
#   overlap_sentences - sentences repeated at the start of the next chunk
config = ChunkConfig(max_chunk_size=1500, min_chunk_size=500, overlap_sentences=2)

In [139]:
# Create the API client, pointed at a fixed HTTP endpoint instead of the
# library's default.
# NOTE(review): "EMPTY" looks like a placeholder key for a server that does
# not check credentials - confirm before pointing this at another endpoint.
client = OpenAI(base_url="http://82.150.117.181:8000/v1", api_key="EMPTY")

In [140]:
# Smoke test: chunk one PDF and report how many chunks it produced
test_pdf_path = "Dataset/Unprocessed/Lessons for Corporate America/Lessons-for-Corporate-America.pdf"
processor = DocumentProcessor(client, config)
chunks = processor.process_pdf(test_pdf_path)
print(f"Generated {len(chunks)} chunks from the PDF")

Generated 406 chunks from the PDF


In [141]:
# Eyeball one chunk from the run above (index 23 is an arbitrary sample)
print(chunks[23])

1997] THE ESSAYS OF WARREN BUFFETT 15 Buffett learned the art of investing from Ben Graham as a graduate student at Columbia Business School in the 1950s and later working at Graham-Newman. In a number of classic works, including The Intelligent Investor, Graham introduced some of the most profound investment wisdom in history. It rejects a prevalent but mistaken mind-set that equates price with value. On the con- trary, Graham held that price is what you pay and value is what you get. These two things are rarely identical, but most people rarely notice any difference. One of Graham's most profound contributions is a character who lives on Wall Street, Mr. Market. He is your hypothetical business partner who is daily willing to buy your interest in a busi- ness or sell you his at prevailing market prices. Mr. Market is moody, prone to manic swings from joy to despair. Sometimes he offers prices way higher than value; sometimes he offers prices way lower than value. The more manic-depre

In [142]:
# Generate a conversation for one sample chunk and pretty-print it.
# Guard on the index actually used: a non-empty list with fewer than
# sample_index + 1 chunks would otherwise raise IndexError.
sample_index = 23
if len(chunks) > sample_index:
    conversation = processor.generate_conversation(chunks[sample_index])
    print("Sample conversation:")
    print(json.dumps(conversation, indent=2))

Sample conversation:
[
  {
    "conversations": [
      {
        "role": "human",
        "content": "What is the main difference between price and value according to Ben Graham's investment wisdom?"
      },
      {
        "role": "assistant",
        "content": "Well, as I learned from Ben, price is what you pay, and value is what you get. It's like buying a sandwich - the price is what's on the menu, but the value is how full you are after eating it. They're not always the same thing, and that's where the opportunities lie."
      },
      {
        "role": "human",
        "content": "Can you explain the concept of Mr. Market and how it relates to investment opportunities?"
      },
      {
        "role": "assistant",
        "content": "Mr. Market is a great allegory that Ben Graham came up with. He's like a moody business partner who's always willing to buy or sell at the current market price. The thing is, he's prone to wild mood swings, so sometimes he'll offer you a great d

In [None]:
# Batch run: convert every PDF/TXT under the shareholder-letters folder
output_directory = "Dataset/Processed/Shareholder Letters/"
input_directory = "Dataset/Unprocessed/Shareholder Letters/"
process_directory(
    input_dir=input_directory,
    output_dir=output_directory,
    client=client,
    config=config,
)

In [None]:
# Batch run: convert every PDF/TXT under the "Lessons for Corporate America" folder
# NOTE(review): the single-PDF test cell reads this document from
# "Dataset/Unprocessed/Lessons for Corporate America/" (no "Shareholder
# Letters" level) - confirm which location is correct before running.
output_directory = "Dataset/Processed/Shareholder Letters/Lessons for Corporate America/"
input_directory = "Dataset/Unprocessed/Shareholder Letters/Lessons for Corporate America/"
process_directory(
    input_dir=input_directory,
    output_dir=output_directory,
    client=client,
    config=config,
)

In [176]:
@dataclass
class Sentence:
    """One sentence of a document together with a caller-supplied size."""

    # The sentence's text.
    text: str
    # Size recorded for the sentence by whoever creates the object.
    length: int

@dataclass
class ChunkConfig:
    """Tunable limits for section-based chunking."""

    # Sections longer than this many characters are split further.
    max_chunk_size: int = 1600
    # NOTE(review): no method of the DocumentProcessor defined right after
    # this class reads overlap_sentences - confirm it is still needed.
    overlap_sentences: int = 1
    
class DocumentProcessor:
    def __init__(self, client: Client, config: ChunkConfig):
        """Keep the API client and chunking config and load the spaCy pipeline.

        Args:
            client: Client object used later for chat-completion requests.
            config: Chunking limits read by the chunking methods.
        """
        self.client = client
        self.config = config

        # Load the small English model with the 'tagger' and 'ner' components
        # disabled, and raise its input-length cap so long texts can be
        # passed to it in one call.
        pipeline = spacy.load('en_core_web_sm', disable=['tagger', 'ner'])
        pipeline.max_length = 10_000_000
        self.nlp = pipeline
        
    def preprocess_text(self, text: str) -> str:
        """Tidy whitespace line by line, keeping the line structure.

        The text is split on ``'\\n'``. Within each line every run of
        whitespace becomes one space and leading/trailing whitespace is
        removed; lines that end up empty are discarded. The surviving lines
        are joined back together with newlines.

        Args:
            text: Raw text to normalise.

        Returns:
            The cleaned text; an empty string if no line has content.
        """
        collapsed = (' '.join(raw_line.split()) for raw_line in text.split('\n'))
        return '\n'.join(piece for piece in collapsed if piece)
        
    def process_pdf(self, pdf_path: str) -> List[str]:
        """Read all page text from a PDF and return it as a list of chunks.

        The text of every page is taken with ``page.get_text()``, a newline is
        appended after each page, and the concatenation is run through
        ``preprocess_text`` and then ``_chunk_text``.

        Args:
            pdf_path: Path of the PDF to open with ``fitz``.

        Returns:
            The chunks produced by ``_chunk_text``.
        """
        with fitz.open(pdf_path) as pdf:
            page_texts = [page.get_text() + "\n" for page in pdf]

        raw_text = "".join(page_texts)
        return self._chunk_text(self.preprocess_text(raw_text))

    def process_txt(self, txt_path: str) -> List[str]:
        """Read text file and split into chunks."""
        try:
            with open(txt_path, 'r', encoding='utf-8') as file:
                text = file.read()
            
            # Preprocess the text
            clean_text = self.preprocess_text(text)
            return self._chunk_text(clean_text)
            
        except Exception as e:
            print(f"Error processing text file {txt_path}: {str(e)}")
            return []

    def _split_large_chunk(self, text: str) -> List[str]:
        """Split a large chunk of text into roughly equal parts at sentence boundaries."""
        # Use spaCy to split into sentences
        doc = self.nlp(text)
        sentences = list(doc.sents)
        
        if len(sentences) < 2:
            return [text]
            
        # Calculate target split point
        total_length = len(text)
        target_length = total_length // 2
        
        # Find best split point
        current_length = 0
        best_split_idx = 0
        
        for i, sent in enumerate(sentences):
            current_length += len(sent.text)
            if current_length >= target_length:
                best_split_idx = i
                break
        
        # Create the two chunks
        first_chunk = ' '.join(sent.text.strip() for sent in sentences[:best_split_idx + 1])
        second_chunk = ' '.join(sent.text.strip() for sent in sentences[best_split_idx + 1:])
        
        return [first_chunk, second_chunk]

    def _chunk_text(self, text: str) -> List[str]:
        """Split text into chunks based on numbered sections and size limits."""
        # Split text into lines
        lines = text.split('\n')
        
        initial_chunks = []
        current_chunk = []
        
        # First split by numbered sections
        for line in lines:
            # Check if line starts with a number followed by a dot
            if re.match(r'^\d+\.', line.strip()):
                # If we have accumulated lines in the current chunk, save it
                if current_chunk:
                    initial_chunks.append('\n'.join(current_chunk))
                # Start a new chunk with the current line
                current_chunk = [line]
            else:
                # Add line to current chunk
                current_chunk.append(line)
        
        # Add the last chunk if it exists
        if current_chunk:
            initial_chunks.append('\n'.join(current_chunk))
        
        # Further split chunks that are too large
        final_chunks = []
        for chunk in initial_chunks:
            if len(chunk) > self.config.max_chunk_size:
                split_chunks = self._split_large_chunk(chunk)
                final_chunks.extend(split_chunks)
            else:
                final_chunks.append(chunk)
            
        return final_chunks

    def generate_conversation(self, chunk: str) -> Optional[Dict]:
        """Generate conversation from chunk with validation."""
        # Validation prompt remains unchanged
        validation_prompt = f"""Analyze this text and determine if it contains meaningful Warren Buffett insights, commentary, or narrative content.

Approve the text only if:
- It discusses business philosophy or investment thinking that applies across industries and time.
- It provides views on markets, financial practices, or economic principles that are broadly applicable.
- Buffett shares personal reflections or general lessons learned that are useful beyond a single event.

Reject the text if:
- It primarily describes a specific investment, acquisition, deal, or financial transaction.
- It focuses on a single company's business decision without a clearly stated general principle.
- It discusses short-term market conditions, quarterly earnings, or economic events without broader insights.
- It contains only financial data, figures, or statistics without meaningful explanation.
- Buffett does not explicitly state a broad lesson. The text must include a clear, stated takeaway that can apply to other cases.

Text: {chunk}

Return only "yes" if the text contains meaningful, wide-scope content, or "no" otherwise.
"""
        try:
            validation_response = self.client.chat.completions.create(
                model="/model",
                messages=[{"role": "user", "content": validation_prompt}],
                max_tokens=100,
                temperature=0
            )
            
            if validation_response.choices[0].message.content.strip().lower() != "yes":
                return None

            conversation_prompt = f"""Below is a text excerpt from me (Warren Buffett). Generate 1-2 questions that could be asked about this specific content, but ONLY if the text contains clear, direct information to answer them. Then provide my answers in first person, as if I am directly responding to these questions. Use my communication style—plain-spoken, using analogies when helpful, and occasionally humorous.

Text: {chunk}

Important guidelines:
- Only generate questions about topics that are explicitly discussed in this text excerpt
- Write answers in first person
- Use my direct, plain-spoken style.
- Keep answers focused on what's actually in the text
- Return as a JSON string in ShareGPT format:
[{{"conversations": [
    {{"role": "human", "content": "question here"}},
    {{"role": "assistant", "content": "answer here"}},
    {{"role": "human", "content": "second question"}},
    {{"role": "assistant", "content": "second answer"}}
]}}]"""

            conversation_response = self.client.chat.completions.create(
                model="/model",
                messages=[{"role": "user", "content": conversation_prompt}],
                max_tokens=2000,
                temperature=0.4
            )
            
            response_text = conversation_response.choices[0].message.content
            return json.loads(response_text)
            
        except Exception as e:
            print(f"Error processing chunk: {chunk[:100]}...")
            print(f"Error details: {str(e)}")
            return None

In [181]:
# Rebuild the config from its default values and create a processor that
# uses it together with the existing client.
config = ChunkConfig()
processor = DocumentProcessor(client, config)

In [182]:
# Chunk the combined meeting-transcripts PDF with the current processor and
# report how many chunks were produced.
pdf_path = "Dataset/Unprocessed/Meeting Transcripts/Berkshire Meeting Transcripts - 1994 - 2022.pdf"
chunks = processor.process_pdf(pdf_path)
print(f"Number of chunks: {len(chunks)}")

Number of chunks: 3796


In [180]:
# Bare expression: the notebook displays the repr of this sample chunk
# (index 1824 is an arbitrary pick).
chunks[1824]

'19. “A complicated bankruptcy can offer opportunities for profit”\nWARREN BUFFETT: Number 2.\nAUDIENCE MEMBER: Hello. My name is Barry Steinhart (PH), shareholder from New York. My\nquestion relates to the Chapter 11 bankruptcy process. I know you have been active in the past in some activity in the bankruptcy court. And if you had\nthoughts on possible reforms in that area, if you believe that any reforms are necessary?\nWARREN BUFFETT: Well, that’s a good question. Charlie is probably better qualified to answer\nthan I am. I mean, we have bought Fruit of the Loom out of bankruptcy. And we have had some involvement in owning junk bonds. You know, we get — we think about\nthe bankruptcy process. But in terms of the practicalities of improving on it, what do you think,\nCharlie?\nCHARLIE MUNGER: Well, I think much of that is pretty horrible. You have a competition there, where the courts themselves have gone into bidding contests to\nget bankruptcy business attracted. Meaning that the 

In [None]:
# Batch run: convert every PDF/TXT under the meeting-transcripts folder
output_directory = "Dataset/Processed/Meeting Transcripts/"
input_directory = "Dataset/Unprocessed/Meeting Transcripts/"
process_directory(
    input_dir=input_directory,
    output_dir=output_directory,
    client=client,
    config=config,
)