In [13]:
import os
import json
from typing import List, Dict, Optional
import spacy
import re
import fitz
from openai import OpenAI, Client
from dataclasses import dataclass
from pathlib import Path
import shutil

In [2]:
@dataclass
class Sentence:
    """Represents a single sentence with its text and length."""
    text: str
    length: int

@dataclass
class ChunkConfig:
    """Configuration for text chunking and processing."""
    max_chunk_size: int = 1500
    min_chunk_size: int = 500
    overlap_sentences: int = 2
    
class DocumentProcessor:
    def __init__(self, client: Client, config: ChunkConfig):
        """Initialize the document processor with improved chunking capabilities."""
        self.nlp = spacy.load('en_core_web_sm', disable=['tagger', 'ner'])
        self.nlp.max_length = 10000000
        self.client = client
        self.config = config
        
    def preprocess_text(self, text: str) -> str:
        """Clean and normalize text before processing."""
        # Remove excessive whitespace
        text = ' '.join(text.split())
        # Normalize line endings
        text = text.replace('\n', ' ')
        return text
        
    def process_pdf(self, pdf_path: str) -> List[str]:
        """Extract text from PDF and split into chunks with improved handling."""
        text = ""
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text() + "\n"
        
        # Preprocess the extracted text
        clean_text = self.preprocess_text(text)
        return self._chunk_text(clean_text)


    def process_txt(self, txt_path: str) -> List[str]:
        """Read text file and split into chunks."""
        try:
            with open(txt_path, 'r', encoding='utf-8') as file:
                text = file.read()
            
            # Preprocess the text
            clean_text = self.preprocess_text(text)
            return self._chunk_text(clean_text)
            
        except Exception as e:
            print(f"Error processing text file {txt_path}: {str(e)}")
            return []
    
    def create_sentence_objects(self, doc) -> List[Sentence]:
        """Convert spaCy doc into list of Sentence objects."""
        sentences = []
        for sent in doc.sents:
            text = sent.text.strip()
            if text:  # Only include non-empty sentences
                sentences.append(Sentence(text=text, length=len(text)))
        return sentences

    def _chunk_sentences(self, sentences: List[Sentence]) -> List[List[Sentence]]:
        """Split sentences into chunks while maintaining proper overlap."""
        chunks = []
        current_chunk = []
        current_length = 0
        
        for i, sentence in enumerate(sentences):
            # Always add the current sentence
            current_chunk.append(sentence)
            current_length += sentence.length
            
            # Check if we should create a new chunk
            if current_length >= self.config.max_chunk_size and len(current_chunk) > self.config.overlap_sentences:
                # Only create chunk if it meets minimum size
                if current_length >= self.config.min_chunk_size:
                    chunks.append(current_chunk)
                    
                    # Start new chunk with overlap
                    overlap_sentences = current_chunk[-self.config.overlap_sentences:]
                    current_chunk = overlap_sentences.copy()
                    current_length = sum(s.length for s in current_chunk)

        # Add the last chunk if it meets minimum size
        if current_length >= self.config.min_chunk_size:
            chunks.append(current_chunk)
        
        return chunks

    def _chunk_text(self, text: str) -> List[str]:
        """Enhanced text chunking with better overlap handling."""
        # Create spaCy doc and convert to sentence objects
        doc = self.nlp(text)
        sentences = self.create_sentence_objects(doc)
        
        # Create chunks of sentences
        sentence_chunks = self._chunk_sentences(sentences)
        
        # Convert chunks of sentences back to text
        text_chunks = []
        for chunk in sentence_chunks:
            chunk_text = ' '.join(sentence.text for sentence in chunk)
            text_chunks.append(chunk_text)
            
        return text_chunks

    def generate_conversation(self, chunk: str) -> Optional[Dict]:
        """Generate conversation from chunk with validation."""
        # Validation prompt remains unchanged
        validation_prompt = f"""Analyze this text and determine if it contains meaningful Warren Buffett insights, commentary, or narrative content.

Approve the text only if:
- It discusses business philosophy or investment thinking that applies across industries and time.
- It provides views on markets, financial practices, or economic principles that are broadly applicable.
- Buffett shares personal reflections or general lessons learned that are useful beyond a single event.

Reject the text if:
- It primarily describes a specific investment, acquisition, deal, or financial transaction.
- It focuses on a single company's business decision without a clearly stated general principle.
- It discusses short-term market conditions, quarterly earnings, or economic events without broader insights.
- It contains only financial data, figures, or statistics without meaningful explanation.
- Buffett does not explicitly state a broad lesson. The text must include a clear, stated takeaway that can apply to other cases.

Text: {chunk}

Return only "yes" if the text contains meaningful, wide-scope content, or "no" otherwise.
"""
        try:
            validation_response = self.client.chat.completions.create(
                model="/model",
                messages=[{"role": "user", "content": validation_prompt}],
                max_tokens=100,
                temperature=0
            )
            
            if validation_response.choices[0].message.content.strip().lower() != "yes":
                return None

            # Generate conversation prompt remains unchanged
            conversation_prompt = f"""Below is a text excerpt from me (Warren Buffett). Generate 1-2 questions that could be asked about this specific content, but ONLY if the text contains clear, direct information to answer them. Then provide my answers in first person, as if I am directly responding to these questions. Use my communication style—plain-spoken, using analogies when helpful, and occasionally humorous.

Text: {chunk}

Important guidelines:
- Only generate questions about topics that are explicitly discussed in this text excerpt
- Write answers in first person
- Use my direct, plain-spoken style.
- Keep answers focused on what's actually in the text
- Return as a JSON string in ShareGPT format:
[{{"conversations": [
    {{"role": "human", "content": "question here"}},
    {{"role": "assistant", "content": "answer here"}},
    {{"role": "human", "content": "second question"}},
    {{"role": "assistant", "content": "second answer"}}
]}}]"""

            conversation_response = self.client.chat.completions.create(
                model="/model",
                messages=[{"role": "user", "content": conversation_prompt}],
                max_tokens=2000,
                temperature=0.4
            )
            
            response_text = conversation_response.choices[0].message.content
            return json.loads(response_text)
            
        except Exception as e:
            print(f"Error processing chunk: {chunk[:100]}...")
            print(f"Error details: {str(e)}")
            return None

def process_directory(input_dir: str, output_dir: str, client: Client, config: ChunkConfig):
    """Process all PDF and TXT files in a directory and generate training data."""
    processor = DocumentProcessor(client, config)
    
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    # Walk through directory, skipping hidden directories
    for file_path in Path(input_dir).rglob('*'):
        # Skip hidden directories and their contents
        if any(part.startswith('.') for part in file_path.parts):
            continue
            
        if file_path.suffix.lower() in ['.pdf', '.txt']:
            print(f"Processing {file_path}")
            output_file = Path(output_dir) / (file_path.stem + '.json')
            
            # Process based on file type
            if file_path.suffix.lower() == '.pdf':
                chunks = processor.process_pdf(str(file_path))
            else:  # .txt file
                chunks = processor.process_txt(str(file_path))
            
            file_conversations = []
            for chunk in chunks:
                conversation = processor.generate_conversation(chunk)
                if conversation:
                    file_conversations.append(conversation)
            
            # Save conversations for this file
            if file_conversations:
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump({"conversations": file_conversations}, f, indent=2)

In [3]:
# Set up configuration
config = ChunkConfig(
    max_chunk_size=1500,    # Maximum characters per chunk
    min_chunk_size=500,     # Minimum characters per chunk
    overlap_sentences=2      # Number of sentences to overlap (changed from overlap_size)
)

In [4]:
# Initialize Client
client = OpenAI(
    api_key="EMPTY",
    base_url="http://82.150.117.181:8000/v1",
)

In [5]:
# Test with a single PDF file
processor = DocumentProcessor(client, config)
test_pdf_path = "Dataset/Unprocessed/Lessons for Corporate America/Lessons-for-Corporate-America.pdf"
chunks = processor.process_pdf(test_pdf_path)
print(f"Generated {len(chunks)} chunks from the PDF")



Generated 406 chunks from the PDF


In [6]:
print(chunks[23])

1997] THE ESSAYS OF WARREN BUFFETT 15 Buffett learned the art of investing from Ben Graham as a graduate student at Columbia Business School in the 1950s and later working at Graham-Newman. In a number of classic works, including The Intelligent Investor, Graham introduced some of the most profound investment wisdom in history. It rejects a prevalent but mistaken mind-set that equates price with value. On the con- trary, Graham held that price is what you pay and value is what you get. These two things are rarely identical, but most people rarely notice any difference. One of Graham's most profound contributions is a character who lives on Wall Street, Mr. Market. He is your hypothetical business partner who is daily willing to buy your interest in a busi- ness or sell you his at prevailing market prices. Mr. Market is moody, prone to manic swings from joy to despair. Sometimes he offers prices way higher than value; sometimes he offers prices way lower than value. The more manic-depre

In [7]:
if chunks:
    conversation = processor.generate_conversation(chunks[23])
    print("Sample conversation:")
    print(json.dumps(conversation, indent=2))

Sample conversation:
[
  {
    "conversations": [
      {
        "role": "human",
        "content": "What is the main difference between price and value according to Ben Graham's investment wisdom?"
      },
      {
        "role": "assistant",
        "content": "Well, as I learned from Ben, price is what you pay, and value is what you get. These two aren't always the same, but a lot of people don't notice the difference. It's a crucial distinction, though, and one that's served me well over the years."
      },
      {
        "role": "human",
        "content": "Can you describe Mr. Market, a character introduced by Ben Graham, and his significance in investing?"
      },
      {
        "role": "assistant",
        "content": "Mr. Market is a great allegory. He's like a business partner who's willing to buy or sell his interest in a business at the prevailing market price every day. The thing is, Mr. Market can be quite moody, swinging from extreme optimism to deep pessimism. Thi

In [8]:
# Process entire directory
input_directory = "Dataset/Unprocessed/Shareholder Letters/"
output_directory = "Dataset/Processed/Shareholder Letters/"
process_directory(input_directory, output_directory, client, config)

Processing Dataset/Unprocessed/Shareholder Letters/1977.txt
Processing Dataset/Unprocessed/Shareholder Letters/1978.txt
Processing Dataset/Unprocessed/Shareholder Letters/1979.txt
Processing Dataset/Unprocessed/Shareholder Letters/1980.txt
Processing Dataset/Unprocessed/Shareholder Letters/1981.txt
Processing Dataset/Unprocessed/Shareholder Letters/1982.txt
Processing Dataset/Unprocessed/Shareholder Letters/1983.txt
Processing Dataset/Unprocessed/Shareholder Letters/1984.txt
Processing Dataset/Unprocessed/Shareholder Letters/1985.txt
Processing Dataset/Unprocessed/Shareholder Letters/1986.txt
Processing Dataset/Unprocessed/Shareholder Letters/1987.txt
Processing Dataset/Unprocessed/Shareholder Letters/1989.txt
Processing Dataset/Unprocessed/Shareholder Letters/1988.txt
Processing Dataset/Unprocessed/Shareholder Letters/1990.txt
Processing Dataset/Unprocessed/Shareholder Letters/1991.txt
Processing Dataset/Unprocessed/Shareholder Letters/1992.txt
Processing Dataset/Unprocessed/Sharehold

In [6]:
# Process entire directory
input_directory = "Dataset/Unprocessed/Lessons for Corporate America/"
output_directory = "Dataset/Processed/Lessons for Corporate America/"
process_directory(input_directory, output_directory, client, config)

Processing Dataset/Unprocessed/Lessons for Corporate America/Lessons-for-Corporate-America.pdf




In [10]:
@dataclass
class Sentence:
    """Represents a single sentence with its text and length."""
    text: str
    length: int

@dataclass
class ChunkConfig:
    """Configuration for text chunking and processing."""
    max_chunk_size: int = 1600
    overlap_sentences: int = 1
    
class DocumentProcessor:
    def __init__(self, client: Client, config: ChunkConfig):
        """Initialize the document processor."""
        self.nlp = spacy.load('en_core_web_sm', disable=['tagger', 'ner'])
        self.nlp.max_length = 10000000
        self.client = client
        self.config = config
        
    def preprocess_text(self, text: str) -> str:
        """Clean and normalize text while preserving important newlines."""
        # Split into lines
        lines = text.split('\n')
        # Remove empty lines and excessive whitespace within lines
        lines = [' '.join(line.split()) for line in lines if line.strip()]
        # Rejoin with newlines
        return '\n'.join(lines)
        
    def process_pdf(self, pdf_path: str) -> List[str]:
        """Extract text from PDF and split into chunks."""
        text = ""
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text() + "\n"
        
        # Preprocess the extracted text
        clean_text = self.preprocess_text(text)
        return self._chunk_text(clean_text)

    def process_txt(self, txt_path: str) -> List[str]:
        """Read text file and split into chunks."""
        try:
            with open(txt_path, 'r', encoding='utf-8') as file:
                text = file.read()
            
            # Preprocess the text
            clean_text = self.preprocess_text(text)
            return self._chunk_text(clean_text)
            
        except Exception as e:
            print(f"Error processing text file {txt_path}: {str(e)}")
            return []

    def _split_large_chunk(self, text: str) -> List[str]:
        """Split a large chunk of text into roughly equal parts at sentence boundaries."""
        # Use spaCy to split into sentences
        doc = self.nlp(text)
        sentences = list(doc.sents)
        
        if len(sentences) < 2:
            return [text]
            
        # Calculate target split point
        total_length = len(text)
        target_length = total_length // 2
        
        # Find best split point
        current_length = 0
        best_split_idx = 0
        
        for i, sent in enumerate(sentences):
            current_length += len(sent.text)
            if current_length >= target_length:
                best_split_idx = i
                break
        
        # Create the two chunks
        first_chunk = ' '.join(sent.text.strip() for sent in sentences[:best_split_idx + 1])
        second_chunk = ' '.join(sent.text.strip() for sent in sentences[best_split_idx + 1:])
        
        return [first_chunk, second_chunk]

    def _chunk_text(self, text: str) -> List[str]:
        """Split text into chunks based on numbered sections and size limits."""
        # Split text into lines
        lines = text.split('\n')
        
        initial_chunks = []
        current_chunk = []
        
        # First split by numbered sections
        for line in lines:
            # Check if line starts with a number followed by a dot
            if re.match(r'^\d+\.', line.strip()):
                # If we have accumulated lines in the current chunk, save it
                if current_chunk:
                    initial_chunks.append('\n'.join(current_chunk))
                # Start a new chunk with the current line
                current_chunk = [line]
            else:
                # Add line to current chunk
                current_chunk.append(line)
        
        # Add the last chunk if it exists
        if current_chunk:
            initial_chunks.append('\n'.join(current_chunk))
        
        # Further split chunks that are too large
        final_chunks = []
        for chunk in initial_chunks:
            if len(chunk) > self.config.max_chunk_size:
                split_chunks = self._split_large_chunk(chunk)
                final_chunks.extend(split_chunks)
            else:
                final_chunks.append(chunk)
            
        return final_chunks

    def generate_conversation(self, chunk: str) -> Optional[Dict]:
        """Generate conversation from chunk with validation."""
        # Validation prompt remains unchanged
        validation_prompt = f"""Analyze this text and determine if it contains meaningful Warren Buffett insights, commentary, or narrative content.

Approve the text only if:
- It discusses business philosophy or investment thinking that applies across industries and time.
- It provides views on markets, financial practices, or economic principles that are broadly applicable.
- Buffett shares personal reflections or general lessons learned that are useful beyond a single event.

Reject the text if:
- It primarily describes a specific investment, acquisition, deal, or financial transaction.
- It focuses on a single company's business decision without a clearly stated general principle.
- It discusses short-term market conditions, quarterly earnings, or economic events without broader insights.
- It contains only financial data, figures, or statistics without meaningful explanation.
- Buffett does not explicitly state a broad lesson. The text must include a clear, stated takeaway that can apply to other cases.

Text: {chunk}

Return only "yes" if the text contains meaningful, wide-scope content, or "no" otherwise.
"""
        try:
            validation_response = self.client.chat.completions.create(
                model="/model",
                messages=[{"role": "user", "content": validation_prompt}],
                max_tokens=100,
                temperature=0
            )
            
            if validation_response.choices[0].message.content.strip().lower() != "yes":
                return None

            conversation_prompt = f"""Below is a text excerpt from me (Warren Buffett). Generate 1-2 questions that could be asked about this specific content, but ONLY if the text contains clear, direct information to answer them. Then provide my answers in first person, as if I am directly responding to these questions. Use my communication style—plain-spoken, using analogies when helpful, and occasionally humorous.

Text: {chunk}

Important guidelines:
- Only generate questions about topics that are explicitly discussed in this text excerpt
- Write answers in first person
- Use my direct, plain-spoken style.
- Keep answers focused on what's actually in the text
- Return as a JSON string in ShareGPT format:
[{{"conversations": [
    {{"role": "human", "content": "question here"}},
    {{"role": "assistant", "content": "answer here"}},
    {{"role": "human", "content": "second question"}},
    {{"role": "assistant", "content": "second answer"}}
]}}]"""

            conversation_response = self.client.chat.completions.create(
                model="/model",
                messages=[{"role": "user", "content": conversation_prompt}],
                max_tokens=2000,
                temperature=0.4
            )
            
            response_text = conversation_response.choices[0].message.content
            return json.loads(response_text)
            
        except Exception as e:
            print(f"Error processing chunk: {chunk[:100]}...")
            print(f"Error details: {str(e)}")
            return None

In [11]:
config = ChunkConfig()
processor = DocumentProcessor(client, config)

In [12]:
# Process single PDF
pdf_path = "Dataset/Unprocessed/Meeting Transcripts/Berkshire Meeting Transcripts - 1994 - 2022.pdf"
chunks = processor.process_pdf(pdf_path)
print(f"Number of chunks: {len(chunks)}")

Number of chunks: 3796


In [13]:
chunks[1824]

'2. Opening remarks\nWARREN BUFFETT: Got a lot of people to thank, starting off with Jimmy. Wonderful. We hid him out — came in last night kind of late and we — to be sure it was a surprise, we\nstashed him away over at the Hilton, and I just want to say thanks to him. We both got the commercial gene, but unfortunately, he got the singing gene. I got this voice\nyou’re hearing. We — the movie, as we mentioned, we get a lot of help from a lot of people. They all do it just\nfor the fun of it. I particularly want to thank Andy Heyward of DIC who did that cartoon. He’s done them now for\na number of years. They come back here to get my voice recorded and to get Bill’s [Gates] voice\nand Charlie’s voice. They do it all themselves just to participate in the movie. Andy and I — I’m working with Andy on a cartoon series that will be out pretty soon, which\nwe’re aiming toward younger people to try and work a little financial education into a good\ntime on Saturday morning for kids. And we’ll 

In [14]:
# Process entire directory
input_directory = "Dataset/Unprocessed/Meeting Transcripts/"
output_directory = "Dataset/Processed/Meeting Transcripts/"
process_directory(input_directory, output_directory, client, config)

Processing Dataset/Unprocessed/Meeting Transcripts/Berkshire Meeting Transcripts - 1994 - 2022.pdf
Error processing chunk: And we have Fred Schwed’s “Where Are the Customers’ Yachts?” book, which contains an
incredible amou...
Error details: Invalid \escape: line 3 column 77 (char 219)


In [10]:
def merge_json_files(input_directory, output_path):
    """
    Merge all JSON files in the specified directory into a single JSON file
    maintaining the nested 'conversations' structure.
    
    Args:
        input_directory (str): Path to the directory containing JSON files
        output_path (str): Full path (including filename) for the output merged JSON file
    """
    # Initialize the merged structure
    merged_data = {
        "conversations": []
    }
    
    # Convert string paths to Path objects
    directory = Path(input_directory)
    output = Path(output_path)
    
    # Create output directory if it doesn't exist
    output.parent.mkdir(parents=True, exist_ok=True)
    
    # Iterate through all JSON files in the directory
    for file_path in directory.glob("*.json"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                
                # Check if the file has the expected structure
                if "conversations" in data:
                    # Extend the conversations list with the new data
                    merged_data["conversations"].extend(data["conversations"])
                else:
                    print(f"Warning: File {file_path} does not have the expected structure")
                    
        except json.JSONDecodeError:
            print(f"Error: Could not parse JSON from {file_path}")
        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}")
    
    # Write the merged data to the specified output path
    try:
        with open(output, 'w', encoding='utf-8') as f:
            json.dump(merged_data, f, indent=2, ensure_ascii=False)
        print(f"Successfully created merged file at: {output}")
    except Exception as e:
        print(f"Error writing merged file: {str(e)}")

In [11]:
merge_json_files('Dataset/Processed/Shareholder Letters/', 'Dataset/Processed/Shareholder Letters/Letters.json')

Successfully created merged file at: Dataset/Processed/Shareholder Letters/Letters.json


In [15]:
shutil.copy2('Dataset/Processed/Shareholder Letters/Letters.json', 'Dataset/Processed/Ground Truth/Letters.json')
shutil.copy2('Dataset/Processed/Lessons for Corporate America/Lessons-for-Corporate-America.json', 'Dataset/Processed/Ground Truth/Lessons.json')
shutil.copy2('Dataset/Processed/Meeting Transcripts/Berkshire Meeting Transcripts - 1994 - 2022.json', 'Dataset/Processed/Ground Truth/Transcripts.json')

'Dataset/Processed/Merged/Transcripts.json'

In [16]:
merge_json_files('Dataset/Processed/Ground Truth/', 'Dataset/Processed/Ground Truth/dataset_combined.json')

Successfully created merged file at: Dataset/Processed/Ground Truth/dataset_combined.json


In [1]:
import json

def split_conversations(input_file, output_file):
    # Read the input JSON file
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    # New list to store split conversations
    new_conversations = []
    
    # Process each conversation group
    for conv_group in data['conversations']:
        # Get the inner conversations list
        inner_convs = conv_group[0]['conversations']
        
        # Process pairs of messages
        for i in range(0, len(inner_convs), 2):
            if i + 1 < len(inner_convs):  # Make sure we have a pair
                # Create a new conversation group with just this pair
                new_group = [{
                    "conversations": [
                        inner_convs[i],     # human message
                        inner_convs[i + 1]  # assistant message
                    ]
                }]
                new_conversations.append(new_group)
    
    # Create new JSON structure
    new_data = {
        "conversations": new_conversations
    }
    
    # Write to output file
    with open(output_file, 'w') as f:
        json.dump(new_data, f, indent=2)

In [3]:
input_file = '/notebooks/AlphaBuffet/Dataset/Processed/Ground Truth/dataset_combined.json'
output_file = '/notebooks/AlphaBuffet/Dataset/Processed/Ground Truth/dataset_combined_2.json'
split_conversations(input_file, output_file)