# Process SeekingAlpha Test Data (2025)

**Purpose:** Convert SeekingAlpha earnings-call transcript `.txt` files into speaker turns and structured Jamie Dimon Q&A pairs

**Input:** `../Data/SeekingAlpha transcripts 2025/jpmorgan_chase_2025q*.txt`

**Output:** 
- `../Processed Data/speaker_turns.csv` - All speaker turns from the Q&A section of each call
- `../Processed Data/test_qa_pairs.jsonl` - Jamie Dimon Q&A pairs

In [1]:
# Import libraries
import json
import re
import pandas as pd
from pathlib import Path
from collections import Counter
import shutil
from datetime import datetime

## 1. Extract Speaker Turns from Transcripts

In [2]:
# Speakers recognised in the Q&A section. The list carries every spelling
# variant used by the different transcript formats (short and full names).
_VALID_SPEAKERS = (
    'Operator',
    'Jeremy Barnum',
    'Ken Usdin',
    'Jamie Dimon',
    'James Dimon',
    'Erika Najarian',
    'John McDonald',
    'Matt O\'Connor',
    'Steven Chubak',
    'Gerard Cassidy',
    'Ebrahim Poonawala',
    'Jim Mitchell',
    'Betsy Graseck',
    'Mike Mayo',
    'James Mitchell',
    'Michael Mayo',
    'Glenn Schorr',
    'Saul Martinez',
    'Betsy Lynn Graseck',
    'Christoph M. Kotowski',
    'Christopher Edward McGratty',
    'Ebrahim Huseini Poonawala',
    'Gerard Sean Cassidy',
    'Glenn Paul Schorr',
    'James Francis Mitchell',
    'John Eamon McDonald',
    'Kenneth Michael Usdin',
    'Matthew Derek O\'Connor',
    'Michael Lawrence Mayo',
    'Steven A. Alexopoulos',
    'L. Erika Penala',
    'Kenneth Usdin',
    'Christopher McGratty',
)

# Speaker header: a blank line, then a known name alone on its line.
# Names are tried longest-first so a full name wins over a shorter variant.
# The trailing newline is only looked ahead at (not consumed) so that a header
# directly following an empty turn can still match its own leading "\n\n".
_SPEAKER_HEADER = re.compile(
    r'\n\n('
    + '|'.join(sorted((re.escape(name) for name in _VALID_SPEAKERS), key=len, reverse=True))
    + r')(?=\n)'
)

# Editor-style line-number prefix, e.g. "     1→".
_LINE_NUMBER_PREFIX = re.compile(r'^\s*\d+→')

# Title/company lines that may appear immediately after the speaker name.
_TITLE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    # Job titles
    r'^(Executive VP & CFO|Chairman & CEO|Chief Financial Officer|Chairman & Chief Executive Officer)\s*\n+',
    # Company names with "Research Division"
    r'^.+,\s*Research Division\s*\n+',
    # Investment banks and securities firms (without Research Division)
    r'^(?:[A-Z][a-zA-Z\s&,\.]+)?(?:Securities|Capital Markets|Investment Bank)(?:\s*,\s*Inc\.)?(?:\s*,\s*Research Division)?\s*\n+',
    # Specific patterns for Q3 format
    r'^[A-Z][a-zA-Z\s&,\.]+(?:LLC|LLP|Inc\.|L\.P\.)\s*,\s*Research Division\s*\n+',
    # Standalone company names like "Bernstein Autonomous LLP"
    r'^[A-Z][a-zA-Z\s&,\.]+(?:LLC|LLP|Inc\.|L\.P\.|LP)\s*\n+',
    # Other institutional names like "Seaport Research Partners"
    r'^[A-Z][a-z]+\s+(?:Research\s+)?(?:Autonomous|Partners|Group|Advisors|Associates)(?:\s+(?:LLC|LLP|Inc\.|L\.P\.))?\s*\n+',
))


def _strip_title_prefix(text):
    """Remove a leading title/company line from a speaker turn, if present."""
    first_line, newline, remainder = text.partition('\n')
    # Only the first line is ever tested. Running the patterns over the whole
    # turn (multi-line, unlimited replacements) deletes genuine content lines
    # that merely end in words such as "Capital Markets" or "Inc.".
    if newline and any(p.fullmatch(first_line + '\n') for p in _TITLE_PATTERNS):
        return remainder.strip()
    return text.strip()


def extract_speaker_turns(file_path):
    """
    Extract all speaker turns from the Q&A section of a transcript file.

    Speaker turn format:
        [blank line]
        Speaker Name
        [optional title line]
        [blank line]
        Speaking content...

    Args:
        file_path: Path (or path string) of the transcript text file.

    Returns:
        List of dictionaries with: quarter, date, turn_number, speaker, text,
        word_count. Empty list (after printing a warning) when the file has
        no "Question-and-Answer Session" marker or no recognised speaker.
    """
    # Accept plain strings too; the warnings below rely on Path.name.
    file_path = Path(file_path)

    with open(file_path, 'r', encoding='utf-8') as f:
        # Strip the prefix line by line so the leading \s* can never reach
        # across a blank line into the next one.
        full_text = ''.join(_LINE_NUMBER_PREFIX.sub('', line) for line in f)

    # Extract quarter (e.g., "Q1 2025" -> "2025Q1")
    quarter_match = re.search(r'Q(\d+)\s+(\d{4})\s+Earnings', full_text)
    quarter = f"{quarter_match.group(2)}Q{quarter_match.group(1)}" if quarter_match else "unknown"

    # Extract conference call date
    date_match = re.search(r'Earnings (?:Conference )?Call[^\n]*\n?([A-Z][a-z]+)\s+(\d{1,2}),\s+(\d{4})', full_text)
    conference_date = f"{date_match.group(1)} {date_match.group(2)} {date_match.group(3)}" if date_match else "unknown"

    # Everything before the Q&A marker is ignored
    qa_start = full_text.find("Question-and-Answer Session")
    if qa_start == -1:
        print(f"Warning: No Q&A section found in {file_path.name}")
        return []

    qa_text = full_text[qa_start:]

    speaker_matches = list(_SPEAKER_HEADER.finditer(qa_text))
    if not speaker_matches:
        print(f"Warning: No speaker turns found in {file_path.name}")
        return []

    # A turn runs from the end of its header to the start of the next header
    # (or to the end of the Q&A text for the last turn).
    turn_ends = [m.start() for m in speaker_matches[1:]] + [len(qa_text)]

    speaker_turns = []
    for match, text_end in zip(speaker_matches, turn_ends):
        text = _strip_title_prefix(qa_text[match.end():text_end].strip())

        # Skip turns with no spoken content
        if not text:
            continue

        speaker_turns.append({
            'quarter': quarter,
            'date': conference_date,
            # Numbered after skipping empty turns, so numbers stay contiguous
            'turn_number': len(speaker_turns) + 1,
            'speaker': match.group(1).strip(),
            'text': text,
            'word_count': len(text.split())
        })

    return speaker_turns

In [3]:
# Run the extractor over every 2025 JPMorgan transcript and report per-file counts
input_dir = Path("../Data/SeekingAlpha transcripts 2025")
txt_files = sorted(input_dir.glob("jpmorgan_chase_2025q*.txt"))

# Raw (un-normalized) names as they appear in the transcripts
ceo_names = ('Jamie Dimon', 'James Dimon')
non_analyst_names = ceo_names + ('Jeremy Barnum', 'Operator')

all_speaker_turns = []
print("Extracting speaker turns...\n")

for txt_file in txt_files:
    turns = extract_speaker_turns(txt_file)
    all_speaker_turns += turns

    # Tally this file's turns by speaker group
    speakers = [turn['speaker'] for turn in turns]
    jamie_count = len([name for name in speakers if name in ceo_names])
    jeremy_count = speakers.count('Jeremy Barnum')
    analyst_count = len([name for name in speakers if name not in non_analyst_names])

    print(f"{txt_file.name}:")
    print(f"  Total turns: {len(turns)}")
    print(f"  Jamie/James: {jamie_count} | Jeremy: {jeremy_count} | Analysts: {analyst_count}")

print(f"\nTotal speaker turns extracted: {len(all_speaker_turns)}")

Extracting speaker turns...

jpmorgan_chase_2025q1.txt:
  Total turns: 108
  Jamie/James: 23 | Jeremy: 32 | Analysts: 39
jpmorgan_chase_2025q2.txt:
  Total turns: 128
  Jamie/James: 34 | Jeremy: 39 | Analysts: 39
jpmorgan_chase_2025q3.txt:
  Total turns: 60
  Jamie/James: 11 | Jeremy: 20 | Analysts: 18

Total speaker turns extracted: 296


## 2. Save Speaker Turns to CSV

In [4]:
# Build one DataFrame row per extracted speaker turn (each dict in
# all_speaker_turns becomes a row); the frame is written to CSV later in this cell
df_turns = pd.DataFrame(all_speaker_turns)

# Collapse the spelling variants of the two executives into one name each
def normalize_name(speaker):
    """Return the canonical name for a Dimon/Barnum variant, else the input unchanged."""
    canonical_by_surname = (
        ('Dimon', 'Jamie Dimon'),
        ('Barnum', 'Jeremy Barnum'),
    )
    # Checked in order, so a name containing "Dimon" always maps to the CEO
    for surname, canonical in canonical_by_surname:
        if surname in speaker:
            return canonical
    return speaker

# Overwrite the speaker column in place with the canonical names, so the
# exact-match comparisons on 'Jamie Dimon' / 'Jeremy Barnum' below are reliable
df_turns['speaker'] = df_turns['speaker'].apply(normalize_name)

# Derive a coarse role label per speaker to make filtering easier
def classify_speaker(speaker):
    """Return 'CEO', 'CFO' or 'Operator' for the known names; anyone else is an 'Analyst'."""
    known_roles = (
        ('Jamie Dimon', 'CEO'),
        ('Jeremy Barnum', 'CFO'),
        ('Operator', 'Operator'),
    )
    return next((role for name, role in known_roles if speaker == name), 'Analyst')

# Label every turn with its speaker role
df_turns['speaker_type'] = df_turns['speaker'].apply(classify_speaker)

# Write the turns table next to the other processed outputs
output_csv = Path("../Processed Data/speaker_turns.csv")
output_csv.parent.mkdir(exist_ok=True)
df_turns.to_csv(output_csv, index=False)

# Summary: totals, then counts per role and per quarter/role
rule = '=' * 80
print(rule)
print(f"SAVED: {output_csv}")
print(rule)
print(f"Total turns: {len(df_turns)}")

print("\nBreakdown by speaker type:")
type_counts = df_turns['speaker_type'].value_counts()
print(type_counts)

print("\nBreakdown by quarter:")
quarter_by_type = df_turns.groupby(['quarter', 'speaker_type']).size().unstack(fill_value=0)
print(quarter_by_type)

SAVED: ../Processed Data/speaker_turns.csv
Total turns: 296

Breakdown by speaker type:
speaker_type
Analyst     96
CFO         91
CEO         68
Operator    41
Name: count, dtype: int64

Breakdown by quarter:
speaker_type  Analyst  CEO  CFO  Operator
quarter                                  
2025Q1             39   23   32        14
2025Q2             39   34   39        16
2025Q3             18   11   20        11


## 3. Extract Jamie Dimon Q&A Pairs

In [5]:
def extract_qa_pairs(df_turns, min_words=20):
    """
    Pair each analyst turn with the CEO turns that follow it.

    Turns are processed quarter by quarter, in row order. Every analyst turn
    opens a new question; all CEO turns up to the next analyst turn are joined
    (blank line between them) into one answer. Turns by other speakers in
    between are ignored, as are CEO turns before a quarter's first analyst.

    Args:
        df_turns: DataFrame with columns quarter, date, speaker, text and
            speaker_type ('Analyst', 'CEO', ...).
        min_words: A pair is kept only if the combined answer has strictly
            more than this many whitespace-separated words.

    Returns:
        List of dicts with keys: quarter, date, analyst, question, answer.
    """
    qa_pairs = []

    for quarter in df_turns['quarter'].unique():
        in_quarter = df_turns[df_turns['quarter'] == quarter]
        rows = zip(
            in_quarter['speaker_type'],
            in_quarter['speaker'],
            in_quarter['text'],
            in_quarter['date'],
        )

        # One entry per analyst turn: (date, analyst, question, [ceo replies])
        exchanges = []
        for role, speaker, text, date in rows:
            if role == 'Analyst':
                exchanges.append((date, speaker, text, []))
            elif role == 'CEO' and exchanges:
                exchanges[-1][3].append(text)

        for date, analyst, question, replies in exchanges:
            if not replies:
                continue
            answer = '\n\n'.join(replies)
            # Strictly greater-than: an answer of exactly min_words is dropped
            if len(answer.split()) > min_words:
                qa_pairs.append({
                    'quarter': quarter,
                    'date': date,
                    'analyst': analyst,
                    'question': question,
                    'answer': answer
                })

    return qa_pairs

In [6]:
# Build the Jamie Dimon Q&A pairs and report how many came from each quarter
print("Extracting Jamie Dimon Q&A pairs (>20 words)...\n")

qa_pairs = extract_qa_pairs(df_turns, min_words=20)

print(f"Total Q&A pairs extracted: {len(qa_pairs)}")
print("\nBreakdown by quarter:")
quarter_counts = Counter(pair['quarter'] for pair in qa_pairs)
for quarter, pair_count in sorted(quarter_counts.items()):
    print(f"  {quarter}: {pair_count} Q&A pairs")

Extracting Jamie Dimon Q&A pairs (>20 words)...

Total Q&A pairs extracted: 43

Breakdown by quarter:
  2025Q1: 16 Q&A pairs
  2025Q2: 19 Q&A pairs
  2025Q3: 8 Q&A pairs


## 4. Save Q&A Pairs to JSONL

In [7]:
# Write the Q&A pairs as JSON Lines (one JSON object per line)
output_file = Path("../Processed Data/test_qa_pairs.jsonl")
output_file.parent.mkdir(exist_ok=True)

# Keep a timestamped copy of any previous output before overwriting it
if output_file.exists():
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_file = output_file.with_name(f"test_qa_pairs_backup_{timestamp}.jsonl")
    shutil.copy(output_file, backup_file)
    print(f"Backup created: {backup_file.name}\n")

with open(output_file, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file
    f.writelines(json.dumps(pair, ensure_ascii=False) + '\n' for pair in qa_pairs)

rule = '=' * 80
print(rule)
print(f"SAVED: {output_file}")
print(rule)
print(f"Total Q&A pairs: {len(qa_pairs)}")

SAVED: ../Processed Data/test_qa_pairs.jsonl
Total Q&A pairs: 43
