# Multi-Analyst Text Analysis Pipeline

This notebook demonstrates the full pipeline for analyzing text through multiple specialist lenses (rhetorician, syntactician, lexicologist, etc.) and synthesizing their observations.

In [1]:
# Optional: Install requirements if running in a fresh kernel
# Uncomment if needed:
!pip install -r requirements.txt

# Or install individual packages:
# !pip install litellm pydantic jinja2



[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import os
import shutil
import subprocess
import sys

print("=== ENVIRONMENT DIAGNOSIS ===\n")

# 1. Which interpreter is the Jupyter kernel running?
print(f"Jupyter kernel Python: {sys.executable}")
print(f"Python version: {sys.version}\n")

# 2. What do the bare commands `python`, `python3` and `pip` resolve to on PATH?
#    shutil.which performs the lookup in-process, so the cell does not depend
#    on an external `which` executable; a command that is not found prints as
#    an empty string.
print(f"System Python: {shutil.which('python') or ''}")
print(f"System Python3: {shutil.which('python3') or ''}")

pip_path = shutil.which('pip')
print(f"Pip location: {pip_path or ''}\n")

# 3. Where does the PATH `pip` keep litellm?
#    Skip the subprocess entirely when there is no pip on PATH instead of
#    letting subprocess.run raise FileNotFoundError.
pip_show = None
if pip_path:
    pip_show = subprocess.run([pip_path, 'show', 'litellm'], capture_output=True, text=True)

if pip_show is not None and pip_show.returncode == 0:
    print("Litellm is installed via pip:")
    for line in pip_show.stdout.split('\n'):
        # Match the field name at the start of the line so that other fields
        # that merely contain the word "Location" are not printed as well.
        if line.startswith('Location:'):
            print(f"  {line}")
else:
    print("Litellm NOT found via pip show\n")

# 4. Which site-packages directories are on the kernel's import path?
#    If the Location printed in step 3 is not in this list, the PATH pip is
#    installing into a different environment than the kernel imports from.
print("\nJupyter's Python can see these site-packages:")
for path in sys.path:
    if 'site-packages' in path:
        print(f"  {path}")

=== ENVIRONMENT DIAGNOSIS ===

Jupyter kernel Python: /Users/andersohrn/PycharmProjects/yolo_playnice/bin/python3
Python version: 3.9.19 (main, Mar 19 2024, 16:08:27) 
[Clang 15.0.0 (clang-1500.3.9.4)]

System Python: /Users/andersohrn/opt/anaconda3/bin/python
System Python3: /Users/andersohrn/opt/anaconda3/bin/python3
Pip location: /Users/andersohrn/opt/anaconda3/bin/pip

Litellm is installed via pip:
  Location: /Users/andersohrn/opt/anaconda3/lib/python3.8/site-packages

Jupyter's Python can see these site-packages:
  /Users/andersohrn/PycharmProjects/yolo_playnice/lib/python3.9/site-packages


In [3]:
import os
import sys
from importlib import metadata

# Verify that litellm can be imported by the kernel's interpreter.
try:
    import litellm
except ImportError as e:
    print(f"✗ Cannot import litellm: {e}")
else:
    # Prefer the module's own __version__; if the module does not expose one,
    # ask the installed distribution metadata instead of letting an
    # AttributeError (which `except ImportError` would not catch) abort the cell.
    litellm_version = getattr(litellm, "__version__", None)
    if litellm_version is None:
        try:
            litellm_version = metadata.version("litellm")
        except metadata.PackageNotFoundError:
            litellm_version = "unknown"
    print(f"✓ litellm {litellm_version} is installed and importable")

# Check that the project's src directory is reachable from the working directory
print(f"\nCurrent directory: {os.getcwd()}")
print(f"src folder exists: {os.path.exists('src')}")
print(f"src/llm.py exists: {os.path.exists('src/llm.py')}")

# Show which interpreter the kernel is running
print(f"\nPython executable: {sys.executable}")

✗ Cannot import litellm: No module named 'litellm'

Current directory: /Users/andersohrn/PycharmProjects/russell_writes
src folder exists: True
src/llm.py exists: True

Python executable: /Users/andersohrn/PycharmProjects/yolo_playnice/bin/python3


## Initialize Objects
Set up connections to a Large Language Model provider via the `litellm` model router. Also set up the tools that retrieve the text data placed in the context window, that is, the instructions and the texts to analyze.

In [None]:
import os

from src.llm import LLM
from src.models.llm_config_models import LLMConfig
from src.models.prompt_models import (
    PreambleInstructionConfig,
    PreambleTextConfig,
    RhetoricianConfig,
    SyntacticianConfig,
    LexicologistConfig,
    InformationArchitectConfig,
    EfficiencyAuditorConfig,
    PatternRecognizerTextConfig,
)
from src.prompt_maker import PromptMaker
from src.data_sampler import DataSampler

# Names under which the specialist analyses are saved in Step 2. Later cells
# pass this list to `store.is_complete(...)` to check that none is missing.
ANALYSTS = [
    "rhetorician",
    "syntactician",
    "lexicologist",
    "information_architect",
    "efficiency_auditor",
]

# LLM client; the API key is read from the environment, never hardcoded.
llm = LLM(LLMConfig(
    model="mistral/mistral-large-2411",
    api_key=os.environ.get("MISTRAL_API_KEY")
))

# The cells below render prompts through `maker`; `prompt_maker` is kept as an
# alias so that code using the earlier name keeps working.
maker = PromptMaker()
prompt_maker = maker
sampler = DataSampler()

# NOTE(review): every later cell uses a `store` object (save_sample,
# save_analysis, list_samples, ...), but nothing in this notebook creates it.
# TODO: import and instantiate the project's result store here so the
# notebook survives Restart Kernel -> Run All.


In [None]:
# Scratch cell intentionally left without code: the bare name `test` it used to
# hold is undefined and raised NameError during Restart Kernel -> Run All.

## Step 1: Generate and Store Sample

Sample text from the data corpus and store it with full provenance (which file, which paragraphs).

In [None]:
# Option 1: Random sample (weighted by file length)
# paragraphs = sampler.sample_segment(p_length=10)
# file_index = None  # Random sampling doesn't track which file

# Option 2: an explicit file and paragraph window, so provenance can be recorded
file_index = 0
# NOTE(review): a Python slice excludes its stop index, so if the sampler
# indexes the paragraph list with this slice directly the chunk covers
# paragraphs 10-19 -- confirm against `get_paragraph_chunk`.
paragraph_range = slice(10, 20)
paragraphs = sampler.get_paragraph_chunk(file_index, paragraph_range)
text = "\n\n".join(paragraphs)

# Next sequential ID, zero-padded to three digits (sample_001, sample_002, ...)
stored_count = len(store.list_samples())
sample_id = f"sample_{stored_count + 1:03d}"

# Persist the text together with where it came from
provenance = {
    "file_index": file_index,
    "paragraph_start": paragraph_range.start,
    "paragraph_end": paragraph_range.stop,
}
store.save_sample(sample_id=sample_id, text=text, **provenance)

print(
    f"Created {sample_id}",
    f"Text length: {len(text)} characters",
    f"First 200 chars: {text[:200]}...",
    sep="\n",
)

## Step 2: Run Multi-Analyst Pipeline

Send the text through each specialist analyst. Each produces an independent analysis from their domain expertise.

**Prompt structure for caching optimization:**
1. Preamble instruction (static)
2. Analyst-specific template (static per analyst)
3. Text to analyze (dynamic)

In [None]:
# Get the sample text
sample = store.get_sample(sample_id)
text = sample.text

# Build shared prompt components (reused across all analysts)
preamble_instruction = maker.render(PreambleInstructionConfig())
preamble_text = maker.render(PreambleTextConfig(text_to_analyze=text))

# Analyst name -> prompt config class. The name doubles as the key under which
# the analysis is stored; each config is instantiated with its defaults.
# Insertion order is the order in which the analysts run.
analyst_configs = {
    "rhetorician": RhetoricianConfig,
    "syntactician": SyntacticianConfig,
    "lexicologist": LexicologistConfig,
    "information_architect": InformationArchitectConfig,
    "efficiency_auditor": EfficiencyAuditorConfig,
}

for analyst_name, config_cls in analyst_configs.items():
    print(f"Running {analyst_name}...", end=" ")
    analyst_prompt = maker.render(config_cls())
    # Static parts first (instruction, analyst template), the text last,
    # matching the caching-oriented prompt layout described above this cell.
    full_prompt = f"{preamble_instruction}\n\n{analyst_prompt}\n\n{preamble_text}"
    response = llm.complete(full_prompt)
    store.save_analysis(sample_id, analyst_name, response.content, response.model)
    print(f"✓ ({len(response.content)} chars)")

print(f"\nAll analyses complete for {sample_id}")

## Step 3: Retrieve and Examine Results

Check what's been stored and verify all analyses are present.

In [None]:
# Confirm that every analyst listed in ANALYSTS has a stored analysis
is_complete = store.is_complete(sample_id, ANALYSTS)
print(f"Analysis complete: {is_complete}")

# Load the sample together with whatever analyses exist for it
sample, analyses = store.get_sample_with_analyses(sample_id)

source_line = (
    f"Source: File {sample.file_index}, "
    f"paragraphs {sample.paragraph_start}-{sample.paragraph_end}"
)
print(
    f"\nSample: {sample.sample_id}",
    source_line,
    f"Analyses available: {list(analyses.keys())}",
    sep="\n",
)

# Peek at one analysis; falls back to a placeholder when it is absent
rhetorician_excerpt = analyses.get("rhetorician", "Not found")[:500]
print("\n--- Rhetorician Output (first 500 chars) ---")
print(rhetorician_excerpt)

## Step 4: Pattern Recognition (Cross-Perspective Integration)

Synthesize all analyst perspectives to identify interactions, tensions, and load-bearing features.

In [None]:
# Load the sample and every stored analysis for it
sample, analyses = store.get_sample_with_analyses(sample_id)

# (heading, storage key) for each specialist, in report order
report_sections = (
    ("RHETORICIAN", "rhetorician"),
    ("SYNTACTICIAN", "syntactician"),
    ("LEXICOLOGIST", "lexicologist"),
    ("INFORMATION ARCHITECT", "information_architect"),
    ("EFFICIENCY AUDITOR", "efficiency_auditor"),
)

# One "**HEADING:**" block per specialist, separated by blank lines and
# terminated by a single newline. Indexing with [] means a missing analysis
# raises KeyError rather than being silently left out of the prompt.
specialist_analyses = "\n\n".join(
    f"**{heading}:**\n{analyses[key]}" for heading, key in report_sections
) + "\n"

# Render the pattern recognizer prompt through PromptMaker
pattern_config = PatternRecognizerTextConfig(
    original_text=sample.text,
    specialist_analyses=specialist_analyses,
)
pattern_prompt = maker.render(pattern_config)

# Ask the model for the cross-perspective integration
print("Running pattern recognizer...", end=" ")
pattern_response = llm.complete(pattern_prompt)
synthesis = pattern_response.content
print(f"✓ ({len(synthesis)} chars)")

# Show the beginning of the synthesis
print("\n--- Pattern Recognition Output (first 1000 chars) ---")
print(synthesis[:1000])

## Utilities: Working with Stored Samples

Helper snippets for browsing and managing stored results.

In [None]:
# Enumerate every sample currently held in the store
all_samples = store.list_samples()
print(
    f"Total samples: {len(all_samples)}",
    f"Sample IDs: {all_samples}",
    sep="\n",
)

# One line per sample: ✓ when all ANALYSTS have a stored analysis, ✗ otherwise
print("\nCompletion status:")
for sid in all_samples:
    marker = "✓" if store.is_complete(sid, ANALYSTS) else "✗"
    print(f"  {marker} {sid}")

# Close database connection when done
# store.close()