# Customer Intelligence Pipeline — Interactive Workbench

This notebook provides three capabilities:

1. **End-to-end demo** — Run the full extraction pipeline on any transcript and inspect all 4 signal layers
2. **Prompt tuning** — Edit extraction prompts, re-run individual layers, and compare results side-by-side
3. **Evaluation** — Measure extraction quality against ground truth using the same metrics as the test suite

### Prerequisites

- `ANTHROPIC_API_KEY` set in environment or `.env` file
- Synthetic corpus generated (`python -m customer_intelligence.synthetic.generator`)
- Package dependencies installed (`pip install -e .`)

### Cost awareness

Each extraction layer makes one LLM API call to `claude-opus-4-6`. A full extraction is 3–4 calls.
The corpus-level evaluation (Section 7) runs extraction on the 7 transcripts that have ground truth (~25 API calls total).

In [1]:
import html
import json
import sys
from datetime import datetime, timezone
from pathlib import Path

import anthropic
from dotenv import load_dotenv
from IPython.display import display, HTML, Markdown

# Project paths. Path("..") is resolved against the kernel's current working
# directory, so these only point at the right place when the kernel runs from
# the notebook's own directory (one level below the project root).
PROJECT_ROOT = Path("..").resolve()
DATA_DIR = PROJECT_ROOT / "data"
TRANSCRIPTS_DIR = DATA_DIR / "transcripts"
GROUND_TRUTH_DIR = DATA_DIR / "ground_truth"
EXTRACTIONS_DIR = DATA_DIR / "extractions"
PROMPTS_DIR = PROJECT_ROOT / "src" / "customer_intelligence" / "extraction" / "prompts"

# Ensure the package is importable: put <project>/src at the front of the
# module search path so the customer_intelligence imports below resolve to
# this checkout.
sys.path.insert(0, str(PROJECT_ROOT / "src"))

from customer_intelligence.schemas.transcript import Transcript
from customer_intelligence.schemas.extraction import ExtractionResult
from customer_intelligence.schemas.surface import SurfaceSignals
from customer_intelligence.schemas.behavioral import BehavioralSignals
from customer_intelligence.schemas.psychographic import PsychographicSignals
from customer_intelligence.schemas.multimodal import MultimodalSignals
from customer_intelligence.schemas.summary import TranscriptSummary
from customer_intelligence.extraction.extractor import (
    extract,
    extract_summary,
    _extract_layer,
    _format_transcript,
    _coerce_to_schema,
    _has_paralinguistic,
    _parse_json_response,
    MODEL,
)
from customer_intelligence.extraction.prompts import (
    load_prompt,
    SURFACE_EXTRACTION_PROMPT,
    BEHAVIORAL_EXTRACTION_PROMPT,
    PSYCHOGRAPHIC_EXTRACTION_PROMPT,
    MULTIMODAL_DIVERGENCE_PROMPT,
    SUMMARY_PROMPT,
)

# Read variables from <project>/.env; per the prerequisites this is one of the
# two places ANTHROPIC_API_KEY may be set. The call's return value is what the
# cell echoes as its output.
load_dotenv(PROJECT_ROOT / ".env")

True

In [2]:
# Shared API client, passed explicitly to the extraction calls further down.
client = anthropic.Anthropic()

# Sanity report: which model is configured and how many JSON files each
# corpus directory currently holds.
print("Anthropic client initialized")
print(f"Model: {MODEL}")
print(
    f"Transcripts available: {sum(1 for _ in TRANSCRIPTS_DIR.glob('*.json'))}",
    f"Ground truth available: {sum(1 for _ in GROUND_TRUTH_DIR.glob('*.json'))}",
    f"Existing extractions: {sum(1 for _ in EXTRACTIONS_DIR.glob('*.json'))}",
    sep="\n",
)

Anthropic client initialized
Model: claude-opus-4-6
Transcripts available: 10
Ground truth available: 7
Existing extractions: 1


---
## Display Helpers

Utility functions for rendering extraction results as formatted tables and structured output.
These are used throughout the notebook — run this cell before any display calls.

In [3]:
def pretty_json(obj):
    """Print ``obj`` as JSON indented by two spaces.

    Objects that expose ``model_dump`` (e.g. Pydantic models) are converted
    through it first; anything else is serialised as given. Values the JSON
    encoder cannot handle natively are rendered with ``str()``.
    """
    payload = obj.model_dump() if hasattr(obj, "model_dump") else obj
    rendered = json.dumps(payload, indent=2, default=str)
    print(rendered)


def format_table(headers: list[str], rows: list[list]) -> str:
    """Build a Markdown table string.

    Args:
        headers: Column titles, one per column.
        rows: Table rows; every cell is converted with ``str()``.

    Returns:
        The header line, the ``---`` separator line and one line per row,
        joined with newlines (no trailing newline).

    Cell and header text is sanitised so it cannot break the table layout:
    pipe characters are backslash-escaped (an unescaped pipe would start a new
    column) and line breaks become ``<br>`` (a table row must stay on a single
    line). Text without pipes or line breaks is emitted unchanged.
    """

    def _cell(value) -> str:
        # Escape the column delimiter first, then flatten any line breaks.
        text = str(value).replace("|", "\\|")
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        return text.replace("\n", "<br>")

    def _line(cells) -> str:
        return "| " + " | ".join(cells) + " |"

    lines = [
        _line(_cell(h) for h in headers),
        _line(["---"] * len(headers)),
    ]
    lines.extend(_line(_cell(c) for c in row) for row in rows)
    return "\n".join(lines)


def display_table(headers, rows):
    """Show ``headers`` and ``rows`` as a rendered Markdown table in the cell output."""
    table_markdown = format_table(headers, rows)
    display(Markdown(table_markdown))


def display_surface(surface: SurfaceSignals):
    """Render the Layer 1 (surface) signals as four Markdown tables.

    Sections appear in a fixed order: aspect sentiments, topics, named
    entities, key phrases. Context text is clipped (80 characters for aspects,
    60 for key phrases); a missing context or entity role is shown as an empty
    cell.
    """
    display(Markdown("### Aspect Sentiments"))
    aspect_rows = []
    for item in surface.aspects:
        aspect_rows.append(
            [item.aspect, item.sentiment, f"{item.intensity:.2f}", (item.context or "")[:80]]
        )
    display_table(["Aspect", "Sentiment", "Intensity", "Context"], aspect_rows)

    display(Markdown("### Topics"))
    topic_rows = []
    for topic in surface.topics:
        topic_rows.append([topic.name, topic.timeline_position, f"{topic.relevance:.2f}"])
    display_table(["Topic", "Position", "Relevance"], topic_rows)

    display(Markdown("### Named Entities"))
    entity_rows = []
    for entity in surface.entities:
        entity_rows.append(
            [entity.name, entity.entity_type, entity.role or "", entity.mention_count]
        )
    display_table(["Name", "Type", "Role", "Mentions"], entity_rows)

    display(Markdown("### Key Phrases"))
    phrase_rows = []
    for key_phrase in surface.key_phrases:
        phrase_rows.append(
            [key_phrase.phrase, f"{key_phrase.relevance:.2f}", (key_phrase.context or "")[:60]]
        )
    display_table(["Phrase", "Relevance", "Context"], phrase_rows)


def display_behavioral(behavioral: BehavioralSignals):
    """Render the Layer 2 (behavioral) signals.

    Objection triples are printed one at a time (the resolution line is left
    out when a triple has no resolution). Buying-intent markers, competitive
    mentions and the engagement trajectory are shown as tables; evidence is
    clipped to 80 characters and competitive context to 60. When there are no
    competitive mentions a placeholder line is printed instead of a table.
    """
    display(Markdown("### Objection Triples"))
    for number, triple in enumerate(behavioral.objection_triples, start=1):
        objection = triple.objection
        resolution = triple.resolution
        outcome = triple.outcome
        display(Markdown(
            f"**Objection {number}** ({objection.type}, {objection.conversation_stage})"
        ))
        print(f"  Speaker: {objection.speaker_role}")
        print(f'  Language: "{objection.specific_language}"')
        if resolution:
            print(f'  Resolution: {resolution.type} — "{resolution.specific_language}"')
        print(f"  Outcome: resolved={outcome.resolved}, progressed={outcome.deal_progressed}")
        print(f"  Confidence: {triple.confidence:.2f}")
        print()

    display(Markdown("### Buying Intent Markers"))
    marker_rows = []
    for marker in behavioral.buying_intent_markers:
        marker_rows.append([marker.type, marker.evidence[:80], f"{marker.confidence:.2f}"])
    display_table(["Type", "Evidence", "Confidence"], marker_rows)

    display(Markdown("### Competitive Mentions"))
    if not behavioral.competitive_mentions:
        print("  (none detected)")
    else:
        mention_rows = []
        for mention in behavioral.competitive_mentions:
            mention_rows.append(
                [mention.competitor, (mention.context or "")[:60], mention.sentiment]
            )
        display_table(["Competitor", "Context", "Sentiment"], mention_rows)

    display(Markdown("### Engagement Trajectory"))
    phase_rows = []
    for phase in behavioral.engagement_trajectory:
        phase_rows.append([
            phase.phase,
            phase.participation_level,
            phase.question_depth,
            phase.energy,
            phase.notes or "",
        ])
    display_table(["Phase", "Participation", "Question Depth", "Energy", "Notes"], phase_rows)


def display_psychographic(psychographic: PsychographicSignals):
    """Render the Layer 3 (psychographic) signals as plain-text sections.

    Prints the mental model, then each persona indicator (followed by a blank
    line), then the language fingerprint. Each section is introduced by a
    Markdown heading.
    """
    mental_model = psychographic.mental_model
    display(Markdown("### Mental Model"))
    print(
        f"  Primary: {mental_model.primary}",
        f"  Secondary: {mental_model.secondary}",
        f"  Confidence: {mental_model.confidence:.2f}",
        f"  Reasoning: {mental_model.reasoning}",
        f"  Evidence: {mental_model.evidence}",
        sep="\n",
    )

    display(Markdown("### Persona Indicators"))
    for persona in psychographic.persona_indicators:
        # end="\n\n" supplies the blank separator line after each persona.
        print(
            f"  {persona.archetype} (confidence={persona.confidence:.2f})",
            f"    Reasoning: {persona.reasoning}",
            f"    Evidence: {persona.evidence}",
            sep="\n",
            end="\n\n",
        )

    display(Markdown("### Language Fingerprint"))
    fingerprint = psychographic.language_fingerprint
    print(
        f"  Vocabulary: {fingerprint.distinctive_vocabulary}",
        f"  Metaphors: {fingerprint.metaphors}",
        f"  Framing: {fingerprint.framing_patterns}",
        sep="\n",
    )


def display_multimodal(multimodal: MultimodalSignals | None):
    """Render the Layer 4 (multimodal) signals as two Markdown tables.

    When ``multimodal`` is ``None`` a single explanatory line is printed and
    nothing else is shown. Otherwise the divergence signals (interpretation
    clipped to 60 characters, nonverbal cues comma-joined) and the composite
    sentiments are tabulated.
    """
    if multimodal is None:
        print("  (No multimodal signals — transcript lacks paralinguistic annotations)")
        return

    display(Markdown("### Divergence Signals"))
    divergence_rows = []
    for divergence in multimodal.divergences:
        divergence_rows.append([
            divergence.utterance_index,
            divergence.type,
            divergence.text_sentiment,
            ", ".join(divergence.nonverbal_cues),
            divergence.interpretation[:60],
            f"{divergence.confidence:.2f}",
        ])
    display_table(
        ["Utterance", "Type", "Text Sentiment", "Cues", "Interpretation", "Confidence"],
        divergence_rows,
    )

    display(Markdown("### Composite Sentiments"))
    composite_rows = []
    for composite in multimodal.composite_sentiments:
        composite_rows.append([
            composite.utterance_index,
            composite.original_text_polarity,
            composite.adjusted_polarity,
            f"{composite.confidence:.2f}",
            composite.note or "",
        ])
    display_table(["Utterance", "Original", "Adjusted", "Confidence", "Note"], composite_rows)


def display_result_summary(result: ExtractionResult):
    """Show a one-line Markdown digest of an ``ExtractionResult``.

    The line lists the transcript ID, overall confidence, the number of
    topics, entities, objection triples and buying-intent markers, and whether
    multimodal signals are present. Any notes attached to the result are
    printed underneath, one per line.
    """
    fields = [
        f"**{result.transcript_id}**",
        f"confidence={result.overall_confidence:.2f}",
        f"topics={len(result.surface.topics)}",
        f"entities={len(result.surface.entities)}",
        f"objections={len(result.behavioral.objection_triples)}",
        f"intent_markers={len(result.behavioral.buying_intent_markers)}",
        f"multimodal={'yes' if result.multimodal else 'no'}",
    ]
    display(Markdown(" | ".join(fields)))

    # A missing or empty notes collection simply yields no lines.
    for note in result.notes or ():
        print(f"  Note: {note}")


def display_summary(summary: TranscriptSummary):
    """Render every section of a transcript summary.

    Order: executive summary, key moments, action items, prospect priorities,
    concerns to address. Action items are tabulated; the other lists are
    printed as numbered lines starting at 1. Empty action items or concerns
    are replaced by a placeholder line.
    """
    display(Markdown("### Executive Summary"))
    display(Markdown(summary.executive_summary))

    display(Markdown("### Key Moments"))
    for position, moment in enumerate(summary.key_moments, start=1):
        heading = f"**{position}. {moment.moment_type.upper()}** (turns {moment.turn_indices})"
        display(Markdown(heading))
        print(f"  {moment.description}")
        print(f"  Significance: {moment.significance}\n")

    display(Markdown("### Action Items"))
    if not summary.action_items:
        print("  (none identified)")
    else:
        action_rows = []
        for item in summary.action_items:
            action_rows.append([item.action, item.owner, item.criticality])
        display_table(["Action", "Owner", "Criticality"], action_rows)

    display(Markdown("### Prospect Priorities"))
    for position, priority in enumerate(summary.prospect_priorities, start=1):
        print(f"  {position}. {priority}")

    display(Markdown("### Concerns to Address"))
    if not summary.concerns_to_address:
        print("  (none identified)")
        return
    for position, concern in enumerate(summary.concerns_to_address, start=1):
        print(f"  {position}. {concern}")


# Visible confirmation that this cell ran and the helpers above are defined.
print("Display helpers loaded.")

Display helpers loaded.


---
## Data Loading

Load transcripts and ground truth from the synthetic corpus. The corpus contains
10 transcripts across 8 accounts, covering won/lost/stalled outcomes, with and
without paralinguistic annotations; 7 of them (across 5 accounts) have ground truth.

If you haven't generated the corpus yet, run:
```bash
python -m customer_intelligence.synthetic.generator
```

In [4]:
def _load_models(directory, model_cls, key_of):
    """Parse each ``*.json`` file in ``directory`` (in sorted path order) with
    ``model_cls.model_validate_json`` and index the results by ``key_of(model)``.

    Later files overwrite earlier ones that map to the same key.
    """
    loaded = {}
    for json_path in sorted(directory.glob("*.json")):
        model = model_cls.model_validate_json(json_path.read_text())
        loaded[key_of(model)] = model
    return loaded


# Transcripts are keyed by call ID.
transcripts = _load_models(TRANSCRIPTS_DIR, Transcript, lambda item: item.call_metadata.call_id)

# Ground truth is keyed by the transcript ID it describes.
ground_truths = _load_models(GROUND_TRUTH_DIR, ExtractionResult, lambda item: item.transcript_id)

# Previously saved extractions; the directory is created if it is missing.
EXTRACTIONS_DIR.mkdir(parents=True, exist_ok=True)
extractions = _load_models(EXTRACTIONS_DIR, ExtractionResult, lambda item: item.transcript_id)

print(f"Loaded {len(transcripts)} transcripts, {len(ground_truths)} ground truths, {len(extractions)} existing extractions")

Loaded 10 transcripts, 7 ground truths, 1 existing extractions


In [5]:
display(Markdown("### Corpus Overview"))

# One row per loaded transcript: identity, account profile, number of turns,
# and Yes/No flags for paralinguistic annotations and available ground truth.
rows = [
    [
        cid,
        call.account.company_name,
        call.account.company_size,
        call.account.deal_outcome,
        len(call.utterances),
        "Yes" if _has_paralinguistic(call) else "No",
        "Yes" if cid in ground_truths else "No",
    ]
    for cid, call in transcripts.items()
]

overview_headers = [
    "Call ID", "Company", "Size", "Outcome", "Turns", "Paralinguistic", "Ground Truth",
]
display_table(overview_headers, rows)

### Corpus Overview

| Call ID | Company | Size | Outcome | Turns | Paralinguistic | Ground Truth |
| --- | --- | --- | --- | --- | --- | --- |
| cloudfirst_analytics_call1 | CloudFirst Analytics | smb | won | 28 | No | No |
| growthco_call1 | GrowthCo | startup | won | 22 | No | Yes |
| legacy_systems_corp_call1 | Legacy Systems Corp | enterprise | lost | 83 | No | Yes |
| legacy_systems_corp_call2 | Legacy Systems Corp | enterprise | lost | 55 | No | Yes |
| meridian_healthcare_call1 | Meridian Healthcare | enterprise | stalled | 48 | Yes | No |
| safeguard_inc_call1 | SafeGuard Inc | enterprise | stalled | 49 | Yes | Yes |
| scaleup_ltd_call1 | ScaleUp Ltd | smb | won | 36 | Yes | Yes |
| scaleup_ltd_call2 | ScaleUp Ltd | smb | won | 27 | Yes | Yes |
| techcorp_call1 | TechCorp | mid_market | won | 37 | Yes | Yes |
| velocity_logistics_call1 | Velocity Logistics | mid_market | won | 37 | Yes | No |

In [6]:
# NOTE(review): this repeats the default selection made in the transcript
# selection cell that follows; it looks like a leftover scratch cell — confirm
# and remove.
SELECTED_CALL_ID = list(transcripts.keys())[0]

In [8]:
# Fail early with an actionable message instead of a bare IndexError when the
# corpus has not been generated yet.
if not transcripts:
    raise RuntimeError(
        "No transcripts loaded. Generate the corpus first: "
        "python -m customer_intelligence.synthetic.generator"
    )

# === CHANGE THIS to select a different transcript ===
# Default: the first call ID in load order (transcripts are loaded in sorted
# file-path order).
SELECTED_CALL_ID = next(iter(transcripts))

# Guard against a mistyped ID once the line above has been edited by hand.
if SELECTED_CALL_ID not in transcripts:
    raise KeyError(
        f"Unknown call ID {SELECTED_CALL_ID!r}. Available: {', '.join(transcripts)}"
    )

transcript = transcripts[SELECTED_CALL_ID]
print(f"Selected: {SELECTED_CALL_ID}")
print(f"Company: {transcript.account.company_name} ({transcript.account.company_size})")
print(f"Deal: {transcript.account.deal_stage} -> {transcript.account.deal_outcome}")
print(f"Turns: {len(transcript.utterances)}")
print(f"Paralinguistic: {_has_paralinguistic(transcript)}")

# Short preview; each utterance is clipped to 120 characters and tagged when
# it carries paralinguistic annotations.
print("\n--- First 5 utterances ---")
for u in transcript.utterances[:5]:
    para_tag = " [paralinguistic]" if u.paralinguistic else ""
    print(f"[{u.turn_index}] {u.speaker}{para_tag}: {u.text[:120]}")

Selected: cloudfirst_analytics_call1
Company: CloudFirst Analytics (smb)
Deal: discovery -> won
Turns: 28
Paralinguistic: False

--- First 5 utterances ---
[0] rep: Hi Nina, thanks for hopping on. I saw you downloaded our attribution benchmarking report last week. What caught your eye
[1] prospect_head_of_data: Yeah, the section on multi-touch attribution gaps was exactly what we're wrestling with. We're a 60-person shop doing an
[2] rep: Ha, the cobbler's children have no shoes. That's actually more common than you'd think with analytics firms. Tell me mor
[3] prospect_head_of_data: We're running campaigns across LinkedIn, Google, our podcast sponsorships, and a bunch of partner co-marketing stuff. Ri
[4] rep: Two days a month on manual attribution. And you said you don't trust the output. What breaks down specifically?


In [7]:
# NOTE(review): debug echo of the current selection; the selection cell
# already prints it on its "Selected: ..." line — likely safe to remove.
print(SELECTED_CALL_ID)

cloudfirst_analytics_call1


---
## Understanding the Input: Transcript Structure

Before running extraction, let's examine what a sales call transcript looks like.
Each transcript contains three parts:
- **Account context** — company profile, deal stage, stakeholders
- **Call metadata** — date, duration, participants
- **Utterances** — speaker-labeled conversation turns with optional paralinguistic annotations

In [9]:
# Account-level profile of the selected transcript.
display(Markdown("### Account Context"))
a = transcript.account
display_table(
    ["Field", "Value"],
    [
        ["Company", a.company_name],
        ["Size", a.company_size],
        ["Industry", a.industry],
        ["Deal Stage", a.deal_stage],
        ["Deal Outcome", a.deal_outcome],
        ["Stakeholders", ", ".join(f"{s.name} ({s.role}, {s.persona_type})" for s in a.stakeholders)],
    ]
)

# Per-call metadata.
display(Markdown("### Call Metadata"))
cm = transcript.call_metadata
display_table(
    ["Field", "Value"],
    [
        ["Call ID", cm.call_id],
        ["Date", cm.call_date],
        ["Duration", f"{cm.duration_minutes} minutes"],
        ["Call #", str(cm.call_number)],
        ["Participants", ", ".join(cm.participants)],
        ["Total Turns", str(len(transcript.utterances))],
    ]
)

# Collapsible raw JSON preview.
RAW_PREVIEW_CHARS = 3000  # keep the expandable block to a readable size
raw_transcript = json.dumps(transcript.model_dump(), indent=2, default=str)

# Truncate first, then escape: the JSON is embedded in HTML, so any "<", ">"
# or "&" in the transcript text must not be interpreted as markup. Escaping
# after truncation guarantees no entity is cut in half.
raw_preview = html.escape(raw_transcript[:RAW_PREVIEW_CHARS], quote=False)
raw_ellipsis = "..." if len(raw_transcript) > RAW_PREVIEW_CHARS else ""
display(HTML(f"""
<details>
<summary><b>Raw Transcript JSON</b> (click to expand — {len(raw_transcript)} chars)</summary>
<pre>{raw_preview}{raw_ellipsis}</pre>
</details>
"""))

### Account Context

| Field | Value |
| --- | --- |
| Company | CloudFirst Analytics |
| Size | smb |
| Industry | Data Analytics |
| Deal Stage | discovery |
| Deal Outcome | won |
| Stakeholders | Nina Patel (Head of Data, executive_champion) |

### Call Metadata

| Field | Value |
| --- | --- |
| Call ID | cloudfirst_analytics_call1 |
| Date | 2026-02-07 |
| Duration | 22 minutes |
| Call # | 1 |
| Participants | rep, prospect_head_of_data |
| Total Turns | 28 |

In [10]:
display(Markdown("### The Conversation: Readable View"))
display(Markdown("The raw input to the extraction pipeline — speaker-labeled turns with optional paralinguistic annotations."))

# How many turns to render; raise this to read more of the call.
MAX_TURNS_SHOWN = 15
shown_utterances = transcript.utterances[:MAX_TURNS_SHOWN]

for u in shown_utterances:
    speaker_label = f"**{u.speaker}** (turn {u.turn_index})"

    # Collect whichever paralinguistic annotations are present on this turn;
    # empty or missing values are skipped.
    para_tags = []
    if u.paralinguistic:
        p = u.paralinguistic
        if p.pause_before_sec:
            para_tags.append(f"*{p.pause_before_sec}s pause*")
        if p.energy:
            para_tags.append(f"*{p.energy} energy*")
        if p.tone:
            para_tags.append(f"*{p.tone}*")
        if p.behaviors:
            para_tags.append(f"*{', '.join(p.behaviors)}*")

    para_str = " ".join(para_tags) if para_tags else ""
    display(Markdown(f"{speaker_label} {para_str}"))
    print(f"  {u.text}\n")

# Report the number of turns actually rendered, which is smaller than
# MAX_TURNS_SHOWN for short transcripts.
print(f"[Showing first {len(shown_utterances)} of {len(transcript.utterances)} total turns]")

### The Conversation: Readable View

The raw input to the extraction pipeline — speaker-labeled turns with optional paralinguistic annotations.

**rep** (turn 0) 

  Hi Nina, thanks for hopping on. I saw you downloaded our attribution benchmarking report last week. What caught your eye?



**prospect_head_of_data** (turn 1) 

  Yeah, the section on multi-touch attribution gaps was exactly what we're wrestling with. We're a 60-person shop doing analytics consulting, and our own marketing attribution is embarrassingly bad.



**rep** (turn 2) 

  Ha, the cobbler's children have no shoes. That's actually more common than you'd think with analytics firms. Tell me more about what you're seeing.



**prospect_head_of_data** (turn 3) 

  We're running campaigns across LinkedIn, Google, our podcast sponsorships, and a bunch of partner co-marketing stuff. Right now we're stitching everything together in spreadsheets. My analyst spends probably two days a month just building the attribution report, and honestly, I don't trust the numbers.



**rep** (turn 4) 

  Two days a month on manual attribution. And you said you don't trust the output. What breaks down specifically?



**prospect_head_of_data** (turn 5) 

  The big one is we can't track the podcast-to-website-to-demo-request journey. People hear us on a podcast, Google us a week later, and we attribute it all to organic search. We know the podcasts are working because prospects mention them on calls, but the data doesn't show it.



**rep** (turn 6) 

  That's a classic dark funnel problem. The offline-to-online handoff is where most attribution models fall apart. How much are you spending annually on those podcast sponsorships?



**prospect_head_of_data** (turn 7) 

  About 180K a year. And our CEO keeps asking me whether we should double down or kill them. I can't give her a straight answer with the data I have.



**rep** (turn 8) 

  So you have a 180K spending decision that's essentially flying blind. That's actually a perfect use case for us. Our self-reported attribution module captures that exact journey. When someone books a demo, they get a 'how did you hear about us' field that ties back into the attribution model alongside the digital touchpoints.



**prospect_head_of_data** (turn 9) 

  We've tried the how-did-you-hear-about-us thing manually. The data quality was terrible. People just pick the first option.



**rep** (turn 10) 

  Totally. The trick is in how you present it and how you weight it. We use a free-text field instead of a dropdown, then use NLP to categorize and cross-reference with digital touchpoint data. Clients typically see 70 to 80% match rates between self-reported and digital signals.



**prospect_head_of_data** (turn 11) 

  That's clever. Okay, what does something like this cost for a company our size? We're not a big budget operation.



**rep** (turn 12) 

  For your team size and channel mix, you'd be looking at our Starter tier. That's 42K annually, which includes the core attribution engine, the self-reported module, and integrations with your ad platforms and CRM.



**prospect_head_of_data** (turn 13) 

  42K. That's not nothing, but if it saves my analyst two days a month and helps us make a smarter call on 180K in podcast spend, the math works pretty quickly.



**rep** (turn 14) 

  Exactly. And that analyst time is probably the smaller piece. The real value is having confidence in your channel allocation decisions. If podcasts are actually driving 30% of your pipeline but your current data says 5%, that changes your entire strategy.

[Showing first 15 of 28 total turns]


### What We Extract: The 4 Signal Layers

From this raw conversation, the pipeline extracts structured intelligence at 4 levels:

| Layer | What It Captures | Example Output |
|-------|------------------|----------------|
| **Surface** | Aspect sentiment, topics, entities, key phrases | `pricing: negative (0.7)`, `product: positive (0.8)` |
| **Behavioral** | Objection triples, buying intent, competitive mentions, engagement | `Objection: pricing → Resolution: ROI argument → Outcome: resolved` |
| **Psychographic** | Mental models, buyer personas, language fingerprint | `Mental model: cost_reduction`, `Persona: analytical_evaluator` |
| **Multimodal** | Text-audio divergence, composite sentiment | `Said "fine" but [2.1s pause + falling pitch] = hidden concern` |

Let's run the extraction and see these layers in action.

---
## End-to-End Pipeline Demo

Run the full 4-layer extraction pipeline on the selected transcript.
This makes 3–4 LLM API calls (surface, behavioral, psychographic, and
optionally multimodal if paralinguistic annotations are present).

| Layer | Signals | Always runs? |
|-------|---------|-------------|
| Surface | Aspect sentiment, topics, entities, key phrases | Yes |
| Behavioral | Objection triples, buying intent, competitive mentions, engagement trajectory | Yes |
| Psychographic | Mental model, persona indicators, language fingerprint | Yes |
| Multimodal | Text-audio divergences, composite sentiments | Only if paralinguistic annotations present |

In [11]:
%%time
# Full pipeline run on the selected transcript. Per the section text above,
# this issues 3–4 LLM API calls; the saved output shows roughly a minute and a
# half of wall time. Re-running this cell repeats those calls.
result = extract(transcript, client=client)
print(f"Extraction complete for {result.transcript_id}")
# Compact digest of the result, followed by any notes attached to it.
display_result_summary(result)

Extraction complete for cloudfirst_analytics_call1


**cloudfirst_analytics_call1** | confidence=0.83 | topics=12 | entities=6 | objections=4 | intent_markers=9 | multimodal=no

  Note: No paralinguistic annotations — multimodal extraction skipped
CPU times: user 61.7 ms, sys: 10.1 ms, total: 71.8 ms
Wall time: 1min 32s


In [12]:
# Walk the four signal layers in order: a heading for each, then its
# dedicated renderer. Layers after the first are preceded by a horizontal rule.
for _heading, _render, _signals in (
    ("## Layer 1: Surface Signals", display_surface, result.surface),
    ("---\n## Layer 2: Behavioral Signals", display_behavioral, result.behavioral),
    ("---\n## Layer 3: Psychographic Signals", display_psychographic, result.psychographic),
    ("---\n## Layer 4: Multimodal Signals", display_multimodal, result.multimodal),
):
    display(Markdown(_heading))
    _render(_signals)

## Layer 1: Surface Signals

### Aspect Sentiments

| Aspect | Sentiment | Intensity | Context |
| --- | --- | --- | --- |
| current attribution process | negative | 0.85 | Manual spreadsheet-based attribution taking two days per month with untrusted ou |
| podcast attribution / dark funnel tracking | negative | 0.90 | Cannot track podcast-to-website-to-demo journey; $180K spend decision flying bli |
| self-reported attribution (manual/dropdown approach) | negative | 0.70 | Previously tried how-did-you-hear-about-us manually but data quality was terribl |
| self-reported attribution (NLP-based product approach) | positive | 0.70 | Prospect found the free-text + NLP cross-referencing approach clever; 70-80% mat |
| pricing | mixed | 0.60 | 42K annually acknowledged as 'not nothing' but prospect quickly rationalized ROI |
| ROI / value proposition | positive | 0.85 | Prospect self-computed ROI: saving analyst time plus making smarter call on $180 |
| integration (HubSpot / Google Ads / LinkedIn) | positive | 0.80 | Native HubSpot integration with one-click OAuth, plug-and-play Google Ads, UTM-b |
| implementation timeline | positive | 0.75 | 3-4 weeks to go live perceived as fast; could kick off next Monday and be live b |
| engineering resource requirements | positive | 0.85 | Marketing ops can handle entire setup without engineering cycles; described as ' |
| historical data ingestion | positive | 0.80 | Can ingest up to 12 months of historical data from HubSpot and ad platforms; pro |
| overall product impression | positive | 0.85 | Prospect said 'I'm pretty sold on this honestly' and plans to recommend moving f |
| internal buying process | positive | 0.75 | CEO trusts prospect's judgment on data tooling; expects straightforward approval |

### Topics

| Topic | Position | Relevance |
| --- | --- | --- |
| Multi-touch attribution challenges | early | 1.00 |
| Manual reporting inefficiency (spreadsheet-based process) | early | 0.85 |
| Dark funnel / offline-to-online attribution | early | 0.95 |
| Podcast sponsorship ROI and spend justification | early | 0.90 |
| Self-reported attribution methodology (NLP-based) | mid | 0.80 |
| Pricing and ROI calculation | mid | 0.85 |
| Channel allocation strategy and confidence | mid | 0.80 |
| Technical integration and implementation | mid | 0.80 |
| Engineering resource constraints | mid | 0.70 |
| Historical data ingestion and retroactive modeling | mid | 0.75 |
| Next steps and procurement / internal approval | late | 0.85 |
| Case studies and social proof | late | 0.60 |

### Named Entities

| Name | Type | Role | Mentions |
| --- | --- | --- | --- |
| Nina | person | Prospect, Head of Data at analytics consulting firm | 2 |
| Alisha | person | CEO of prospect's company; final decision-maker | 2 |
| HubSpot | product | Prospect's CRM platform; integration target | 4 |
| Google Ads | product | Prospect's advertising platform; integration target | 2 |
| LinkedIn | company | Prospect's advertising/campaign channel | 2 |
| Google | company | Search engine referenced in attribution journey context | 1 |

### Key Phrases

| Phrase | Relevance | Context |
| --- | --- | --- |
| multi-touch attribution gaps | 1.00 | Core pain point that brought prospect to the conversation; d |
| dark funnel | 0.95 | Rep's characterization of the offline-to-online tracking pro |
| podcast-to-website-to-demo-request journey | 0.90 | Specific attribution gap prospect cannot track; listeners Go |
| 180K annual podcast spend | 0.90 | Major budget decision at stake; CEO asking whether to double |
| self-reported attribution module | 0.85 | Product feature using free-text NLP to capture how-did-you-h |
| two days a month | 0.80 | Analyst time spent on manual attribution reporting; quantifi |
| 42K annually | 0.85 | Starter tier pricing for prospect's company size and channel |
| channel allocation decisions | 0.80 | Primary strategic value proposition; confidence in where to  |
| one-click OAuth connection | 0.65 | HubSpot integration simplicity; no engineering needed |
| 12 months of historical data | 0.75 | Retroactive data ingestion capability that excited prospect  |
| three to four weeks implementation | 0.70 | Go-live timeline perceived favorably by prospect |
| no spare engineering cycles | 0.70 | Key constraint for prospect; implementation must be handled  |
| attribution benchmarking report | 0.60 | Content asset that initiated the sales conversation |
| cobbler's children have no shoes | 0.40 | Rep's rapport-building acknowledgment that analytics firms o |
| answer by Friday | 0.80 | Prospect's commitment to a decision timeline; strong buying  |

---
## Layer 2: Behavioral Signals

### Objection Triples

**Objection 1** (risk, mid)

  Speaker: prospect_head_of_data
  Language: "We've tried the how-did-you-hear-about-us thing manually. The data quality was terrible. People just pick the first option."
  Resolution: technical_demo — "The trick is in how you present it and how you weight it. We use a free-text field instead of a dropdown, then use NLP to categorize and cross-reference with digital touchpoint data. Clients typically see 70 to 80% match rates between self-reported and digital signals."
  Outcome: resolved=True, progressed=True
  Confidence: 0.95



**Objection 2** (pricing, mid)

  Speaker: prospect_head_of_data
  Language: "42K. That's not nothing, but if it saves my analyst two days a month and helps us make a smarter call on 180K in podcast spend, the math works pretty quickly."
  Resolution: roi_argument — "Exactly. And that analyst time is probably the smaller piece. The real value is having confidence in your channel allocation decisions. If podcasts are actually driving 30% of your pipeline but your current data says 5%, that changes your entire strategy."
  Outcome: resolved=True, progressed=True
  Confidence: 0.85



**Objection 3** (implementation, mid)

  Speaker: prospect_head_of_data
  Language: "That's really important to us. We don't have spare engineering cycles."
  Resolution: risk_mitigation — "Marketing ops can handle the whole thing. The HubSpot integration is a one-click OAuth connection. We handle the data pipeline setup on our end."
  Outcome: resolved=True, progressed=True
  Confidence: 0.92



**Objection 4** (authority, late)

  Speaker: prospect_head_of_data
  Language: "I need to loop in our CEO, Alisha, but I'm going to recommend we do this."
  Resolution: social_proof — "Absolutely. I have two great case studies from similar-sized consulting firms. I'll package those with the proposal. Would it be helpful to have a short call with Alisha as well, or do you think you can handle it internally?"
  Outcome: resolved=True, progressed=True
  Confidence: 0.90



### Buying Intent Markers

| Type | Evidence | Confidence |
| --- | --- | --- |
| budget_confirmation | Okay, what does something like this cost for a company our size? We're not a big | 0.80 |
| if_to_when_shift | 42K. That's not nothing, but if it saves my analyst two days a month and helps u | 0.85 |
| timeline_question | Right. Okay, what does implementation look like? We run HubSpot and Google Ads p | 0.90 |
| implementation_detail | Three to four weeks is fast. Do we need engineering resources or can my marketin | 0.95 |
| implementation_detail | What about data we've already collected? Can we do any historical attribution or | 0.92 |
| if_to_when_shift | Oh that's great. That would let us test the accuracy before we commit to changin | 0.95 |
| next_steps_request | What would next steps look like if we wanted to move forward? | 0.99 |
| stakeholder_introduction | I need to loop in our CEO, Alisha, but I'm going to recommend we do this. | 0.95 |
| budget_confirmation | She trusts my judgment on data tooling. If she has questions I'll pull you in, b | 0.90 |

### Competitive Mentions

  (none detected)


### Engagement Trajectory

| Phase | Participation | Question Depth | Energy | Notes |
| --- | --- | --- | --- | --- |
| early | high | deep | high | Prospect immediately self-identified a specific pain point (multi-touch attribution gaps), volunteered detailed context about team size, channels, manual processes, and time cost. Showed vulnerability by calling their own attribution 'embarrassingly bad.' Very forthcoming with specifics like analyst time (two days/month) and distrust of numbers. |
| mid | high | deep | high | Prospect pushed back constructively on self-reported attribution (showing sophistication), proactively asked about pricing, and self-justified the ROI ('the math works pretty quickly'). Transitioned rapidly from pricing to implementation questions, showing accelerating momentum. Asked about engineering resource requirements and historical data ingestion — both deep, practical questions indicating serious evaluation. |
| late | high | moderate | high | Prospect explicitly stated 'I'm pretty sold on this honestly,' asked for next steps unprompted, volunteered a timeline commitment ('aim to have an answer by Friday'), and expressed confidence in internal approval ('She trusts my judgment on data tooling'). Energy remained high through close with humor ('you and me both'). Champion behavior is clear — prospect is self-qualifying and preparing to sell internally. |

---
## Layer 3: Psychographic Signals

### Mental Model

  Primary: efficiency
  Secondary: risk_mitigation
  Confidence: 0.90
  Reasoning: Nina's primary concern is efficiency: she repeatedly highlights the time her analyst wastes on manual spreadsheet work (two days a month), the lack of engineering cycles, and the need for a tool that marketing ops can handle without engineering involvement. She frames the ROI calculation around time savings and better decision-making speed. Her secondary model is risk_mitigation — she is deeply uncomfortable making a $180K budget decision without trustworthy data. She doesn't frame the problem as 'grow revenue' but rather as 'avoid making the wrong call.' She wants to validate attribution accuracy with historical data before committing to budget changes ('test the accuracy before we commit to changing our budget allocation'). She's not trying to expand capabilities or grow into new markets; she's trying to stop flying blind and reduce the risk of misallocating spend. While there's an element of cost_redu

### Persona Indicators

  analytical_evaluator (confidence=0.85)
    Reasoning: Nina asks detailed, specific technical questions in a methodical sequence: cost, implementation timeline, engineering requirements, historical data capabilities, and integration specifics. She pushes back on the self-reported attribution approach based on prior experience with data quality, showing she evaluates claims critically rather than accepting them at face value. She does mental ROI math on the spot (42K vs. 2 days/month + 180K decision). She wants to validate the tool's accuracy against historical data before making strategic changes — a classic analytical evaluator behavior.
    Evidence: ["We've tried the how-did-you-hear-about-us thing manually. The data quality was terrible. People just pick the first option.", 'What does implementation look like? We run HubSpot and Google Ads primarily.', 'Do we need engineering resources or can my marketing ops person handle it?', 'Can we do any historical attribution or is it forwa

### Language Fingerprint

  Vocabulary: ['dark funnel', 'stitching together', 'flying blind', 'spare engineering cycles', 'budget allocation', 'channel mix', 'the math works', 'data quality']
  Metaphors: ["the cobbler's children have no shoes (acknowledged/adopted from rep)", 'flying blind (used implicitly via rep, accepted framing)', 'stitching everything together in spreadsheets', 'get you off those spreadsheets (echoed by rep, endorsed)']
  Framing: ['If X saves Y and helps Z, the math works — ROI justification through concrete arithmetic', "We've tried [approach] manually. [Negative outcome]. — Prior experience as credibility filter", "That's really important to us. We don't have [resource]. — Constraint-first evaluation of feasibility", "I can't give her a straight answer with the data I have. — Framing problems as inability to act due to data gaps", 'That would let us test [claim] before we commit to [action]. — Validation-before-commitment pattern', "I'm pretty sold on this honestly. — Direct emotional 

---
## Layer 4: Multimodal Signals

  (No multimodal signals — transcript lacks paralinguistic annotations)


In [13]:
# Collapsible raw JSON output
from html import escape  # local import: only this cell needs it

raw_json = result.model_dump_json(indent=2)

# Escape before embedding in HTML: the JSON carries free text from the
# transcript and the LLM, so any `<`, `>` or `&` in it would otherwise be
# parsed as markup and corrupt (or inject into) the rendered output.
# quote=False because the text lands in element content, not an attribute.
display(HTML(f"""
<details>
<summary><b>Raw JSON output</b> (click to expand)</summary>
<pre>{escape(raw_json, quote=False)}</pre>
</details>
"""))

---
## Before & After: Input vs Extracted Features

Side-by-side comparison of raw conversation moments and the structured signals extracted from them.
This demonstrates the transformation from text to actionable intelligence.

In [14]:
def _print_source_utterances(indices, limit):
    """Print the transcript utterances referenced by an extracted signal.

    Args:
        indices: Positions into ``transcript.utterances`` reported by the
            extraction (e.g. ``source_utterance_indices``).
        limit: Maximum number of leading indices to show.

    Indices outside the transcript are skipped. The ``0 <=`` check matters:
    a negative index would otherwise wrap around and silently print an
    utterance from the end of the call.
    """
    utterances = transcript.utterances
    for idx in indices[:limit]:
        if 0 <= idx < len(utterances):
            u = utterances[idx]
            print(f"  [{u.turn_index}] {u.speaker}: {u.text}\n")


# Example: Objection Detection — raw utterances vs structured extraction
if result.behavioral.objection_triples:
    triple = result.behavioral.objection_triples[0]
    obj = triple.objection

    display(Markdown("### Example: Objection Detection"))
    display(Markdown("**INPUT** (Raw Transcript)"))
    _print_source_utterances(obj.source_utterance_indices, limit=3)

    display(Markdown("**OUTPUT** (Structured Extraction)"))
    display(Markdown(f"""
- **Type:** `{obj.type}`
- **Stage:** `{obj.conversation_stage}`
- **Speaker:** `{obj.speaker_role}`
- **Language:** "{obj.specific_language}"
- **Resolution:** {triple.resolution.type + ' — "' + triple.resolution.specific_language + '"' if triple.resolution else "None"}
- **Outcome:** {"Resolved" if triple.outcome.resolved else "Unresolved"}, Deal {"progressed" if triple.outcome.deal_progressed else "stalled"}
- **Confidence:** {triple.confidence:.2f}
"""))

# Example: Aspect Sentiment — pick a negative or mixed aspect if available,
# otherwise fall back to the first aspect so the example still renders.
interesting = next((a for a in result.surface.aspects if a.sentiment in ("negative", "mixed")), None)
if not interesting and result.surface.aspects:
    interesting = result.surface.aspects[0]

if interesting:
    display(Markdown("### Example: Aspect-Based Sentiment"))
    display(Markdown("**INPUT** (Raw Transcript)"))
    _print_source_utterances(interesting.source_utterance_indices, limit=2)

    display(Markdown("**OUTPUT** (Structured Extraction)"))
    display(Markdown(f"""
- **Aspect:** `{interesting.aspect}`
- **Sentiment:** `{interesting.sentiment}` (intensity: {interesting.intensity:.2f})
- **Context:** "{interesting.context}"
"""))

### Example: Objection Detection

**INPUT** (Raw Transcript)

  [9] prospect_head_of_data: We've tried the how-did-you-hear-about-us thing manually. The data quality was terrible. People just pick the first option.



**OUTPUT** (Structured Extraction)


- **Type:** `risk`
- **Stage:** `mid`
- **Speaker:** `prospect_head_of_data`
- **Language:** "We've tried the how-did-you-hear-about-us thing manually. The data quality was terrible. People just pick the first option."
- **Resolution:** technical_demo — "The trick is in how you present it and how you weight it. We use a free-text field instead of a dropdown, then use NLP to categorize and cross-reference with digital touchpoint data. Clients typically see 70 to 80% match rates between self-reported and digital signals."
- **Outcome:** Resolved, Deal progressed
- **Confidence:** 0.95


### Example: Aspect-Based Sentiment

**INPUT** (Raw Transcript)

  [1] prospect_head_of_data: Yeah, the section on multi-touch attribution gaps was exactly what we're wrestling with. We're a 60-person shop doing analytics consulting, and our own marketing attribution is embarrassingly bad.

  [3] prospect_head_of_data: We're running campaigns across LinkedIn, Google, our podcast sponsorships, and a bunch of partner co-marketing stuff. Right now we're stitching everything together in spreadsheets. My analyst spends probably two days a month just building the attribution report, and honestly, I don't trust the numbers.



**OUTPUT** (Structured Extraction)


- **Aspect:** `current attribution process`
- **Sentiment:** `negative` (intensity: 0.85)
- **Context:** "Manual spreadsheet-based attribution taking two days per month with untrusted output; described as 'embarrassingly bad'"


---
## Transcript Summary

Generate a human-readable executive summary of the call. This is a separate extraction step
(1 LLM call) that produces narrative prose, key moments, action items, and deal signals —
designed for sales managers who need to understand a call without reading the full transcript.

In [15]:
%%time
# Generate the summary for the currently selected transcript and render it.
# The section text above describes this as a single LLM call; the %%time
# magic on this cell reports how long it took.
summary = extract_summary(transcript, client=client)
print(f"Summary generated for {SELECTED_CALL_ID}")
display_summary(summary)

Summary generated for cloudfirst_analytics_call1


### Executive Summary

This call was between the sales rep and Nina, Head of Data at a 60-person analytics consulting firm, who had previously downloaded an attribution benchmarking report. The conversation centered on Nina's inability to accurately attribute marketing results across channels—particularly podcast sponsorships ($180K/year)—due to manual, spreadsheet-based processes that consume two analyst-days per month and produce untrustworthy data. The core pain point is a 'dark funnel' problem: prospects hear about the firm through podcasts but are misattributed to organic search, leaving the CEO without reliable data to decide whether to scale or cut podcast spend.

Nina raised one notable objection around self-reported attribution, noting that a previous manual 'how did you hear about us' approach yielded poor data quality. The rep addressed this effectively by explaining their NLP-powered free-text field approach and its 70-80% match rate with digital signals, which satisfied Nina. She also asked about cost, implementation complexity, engineering resource requirements, and historical data ingestion—all of which the rep answered favorably. The $42K annual price point was received positively given the clear ROI against the $180K podcast spend decision and analyst time savings. Nina specifically valued that no engineering resources would be needed and that implementation could be handled by marketing ops alone.

The deal is in strong shape. Nina expressed clear buying intent and stated she would recommend the purchase to her CEO, Alisha, who has final sign-off authority. The rep agreed to send a proposal with two consulting-firm case studies by end of day. Nina committed to providing an answer by Friday. The prospect declined to have the rep join a call with the CEO, indicating high internal confidence but also introducing a risk that the rep won't have direct access to the final decision-maker.

### Key Moments

**1. INSIGHT** (turns [5, 6, 7])

  Nina revealed that $180K in annual podcast spend is essentially unattributable, and her CEO is pressing for a scale-or-kill decision she cannot support with data.
  Significance: This crystallized the core business pain and established a concrete, high-dollar ROI narrative that makes the $42K solution price easy to justify internally.



**2. OBJECTION** (turns [9, 10])

  Nina pushed back on self-reported attribution, stating a previous manual 'how did you hear about us' attempt produced terrible data quality because respondents just picked the first option.
  Significance: This was the only real objection in the call. The rep's explanation of NLP-powered free-text categorization with 70-80% digital match rates directly addressed the concern and restored credibility in the approach.



**3. BREAKTHROUGH** (turns [19, 20, 21])

  The rep explained that up to 12 months of historical data could be ingested from HubSpot and ad platforms, enabling retroactive attribution modeling.
  Significance: This was a pivotal moment because it allows Nina to validate podcast attribution against anecdotal evidence before committing to budget reallocation—removing a key adoption risk and accelerating her confidence.



**4. COMMITMENT** (turns [21, 23, 25])

  Nina said she was 'pretty sold,' committed to recommending the purchase to her CEO, and set a Friday deadline for a decision.
  Significance: This represents a strong verbal commitment with a specific timeline, moving the deal to a clear proposal stage with an identified close date.



**5. RISK** (turns [24, 25])

  Nina declined to have the rep join a call with CEO Alisha, preferring to handle the internal conversation herself.
  Significance: While Nina expressed high confidence, the rep will have no direct access to the final decision-maker. If Alisha raises objections or has questions Nina can't fully address, the deal could stall without the rep's involvement.



### Action Items

| Action | Owner | Criticality |
| --- | --- | --- |
| Send proposal with $42K annual pricing, quarterly billing terms, and standard annual agreement details | rep | high |
| Include two case studies from similar-sized analytics/consulting firms with the proposal | rep | high |
| Present the proposal and case studies to CEO Alisha and secure approval | Nina | high |
| Provide a decision by Friday | Nina | high |
| If signed this week, schedule implementation kickoff for next Monday with target go-live by end of March | rep | medium |
| Be available for a follow-up call with CEO Alisha if questions arise | rep | medium |

### Prospect Priorities

  1. Accurate attribution of podcast sponsorships ($180K/year) to support a strategic scale-or-cut budget decision for the CEO
  2. Eliminating manual, spreadsheet-based attribution processes that consume two analyst-days per month and produce untrustworthy data
  3. Low implementation burden—no engineering resources required, marketing ops can handle setup independently
  4. Ability to validate the solution with historical data before making major budget reallocation decisions
  5. Data quality and trustworthiness of self-reported attribution methods


### Concerns to Address

  1. CEO Alisha is the final decision-maker and the rep has no direct access to her; if Nina's internal pitch encounters resistance, there is no fallback engagement plan
  2. Nina described the company as 'not a big budget operation'—even though she rationalized the ROI, price sensitivity could resurface during CEO review or if competing budget priorities arise
  3. The prospect's past negative experience with self-reported attribution may linger as a credibility concern for the CEO who wasn't on this call to hear the NLP explanation
  4. No discussion of contract flexibility (e.g., month-to-month, exit clauses, or pilot options) which could become a sticking point for a budget-conscious 60-person firm committing to $42K annually
  5. Podcast sponsorship tracking via partner co-marketing channels was mentioned but not deeply explored—if the solution doesn't fully cover all co-marketing attribution, that gap could surface post-sale
CPU times: user 24.4 ms, sys: 4.6 ms, total: 29 ms
Wall

---
## Layer-by-Layer Exploration

Run individual extraction layers to inspect intermediate outputs.
This is useful for:
- Debugging a specific layer's behavior
- Understanding how each prompt template processes the transcript
- Timing individual layers to identify bottlenecks
- Inspecting raw LLM output before schema validation/coercion

In [17]:
# Flatten the transcript into the text form that the layer cells below pass
# to _extract_layer, then preview the beginning of it.
transcript_text = _format_transcript(transcript)
print(f"Formatted transcript length: {len(transcript_text)} chars")
print("\n--- First 500 chars ---")  # plain literal: nothing to interpolate
print(transcript_text[:500])

Formatted transcript length: 5666 chars

--- First 500 chars ---
[0] rep: Hi Nina, thanks for hopping on. I saw you downloaded our attribution benchmarking report last week. What caught your eye?
[1] prospect_head_of_data: Yeah, the section on multi-touch attribution gaps was exactly what we're wrestling with. We're a 60-person shop doing analytics consulting, and our own marketing attribution is embarrassingly bad.
[2] rep: Ha, the cobbler's children have no shoes. That's actually more common than you'd think with analytics firms. Tell me more about what you


In [18]:
%%time
# Run only the surface layer: one _extract_layer call with the surface prompt.
display(Markdown("### Layer 1: Surface Signals"))
# The raw result is kept in its own variable so the "Raw Layer Output" cell
# further down can show it as it was before coercion and validation.
surface_data = _extract_layer(client, SURFACE_EXTRACTION_PROMPT, transcript_text)
# Coerce the raw result, then validate it against the SurfaceSignals schema.
surface = SurfaceSignals.model_validate(_coerce_to_schema(surface_data, SurfaceSignals))
display_surface(surface)

### Layer 1: Surface Signals

### Aspect Sentiments

| Aspect | Sentiment | Intensity | Context |
| --- | --- | --- | --- |
| current attribution process | negative | 0.85 | Manual spreadsheet-based attribution takes two days/month, produces untrustworth |
| pricing | mixed | 0.60 | Prospect acknowledges 42K is 'not nothing' but quickly rationalizes ROI against  |
| self-reported attribution module | positive | 0.70 | Initially skeptical due to past poor experience with manual how-did-you-hear fie |
| integration / implementation | positive | 0.85 | Native HubSpot integration, plug-and-play Google Ads, 3-4 week timeline, no engi |
| historical data ingestion | positive | 0.80 | Ability to ingest 12 months of historical data from HubSpot and ad platforms exc |
| ROI / value proposition | positive | 0.85 | Prospect sees clear ROI: saving analyst time, gaining confidence on 180K podcast |
| ease of setup / resource requirements | positive | 0.90 | Marketing ops can handle entire implementation without engineering resources; on |
| podcast attribution / dark funnel | negative | 0.80 | Current inability to track podcast-to-demo journey means 180K annual spend decis |
| internal buying process | positive | 0.75 | Prospect is confident she can handle internal approval; CEO trusts her judgment  |

### Topics

| Topic | Position | Relevance |
| --- | --- | --- |
| Multi-touch attribution gaps and challenges | early | 1.00 |
| Dark funnel / offline-to-online tracking (podcast attribution) | early | 0.95 |
| Manual reporting and analyst time costs | early | 0.80 |
| Self-reported attribution methodology (NLP free-text approach) | mid | 0.85 |
| Pricing and budget justification | mid | 0.80 |
| Channel budget allocation decisions (podcast spend) | mid | 0.85 |
| Technical implementation and integrations | mid | 0.80 |
| Historical data ingestion and retroactive validation | late | 0.75 |
| Next steps / proposal and internal approval process | late | 0.70 |

### Named Entities

| Name | Type | Role | Mentions |
| --- | --- | --- | --- |
| Nina | person | Prospect, Head of Data at analytics consulting firm | 2 |
| Alisha | person | CEO of prospect's company, final decision-maker | 2 |
| HubSpot | product | Prospect's CRM / marketing platform | 4 |
| Google Ads | product | Prospect's advertising platform | 2 |
| LinkedIn | company | Prospect's advertising / campaign channel | 2 |

### Key Phrases

| Phrase | Relevance | Context |
| --- | --- | --- |
| multi-touch attribution gaps | 1.00 | Core pain point that initiated the conversation; prospect do |
| dark funnel | 0.95 | Rep's framing of the podcast-to-online tracking challenge; r |
| podcast-to-website-to-demo-request journey | 0.90 | Specific attribution gap prospect cannot track; drives the 1 |
| self-reported attribution module | 0.85 | Key product feature addressing the dark funnel problem using |
| 180K annual podcast spend | 0.90 | Major budget decision flying blind; CEO asking whether to do |
| 42K annually | 0.85 | Starter tier pricing for prospect's company size and channel |
| two days a month manual reporting | 0.75 | Analyst time cost of current spreadsheet-based attribution p |
| 70 to 80% match rates | 0.80 | Claimed accuracy of NLP-based self-reported attribution cros |
| no engineering resources needed | 0.80 | Critical requirement for prospect who has no spare engineeri |
| 12 months historical data ingestion | 0.75 | Retroactive attribution capability that allows prospect to v |
| three to four weeks implementation | 0.70 | Fast timeline perceived positively; live by end of March if  |
| channel allocation decisions | 0.85 | Strategic value of the product — confidence in how marketing |
| cobbler's children have no shoes | 0.50 | Rep's rapport-building metaphor acknowledging irony of analy |

CPU times: user 17 ms, sys: 3.74 ms, total: 20.7 ms
Wall time: 31.2 s


In [None]:
%%time
# Run only the behavioral layer: one _extract_layer call with the behavioral prompt.
display(Markdown("### Layer 2: Behavioral Signals"))
# behavioral_data holds the raw result before coercion and validation; it is
# one of the options for the "Raw Layer Output" cell further down.
behavioral_data = _extract_layer(client, BEHAVIORAL_EXTRACTION_PROMPT, transcript_text)
behavioral = BehavioralSignals.model_validate(_coerce_to_schema(behavioral_data, BehavioralSignals))
display_behavioral(behavioral)

In [None]:
%%time
# Run only the psychographic layer: one _extract_layer call with the psychographic prompt.
display(Markdown("### Layer 3: Psychographic Signals"))
# psychographic_data holds the raw result before coercion and validation; it is
# one of the options for the "Raw Layer Output" cell further down.
psychographic_data = _extract_layer(client, PSYCHOGRAPHIC_EXTRACTION_PROMPT, transcript_text)
psychographic = PsychographicSignals.model_validate(_coerce_to_schema(psychographic_data, PsychographicSignals))
display_psychographic(psychographic)

In [None]:
%%time
display(Markdown("### Layer 4: Multimodal Divergence"))
if not _has_paralinguistic(transcript):
    # Nothing to extract: skip the layer and point the reader at usable transcripts.
    print(
        "Skipped — no paralinguistic annotations in this transcript.",
        "Select a transcript with paralinguistic data (e.g., techcorp_call1, safeguard_inc_call1)",
        sep="\n",
    )
else:
    # Same raw -> coerce -> validate -> display sequence as the other layer cells.
    multimodal_data = _extract_layer(client, MULTIMODAL_DIVERGENCE_PROMPT, transcript_text)
    multimodal = MultimodalSignals.model_validate(
        _coerce_to_schema(multimodal_data, MultimodalSignals)
    )
    display_multimodal(multimodal)

In [None]:
# Inspect the raw dict returned by the LLM before Pydantic validation.
# Useful for debugging schema coercion issues.
# Change this to: surface_data, behavioral_data, psychographic_data, or multimodal_data
# NOTE: each *_data variable only exists once its layer cell above has been run
# (multimodal_data additionally requires a transcript with paralinguistic annotations).
display(Markdown("### Raw Layer Output (pre-validation)"))
pretty_json(surface_data)

---
## Prompt Tuning Workbench

Load, edit, and test prompt modifications without touching the prompt files on disk.

**Workflow:**
1. Select a layer and view its current prompt template
2. Copy and modify the prompt in the editable cell below
3. Run extraction with the modified prompt
4. Compare results against the original prompt

Prompts use `{transcript}` as the placeholder for the formatted transcript text.
Double braces `{{` and `}}` are used for literal braces in JSON schema examples.

In [16]:
# Available: "surface", "behavioral", "psychographic", "multimodal_divergence"
PROMPT_NAME = "surface"

# Load the template for the selected layer and show it in full so it can be
# copied into the editable cell below.
current_prompt = load_prompt(PROMPT_NAME)
print(f"Prompt: {PROMPT_NAME}\nLength: {len(current_prompt)} chars")
print("\n--- Full prompt template ---")
print(current_prompt)

Prompt: surface
Length: 1397 chars

--- Full prompt template ---
You are extracting Layer 1 (Surface) signals from a sales call transcript.

TRANSCRIPT:
{transcript}

Extract these signal types:

1. ASPECT-BASED SENTIMENT: For each distinct aspect discussed (e.g., "pricing", "product",
   "integration", "support"), identify the sentiment, intensity (0-1), context explaining
   the sentiment, and which utterance indices support it. A single utterance can contain
   multiple aspects with different sentiments.

2. TOPIC DETECTION: What subjects are discussed? Position each in the conversation
   timeline (early/mid/late) and rate relevance (0-1).

3. NAMED ENTITIES: People, companies, products, and competitors mentioned. Include
   entity type, role if inferable, and mention count.

4. KEY PHRASES: Important terms and concepts, weighted by relevance to the sales context.

Return ONLY valid JSON:
{{
  "aspects": [
    {{"aspect": "string", "sentiment": "positive|negative|neutral|mixed",
  

In [None]:
# ============================================================
# EDIT THIS PROMPT to test modifications.
# The {transcript} placeholder will be filled automatically.
# Use {{ and }} for literal braces in JSON examples.
# ============================================================

# current_prompt is set by the prompt-loading cell above; run that cell first.
# Left unedited, modified_prompt is identical to the loaded template.
modified_prompt = current_prompt  # Start from the current prompt and modify as needed

In [None]:
%%time
# Map prompt name to schema class for validation
# Keys mirror the prompt names accepted by PROMPT_NAME in the loading cell.
SCHEMA_MAP = {
    "surface": SurfaceSignals,
    "behavioral": BehavioralSignals,
    "psychographic": PsychographicSignals,
    "multimodal_divergence": MultimodalSignals,
}

# A PROMPT_NAME that is not one of the keys above raises KeyError here.
schema_cls = SCHEMA_MAP[PROMPT_NAME]
# Run the selected layer with the edited prompt, then coerce and validate the
# raw result against that layer's schema.
modified_data = _extract_layer(client, modified_prompt, transcript_text)
modified_result = schema_cls.model_validate(_coerce_to_schema(modified_data, schema_cls))
print(f"Modified extraction complete for layer: {PROMPT_NAME}")

In [None]:
%%time
# Baseline run for the comparison: the template is loaded again through
# load_prompt rather than taken from current_prompt / modified_prompt.
# schema_cls is defined in the previous cell, which must have been run first.
original_prompt = load_prompt(PROMPT_NAME)
original_data = _extract_layer(client, original_prompt, transcript_text)
original_result = schema_cls.model_validate(_coerce_to_schema(original_data, schema_cls))
print(f"Original extraction complete for layer: {PROMPT_NAME}")

In [None]:
display(Markdown("## Side-by-Side Comparison"))

# Renderer for each layer, keyed by the same prompt names as SCHEMA_MAP.
DISPLAY_FN = {
    "surface": display_surface,
    "behavioral": display_behavioral,
    "psychographic": display_psychographic,
    "multimodal_divergence": display_multimodal,
}

# Render baseline first, then the modified run, using the same renderer so the
# two outputs have identical layout.
display(Markdown("### Original Prompt Result"))
DISPLAY_FN[PROMPT_NAME](original_result)

display(Markdown("---\n### Modified Prompt Result"))
DISPLAY_FN[PROMPT_NAME](modified_result)

In [None]:
display(Markdown("### Diff Summary"))


def _fmt_set(items) -> str:
    """Render a set for the diff printout.

    Returns ``'{none}'`` for an empty set, otherwise a brace-wrapped,
    comma-separated list of member reprs in sorted order. Set iteration order
    for strings changes between kernel sessions (hash randomisation), so
    sorting keeps the printed diff stable across re-runs. The reprs are sorted
    rather than the members so values without a natural ordering still work.
    """
    if not items:
        return "{none}"
    return "{" + ", ".join(sorted(repr(item) for item in items)) + "}"


if PROMPT_NAME == "surface":
    # Topics and entities are compared by lower-cased name.
    orig_topics = {t.name.lower() for t in original_result.topics}
    mod_topics = {t.name.lower() for t in modified_result.topics}
    print(f"Topics — Original: {len(orig_topics)}, Modified: {len(mod_topics)}")
    print(f"  Added:   {_fmt_set(mod_topics - orig_topics)}")
    print(f"  Removed: {_fmt_set(orig_topics - mod_topics)}")
    print(f"  Shared:  {_fmt_set(orig_topics & mod_topics)}")

    orig_entities = {e.name.lower() for e in original_result.entities}
    mod_entities = {e.name.lower() for e in modified_result.entities}
    print(f"\nEntities — Original: {len(orig_entities)}, Modified: {len(mod_entities)}")
    print(f"  Added:   {_fmt_set(mod_entities - orig_entities)}")
    print(f"  Removed: {_fmt_set(orig_entities - mod_entities)}")

    # Aspects and key phrases are only compared by count.
    print(f"\nAspects — Original: {len(original_result.aspects)}, Modified: {len(modified_result.aspects)}")
    print(f"Key phrases — Original: {len(original_result.key_phrases)}, Modified: {len(modified_result.key_phrases)}")

elif PROMPT_NAME == "behavioral":
    orig_types = {t.objection.type for t in original_result.objection_triples}
    mod_types = {t.objection.type for t in modified_result.objection_triples}
    print(f"Objection types — Original: {_fmt_set(orig_types)}, Modified: {_fmt_set(mod_types)}")
    print(f"  Added:   {_fmt_set(mod_types - orig_types)}")
    print(f"  Removed: {_fmt_set(orig_types - mod_types)}")
    print(f"\nBuying intent markers — Original: {len(original_result.buying_intent_markers)}, Modified: {len(modified_result.buying_intent_markers)}")
    print(f"Competitive mentions — Original: {len(original_result.competitive_mentions)}, Modified: {len(modified_result.competitive_mentions)}")

elif PROMPT_NAME == "psychographic":
    print(f"Mental model — Original: {original_result.mental_model.primary}, Modified: {modified_result.mental_model.primary}")
    orig_personas = {p.archetype for p in original_result.persona_indicators}
    mod_personas = {p.archetype for p in modified_result.persona_indicators}
    print(f"Personas — Original: {_fmt_set(orig_personas)}, Modified: {_fmt_set(mod_personas)}")

else:
    # No set-based summary for the remaining layer.
    print("Use the side-by-side display above for detailed comparison.")

---
## Evaluation

Compare extraction results against ground truth using the same metrics
as `tests/test_extraction.py`.

| Metric | Threshold | Description |
|--------|-----------|-------------|
| Topic recall | >= 50% | Share of ground-truth topics found in the extraction (case-insensitive exact name match) |
| Entity recall | >= 50% | Share of ground-truth entities found in the extraction (case-insensitive exact name match) |
| Objection type recall | >= 50% | Share of ground-truth objection types found in the extraction |
| Buying intent presence | bool | If GT has markers, extraction should too |
| Mental model accuracy | >= 50% | Primary mental model match across corpus |
| Persona archetype overlap | bool | At least one archetype matches |
| Multimodal divergence | bool | Divergences detected when GT has them |

In [None]:
def compute_topic_recall(result: ExtractionResult, gt: ExtractionResult) -> float:
    """Return the share of ground-truth topics that the extraction also found.

    Topic names are compared case-insensitively and must otherwise match
    exactly. When the ground truth lists no topics the recall is 1.0.
    """
    expected = set(topic.name.lower() for topic in gt.surface.topics)
    found = set(topic.name.lower() for topic in result.surface.topics)
    if expected:
        return len(expected.intersection(found)) / len(expected)
    # Nothing to recall, so the extraction cannot have missed anything.
    return 1.0


def compute_entity_recall(result: ExtractionResult, gt: ExtractionResult) -> float:
    """Return the share of ground-truth entities that the extraction also found.

    Entity names are compared case-insensitively and must otherwise match
    exactly. When the ground truth lists no entities the recall is 1.0.
    """
    expected = set(entity.name.lower() for entity in gt.surface.entities)
    found = set(entity.name.lower() for entity in result.surface.entities)
    if expected:
        return len(expected.intersection(found)) / len(expected)
    # Nothing to recall, so the extraction cannot have missed anything.
    return 1.0


def compute_objection_type_recall(result: ExtractionResult, gt: ExtractionResult) -> float:
    """Return the share of ground-truth objection types present in the extraction.

    Types are compared as-is (no case folding). When the ground truth has no
    objection triples the recall is 1.0.
    """
    expected = set(triple.objection.type for triple in gt.behavioral.objection_triples)
    found = set(triple.objection.type for triple in result.behavioral.objection_triples)
    if expected:
        return len(expected.intersection(found)) / len(expected)
    # Nothing to recall, so the extraction cannot have missed anything.
    return 1.0


def check_buying_intent(result: ExtractionResult, gt: ExtractionResult) -> bool:
    """Check that buying-intent markers were extracted whenever the ground truth has any.

    Passes trivially when the ground truth has no markers; only presence is
    checked, not which markers were found.
    """
    # Short-circuit keeps the extraction untouched when there is nothing to expect.
    return (not gt.behavioral.buying_intent_markers) or len(result.behavioral.buying_intent_markers) > 0


def check_mental_model_match(result: ExtractionResult, gt: ExtractionResult) -> bool:
    """Check whether the extracted primary mental model equals the ground-truth one."""
    predicted = result.psychographic.mental_model.primary
    expected = gt.psychographic.mental_model.primary
    return predicted == expected


def check_persona_overlap(result: ExtractionResult, gt: ExtractionResult) -> bool:
    """Check that at least one extracted persona archetype appears in the ground truth.

    Passes trivially when the ground truth lists no persona indicators.
    """
    expected = {indicator.archetype for indicator in gt.psychographic.persona_indicators}
    found = {indicator.archetype for indicator in result.psychographic.persona_indicators}
    return not expected or not expected.isdisjoint(found)


def check_multimodal(result: ExtractionResult, gt: ExtractionResult) -> bool:
    """Check the multimodal layer of an extraction against the ground truth.

    - Ground truth has no multimodal layer: the extraction must have none either.
    - Ground truth lists divergences: the extraction must report at least one.
    - Ground truth has a multimodal layer without divergences: always passes.
    """
    reference = gt.multimodal
    if reference is None:
        return result.multimodal is None
    if not reference.divergences:
        return True
    extracted = result.multimodal
    return extracted is not None and len(extracted.divergences) > 0


def evaluate_single(result: ExtractionResult, gt: ExtractionResult) -> dict:
    """Score one extraction against its ground truth on every metric.

    Returns a dict holding ``transcript_id`` (taken from ``result``) followed
    by one entry per metric: floats for the recall metrics, booleans for the
    presence/match checks.
    """
    metric_fns = (
        ("topic_recall", compute_topic_recall),
        ("entity_recall", compute_entity_recall),
        ("objection_type_recall", compute_objection_type_recall),
        ("buying_intent_present", check_buying_intent),
        ("mental_model_match", check_mental_model_match),
        ("persona_overlap", check_persona_overlap),
        ("multimodal_correct", check_multimodal),
    )
    scores = {"transcript_id": result.transcript_id}
    for metric_name, metric_fn in metric_fns:
        scores[metric_name] = metric_fn(result, gt)
    return scores

print("Evaluation functions loaded.")

In [None]:
display(Markdown("### Single Transcript Evaluation"))

if SELECTED_CALL_ID not in ground_truths:
    print(f"No ground truth available for {SELECTED_CALL_ID}")
else:
    gt = ground_truths[SELECTED_CALL_ID]
    metrics = evaluate_single(result, gt)

    # Human-readable thresholds shown next to each metric in the table.
    THRESHOLDS = {
        "topic_recall": ">= 0.50",
        "entity_recall": ">= 0.50",
        "objection_type_recall": ">= 0.50",
        "buying_intent_present": "True",
        "mental_model_match": "True (>= 50% across corpus)",
        "persona_overlap": "True",
        "multimodal_correct": "True",
    }

    rows = []
    for key, value in metrics.items():
        if key == "transcript_id":
            continue
        # Recall metrics are floats judged against 0.5; the rest are booleans.
        is_ratio = isinstance(value, float)
        display_val = f"{value:.0%}" if is_ratio else str(value)
        passed = value >= 0.5 if is_ratio else bool(value)
        status = "PASS" if passed else "FAIL"
        rows.append([key, display_val, THRESHOLDS.get(key, "N/A"), status])

    display_table(["Metric", "Value", "Threshold", "Status"], rows)

In [None]:
%%time
display(Markdown("### Full Corpus Evaluation"))
print("This runs extraction on all transcripts with ground truth (~25 API calls).\n")

# One metrics dict per evaluated transcript; rebuilt from scratch on every run
# of this cell.
corpus_results = []
for call_id, t in transcripts.items():
    if call_id not in ground_truths:
        print(f"Skipping {call_id} (no ground truth)")
        continue
    # end=" " / flush=True so the progress line shows up before the slow
    # extraction call and is completed by the "done" print below.
    print(f"Extracting {call_id}...", end=" ", flush=True)
    r = extract(t, client=client)
    # NOTE: this rebinds the notebook-level `gt` that the single-transcript
    # cell also assigns; after the loop it refers to the last evaluated call.
    gt = ground_truths[call_id]
    m = evaluate_single(r, gt)
    corpus_results.append(m)
    print(f"done (topic_recall={m['topic_recall']:.0%}, entity_recall={m['entity_recall']:.0%})")

In [None]:
display(Markdown("### Per-Transcript Results"))

# One table row per evaluated transcript: recalls as percentages, checks as
# True/False. Column order matches the header list passed to display_table.
rows = []
for m in corpus_results:
    rows.append([
        m["transcript_id"],
        f"{m['topic_recall']:.0%}",
        f"{m['entity_recall']:.0%}",
        f"{m['objection_type_recall']:.0%}",
        str(m["buying_intent_present"]),
        str(m["mental_model_match"]),
        str(m["persona_overlap"]),
        str(m["multimodal_correct"]),
    ])
display_table(
    ["Transcript", "Topic Recall", "Entity Recall", "Objection Recall",
     "Buying Intent", "Mental Model", "Persona", "Multimodal"],
    rows,
)

# Aggregate metrics
# Skipped entirely when no transcript was evaluated (avoids dividing by zero).
n = len(corpus_results)
if n > 0:
    # Recall metrics are averaged over transcripts.
    avg_topic = sum(m["topic_recall"] for m in corpus_results) / n
    avg_entity = sum(m["entity_recall"] for m in corpus_results) / n
    avg_objection = sum(m["objection_type_recall"] for m in corpus_results) / n
    # Mental model is scored as the fraction of transcripts with a match.
    mental_model_accuracy = sum(1 for m in corpus_results if m["mental_model_match"]) / n
    # The remaining checks must hold for every transcript.
    all_persona = all(m["persona_overlap"] for m in corpus_results)
    all_intent = all(m["buying_intent_present"] for m in corpus_results)
    all_multimodal = all(m["multimodal_correct"] for m in corpus_results)

    display(Markdown("### Aggregate Metrics"))
    agg_rows = [
        ["Avg Topic Recall", f"{avg_topic:.0%}", ">= 50%", "PASS" if avg_topic >= 0.5 else "FAIL"],
        ["Avg Entity Recall", f"{avg_entity:.0%}", ">= 50%", "PASS" if avg_entity >= 0.5 else "FAIL"],
        ["Avg Objection Recall", f"{avg_objection:.0%}", ">= 50%", "PASS" if avg_objection >= 0.5 else "FAIL"],
        ["Mental Model Accuracy", f"{mental_model_accuracy:.0%}", ">= 50%", "PASS" if mental_model_accuracy >= 0.5 else "FAIL"],
        ["All Buying Intent", str(all_intent), "True", "PASS" if all_intent else "FAIL"],
        ["All Persona Overlap", str(all_persona), "True", "PASS" if all_persona else "FAIL"],
        ["All Multimodal Correct", str(all_multimodal), "True", "PASS" if all_multimodal else "FAIL"],
    ]
    display_table(["Metric", "Value", "Threshold", "Status"], agg_rows)

---
## Results Comparison

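Compare two extraction results against the same ground truth to measure
the impact of prompt changes. Use this after running the Prompt Tuning
Workbench to see how modifications affect evaluation metrics. Both inputs
must be full `ExtractionResult` objects covering every layer; the workbench's
`original_result` / `modified_result` hold a single layer and cannot be passed in directly.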

In [None]:
def compare_versions_vs_ground_truth(
    result_a: ExtractionResult,
    result_b: ExtractionResult,
    gt: ExtractionResult,
    label_a: str = "Version A",
    label_b: str = "Version B",
):
    """Show how two extractions score against one shared ground truth.

    Each result is scored with ``evaluate_single`` and the two metric
    mappings are rendered as a four-column table (metric, A, B, delta)
    under a markdown heading built from the two labels.

    Args:
        result_a: First extraction to score (the baseline column).
        result_b: Second extraction to score (the comparison column).
        gt: Ground truth both extractions are scored against.
        label_a: Column title for ``result_a``.
        label_b: Column title for ``result_b``.

    Returns:
        None. The heading and the table are displayed as a side effect.
    """
    scores_a = evaluate_single(result_a, gt)
    scores_b = evaluate_single(result_b, gt)

    def _table_row(metric):
        """Build the [metric, A, B, delta] row for one metric key."""
        before = scores_a[metric]
        after = scores_b[metric]
        if not isinstance(before, float):
            # Non-float metrics (e.g. boolean flags) have no numeric delta;
            # only report whether the value differs between versions.
            verdict = "same" if before == after else "changed"
            return [metric, str(before), str(after), verdict]
        # Float metrics are shown as whole percentages with a signed change.
        shown_before = f"{before:.0%}"
        shown_after = f"{after:.0%}"
        return [metric, shown_before, shown_after, f"{after - before:+.0%}"]

    # The transcript id identifies the sample; it is not a metric.
    table = [_table_row(metric) for metric in scores_a if metric != "transcript_id"]

    display(Markdown(f"### {label_a} vs {label_b}"))
    display_table(["Metric", label_a, label_b, "Delta"], table)

In [None]:
# Usage example: score two complete extraction results against ground truth.
# Substitute your own ExtractionResult objects for result_a and result_b.
#
# For a prompt-tuning comparison, run one full extraction per prompt version:
#   result_original = extract(transcript, client=client)
#   # ... edit the prompt file on disk, or rebuild the extractor ...
#   result_modified = extract(transcript, client=client)
#
#   compare_versions_vs_ground_truth(
#       result_original, result_modified,
#       ground_truths[SELECTED_CALL_ID],
#       "Original", "Modified"
#   )

# Remind the reader of the call signature and list the call ids that can be
# passed as ground truth.
print("Use compare_versions_vs_ground_truth(result_a, result_b, gt) to compare two results.")
print(f"Ground truth available for: {list(ground_truths)}")

---
## Evaluation Module — Precision, Recall & F1

The evaluation module (`customer_intelligence.evaluation`) provides a comprehensive
evaluation pipeline with three methods:

1. **Programmatic metrics** — Precision, recall, F1 via fuzzy matching for all signal types
2. **LLM-as-judge** — Rubric-based quality scoring for subjective signals (optional)
3. **NLP baselines** — Cross-validation with spaCy NER, YAKE keyphrases, TextBlob sentiment (optional)

This replaces the simpler recall-only evaluation above with per-signal-type precision AND recall,
fuzzy matching (so "ROI justification" matches "ROI analysis"), and structural checks.

See `docs/evaluation.md` for full details on the methodology.

In [None]:
# Entry points of the evaluation module: `evaluate` is called below on a single
# (extraction, ground truth, transcript) triple, `evaluate_corpus` on a list of them.
from customer_intelligence.evaluation import evaluate, evaluate_corpus
# Report classes; the cells that follow never reference them by name —
# presumably they are the return types of the two functions above (TODO confirm).
from customer_intelligence.evaluation.report import EvaluationReport, CorpusReport

# Reached only if both imports above succeeded.
print("Evaluation module loaded.")

In [None]:
# Evaluate the selected transcript with the evaluation module.
# Inputs come from earlier cells: `result` (the extraction for the selected
# call), `transcript`, and the `ground_truths` mapping. The LLM-judge pass is
# switched off, so only the non-judge evaluation methods run here.
if SELECTED_CALL_ID not in ground_truths:
    # Without ground truth there is nothing to score against.
    print(f"No ground truth for {SELECTED_CALL_ID} — skipping evaluation")
else:
    gt = ground_truths[SELECTED_CALL_ID]
    report = evaluate(result, gt, transcript, skip_llm_judge=True)
    print(report.summary())

In [None]:
# Corpus-level evaluation with per-signal-type breakdown.
#
# Builds one (extraction, ground truth, transcript) triple for every transcript
# that has ground truth, scores them together with `evaluate_corpus` (LLM judge
# disabled), then shows the per-signal precision/recall/F1 table and each
# transcript's overall F1.
#
# Loop targets are ordinary top-level names that stay bound after the loop, so
# they are deliberately named differently from `gt` and `report`: running this
# cell must not overwrite the single-transcript `gt` / `report` created by the
# evaluation cell for SELECTED_CALL_ID.
cases = []
for call_id, case_transcript in transcripts.items():
    if call_id not in ground_truths:
        # Transcripts without ground truth cannot be scored.
        continue
    case_ground_truth = ground_truths[call_id]
    # Re-use existing extraction if available, otherwise extract
    if call_id in extractions:
        case_extraction = extractions[call_id]
    elif call_id == SELECTED_CALL_ID:
        # `result` is expected to hold the extraction already produced for the
        # selected call in an earlier cell.
        case_extraction = result
    else:
        # Nothing cached for this call: run the extraction pipeline now.
        print(f"Extracting {call_id}...", end=" ", flush=True)
        case_extraction = extract(case_transcript, client=client)
        print("done")
    cases.append((case_extraction, case_ground_truth, case_transcript))

print(f"\nEvaluating {len(cases)} transcripts...")
corpus = evaluate_corpus(cases, skip_llm_judge=True)

# Per-signal-type aggregated metrics
display(Markdown("### Corpus Evaluation — Per-Signal Metrics"))
agg = corpus.mean_metrics_by_signal()
rows = []
for signal, stats in sorted(agg.items()):
    # A metric may be missing from `stats`; render a dash rather than
    # failing while formatting None.
    precision = stats.get("precision")
    recall = stats.get("recall")
    f1_score = stats.get("f1")
    rows.append([
        signal,
        f"{precision:.0%}" if precision is not None else "—",
        f"{recall:.0%}" if recall is not None else "—",
        f"{f1_score:.0%}" if f1_score is not None else "—",
    ])
display_table(["Signal", "Precision", "Recall", "F1"], rows)

# Per-transcript summary
display(Markdown("### Per-Transcript Overall F1"))
for transcript_report in corpus.reports:
    print(f"  {transcript_report.transcript_id}: F1 = {transcript_report.overall_f1:.2%}")