# FinSense — CFO KPI & Sentiment Extraction (Multi-Company)

This notebook takes the parsed transcript segments from `data/processed/transcripts.csv`,
filters **CFO prepared remarks**, and extracts:

- basic KPIs (e.g., revenue YoY growth if mentioned)
- sentiment (polarity, subjectivity)
- metadata (company, fiscal period, doc path)

It then builds a `cfo_insights_enriched` DataFrame and writes JSON "insight packs"
for each CFO segment into `data/insights/`.

In [11]:
# 06_kpi_extraction.ipynb

from pathlib import Path
import json
import math

import pandas as pd

# Project data layout, relative to the notebook's working directory.
DATA_DIR = Path("../data")
PROCESSED = DATA_DIR.joinpath("processed")
INSIGHTS_DIR = DATA_DIR.joinpath("insights")

# Create both folders up front so later cells can read/write without checks.
for _folder in (PROCESSED, INSIGHTS_DIR):
    _folder.mkdir(parents=True, exist_ok=True)

PROCESSED, INSIGHTS_DIR


(PosixPath('../data/processed'), PosixPath('../data/insights'))

In [12]:
# Load the processed transcripts produced by src.finsense.ingest
# into `df`, the frame every later cell filters.

transcripts_path = PROCESSED / "transcripts.csv"
df = pd.read_csv(transcripts_path)

# Echo the resolved path and preview the first rows as a sanity check.
print(transcripts_path)
df.head()


../data/processed/transcripts.csv


Unnamed: 0,doc_path,company_hint,fiscal_year,fiscal_quarter,ingest_date,segment_index,speaker,section,text,source
0,data/raw/NFLX_2024Q2_SAMPLE_REMARKS.txt,NFLX 2024Q2 SAMPLE REMARKS,2024.0,Q2,2025-11-24,0,FULL_TEXT,prepared_remarks,Prepared Remarks – CFO\n\nRevenue grew 12% yea...,manual_drop
1,data/raw/AMD_2024Q2_SAMPLE_REMARKS.txt,AMD 2024Q2 SAMPLE REMARKS,2024.0,Q2,2025-11-24,0,FULL_TEXT,prepared_remarks,Prepared Remarks – CFO\n\nRevenue increased 8%...,manual_drop
2,data/raw/ADBE_2024Q2_SAMPLE_REMARKS.txt,ADBE 2024Q2 SAMPLE REMARKS,2024.0,Q2,2025-11-24,0,FULL_TEXT,prepared_remarks,Prepared Remarks – CFO\n\nTotal revenue increa...,manual_drop
3,data/raw/NVDA_2024Q2_Remarks.txt,NVDA 2024Q2 Remarks,2024.0,Q2,2025-11-24,0,OPERATOR,prepared_remarks,Welcome to Q2 2024 results.,manual_drop
4,data/raw/NVDA_2024Q2_Remarks.txt,NVDA 2024Q2 Remarks,2024.0,Q2,2025-11-24,1,CFO,prepared_remarks,Revenue grew 10% year-over-year.,manual_drop


In [13]:
# Quick check: which speakers and sections do we have?
# Shows the first 20 rows of the identifying columns only.

df[["company_hint", "fiscal_year", "fiscal_quarter", "speaker", "section"]].head(20)


Unnamed: 0,company_hint,fiscal_year,fiscal_quarter,speaker,section
0,NFLX 2024Q2 SAMPLE REMARKS,2024.0,Q2,FULL_TEXT,prepared_remarks
1,AMD 2024Q2 SAMPLE REMARKS,2024.0,Q2,FULL_TEXT,prepared_remarks
2,ADBE 2024Q2 SAMPLE REMARKS,2024.0,Q2,FULL_TEXT,prepared_remarks
3,NVDA 2024Q2 Remarks,2024.0,Q2,OPERATOR,prepared_remarks
4,NVDA 2024Q2 Remarks,2024.0,Q2,CFO,prepared_remarks
5,NVDA 2024Q2 Remarks,2024.0,Q2,Q&A,qa
6,ADBE 2025Q4 Q3 FY2025 earnings press release,2025.0,Q4,PREFACE,preface
7,ADBE 2025Q4 Q3 FY2025 earnings press release,2025.0,Q4,GAAP,prepared_remarks
8,ADBE 2025Q4 Q3 FY2025 earnings press release,2025.0,Q4,GAAP,prepared_remarks
9,ADBE Q3 FY2025 earnings press release,,,PREFACE,preface


In [14]:
# Focus on CFO / chief financial officer / full-text prepared remarks

def is_cfo_like(s: str) -> bool:
    """
    Tell whether a speaker label should be treated as the CFO.

    A label qualifies when, compared case-insensitively, it contains "CFO"
    or "CHIEF FINANCIAL", or is exactly "FULL_TEXT" (the fallback speaker
    for PDFs with no speaker segmentation). Non-string values (e.g. NaN)
    never qualify.
    """
    if isinstance(s, str):
        label = s.upper()
        if label == "FULL_TEXT":
            return True
        return any(token in label for token in ("CFO", "CHIEF FINANCIAL"))
    return False

# Prepared-remarks rows whose speaker passes the CFO-like test.
in_prepared_remarks = df["section"].eq("prepared_remarks")
speaker_is_cfo_like = df["speaker"].apply(is_cfo_like)
mask = in_prepared_remarks & speaker_is_cfo_like

cfo_segments_all = df.loc[mask].copy()

# Keep only rows with a fiscal year AND quarter
period_known = cfo_segments_all[["fiscal_year", "fiscal_quarter"]].notna().all(axis=1)
cfo_segments_all = cfo_segments_all.loc[period_known].copy()

print("CFO-like segments:", len(cfo_segments_all))
cfo_segments_all[["company_hint", "fiscal_year", "fiscal_quarter", "speaker"]].head(20)


CFO-like segments: 5


Unnamed: 0,company_hint,fiscal_year,fiscal_quarter,speaker
0,NFLX 2024Q2 SAMPLE REMARKS,2024.0,Q2,FULL_TEXT
1,AMD 2024Q2 SAMPLE REMARKS,2024.0,Q2,FULL_TEXT
2,ADBE 2024Q2 SAMPLE REMARKS,2024.0,Q2,FULL_TEXT
4,NVDA 2024Q2 Remarks,2024.0,Q2,CFO
12,NFLX 2025Q4 Content Accounting Overview,2025.0,Q4,FULL_TEXT


## Filter to CFO Prepared Remarks

We only want the CFO's **prepared remarks** segments, which are usually where
high-level KPIs and guidance comments are given.


In [15]:
import re

def parse_growth_pct(text: str, keyword: str = "revenue") -> float | None:
    """
    Very lightweight parser:
    - Look for patterns like XX% near the keyword ("revenue", "EPS", etc.)
    - Return first match as a number.
    """
    if not isinstance(text, str):
        return None

    text_lower = text.lower()
    if keyword not in text_lower:
        return None

    # Find patterns like "up 10%" or "10 percent"
    pct_match = re.search(r"(-?\d+(\.\d+)?)\s*%", text_lower)
    if pct_match:
        try:
            return float(pct_match.group(1))
        except ValueError:
            return None

    return None


def detect_guidance_comment(text: str) -> str | None:
    if not isinstance(text, str):
        return None

    lower = text.lower()
    if "guidance" in lower or "outlook" in lower or "forecast" in lower:
        # return a short snippet around the first occurrence
        idx = lower.find("guidance")
        if idx == -1:
            idx = lower.find("outlook")
        if idx == -1:
            idx = lower.find("forecast")

        start = max(0, idx - 80)
        end = min(len(text), idx + 220)
        snippet = text[start:end].replace("\n", " ").strip()
        return snippet

    return None


def detect_margin_comment(text: str) -> str | None:
    if not isinstance(text, str):
        return None

    lower = text.lower()
    if "margin" in lower:
        idx = lower.find("margin")
        start = max(0, idx - 80)
        end = min(len(text), idx + 220)
        snippet = text[start:end].replace("\n", " ").strip()
        return snippet

    return None


## KPI Extraction Helper

We define a simple **rule-based extractor** that looks for:

- revenue YoY growth patterns like: `"10% year-over-year"`
- EPS growth patterns like: `"EPS grew 5%"`
- presence of words like `"guidance"`, `"outlook"`, `"forecast"`, `"margin"`

This is intentionally lightweight — enough to demonstrate signal extraction
for portfolio / credit analytics, without being a full NLP engine.


In [16]:
def kpis_for_segment(row: pd.Series) -> dict:
    """
    Run every rule-based extractor over one transcript row.

    Reads the row's "text" value (empty string when absent) and returns a
    dict with the revenue and EPS growth percentages plus the guidance and
    margin snippets; each entry is None when its extractor finds nothing.
    """
    segment_text = row.get("text", "")
    return {
        "revenue_growth_yoy_pct": parse_growth_pct(segment_text, keyword="revenue"),
        "eps_growth_yoy_pct": parse_growth_pct(segment_text, keyword="eps"),
        "guidance_comment": detect_guidance_comment(segment_text),
        "margin_comment": detect_margin_comment(segment_text),
    }


# Quick smoke test: run the KPI extractor on the first CFO-like segment.
# `sample_row` is reused by the sentiment and insight-pack checks below.
sample_row = cfo_segments_all.iloc[0]
kpis_for_segment(sample_row)


{'revenue_growth_yoy_pct': 12.0,
 'eps_growth_yoy_pct': None,
 'guidance_comment': 'focused on disciplined returns on new titles. We are raising full-year revenue guidance to low-teens growth and expect operating margin to land at the high end of our prior range.',
 'margin_comment': '% year-over-year, driven by steady subscriber growth and higher ARPU. Operating margin expanded 150 basis points as content amortization normalized. Content spend grew modestly, but we remain focused on disciplined returns on new titles. We are raising full-year revenue guidance to low-teens growth'}

In [17]:
def sentiment_for_segment(row: pd.Series) -> dict:
    """
    Placeholder sentiment scorer.

    Ignores ``row`` and always reports a neutral polarity and subjectivity
    of 0.0. Swap in a true model later (e.g. FinBERT) or OpenAI sentiment.
    """
    neutral = 0.0
    return dict(polarity=neutral, subjectivity=neutral)

# Smoke test: score the same sample row used for the KPI check.
sentiment_for_segment(sample_row)


{'polarity': 0.0, 'subjectivity': 0.0}

In [18]:
def clean_preview_text(text: str, length: int = 300) -> str | None:
    """
    Take the CFO text, collapse whitespace, truncate to N chars.
    """
    if not isinstance(text, str) or not text.strip():
        return None

    t = " ".join(text.split())  # collapse whitespace/newlines
    return t[:length] + ("…" if len(t) > length else "")


In [19]:
def _none_if_missing(value):
    """Map pandas' missing markers (None, NaN, NA, NaT) to None; pass other values through."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value


def _int_or_none(value) -> int | None:
    """Convert a present value to int, returning None when it is missing or not int-like."""
    value = _none_if_missing(value)
    if value is None:
        return None
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return None


def make_insight_pack(row: pd.Series) -> dict:
    """
    Assemble the "insight pack" dict for one transcript row.

    Combines the extracted KPIs, the sentiment scores, the company/period
    fields, provenance metadata and a short preview of the text. Missing
    cells (NaN/None) are emitted as None so that ``json.dump`` writes
    ``null`` and never the non-standard ``NaN`` token.

    Args:
        row: One row of the transcripts frame.

    Returns:
        A dict with keys company_hint, fiscal_year, fiscal_quarter, speaker,
        section, kpis, sentiment, meta and preview_text.
    """
    kpis = kpis_for_segment(row)
    sent = sentiment_for_segment(row)

    # A fiscal year read from CSV arrives as a float (2024.0) or NaN.
    fy_int = _int_or_none(row.get("fiscal_year"))
    fq = _none_if_missing(row.get("fiscal_quarter"))

    # A missing segment index falls back to 0, the same default used when the
    # column is absent; calling int() on NaN/None would raise.
    raw_segment_index = _none_if_missing(row.get("segment_index", 0))
    segment_index = 0 if raw_segment_index is None else int(raw_segment_index)

    meta = {
        "doc_path": _none_if_missing(row.get("doc_path")),
        "segment_index": segment_index,
        "ingest_date": _none_if_missing(row.get("ingest_date")),
        "source": _none_if_missing(row.get("source")),
    }

    pack = {
        "company_hint": _none_if_missing(row.get("company_hint")),
        "fiscal_year": fy_int,
        "fiscal_quarter": fq,
        "speaker": _none_if_missing(row.get("speaker")),
        "section": _none_if_missing(row.get("section")),
        "kpis": kpis,
        "sentiment": sent,
        "meta": meta,
        # NEW: preview snippet used by the Streamlit app
        "preview_text": clean_preview_text(row.get("text", "")),
    }

    return pack


# Test on one row: build the full insight pack for the sample segment
# and display it.
test_pack = make_insight_pack(sample_row)
test_pack


{'company_hint': 'NFLX 2024Q2 SAMPLE REMARKS',
 'fiscal_year': 2024,
 'fiscal_quarter': 'Q2',
 'speaker': 'FULL_TEXT',
 'section': 'prepared_remarks',
 'kpis': {'revenue_growth_yoy_pct': 12.0,
  'eps_growth_yoy_pct': None,
  'guidance_comment': 'focused on disciplined returns on new titles. We are raising full-year revenue guidance to low-teens growth and expect operating margin to land at the high end of our prior range.',
  'margin_comment': '% year-over-year, driven by steady subscriber growth and higher ARPU. Operating margin expanded 150 basis points as content amortization normalized. Content spend grew modestly, but we remain focused on disciplined returns on new titles. We are raising full-year revenue guidance to low-teens growth'},
 'sentiment': {'polarity': 0.0, 'subjectivity': 0.0},
 'meta': {'doc_path': 'data/raw/NFLX_2024Q2_SAMPLE_REMARKS.txt',
  'segment_index': 0,
  'ingest_date': '2025-11-24',
  'source': 'manual_drop'},
 'preview_text': 'Prepared Remarks – CFO Revenue

## Sentiment Extraction Helper

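The plan is to use a lightweight sentiment model (e.g. `TextBlob`) to get:

- **polarity**: [-1, 1]
- **subjectivity**: [0, 1]

For now, `sentiment_for_segment` is a placeholder that returns `0.0` for both
scores. A general-purpose model isn't finance-specific, but it's enough to
demonstrate how FinSense can attach directional "tone" to CFO commentary.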


import pandas as pd

# Cross-check which company / period combinations have CFO-like prepared
# remarks. Reads the CSV from the same PROCESSED folder as the load cell.
df = pd.read_csv(PROCESSED / "transcripts.csv")

# "FULL_TEXT" is a speaker placeholder (documents without speaker
# segmentation), not a section value, so it is handled by the speaker rule.
cfo_mask = df["speaker"].apply(is_cfo_like)
remarks_mask = df["section"].eq("prepared_remarks")

df_cfo = df[cfo_mask & remarks_mask]
df_cfo[["company_hint", "fiscal_year", "fiscal_quarter"]].drop_duplicates()


## Build `cfo_insights_enriched` DataFrame

Now we loop over all CFO prepared-remarks segments and create an enriched table with:

- company & period metadata  
- raw text location (`doc_path`)  
- extracted KPIs  
- sentiment scores


# Write one JSON insight pack per CFO-like segment into INSIGHTS_DIR.
written = 0

# `cfo_segments_all` is the filtered frame built by the CFO filter cell
# (prepared remarks, CFO-like speaker, fiscal year and quarter present).
for _, row in cfo_segments_all.iterrows():
    pack = make_insight_pack(row)

    company_slug = str(pack["company_hint"]).split()[0].upper()  # AMD, ADBE, NFLX, etc.
    year = pack["fiscal_year"] or "NA"
    quarter = pack["fiscal_quarter"] or "NA"
    seg = pack["meta"]["segment_index"]

    # NOTE(review): two documents for the same company, period and segment
    # index map to the same file name, and the later one overwrites the
    # earlier one. Confirm whether that is intended.
    out_name = f"cfo_insight_pack_{company_slug}_{year}{quarter}_seg{seg}.json"
    out_path = INSIGHTS_DIR / out_name

    with out_path.open("w", encoding="utf-8") as f:
        json.dump(pack, f, indent=2)

    written += 1

print(f"Wrote {written} CFO insight packs to {INSIGHTS_DIR}")


# List the insight-pack file names currently present in INSIGHTS_DIR, sorted.
sorted(p.name for p in INSIGHTS_DIR.glob("cfo_insight_pack_*.json"))


## Write JSON "Insight Packs" for Each CFO Segment

For downstream systems (dashboards, credit analytics, portfolio tools), we
export each CFO prepared-remarks segment as a **single JSON object** with:

- metadata (company, fiscal period, doc path)
- KPI fields
- sentiment
- basic provenance

Files are written to `../data/insights/` as:

`cfo_insight_pack_<TICKER>_<YEAR><QUARTER>_seg<SEGMENT>.json`
(for example `cfo_insight_pack_NFLX_2024Q2_seg0.json`).


# Pick the AMD rows from the CFO-like segments: first by a case-insensitive
# "AMD" match on company_hint, and if that finds nothing, on doc_path.
amd_rows = cfo_segments_all[cfo_segments_all["company_hint"].str.contains("AMD", case=False, na=False)]

if amd_rows.empty:
    amd_rows = cfo_segments_all[cfo_segments_all["doc_path"].str.contains("AMD", case=False, na=False)]

amd_rows[["company_hint", "fiscal_year", "fiscal_quarter", "doc_path"]].head()


# Fail loudly when there is nothing to export.
if amd_rows.empty:
    raise ValueError("No AMD CFO prepared-remarks segment found. Check transcripts.csv.")

# Only the first matching segment is exported.
amd_row = amd_rows.iloc[0]

amd_insight = make_insight_pack(amd_row)
amd_insight


out_dir = Path("../data/insights")
out_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): the file name hard-codes 2025Q4, while the pack carries the
# fiscal period of whichever AMD row came first. Confirm the two agree, or
# derive the name from amd_insight["fiscal_year"] / ["fiscal_quarter"].
out_path = out_dir / "cfo_insight_pack_AMD_2025Q4.json"

with out_path.open("w", encoding="utf-8") as f:
    json.dump(amd_insight, f, indent=2)

print("Wrote AMD insight pack to:", out_path)