# FinSense — CFO KPI & Sentiment Extraction (Multi-Company)

This notebook takes the parsed transcript segments from `data/processed/transcripts.csv`,
filters **CFO prepared remarks**, and extracts:

- basic KPIs (e.g., revenue YoY growth if mentioned)
- sentiment (polarity, subjectivity)
- metadata (company, fiscal period, doc path)

It then builds a `cfo_insights_enriched` DataFrame and writes JSON "insight packs"
for each CFO segment into `data/insights/`.

In [1]:
# 06_kpi_extraction.ipynb

from pathlib import Path
import json
import math

import pandas as pd

DATA_DIR = Path("../data")
PROCESSED = DATA_DIR / "processed"
INSIGHTS_DIR = DATA_DIR / "insights"

PROCESSED.mkdir(parents=True, exist_ok=True)
INSIGHTS_DIR.mkdir(parents=True, exist_ok=True)

PROCESSED, INSIGHTS_DIR


(PosixPath('../data/processed'), PosixPath('../data/insights'))

In [2]:
# Load the processed transcripts produced by src.finsense.ingest

transcripts_path = PROCESSED / "transcripts.csv"
df = pd.read_csv(transcripts_path)

print(transcripts_path)
df.head()


../data/processed/transcripts.csv


Unnamed: 0,doc_path,company_hint,fiscal_year,fiscal_quarter,ingest_date,segment_index,speaker,section,text,source
0,data/raw/NFLX_2024Q2_SAMPLE_REMARKS.txt,NFLX 2024Q2 SAMPLE REMARKS,2024.0,Q2,2025-11-25,0,FULL_TEXT,prepared_remarks,Prepared Remarks – CFO\n\nRevenue grew 12% yea...,manual_drop
1,data/raw/AMD_2024Q2_SAMPLE_REMARKS.txt,AMD 2024Q2 SAMPLE REMARKS,2024.0,Q2,2025-11-25,0,FULL_TEXT,prepared_remarks,Prepared Remarks – CFO\n\nRevenue increased 8%...,manual_drop
2,data/raw/ADBE_2024Q2_SAMPLE_REMARKS.txt,ADBE 2024Q2 SAMPLE REMARKS,2024.0,Q2,2025-11-25,0,FULL_TEXT,prepared_remarks,Prepared Remarks – CFO\n\nTotal revenue increa...,manual_drop
3,data/raw/NVDA_2024Q2_Remarks.txt,NVDA 2024Q2 Remarks,2024.0,Q2,2025-11-25,0,OPERATOR,prepared_remarks,Welcome to Q2 2024 results.,manual_drop
4,data/raw/NVDA_2024Q2_Remarks.txt,NVDA 2024Q2 Remarks,2024.0,Q2,2025-11-25,1,CFO,prepared_remarks,Revenue grew 10% year-over-year.,manual_drop


In [3]:
# Identify which documents have a proper CFO prepared-remarks segment
cfo_mask = (
    df["speaker"].str.upper().eq("CFO")
    & df["section"].str.lower().eq("prepared_remarks")
)

docs_with_cfo_prepared = set(df.loc[cfo_mask, "doc_path"])

# For every row, tag whether its underlying document has any CFO prepared remarks
df["has_cfo_prepared"] = df["doc_path"].isin(docs_with_cfo_prepared)


In [4]:
# Quick check: which speakers and sections do we have?

df[["company_hint", "fiscal_year", "fiscal_quarter", "speaker", "section"]].head(20)


Unnamed: 0,company_hint,fiscal_year,fiscal_quarter,speaker,section
0,NFLX 2024Q2 SAMPLE REMARKS,2024.0,Q2,FULL_TEXT,prepared_remarks
1,AMD 2024Q2 SAMPLE REMARKS,2024.0,Q2,FULL_TEXT,prepared_remarks
2,ADBE 2024Q2 SAMPLE REMARKS,2024.0,Q2,FULL_TEXT,prepared_remarks
3,NVDA 2024Q2 Remarks,2024.0,Q2,OPERATOR,prepared_remarks
4,NVDA 2024Q2 Remarks,2024.0,Q2,CFO,prepared_remarks
5,NVDA 2024Q2 Remarks,2024.0,Q2,Q&A,qa
6,ADBE 2025Q4 Q3 FY2025 earnings press release,2025.0,Q4,PREFACE,preface
7,ADBE 2025Q4 Q3 FY2025 earnings press release,2025.0,Q4,GAAP,prepared_remarks
8,ADBE 2025Q4 Q3 FY2025 earnings press release,2025.0,Q4,GAAP,prepared_remarks
9,ADBE Q3 FY2025 earnings press release,,,PREFACE,preface


In [5]:
# Focus on CFO / chief financial officer / full-text prepared remarks

def is_cfo_like(s: str) -> bool:
    if not isinstance(s, str):
        return False
    s_upper = s.upper()
    return (
        "CFO" in s_upper
        or "CHIEF FINANCIAL" in s_upper
        or s_upper == "FULL_TEXT"    # fallback for PDFs with no speaker segmentation
    )

mask = (
    df["section"].eq("prepared_remarks")
    & df["speaker"].apply(is_cfo_like)
)

cfo_segments_all = df[mask].copy()

# Keep only rows with a fiscal year AND quarter
cfo_segments_all = cfo_segments_all[
    cfo_segments_all["fiscal_year"].notna()
    & cfo_segments_all["fiscal_quarter"].notna()
].copy()

print("CFO-like segments:", len(cfo_segments_all))
cfo_segments_all[["company_hint", "fiscal_year", "fiscal_quarter", "speaker"]].head(20)


CFO-like segments: 4


Unnamed: 0,company_hint,fiscal_year,fiscal_quarter,speaker
0,NFLX 2024Q2 SAMPLE REMARKS,2024.0,Q2,FULL_TEXT
1,AMD 2024Q2 SAMPLE REMARKS,2024.0,Q2,FULL_TEXT
2,ADBE 2024Q2 SAMPLE REMARKS,2024.0,Q2,FULL_TEXT
4,NVDA 2024Q2 Remarks,2024.0,Q2,CFO


## Filter to CFO Prepared Remarks

We only want the CFO's **prepared remarks** segments, which are usually where
high-level KPIs and guidance comments are given.


In [6]:
import re

def parse_growth_pct(text: str, keyword: str = "revenue") -> float | None:
    """
    Very lightweight parser:
    - Look for patterns like XX% near the keyword ("revenue", "EPS", etc.)
    - Return first match as a number.
    """
    if not isinstance(text, str):
        return None

    text_lower = text.lower()
    if keyword not in text_lower:
        return None

    # Find patterns like "up 10%" or "10 percent"
    pct_match = re.search(r"(-?\d+(\.\d+)?)\s*%", text_lower)
    if pct_match:
        try:
            return float(pct_match.group(1))
        except ValueError:
            return None

    return None


def detect_guidance_comment(text: str) -> str | None:
    if not isinstance(text, str):
        return None

    lower = text.lower()
    if "guidance" in lower or "outlook" in lower or "forecast" in lower:
        # return a short snippet around the first occurrence
        idx = lower.find("guidance")
        if idx == -1:
            idx = lower.find("outlook")
        if idx == -1:
            idx = lower.find("forecast")

        start = max(0, idx - 80)
        end = min(len(text), idx + 220)
        snippet = text[start:end].replace("\n", " ").strip()
        return snippet

    return None


def detect_margin_comment(text: str) -> str | None:
    if not isinstance(text, str):
        return None

    lower = text.lower()
    if "margin" in lower:
        idx = lower.find("margin")
        start = max(0, idx - 80)
        end = min(len(text), idx + 220)
        snippet = text[start:end].replace("\n", " ").strip()
        return snippet

    return None


## KPI Extraction Helper

We define a simple **rule-based extractor** that looks for:

- revenue YoY growth patterns like: `"10% year-over-year"`
- EPS growth patterns like: `"EPS grew 5%"`
- presence of words like `"guidance"`, `"outlook"`, `"forecast"`, `"margin"`

This is intentionally lightweight — enough to demonstrate signal extraction
for portfolio / credit analytics, without being a full NLP engine.


In [7]:
def kpis_for_segment(row):
    """
    Accepts either a Pandas Series row or a raw text string.
    Safely extracts KPIs in both cases.
    """
    if isinstance(row, str):
        text = row
    else:
        # row is a Pandas Series
        text = row.get("text", "")

    rev_yoy = parse_growth_pct(text, keyword="revenue")
    eps_yoy = parse_growth_pct(text, keyword="eps")

    # Very basic pattern checks
    guidance = None
    margin = None
    low_text = text.lower()
    if "guidance" in low_text:
        guidance = "guidance commentary detected"
    if "margin" in low_text:
        margin = "margin commentary detected"

    return {
        "revenue_growth_yoy_pct": rev_yoy,
        "eps_growth_yoy_pct": eps_yoy,
        "guidance_comment": guidance,
        "margin_comment": margin,
    }


# Quick smoke test
sample_row = cfo_segments_all.iloc[0]
kpis_for_segment(sample_row)


{'revenue_growth_yoy_pct': 12.0,
 'eps_growth_yoy_pct': None,
 'guidance_comment': 'guidance commentary detected',
 'margin_comment': 'margin commentary detected'}

In [8]:
def sentiment_for_segment(row: pd.Series) -> dict:
    """
    Placeholder sentiment. You can later swap this for a true model
    (e.g. FinBERT) or OpenAI sentiment.
    """
    return {
        "polarity": 0.0,
        "subjectivity": 0.0,
    }

sentiment_for_segment(sample_row)


{'polarity': 0.0, 'subjectivity': 0.0}

In [9]:
def clean_preview_text(text: str, length: int = 300) -> str | None:
    """
    Take the CFO text, collapse whitespace, truncate to N chars.
    """
    if not isinstance(text, str) or not text.strip():
        return None

    t = " ".join(text.split())  # collapse whitespace/newlines
    return t[:length] + ("…" if len(t) > length else "")


In [10]:
def make_insight_pack(row: pd.Series) -> dict:
    kpis = kpis_for_segment(row)
    sent = sentiment_for_segment(row)

    # Defensive handling of year
    fy = row.get("fiscal_year")
    fq = row.get("fiscal_quarter")

    try:
        fy_int = int(fy) if not (isinstance(fy, float) and math.isnan(fy)) else None
    except Exception:
        fy_int = None

    meta = {
        "doc_path": row.get("doc_path"),
        "segment_index": int(row.get("segment_index", 0)),
        "ingest_date": row.get("ingest_date"),
        "source": row.get("source"),
    }

    pack = {
        "doc_path": row["doc_path"],
        "company_hint": row["company_hint"],
        "fiscal_year": row["fiscal_year"],
        "fiscal_quarter": row["fiscal_quarter"],
        "speaker": row["speaker"],
        "section": row["section"],
        "segment_index": int(row["segment_index"]),
        "kpis": kpis_for_segment(row["text"]),
        "meta": {
            "ingest_date": row.get("ingest_date"),
            # new: whether this underlying doc has a CFO prepared-remarks segment anywhere
            "has_cfo_prepared": bool(row.get("has_cfo_prepared", False)),
        },
    }

    # NEW: high-level doc_type flag (earnings vs non-earnings)
    # If *no* CFO prepared-remarks segment exists anywhere in this document,
    # treat it as a non-earnings IR document (overview, sustainability, etc.)
    if not pack["meta"]["has_cfo_prepared"]:
        pack["meta"]["doc_type"] = "non_earnings"
    else:
        pack["meta"]["doc_type"] = "earnings_call"

    return pack


# Test on one row
test_pack = make_insight_pack(sample_row)
test_pack


{'doc_path': 'data/raw/NFLX_2024Q2_SAMPLE_REMARKS.txt',
 'company_hint': 'NFLX 2024Q2 SAMPLE REMARKS',
 'fiscal_year': np.float64(2024.0),
 'fiscal_quarter': 'Q2',
 'speaker': 'FULL_TEXT',
 'section': 'prepared_remarks',
 'segment_index': 0,
 'kpis': {'revenue_growth_yoy_pct': 12.0,
  'eps_growth_yoy_pct': None,
  'guidance_comment': 'guidance commentary detected',
  'margin_comment': 'margin commentary detected'},
 'meta': {'ingest_date': '2025-11-25',
  'has_cfo_prepared': False,
  'doc_type': 'non_earnings'}}

In [11]:
written = 0
skipped = 0

for idx, row in cfo_segments_all.iterrows():
    pack = make_insight_pack(row)

    fy = pack.get("fiscal_year")
    fq = pack.get("fiscal_quarter")
    company = pack.get("company_hint") or "Unknown"

    # Skip if year/quarter missing
    if fy is None or fq is None:
        skipped += 1
        continue

    # Use a short slug for company name
    company_slug = str(company).split()[0].upper()

    # File name pattern: TICKER_YEAR_QUARTER_SEG.json
    # segment_index is stored at the top level of pack
    seg_idx = int(pack.get("segment_index", row.get("segment_index", 0)))

    file_name = f"{company_slug}_{fy}_{fq}_seg{seg_idx}.json"
    out_path = INSIGHTS_DIR / file_name

    with out_path.open("w", encoding="utf-8") as f:
        json.dump(pack, f, indent=2)

    written += 1

print(f"Wrote {written} CFO insight packs to {INSIGHTS_DIR}")
if skipped:
    print(f"Skipped {skipped} rows without fiscal year/quarter")


Wrote 4 CFO insight packs to ../data/insights


## Sentiment Extraction Helper

We use a lightweight sentiment model (`TextBlob`) to get:

- **polarity**: [-1, 1]
- **subjectivity**: [0, 1]

This isn't finance-specific, but it's enough to demonstrate how FinSense can
attach directional "tone" to CFO commentary.
