***Purpose of this Notebook***

This notebook evaluates the performance of the rule-based regular expression (regex) extraction baseline against a human-labeled gold dataset. Evaluation metrics include precision, recall, and F1 score, calculated using the same gold annotations that define ground truth. No extraction logic is modified or introduced in this notebook.

In [8]:
import json
from pathlib import Path
import pandas as pd
import re

In [9]:
# The project root is taken to be the parent of the current working directory;
# the gold annotation files live in a sub-folder of it.
PROJECT_ROOT = Path.cwd().parent
GOLD_DIR = PROJECT_ROOT.joinpath("gold_annotations")

# Gold annotation files that make up the evaluation set.
EVAL_FILES = [
    "1017655_10K_2020_0001654954-21-003649.json.gold.json",
    "1064722_10K_2020_0001760319-21-000039.json.gold.json",
    "1066684_10K_2020_0001104659-21-042359.json.gold.json",
    "1082324_10K_2020_0001140361-21-008678.json.gold.json",
    "1327567_10K_2021_0001327567-21-000029.json.gold.json",
    "1353499_10K_2020_0001344676-21-000004.json.gold.json",
    "1378590_10K_2021_0001437749-21-028984.json.gold.json",
    "1404655_10K_2020_0001564590-21-006083.json.gold.json",
]



In [10]:

def load_gold_annotations(gold_dir: Path, eval_files: list[str]) -> pd.DataFrame:
    """
    Load gold annotations from JSON files into one row-per-entity DataFrame.

    Each file may be either an object with an "annotations" list or a bare
    list of annotation objects. Every annotation must provide "label",
    "section" and "text"; "start" and "end" are optional and default to None.

    Args:
        gold_dir: Directory containing the gold annotation files.
        eval_files: File names (relative to gold_dir) to load.

    Returns:
        DataFrame with columns file, label, section, text, start, end.
        The columns are present even when no annotations were loaded, so
        downstream column lookups do not fail on an empty evaluation set.
    """
    columns = ["file", "label", "section", "text", "start", "end"]
    rows = []

    for fname in eval_files:
        data = json.loads((gold_dir / fname).read_text(encoding="utf-8"))

        # A bare JSON list has no .get(); only look up "annotations" on objects.
        anns = data.get("annotations", data) if isinstance(data, dict) else data

        # normalize file id to match the source file stem
        # e.g. "...042359.json.gold.json" -> "...042359"
        file_id = Path(fname.replace(".gold.json", "")).stem

        rows.extend(
            {
                "file": file_id,
                "label": a["label"],
                "section": a["section"],
                "text": a["text"],
                "start": a.get("start"),
                "end": a.get("end"),
            }
            for a in anns
        )

    return pd.DataFrame(rows, columns=columns)

# Build the gold table for the evaluation files and preview the first rows.
gold_df = load_gold_annotations(gold_dir=GOLD_DIR, eval_files=EVAL_FILES)
gold_df.head()

Unnamed: 0,file,label,section,text,start,end
0,1017655_10K_2020_0001654954-21-003649,MONEY,item_7,"$3,541",13031,13037
1,1017655_10K_2020_0001654954-21-003649,MONEY,item_7,"$19,395",13038,13045
2,1017655_10K_2020_0001654954-21-003649,MONEY,item_7,27845,13081,13087
3,1017655_10K_2020_0001654954-21-003649,MONEY,item_7,148035,13088,13095
4,1017655_10K_2020_0001654954-21-003649,MONEY,item_7,114881,13130,13137


In [11]:
# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent
BASELINE_PATH = PROJECT_ROOT.joinpath("baseline_regex_outputs.json")

# Entity labels read from the baseline output.
LABELS = [
    "PERSON",
    "TITLE",
    "ORG",
    "MONEY",
]

def load_baseline_predictions(
    baseline_path: Path,
    eval_file_ids: set[str],
    labels: "list[str] | None" = None,
) -> pd.DataFrame:
    """
    Load regex-baseline predictions into one row-per-entity DataFrame.

    Two JSON layouts are accepted:
      - Case A: { "<filename>.json": { "PERSON": [...], ... }, ... }
      - Case B: [ { "file": "<filename>.json", "PERSON": [...], ... }, ... ]
        (the file name may also be stored under "filename" or "FILE";
        entries without any of these keys are skipped)

    Args:
        baseline_path: Path to the baseline JSON file.
        eval_file_ids: File ids (file name without its final suffix) to keep;
            predictions for any other file are ignored.
        labels: Labels to read from each entry. Defaults to the module-level
            LABELS list when omitted.

    Returns:
        DataFrame with columns file, label, section, text, start, end.
        "section" is "item_7" for MONEY and "item_10" for every other label;
        "start" and "end" are always None. The columns are present even when
        no prediction matches, so downstream column lookups do not fail.

    Raises:
        ValueError: If the JSON root is neither an object nor a list.
    """
    columns = ["file", "label", "section", "text", "start", "end"]
    active_labels = LABELS if labels is None else labels
    data = json.loads(baseline_path.read_text(encoding="utf-8"))

    if not isinstance(data, (dict, list)):
        raise ValueError("baseline_regex_outputs.json is neither a dict nor a list")

    def iter_entries():
        """Yield (file_key, payload) pairs for either supported layout."""
        if isinstance(data, dict):
            yield from data.items()
            return
        for obj in data:
            file_key = obj.get("file") or obj.get("filename") or obj.get("FILE")
            if file_key:
                yield file_key, obj

    rows = []
    for file_key, payload in iter_entries():
        file_id = Path(file_key).stem  # "....json" -> "...."
        if file_id not in eval_file_ids:
            continue

        for label in active_labels:
            section = "item_7" if label == "MONEY" else "item_10"
            # A missing or null label entry is treated as "no predictions".
            for value in payload.get(label, []) or []:
                rows.append({
                    "file": file_id,
                    "label": label,
                    "section": section,
                    "text": str(value),
                    "start": None,
                    "end": None,
                })

    return pd.DataFrame(rows, columns=columns)

# The evaluation file universe is exactly the set of files present in gold_df.
eval_file_ids = {file_id for file_id in gold_df["file"].unique()}

baseline_df = load_baseline_predictions(
    baseline_path=BASELINE_PATH,
    eval_file_ids=eval_file_ids,
)
baseline_df.head()


Unnamed: 0,file,label,section,text,start,end
0,1017655_10K_2020_0001654954-21-003649,PERSON,item_10,Austin Lewis,,
1,1017655_10K_2020_0001654954-21-003649,PERSON,item_10,David Scott,,
2,1017655_10K_2020_0001654954-21-003649,PERSON,item_10,Andrew Pilaro,,
3,1017655_10K_2020_0001654954-21-003649,PERSON,item_10,Allan Pratt,,
4,1017655_10K_2020_0001654954-21-003649,PERSON,item_10,Under Delaware,,


In [18]:
def norm_text(s: str) -> str:
    """
    Normalize text for matching: trim, lowercase, collapse whitespace runs.

    None becomes the empty string and non-string values are converted with
    str(). Punctuation and wording are left untouched.
    """
    if s is None:
        return ""
    lowered = str(s).strip().lower()
    # Any run of whitespace inside the text becomes a single space.
    return re.sub(r"\s+", " ", lowered)


def to_counts(df: pd.DataFrame, count_col: str) -> pd.DataFrame:
    """
    Count entity rows per (file, label, section, text_n).

    text_n is the "text" column passed through norm_text. The input frame is
    not modified. The result has one row per distinct key combination, with
    the number of occurrences stored under the name given by count_col.
    """
    group_keys = ["file", "label", "section", "text_n"]

    # assign() works on a copy, so the caller's frame keeps its columns.
    normalized = df.assign(text_n=df["text"].map(norm_text))
    occurrences = normalized.groupby(group_keys).size()
    return occurrences.reset_index(name=count_col)


def compute_metrics(gold_df: pd.DataFrame, baseline_df: pd.DataFrame):
    """
    Compute TP/FP/FN using multiset exact matching on normalized text.

    Gold and baseline rows are each counted per (file, label, section, text_n)
    and the two count tables are outer-joined on that key. For each key,
    tp = min(gold count, predicted count), fp = predicted - tp and
    fn = gold - tp.

    Args:
        gold_df: Gold entities with columns file, label, section, text.
        baseline_df: Predicted entities with the same columns.

    Returns:
      - merged: match table (one row per unique (file,label,section,text_n))
        with integer columns gold_n, pred_n, tp, fp, fn
      - by_label: totals + precision/recall/F1 per label
      - overall: dict of micro totals + precision/recall/F1 over all rows
    """
    match_keys = ["file", "label", "section", "text_n"]
    count_cols = ["tp", "fp", "fn"]
    metric_cols = ["precision", "recall", "f1"]

    gold_counts = to_counts(gold_df, "gold_n")
    base_counts = to_counts(baseline_df, "pred_n")

    # The outer join leaves NaN where one side has no entry, which turns the
    # count columns into floats; fill with 0 and restore integer counts.
    merged = (
        gold_counts.merge(base_counts, on=match_keys, how="outer")
        .fillna({"gold_n": 0, "pred_n": 0})
        .astype({"gold_n": "int64", "pred_n": "int64"})
    )

    merged["tp"] = merged[["gold_n", "pred_n"]].min(axis=1)
    merged["fp"] = merged["pred_n"] - merged["tp"]
    merged["fn"] = merged["gold_n"] - merged["tp"]

    def prf(tp, fp, fn):
        # Each ratio is defined as 0.0 when its denominator is zero.
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall    = tp / (tp + fn) if (tp + fn) else 0.0
        f1        = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
        return precision, recall, f1

    # Per-label totals (micro by label)
    by_label = merged.groupby("label")[count_cols].sum().reset_index()

    # Build the metric columns one at a time so an empty table (no gold and
    # no predictions) still gets the three columns instead of raising.
    label_scores = [
        prf(tp, fp, fn)
        for tp, fp, fn in zip(by_label["tp"], by_label["fp"], by_label["fn"])
    ]
    for position, metric in enumerate(metric_cols):
        by_label[metric] = pd.Series(
            [scores[position] for scores in label_scores],
            index=by_label.index,
            dtype=float,
        )

    # Overall micro-average
    tp_all, fp_all, fn_all = (int(merged[col].sum()) for col in count_cols)
    p_all, r_all, f1_all = prf(tp_all, fp_all, fn_all)

    overall = {
        "tp": tp_all,
        "fp": fp_all,
        "fn": fn_all,
        "precision": float(p_all),
        "recall": float(r_all),
        "f1": float(f1_all),
    }

    return merged, by_label, overall


In [19]:
# --- Run evaluation ---
merged_df, by_label_df, overall = compute_metrics(gold_df, baseline_df)

# Headline numbers: counts as returned, ratios rounded to three decimals.
overall_summary = {key: overall[key] for key in ("tp", "fp", "fn")}
overall_summary.update(
    {key: round(overall[key], 3) for key in ("precision", "recall", "f1")}
)

print("OVERALL REGEX BASELINE (micro):")
print(overall_summary)

print("\nPER-LABEL PERFORMANCE (micro):")
rounded_by_label = by_label_df.sort_values("label").round(
    {"precision": 3, "recall": 3, "f1": 3}
)
display(rounded_by_label)

# --- Build per-document, per-section scores for Sprint 3 ---
def prf(tp, fp, fn):
    """Return (precision, recall, f1); each is 0.0 when its denominator is zero."""
    predicted = tp + fp
    actual = tp + fn

    precision = tp / predicted if predicted else 0.0
    recall = tp / actual if actual else 0.0

    score_sum = precision + recall
    if not score_sum:
        return precision, recall, 0.0
    return precision, recall, 2 * precision * recall / score_sum

# Sum the match counts for every (document, section) pair.
doc_keys = ["file", "section"]
doc_count_cols = ["tp", "fp", "fn"]
per_doc = merged_df.groupby(doc_keys)[doc_count_cols].sum().reset_index()

# Identification columns written to the export.
per_doc["filename"] = per_doc["file"].astype(str) + ".json"
per_doc["variant"] = "regex_baseline"
per_doc["mode"] = "strict"


def _score_row(row):
    """Precision/recall/F1 for one per-document row, as a three-item Series."""
    return pd.Series(prf(row.tp, row.fp, row.fn))


per_doc[["precision", "recall", "f1"]] = per_doc.apply(_score_row, axis=1)

export_columns = [
    "filename", "variant", "section", "mode",
    "tp", "fp", "fn", "precision", "recall", "f1",
]
per_doc = per_doc[export_columns]

# --- Add overall row (useful for headline tables) ---
overall_row = pd.DataFrame([{
    "filename": "ALL_DOCS",
    "variant": "regex_baseline",
    "section": "overall",
    "mode": "strict",
    **{
        metric: overall[metric]
        for metric in ("tp", "fp", "fn", "precision", "recall", "f1")
    },
}])

regex_scores = pd.concat([per_doc, overall_row], ignore_index=True)

# --- Export for Sprint 3 ---
out_path = Path("exports") / "regex_scores_per_doc.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)
regex_scores.to_csv(out_path, index=False)

print("\nExported Sprint 3 regex scores to:")
print(out_path.resolve())

display(regex_scores.head())

OVERALL REGEX BASELINE (micro):
{'tp': 126, 'fp': 126, 'fn': 1095, 'precision': 0.5, 'recall': 0.103, 'f1': 0.171}

PER-LABEL PERFORMANCE (micro):


Unnamed: 0,label,tp,fp,fn,precision,recall,f1
0,MONEY,71.0,92.0,871.0,0.436,0.075,0.129
1,ORG,8.0,7.0,77.0,0.533,0.094,0.16
2,PERSON,15.0,24.0,38.0,0.385,0.283,0.326
3,TITLE,32.0,3.0,109.0,0.914,0.227,0.364



Exported Sprint 3 regex scores to:
C:\Users\abrid\OneDrive\Desktop\WGU Work\D502 (CAPSTONE)\Task 3\Notebooks\exports\regex_scores_per_doc.csv


Unnamed: 0,filename,variant,section,mode,tp,fp,fn,precision,recall,f1
0,1017655_10K_2020_0001654954-21-003649.json,regex_baseline,item_10,strict,13.0,3.0,49.0,0.8125,0.209677,0.333333
1,1017655_10K_2020_0001654954-21-003649.json,regex_baseline,item_7,strict,5.0,0.0,58.0,1.0,0.079365,0.147059
2,1064722_10K_2020_0001760319-21-000039.json,regex_baseline,item_10,strict,9.0,6.0,48.0,0.6,0.157895,0.25
3,1064722_10K_2020_0001760319-21-000039.json,regex_baseline,item_7,strict,5.0,0.0,10.0,1.0,0.333333,0.5
4,1066684_10K_2020_0001104659-21-042359.json,regex_baseline,item_10,strict,7.0,4.0,11.0,0.636364,0.388889,0.482759


A True Positive (TP) occurs when:

gold label == regex label

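gold text, after lowercasing and whitespace normalization, exactly matches an entry in the regex list for that label in the same file and section (repeated strings are matched one-for-one)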

A False Negative (FN) occurs when:

gold text has no unmatched counterpart left in the regex list for that label (it is absent, or it occurs fewer times than in gold)

A False Positive (FP) occurs when:

regex extracted a string under a label

that string has no unmatched counterpart left in gold for that label and section in that file (it is absent, or it was extracted more times than it occurs in gold)

precision = TP / (TP + FP)
recall    = TP / (TP + FN)
f1        = 2 * precision * recall / (precision + recall)   (harmonic mean of precision and recall)

The regex baseline produced high precision for TITLE entities (0.91), indicating that when the rules identified a title, it was usually correct. However, recall for TITLE remained low (0.23), suggesting the rule set was conservative and missed many valid titles. PERSON extraction (precision 0.39, recall 0.28) and ORG extraction (precision 0.53, recall 0.09) were weak on both measures, reflecting the difficulty of reliably identifying names and organizations in SEC filings using pattern-based rules alone (e.g., false positives from capitalized phrases and structural/legal boilerplate). MONEY recall was especially low (0.08) because the gold standard includes many numeric monetary-like values beyond dollar-prefixed currency strings; the regex baseline primarily captured explicit currency formats, leading to systematic under-identification relative to the broader gold definition.