In [1]:
import spacy
from spacy.training import Example
from spacy.pipeline import EntityRuler
import re
from spacy.tokens import Span
import string

# convention: never overwrite these
# nlp_sm = pretrained pipeline
# nlp_blank = blank pipeline for offsets/tests


In [2]:
# Pretrained pipeline (has tokenizer + tagger/parser/ner)
nlp_sm = spacy.load("en_core_web_sm")

# Blank pipeline for building training Examples (no trained components required)
nlp_blank = spacy.blank("en")

# --- Quick token / POS / DEP inspection (use pretrained, not blank) ----------

sample_text = "John Q. Smith served as Chief Financial Officer of Acme Corp."
doc_sm = nlp_sm(sample_text)

# One line per token: surface text, coarse part-of-speech tag, dependency label.
for token in doc_sm:
    print(token.text, token.pos_, token.dep_)

# --- Build a training Example from character-offset spans -------------------

# IMPORTANT: Example creation only needs a Doc (tokenized). It does NOT require
# adding / initializing a blank NER component.
# Each tuple is (start_char, end_char, label) into sample_text; end_char is
# exclusive (e.g. 0-13 covers the 13 characters of "John Q. Smith").
train_entities = [
    (0, 13, "PERSON"),  # John Q. Smith
    (24, 47, "TITLE"),  # Chief Financial Officer
    (51, 61, "ORG"),    # Acme Corp. (including period)
]

# Tokenize with the blank pipeline, then attach the gold entity annotations.
doc_blank = nlp_blank.make_doc(sample_text)
example = Example.from_dict(doc_blank, {"entities": train_entities})

# Sanity check: the reference doc should expose exactly the three spans above.
print([(ent.text, ent.label_) for ent in example.reference.ents])

John PROPN compound
Q. PROPN compound
Smith PROPN nsubj
served VERB ROOT
as ADP prep
Chief PROPN compound
Financial PROPN compound
Officer PROPN pobj
of ADP prep
Acme PROPN compound
Corp. PROPN pobj
[('John Q. Smith', 'PERSON'), ('Chief Financial Officer', 'TITLE'), ('Acme Corp.', 'ORG')]


In [3]:
#label inspector
def inspect_labels(nlp):
    """
    Print a label report for a pipeline.

    Shows the component names, the labels of the "entity_ruler" and "ner"
    components (or a placeholder line when a component is absent), and the
    sorted union of both label sets.

    Args:
        nlp: pipeline object exposing `pipe_names` and `get_pipe(name)`;
            each inspected component must expose `labels`.
    """
    pipe_names = nlp.pipe_names

    print("=== Pipeline components ===")
    print(pipe_names)

    # (component name, section heading, line printed when the component is missing)
    sections = (
        ("entity_ruler", "\n=== EntityRuler labels ===", "(no EntityRuler)"),
        ("ner", "\n=== NER labels ===", "(no NER component)"),
    )

    combined = set()
    for component, heading, missing_line in sections:
        print(heading)
        if component not in pipe_names:
            print(missing_line)
            continue
        labels = nlp.get_pipe(component).labels
        print(sorted(labels))
        combined.update(labels)

    print("\n=== Combined entity labels (what docs can emit) ===")
    print(sorted(combined))


In [4]:
#Experiment 1

# Sample texts: executive-title sentences for the first base-model run.
# The "\n" inside a title is deliberate: it splits "Chief Executive" and
# "Officer" across a line break.
samples = [
    "Chairman, President, Chief Executive\nOfficer & Financial Officer.",
    "Ms. Jane A. Doe has served as Interim Chief Executive Officer since 2021.",
    "John Smith, Jr. is our Chief Accounting Officer (CAO).",
]

# Title probes: line-broken titles, comma/"&"-joined title lists, acronyms,
# and a name with a "Jr." suffix.
probe_titles = [
    "He served as Chief Executive\nOfficer of the Company.",
    "She is Chairman, President, and Chief Financial Officer.",
    "He is Chief Executive Officer & President.",
    "She is our CEO and CAO.",
    "John Smith, Jr. served as Chief Accounting Officer.",
    "Chairman, President, Chief Executive\nOfficer & Financial Officer.",
]

#Helpers to run tests
def show_doc(doc, show_tokens=True):
    """
    Print the entity predictions of a processed document.

    Args:
        doc: iterable of tokens (each with `.text`) that also exposes `.ents`,
            whose items have `.text` and `.label_`.
        show_tokens: when true, print the token texts before the entities.
    """
    if show_tokens:
        token_texts = [tok.text for tok in doc]
        print("TOKENS:", token_texts)

    entity_pairs = [(span.text, span.label_) for span in doc.ents]
    print("ENTS:", entity_pairs)

def run_suite(nlp, texts, show_tokens=True, header=None):
    """
    Process each text with `nlp` and print the text followed by its entities.

    Args:
        nlp: callable that turns a string into a doc accepted by `show_doc`.
        texts: iterable of input strings.
        show_tokens: forwarded to `show_doc`.
        header: optional banner printed once before the first text.
    """
    if header:
        print(f"\n=== {header} ===")

    for text in texts:
        parsed = nlp(text)
        print("\nTEXT:", text)
        show_doc(parsed, show_tokens=show_tokens)

#Base model experiment
# Reuse the pretrained `nlp_sm` pipeline loaded in the setup cell rather than
# calling spacy.load() again here: the notebook convention is to never
# overwrite nlp_sm, and a second load only repeats the model-loading cost.

run_suite(nlp_sm, samples, show_tokens=True, header="Experiment 1: Base model (titles mostly ignored)")
run_suite(nlp_sm, probe_titles, show_tokens=True, header="Probe suite: Base model")


=== Experiment 1: Base model (titles mostly ignored) ===

TEXT: Chairman, President, Chief Executive
Officer & Financial Officer.
TOKENS: ['Chairman', ',', 'President', ',', 'Chief', 'Executive', '\n', 'Officer', '&', 'Financial', 'Officer', '.']
ENTS: [('Officer & Financial', 'ORG')]

TEXT: Ms. Jane A. Doe has served as Interim Chief Executive Officer since 2021.
TOKENS: ['Ms.', 'Jane', 'A.', 'Doe', 'has', 'served', 'as', 'Interim', 'Chief', 'Executive', 'Officer', 'since', '2021', '.']
ENTS: [('Jane A. Doe', 'PERSON'), ('2021', 'DATE')]

TEXT: John Smith, Jr. is our Chief Accounting Officer (CAO).
TOKENS: ['John', 'Smith', ',', 'Jr.', 'is', 'our', 'Chief', 'Accounting', 'Officer', '(', 'CAO', ')', '.']
ENTS: [('John Smith', 'PERSON'), ('CAO', 'ORG')]

=== Probe suite: Base model ===

TEXT: He served as Chief Executive
Officer of the Company.
TOKENS: ['He', 'served', 'as', 'Chief', 'Executive', '\n', 'Officer', 'of', 'the', 'Company', '.']
ENTS: []

TEXT: She is Chairman, President, 

In [5]:
#TITLE EntityRuler builder

def build_title_pipeline(model="en_core_web_sm"):
    """
    Load `model` and insert an EntityRuler ahead of its "ner" component that
    tags job titles with the TITLE label.

    Args:
        model: name or path handed to `spacy.load`.

    Returns:
        The loaded pipeline with the TITLE patterns registered.
    """
    def title(pattern):
        # Wrap a phrase string or token-spec list as a TITLE ruler entry.
        return {"label": "TITLE", "pattern": pattern}

    acronyms = ("CEO", "CFO", "COO", "CAO", "CTO", "CIO")
    single_words = ("chairman", "president", "director")

    nlp = spacy.load(model)
    ruler = nlp.add_pipe("entity_ruler", before="ner")

    # C-suite acronyms (exact phrase patterns)
    title_patterns = [title(acronym) for acronym in acronyms]

    # Common single-token titles (case-insensitive)
    title_patterns.extend(title([{"LOWER": word}]) for word in single_words)

    title_patterns.extend([
        # Chief <TitleCase...> Officer (no newline)
        title([
            {"LOWER": "chief"},
            {"IS_TITLE": True, "OP": "+"},
            {"LOWER": "officer"},
        ]),
        # Chief <TitleCase...> \n Officer (optional newline token)
        title([
            {"LOWER": "chief"},
            {"IS_TITLE": True, "OP": "+"},
            {"TEXT": "\n", "OP": "?"},
            {"LOWER": "officer"},
        ]),
        # <TitleCase...> Officer (Financial Officer, Accounting Officer, etc.)
        title([
            {"IS_TITLE": True, "OP": "+"},
            {"LOWER": "officer"},
        ]),
    ])

    ruler.add_patterns(title_patterns)
    return nlp

nlp_titles = build_title_pipeline()

run_suite(nlp_titles, probe_titles, show_tokens=False, header="Probe suite: With TITLE EntityRuler")


=== Probe suite: With TITLE EntityRuler ===

TEXT: He served as Chief Executive
Officer of the Company.
ENTS: [('Chief Executive\nOfficer', 'TITLE')]

TEXT: She is Chairman, President, and Chief Financial Officer.
ENTS: [('Chairman', 'TITLE'), ('President', 'TITLE'), ('Chief Financial Officer', 'TITLE')]

TEXT: He is Chief Executive Officer & President.
ENTS: [('Chief Executive Officer', 'TITLE'), ('President', 'TITLE')]

TEXT: She is our CEO and CAO.
ENTS: [('CEO', 'TITLE'), ('CAO', 'TITLE')]

TEXT: John Smith, Jr. served as Chief Accounting Officer.
ENTS: [('John Smith', 'PERSON'), ('Jr.', 'NORP'), ('Chief Accounting Officer', 'TITLE')]

TEXT: Chairman, President, Chief Executive
Officer & Financial Officer.
ENTS: [('Chairman', 'TITLE'), ('President', 'TITLE'), ('Chief Executive\nOfficer', 'TITLE'), ('Financial Officer', 'TITLE')]


In [6]:
#Suffix Merge Utility

# Lower-cased name suffixes that may follow a PERSON entity.
SUFFIXES = {"jr.", "sr.", "ii", "iii", "iv", "v"}

def merge_person_suffixes(doc):
    """
    Build a display view of `doc.ents` in which a name suffix (Jr., III, ...)
    directly after a PERSON entity, optionally separated by one comma, is
    folded into that PERSON span.

    Intended for display/debugging only (not training data); `doc.ents`
    itself is not modified.

    Returns:
        A list of (text, label) tuples ordered by span start, then end.
    """
    entities = list(doc.ents)
    token_count = len(doc)
    collected = []
    idx = 0

    while idx < len(entities):
        current = entities[idx]
        idx += 1  # from here on, entities[idx] is the *following* entity

        if current.label_ != "PERSON":
            collected.append(current)
            continue

        cursor = current.end  # token index just past the PERSON span

        # Step over a single comma immediately after the name.
        if cursor < token_count and doc[cursor].text == ",":
            cursor += 1

        suffix_follows = (
            cursor < token_count
            and doc[cursor].text.strip().lower() in SUFFIXES
        )
        if not suffix_follows:
            collected.append(current)
            continue

        # Extend the PERSON span through the suffix token.
        collected.append(
            Span(doc, current.start, cursor + 1, label=doc.vocab.strings["PERSON"])
        )

        # Drop the following entity when it is just the suffix itself
        # (the model often tags it separately, e.g. as NORP).
        if idx < len(entities) and entities[idx].text.strip().lower() in SUFFIXES:
            idx += 1

    # De-duplicate on (start, end, label), then order by position.
    unique = {}
    for span in collected:
        unique[(span.start, span.end, span.label_)] = span
    ordered = sorted(unique.values(), key=lambda span: (span.start, span.end))

    return [(span.text, span.label_) for span in ordered]

In [7]:
# Quick demo (Experiment 1): compare the raw entities from the TITLE pipeline
# with the suffix-merged view for a name followed by ", Jr.".
demo = "John Smith, Jr. served as Chief Accounting Officer (CAO)."
doc_demo = nlp_titles(demo)
# RAW is doc.ents as produced by the pipeline; MERGED is the display-only view.
print("\nRAW:", [(e.text, e.label_) for e in doc_demo.ents])
print("MERGED:", merge_person_suffixes(doc_demo))


RAW: [('John Smith', 'PERSON'), ('Jr.', 'NORP'), ('Chief Accounting Officer', 'TITLE'), ('CAO', 'TITLE')]
MERGED: [('John Smith, Jr.', 'PERSON'), ('Chief Accounting Officer', 'TITLE'), ('CAO', 'TITLE')]


In [8]:
#Experiment 2: MONEY

# Probe sentences for monetary amounts, grouped by the formatting they exercise.
money_probes = [
    # clear currency + scale
    "Revenue was $2.4 million for the year ended 2024.",
    "Net sales were $325 thousand in Q4.",
    "Total revenue increased to $1,200.",
    "Operating income was ($113) for the period.",

    # scale words without $
    "Net sales totaled 325 thousand in Q4.",
    "Revenue totaled 2.4 million for fiscal 2024.",
    "Income was 1.2 billion last year.",
    "Total employees: 325.",

    # tricky formatting
    "Net sales were $2.4m and revenue was $3.1M.",
    "Revenue was 2,400 (in thousands).",
    "Revenue was $2,400 (in thousands).",
]

# Baseline view (pretrained only): no custom ruler, entities only.
run_suite(nlp_sm, money_probes, show_tokens=False, header="Experiment 2: Base model MONEY behavior")



=== Experiment 2: Base model MONEY behavior ===

TEXT: Revenue was $2.4 million for the year ended 2024.
ENTS: [('$2.4 million', 'MONEY'), ('the year ended 2024', 'DATE')]

TEXT: Net sales were $325 thousand in Q4.
ENTS: [('$325 thousand', 'MONEY'), ('Q4', 'GPE')]

TEXT: Total revenue increased to $1,200.
ENTS: [('1,200', 'MONEY')]

TEXT: Operating income was ($113) for the period.
ENTS: [('113', 'MONEY')]

TEXT: Net sales totaled 325 thousand in Q4.
ENTS: [('325 thousand', 'CARDINAL'), ('Q4', 'GPE')]

TEXT: Revenue totaled 2.4 million for fiscal 2024.
ENTS: [('2.4 million', 'CARDINAL'), ('fiscal 2024', 'DATE')]

TEXT: Income was 1.2 billion last year.
ENTS: [('1.2 billion', 'CARDINAL'), ('last year', 'DATE')]

TEXT: Total employees: 325.
ENTS: [('325', 'CARDINAL')]

TEXT: Net sales were $2.4m and revenue was $3.1M.
ENTS: [('2.4', 'MONEY'), ('3.1M.', 'MONEY')]

TEXT: Revenue was 2,400 (in thousands).
ENTS: [('2,400', 'CARDINAL'), ('thousands', 'CARDINAL')]

TEXT: Revenue was $2,400 (i

In [9]:
#Pipeline Builder for MONEY_CANDIDATE

def build_money_candidate_pipeline(model="en_core_web_sm"):
    """
    Load `model` and insert an EntityRuler ahead of its "ner" component that
    tags monetary expressions with the MONEY_CANDIDATE label.

    Args:
        model: name or path handed to `spacy.load`.

    Returns:
        The loaded pipeline with the MONEY_CANDIDATE patterns registered.
    """
    # Small factories so every pattern gets its own fresh token-spec dicts.
    def candidate(*token_specs):
        return {"label": "MONEY_CANDIDATE", "pattern": list(token_specs)}

    def literal(value):
        return {"TEXT": value}

    def matching(regex):
        return {"TEXT": {"REGEX": regex}}

    def lowercase_in(*words):
        return {"LOWER": {"IN": list(words)}}

    def grouped_amount():
        # digits with optional thousands separators and optional decimals
        return matching(r"^[\d,]+(\.\d+)?$")

    def scale_word():
        return lowercase_in("thousand", "million", "billion")

    nlp = spacy.load(model)
    ruler = nlp.add_pipe("entity_ruler", before="ner")

    patterns = [
        # "$" + number (split tokenization)
        candidate(literal("$"), grouped_amount()),

        # "$" + number + scale word  -> "$2.4 million", "$325 thousand"
        candidate(literal("$"), grouped_amount(), scale_word()),

        # number + scale word (no $) -> "2.4 million", "325 thousand"
        candidate({"LIKE_NUM": True}, scale_word()),

        # parenthetical negatives: "(", "$", "113", ")" -> "($113)"
        candidate(literal("("), literal("$"), grouped_amount(), literal(")")),

        # compact scale split: "$", "2.4", "m" -> "$2.4m"
        candidate(
            literal("$"),
            matching(r"^[\d]+(\.\d+)?$"),
            lowercase_in("m", "b"),
        ),

        # compact scale glued (sometimes with trailing punct): "$", "3.1M." -> "$3.1M."
        candidate(literal("$"), matching(r"^[\d]+(\.\d+)?[mMbB]\.?$")),

        # "(in thousands)" scaling cue: "2,400 (in thousands)"
        candidate(
            {"LIKE_NUM": True},
            literal("("),
            {"LOWER": "in"},
            lowercase_in("thousands", "millions", "billions"),
            literal(")"),
        ),
    ]

    ruler.add_patterns(patterns)
    return nlp

nlp_money = build_money_candidate_pipeline()

run_suite(nlp_money, money_probes, show_tokens=False, header="Experiment 2: With MONEY_CANDIDATE ruler")


=== Experiment 2: With MONEY_CANDIDATE ruler ===

TEXT: Revenue was $2.4 million for the year ended 2024.
ENTS: [('$2.4 million', 'MONEY_CANDIDATE'), ('the year ended 2024', 'DATE')]

TEXT: Net sales were $325 thousand in Q4.
ENTS: [('$325 thousand', 'MONEY_CANDIDATE'), ('Q4', 'GPE')]

TEXT: Total revenue increased to $1,200.
ENTS: [('$1,200', 'MONEY_CANDIDATE')]

TEXT: Operating income was ($113) for the period.
ENTS: [('($113)', 'MONEY_CANDIDATE')]

TEXT: Net sales totaled 325 thousand in Q4.
ENTS: [('325 thousand', 'MONEY_CANDIDATE'), ('Q4', 'GPE')]

TEXT: Revenue totaled 2.4 million for fiscal 2024.
ENTS: [('2.4 million', 'MONEY_CANDIDATE'), ('fiscal 2024', 'DATE')]

TEXT: Income was 1.2 billion last year.
ENTS: [('1.2 billion', 'MONEY_CANDIDATE'), ('last year', 'DATE')]

TEXT: Total employees: 325.
ENTS: [('325', 'CARDINAL')]

TEXT: Net sales were $2.4m and revenue was $3.1M.
ENTS: [('$2.4m', 'MONEY_CANDIDATE'), ('$3.1M.', 'MONEY_CANDIDATE')]

TEXT: Revenue was 2,400 (in thousand

In [17]:
#Candidate post-processing utils
# Lower-cased keywords whose presence marks a text as revenue-related.
REVENUE_CONTEXT = {
    "revenue", "net sales", "sales", "income",
    "turnover", "earned", "totaled", "totalled",
}

def has_revenue_context(doc):
    """
    Report whether the document text mentions any REVENUE_CONTEXT keyword.

    Keywords are matched case-insensitively as whole words (an optional
    plural "s" is tolerated), so "Revenues" and "net sales" count while
    "learned" (which merely contains "earned") or "salesperson" do not.

    Args:
        doc: object exposing the full text as `doc.text`.

    Returns:
        True if at least one keyword occurs as a word, otherwise False.
    """
    if not REVENUE_CONTEXT:
        # An empty alternation would match at any word boundary.
        return False

    # Sorted only so the generated pattern string is deterministic.
    keywords = sorted(REVENUE_CONTEXT, key=lambda kw: (-len(kw), kw))
    alternatives = "|".join(re.escape(kw) for kw in keywords)
    pattern = rf"\b(?:{alternatives})s?\b"
    return re.search(pattern, doc.text.lower()) is not None

def longest_money_candidates(doc):
    """
    Select non-overlapping MONEY_CANDIDATE entities from `doc.ents`.

    Candidates are visited by ascending start token; among candidates sharing
    a start, the longest comes first. A candidate is kept only if it does not
    overlap one that was already kept.

    Returns:
        The kept entity objects, in visiting order.
    """
    ordered = [span for span in doc.ents if span.label_ == "MONEY_CANDIDATE"]
    # (start, start - end): earlier start first, then longer span first.
    ordered.sort(key=lambda span: (span.start, span.start - span.end))

    selected = []
    for span in ordered:
        overlaps_selected = False
        for prior in selected:
            # Half-open token ranges overlap iff each starts before the other ends.
            if span.start < prior.end and prior.start < span.end:
                overlaps_selected = True
                break
        if not overlaps_selected:
            selected.append(span)
    return selected

def normalize_money_text(text: str) -> str:
    """
    Trim surrounding whitespace and trailing sentence punctuation / quotes,
    while preserving meaningful closers like ')' that are part of labeled
    MONEY spans (e.g., ($113), (in thousands)).

    Punctuation and quotes are stripped in a single pass, so any trailing mix
    is removed regardless of order (e.g. '$1,200."' -> '$1,200').

    Args:
        text: raw span text.

    Returns:
        The cleaned text (possibly empty).
    """
    # Sentence punctuation plus straight/curly closing quotes; ')' is kept on purpose.
    trailing_noise = {".", ",", ";", ":", "!", "?", "…", '"', "'", "”", "’"}

    t = text.strip()
    while t and t[-1] in trailing_noise:
        # rstrip() also drops whitespace exposed between stripped characters.
        t = t[:-1].rstrip()
    return t

def classify_money_candidates(doc):
    """
    Return the cleaned MONEY_CANDIDATE texts of `doc`, each paired with a
    document-level class.

    The class is "REVENUE_MONEY" when the document has revenue context and
    "OTHER_MONEY" otherwise; every candidate in the document gets the same one.

    Returns:
        A list of (normalized_text, class_name) tuples.
    """
    if has_revenue_context(doc):
        money_class = "REVENUE_MONEY"
    else:
        money_class = "OTHER_MONEY"

    classified = []
    for span in longest_money_candidates(doc):
        classified.append((normalize_money_text(span.text), money_class))
    return classified

In [18]:
#Experiment 2 - demo
def run_money_experiment(nlp, texts):
    """
    For each text, print the text, its classified MONEY_CANDIDATE spans, and
    all entities the pipeline produced.

    Args:
        nlp: callable that turns a string into a processed doc.
        texts: iterable of input strings.
    """
    print("=== Experiment 2: MONEY_CANDIDATE (prefer longest, trimmed) ===")
    for text in texts:
        parsed = nlp(text)
        print("\nTEXT:", text)
        print("CANDIDATES:", classify_money_candidates(parsed))
        all_entities = [(span.text, span.label_) for span in parsed.ents]
        print("ALL ENTS:", all_entities)

run_money_experiment(nlp_money, money_probes)


=== Experiment 2: MONEY_CANDIDATE (prefer longest, trimmed) ===

TEXT: Revenue was $2.4 million for the year ended 2024.
CANDIDATES: [('$2.4 million', 'REVENUE_MONEY')]
ALL ENTS: [('$2.4 million', 'MONEY_CANDIDATE'), ('the year ended 2024', 'DATE')]

TEXT: Net sales were $325 thousand in Q4.
CANDIDATES: [('$325 thousand', 'REVENUE_MONEY')]
ALL ENTS: [('$325 thousand', 'MONEY_CANDIDATE'), ('Q4', 'GPE')]

TEXT: Total revenue increased to $1,200.
CANDIDATES: [('$1,200', 'REVENUE_MONEY')]
ALL ENTS: [('$1,200', 'MONEY_CANDIDATE')]

TEXT: Operating income was ($113) for the period.
CANDIDATES: [('($113)', 'REVENUE_MONEY')]
ALL ENTS: [('($113)', 'MONEY_CANDIDATE')]

TEXT: Net sales totaled 325 thousand in Q4.
CANDIDATES: [('325 thousand', 'REVENUE_MONEY')]
ALL ENTS: [('325 thousand', 'MONEY_CANDIDATE'), ('Q4', 'GPE')]

TEXT: Revenue totaled 2.4 million for fiscal 2024.
CANDIDATES: [('2.4 million', 'REVENUE_MONEY')]
ALL ENTS: [('2.4 million', 'MONEY_CANDIDATE'), ('fiscal 2024', 'DATE')]

TEXT:

In [12]:
# Experiment 3: ORG probes
# Probe sentences for organization names, grouped by the form they exercise.
org_probes = [
    # standard corporate forms
    "Acme Corp. reported revenue of $2.4 million.",
    "Acme Corporation reported revenue of $2.4 million.",
    "Acme, Inc. reported revenue of $2.4 million.",
    "Acme Inc reported revenue of $2.4 million.",
    "Acme LLC entered into the agreement.",
    "Acme L.L.C. entered into the agreement.",

    # punctuation + abbreviations
    "International Business Machines Corp. (IBM) announced the transaction.",
    "We refer to International Business Machines Corp. as IBM.",
    "Apple Inc.’s subsidiaries include Example Sub, LLC.",

    # SEC-ish usage (generic "the Company" references, defined terms in quotes)
    "The Company was incorporated in Delaware in 2001.",
    "We and our subsidiaries (the “Company”) operate in multiple segments.",
    "Acme Corp. (the “Company”) is a Delaware corporation.",

    # “doing business as”
    "Acme Corp., doing business as Acme Health, provides services.",

    # common false positive risk: government/regulators
    "The Securities and Exchange Commission issued guidance.",
]

# Baseline view (pretrained only): no custom ruler, entities only.
run_suite(nlp_sm, org_probes, show_tokens=False, header="Experiment 3: ORG baseline (en_core_web_sm)")



=== Experiment 3: ORG baseline (en_core_web_sm) ===

TEXT: Acme Corp. reported revenue of $2.4 million.
ENTS: [('Acme Corp.', 'ORG'), ('$2.4 million', 'MONEY')]

TEXT: Acme Corporation reported revenue of $2.4 million.
ENTS: [('Acme Corporation', 'ORG'), ('$2.4 million', 'MONEY')]

TEXT: Acme, Inc. reported revenue of $2.4 million.
ENTS: [('Acme, Inc.', 'ORG'), ('$2.4 million', 'MONEY')]

TEXT: Acme Inc reported revenue of $2.4 million.
ENTS: [('Acme Inc', 'ORG'), ('$2.4 million', 'MONEY')]

TEXT: Acme LLC entered into the agreement.
ENTS: []

TEXT: Acme L.L.C. entered into the agreement.
ENTS: [('L.L.C.', 'ORG')]

TEXT: International Business Machines Corp. (IBM) announced the transaction.
ENTS: [('International Business Machines Corp.', 'ORG'), ('IBM', 'ORG')]

TEXT: We refer to International Business Machines Corp. as IBM.
ENTS: [('International Business Machines Corp.', 'ORG'), ('IBM', 'ORG')]

TEXT: Apple Inc.’s subsidiaries include Example Sub, LLC.
ENTS: [('Apple Inc.’s', 'ORG'

In [13]:
# ORG_CANDIDATE pipeline builder
# Lower-cased corporate-form suffixes accepted as the last token of an org name.
ORG_SUFFIXES = [
    "inc", "inc.", "corp", "corp.", "corporation",
    "co", "co.", "company",
    "llc", "l.l.c.", "l.l.c", "ltd", "ltd.",
    "lp", "l.p.", "llp", "l.l.p.", "plc"
]

def build_org_candidate_pipeline(model="en_core_web_sm"):
    """
    Load `model` and insert an EntityRuler ahead of its "ner" component that
    tags organization names with the ORG_CANDIDATE label.

    Args:
        model: name or path handed to `spacy.load`.

    Returns:
        The loaded pipeline with the ORG_CANDIDATE patterns registered.
    """
    def name_then(closing_token):
        # TitleCase name + optional comma + the given closing token spec.
        return {"label": "ORG_CANDIDATE", "pattern": [
            {"IS_TITLE": True, "OP": "+"},
            {"TEXT": ",", "OP": "?"},
            closing_token,
        ]}

    nlp = spacy.load(model)
    ruler = nlp.add_pipe("entity_ruler", before="ner")

    # DBA phrase: "doing business as" + TitleCase name. The phrase stays in the
    # span here; it can be stripped later when producing a display view.
    dba_tokens = [{"LOWER": word} for word in ("doing", "business", "as")]
    dba_tokens.append({"IS_TITLE": True, "OP": "+"})

    patterns = [
        # Name closed by a known suffix (Inc/Corp/LLC/etc.)
        name_then({"LOWER": {"IN": ORG_SUFFIXES}}),

        # Name closed by a dotted suffix tokenized as one token, e.g., L.L.C.
        name_then({"TEXT": {"REGEX": r"^([A-Za-z]\.){2,}[A-Za-z]\.?$"}}),

        {"label": "ORG_CANDIDATE", "pattern": dba_tokens},
    ]

    ruler.add_patterns(patterns)
    return nlp

In [14]:
# Candidate selection + filters
# Lower-cased candidate texts that are generic references, not org names.
ORG_STOP_EXACT = {
    "company",
    "the company",
    "“company",
    '"company',
    "company”",
    "company'",
    'company"',
}

# These often get matched by suffix patterns but aren’t org names
ORG_STOP_CONTAINS = {
    "delaware corporation",
    "a delaware corporation",
}

def is_bad_org_candidate(text: str) -> bool:
    """
    Decide whether a candidate span text should be discarded as a non-name.

    A candidate is rejected when, case-insensitively:
      * it equals an ORG_STOP_EXACT entry, either as written or after removing
        straight/curly quote characters and collapsing whitespace (so
        '“Company”' and 'the "Company"' are caught, not only the enumerated
        half-quoted variants); or
      * it contains any ORG_STOP_CONTAINS phrase.

    Args:
        text: candidate span text.

    Returns:
        True if the candidate should be dropped, otherwise False.
    """
    t = text.strip().lower()
    if t in ORG_STOP_EXACT:
        return True

    # Quote-insensitive comparison for defined-term style references.
    quote_table = str.maketrans("", "", "\"'“”‘’")
    dequoted = " ".join(t.translate(quote_table).split())
    if dequoted in ORG_STOP_EXACT:
        return True

    return any(bad in t for bad in ORG_STOP_CONTAINS)

def longest_org_candidates(doc):
    """
    Select non-overlapping ORG_CANDIDATE entities from `doc.ents`, skipping
    any that the stop filters reject.

    Candidates are visited by ascending start token; among candidates sharing
    a start, the longest comes first. The overlap test runs before the stop
    filter, and rejected candidates are never added to the kept list.

    Returns:
        The kept entity objects, in visiting order.
    """
    ordered = [span for span in doc.ents if span.label_ == "ORG_CANDIDATE"]
    # (start, start - end): earlier start first, then longer span first.
    ordered.sort(key=lambda span: (span.start, span.start - span.end))

    selected = []
    for span in ordered:
        overlaps_selected = False
        for prior in selected:
            # Half-open token ranges overlap iff each starts before the other ends.
            if span.start < prior.end and prior.start < span.end:
                overlaps_selected = True
                break
        if overlaps_selected:
            continue
        if not is_bad_org_candidate(span.text):
            selected.append(span)
    return selected


In [15]:
# Optional ORG_VIEW normalization (keeps doc spans intact)
# Lower-cased "doing business as" lead-ins (each ends with a space) that are
# removed from the front of an org text for display.
DBA_PREFIXES = (
    "doing business as ",
    "d/b/a ",
    "dba ",
)

def normalize_org_for_view(text: str) -> str:
    """
    Clean an ORG text for inspection only; `doc.ents` is never touched.

    Surrounding whitespace is trimmed and, if the text starts
    (case-insensitively) with one of DBA_PREFIXES, the first matching prefix
    is removed.

    Args:
        text: raw span text.

    Returns:
        The trimmed text without its DBA lead-in, if it had one.
    """
    cleaned = text.strip()
    folded = cleaned.lower()

    matched = next(
        (prefix for prefix in DBA_PREFIXES if folded.startswith(prefix)),
        None,
    )
    if matched is None:
        return cleaned
    return cleaned[len(matched):].strip()

def extract_org_view(doc):
    """
    Produce the candidate-first ORG view of a document for this stage.

    Each kept ORG_CANDIDATE span is reported as (normalized_text, 'ORG') so
    the result can be compared directly against ORG-labeled annotations.
    """
    return [
        (normalize_org_for_view(span.text), "ORG")
        for span in longest_org_candidates(doc)
    ]



In [16]:
# Runner
def run_org_experiment(nlp, texts):
    """
    For each text, print the text, its kept ORG_CANDIDATE spans, the
    normalized ORG view, and all entities the pipeline produced.

    Args:
        nlp: callable that turns a string into a processed doc.
        texts: iterable of input strings.
    """
    print("=== Experiment 3: ORG_CANDIDATE (prefer longest) ===")
    for text in texts:
        parsed = nlp(text)
        print("\nTEXT:", text)
        kept_pairs = [(span.text, span.label_) for span in longest_org_candidates(parsed)]
        print("CANDIDATES:", kept_pairs)
        print("ORG_VIEW:", extract_org_view(parsed))
        all_entities = [(span.text, span.label_) for span in parsed.ents]
        print("ALL ENTS:", all_entities)

nlp_org = build_org_candidate_pipeline()
run_org_experiment(nlp_org, org_probes)

=== Experiment 3: ORG_CANDIDATE (prefer longest) ===

TEXT: Acme Corp. reported revenue of $2.4 million.
CANDIDATES: [('Acme Corp.', 'ORG_CANDIDATE')]
ORG_VIEW: [('Acme Corp.', 'ORG')]
ALL ENTS: [('Acme Corp.', 'ORG_CANDIDATE'), ('$2.4 million', 'MONEY')]

TEXT: Acme Corporation reported revenue of $2.4 million.
CANDIDATES: [('Acme Corporation', 'ORG_CANDIDATE')]
ORG_VIEW: [('Acme Corporation', 'ORG')]
ALL ENTS: [('Acme Corporation', 'ORG_CANDIDATE'), ('$2.4 million', 'MONEY')]

TEXT: Acme, Inc. reported revenue of $2.4 million.
CANDIDATES: [('Acme, Inc.', 'ORG_CANDIDATE')]
ORG_VIEW: [('Acme, Inc.', 'ORG')]
ALL ENTS: [('Acme, Inc.', 'ORG_CANDIDATE'), ('$2.4 million', 'MONEY')]

TEXT: Acme Inc reported revenue of $2.4 million.
CANDIDATES: [('Acme Inc', 'ORG_CANDIDATE')]
ORG_VIEW: [('Acme Inc', 'ORG')]
ALL ENTS: [('Acme Inc', 'ORG_CANDIDATE'), ('$2.4 million', 'MONEY')]

TEXT: Acme LLC entered into the agreement.
CANDIDATES: [('Acme LLC', 'ORG_CANDIDATE')]
ORG_VIEW: [('Acme LLC', 'ORG')]