Manual annotation of a representative subset of filings was conducted prior to evaluation to establish a gold standard for entity extraction performance. This step, while not explicitly scheduled in the original plan, was enabled by early completion of baseline development.

In [1]:
import json
import random
from pathlib import Path
from pprint import pprint



In [2]:
# Regex-baseline outputs, read from a JSON file in the working directory.
# Its top-level keys are used further down as the list of candidate filings.
BASELINE_PATH = Path("baseline_regex_outputs.json")

baseline_outputs = json.loads(BASELINE_PATH.read_text(encoding="utf-8"))

print("Baseline filings:", len(baseline_outputs))


Baseline filings: 38


In [3]:
# Raw filing records; gold annotations are located in this text, not in the baseline outputs.
RAW_DATA_PATH = Path("primary_data.json")

raw_data = json.loads(RAW_DATA_PATH.read_text(encoding="utf-8"))

print("Raw filings:", len(raw_data))


Raw filings: 191


**NOTE:**
**Gold annotations are created from raw Item 10 / Item 7 text,**
**NOT from baseline regex outputs.**


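Determine the workable universe (38 filings). We only want filings that:
- exist in baseline outputs
- have internal Item 10 content

Note: the next cell only takes the keys of the baseline outputs; it does not check Item 10 content itself, so the second criterion is assumed to have been applied when the baseline outputs were produced.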

In [4]:
# Candidate filings = every key of the baseline outputs, in file order.
# The order matters: the seeded sample drawn below depends on it.
workable_files = [name for name in baseline_outputs]

print("Workable filings:", len(workable_files))


Workable filings: 38


Select and freeze the gold annotation set (n = 8)

In [5]:
GOLD_N = 8

# Seed the module-level RNG immediately before sampling so that re-running
# this cell always draws the same GOLD_N filings from the same candidate list.
random.seed(42)
gold_files = random.sample(workable_files, k=GOLD_N)

gold_files


['1066684_10K_2020_0001104659-21-042359.json',
 '1017655_10K_2020_0001654954-21-003649.json',
 '1378590_10K_2021_0001437749-21-028984.json',
 '1353499_10K_2020_0001344676-21-000004.json',
 '1327567_10K_2021_0001327567-21-000029.json',
 '1082324_10K_2020_0001140361-21-008678.json',
 '1064722_10K_2020_0001760319-21-000039.json',
 '1404655_10K_2020_0001564590-21-006083.json']

The next cell writes the selected file list to `gold_annotation_files.json`, so the frozen gold set can be reloaded later instead of being re-sampled.

In [6]:
# Persist the sampled filenames as a JSON array (2-space indent).
Path("gold_annotation_files.json").write_text(
    json.dumps(gold_files, indent=2),
    encoding="utf-8",
)

print("Saved gold annotation file list.")


Saved gold annotation file list.


In [7]:
import json, random
from pathlib import Path

RAW_DATA_PATH = Path("primary_data.json")
GOLD_LIST_PATH = Path("gold_annotation_files.json")

# Reload both inputs from disk: the raw filing records and the frozen
# list of gold filenames written by the previous cell.
raw_data = json.loads(RAW_DATA_PATH.read_text(encoding="utf-8"))
gold_files = json.loads(GOLD_LIST_PATH.read_text(encoding="utf-8"))

print("Gold files:", len(gold_files))


Gold files: 8


In [8]:
# Directory holding one "<filename>.gold.json" record per gold filing.
GOLD_DIR = Path("gold_annotations")
GOLD_DIR.mkdir(exist_ok=True)  # no-op when the directory already exists

# Entity label vocabulary; the labels annotated further down
# (MONEY, PERSON, TITLE, ORG) all come from this list.
ENTITY_LABELS = ["PERSON", "TITLE", "ORG", "MONEY"]

def init_gold_record(filename: str, record: dict) -> dict:
    """
    Build an empty gold-annotation template for one filing.

    Args:
        filename: name stored under the "filename" key.
        record: raw filing record; "company", "filing_date" and
            "period_of_report" are copied from it (None when absent).

    Returns:
        A new dict with empty notes, an empty annotation list and
        schema version "1.0".
    """
    template = {"filename": filename}
    for field in ("company", "filing_date", "period_of_report"):
        template[field] = record.get(field)
    template["notes"] = ""
    # Each annotation is a dict of {label, section, start, end, text}.
    template["annotations"] = []
    template["schema_version"] = "1.0"
    return template

# Create a template for every gold filing that does not have one yet;
# existing gold files are never touched, so earlier annotation work survives re-runs.
for fn in gold_files:
    template_path = GOLD_DIR / f"{fn}.gold.json"
    if not template_path.exists():
        template = init_gold_record(fn, raw_data[fn])
        template_path.write_text(json.dumps(template, indent=2), encoding="utf-8")

print("Gold templates ensured in:", GOLD_DIR)


Gold templates ensured in: gold_annotations


In [9]:
# Directory receiving the plain-text dumps of Item 10 / Item 7 for each gold filing.
RAW_TEXT_DIR = Path("gold_raw_text")
RAW_TEXT_DIR.mkdir(exist_ok=True)  # no-op when the directory already exists

def export_raw_text(fn: str):
    """
    Dump the Item 10 and Item 7 text of filing `fn` to RAW_TEXT_DIR.

    A missing or empty section is written as an empty file.

    Returns:
        (item_10_path, item_7_path) of the two files written.
    """
    filing = raw_data[fn]
    written = []
    for section in ("item_10", "item_7"):
        target = RAW_TEXT_DIR / f"{fn}.{section}.txt"
        target.write_text(filing.get(section, "") or "", encoding="utf-8")
        written.append(target)
    return tuple(written)

# Export the raw Item 10 / Item 7 text of every gold filing so it can be
# read alongside the notebook while annotating.
for fn in gold_files:
    export_raw_text(fn)

print("Exported raw Item 10/7 text files to:", RAW_TEXT_DIR)


Exported raw Item 10/7 text files to: gold_raw_text


In [10]:
def find_spans(text: str, needle: str):
    """
    Return the (start, end) offsets of every occurrence of `needle` in `text`.

    Matches are reported left to right and may overlap, because the search
    resumes one character after each hit. An empty `needle` yields no spans.
    """
    if not needle:
        return []

    width = len(needle)
    hits = []
    pos = text.find(needle)
    while pos != -1:
        hits.append((pos, pos + width))
        pos = text.find(needle, pos + 1)
    return hits


In [11]:
def load_gold(fn: str) -> dict:
    """Read and parse the gold record stored for `fn` under GOLD_DIR."""
    with (GOLD_DIR / f"{fn}.gold.json").open("r", encoding="utf-8") as handle:
        return json.load(handle)

def save_gold(fn: str, obj: dict):
    """Serialize `obj` as indented JSON and write it to the gold file of `fn`."""
    # Serialize first so a non-serializable object fails before the file is touched.
    serialized = json.dumps(obj, indent=2)
    (GOLD_DIR / f"{fn}.gold.json").write_text(serialized, encoding="utf-8")

def add_annotation(fn: str, section: str, label: str, exact_text: str, occurrence: int = 0):
    """
    Append one annotation for `exact_text` to the gold file of `fn`.

    The span is located by exact substring search in `raw_data[fn][section]`,
    so the stored offsets index into the raw section text.

    Args:
        fn: key of the filing in `raw_data`, also used to locate its gold file.
        section: key of the section text in the raw record (e.g. "item_10").
        label: entity label stored on the annotation.
        exact_text: literal text to locate; must match the raw text exactly,
            including whitespace and newlines.
        occurrence: zero-based index of the match to annotate when
            `exact_text` appears more than once.

    Returns:
        (start, end, n_matches): offsets of the annotated span and the total
        number of matches found in the section.

    Raises:
        ValueError: if `exact_text` does not occur in the section.
        IndexError: if `occurrence` is negative or not smaller than the
            number of matches.
    """
    rec = raw_data[fn]
    source = rec.get(section, "") or ""
    spans = find_spans(source, exact_text)
    if not spans:
        raise ValueError(f"Text not found in {section}: {exact_text!r}")

    # A negative index would silently pick a match counted from the end of
    # the list and annotate the wrong span, so reject it explicitly.
    if occurrence < 0:
        raise IndexError(f"occurrence={occurrence} must be >= 0")

    if occurrence >= len(spans):
        raise IndexError(f"occurrence={occurrence} but only {len(spans)} matches found")

    start, end = spans[occurrence]
    gold = load_gold(fn)

    gold["annotations"].append({
        "label": label,
        "section": section,
        "start": start,
        "end": end,
        "text": source[start:end]
    })

    save_gold(fn, gold)
    return start, end, len(spans)

def dedupe_annotations(fn):
    """
    Remove repeated annotations from the gold file of `fn`.

    Two annotations count as the same when label, section, start and end all
    match; the first one encountered is kept and the original order is preserved.
    """
    record = load_gold(fn)

    # dicts keep insertion order, and setdefault never replaces an existing
    # entry, so this retains the first annotation seen for each identity.
    unique = {}
    for ann in record["annotations"]:
        identity = (ann["label"], ann["section"], ann["start"], ann["end"])
        unique.setdefault(identity, ann)

    record["annotations"] = list(unique.values())
    save_gold(fn, record)


In [12]:
def show_gold(fn: str, last_n: int = 20):
    """
    Print a short summary of the gold file of `fn`.

    Prints the filename, the company name from `raw_data` ("UNKNOWN" when the
    filing or the field is missing), the total annotation count, and then the
    last `last_n` annotations, one per line.

    Args:
        fn: filing whose gold file is shown.
        last_n: number of trailing annotations to print; zero or a negative
            value prints none.
    """
    gold = load_gold(fn)
    company = raw_data.get(fn, {}).get("company", "UNKNOWN")
    annotations = gold.get("annotations", [])

    print("FILE:", fn)
    print("Company:", company)
    print("Total annotations:", len(annotations))

    # A bare annotations[-last_n:] returns the WHOLE list when last_n == 0
    # (because -0 == 0), so the non-positive case is handled explicitly.
    tail = annotations[-last_n:] if last_n > 0 else []
    for a in tail:
        print(a)


In [13]:
def init_gold(fn: str, overwrite: bool = False):
    """
    Ensure a gold file exists for `fn`, optionally clearing its annotations.

    - File exists and overwrite=False: nothing is changed.
    - File exists and overwrite=True: only "annotations" is reset to an empty
      list; every other field of the stored record is kept.
    - File missing (or unreadable as a JSON object): a fresh record is built
      with `init_gold_record`, so it has the same schema as the templates.

    Args:
        fn: filing whose gold file is initialised.
        overwrite: when True, discard existing annotations.
    """
    path = GOLD_DIR / f"{fn}.gold.json"

    if path.exists() and not overwrite:
        return  # keep existing

    obj = None
    if path.exists():
        try:
            obj = load_gold(fn)
        except ValueError:
            # json.JSONDecodeError subclasses ValueError: a corrupt file is
            # rebuilt from scratch instead of blocking the reset.
            obj = None

    if not isinstance(obj, dict):
        # Use the template builder so a fresh record matches the template
        # schema instead of a stripped-down {"file", "annotations"} record.
        obj = init_gold_record(fn, raw_data.get(fn, {}))

    obj["annotations"] = []
    save_gold(fn, obj)


LABELING: 

TO LABEL: 
- '1017655_10K_2020_0001654954-21-003649.json'
- '1378590_10K_2021_0001437749-21-028984.json'
- '1353499_10K_2020_0001344676-21-000004.json'
- '1327567_10K_2021_0001327567-21-000029.json'
- '1082324_10K_2020_0001140361-21-008678.json'
- '1064722_10K_2020_0001760319-21-000039.json'
- '1404655_10K_2020_0001564590-21-006083.json'

COMPLETED:
- 1066684_10K_2020_0001104659-21-042359.json

- 
- 
- 
- 
- 
- 
- 



In [14]:
fn = "1066684_10K_2020_0001104659-21-042359.json"
init_gold(fn, overwrite=True)   # start clean so re-running this cell never accumulates annotations


def annotate_values(fn: str, section: str, label: str, values: list) -> None:
    """
    Annotate each string in `values`, in order, with `label` in `section` of `fn`.

    A value listed k times is matched to its 1st, 2nd, ..., k-th occurrence in
    the section text, so repeating a string on purpose annotates k distinct spans.
    Errors raised by `add_annotation` (text not found, too few matches) propagate.
    """
    next_occurrence = {}  # value -> index of the next unused match
    for value in values:
        occ = next_occurrence.get(value, 0)
        add_annotation(fn, section, label, exact_text=value, occurrence=occ)
        next_occurrence[value] = occ + 1


# =========================
# MONEY annotations (Item 7)
# =========================
section = "item_7"
label = "MONEY"

money_values = [
    "$0",
    "$118,000",
    "$178,000",
    "$46,000",
    "$34,000",
    "$164,000",
    "$211,000",
    "$150,000",
    "$350,000",
    "$465,000",
    "$554,100",
    "$600,000",
    "$637,500",
    "$707,000",
    "$24,000",
    "$2,000",
    "$689,000",
    "$7,624",
    "$86,961",
    "$125,000",
    "$173,000",
    "$46,000",   # duplicate on purpose
    "$254,000",
]

annotate_values(fn, section, label, money_values)

# =========================
# PERSON annotations (Item 10)
# =========================
section = "item_10"
label = "PERSON"

# The same name listed five times -> its first five occurrences are annotated.
person_values = [
    "Frederick Jones",
    "Frederick Jones",
    "Frederick Jones",
    "Frederick Jones",
    "Frederick Jones",
]

annotate_values(fn, section, label, person_values)

# =========================
# TITLE annotations (Item 10)
# =========================
section = "item_10"
label = "TITLE"

title_values = [
    "President",
    "Chief Executive Officer",
    "Chief\nFinancial Officer",   # the raw text has a line break inside this title
    "Director",
    "Chief Executive Officer",
    "CEO"
]

annotate_values(fn, section, label, title_values)

# =========================
# ORG annotations (Item 10)
# =========================
section = "item_10"
label = "ORG"

org_values = [
   "Delfin",
   "Fairwood Peninsula Energy Corporation",
   "Delfin Midstream, Inc.",
   "Energy Global Services FZE",
   "Asiatic Gas Energy Holdings Limited",
   "Marc Rich + Co A.G., Switzerland",
   "Glencore Plc"
]

annotate_values(fn, section, label, org_values)

# =========================
# FINAL CLEANUP (IMPORTANT)
# =========================
# Drops any annotation whose (label, section, start, end) repeats an earlier one.
dedupe_annotations(fn)

In [15]:
# Print the summary and the trailing annotations (last 20 by default) just written for `fn`.
show_gold(fn)

FILE: 1066684_10K_2020_0001104659-21-042359.json
Company: THEGLOBE COM INC
Total annotations: 41
{'label': 'MONEY', 'section': 'item_7', 'start': 5744, 'end': 5751, 'text': '$46,000'}
{'label': 'MONEY', 'section': 'item_7', 'start': 5801, 'end': 5809, 'text': '$254,000'}
{'label': 'PERSON', 'section': 'item_10', 'start': 256, 'end': 271, 'text': 'Frederick Jones'}
{'label': 'PERSON', 'section': 'item_10', 'start': 1037, 'end': 1052, 'text': 'Frederick Jones'}
{'label': 'PERSON', 'section': 'item_10', 'start': 3552, 'end': 3567, 'text': 'Frederick Jones'}
{'label': 'PERSON', 'section': 'item_10', 'start': 7976, 'end': 7991, 'text': 'Frederick Jones'}
{'label': 'PERSON', 'section': 'item_10', 'start': 8278, 'end': 8293, 'text': 'Frederick Jones'}
{'label': 'TITLE', 'section': 'item_10', 'start': 272, 'end': 281, 'text': 'President'}
{'label': 'TITLE', 'section': 'item_10', 'start': 283, 'end': 306, 'text': 'Chief Executive Officer'}
{'label': 'TITLE', 'section': 'item_10', 'start': 308, 