In [1]:
import kagglehub
import os
import json
import random
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download latest version
path = kagglehub.dataset_download("pranjalverma08/sec-edgar-annual-financial-filings-2021")

extract_path = os.path.join(path, "extracted")
output_file = "primary_data.json"

In [3]:
#check for data
sample_json = os.path.join(extract_path, "1001601_10K_2020_0001493152-21-008913.json")

with open(sample_json, "r", encoding="utf-8") as f:
    data = json.load(f)

print(type(data))
print(data.keys())  

<class 'dict'>
dict_keys(['cik', 'company', 'filing_type', 'filing_date', 'period_of_report', 'sic', 'state_of_inc', 'state_location', 'fiscal_year_end', 'filing_html_index', 'htm_filing_link', 'complete_text_filing_link', 'filename', 'item_1', 'item_1A', 'item_1B', 'item_2', 'item_3', 'item_4', 'item_5', 'item_6', 'item_7', 'item_7A', 'item_8', 'item_9', 'item_9A', 'item_9B', 'item_10', 'item_11', 'item_12', 'item_13', 'item_14', 'item_15'])


In [4]:
# Fields we want from each filing
FIELDS_TO_KEEP = [
    "cik",
    "company",
    "filing_date",
    "period_of_report",
    "item_1",
    "item_7",
    "item_10"
]

primary_data = {}

In [5]:
# Loop through all JSON files
for filename in os.listdir(extract_path):
    if not filename.endswith(".json"):
        continue
    
    file_path = os.path.join(extract_path, filename)
    
    with open(file_path, "r", encoding="utf-8") as f:
        filing_json = json.load(f)

    # Build a cleaned record
    cleaned_record = {}

    for field in FIELDS_TO_KEEP:
        cleaned_record[field] = filing_json.get(field, None)

    primary_data[filename] = cleaned_record

# Save consolidated JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(primary_data, f, indent=2)

print(f"primary_data.json created with {len(primary_data)} records.")

primary_data.json created with 191 records.


Compute simple text length statistics. This helps choose samples for annotation and confirm variability.

In [6]:
def text_len(t):
    return len(t) if t else 0

item7_lengths = [text_len(r.get("item_7")) for r in primary_data.values()]
item10_lengths = [text_len(r.get("item_10")) for r in primary_data.values()]

print("Item 7 - avg length:", sum(item7_lengths)/len(item7_lengths))
print("Item 7 - max length:", max(item7_lengths))
print("Item 7 - min length:", min(item7_lengths))

print("\nItem 10 - avg length:", sum(item10_lengths)/len(item10_lengths))
print("Item 10 - max length:", max(item10_lengths))
print("Item 10 - min length:", min(item10_lengths))


Item 7 - avg length: 60578.02094240838
Item 7 - max length: 203027
Item 7 - min length: 1650

Item 10 - avg length: 3365.6282722513088
Item 10 - max length: 33767
Item 10 - min length: 140


Show 3 random samples from primary_data

In [7]:
sample_keys = random.sample(list(primary_data.keys()), 3)

for key in sample_keys:
    record = primary_data[key]
    print("\n===== FILE:", key, "=====")
    print("Company:", record.get("company"))
    print("Period:", record.get("period_of_report"))
    print("\nItem 7 preview:\n", (record.get("item_7") or "")[:400])
    print("\nItem 10 preview:\n", (record.get("item_10") or "")[:400])


===== FILE: 1022505_10K_2021_0001493152-21-032215.json =====
Company: CIPHERLOC Corp
Period: 2021-09-30

Item 7 preview:
 ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS
The following discussion and analysis should be read in conjunction with our financial statements, including the notes thereto, appearing elsewhere in this Annual Report on Form 10-K.
Our Business
We are developing products and services around our patented polymorphic encryption technology, which is desig

Item 10 preview:
 ITEM 10. DIRECTORS, EXECUTIVE OFFICERS, AND CORPORATE GOVERNANCE [incorporate by reference to July 2021 proxy statement?]
Directors and Executive Officers
Set forth below is information regarding the Company’s current directors and executive officers. There are no family relationships between any of our directors or executive officers. The directors are elected annually by our stockholders. The ex

===== FILE: 807863_10K_2021_0000807863-21-000098.json =

**Step 1 — Identify the Need for Filtering**

During dataset exploration, it was observed that many companies do not include executive biographies or officer information directly in Item 10 (Directors, Executive Officers, and Corporate Governance) of their Form 10-K. Instead, large companies frequently incorporate this information by reference to their annual Proxy Statement, which is not included in the Kaggle dataset.<br>

As a result: <Br>
Some Item 10 sections contain detailed biographical data suitable for PERSON, TITLE, and ORG extraction.<br>

Others contain only statements such as: <br>
“The information required by this item is incorporated by reference to our Proxy Statement…”<br>

These proxy-style Item 10 records needed to be excluded from training data for the NER model to avoid contaminating annotations with unusable or irrelevant text.

**Step 2 — Define Proxy Reference Patterns**

A list of key phrases commonly used in SEC filings to signal Proxy Statement references was compiled. These included phrases such as:

1. “incorporated by reference”
2. “will be included in our Proxy Statement”
3. “information required by this item is contained in…”
4. “to be filed with the SEC within 120 days”

These phrases were normalized to lowercase to ensure consistent matching.

**Step 3 — Implement the Filtering Function**

A helper function was created to classify each Item 10 section as either:

1. Proxy-style Item 10 (contains no usable executive information), or
2. Internal Item 10 (contains officer names, titles, and relevant organizational references).

The *is_proxy_style* function was applied to every record in primary_data.json.

**Step 4 — Split the Dataset into Two Subsets**

Each filing was assigned to one of two categorized collections:

1. good_records → Filings whose Item 10 contains internal executive content
2. proxy_item10_records → Filings whose Item 10 solely references an external Proxy Statement

In [8]:
with open("primary_data.json", "r", encoding="utf-8") as f:
    primary_data = json.load(f)

PROXY_PHRASES = [
    "incorporated by reference",
    "incorporated herein by reference",
    "will be included in our definitive proxy statement",
    "will be included in our proxy statement",
    "to be included in our definitive proxy statement",
    "to be included in our proxy statement",
    "is included in our proxy statement",
    "will be filed with the sec within 120 days",
    "to be filed with the sec within 120 days",
    "information required by this item is contained in",
    "information required by this item will be included in",
    "will be set forth under the captions",
    "will be set forth under the caption",
    "will be contained in our proxy statement",
    "will be contained in the proxy statement",
    "will be included under the captions",
    "will be included under the caption",
    "is set forth under the captions",
    "is set forth under the caption",
    "may be found",
    "the information required by this item will be set forth"
]

def is_proxy_style_item10(text: str) -> bool:
    if not text:
        return False
    lt = text.lower()
    return any(phrase in lt for phrase in PROXY_PHRASES)

good_records = {}
proxy_item10_records = {}

for filename, record in primary_data.items():
    item10_text = record.get("item_10", "") or ""
    if is_proxy_style_item10(item10_text):
        proxy_item10_records[filename] = record
    else:
        good_records[filename] = record

print("Total filings:", len(primary_data))
print("Proxy-style Item 10 filings:", len(proxy_item10_records))
print("Filings with internal Item 10 content:", len(good_records))


Total filings: 191
Proxy-style Item 10 filings: 153
Filings with internal Item 10 content: 38


**Step 5 — Summary of Results**

After filtering: 

1. Total filings: 191
2. Filings with proxy-style Item 10: 146
3. Filings with usable internal Item 10 content: 45

Manual inspection of excluded examples confirmed that proxy-style classifications were accurate and contained no extractable PERSON or TITLE data. <br>

Despite the reduction, the remaining 45 filings contain more than enough text to support:

1. Named Entity Recognition (NER) model training
2. Manual annotation
3. Comparative evaluation with the regex baseline

This approach ensures cleaner, more reliable training data and avoids misleading patterns originating from Proxy Statement placeholders.

**Step 6 — Preserve the Clean Subset**

The filtered subset of internal Item 10 filings was saved to its own JSON file for future annotation and modeling. This allows the project to cleanly separate:
1. Full dataset (all 191 filings, used for Item 7 / MONEY extraction)
2. Filtered dataset (45 filings with actual executive information, used for PERSON, TITLE, and ORG extraction)

In [9]:
with open("primary_data_item10_internal.json", "w", encoding="utf-8") as f:
    json.dump(good_records, f, indent=2)

print("Saved", len(good_records), "records with internal Item 10 content.")

Saved 38 records with internal Item 10 content.


In [10]:
with open("primary_data_item10_internal.json", "r", encoding="utf-8") as f:
    item10_data = json.load(f)

with open("primary_data.json", "r", encoding="utf-8") as f:
    all_data = json.load(f)

print("Item 10 internal filings:", len(item10_data))
print("All filings (Item 7 source):", len(all_data))

Item 10 internal filings: 38
All filings (Item 7 source): 191


In [11]:
def normalize_entity(text: str) -> str:
    if not text:
        return ""
    t = text.strip()
    # collapse whitespace
    t = re.sub(r"\$\s+", "$", t)  # remove space after $
    # strip trailing punctuation
    t = t.rstrip(" ,.;:()[]")
    return t

Define Regex Patterns for Entities

PERSON_RE = re.compile(
    r"\b(?:Mr\.|Ms\.|Mrs\.|Dr\.)?\s*"
    r"([A-Z][a-z]+(?:\s+[A-Z]\.)?(?:\s+[A-Z][a-z]+)+)\b"
)

Baseline Person extractor. Sample Item 10 file: 1001601_10K_2020_0001493152-21-008913.json
PERSON: ['Executive Officers', 'Corporate Governance Name Age Position Robert', 'Ladd President', 'Chief Executive Officer', 'Chief Financial Officer', 'Director Michael Onghai Chairman', 'Audit Committee', 'Compensation Committee', 'Corporate Governance Committee Member', 'Independent Director Directors']

Extraction was too broad. Needs refinement.

PERSON_RE = re.compile(
    r"\b(?:Mr\.|Ms\.|Mrs\.|Dr\.)?\s*"
    r"([A-Z][a-z]+(?:\s+[A-Z]\.)?(?:\s+[A-Z][a-z]+){1,2})\b"
)

#known false positives we can code in to avoid
PERSON_STOP_PHRASES = {
    "Executive Officers", "Corporate Governance", "Audit Committee",
    "Compensation Committee", "Independent Director", "Directors"
}
This code still had a fair number of accidental positives (structured headers, narrative phrases)

MONEY_RE = re.compile(
    r"(?<!\w)(\$?\s?\d{1,3}(?:,\d{3})*(?:\.\d+)?\s?(?:million|billion|thousand)?|\$?\s?\d+(?:\.\d+)?\s?(?:million|billion|thousand)?)",
    re.IGNORECASE
)

REVENUE_CONTEXT_RE = re.compile(
    r"\b(revenue|net sales|sales|total revenue|net revenue|operating revenue|net income|net loss)\b",
    re.IGNORECASE
)

Too broad, extracted years, did not require $ so lots of trash came thru.

In [12]:
PERSON_RE = re.compile(
    r"\b(?:Mr\.|Ms\.|Mrs\.|Dr\.)?\s*"
    r"([A-Z][a-z]+(?:\s+[A-Z]\.)?(?:\s+[A-Z][a-z]+){1,2})\b"
)

HONORIFIC_RE = re.compile(r"\b(Mr\.|Ms\.|Mrs\.|Dr\.)\b")

TITLE_KEYWORDS = [
    "Chief", "President", "Director", "Chairman", "CEO", "CFO", "COO",
    "Officer", "Secretary", "Treasurer", "General Counsel"
]



In [13]:
#EXCLUSION ITEMS FOR PERSON
ORG_LIKE_TOKENS = {
    "Inc","Inc.","Corp","Corp.","Corporation","LLC","L.L.C.","LP","L.P.","Ltd","Ltd.","Group",
    "Partners","Partner","Advisors","Advisor","Management","Systems","Investments","Capital",
    "Committee","Board","Company"
}

PLACE_LIKE_TOKENS = {"Street","St","St.","Avenue","Ave","Ave.","Road","Rd","Rd.","Boulevard","Blvd","Blvd."}

CERT_PHRASES = {
    "Chartered Financial Analyst",
    "Certified Public Accountant",
    "Master of Business Administration"
}

NON_NAME_TOKENS = {
    "Name","Age","Position","Item","Part","Form","Annual","Report","Directors","Director",
    "Executive","Officers","Corporate","Governance","Committee","Independent",
    "Chief","President","Interim","Managing","Member","Chairman","Officer",
    "On","In","At","As","Of","For","To","From","By","With","And","The",
    "September","October","November","December","January","February","March","April","May","June","July","August"
}

TITLE_KEYWORDS = ["Chief", "President", "Director", "Chairman", "CEO", "CFO", "COO", "Officer", "Secretary", "Treasurer"]

EXTRA_NON_PERSON_TOKENS = {
    "University","College","Institute","School","Commission","Exchange","Securities",
    "Stock","Fiscal","Year","Registrant","Family","Relationship","None",
    "America","Europe","Asia","Africa","East","Middle","United","States","Emirates",
    "City","York","Jersey","Hong","Kong","Pakistan","Council","Bank","Banking",
    "Award","Service","Administration","Science","Economics","Development","Strategy",
    "Technology","Communications","Assurance","Manager","Counsel","Legal","Affairs",
    "Committee","Committees","Oversight","Articles","Expires","Properties"
}

NON_NAME_TOKENS |= EXTRA_NON_PERSON_TOKENS

ROLE_WORDS = {
    "Secretary", "Chairperson", "Lead", "Founder", "Appointed",
    "Interim", "Non-Executive", "Executive", "Treasurer"
}

NON_NAME_TOKENS |= ROLE_WORDS

FINAL_BLOCKLIST = {
    "Business Conduct",
    "Business Experience",
    "New Business",
    "Strategic Planning",
    "Additional Information",
    "Delinquent Section",
    "Insider Trading Policy",
    "Independence Standards",
    "Stockholder Meetings Our",
    "Beneficial Ownership Reporting",
    "Compliance Section",
    "Revolutionize Modern Business",
    "Global Connectivity Business",
}


In [14]:
TITLE_TERMS = [
    "Chief Executive Officer", "CEO",
    "Chief Financial Officer", "CFO",
    "Chief Operating Officer", "COO",
    "President", "Vice President", "Executive Vice President", "Senior Vice President",
    "Chairman", "Chair", "Director", "Lead Director",
    "General Counsel", "Secretary", "Treasurer"
]

# Build a safe regex from the list (longer phrases first)
TITLE_TERMS_SORTED = sorted(TITLE_TERMS, key=len, reverse=True)
TITLE_RE = re.compile(r"\b(" + "|".join(re.escape(t) for t in TITLE_TERMS_SORTED) + r")\b")


In [15]:
ORG_SUFFIXES = [
    "Inc", "Inc.", "Corp", "Corp.", "Corporation",
    "LLC", "L.L.C.", "LP", "L.P.", "Ltd", "Ltd.",
    "PLC", "P.L.C.", "N.V.", "S.A.", "Pte.", "Pte. Ltd.", "Limited"
]

ORG_RE = re.compile(
    r"\b([A-Z][A-Za-z0-9&.\-]*(?:\s+[A-Z][A-Za-z0-9&.\-]*){0,6}\s+"
    r"(?:"
    + "|".join(re.escape(s) for s in ORG_SUFFIXES) +
    r"))\b"
)


In [16]:
MONEY_RE = re.compile(
    r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d+)?"
    r"(?:\s?(?:million|billion|thousand))?",
    re.IGNORECASE
)

REVENUE_CONTEXT_RE = re.compile(
    r"\b(revenue|net sales|total revenue|net revenue|operating revenue|net income|net loss)\b",
    re.IGNORECASE
)

DEFINE EXTRACTOR FUNCTIONS

In [17]:
def looks_like_person(name: str) -> bool:
    if name in FINAL_BLOCKLIST:
        return False

    parts = name.split()
    if len(parts) < 2 or len(parts) > 3:
        return False

    # existing checks...
    for p in parts:
        p_clean = p.replace(".", "")
        if len(p_clean) == 1 and p.endswith("."):
            continue
        if p_clean in NON_NAME_TOKENS:
            return False
        if p_clean in ORG_LIKE_TOKENS:
            return False
        if p_clean in PLACE_LIKE_TOKENS:
            return False
        if len(p_clean) < 2:
            return False

    return True



In [18]:

def extract_person(text: str, window=80):
    t = text or ""
    out = []

    for m in PERSON_RE.finditer(t):
        cand = normalize_entity(m.group(1))
        if not looks_like_person(cand):
            continue

        start, end = m.span()
        left = t[max(0, start-window): start]
        right = t[end: min(len(t), end+window)]
        ctx = left + " " + right

        # Accept if honorific appears immediately before the name OR nearby in context
        has_honorific = bool(HONORIFIC_RE.search(left[-15:])) or bool(HONORIFIC_RE.search(ctx))

        # Accept if a title keyword is nearby (but avoid generic "Business X" phrases)
        has_title = any(k in ctx for k in TITLE_KEYWORDS)
        bad_business_phrase = "Business " in ctx  # kills Business Strategy/Development/Experience cases

        if not (has_honorific or (has_title and not bad_business_phrase)):
            continue

        out.append(cand)

    return list(dict.fromkeys(out))


In [19]:

def extract_money_revenue_context(text: str, window=120):
    t = text or ""
    out = []
    for m in MONEY_RE.finditer(t):
        amt = normalize_entity(m.group(0))
        start, end = m.span()
        context = t[max(0, start-window): min(len(t), end+window)]
        if REVENUE_CONTEXT_RE.search(context):
            out.append(amt)
    return list(dict.fromkeys(out))

In [20]:
#def extract_person(text: str):
    #out = []
    #for m in PERSON_RE.finditer(text or ""):
        #out.append(normalize_entity(m.group(1)))
    #return list(dict.fromkeys(out))

def extract_titles(text: str):
    out = []
    for m in TITLE_RE.finditer(text or ""):
        out.append(normalize_entity(m.group(1)))
    return list(dict.fromkeys(out))

def extract_orgs(text: str):
    out = []
    for m in ORG_RE.finditer(text or ""):
        out.append(normalize_entity(m.group(1)))
    return list(dict.fromkeys(out))

#def extract_money_revenue_context(text: str, window=120):
    #"""
    #Extract MONEY only when a revenue-ish keyword appears within +/- window chars.
    #"""
    #t = text or ""
    #out = []
    #for m in MONEY_RE.finditer(t):
        #amt = normalize_entity(m.group(0))
        #start, end = m.span()
        #context = t[max(0, start-window): min(len(t), end+window)]
        #if REVENUE_CONTEXT_RE.search(context):
            #out.append(amt)
    #return list(dict.fromkeys(out))


In [21]:
problem_records = []

for filename, record in item10_data.items():
    text = record.get("item_10", "")
    persons = extract_person(text)

    # flag suspicious PERSON outputs
    suspicious = [
        p for p in persons
        if any(
            bad in p
            for bad in ["Conduct", "Committee", "Governance", "Business", "Capital", "Street"]
        )
    ]

    if suspicious:
        problem_records.append((filename, persons, suspicious))

print("Records with suspicious PERSON extractions:", len(problem_records))


Records with suspicious PERSON extractions: 1


In [22]:
from itertools import islice

for filename, persons, suspicious in islice(problem_records, 5):
    print("\nFILE:", filename)
    print("Extracted PERSON:", persons)
    print("Suspicious:", suspicious)



FILE: 1378590_10K_2021_0001437749-21-028984.json
Extracted PERSON: ['Mark G. Downey', 'Biographies\nJoni Kahn', 'Global Services', 'Big Machines', 'Enterprise Security Software', 'Business Objects', 'Pitney Bowes', 'Kenneth Galaznik', 'Spectro Analytical Instruments', 'Scott Landers', 'Monotype Imaging Holdings', 'Global Finance', 'Pitney Bowes Software', 'Michael Taglich', 'Taglich Brothers', 'Roger Kahn', 'Bridgeline Digital', 'Artificial Intelligence', 'Thomas Windhausen', 'Joni Kahn', 'Compliance\nSection']
Suspicious: ['Business Objects']


In [23]:
# Load datasets
with open("primary_data_item10_internal.json", "r", encoding="utf-8") as f:
    item10_data = json.load(f)

with open("primary_data.json", "r", encoding="utf-8") as f:
    all_data = json.load(f)

baseline_outputs = {}

# 1) PERSON / TITLE / ORG from Item 10 internal set (45-ish filings)
for filename, record in item10_data.items():
    t10 = record.get("item_10", "") or ""

    baseline_outputs[filename] = {
        "company": record.get("company"),
        "filing_date": record.get("filing_date"),
        "period_of_report": record.get("period_of_report"),
        "source_section": "item_10",
        "PERSON": extract_person(t10),
        "TITLE": extract_titles(t10),
        "ORG": extract_orgs(t10),
        "MONEY": []  # filled in below from Item 7 (if available)
    }

# 2) MONEY from Item 7 full set (191 filings)
#    We'll attach MONEY to any filename that already exists in baseline_outputs
#    (i.e., only for those internal item_10 filings), so you have one unified output file.
for filename, record in all_data.items():
    if filename not in baseline_outputs:
        continue

    t7 = record.get("item_7", "") or ""
    baseline_outputs[filename]["source_section_money"] = "item_7"
    baseline_outputs[filename]["MONEY"] = extract_money_revenue_context(t7)

print("Baseline outputs prepared for filings:", len(baseline_outputs))

Baseline outputs prepared for filings: 38


In [24]:
out_path = "baseline_regex_outputs.json"

with open(out_path, "w", encoding="utf-8") as f:
    json.dump(baseline_outputs, f, indent=2)

print("Saved baseline output JSON to:", out_path)


Saved baseline output JSON to: baseline_regex_outputs.json


In [25]:
# print 3 examples
sample_files = list(baseline_outputs.keys())[:3]
for k in sample_files:
    r = baseline_outputs[k]
    print("\n" + "="*90)
    print("FILE:", k, "| Company:", r.get("company"))
    print("PERSON:", r["PERSON"][:10])
    print("TITLE:", r["TITLE"][:10])
    print("ORG:", r["ORG"][:10])
    print("MONEY:", r["MONEY"][:10])



FILE: 1001601_10K_2020_0001493152-21-008913.json | Company: MGT CAPITAL INVESTMENTS, INC.
PERSON: ['Robert B. Ladd', 'Michael Onghai']
TITLE: ['President', 'Chief Executive Officer', 'Chief Financial Officer', 'Director', 'Chairman', 'CEO', 'CFO', 'Secretary']
ORG: ['Laddcap Value Partners LP', 'Snowy August Management LLC']
MONEY: ['$990', '$1,440', '$450', '$1,434', '$4,857', '$1,218', '$1,728', '$510', '$276', '$2,584']

FILE: 1017655_10K_2020_0001654954-21-003649.json | Company: PAID INC
PERSON: ['Austin Lewis', 'David Scott', 'Andrew Pilaro', 'Allan Pratt', 'Under Delaware', 'David Ogden', 'Laurie Bradley']
TITLE: ['CEO', 'CFO', 'COO', 'Director', 'President', 'Chairman', 'Chief Executive Officer']
ORG: ['CAP Properties Limited', 'Lewis Asset Management Corporation']
MONEY: ['$3,541', '$19,395', '$15,854', '$282,011', '$0.06']

FILE: 1019034_10K_2020_0001437749-21-007486.json | Company: BIO KEY INTERNATIONAL INC
PERSON: ['Thomas E. Bush', 'James Sullivan\nVice', 'International Bi

Code to produce 'baseline_regex_outputs.json' represents deliverable 1.1.1 completed on December 12, 2025
'baseline_regex_outputs.json' represents deliverable 1.1.2 completed on December 12, 2025


***DEFINE ENTITIES***


### PERSON

**Definition:**  
A PERSON entity is the proper name of a specific, individual human being referenced in the filing.

**What to tag:**
1. Full personal names:
   - Robert B. Ladd
   - Randy Samuel May
   - John Cahill
2. Shortened references when clearly referring to a named individual:
   - Ladd
   - Smith (only when the full name appears elsewhere)

**Span rules:**
1. Label the person’s name only; honorifics are excluded from the span.
   - Mr. Robert B. Ladd → PERSON: Robert B. Ladd
2. Do not include trailing punctuation.
   - Robert B. Ladd, → span ends before the comma

**What NOT to tag:**
1. Groups or roles without a specific named individual:
   - directors, officers, stockholders
2. Titles or positions without names:
   - Chief Executive Officer
3. Organizations, companies, committees, or boards:
   - Audit Committee, Board of Directors
4. Locations, regions, or geopolitical entities:
   - United States, Hong Kong
5. Degrees, certifications, or credentials:
   - Chartered Financial Analyst
6. Generic business phrases or section headers:
   - Business Conduct, Strategic Planning

***2️⃣ TITLE***

**Definition: A TITLE is a formal business or governance role associated with an individual, describing their position within a company or organizational unit.**

*What to tag:*

1. Executive and officer titles:
    - Chief Executive Officer
    - Chief Financial Officer
    - President
    - Executive Chairman
    - Chief Operating Officer

2. Board and governance roles:
    - Director
    - Chairman of the Board
    - Lead Director

3. Committee roles (still tied to a person):
    - Chairman of the Audit Committee
    - Member of the Compensation Committee
    - Span rules:

*Tag the role phrase only, without the name:*

EXAMPLE: Robert B. Ladd, President, Chief Executive Officer and Director<Br>

1. PERSON: Robert B. Ladd
2. TITLE: President
3. TITLE: Chief Executive Officer
4. TITLE: Director

*For composite titles:*

1. If clearly separable by commas/“and”, treat each title as its own span.
    - President, Chief Executive Officer and Director → three TITLE spans.

2. If it’s one self-contained phrase, tag once:
    - Chairman of the Audit Committee → one TITLE span.

*What NOT to tag:*

1. Generic, non-role terms:
    - management, staff, employees

2. Pure committee names with no role word (we’ll treat those as ORG, see below):
    - Audit Committee (ORG, not TITLE)

***3️⃣ ORG**

**Definition: An ORG is any named company, legal entity, or formal organizational body.**

*What to tag:* <br>

1. Companies and legal entities:
    - MGT Capital Investments, Inc.
    - Boxlight Corporation / Boxlight Corp
    - Digital Turbine, Inc.
    - Bitmaintech Pte. Ltd.
    - Zest Labs, Inc.

2. Subsidiaries, funds, and partnerships:
    - Laddcap Value Advisors, LLC
    - Laddcap Value Partners LP

3. Government and regulatory organizations:
    - U.S. Securities and Exchange Commission
    - Internal Revenue Service

4. Named external organizations (customers, vendors, etc.):
    - Neuberger Berman Group
    - InFocus Systems, Inc.

*Span rules:*<br>

1. Include the full official name, including suffixes:
    - MGT Capital Investments, Inc. → one ORG span
    - Laddcap Value Partners LP → one ORG span

*Gray area – “the Company”:*<br>

1. Do NOT label the Company or the Board as ORG, even when capitalized.
2. Only label explicit names (Boxlight Corp, Internal Revenue Service, etc.).

***4️⃣ MONEY***

**Definition: MONEY is any explicit monetary amount (numeric + currency) that represents a revenue-related financial metric (revenue, net sales, income, etc.), not just any dollar amount.**

*What to tag as MONEY:*<br>

1. Amounts like:
    - $10.5 million
    - $3,250,000
    - $1.2 billion
    - US$75,000
    - 5.2 million dollars

2. Amounts clearly tied to metrics such as:
    - revenue / total revenue / net revenue
    - net sales
    - income from operations
    - net income / net loss 
    - earnings (when clearly a financial metric, not e.g. “earnings per share discussion” alone)

*Span rule:*<br>

1. Tag only the amount and currency, not the surrounding word:

EXAMPLE: We generated revenue of $5.2 million in 2020. <Br>
    - MONEY span: $5.2 million<Br>

EXAMPLE: Net sales were approximately $145.6 million for the year ended December 31, 2020.<Br>
    - MONEY span: $145.6 million<br>

2. Ranges:
    - between $5 million and $7 million → treat $5 million and $7 million as two MONEY spans
    - $5–$7 million → one MONEY span

*What NOT to tag as MONEY:*<br>

1. Non-revenue dollar values such as:
    - Executive compensation amounts (salaries, bonuses)
    - Legal damages / settlement amounts
    - Debt balances, cash balances 

2. Percentages:
    - 10%, 25.4% → not MONEY

3. Counts:
    - 10,000 shares, 2,000 employees → not MONEY