# org_label Uniqueness & Counts

Reads a CSV (default: `samples.csv`) and reports how many unique `org_label` values there are, lists them, and shows counts per label.


In [1]:
import pandas as pd
csv_path = "./samples.csv"  # change if needed

# sep=None asks the python engine to sniff the delimiter (comma/TSV/etc.)
df = pd.read_csv(
    csv_path,
    engine="python",
    sep=None,
)
df.head()

Unnamed: 0,index,inputs,context,topic,org_label,set-0,id,string_label,label
0,0,"(('recent years', 0, 3, 15),)","('In recent years , advanced education for pro...",,DATE,train,0,0,0
1,1,"(('ten years', 0, 84, 93),)","('With this trend , suddenly the mature faces ...",,DATE,train,1,0,0
2,2,"(('EMBA', 0, 170, 174),)",('In order to attract this group of seasoned a...,,WORK_OF_ART,train,2,1,1
3,3,"(('MBA', 0, 160, 163),)",('In order to attract this group of seasoned a...,,WORK_OF_ART,train,3,1,1
4,4,"(('NT$ 1 million', 0, 66, 79),)",('In order to attract this group of seasoned a...,,MONEY,train,4,2,2


In [2]:
# Fail fast when 'org_label' is missing, then normalize its non-null values
if "org_label" not in df.columns:
    raise KeyError(f"'org_label' column not found. Columns are: {list(df.columns)}")

org = df["org_label"].dropna()
if not pd.api.types.is_numeric_dtype(org):
    # Non-numeric labels are compared as whitespace-trimmed strings
    org = org.astype(str).str.strip()
else:
    try:
        # Numeric labels: try the nullable integer dtype first
        org = org.astype("Int64")
    except Exception:
        # The cast failed, so fall back to trimmed strings
        org = org.astype(str).str.strip()

unique_vals = sorted(org.unique())
print(f"{len(unique_vals)} unique org_label values:")
for label_value in unique_vals:
    print("-", label_value)

18 unique org_label values:
- CARDINAL
- DATE
- EVENT
- FAC
- GPE
- LANGUAGE
- LAW
- LOC
- MONEY
- NORP
- ORDINAL
- ORG
- PERCENT
- PERSON
- PRODUCT
- QUANTITY
- TIME
- WORK_OF_ART


In [3]:
# Tally each label, ordered by the label itself rather than by frequency
counts = org.value_counts().sort_index()
counts_df = (
    counts
    .rename_axis('org_label')
    .reset_index(name='count')
)
counts_df

Unnamed: 0,org_label,count
0,CARDINAL,13626
1,DATE,23786
2,EVENT,1273
3,FAC,1440
4,GPE,28133
5,LANGUAGE,412
6,LAW,568
7,LOC,2691
8,MONEY,6425
9,NORP,11608


# Prepare the Question-Answer JSON File

Converts the rows loaded from the CSV (default: `samples.csv`) into question-answer prompt-style records and writes them to a JSON file.

In [4]:
import ast, json

def parse_context(val):
    """Return the sentence held in a ``context`` cell.

    The cell looks like ``('sentence',)`` serialized as text.

    Args:
        val: Raw cell value: the serialized text, an already-parsed
            list/tuple, or a missing value (``None`` or float NaN).

    Returns:
        The first element of the sequence as a string. Missing values give
        ``""``. Anything that is not a non-empty sequence (including text that
        cannot be parsed as a Python literal) is returned as ``str(val)``.
    """
    # Missing cells would otherwise leak into the output as "None" / "nan".
    # NaN is the only float that is not equal to itself.
    if val is None or (isinstance(val, float) and val != val):
        return ""

    parsed = val
    if isinstance(val, str):
        try:
            parsed = ast.literal_eval(val)
        except Exception:
            # Best effort: not a Python literal, so keep the raw text.
            return str(val)

    # Also accept a sequence that was parsed upstream, like parse_inputs does.
    if isinstance(parsed, (list, tuple)) and len(parsed) > 0:
        return str(parsed[0])
    return str(val)

def parse_inputs(val):
    """Return the unique entity texts held in an ``inputs`` cell.

    The cell looks like ``(('text', start, end, ...), (...), ...)`` serialized
    as text.

    Args:
        val: Raw cell value: the serialized text, an already-parsed
            list/tuple, or a missing value (``None`` or float NaN).

    Returns:
        A list of stripped, non-empty texts in first-seen order with
        duplicates removed. For each item of the sequence the first element is
        used when the item is itself a non-empty list/tuple; otherwise the
        item is stringified. A value that is not a sequence (including text
        that cannot be parsed) is treated as a single text. Missing values
        give ``[]``.
    """
    # A missing cell has no entities; without this guard it would produce the
    # bogus entity "None" / "nan". NaN is the only float not equal to itself.
    if val is None or (isinstance(val, float) and val != val):
        return []

    parsed = val
    if isinstance(val, str):
        try:
            parsed = ast.literal_eval(val)
        except Exception:
            # Best effort: not a Python literal, so treat it as one raw text.
            parsed = val

    if isinstance(parsed, (list, tuple)):
        texts = [
            str(item[0]) if isinstance(item, (list, tuple)) and len(item) > 0 else str(item)
            for item in parsed
        ]
    else:
        texts = [str(parsed)]

    # dict.fromkeys de-duplicates while preserving first-seen order.
    stripped = (text.strip() for text in texts)
    return list(dict.fromkeys(text for text in stripped if text))

In [5]:
# Human-friendly display names, keyed by raw NER tag
LABEL_PRETTY_MAP = dict(
    ORG="ORGANIZATION",
    NORP="NATIONALITY, RELIGIOUS, or POLITICAL GROUP",
    FAC="FACILITY",
    LOC="LOCATION",
    GPE="GEOPOLITICAL ENTITY",
)

In [6]:
# Turn a raw tag into display text (e.g., WORK_OF_ART -> WORK OF ART)
def pretty_label(label: str) -> str:
    """Return the display name for a raw label.

    The label is stringified and stripped. If its upper-cased form is a key of
    LABEL_PRETTY_MAP, the mapped name is returned; otherwise the stripped
    label is returned with underscores replaced by spaces (case unchanged).
    """
    text = str(label).strip()
    lookup = text.upper()
    if lookup in LABEL_PRETTY_MAP:
        return LABEL_PRETTY_MAP[lookup]
    return text.replace("_", " ")

In [7]:
# --- Example for row 5 with pretty label ---
row0 = df.iloc[5]  # positional index 5, despite the "0" suffix in the names
sent0 = parse_context(row0["context"])
ents0 = parse_inputs(row0["inputs"])
label0 = pretty_label(row0["org_label"])
a0 = "; ".join(ents0)
q0 = f"What are the {label0} entities?"

# One print call, one line per field
print(f'Sentence: "{sent0}"', f"Q: {q0}", f"A: {a0}", sep="\n")

Sentence: "In response , each year over 1000 mature professionals looking to recharge their minds and retool their know - how compete for a precious few openings in executive degree programs at top institutions such as National Taiwan University -LRB- NTU -RRB- and National Chengchi University ."
Q: What are the ORGANIZATION entities?
A: National Chengchi University


In [8]:
# Keep only rows whose set-0 value is 'train' (compared case-insensitively)
is_train = df["set-0"].astype(str).str.lower() == "train"
subset = df.loc[is_train].copy()

def row_to_record(row):
    """Build the QA record for one row.

    Reads the row's "context", "org_label" and "inputs" fields and returns a
    dict with the keys "Sentence", "Q" and "A". The entity texts are joined
    with "; ".
    """
    ents = parse_inputs(row["inputs"])
    label = pretty_label(row["org_label"])
    question = f"What are the {label} entities?"
    return {
        "Sentence": parse_context(row["context"]),
        "Q": question,
        # Joining an empty list already yields "", so no special case is needed
        "A": "; ".join(ents),
    }

# Build one QA record per row of the train split
records = [row_to_record(r) for _, r in subset.iterrows()]
print(f"Prepared {len(records)} QA records from train split.")

# Keep the path in one variable so the confirmation message always names the
# file that was actually written (it previously reported "ner_qa.json").
output_path = "ner_qa_NEW.json"
with open(output_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes
    json.dump(records, f, ensure_ascii=False, indent=2)

print(f"Wrote {output_path}")


Prepared 128738 QA records from train split.
Wrote ner_qa.json
