In [1]:
import transformers
print(transformers.__version__)


4.55.4


In [2]:
import pandas as pd
import os

# Adjust this if your filename differs
csv_path = "./pii_dataset.csv"
print("Using:", csv_path)

df = pd.read_csv(csv_path)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# Peek at a few rows
print(df.head(2).to_dict(orient="records"))


  from pandas.core import (


Using: ./pii_dataset.csv


FileNotFoundError: [Errno 2] No such file or directory: './pii_dataset.csv'

In [None]:
import pandas as pd
import ast

# If you haven't already loaded df:
# df = pd.read_csv("<your path>.csv")

# 1) Find likely columns for tokens and labels
tok_col_candidates = [c for c in df.columns if c.lower() in ["tokens","words","tokens_list","tokens_str"]]
lab_col_candidates = [c for c in df.columns if c.lower() in ["labels","ner_tags","tags"]]

print("Token column candidates:", tok_col_candidates)
print("Label column candidates:", lab_col_candidates)

TOK_COL = tok_col_candidates[0]   # adjust if needed
LAB_COL = lab_col_candidates[0]

# 2) Ensure labels are a Python list (some CSVs save them as strings)
def to_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except Exception:
            return x.split()  # last resort: space-separated
    return list(x)

df["TOKENS"] = df[TOK_COL].apply(to_list)
df["BIO"]    = df[LAB_COL].apply(to_list)

# 3) Check alignment and peek
i = 0  # try a few different rows if needed
print("Row", i, "token count vs label count:",
      len(df.loc[i, "TOKENS"]), "vs", len(df.loc[i, "BIO"]))

print("First 10 tokens:", df.loc[i, "TOKENS"][:10])
print("First 10 BIO tags:", df.loc[i, "BIO"][:10])

# 4) How many rows mismatch?
mismatch = (df["TOKENS"].str.len() != df["BIO"].str.len()).sum()
print("Rows with length mismatch:", mismatch, "out of", len(df))


Token column candidates: ['tokens']
Label column candidates: ['labels']
Row 0 token count vs label count: 363 vs 363
First 10 tokens: ['My', 'name', 'is', 'Aaliyah', 'Popova,', 'and', 'I', 'am', 'a', 'jeweler']
First 10 BIO tags: ['O', 'O', 'O', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'O', 'O', 'O', 'O', 'O']
Rows with length mismatch: 0 out of 4434


In [None]:
from collections import Counter

# 1) Raw -> canonical mapping (now includes USERNAME, URL_PERSONAL)
RAW2CANON = {
    # Names
    "NAME_STUDENT": "NAME", "NAME": "NAME", "PERSON": "NAME",
    # Emails
    "EMAIL": "EMAIL", "EMAIL_ADDRESS": "EMAIL",
    # Phones
    "PHONE_NUM": "PHONE", "PHONE": "PHONE", "PHONE_NUMBER": "PHONE",
    # Addresses
    "STREET_ADDRESS": "ADDRESS", "ADDRESS": "ADDRESS",
    # Extras you asked to keep
    "USERNAME": "USERNAME",
    "URL_PERSONAL": "URL_PERSONAL",
}

TARGET_ENTS = {"NAME","EMAIL","PHONE","ADDRESS","USERNAME","URL_PERSONAL"}

def map_bio_tag(tag: str) -> str:
    if tag == "O":
        return "O"
    if "-" not in tag:  # unexpected form
        return "O"
    prefix, raw = tag.split("-", 1)  # e.g., 'B', 'NAME_STUDENT'
    canon = RAW2CANON.get(raw)
    if canon in TARGET_ENTS:
        return f"{prefix}-{canon}"
    return "O"

# 2) Apply mapping
df["BIO6"] = df["BIO"].apply(lambda seq: [map_bio_tag(t) for t in seq])

# 3) Sanity checks
mismatch_after = (df["TOKENS"].str.len() != df["BIO6"].str.len()).sum()
print("Mismatches after mapping:", mismatch_after)

bio6_types = Counter()
for tags in df["BIO6"]:
    bio6_types.update(tags)
print("BIO6 label distribution (top 15):", bio6_types.most_common(15))

# 4) Quick peek
i = 0
print(list(zip(df.loc[i,"TOKENS"][:20], df.loc[i,"BIO"][:20], df.loc[i,"BIO6"][:20])))


Mismatches after mapping: 0
BIO6 label distribution (top 15): [('O', 1333514), ('B-NAME', 11104), ('I-ADDRESS', 8577), ('I-NAME', 5667), ('B-EMAIL', 3794), ('B-ADDRESS', 3543), ('I-PHONE', 3389), ('B-PHONE', 2419), ('B-USERNAME', 718), ('B-URL_PERSONAL', 620)]
[('My', 'O', 'O'), ('name', 'O', 'O'), ('is', 'O', 'O'), ('Aaliyah', 'B-NAME_STUDENT', 'B-NAME'), ('Popova,', 'I-NAME_STUDENT', 'I-NAME'), ('and', 'O', 'O'), ('I', 'O', 'O'), ('am', 'O', 'O'), ('a', 'O', 'O'), ('jeweler', 'O', 'O'), ('with', 'O', 'O'), ('13', 'O', 'O'), ('years', 'O', 'O'), ('of', 'O', 'O'), ('experience.', 'O', 'O'), ('I', 'O', 'O'), ('remember', 'O', 'O'), ('a', 'O', 'O'), ('very', 'O', 'O'), ('unique', 'O', 'O')]


In [None]:
from collections import Counter

ENTITIES = ["NAME","EMAIL","PHONE","ADDRESS","USERNAME","URL_PERSONAL"]
BIO_LABELS = ["O"] + [f"{p}-{e}" for e in ENTITIES for p in ["B","I"]]
label2id = {l:i for i,l in enumerate(BIO_LABELS)}
id2label = {i:l for l,i in label2id.items()}

# Verify: any tags outside this set?
seen = Counter(t for row in df["BIO6"] for t in row)
unknown = [t for t in seen if t not in BIO_LABELS]
print("Label vocab size:", len(BIO_LABELS))
print("Unknown tags (should be empty):", unknown[:10])
print("Sample mapping:", {k:label2id[k] for k in BIO_LABELS[:8]})
#“Sample mapping” is just a peek at the dictionary that converts your human-readable BIO tags into the integer IDs the model actually trains on.


Label vocab size: 13
Unknown tags (should be empty): []
Sample mapping: {'O': 0, 'B-NAME': 1, 'I-NAME': 2, 'B-EMAIL': 3, 'I-EMAIL': 4, 'B-PHONE': 5, 'I-PHONE': 6, 'B-ADDRESS': 7}


In [None]:
import numpy as np
from datasets import Dataset, DatasetDict

ds_all = Dataset.from_pandas(
    df[["TOKENS","BIO6"]].rename(columns={"TOKENS":"tokens","BIO6":"tags"}),
    preserve_index=False
)

# simple random split (90/10)
N = len(ds_all)
idx = np.arange(N)
rng = np.random.default_rng(42)
rng.shuffle(idx)
cut = int(0.9*N)
train_idx, val_idx = idx[:cut], idx[cut:]

ds = DatasetDict({
    "train": ds_all.select(train_idx.tolist()),
    "validation": ds_all.select(val_idx.tolist()),
})

print(ds)
print("Train rows:", ds["train"].num_rows, "| Val rows:", ds["validation"].num_rows)
print("One row example:", ds["train"][0].keys())


DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 3990
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 444
    })
})
Train rows: 3990 | Val rows: 444
One row example: dict_keys(['tokens', 'tags'])


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

def encode_batch(batch):
    # Tokenize list-of-words with alignment info
    enc = tokenizer(
        batch["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding=False,
        max_length=512,
    )
    all_labels = []
    for i, tags in enumerate(batch["tags"]):
        word_ids = enc.word_ids(batch_index=i)
        label_ids = []
        prev_wid = None
        for wid in word_ids:
            if wid is None:
                label_ids.append(-100)            # special tokens
            elif wid != prev_wid:
                label_ids.append(label2id[tags[wid]])  # first subword of this word
            else:
                label_ids.append(-100)            # subsequent subwords
            prev_wid = wid
        all_labels.append(label_ids)
    enc["labels"] = all_labels
    return enc

encoded = ds.map(
    encode_batch,
    batched=True,
    remove_columns=["tokens","tags"],
    desc="Tokenizing and aligning labels",
)

print(encoded)
row = encoded["train"][0]
print("Keys in encoded batch:", row.keys())
print("len(input_ids) vs len(labels):", len(row["input_ids"]), len(row["labels"]))
# How many labels are active (i.e., not -100) in this example?
active = sum(1 for x in row["labels"] if x != -100)
print("Active labels in example row:", active)
print("First 20 labels (ids):", row["labels"][:20])
print("First 20 label names:", [id2label[i] if i!=-100 else "PAD" for i in row["labels"][:20]])


Tokenizing and aligning labels:   0%|          | 0/3990 [00:00<?, ? examples/s]

Tokenizing and aligning labels:   0%|          | 0/444 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3990
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 444
    })
})
Keys in encoded batch: dict_keys(['input_ids', 'attention_mask', 'labels'])
len(input_ids) vs len(labels): 475 475
Active labels in example row: 347
First 20 labels (ids): [-100, 0, 0, 0, 0, 0, 0, 0, 0, -100, 1, 2, -100, 0, 0, 0, 0, 0, -100, 0]
First 20 label names: ['PAD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PAD', 'B-NAME', 'I-NAME', 'PAD', 'O', 'O', 'O', 'O', 'O', 'PAD', 'O']


In [None]:
from transformers import AutoModelForTokenClassification, AutoConfig

MODEL_NAME = "distilbert-base-cased"

config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=len(BIO_LABELS),
    id2label=id2label,
    label2id=label2id,
)

model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, config=config)
print(type(model).__name__, "loaded with", config.num_labels, "labels.")


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForTokenClassification loaded with 13 labels.


In [None]:
from transformers import DataCollatorForTokenClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

data_collator = DataCollatorForTokenClassification(tokenizer)

seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """
    p.predictions: (batch, seq_len, num_labels)
    p.label_ids:   (batch, seq_len) with -100 for ignored positions
    """
    preds = np.argmax(p.predictions, axis=2)
    true_preds, true_labels = [], []
    for pred, lab in zip(preds, p.label_ids):
        # keep only positions where label != -100
        keep = lab != -100
        pred_ids = pred[keep]
        lab_ids  = lab[keep]
        true_preds.append([id2label[int(i)] for i in pred_ids])
        true_labels.append([id2label[int(i)] for i in lab_ids])

    results = seqeval.compute(predictions=true_preds, references=true_labels, zero_division=0)
    # Flatten to a friendly dict: overall + per-entity F1
    out = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
    }
    for ent, stats in results.items():
        if isinstance(stats, dict) and "f1" in stats:
            out[f"f1_{ent}"] = stats["f1"]
    return out

args = TrainingArguments(
    output_dir="pii-ner-distilbert",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="overall_f1",
    report_to="none",
    logging_steps=50,
    seed=42,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
print("Trainer ready.")


Trainer ready.


  trainer = Trainer(


In [None]:
# train_out = trainer.train()
# print("\n=== Training summary ===")
# print(train_out)

# eval_out = trainer.evaluate()
# print("\n=== Eval metrics (validation) ===")
# for k,v in sorted(eval_out.items()):
#     if k.startswith("eval_"):
#         print(f"{k}: {v:.4f}" if isinstance(v, (int,float)) else f"{k}: {v}")

# # ---------- SAVE THE MODEL & TOKENIZER ----------
# SAVE_DIR = "pii-ner-distilbert"   # reuse output_dir or choose a new folder

# trainer.save_model(SAVE_DIR)      # writes pytorch_model.bin + config.json (with id2label/label2id)
# tokenizer.save_pretrained(SAVE_DIR)

# # quick confirmation
# print("\nSaved files in", SAVE_DIR, ":\n", os.listdir(SAVE_DIR))




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# --- SPEED MODE SWITCHES ---
MAX_LEN = 128            # shorter sequences = much faster
TRAIN_ROWS = 1500        # small train subset
VAL_ROWS = 300           # small val subset
EPOCHS = 1               # quick pass
BATCH = 32               # try 24/32; lower if you hit RAM issues

# 1) Small, fast backbone
from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification
MODEL_NAME = "distilbert-base-cased"  # tiny & quick

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=len(BIO_LABELS),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, config=config)

# 2) Re-tokenize with shorter max_length and first-subword labeling
def encode_batch(batch):
    enc = tokenizer(
        batch["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding=False,
        max_length=MAX_LEN,
    )
    all_labels = []
    for i, tags in enumerate(batch["tags"]):
        word_ids = enc.word_ids(batch_index=i)
        label_ids, prev_wid = [], None
        for wid in word_ids:
            if wid is None:
                label_ids.append(-100)
            elif wid != prev_wid:
                label_ids.append(label2id[tags[wid]])
            else:
                label_ids.append(-100)
            prev_wid = wid
        all_labels.append(label_ids)
    enc["labels"] = all_labels
    return enc

from datasets import DatasetDict
encoded_fast = DatasetDict({
    "train": ds["train"].select(range(min(TRAIN_ROWS, ds["train"].num_rows))),
    "validation": ds["validation"].select(range(min(VAL_ROWS, ds["validation"].num_rows))),
}).map(encode_batch, batched=True, remove_columns=["tokens","tags"], desc="Tokenize (speed mode)")

# 3) Compat-safe Trainer args (only pass what your version supports)
from transformers import DataCollatorForTokenClassification, TrainingArguments, Trainer
import inspect, numpy as np, evaluate

data_collator = DataCollatorForTokenClassification(tokenizer)
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    true_preds, true_labels = [], []
    for pred, lab in zip(preds, p.label_ids):
        keep = lab != -100
        true_preds.append([id2label[int(i)] for i in pred[keep]])
        true_labels.append([id2label[int(i)] for i in lab[keep]])
    res = seqeval.compute(predictions=true_preds, references=true_labels, zero_division=0)
    out = {
        "overall_precision": res.get("overall_precision", 0.0),
        "overall_recall": res.get("overall_recall", 0.0),
        "overall_f1": res.get("overall_f1", 0.0),
    }
    for ent, st in res.items():
        if isinstance(st, dict) and "f1" in st:
            out[f"f1_{ent}"] = st["f1"]
    return out

sig = inspect.signature(TrainingArguments.__init__)
params = set(sig.parameters.keys())

kwargs = dict(
    output_dir="pii-ner-fast",
    learning_rate=5e-5,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    num_train_epochs=EPOCHS,
    weight_decay=0.0,      # less overhead
    logging_steps=100,
    seed=42,
)

# add optional args only if your installed version supports them
if "dataloader_num_workers" in params:
    kwargs["dataloader_num_workers"] = 2
if "report_to" in params:
    kwargs["report_to"] = "none"
# we skip eval/save strategies during training; we’ll eval once after

args = TrainingArguments(**kwargs)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_fast["train"],
    eval_dataset=encoded_fast["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Trainer ready (speed mode).")

# 4) Train quick, then evaluate once
train_out = trainer.train()
print("\n=== Training summary (speed mode) ===")
print(train_out)

eval_out = trainer.evaluate()
print("\n=== Eval metrics (validation, speed mode) ===")
for k, v in sorted(eval_out.items()):
    if k.startswith("eval_"):
        print(f"{k}: {v:.4f}" if isinstance(v, (int,float)) else f"{k}: {v}")

# Save a checkpoint you can test with Step 8
trainer.save_model("pii-ner-fast")
tokenizer.save_pretrained("pii-ner-fast")
print("\nModel saved to: pii-ner-fast")


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenize (speed mode):   0%|          | 0/1500 [00:00<?, ? examples/s]

Tokenize (speed mode):   0%|          | 0/300 [00:00<?, ? examples/s]

  trainer = Trainer(


Trainer ready (speed mode).


Step,Training Loss



=== Training summary (speed mode) ===
TrainOutput(global_step=47, training_loss=0.23462857591344954, metrics={'train_runtime': 380.3809, 'train_samples_per_second': 3.943, 'train_steps_per_second': 0.124, 'total_flos': 49004657280000.0, 'train_loss': 0.23462857591344954, 'epoch': 1.0})





=== Eval metrics (validation, speed mode) ===
eval_f1_ADDRESS: 0.8056
eval_f1_EMAIL: 0.0000
eval_f1_NAME: 0.6796
eval_f1_PHONE: 0.0000
eval_f1_URL_PERSONAL: 0.0000
eval_f1_USERNAME: 0.0000
eval_loss: 0.0326
eval_overall_f1: 0.6621
eval_overall_precision: 0.6175
eval_overall_recall: 0.7137
eval_runtime: 50.4945
eval_samples_per_second: 5.9410
eval_steps_per_second: 0.1980

Model saved to: pii-ner-fast


In [2]:
import re

from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

MODEL_DIR = "./pii-ner-fast"  # <- path to your saved model folder

tok = AutoTokenizer.from_pretrained(MODEL_DIR)
mdl = AutoModelForTokenClassification.from_pretrained(MODEL_DIR)
mdl.eval()

import re

def extend_spans_to_word_end(spans, text, labels={"NAME","ADDRESS"}):
    """
    If a span ends in the middle of a word (common with subword splits),
    extend it to the end of that word. E.g., '[NAME]onggo' -> '[NAME]'.
    """
    out = []
    for s in spans:
        if s["label"] in labels:
            end = s["end"]
            L = len(text)
            # extend while next char is a word char or common name/addr joiners
            while end < L and re.match(r"[A-Za-z0-9'’-]", text[end]):
                end += 1
            s = {"start": s["start"], "end": end, "label": s["label"]}
        out.append(s)
    return out


def coalesce_same_label_spans(spans, text, max_gap_chars=2):
    """
    Merge consecutive spans with the same label if the gap between them is tiny
    (e.g., subword tail like 'oh' or '#11'), so '[ADDRESS]oh [ADDRESS]113' -> '[ADDRESS]'.
    """
    if not spans:
        return spans
    spans = sorted(spans, key=lambda s: (s["start"], s["end"]))
    merged = [spans[0]]

    for s in spans[1:]:
        prev = merged[-1]
        if s["label"] == prev["label"]:
            gap = text[prev["end"]:s["start"]]
            # allow tiny tails: up to N alnum or -/#, with optional surrounding whitespace
            if re.fullmatch(rf"\s*[-/#A-Za-z0-9]{{0,{max_gap_chars}}}\s*", gap):
                # extend previous span to include gap + current span
                prev["end"] = s["end"]
                continue
        merged.append(s)
    return merged

def predict_tags(text: str):
    enc = tok(text, return_offsets_mapping=True, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        out = mdl(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])
    pred_ids = out.logits.argmax(-1)[0].tolist()
    tags = [mdl.config.id2label[int(i)] for i in pred_ids]

    # drop special tokens (offsets == (0,0))
    offsets = enc["offset_mapping"][0].tolist()
    toks, clean_tags, clean_offs = [], [], []
    for (s,e), tag in zip(offsets, tags):
        if s==0 and e==0:  # skip special tokens
            continue
        toks.append(text[s:e]); clean_tags.append(tag); clean_offs.append((s,e))
    return toks, clean_tags, clean_offs


# Merge BIO → character spans
def bio_to_char_spans(offsets, tags):
    spans, cur = [], None
    for (s,e), tag in zip(offsets, tags):
        if tag.startswith("B-"):
            if cur: spans.append(cur)
            cur = {"start": s, "end": e, "label": tag.split("-",1)[1]}
        elif tag.startswith("I-"):
            ent = tag.split("-",1)[1]
            if cur and cur["label"] == ent and s <= cur["end"] + 1:
                cur["end"] = e
            else:
                cur = {"start": s, "end": e, "label": ent}
        else:
            if cur: spans.append(cur); cur = None
    if cur: spans.append(cur)
    return spans

# High-precision regex for structured PII
EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
PHONE_RE = re.compile(r"\b(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{1,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}\b")
URL_RE   = re.compile(r"\bhttps?://[^\s]+", re.I)
HANDLE_RE= re.compile(r"@\w{1,32}")

def regex_spans(text):
    out = []
    for m in EMAIL_RE.finditer(text):
        out.append({"start": m.start(), "end": m.end(), "label": "EMAIL"})
    for m in PHONE_RE.finditer(text):
        out.append({"start": m.start(), "end": m.end(), "label": "PHONE"})
    for m in URL_RE.finditer(text):
        out.append({"start": m.start(), "end": m.end(), "label": "URL_PERSONAL"})
    for m in HANDLE_RE.finditer(text):
        out.append({"start": m.start(), "end": m.end(), "label": "USERNAME"})
    return out

def ner_spans(text):
    toks, tags, offs = predict_tags(text)
    print("tags:", tags)
    print("offs:", offs)
    return bio_to_char_spans(offs, tags)

def merge_spans(spans):
    spans = sorted(spans, key=lambda s: (s["start"], -(s["end"]-s["start"])))
    merged = []
    for s in spans:
        if not merged or s["start"] >= merged[-1]["end"]:
            merged.append(s)
        else:
            if (s["end"]-s["start"]) > (merged[-1]["end"]-merged[-1]["start"]):
                merged[-1] = s
    return merged

def redact(text, style="tags"):
    # regex for EMAIL/PHONE/URL/USERNAME + NER for NAME/ADDRESS (+ any extras the model finds)
    r = regex_spans(text)
    n = ner_spans(text)

    keep = r[:]  # always keep structured regex hits
    covered = {(s["start"], s["end"]) for s in r}
    for s in n:
        # Always include model-detected NAME/ADDRESS; include others if regex didn’t already catch
        if s["label"] in {"NAME","ADDRESS"} or (s["start"], s["end"]) not in covered:
            keep.append(s)

    spans = merge_spans(keep)
    spans = coalesce_same_label_spans(spans, text, max_gap_chars=2)
    spans = extend_spans_to_word_end(spans, text, labels={"NAME","ADDRESS"})

    out, last = [], 0
    for s in spans:
        out.append(text[last:s["start"]])
        token = f"[{s['label']}]" if style=="tags" else "█"*(s["end"]-s["start"])
        out.append(token)
        last = s["end"]
    out.append(text[last:])
    return "".join(out), spans

# Try a few examples:
samples = [
    "My name is Winston Leonard Prayonggo. My website is https://example.com/winston. You can reach me at"
]
for s in samples:
    red, spans = redact(s, style="tags")
    print("\nIN :", s)
    print("OUT:", red)
    print("SPN:", spans)


  from pandas.core import (


tags: ['O', 'O', 'O', 'B-NAME', 'B-NAME', 'B-NAME', 'B-NAME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
offs: [(0, 2), (3, 7), (8, 10), (11, 18), (19, 26), (27, 28), (28, 31), (31, 34), (34, 36), (36, 37), (38, 40), (41, 48), (49, 51), (52, 57), (57, 58), (58, 59), (59, 60), (60, 67), (67, 68), (68, 71), (71, 72), (72, 76), (76, 79), (79, 80), (81, 84), (85, 88), (89, 94), (95, 97), (98, 100)]

IN : My name is Winston Leonard Prayonggo. My website is https://example.com/winston. You can reach me at
OUT: My name is [NAME]. My website is [URL_PERSONAL] You can reach me at
SPN: [{'start': 11, 'end': 36, 'label': 'NAME'}, {'start': 52, 'end': 80, 'label': 'URL_PERSONAL'}]
