# Train SpanMarker to Improve Single Entity NER Model

Goals:
1. Build custom dataset specifically for "Person" entity
2. Train SpanMarker NER model (maybe hyperparameter optimization)
3. Evaluate performance


In [None]:
!pip install span_marker[wandb] names-dataset

In [None]:
from huggingface_hub import notebook_login

notebook_login()

## 1. Explore and prepare data

In [56]:
from datasets import load_dataset

dataset_id = "DFKI-SLT/few-nerd"
dataset = load_dataset(dataset_id, "supervised")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'fine_ner_tags'],
        num_rows: 131767
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'fine_ner_tags'],
        num_rows: 18824
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'fine_ner_tags'],
        num_rows: 37648
    })
})

In [57]:
for sample in dataset["train"].select(range(3)):
    print(sample)
    print()

{'id': '0', 'tokens': ['Paul', 'International', 'airport', '.'], 'ner_tags': [0, 0, 0, 0], 'fine_ner_tags': [0, 0, 0, 0]}

{'id': '1', 'tokens': ['It', 'starred', 'Hicks', "'s", 'wife', ',', 'Ellaline', 'Terriss', 'and', 'Edmund', 'Payne', '.'], 'ner_tags': [0, 0, 7, 0, 0, 0, 7, 7, 0, 7, 7, 0], 'fine_ner_tags': [0, 0, 51, 0, 0, 0, 50, 50, 0, 50, 50, 0]}

{'id': '2', 'tokens': ['``', 'Time', '``', 'magazine', 'said', 'the', 'film', 'was', '``', 'a', 'multimillion', 'dollar', 'improvisation', 'that', 'does', 'everything', 'but', 'what', 'the', 'title', 'promises', "''", 'and', 'suggested', 'that', '``', 'writer', 'George', 'Axelrod', '(', '``', 'The', 'Seven', 'Year', 'Itch', '``', ')', 'and', 'director', 'Richard', 'Quine', 'should', 'have', 'taken', 'a', 'hint', 'from', 'Holden', '[', "'s", 'character', 'Richard', 'Benson', ']', ',', 'who', 'writes', 'his', 'movie', ',', 'takes', 'a', 'long', 'sober', 'look', 'at', 'what', 'he', 'has', 'wrought', ',', 'and', 'burns', 'it', '.', "''"], 

In [58]:
# inspect labels
labels = dataset["train"].features["ner_tags"].feature.names
print(labels)

['O', 'art', 'building', 'event', 'location', 'organization', 'other', 'person', 'product']


### Create dataset that consists of examples with `Person` entity only

**Note** - Exactly half of the examples in each split contain Person entities, while other half don't have any. This is to help with diversity of training data and model generalization.

In [61]:
from datasets import ClassLabel, Features, Sequence, Value


FEATURES = Features(
    {
        "tokens": Sequence(feature=Value(dtype="string")),
        "ner_tags": Sequence(feature=ClassLabel(names=["O", "B-PER", "I-PER"])),
    }
)


def get_fewnerd(entity_idx):
    """Loads FewNERD dataset and reformats labels to only include those for the specified entity.

    Args:
        entity_idx (int): The index of the entity from FewNERD dataset to keep.

    Returns:
        dataset: The filtered dataset containing `ner_tags` for only the specified entity.

    """

    def mapper(sample):
        sample["ner_tags"] = [int(tag == entity_idx) for tag in sample["ner_tags"]]
        sample["ner_tags"] = [
            2 if tag == 1 and idx > 0 and sample["ner_tags"][idx - 1] == 1 else tag
            for idx, tag in enumerate(sample["ner_tags"])
        ]
        return sample

    dataset = load_dataset("DFKI-SLT/few-nerd", "supervised")
    dataset = dataset.map(mapper, remove_columns=["id", "fine_ner_tags"])
    dataset = dataset.cast(FEATURES)
    return dataset

In [62]:
fewnerd_dataset = get_fewnerd(7)
fewnerd_dataset

Map:   0%|          | 0/131767 [00:00<?, ? examples/s]

Map:   0%|          | 0/18824 [00:00<?, ? examples/s]

Map:   0%|          | 0/37648 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/131767 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/18824 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/37648 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 131767
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 18824
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 37648
    })
})

In [147]:
import random
from datasets import concatenate_datasets, DatasetDict, Dataset

random.seed(42)


def has_person(sample):
    return bool(sum(sample["ner_tags"]))


def has_no_person(sample):
    return not has_person(sample)


def preprocess_raw_dataset(raw_dataset):
    """
    Preprocesses the raw dataset by filtering out examples that do not contain a person entity,
    balancing the dataset by randomly selecting and including examples without a person entity,
    and concatenating the filtered datasets.

    Args:
        raw_dataset (Dataset): The raw dataset to be preprocessed.

    Returns:
        Dataset: The preprocessed dataset.
    """
    dataset_person = raw_dataset.filter(has_person)
    dataset_no_person = raw_dataset.filter(has_no_person)
    dataset_no_person = dataset_no_person.select(
        random.sample(range(len(dataset_no_person)), k=len(dataset_person))
    )
    dataset = concatenate_datasets([dataset_person, dataset_no_person])
    return dataset

In [64]:
# prepare dataset
processed_fewnerd_train = preprocess_raw_dataset(fewnerd_dataset["train"])
processed_fewnerd_test = preprocess_raw_dataset(fewnerd_dataset["test"])
processed_fewnerd_val = preprocess_raw_dataset(fewnerd_dataset["validation"])

processed_dataset_dict = DatasetDict(
    {
        "train": processed_fewnerd_train,
        "validation": processed_fewnerd_val,
        "test": processed_fewnerd_test,
    }
)
processed_dataset_dict

Filter:   0%|          | 0/131767 [00:00<?, ? examples/s]

Filter:   0%|          | 0/131767 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37648 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37648 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18824 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18824 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 85524
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 12546
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 24422
    })
})

In [66]:
for sample in processed_dataset_dict["train"].select(range(10)):
    print(sample)
    print()

{'tokens': ['It', 'starred', 'Hicks', "'s", 'wife', ',', 'Ellaline', 'Terriss', 'and', 'Edmund', 'Payne', '.'], 'ner_tags': [0, 0, 1, 0, 0, 0, 1, 2, 0, 1, 2, 0]}

{'tokens': ['``', 'Time', '``', 'magazine', 'said', 'the', 'film', 'was', '``', 'a', 'multimillion', 'dollar', 'improvisation', 'that', 'does', 'everything', 'but', 'what', 'the', 'title', 'promises', "''", 'and', 'suggested', 'that', '``', 'writer', 'George', 'Axelrod', '(', '``', 'The', 'Seven', 'Year', 'Itch', '``', ')', 'and', 'director', 'Richard', 'Quine', 'should', 'have', 'taken', 'a', 'hint', 'from', 'Holden', '[', "'s", 'character', 'Richard', 'Benson', ']', ',', 'who', 'writes', 'his', 'movie', ',', 'takes', 'a', 'long', 'sober', 'look', 'at', 'what', 'he', 'has', 'wrought', ',', 'and', 'burns', 'it', '.', "''"], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

### Augment the dataset with additional examples

Peoples names often occur as standalone strings rather than just being included in a sentence with contextual information. To improve performance on names occuring in this paradigm, let's create some specific examples to augment our dataset with.



In [136]:
import math
import random
from names_dataset import NameDataset


def generate_full_names(n, country_codes):
    """Creates a list of full names from the top `n` first and last names for each country.

    Randomly mixes first and last names.

    Uses this library: https://github.com/philipperemy/name-dataset

    Args:
        n (int): The number of first and last names to use for each country.
        country_codes (list): A list of country codes to use.

    Returns:
        list: A list of full names.
    """
    nd = NameDataset()
    full_names = []
    for code in country_codes:
        first_names = nd.get_top_names(n=math.ceil(n / 2), country_alpha2=code)[code]
        first_names = first_names["M"] + first_names["F"]

        last_names = nd.get_top_names(n=n, country_alpha2=code, use_first_names=False)[
            code
        ]

        random.seed(42)
        random.shuffle(first_names)
        random.shuffle(last_names)

        if len(first_names) != len(last_names):
            first_names = first_names[: len(last_names)]

        full_names += [
            f"{first} {last}" for first, last in zip(first_names, last_names)
        ]

    return full_names

In [197]:
def build_names_dataset():
    def tokenize_and_annotate_names(example):
        example["tokens"] = example["text"].split(" ")
        example["ner_tags"] = [
            1 if idx == 0 else 2 for idx, _ in enumerate(example["tokens"])
        ]
        return example

    # generate list of full names
    nd = NameDataset()
    country_codes = [country.alpha_2 for country in nd.get_country_codes()]
    sample_names = generate_full_names(500, country_codes=country_codes)
    random.shuffle(sample_names)

    # create HF dataset with required features
    names_dataset = Dataset.from_dict({"text": sample_names})
    names_dataset = names_dataset.map(
        tokenize_and_annotate_names, remove_columns=["text"]
    )
    names_dataset = names_dataset.cast(FEATURES)

    return names_dataset

In [198]:
names_dataset = build_names_dataset()

Map:   0%|          | 0/52472 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/52472 [00:00<?, ? examples/s]

In [199]:
for sample in names_dataset.select(range(5)):
    print(sample)
    print()

{'tokens': ['Виктор', 'Sochirca'], 'ner_tags': [1, 2]}

{'tokens': ['Hugo', 'Inacio'], 'ner_tags': [1, 2]}

{'tokens': ['Fransiska', 'Manalu'], 'ner_tags': [1, 2]}

{'tokens': ['Mark', 'Danladi'], 'ner_tags': [1, 2]}

{'tokens': ['Nilima', 'Bappy'], 'ner_tags': [1, 2]}



### Combine datasets

In [203]:
names_dataset_traintest = names_dataset.train_test_split(test_size=0.3, seed=42)
names_dataset_val = names_dataset_traintest["test"].train_test_split(
    test_size=0.5, seed=42
)

In [205]:
names_dataset_dict = DatasetDict(
    {
        "train": names_dataset_traintest["train"],
        "validation": names_dataset_val["train"],
        "test": names_dataset_val["test"],
    }
)

In [210]:
full_dataset_dict = DatasetDict()
for key in processed_dataset_dict.keys():
    full_dataset_dict[key] = concatenate_datasets(
        [processed_dataset_dict[key], names_dataset_dict[key]]
    )

In [224]:
full_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 122254
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 20417
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 32293
    })
})

In [223]:
full_dataset_dict.push_to_hub("andrewrreed/fewnerd-person-names-augmented")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/123 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/33 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/andrewrreed/fewnerd-person-names-augmented/commit/b55ac0adfaf5276d3dbb3071b7bd44939593f834', commit_message='Upload dataset', commit_description='', oid='b55ac0adfaf5276d3dbb3071b7bd44939593f834', pr_url=None, pr_revision=None, pr_num=None)

**Additional thoughts on data curation for future work**
- It might be good to include some short phrases of a 1-5 tokens that are NOT Person entities. This will force the model to not assume every 2-3 word string is a name.... reducing false positives
- With a large list of diverse names, you could also experiment with generating synthetic data with an LLM that inserts that name into a random sentence. This would improve diversity and number of samples.

## Train SpanMarker NER model

In [14]:
%%writefile train_spanmarker.py
import os
from pathlib import Path
import random
import shutil
from datasets import load_dataset, concatenate_datasets, Features, Sequence, ClassLabel, Value, DatasetDict
from transformers import TrainingArguments
from span_marker import SpanMarkerModel, Trainer
from span_marker.model_card import SpanMarkerModelCardData
from huggingface_hub import upload_folder, upload_file

os.environ["WANDB_PROJECT"]="spanmarker-ner-single-entity"

def main() -> None:
    # Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels
    labels = ["O", "B-PER", "I-PER"]
    dataset_id = "andrewrreed/fewnerd-person-names-augmented"
    dataset = load_dataset(dataset_id)

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]
    val_dataset = dataset["validation"]

    # Initialize a SpanMarker model using a pretrained ROBERTA-style encoder
    encoder_id = "roberta-base"
    model_id = "andrewrreed/span-marker-roberta-base-person-names-augmented"
    model = SpanMarkerModel.from_pretrained(
        encoder_id,
        labels=labels,
        # SpanMarker hyperparameters:
        model_max_length=512,
        marker_max_length=128,
        entity_max_length=8,
        # Model card variables
        model_card_data=SpanMarkerModelCardData(
            model_id=model_id,
            dataset_id=dataset_id,
            encoder_id=encoder_id,
            license="cc-by-sa-4.0",
        ),
    )

    # Prepare the 🤗 transformers training arguments
    output_dir = Path("models") / model_id
    args = TrainingArguments(
        output_dir=output_dir,
        run_name=model_id,
        # Training Hyperparameters:
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        warmup_ratio=0.1,
        bf16=True,  # Replace `bf16` with `fp16` if your hardware can't use bf16.
        # Other Training parameters
        logging_first_step=True,
        logging_steps=200,
        evaluation_strategy="steps",
        save_strategy="steps",
        save_steps=1000,
        eval_steps=1000,
        load_best_model_at_end=True,
        save_total_limit=3,
        dataloader_num_workers=4,
        report_to="wandb",
    )

    # Initialize the trainer using our model, training args & dataset, and train
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )
    trainer.train()

    # Compute & save the metrics on the val set
    metrics = trainer.evaluate(val_dataset, metric_key_prefix="val")
    trainer.save_metrics("val", metrics)

    # Save the model & training script locally
    trainer.save_model(output_dir / "checkpoint-final")
    shutil.copy2(__file__, output_dir / "checkpoint-final" / "train.py")

    # Upload everything to the Hub
    # model.push_to_hub(model_id, private=True)
    # upload_folder(folder_path=output_dir / "runs", path_in_repo="runs", repo_id=model_id)
    # upload_file(path_or_fileobj=__file__, path_in_repo="train.py", repo_id=model_id)
    # upload_file(path_or_fileobj=output_dir / "all_results.json", path_in_repo="all_results.json", repo_id=model_id)
    # upload_file(path_or_fileobj=output_dir / "emissions.csv", path_in_repo="emissions.csv", repo_id=model_id)


if __name__ == "__main__":
    main()


Overwriting train_spanmarker.py


In [2]:
from pathlib import Path
from span_marker import SpanMarkerModel

model_id = "andrewrreed/span-marker-roberta-base-person-names-augmented"
output_dir = Path("models") / model_id / "checkpoint-final"
model = SpanMarkerModel.from_pretrained(output_dir)

In [3]:
model.push_to_hub(model_id)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/andrewrreed/span-marker-roberta-base-person-names-augmented/commit/42529e5602b47129f22463a8f70cdb9a7be6238c', commit_message='Upload model', commit_description='', oid='42529e5602b47129f22463a8f70cdb9a7be6238c', pr_url=None, pr_revision=None, pr_num=None)

In [4]:
from huggingface_hub import upload_folder, upload_file

upload_folder(folder_path=output_dir / "runs", path_in_repo="runs", repo_id=model_id)
upload_file(path_or_fileobj=__file__, path_in_repo="train.py", repo_id=model_id)
upload_file(
    path_or_fileobj=output_dir / "all_results.json",
    path_in_repo="all_results.json",
    repo_id=model_id,
)
upload_file(
    path_or_fileobj=output_dir / "emissions.csv",
    path_in_repo="emissions.csv",
    repo_id=model_id,
)

ValueError: Provided path: '/home/ubuntu/hf-notebooks/spanmarker-ner-single-entity/models/andrewrreed/span-marker-roberta-base-person-names-augmented/checkpoint-final/runs' is not a directory