# Named Entity Recognition

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/KennethEnevoldsen/DaCy/blob/master/docs/performance.ner.ipynb)


This page examines the performance of competing models for Danish named entity recognition over multiple datasets. Performance is not limited to
accuracy, but also includes domain generalization, biases and robustness. This page is also a notebook, which you can open to replicate the results.

## State-of-the-Art comparison
To our knowledge, there exist three datasets for Danish named entity recognition:

1) DaNE {cite}`hvingelby2020dane`, which uses the simple annotation scheme of CoNLL 2003 {cite}`missing` with the entities *person*, *location*, *organization*, and *miscellaneous*.
2) DANSK {cite}`missing`, which uses an extensive annotation scheme similar to that of OntoNotes 5.0 {cite}`missing`, including more than 16 entity types.
3) and DAN+ {cite}`missing`, which also uses the annotation scheme of CoNLL 2003, but allows for nested entities, for instance *Aarhus Universitet*, where *Aarhus* is a location and *Aarhus Universitet* is an organization.

In this comparison we will be examining performance on DaNE and DANSK, but as no known models have been trained on Danish nested entities, we will not be comparing performance on DAN+.


```{admonition} Measuring Performance
Typically, when measuring performance on these benchmarks, it is normal to feed the model the gold-standard tokens. While this allows for easier comparisons of modules and architectures, it inflates the performance metrics. Further, it does not properly reflect what you are really interested in:
*the performance you can expect when you apply the model to data of a similar type*. Therefore, we estimate performance with the model given no prior knowledge of the data: only the raw text is fed to the model. Thus the performance metrics might be slightly different compared to e.g. DaNLP.
```

### DaNE: Simple Named Entity Recognition
As already stated, DaNE uses the annotation scheme of the CoNLL 2003 dataset, which is as follows {cite}`hvingelby2020dane`:


| Entity | Description |
|--------------|-------------|
| LOC          | includes locations like cities, roads and mountains, as well as both public and commercial places like specific buildings or meeting points, but also abstract places. |
| PERSON | consists of names of people, fictional characters, and animals. The names include aliases. |
| ORG | can be summarized as all sorts of organizations and collections of people, ranging from companies, brands, political movements, governmental bodies and clubs. |
| MISC | is a broad category of e.g. events, languages, titles and religions, but this tag also includes words derived from one of the four tags as well as words for which one part is from one of the three other tags. |

Here is an example from the dataset:

In [1]:
import spacy
from spacy.tokens import Span
from spacy import displacy

# Example sentence rendered with manually assigned entity spans.
text = """To kendte russiske historikere Andronik Mirganjan og Igor Klamkin tror ikke, at Rusland kan udvikles uden en "jernnæve"."""
nlp = spacy.blank("da")
doc = nlp(text)

# Each triple is (start token index, end token index, entity label) and is
# passed straight to ``Span``.
doc.ents = [  # type: ignore
    Span(doc, start, end, label=label)
    for start, end, label in (
        (2, 3, "MISC"),
        (4, 6, "PERSON"),
        (7, 9, "PERSON"),
        (13, 14, "LOC"),
    )
]

displacy.render(doc, style="ent")

The table below shows the performance of Danish language processing pipelines scored on the DaNE test set. The best scores in each category are highlighted with bold and the second best is underlined.

In [2]:
## Apply each of the models to DaNE
from pathlib import Path
from re import L
from typing import Callable, List

from numpy import save

from evaluation.models import MODELS
from evaluation.datasets import datasets
from spacy.training import Example
from spacy.language import Language
import spacy

from spacy.tokens import Doc
import json


def doc_to_json(doc: Doc):
    """Serialise ``doc`` with ``Doc.to_json`` and keep its ``meta`` extension.

    Args:
        doc: The document to serialise.

    Returns:
        The dictionary produced by ``doc.to_json()``. When ``doc._`` exposes a
        ``meta`` attribute its value is stored under the ``"meta"`` key.
    """
    payload = doc.to_json()
    extensions = doc._
    if not hasattr(extensions, "meta"):
        return payload
    payload["meta"] = extensions.meta
    return payload


def doc_from_json(json_obj: dict, nlp: Language):
    """Rebuild a ``Doc`` from the dictionary written by ``doc_to_json``.

    Args:
        json_obj: JSON-compatible dictionary describing the document; an
            optional ``"meta"`` key is restored onto ``doc._.meta``.
        nlp: Pipeline whose vocabulary the new ``Doc`` is created with.

    Returns:
        The reconstructed ``Doc``.
    """
    restored = Doc(nlp.vocab).from_json(json_obj)
    if "meta" not in json_obj:
        return restored

    # The extension has to be registered before it can be assigned to.
    if not Doc.has_extension("meta"):
        Doc.set_extension("meta", default={}, force=True)
    restored._.meta = json_obj["meta"]
    return restored


def predictions_to_disk(
    save_path: Path, examples: List[Example], mdl_name: str, time_in_seconds: float
):
    """Write a model's predictions and run metadata to ``save_path`` as JSON.

    Args:
        save_path: Target JSON file; missing parent directories are created.
        examples: Examples whose ``predicted`` and ``reference`` docs are
            serialised with ``doc_to_json``.
        mdl_name: Name of the model, stored under ``"mdl_name"``.
        time_in_seconds: Run time, stored under ``"time_in_seconds"``.

    Returns:
        The dictionary that was written to disk, extended with an
        ``"examples"`` key holding the original ``examples`` (that key is not
        part of the JSON file).
    """
    save_path.parent.mkdir(exist_ok=True, parents=True)

    meta = {
        "mdl_name": mdl_name,
        "time_in_seconds": time_in_seconds,
        "Hardware": "Apple M1 Pro 16Gb running macOS 13.3.1",
        "predicted": [doc_to_json(example.predicted) for example in examples],
        "reference": [doc_to_json(example.reference) for example in examples],
    }

    with open(save_path, "w") as handle:
        json.dump(meta, handle, indent=2)

    # Added after dumping: the Example objects are returned but never serialised.
    meta["examples"] = examples
    return meta


def predictions_from_disk(path: Path) -> dict:
    """Load a JSON file of predictions and rebuild its ``Example`` objects.

    Args:
        path: JSON file containing ``"reference"`` and ``"predicted"`` lists
            of serialised docs.

    Returns:
        The loaded dictionary with an added ``"examples"`` key: one
        ``Example`` per reference/predicted pair, in file order.
    """
    nlp = spacy.blank("da")
    with open(path) as handle:
        meta = json.load(handle)

    def restore(key):
        """Deserialise every document stored under ``key``."""
        return [doc_from_json(raw_doc, nlp) for raw_doc in meta[key]]

    references = restore("reference")
    predictions = restore("predicted")

    meta["examples"] = [
        Example(reference=ref, predicted=pred)
        for ref, pred in zip(references, predictions)
    ]
    return meta


def apply_models(
    mdl_name,
    mdl_getter: Callable[[], Language],
    dataset: str,
    splits: list[str] = ["test"],
    cache: bool = True,
):
    """Run a model on the requested dataset splits, reusing cached predictions.

    Predictions for each split are stored in
    ``evaluation/data/<model name>/<dataset>_<split>.json`` (``/`` in the model
    name is replaced by ``_``) and are always read back from that file so that
    fresh and cached runs return the same representation.

    Args:
        mdl_name: Name of the model; used for the cache folder and log lines.
        mdl_getter: Zero-argument callable returning the pipeline to apply.
        dataset: Key looked up in ``datasets`` to obtain the dataset getter.
        splits: Names of the splits to process. The default list is only read,
            never modified.
        cache: When true, an existing prediction file is loaded instead of
            running the model. When false, the model is always run and the
            file is overwritten.

    Returns:
        A dictionary mapping each split name to the dictionary returned by
        ``predictions_from_disk``.

    Raises:
        ValueError: If ``dataset`` is not a key of ``datasets``.
    """
    from time import time

    docs_path = Path(".")
    _mdl_name = mdl_name.replace("/", "_")
    save_folder = docs_path / "evaluation" / "data" / f"{_mdl_name}"

    # Loaded lazily and at most once, so fully cached runs stay cheap.
    nlp = None
    dataset_splits = None

    results = {}
    for split in splits:
        save_path = save_folder / f"{dataset}_{split}.json"
        if cache and save_path.exists():
            print(f"{dataset} ({split}): Loading prediction for {mdl_name}")
        else:
            print(f"{dataset} ({split}): Running {mdl_name}")
            if dataset_splits is None:
                dataset_getter = datasets.get(dataset)
                if dataset_getter is None:
                    raise ValueError(f"Unknown dataset: {dataset!r}")
                dataset_splits = dataset_getter()
            examples = dataset_splits[split]
            if nlp is None:
                nlp = mdl_getter()

            # Only the raw text is given to the model; timing covers inference only.
            start = time()
            docs = nlp.pipe(example.reference.text for example in examples)
            for doc, example in zip(docs, examples):
                example.predicted = doc
            end = time()
            time_in_seconds = end - start
            # The return value is deliberately ignored: assigning it to
            # ``results`` would discard the splits collected so far.
            predictions_to_disk(save_path, examples, mdl_name, time_in_seconds)

        results[split] = predictions_from_disk(save_path)

    return results


# Collect the DaNE test-split predictions of every registered model.
dane = {}
for mdl_name, model_getter in MODELS.items():
    mdl_results = apply_models(
        mdl_name,
        model_getter,
        dataset="dane",
        splits=["test"],
    )
    dane.update({mdl_name: mdl_results["test"]})

dane (test): Loading prediction for saattrupdan/nbailab-base-ner-scandi
dane (test): Loading prediction for da_dacy_large_trf-0.2.0
dane (test): Loading prediction for da_dacy_medium_trf-0.2.0
dane (test): Loading prediction for da_dacy_small_trf-0.2.0
dane (test): Running da_dacy_large_ner_fine_grained-0.1.0
dane (test): Running da_dacy_medium_ner_fine_grained-0.1.0
dane (test): Running da_dacy_small_ner_fine_grained-0.1.0
dane (test): Running alexandrainst/da-ner-base
dane (test): Running da_core_news_trf-3.5.0
dane (test): Running da_core_news_lg-3.5.0
dane (test): Running da_core_news_md-3.5.0
dane (test): Running da_core_news_sm-3.5.0


In [3]:
from typing import Any, Dict, Optional
import random
from spacy.scorer import Scorer
from spacy.training import Example
import numpy as np
import pandas as pd


def bootstrap(
    examples: List[Example], n_rep: int = 100, getter: Optional[Callable] = None
):
    """Score ``n_rep`` bootstrap resamples of ``examples`` on their entities.

    Args:
        examples: Examples to resample from (with replacement, same size).
        n_rep: Number of resamples to draw and score.
        getter: Optional span getter forwarded to ``Scorer.score_spans``; when
            ``None`` the argument is not passed at all.

    Returns:
        A list with one ``score_spans`` result per resample.
    """
    span_kwargs = {"attr": "ents"}
    if getter is not None:
        span_kwargs["getter"] = getter

    scorer = Scorer()
    return [
        scorer.score_spans(random.choices(examples, k=len(examples)), **span_kwargs)
        for _ in range(n_rep)
    ]


def compute_mean_and_ci(scores):
    """Summarise bootstrap scores as a mean and a 95% percentile interval.

    Args:
        scores: Iterable of score dictionaries, each with an ``"ents_f"`` value
            and an ``"ents_per_type"`` mapping of label to a dictionary holding
            an ``"f"`` value. ``None`` values are ignored, and an
            ``"ents_per_type"`` of ``None`` is treated as having no labels.

    Returns:
        A dictionary whose first key is ``"Average"`` followed by one key per
        label (renamed to its display name where one is known). Each value is
        ``{"mean": ..., "ci": ...}`` where ``ci`` holds the 2.5th and 97.5th
        percentiles, or ``{"mean": None, "ci": None}`` when no score is
        available.
    """

    def summarize(values):
        """Return the mean and percentile interval of the non-``None`` values."""
        values = [value for value in values if value is not None]
        if not values:
            return {"mean": None, "ci": None}
        return {"mean": np.mean(values), "ci": np.percentile(values, [2.5, 97.5])}

    result_dict = {"Average": summarize(score["ents_f"] for score in scores)}

    score_mapping = {
        "PER": "Person",
        "LOC": "Location",
        "LOCATION": "Location",
        "ORG": "Organization",
        "LANGUAGE": "Language",
        "PRODUCT": "Product",
        "LAW": "Law",
        "ORGANIZATION": "Organization",
        "WORK OF ART": "Work of Art",
        "PERSON": "Person",
        "FACILITY": "Facility",
        "GPE": "GPE",
        "EVENT": "Event",
        "CARDINAL": "Cardinal",
        "DATE": "Date",
        "MONEY": "Money",
        "NORP": "NORP",
        "ORDINAL": "Ordinal",
        "PERCENT": "Percent",
        "QUANTITY": "Quantity",
        "TIME": "Time",
        "MISC": "Misc.",
    }

    # A missing per-type breakdown must not crash the summary, mirroring how a
    # missing overall F1 is handled above.
    per_type_scores = [score["ents_per_type"] or {} for score in scores]

    # Sorted so that the key order of the result is the same on every run.
    labels = sorted({label for per_type in per_type_scores for label in per_type})

    for label in labels:
        label_f = [per_type.get(label, {"f": None})["f"] for per_type in per_type_scores]
        result_dict[score_mapping.get(label, label)] = summarize(label_f)
    return result_dict


def create_dataframe(
    examples: List[Example], mdl_name: str, decimals: int = 1, n_rep: int = 100
):
    """Build a one-row table of bootstrapped F1 scores for a single model.

    Args:
        examples: Examples to bootstrap over.
        mdl_name: Model name, stored in the ``"Models"`` column.
        decimals: Number of decimals used when formatting the percentages.
        n_rep: Number of bootstrap resamples.

    Returns:
        A single-row ``DataFrame`` with a ``"Models"`` column and one column
        per key returned by ``compute_mean_and_ci``. Cells are formatted as
        ``"mean (lower, upper)"`` in percent, or ``" "`` when the mean is zero
        or no score is available.
    """
    score = bootstrap(examples, getter=None, n_rep=n_rep)
    score = compute_mean_and_ci(score)

    def score_to_string(score: Dict[str, Any], decimals: int = 1) -> str:
        """Format one mean/interval pair as a percentage string."""
        mean, ci = score["mean"], score["ci"]
        # ``compute_mean_and_ci`` reports None when a label has no usable
        # scores; render it as a blank cell, like a zero, instead of failing.
        if mean is None or ci is None or mean == 0:
            return " "
        return f"{100*mean:.{decimals}f} ({100*ci[0]:.{decimals}f}, {100*ci[1]:.{decimals}f})"

    row = {"Models": mdl_name}
    for key, value in score.items():
        row[key] = score_to_string(value, decimals=decimals)
    return pd.DataFrame([row])

In [4]:
def highlight_max(s: pd.Series) -> list:
    """Highlight the maximum in a Series with bold text.

    Cells are expected to start with a number (e.g. ``"86.1 (82.4, 89.4)"``).
    They are ranked by that leading number rather than as text, so ``"9.5"``
    does not outrank ``"86.1"``. Blank or missing cells are never highlighted.

    Args:
        s: One column or row of the table being styled.

    Returns:
        One CSS string per cell: ``"font-weight: bold"`` for the maximum (all
        tied cells), ``""`` otherwise.
    """
    # First whitespace-separated token -> float; anything unparsable -> NaN.
    values = pd.to_numeric(s.astype(str).str.split().str[0], errors="coerce")
    is_max = values == values.max()
    return ["font-weight: bold" if v else "" for v in is_max]


def underline_second_max(s: pd.Series) -> list:
    """Underline the second maximum in a Series.

    Cells are ranked by their leading number (e.g. ``86.1`` in
    ``"86.1 (82.4, 89.4)"``) rather than as text. Blank or missing cells are
    ignored, and nothing is underlined when fewer than two cells hold a number.

    Args:
        s: One column or row of the table being styled.

    Returns:
        One CSS string per cell: ``"text-decoration: underline"`` for cells
        equal to the second entry in descending order, ``""`` otherwise.
    """
    # First whitespace-separated token -> float; anything unparsable -> NaN.
    values = pd.to_numeric(s.astype(str).str.split().str[0], errors="coerce")
    ranked = values.dropna().sort_values(ascending=False)
    if len(ranked) < 2:
        return [""] * len(s)
    is_second_max = values == ranked.iloc[1]
    return ["text-decoration: underline" if v else "" for v in is_second_max]


def create_table(
    df: pd.DataFrame,
    caption="F1 score with 95% confidence interval calculated using bootstrapping with 100 samples.",
):
    """Turn a table of score strings into a styled table for display.

    Note that ``df`` is modified in place: its index is replaced by a range
    and its columns by a two-level header (``""``/``"Models"`` for the first
    column, ``"F1"``/<name> for the rest).

    Args:
        df: Table whose first column holds the model names and whose other
            columns hold score strings.
        caption: Caption shown with the table.

    Returns:
        A pandas ``Styler`` with the best score of each column in bold, the
        second best underlined, and the index hidden.
    """
    df.index = range(len(df))  # type: ignore

    # Two-level header: every score column is grouped under "F1".
    df.columns = pd.MultiIndex.from_tuples(
        [("", "Models"), *(("F1", col) for col in df.columns[1:])]
    )
    f1_columns = df.columns[1:]

    # Centre the header cells.
    header_css = [
        {"selector": ".level0", "props": [("text-align", "center")]},
        {"selector": ".col_heading", "props": [("text-align", "center")]},
    ]

    return (
        df.style.apply(highlight_max, axis=0, subset=f1_columns)
        .apply(underline_second_max, axis=0, subset=f1_columns)
        .set_caption(caption)
        # scores are right-aligned, model names left-aligned
        .set_properties(subset=f1_columns, **{"text-align": "right"})
        .set_table_styles(header_css)  # type: ignore
        .set_properties(subset=[("", "Models")], **{"text-align": "left"})
        .hide(axis="index")
    )

In [5]:
# One single-row table per model; the fine-grained NER models are left out
# of the DaNE comparison.
tables = []
for mdl, predictions in dane.items():
    if "fine_grained" not in mdl:
        tables.append(create_dataframe(predictions["examples"], mdl, n_rep=500))

# Stack the rows and put the columns in display order.
df = pd.concat(tables)[
    ["Models", "Average", "Location", "Person", "Organization", "Misc."]
]
create_table(
    df,
    "F1 score with 95% confidence interval calculated using bootstrapping with 500 samples.",
)

Unnamed: 0_level_0,F1,F1,F1,F1,F1
Models,Average,Location,Person,Organization,Misc.
saattrupdan/nbailab-base-ner-scandi,"86.1 (82.4, 89.4)","88.2 (82.2, 92.9)","94.9 (91.6, 97.8)","80.3 (74.4, 86.0)","78.4 (70.2, 85.3)"
da_dacy_large_trf-0.2.0,"85.3 (81.4, 89.0)","89.1 (82.9, 94.0)","92.7 (89.0, 95.9)","79.0 (72.3, 85.1)","78.9 (70.8, 86.2)"
da_dacy_medium_trf-0.2.0,"84.8 (80.6, 88.6)","86.6 (80.9, 91.4)","92.5 (88.7, 95.4)","78.5 (71.4, 84.9)","78.9 (70.6, 86.1)"
da_dacy_small_trf-0.2.0,"82.6 (79.4, 85.7)","83.8 (77.9, 89.4)","92.0 (88.1, 95.0)","75.6 (69.7, 81.2)","75.9 (69.2, 82.0)"
alexandrainst/da-ner-base,"70.6 (66.1, 74.9)","84.7 (78.3, 89.8)","90.2 (86.0, 93.8)","64.7 (57.0, 71.4)",
da_core_news_trf-3.5.0,"78.8 (74.9, 82.3)","81.9 (74.9, 88.0)","91.5 (87.9, 94.4)","68.1 (60.7, 74.7)","68.8 (60.2, 76.7)"
da_core_news_lg-3.5.0,"74.5 (70.6, 78.8)","81.5 (74.4, 87.7)","85.3 (80.4, 89.4)","62.6 (54.5, 69.7)","64.4 (55.5, 72.6)"
da_core_news_md-3.5.0,"71.0 (67.1, 74.8)","76.7 (69.2, 83.6)","82.3 (77.2, 86.6)","58.2 (50.1, 66.0)","61.6 (53.3, 70.1)"
da_core_news_sm-3.5.0,"64.2 (60.1, 68.4)","61.5 (52.4, 70.1)","79.7 (74.6, 84.4)","49.1 (40.3, 57.6)","58.4 (49.2, 66.0)"


It is worth mentioning that while `da_dacy_large_trf-0.2.0` and `saattrupdan/nbailab-base-ner-scandi` perform similarly, they have their own strengths and weaknesses. The large DaCy model is a multi-task model performing named-entity recognition as only one of its many tasks, and thus if you wish to use one of those we would recommend that model. On the other hand, `nbailab-base-ner-scandi` is trained on multiple Scandinavian languages and thus might be ideal if your dataset might contain these languages as well. `saattrupdan/nbailab-base-ner-scandi` is available in DaCy using `nlp.add_pipe("dacy/ner")`.

```{admonition} You are missing a model
:class: note

These tables are continually updated and thus we try to limit the number of models to only the most relevant Danish models. Therefore models like Polyglot with strict requirements and consistently worse performance are excluded. If you want to see a specific model, please open an issue on GitHub.
```




## DANSK: Fine-grained Named Entity Recognition

DANSK is annotated from the Danish Gigaword Corpus {cite}`missing` and covers a wide variety of domains including conversational, legal, news, social media, web content, wikis and books. DANSK includes the following labels:


|  Entity        |             Description                                         |
| -------- | ---------------------------------------------------- |
| PERSON   | People, including fictional                          |
| NORP     | Nationalities or religious or political groups       |
| FACILITY | Building, airports, highways, bridges, etc.          |
| ORGANIZATION | Companies, agencies, institutions, etc.              |
| GPE      | Countries, cities, states.                           |
| LOCATION | Non-GPE locations, mountain ranges, bodies of water  |
| PRODUCT  | Vehicles, weapons, foods, etc. (not services)        |
| EVENT    | Named hurricanes, battles, wars, sports events, etc. |
| WORK OF ART | Titles of books, songs, etc.                         |
| LAW      | Named documents made into laws                       |
| LANGUAGE | Any named language                                   |

As well as annotation for the following concepts:

|   Entity       |   Description                                         |
| -------- | ------------------------------------------- |
| DATE     | Absolute or relative dates or periods       |
| TIME     | Times smaller than a day                    |
| PERCENT  | Percentage (including "%")                  |
| MONEY    | Monetary values, including unit             |
| QUANTITY | Measurements, as of weight or distance      |
| ORDINAL  | "first", "second"                           |
| CARDINAL | Numerals that do not fall under another type |


We have here opted to create an interactive chart rather than a table, as a table quickly becomes unruly with this number of labels. The chart is interactive and you can select the label you want to compare the models on. You can also hover over the dots to see the exact values.

In [6]:
# Collect the predictions of every registered model on all three DANSK splits.
dansk = {}
for mdl_name, model_getter in MODELS.items():
    dansk[mdl_name] = mdl_results = apply_models(
        mdl_name,
        model_getter,
        dataset="dansk",
        splits=["train", "dev", "test"],
    )

dansk (train): Running saattrupdan/nbailab-base-ner-scandi


Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-ec592bb9b8d7fe08/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-ec592bb9b8d7fe08/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-ec592bb9b8d7fe08/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


dansk (dev): Running saattrupdan/nbailab-base-ner-scandi


Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-ec592bb9b8d7fe08/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-ec592bb9b8d7fe08/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-ec592bb9b8d7fe08/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


dansk (test): Running saattrupdan/nbailab-base-ner-scandi


Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-ec592bb9b8d7fe08/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-ec592bb9b8d7fe08/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-ec592bb9b8d7fe08/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


dansk (train): Running da_dacy_large_trf-0.2.0


Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-ec592bb9b8d7fe08/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-ec592bb9b8d7fe08/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/Users/au561649/.cache/huggingface/datasets/chcaa___parquet/chcaa--DANSK-ec592bb9b8d7fe08/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [None]:
# Build one single-row score table per fine-grained model on the DANSK test split.
tables = []
# Iterate over the DANSK results themselves (not the DaNE ones) so that the
# keys being filtered are the keys being indexed below.
for mdl in dansk:
    # only fine-grained models for DANSK
    if "fine_grained" not in mdl:
        continue
    tables.append(create_dataframe(dansk[mdl]["test"]["examples"], mdl, n_rep=100))

df = pd.concat(tables)

In [None]:
import altair as alt


def create_dansk_viz(df: pd.DataFrame):
    """Build an interactive chart of per-label F1 scores with their intervals.

    Args:
        df: Wide table with a ``"Models"`` column and one column per label
            whose cells look like ``"86.1 (82.4, 89.4)"``. Blank cells
            (``" "``) and missing cells (float NaN) are accepted and become
            NaN in the numeric columns.

    Returns:
        A layered Altair chart (points plus error bars) in which a label is
        selected through the legend; ``"Average"`` is selected initially.
    """
    nan = float("nan")

    def parse_score(cell):
        """Split one ``"F1 (lower, upper)"`` cell into three floats."""
        if isinstance(cell, float):
            # Already numeric (typically NaN for a missing cell): there is no
            # interval to extract.
            return cell, nan, nan
        if not isinstance(cell, str) or not cell.strip():
            # Blank cells carry no score.
            return nan, nan, nan
        f1 = float(cell.split()[0])
        lower = float(cell.split("(")[1].split(",")[0])
        upper = float(cell.split(",")[1].split(")")[0])
        return f1, lower, upper

    plot_df = df.melt(
        id_vars=["Models"],
        var_name="Label",
        value_name="F1 string",
    )

    # Convert the score strings to numeric columns
    parsed = [parse_score(cell) for cell in plot_df["F1 string"]]
    plot_df["F1"] = [f1 for f1, _, _ in parsed]
    plot_df["CI Lower"] = [lower for _, lower, _ in parsed]
    plot_df["CI Upper"] = [upper for _, _, upper in parsed]

    selection = alt.selection_point(
        fields=["Label"],
        bind="legend",
        value=[{"Label": "Average"}],
    )

    base = (
        alt.Chart(plot_df)
        .mark_point(filled=True, size=100)
        .encode(
            x=alt.X("F1", title="F1"),
            y="Models",
            color="Label",
            tooltip=[
                "Models",
                "Label",
                alt.Tooltip("F1 string", title="F1"),
            ],
            # unselected labels are made fully transparent
            opacity=alt.condition(selection, alt.value(1), alt.value(0.0)),
        )
    )
    error_bars = (
        alt.Chart(plot_df)
        .mark_errorbar(ticks=False)
        .encode(
            x=alt.X("CI Lower", title="F1"),
            x2="CI Upper",
            y="Models",
            color="Label",
            opacity=alt.condition(selection, alt.value(1), alt.value(0.0)),
        )
    )

    chart = base + error_bars

    return chart.add_params(selection).properties(width=800, height=400)

In [None]:
create_dansk_viz(df)

In [None]:
# Index the score table by model and order its columns by label group.
_df = df
_df = _df.set_index("Models")

# Named-entity labels.
ent_columns = sorted(
    [
        "Event",
        "Organization",
        "Language",
        "Person",
        "NORP",
        "Work of Art",
        "Facility",
        "Law",
        "Location",
        "Product",
        "GPE",
    ]
)
# Labels for the remaining concepts (dates, numbers, quantities, ...).
# "Ordinal" belongs here, matching the label tables earlier in this document.
non_ent_columns = sorted(
    ["Cardinal", "Date", "Money", "Ordinal", "Percent", "Quantity", "Time"]
)
columns_to_keep = ent_columns + non_ent_columns + ["Average"]

_df = _df[columns_to_keep]

In [None]:
# Labels become rows and models become columns.
table = _df.T

# Row index: (group, label) pairs in the same order as the selected columns.
iidx = pd.MultiIndex.from_tuples(
    [("Entities", label) for label in ent_columns]
    + [("Non-Entities", label) for label in non_ent_columns]
    + [("Average", "Average")]
)
table.index = iidx

# Column header: short model names grouped under one super header.
mdl_names = ["Large 0.1.0", "Medium 0.1.0", "Small 0.1.0"]
header = pd.MultiIndex.from_tuples(
    [("Fine-grained Models", short_name) for short_name in mdl_names]
)
table.columns = header

In [None]:
# Create the styler and LaTeX-escape the column labels, then the row labels.
style = table.style
style = style.format_index(escape="latex", axis=1)
style = style.format_index(escape="latex", axis=0)


# highlight the maximum
def italicize_second_max(s: pd.Series) -> list:
    """Italicize the second maximum in a Series.

    Cells are ranked by their leading number (e.g. ``86.1`` in
    ``"86.1 (82.4, 89.4)"``) rather than as text. Blank or missing cells are
    ignored.

    Args:
        s: One column or row of the table being styled.

    Returns:
        One CSS string per cell. When the maximum is shared by several cells
        every cell gets ``"font-style: normal"``. Otherwise cells equal to the
        second-largest value get ``"font-style: italic"`` and the rest ``""``;
        nothing is italicized when fewer than two cells hold a number.
    """
    # First whitespace-separated token -> float; anything unparsable -> NaN.
    values = pd.to_numeric(s.astype(str).str.split().str[0], errors="coerce")

    # if there is more than one maximum, don't italicize
    if (values == values.max()).sum() > 1:
        return ["font-style: normal" for _ in s]

    ranked = values.dropna().sort_values(ascending=False)
    if len(ranked) < 2:
        return [""] * len(s)
    is_second_max = values == ranked.iloc[1]
    return ["font-style: italic" if v else "" for v in is_second_max]


# Header alignment and caption used by the styled table.
super_header_style = [
    {"selector": ".level0", "props": [("text-align", "center")]},
    {"selector": ".col_heading", "props": [("text-align", "center")]},
]
caption = "F1 score with 95% confidence interval calculated using bootstrapping with 100 samples."

# Per row: best score in bold, runner-up in italics.
style = (
    style.apply(highlight_max, axis=1)
    .apply(italicize_second_max, axis=1)
    .set_table_styles(super_header_style)
    .set_caption(caption)
)
style

# latex = style.to_latex(
#         hrules=True,
#         convert_css=True,
#     )

# print(latex)

## Generalization
To examine model generalization, we utilize the [DANSK](https://huggingface.co/datasets/chcaa/DANSK) dataset {cite}`missing`. This dataset is annotated across many different domains including fiction, web content, social media, wikis, news, legal and conversational data. The original dataset includes annotations corresponding to the OntoNotes standard (see [getting started](https://centre-for-humanities-computing.github.io/DaCy/tutorials/basic.html#fine-grained-ner) for the full list). To test the generalization, we here convert the annotations to the CoNLL-2003 format using the labels `Person`, `Location`, `Organization`. As in CoNLL-2003, `Location` includes cities, roads, mountains, abstract places, specific buildings, and meeting points. Thus the `GPE` (geo-political entity) annotations were converted to `Location`. The `MISC` category in CoNLL-2003 is a diverse category meant to denote all names not in other categories (encapsulating both e.g. events and adjectives such as ”2004 World Cup” and ”Italian”), and is therefore not included.

In [None]:

def convert_to_conll_2003(
    examples,
    mapping=None,
) -> list:
    """Reduce the entities of every example to the CoNLL-2003 style labels.

    Entities whose label is not a key of ``mapping`` are dropped; the others
    are relabelled. Both the reference and the predicted document of each
    example are modified in place.

    Args:
        examples: Examples exposing ``reference`` and ``predicted`` documents.
        mapping: Source label -> target label. Defaults to mapping the
            OntoNotes-style person, location/GPE and organization labels (and
            the already converted ``PER``/``LOC``/``ORG``) to ``PER``, ``LOC``
            and ``ORG``.

    Returns:
        The same ``examples`` object that was passed in.
    """
    if mapping is None:
        mapping = {
            "PERSON": "PER",
            "GPE": "LOC",
            "LOCATION": "LOC",
            "ORGANIZATION": "ORG",
            "PER": "PER",
            "LOC": "LOC",
            "ORG": "ORG",
        }

    def doc_to_conll_2003(doc):
        """Filter and relabel the entities of ``doc`` in place."""
        ents = [ent for ent in doc.ents if ent.label_ in mapping]
        for ent in ents:
            ent.label_ = mapping[ent.label_]
        doc.ents = ents
        return doc

    for example in examples:
        # The documents are updated in place, so nothing is assigned back.
        # This also avoids writing to ``example.x`` / ``example.y``.
        # NOTE(review): spaCy appears to expose those as read-only — confirm.
        doc_to_conll_2003(example.reference)
        doc_to_conll_2003(example.predicted)
    return examples

    




In [None]:
# Convert the predictions of every model, across all DANSK splits, to the
# CoNLL-2003 style labels.
for mdl_name in dansk:
    # Build a new list: extending the test list with ``+=`` would add the dev
    # and train examples to the stored test split itself.
    examples = [
        *dansk[mdl_name]["test"]["examples"],
        *dansk[mdl_name]["dev"]["examples"],
        *dansk[mdl_name]["train"]["examples"],
    ]

    examples = convert_to_conll_2003(examples)

In [None]:
from performance_testing_utils.generalization_utils import (
    dansk,
    convert_to_conll_2003,
    MDL_GETTER_DICT,
    evaluate_generalization,
    create_generation_viz,
)

# Load DANSK and convert every split to the CoNLL-2003 style labels.
train, dev, test = dansk()
for split_docs in (train, dev, test):
    convert_to_conll_2003(split_docs)

dataset = train + dev + test

# After conversion only the three CoNLL-2003 style labels may remain.
assert {ent.label_ for doc in dataset for ent in doc.ents} == {"PER", "LOC", "ORG"}

save_folder = Path("performance_tables/ner")
save_folder.mkdir(exist_ok=True, parents=True)

tables = []
# Group the documents by the domain recorded in their metadata.
domains = {}
for doc in dataset:
    domains.setdefault(doc._.meta["dagw_domain"], []).append(doc)

# Evaluate each model once per domain; results are cached as CSV files.
for mdl, getter in MDL_GETTER_DICT.items():
    mdl_name = mdl.replace("/", "_")
    save_path = save_folder / f"{mdl_name}_generalization.csv"
    if save_path.exists():
        print(f"- {mdl} already exists, loading in dataframe")
    else:
        nlp = getter()
        result_df = evaluate_generalization(
            mdl_name=mdl, mdl=nlp, domains_dataset_dict=domains
        )
        result_df.to_csv(save_path, index=False)
    # always read the CSV back so that fresh and cached results share the
    # same representation
    result_df = pd.read_csv(save_path)
    tables.append(result_df)