# Check DaCy robustness against augmented names

In [1]:
import os
# Move the working directory one level up (presumably to the repository root so
# that the package and relative paths resolve from there — TODO confirm).
# NOTE: this is not idempotent; every re-run of the cell moves up one more level.
os.chdir("..")

In [2]:
import dacy
from dacy.augmenters import create_pers_augmenter
from dacy.datasets import danish_names, muslim_names
from dacy.score import score

import spacy

from typing import Callable, List

from functools import partial

Start off by loading the test set and defining a function that applies the small spaCy model to the data.

In [3]:
# Load the three requested splits via `dacy.datasets.dane`; only `test` is
# referenced by the cells visible in this notebook section.
train, dev, test = dacy.datasets.dane(splits=["train", "dev", "test"])
# Small Danish spaCy pipeline, the first model to be scored.
nlp = spacy.load("da_core_news_sm")


def apply_model(example, nlp):
    """Re-run ``nlp`` on the text of an example's predicted document.

    Args:
        example: Object whose ``predicted`` attribute exposes ``.text``
            (a spaCy ``Example`` when used with ``score`` in this notebook).
        nlp: Callable that maps a text string to a processed document.

    Returns:
        The same ``example`` object; its ``predicted`` attribute has been
        replaced in place by the output of ``nlp``.
    """
    raw_text = example.predicted.text
    example.predicted = nlp(raw_text)
    return example


# Bind the spaCy pipeline to apply_model so the resulting callable only needs
# an example; this is the form handed to `score` as its apply function below.
apply_spacy_model = partial(apply_model, nlp=nlp)

In [4]:
# Display the repr of the loaded test split.
test

<spacy.training.corpus.Corpus at 0x10a01ea60>

Let's test how well the model performs on the original data, on data where names are changed to other Danish names, and on data where names are changed to names of Muslim origin. The name augmenter allows us to specify a number of naming patterns we wish to augment the names to. The defaults are `["fn,ln", "abbpunct,ln"]`, which means names are augmented to follow the pattern of either "first_name last_name" (e.g. Mette Frederiksen) or "abbreviated_first_name last_name" (e.g. M. Frederiksen). The available pattern elements are "fn" (first name, Mette), "ln" (last name, Frederiksen), "abb" (abbreviated, M), and "abbpunct" (abbreviated + ., M.). These patterns can be combined however you see fit. We will stick to the defaults for now.

In [5]:
# Name dictionaries from which the augmenters pick replacement names.
dk_name_dict = danish_names()
muslim_name_dict = muslim_names()

# Set keep_name to False to make the augmenter choose a new name from the dictionary;
#   otherwise, it would simply make the existing name fit the pattern (e.g. make abbreviations).
# force_size ensures that the names are of the same length/format as the pattern.
# No `patterns` argument is passed, so the augmenter's default patterns apply.
dk_aug = create_pers_augmenter(dk_name_dict, force_size=True, keep_name=False)
muslim_aug = create_pers_augmenter(muslim_name_dict, force_size=True, keep_name=False)


In [6]:
# Score the spaCy model on the test set with the "ents" scorer in three settings:
# no augmentation, Danish-name augmentation, and Muslim-name augmentation.
scores_raw = score(test, apply_spacy_model, score_fn=["ents"])
scores_dk = score(test, apply_spacy_model, augmenter=dk_aug, score_fn=["ents"])
scores_muslim = score(test, apply_spacy_model, augmenter=muslim_aug, score_fn=["ents"])

In [7]:
# Combine the three score sets into one object and show it as a table.
# Judging by the output, rows follow the order of addition: raw, Danish, Muslim.
scores = scores_raw + scores_dk + scores_muslim
scores.to_df()

Unnamed: 0,ents_p,ents_r,ents_f,ents_per_type_PER_p,ents_per_type_PER_r,ents_per_type_PER_f,ents_per_type_LOC_p,ents_per_type_LOC_r,ents_per_type_LOC_f,ents_per_type_MISC_p,ents_per_type_MISC_r,ents_per_type_MISC_f,ents_per_type_ORG_p,ents_per_type_ORG_r,ents_per_type_ORG_f
0,0.719262,0.629032,0.671128,0.768421,0.811111,0.789189,0.673267,0.708333,0.690355,0.68,0.561983,0.615385,0.71134,0.428571,0.534884
1,0.72541,0.634409,0.676864,0.760204,0.827778,0.792553,0.68,0.708333,0.693878,0.701031,0.561983,0.623853,0.726316,0.428571,0.539062
2,0.700624,0.603943,0.648701,0.752874,0.727778,0.740113,0.653846,0.708333,0.68,0.69,0.570248,0.624434,0.669903,0.428571,0.522727


Augmenting names to Danish names that fit the pattern of either "fn,ln" or "abbpunct,ln" actually made the task slightly easier for the model than the raw test data. However, augmenting with Muslim names made the model perform a fair bit worse than the baseline - look at the recall for PER!

Let's see how well the model handles names that start with an abbreviation. We will set `force_size` to `False` so that only the first word is augmented. `keep_name` will be `True`, which ensures that the original name is kept and only its first word is modified.

In [8]:
# Augmenter that keeps the original name (keep_name=True) and only applies the
# "abbpunct" pattern; force_size=False so the name is not forced to the pattern's length.
abb_aug = create_pers_augmenter(dk_name_dict, patterns=["abbpunct"], force_size=False, keep_name=True)
scores_abb = score(test, apply_spacy_model, augmenter=abb_aug, score_fn=["ents"])
# Rebuild the combined table from its four parts instead of `scores += scores_abb`:
# the in-place form appends another abbreviation row on every re-run of this cell
# and silently changes the object displayed by the earlier cell.
scores = scores_raw + scores_dk + scores_muslim + scores_abb
scores.to_df()

Unnamed: 0,ents_p,ents_r,ents_f,ents_per_type_PER_p,ents_per_type_PER_r,ents_per_type_PER_f,ents_per_type_LOC_p,ents_per_type_LOC_r,ents_per_type_LOC_f,ents_per_type_MISC_p,ents_per_type_MISC_r,ents_per_type_MISC_f,ents_per_type_ORG_p,ents_per_type_ORG_r,ents_per_type_ORG_f
0,0.719262,0.629032,0.671128,0.768421,0.811111,0.789189,0.673267,0.708333,0.690355,0.68,0.561983,0.615385,0.71134,0.428571,0.534884
1,0.72541,0.634409,0.676864,0.760204,0.827778,0.792553,0.68,0.708333,0.693878,0.701031,0.561983,0.623853,0.726316,0.428571,0.539062
2,0.700624,0.603943,0.648701,0.752874,0.727778,0.740113,0.653846,0.708333,0.68,0.69,0.570248,0.624434,0.669903,0.428571,0.522727
3,0.705757,0.59319,0.644596,0.732558,0.7,0.715909,0.68,0.708333,0.693878,0.686869,0.561983,0.618182,0.704082,0.428571,0.532819


In [10]:
# Score the unaugmented test set a second time (presumably a sanity check that
# the baseline is unchanged after the augmented runs — TODO confirm intent).
scores_raw_2 = score(test, apply_spacy_model, score_fn=["ents"])
scores_raw_2.to_df()

Unnamed: 0,ents_p,ents_r,ents_f,ents_per_type_PER_p,ents_per_type_PER_r,ents_per_type_PER_f,ents_per_type_LOC_p,ents_per_type_LOC_r,ents_per_type_LOC_f,ents_per_type_MISC_p,ents_per_type_MISC_r,ents_per_type_MISC_f,ents_per_type_ORG_p,ents_per_type_ORG_r,ents_per_type_ORG_f
0,0.719262,0.629032,0.671128,0.768421,0.811111,0.789189,0.673267,0.708333,0.690355,0.68,0.561983,0.615385,0.71134,0.428571,0.534884


In [18]:
# Print the signature and docstring of `score` for reference.
help(score)

Help on function score in module dacy.testing.score:

score(corpus: 'Corpus', apply_fn: 'Callable', score_fn: 'List[Union[Callable, str]]' = ['toquitken', 'pos', 'ents'], augmenter: 'Optional[Callable]' = None, k: 'int' = 1, nlp: 'Optional[Language]' = None, **kwargs) -> 'Scores'
    scores a models performance on a given corpus with potentially augmentations applied to it.
    
    Args:
        corpus (Corpus): A spacy Corpus
        apply_fn (Callable): A wrapper function for the model you wish to score. The model should take in a spacy Example and output a tagged version of it.
        score_fn (List[Union[Callable, str]], optional): A scoring function which takes in a list of examples and return a dictionary of the form {"score_name": score}.
            Four potiential strings are valid. "ents" for measuring the performance of entity spans. "pos" for measuring the performance of pos-tags.
            "token" for measuring the performance of tokenization. "nlp" for measuring the p

The model loses some performance, but not too much.

Let's compare DaCy small and large on the same tasks.


In [3]:
# Load both DaCy models for the comparison. The large model must be loaded here:
# `dacy_large` is used when the model wrappers are built further down, and with
# this line commented out the notebook fails with a NameError on a fresh kernel.
dacy_large = dacy.load("da_dacy_large_tft-0.0.0")
dacy_small = dacy.load("da_dacy_small_tft-0.0.0")

d845d4fef9ea165ee7bd6dd954b95de2?download: 0.00B [00:00, ?B/s][INFO] Downloading '{model}'
d845d4fef9ea165ee7bd6dd954b95de2?download: 52.6MB [03:23, 259kB/s]                            


OSError: [E050] Can't find model '/Users/au561649/.dacy/da_dacy_small_tft-0.0.0'. It doesn't seem to be a Python package or a valid path to a data directory.

In [4]:
# Display the repr of the loaded small DaCy pipeline.
dacy_small


<spacy.lang.da.Danish at 0x1632d1c40>

In [9]:
def score_augmenters(dataset, augmenters: List[Callable], apply_fn: Callable):
    """Score a model on ``dataset`` once without augmentation and once per augmenter.

    Args:
        dataset: Corpus passed as the first argument to ``score``.
        augmenters: Augmenters to evaluate, one ``score`` call each, in order.
        apply_fn: Callable that applies the model to an example.

    Returns:
        The accumulated scores: the unaugmented baseline first, followed by
        the result for each augmenter in the order given. Only the "ents"
        scorer is used.
    """
    # Baseline (no augmenter) comes first so it ends up as the first entry.
    combined = score(dataset, apply_fn=apply_fn, score_fn=["ents"])
    for aug in augmenters:
        augmented_result = score(dataset, augmenter=aug, apply_fn=apply_fn, score_fn=["ents"])
        combined += augmented_result
    return combined

In [10]:
# Augmenters to evaluate, in the order their rows appear after the baseline row.
augmenters = [dk_aug, muslim_aug, abb_aug]
# Bind each DaCy pipeline to apply_model so the wrappers take only an example.
apply_large_dacy = partial(apply_model, nlp=dacy_large)
apply_small_dacy = partial(apply_model, nlp=dacy_small)

In [11]:
# Large DaCy model: baseline row first, then one row per entry in `augmenters`
# (dk_aug, muslim_aug, abb_aug).
score_large_dacy = score_augmenters(test, augmenters, apply_large_dacy)
score_large_dacy.to_df()

Unnamed: 0,ents_p,ents_r,ents_f,ents_per_type_LOC_p,ents_per_type_LOC_r,ents_per_type_LOC_f,ents_per_type_PER_p,ents_per_type_PER_r,ents_per_type_PER_f,ents_per_type_MISC_p,ents_per_type_MISC_r,ents_per_type_MISC_f,ents_per_type_ORG_p,ents_per_type_ORG_r,ents_per_type_ORG_f
0,0.833333,0.797491,0.815018,0.798165,0.90625,0.84878,0.89697,0.822222,0.857971,0.801587,0.834711,0.817814,0.813433,0.677019,0.738983
1,0.866055,0.845878,0.855848,0.790909,0.90625,0.84466,0.940541,0.966667,0.953425,0.801587,0.834711,0.817814,0.887097,0.68323,0.77193
2,0.868132,0.849462,0.858696,0.798165,0.90625,0.84878,0.925926,0.972222,0.948509,0.809524,0.842975,0.825911,0.901639,0.68323,0.777385
3,0.833333,0.797491,0.815018,0.798165,0.90625,0.84878,0.89697,0.822222,0.857971,0.801587,0.834711,0.817814,0.813433,0.677019,0.738983


In [12]:
# Small DaCy model: baseline row first, then one row per entry in `augmenters`
# (dk_aug, muslim_aug, abb_aug).
score_small_dacy = score_augmenters(test, augmenters, apply_small_dacy)
score_small_dacy.to_df()

Unnamed: 0,ents_p,ents_r,ents_f,ents_per_type_LOC_p,ents_per_type_LOC_r,ents_per_type_LOC_f,ents_per_type_PER_p,ents_per_type_PER_r,ents_per_type_PER_f,ents_per_type_MISC_p,ents_per_type_MISC_r,ents_per_type_MISC_f,ents_per_type_ORG_p,ents_per_type_ORG_r,ents_per_type_ORG_f
0,0.703565,0.672043,0.687443,0.752294,0.854167,0.8,0.805031,0.711111,0.755162,0.635659,0.677686,0.656,0.610294,0.515528,0.558923
1,0.712747,0.71147,0.712108,0.725664,0.854167,0.784689,0.814208,0.827778,0.820937,0.625954,0.677686,0.650794,0.646154,0.521739,0.57732
2,0.713249,0.704301,0.708747,0.741071,0.864583,0.798077,0.803279,0.816667,0.809917,0.653226,0.669421,0.661224,0.621212,0.509317,0.559727
3,0.703565,0.672043,0.687443,0.752294,0.854167,0.8,0.805031,0.711111,0.755162,0.635659,0.677686,0.656,0.610294,0.515528,0.558923


In [13]:
# Small spaCy model, scored through the same helper so the rows line up with the
# DaCy tables: baseline first, then dk_aug, muslim_aug, abb_aug.
score_small_spacy = score_augmenters(test, augmenters, apply_spacy_model)
score_small_spacy.to_df()

Unnamed: 0,ents_p,ents_r,ents_f,ents_per_type_LOC_p,ents_per_type_LOC_r,ents_per_type_LOC_f,ents_per_type_PER_p,ents_per_type_PER_r,ents_per_type_PER_f,ents_per_type_MISC_p,ents_per_type_MISC_r,ents_per_type_MISC_f,ents_per_type_ORG_p,ents_per_type_ORG_r,ents_per_type_ORG_f
0,0.705757,0.59319,0.644596,0.68,0.708333,0.693878,0.732558,0.7,0.715909,0.686869,0.561983,0.618182,0.704082,0.428571,0.532819
1,0.728016,0.637993,0.680038,0.68,0.708333,0.693878,0.757576,0.833333,0.793651,0.69697,0.570248,0.627273,0.75,0.428571,0.545455
2,0.697531,0.607527,0.649425,0.666667,0.708333,0.686869,0.734807,0.738889,0.736842,0.69,0.570248,0.624434,0.669903,0.428571,0.522727
3,0.705757,0.59319,0.644596,0.68,0.708333,0.693878,0.732558,0.7,0.715909,0.686869,0.561983,0.618182,0.704082,0.428571,0.532819


As you can see, the models obtain slightly different performance with the `dk_aug` and `muslim_aug` per run. This is because names are randomly sampled each time, where some names might be easier to predict than others. To account for this, `score` includes a `k` argument which you can use to run the model `k` times for a more robust performance estimate. 

## Weird stuff is happening with the score function -> row 1 and 4 should be the same across runs (tested and is the case in `test_name_augmenter`)