# Augmentation
This notebook recreates Table X from the paper XX (**TODO**: fill in the table number and the paper reference) and illustrates how to use the augmenters and scoring functions included in DaCy.

In [1]:
import os

# Assumes the notebook is started from inside the dacy folder: step up one level.
parent_dir = ".."
os.chdir(parent_dir)

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt
# DaNLP's BERT model needs this exact transformers version; --no-deps keeps pip
# from changing the other installed packages.
%pip install transformers==3.5.1 --no-deps

In [3]:
from functools import partial

import pandas as pd
import spacy
from spacy.tokens import Span
from spacy.training.augment import create_lower_casing_augmenter, dont_augment

import dacy
from dacy.augmenters import create_pers_augmenter, create_keyboard_augmenter, create_æøå_augmenter
from dacy.datasets import danish_names, muslim_names
from dacy.score import score, n_sents_score

ModuleNotFoundError: No module named 'dacy'

# The dataset: DaNE
Start off by loading the test set of the DaNE dataset.

In [4]:
# Only the test split of DaNE is requested.
dane_splits = ["test"]
test = dacy.datasets.dane(splits=dane_splits)

# Augmenters

Create a list of augmenters to apply to the test data when scoring each model.

In [2]:
# Name dictionaries used to randomly replace person names
dk_name_dict = danish_names()
muslim_name_dict = muslim_names()

# Both name augmenters share the same settings and differ only in the name source
pers_aug_settings = {"force_size": True, "keep_name": False}
dk_aug = create_pers_augmenter(dk_name_dict, **pers_aug_settings)
muslim_aug = create_pers_augmenter(muslim_name_dict, **pers_aug_settings)

# Keyboard augmenters: randomly change 5% / 15% of the characters to a neighbouring key
keyboard_aug_05, keyboard_aug_15 = (
    create_keyboard_augmenter(doc_level=1, char_level=char_rate, keyboard="QWERTY_DA")
    for char_rate in (0.05, 0.15)
)

# Spelling augmenter: æ -> ae, ø -> oe, å -> aa
æøå_aug = create_æøå_augmenter(doc_level=1, char_level=1)

# Lower-casing augmenter
lower_case_aug = create_lower_casing_augmenter(level=1)

# dont_augment comes first in the list and leaves the text unchanged
augmenters = [
    dont_augment,
    keyboard_aug_05,
    keyboard_aug_15,
    æøå_aug,
    lower_case_aug,
    dk_aug,
    muslim_aug,
]

NameError: name 'danish_names' is not defined

# Apply functions
Define apply functions for the models that need one. spaCy and DaCy pipelines are passed to `score` directly, so no apply function is needed for them.

In [5]:
def _split_iob(label):
    """Split an IOB label into ``(prefix, entity_type)``.

    ``"O"`` gives ``("O", None)``; ``"B-PER"`` gives ``("B", "PER")``. Only the
    first dash separates prefix and type, so entity types may contain dashes.
    A label other than ``"O"`` without a dash raises ``ValueError``.
    """
    if label == "O":
        return "O", None
    iob, ent_type = label.split("-", 1)
    return iob, ent_type


def apply_bert_model(example, bert_model):
    """Set the entities of ``example.predicted`` from a BERT NER model.

    The token texts of ``example.predicted`` (i.e. spaCy's tokenization) are
    passed to ``bert_model.predict`` and the returned IOB labels are converted
    into spans, which replace the entities of the predicted doc.

    DaNLP's BERT model requires transformers==3.5.1
    (install with pip install transformers==3.5.1 --no-deps).

    Args:
        example: Object with a ``predicted`` doc; presumably a spaCy
            ``Example`` (confirm against the caller).
        bert_model: Object whose ``predict`` takes a list of token texts and
            returns a ``(tokens, labels)`` pair with one IOB label per token.

    Returns:
        The same ``example``, with the entities of ``example.predicted`` set.
    """
    doc = example.predicted
    # uses spacy tokenization
    _, labels = bert_model.predict([t.text for t in doc])
    # never read labels that have no matching token in the doc
    labels = list(labels)[: len(doc)]

    ents = []
    start = None  # index of the first token of the entity currently being built
    prev_type = None  # entity type of the previous token, None if it was "O"

    # turn IOB labels into spans
    for i, label in enumerate(labels):
        iob, ent_type = _split_iob(label)
        if ent_type is None:
            prev_type = None
            continue

        # an "I" tag that does not continue an entity of the same type (start of
        # the doc, after "O", or after another type) starts a new entity
        if iob == "B" or prev_type != ent_type:
            start = i

        if i + 1 < len(labels):
            next_iob, next_type = _split_iob(labels[i + 1])
        else:
            next_iob, next_type = "O", None

        # close the entity unless the next token continues it with the same type
        if next_iob != "I" or next_type != ent_type:
            ents.append(Span(doc, start, i + 1, label=ent_type))
        prev_type = ent_type

    doc.set_ents(ents)
    example.predicted = doc
    return example

# Models
A dictionary of the models to apply. To save memory, the models are loaded one at a time.

In [12]:
from danlp.models import load_bert_ner_model
from NERDA.precooked import DA_BERT_ML

# spaCy and DaCy pipelines are stored by name
pipeline_names = {
    "spacy_small": "da_core_news_sm",
    "spacy_medium": "da_core_news_md",
    "spacy_large": "da_core_news_lg",
    "dacy_small": "da_dacy_small_tft-0.0.0",
    "dacy_medium": "da_dacy_medium_tft-0.0.0",
    "dacy_large": "da_dacy_large_tft-0.0.0",
}

# BERT models are stored as the callable that builds them
bert_builders = {
    "danlp_bert": load_bert_ner_model,
    "nerda_bert": DA_BERT_ML,
}

# Same keys and order as a single literal: pipelines first, then the BERT models
model_dict = {**pipeline_names, **bert_builders}


NameError: name 'dacy_small' is not defined

# Performance

In [None]:
# Score every model with all augmenters, loading one model at a time.
score_frames = []
for mdl in model_dict:

    # load model
    if "dacy" in mdl:
        apply_fn = dacy.load(model_dict[mdl])
    elif "spacy" in mdl:
        apply_fn = spacy.load(model_dict[mdl])
    else:
        # the remaining entries hold a callable that builds the BERT model
        # NOTE(review): NERDA's precooked models may need their network loaded
        # first and may use a different predict() signature than DaNLP - confirm
        # that apply_bert_model works for "nerda_bert".
        bert = model_dict[mdl]()
        apply_fn = partial(apply_bert_model, bert_model=bert)

    # apply model
    scores_ = score(corpus=test, apply_fn=apply_fn, augmenters=augmenters)
    scores_["model"] = mdl
    score_frames.append(scores_)

# concatenate once instead of growing the frame inside the loop
scores = pd.concat(score_frames)

In [None]:
# Write the collected scores to a CSV file in the current working directory.
output_path = "augmentation_performance.csv"
scores.to_csv(output_path)