In [1]:
import augmenty 

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


## Testing spaCy Models & Augmenty

In [42]:
import spacy

# Load the spaCy model "da_core_news_md"; its entity annotations are what
# the augmenter below acts on.
nlp = spacy.load("da_core_news_md")

# Replacement names, keyed by the slot names referenced in `patterns`.
names = {
    "first_name": ["Laura", "Peter", "Poul"],
    "last_name": ["Petersen", "Jensen", "Andersen"],
}

# Allowed shapes of a replacement name: first name only, first + last,
# or first + two last names.
patterns = [
    ["first_name"],
    ["first_name", "last_name"],
    ["first_name", "last_name", "last_name"],
]

# Entity label used for persons (the model prints "PER" for person entities).
person_tag = "PER"

# level=1 replaces 100% of names.
per_augmenter = augmenty.load(
    "per_replace.v1",
    patterns=patterns,
    names=names,
    level=1,
    person_tag=person_tag,  # reuse the variable rather than repeating the literal
    replace_consistency=True,
)

# Example sentences to augment.
texts = [
    "Anton Antonsen er rigtig god til at tale spansk",
    "Lars Jensen elsker at danse salsa",
    "Lars Krogh elsker pizza",
]

# Augment the texts with the custom augmenter; the list is the cell's output.
list(augmenty.texts(texts, per_augmenter, nlp))

['Poul Petersen er rigtig god til at tale spansk',
 'Laura Andersen elsker at danse salsa',
 'Laura Jensen Jensen elsker pizza']

In [43]:
# Run the currently loaded pipeline on one sentence and print every
# detected entity next to its label.
doc = nlp("Anton Antonsen er rigtig god til at tale spansk")

for ent in doc.ents:
    print(ent, ent.label_)

Anton Antonsen PER
spansk MISC


## Testing Flair and Polyglot

In [19]:
from danlp.models import load_flair_ner_model
from flair.data import Sentence

# Flair NER tagger as provided by DaNLP.
tagger = load_flair_ner_model()

# Wrap the raw text in a Flair Sentence and run the tagger on it; the
# resulting spans are read back from `sentence` in the next cell.
text = "Morten Østergaard bor i København tæt på Kongens Nytorv"
sentence = Sentence(text)
tagger.predict(sentence)

2023-01-29 12:57:39,634 loading file /home/coder/.danlp/flair.ner.pt


In [39]:
# Print each span stored on the sentence under the "ner" key.
for entity in sentence.get_spans("ner"):
    print(entity)

Span [1,2]: "Morten Østergaard"   [− Labels: PER (0.9968)]
Span [5]: "København"   [− Labels: LOC (0.9944)]
Span [8,9]: "Kongens Nytorv"   [− Labels: LOC (0.6427)]


In [None]:
import polyglot
from polyglot.text import Text, Word

# Build a polyglot Text with a Danish language hint and print its entities.
# NOTE(review): this rebinds `text`, which held a plain str in the Flair
# cell, and the cell shows "In [None]" (no recorded run) — confirm it works.
text = Text(
    u"Peter Jensen elsker at danse salsa",
    hint_language_code="da",
)
print(text.entities)

## DaCy Models

In [47]:
# Import here so the cell also works on a fresh kernel: in file order this
# cell comes before the one that originally imported dacy.
import dacy

# List the available DaCy model names.
dacy.models()

['da_dacy_small_tft-0.0.0',
 'da_dacy_medium_tft-0.0.0',
 'da_dacy_large_tft-0.0.0',
 'da_dacy_small_trf-0.1.0',
 'da_dacy_medium_trf-0.1.0',
 'da_dacy_large_trf-0.1.0']

In [2]:
import dacy
# Load the DaCy model "da_dacy_large_trf-0.1.0" (one of the names listed by
# dacy.models()). This rebinds `nlp`, which earlier held the spaCy pipeline.
nlp = dacy.load("da_dacy_large_trf-0.1.0")

In [3]:
# Same sentence as in the spaCy test, now run through whatever pipeline
# `nlp` currently refers to; print each entity with its label.
doc = nlp("Anton Antonsen er rigtig god til at tale spansk")

for ent in doc.ents:
    print(ent, ent.label_)

Anton Antonsen PER
spansk MISC


  matches = self.matcher(doc, allow_missing=True, as_spans=False)


## Stanza

In [1]:
import stanza
from tqdm import tqdm

# Tokenizer + NER pipeline.
# NOTE(review): the pipeline is created with lang='en' while the sentence
# below is Danish — confirm the English models are intended here.
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')
doc = nlp("Anton Antonsen er rigtig god til at tale spansk")

# One "token / ner" line per token, across all sentences of the document.
print(
    '\n'.join(
        f'token: {token.text}\tner: {token.ner}'
        for sent in doc.sentences
        for token in sent.tokens
    )
)

2023-01-29 14:22:40 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/tokenize/combined.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/ner/ontonotes.pt:   0%|        …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/backward_charlm/1billion.pt:   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/pretrain/fasttextcrawl.pt:   0%…

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/forward_charlm/1billion.pt:   0…

2023-01-29 14:22:45 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2023-01-29 14:22:45 INFO: Use device: cpu
2023-01-29 14:22:45 INFO: Loading: tokenize
2023-01-29 14:22:45 INFO: Loading: ner
2023-01-29 14:22:47 INFO: Done loading processors!


token: Anton	ner: B-PERSON
token: Antonsen	ner: E-PERSON
token: er	ner: O
token: rigtig	ner: O
token: god	ner: O
token: til	ner: O
token: at	ner: O
token: tale	ner: O
token: spansk	ner: O


## Testing with DaCy datasets

In [8]:
from dacy.datasets import (
    dane,
    danish_names,
    female_names,
    male_names,
    muslim_names,
    load_names,
)

# Result of DaCy's male_names(); passed as `names` to the augmenter in the
# following cell.
m_name_dict = male_names()

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  names = names.groupby(level=0).apply(lambda x: x / float(x.sum()))


In [43]:
import augmenty
import spacy

# This run uses the spaCy model "da_core_news_sm".
nlp = spacy.load("da_core_news_sm")

# Allowed shapes of a replacement name: first name only, first + last,
# or first + two last names.
patterns = [
    ["first_name"],
    ["first_name", "last_name"],
    ["first_name", "last_name", "last_name"],
]

# level=1 replaces 100% of names; replacement names come from `m_name_dict`.
per_augmenter = augmenty.load(
    "per_replace.v1",
    patterns=patterns,
    names=m_name_dict,
    level=1,
    person_tag="PER",
    replace_consistency=True,
)

# Example sentences to augment.
texts = [
    "Anton Antonsen er rigtig god til at tale spansk",
    "Lars Jensen elsker at danse salsa",
    "Lars Krogh elsker pizza",
]

# Augment the texts with the custom augmenter; the list is the cell's output.
list(augmenty.texts(texts, per_augmenter, nlp))



['Raymond Midtgaard Hvid er rigtig god til at tale spansk',
 'Peer Hamann elsker at danse salsa',
 'Regnar Baun Laustsen elsker pizza']