In [2]:
import spacy
from spacy import displacy
from spacy.kb import InMemoryLookupKB
from spacy.language import Language
from spacy.tokens import DocBin

In [3]:
# Load the trained pipeline that the entity and knowledge-base cells below inspect.
# Alternative checkpoint, kept for reference:
# model_path = "training/vesteinn/DanskBERT-2023-04-21/model-best"
model_path = "training/ned/model-best"
nlp = spacy.load(model_path)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Example sentence used by the entity and dependency visualisations below.
sample_text = "Danmarks statsminister hedder Mette Frederiksen fra Socialdemokratiet. Hun er gift med Bo Tengberg og har to børn."

# Run the full pipeline once; later cells reuse `doc`.
doc = nlp(sample_text)

# Entities

In [5]:
# Visualise the entity spans predicted for `doc` with displaCy.
# `displacy` is imported in the notebook's top import cell.
displacy.render(doc, style="ent")

In [6]:
# One line per predicted entity: surface text, entity label and linked KB id.
for ent in doc.ents:
    row = (ent.text, ent.label_, ent.kb_id_)
    print(*row)

Danmarks LOC NIL
Frederiksen PER NIL
Socialdemokratiet ORG Q1424299
Tengberg PER NIL


In [44]:
# Load the NED dataset from its serialized DocBin file.
# `DocBin` is imported in the notebook's top import cell.
db = DocBin().from_disk("corpus/cdt/train.spacy")

# Deserialize the stored docs using the vocab of the currently loaded pipeline.
docs = list(db.get_docs(nlp.vocab))

In [22]:
# Show the annotated entities of the first corpus document together with
# their label and KB id.
first_doc = docs[0]
for ent in first_doc.ents:
    row = (" - ", ent.text, ent.label_, ent.kb_id_)
    print(*row)

 -  SID ORG Q12335178
 -  SID-huset LOC Q49614
 -  Kjeld Christensen PER 
 -  Køvenhavner MISC Q1748
 -  Kjeld Christensen PER 
 -  København LOC Q1748
 -  Jylland LOC Q25389
 -  Århus LOC Q25319
 -  Kjeld Christensen PER 
 -  Korsholmskolen ORG Q2385804
 -  Hinnerup LOC Q3175755
 -  Kjeld Christensen PER 
 -  Hinnerup LOC Q3175755
 -  socialdemokratiet ORG Q212101
 -  Hadsten LOC Q3522043
 -  SID ORG Q12335178
 -  socialdemokraterne MISC Q212101
 -  SF ORG Q615603
 -  Fremskridtspartiet ORG Q1455237
 -  Vestergade LOC Q34442


In [45]:
# The annotations look fine, so check the knowledge base next.
# `InMemoryLookupKB` is imported in the notebook's top import cell.

# The KB is constructed with the vector length reported by the pipeline's vocab.
vector_length = nlp.vocab.vectors_length

# Create an empty KB on the pipeline's vocab, then fill it from the file on disk.
kb = InMemoryLookupKB(nlp.vocab, vector_length)
kb.from_disk("assets/daned/knowledge_base.kb")

In [48]:
# For each annotated mention in the first document, print the mention followed
# by every candidate the KB returns for it (alias, entity id, prior probability).
for ent in docs[0].ents:
    mention_row = (" - ", ent.text, ent.label_)
    print(*mention_row)
    for candidate in kb.get_candidates(ent):
        print("\t", candidate.alias_, "-", candidate.entity_, "-", candidate.prior_prob)

 -  SID ORG
 -  SID-huset LOC
	 SID-huset - Q49614 - 1.0
 -  Kjeld Christensen PER
 -  Køvenhavner MISC
	 Køvenhavner - Q1748 - 1.0
 -  Kjeld Christensen PER
 -  København LOC
	 København - Q1748 - 1.0
 -  Jylland LOC
 -  Århus LOC
	 Århus - Q25319 - 1.0
 -  Kjeld Christensen PER
 -  Korsholmskolen ORG
	 Korsholmskolen - Q2385804 - 1.0
 -  Hinnerup LOC
	 Hinnerup - Q3175755 - 1.0
 -  Kjeld Christensen PER
 -  Hinnerup LOC
	 Hinnerup - Q3175755 - 1.0
 -  socialdemokratiet ORG
	 socialdemokratiet - Q212101 - 1.0
 -  Hadsten LOC
	 Hadsten - Q3522043 - 1.0
 -  SID ORG
 -  socialdemokraterne MISC
	 socialdemokraterne - Q212101 - 1.0
 -  SF ORG
	 SF - Q615603 - 0.9230769276618958
	 SF - Q4571771 - 0.07692307978868484
 -  Fremskridtspartiet ORG
 -  Vestergade LOC
	 Vestergade - Q34442 - 1.0


In [47]:
# aahh knowledge base is wack!
# Probe the KB for a single mention. The mention is selected explicitly rather
# than reusing whatever `ent` was left behind by a previously executed loop, so
# the cell gives the same result regardless of execution order.
mention_text = "socialdemokratiet"
matches = [e for e in docs[0].ents if e.text == mention_text]
assert matches, f"no entity with text {mention_text!r} in docs[0]"
ent = matches[0]

# Print alias, entity id and prior probability of every candidate for the mention.
out = kb.get_candidates(ent)
for o in out:
    print(o.alias_)
    print(o.entity_)
    print(o.prior_prob)

socialdemokratiet
Q212101
1.0


# POS + Dependency Parsing

In [31]:
# Render `doc` with displaCy's dependency ("dep") visualiser.
displacy.render(doc, style="dep")

# Coref Resolution
missing

In [10]:
# Load a different trained pipeline for this section. This rebinds `nlp` and
# `model_path`, so every cell below uses the new pipeline.
# `spacy` is already imported in the notebook's first cell.

# Alternative checkpoint, kept for reference:
# model_path = "training/danskbert_w_spacy35.span_resolver/model-best"
# nlp = spacy.load(model_path)

model_path = "training/danskbert_w_spacy35/model-best"
nlp = spacy.load(model_path)

In [11]:
# Display the names of the components in the currently loaded pipeline.
nlp.pipe_names

['transformer',
 'tagger',
 'morphologizer',
 'trainable_lemmatizer',
 'parser',
 'ner',
 'coref',
 'entity_linker']

In [12]:
# Run the currently loaded pipeline on a short example sentence; rebinds `doc`.
# NOTE(review): the output recorded for this cell ([mit, jeg]) cannot be produced
# by a bare assignment - the expression that displayed it appears to be missing
# from the cell; confirm and restore it.
doc = nlp("Hej mit navn er Kenneth og jeg bor i København.")

[mit, jeg]