In [None]:
import re
import spacy
from spacy.symbols import ORTH
import requests

In [None]:
# Samples belonging to the 'MPR_Nerdpool_8-20_v2' dataset, leaving out rows
# whose text is None.
# NOTE(review): NerSample is not imported in any visible cell - presumably it
# is injected by the Django notebook environment; confirm before running on a
# fresh kernel.
samples = (
    NerSample.objects
    .filter(dataset__ner_name='MPR_Nerdpool_8-20_v2')
    .exclude(text=None)
)

In [None]:
# URL of the first page of the abbreviation API; yield_abbr() starts here and
# follows the 'next' links found in each JSON response.
ABBR_BASE = "https://abbr.acdh.oeaw.ac.at/api/abbreviations/?format=json"


def yield_abbr(ABBR_BASE, timeout=30):
    """Iterate over all abbreviations served by a paginated JSON endpoint.

    ABBR_BASE (unicode): URL of the first page. Each page is expected to be a
        JSON object with a 'results' list and an optional 'next' URL.
    timeout (float / tuple): Passed to `requests.get` so a stalled server
        cannot block the notebook forever.
    YIELDS: The 'orth' value of every entry in 'results', page by page
        (None for entries that have no 'orth' key).
    RAISES (requests.HTTPError): If a page answers with an error status.

    A falsy start URL yields nothing.
    """
    url = ABBR_BASE
    while url:
        response = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error body
        response.raise_for_status()
        page = response.json()
        # A page without 'results' (or with null) contributes no items
        for entry in page.get('results') or ():
            yield entry.get('orth')
        # A missing or empty 'next' ends the pagination
        url = page.get('next')

In [None]:
# Load the 'blank:de' spaCy pipeline; the abbreviation special cases and the
# sentencizer are added to this object in the cells below.
nlp = spacy.load('blank:de')

In [None]:
# Map each abbreviation to a one-token special case whose ORTH is the
# abbreviation itself, then register all of them with the tokenizer.
exceptions = {abbr: [{ORTH: abbr}] for abbr in yield_abbr(ABBR_BASE)}
for abbr, token_attrs in exceptions.items():
    nlp.tokenizer.add_special_case(abbr, token_attrs)

In [None]:
# Add the sentencizer so that `doc.sents` can be used further down. The guard
# makes the cell safe to re-run: without it a second execution tries to
# register a component named 'sentencizer' again.
# NOTE(review): create_pipe() + add_pipe(component) is the spaCy v2 calling
# convention - confirm against the installed spaCy version.
if 'sentencizer' not in nlp.pipe_names:
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [None]:
# Save the pipeline object to ./prodigy/abbr (relative to the current working
# directory).
nlp.to_disk('./prodigy/abbr')

In [None]:
def _rebase_spans(spans, char_start, char_end, token_start):
    """Return the spans lying fully inside one sentence, re-based to it.

    spans (list): Span dicts with 'label', document-level 'start'/'end'
        character offsets and 'token_start'/'token_end' token indices.
    char_start (int): Character offset of the sentence start in the document.
    char_end (int): Character offset of the sentence end in the document.
    token_start (int): Index of the sentence's first token in the document.
    RETURNS (list): New span dicts with offsets relative to the sentence.
    """
    rebased = []
    for ent in spans:
        # Spans that cross a sentence boundary are left out
        if ent['start'] < char_start or ent['end'] > char_end:
            continue
        rebased.append({
            'label': ent['label'],
            'start': ent['start'] - char_start,
            'end': ent['end'] - char_start,
            # NOTE(review): the stored token indices presumably come from the
            # tokenization used when the sample was annotated; if that differs
            # from `nlp`'s tokenization (abbreviation special cases), these
            # re-based indices may be off - confirm.
            'token_start': ent['token_start'] - token_start,
            'token_end': ent['token_end'] - token_start,
        })
    return rebased


# Split every sample into sentences; each sentence becomes its own example
# carrying only the spans that fall completely inside it.
my_samples = []
for sample in samples:
    orig = sample.orig_example
    # Tolerate examples stored without a 'spans' key (or with a null value)
    spans = orig.get('spans') or []
    doc = nlp(orig['text'])
    for sent in doc.sents:
        my_samples.append({
            'text': sent.text,
            'spans': _rebase_spans(
                spans, sent.start_char, sent.end_char, sent.start
            ),
        })


In [None]:
import ujson
from pathlib import Path

In [None]:
def write_jsonl(file_path, lines):
    """Create a .jsonl file and dump contents.

    file_path (unicode / Path): The path to the output file.
    lines (iterable): The JSON-serializable contents of each line.

    Every item is serialized with ujson (forward slashes are not escaped) and
    the results are joined with a newline; no trailing newline is written.
    Serialization happens before the file is opened, so a failing item does
    not leave a truncated file behind. The file is written inside a `with`
    block so the handle is closed even if the write fails.
    """
    data = '\n'.join(
        ujson.dumps(line, escape_forward_slashes=False) for line in lines
    )
    with Path(file_path).open('w', encoding='utf-8') as file_:
        file_.write(data)

In [None]:
# Dump the sentence-level samples to mrp_short.jsonl in the current working
# directory, one JSON object per line.
write_jsonl('mrp_short.jsonl', my_samples)

In [None]:
# Follow-up steps, to be run from a shell rather than from this notebook:
# prodigy db-in mrp_short ../mrp_short.jsonl
# python manage.py enrich_samples --settings=nerdpool.settings.pg_local
# prodigy train ner mrp_short blank:de -o ./mrp_short__blank