In [119]:
import re
import json
import jsonlines
import warnings
import random
from loguru import logger
from tqdm import tqdm

import spacy
from spacy.scorer import Scorer
from spacy.gold import GoldParse
from spacy.tokenizer import Tokenizer
from spacy.util import minibatch, compounding

In [120]:
def create_custom_tokenizer(nlp):
    prefix_re = spacy.util.compile_prefix_regex(tuple([r'-', r'\d{2}\.\d{2}\.\d{4}'] + list(nlp.Defaults.prefixes)))
    infix_re = spacy.util.compile_infix_regex(tuple([r'(\.)', r'(:)', r'(\()', r'(\))'] + list(nlp.Defaults.infixes)))
    suffixes = list(nlp.Defaults.suffixes)
    suffixes.remove('\.\.+')
    suffixes.append('\.\.\.+')
    suffixes.append('Die')
    suffix_re = spacy.util.compile_suffix_regex(tuple([r'-'] + suffixes))
    return Tokenizer(nlp.vocab, nlp.Defaults.tokenizer_exceptions,
                     prefix_search = prefix_re.search, 
                     infix_finditer = infix_re.finditer,
                     suffix_search = suffix_re.search,
                     token_match=None)

def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot)
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores

In [121]:
boundary = re.compile('^[0-9]$')


def custom_seg(doc):
    prev = doc[0].text
    length = len(doc)
    for index, token in enumerate(doc):
        is_number = token.text == '.' and boundary.match(prev) and index != (length - 1)
        if is_number or token.text in [':', ';', ',', '/', '*'] or not token.is_punct:
            next_t = index + 1
            while next_t < length:
                doc[next_t].sent_start = False
                if doc[next_t].is_space:
                    next_t += 1
                else:
                    break
        prev = token.text
    return doc


CUSTOM_SEG = 'custom_seg'

In [124]:
with jsonlines.open('dataset.jsonl') as reader:
    data = [obj for obj in reader]

train_data = [(row['text'], {'entities': row['labels']}) for row in data[:150]]
test_data = [(row['text'], row['labels']) for row in data[150:]]

In [125]:
nlp = spacy.load('de_core_news_lg')
if CUSTOM_SEG in nlp.pipe_names:
    nlp.remove_pipe(CUSTOM_SEG)
nlp.add_pipe(custom_seg, name=CUSTOM_SEG, before='parser')

nlp.tokenizer = create_custom_tokenizer(nlp)
nlp.pipe_names

['tagger', 'custom_seg', 'parser', 'ner']

In [126]:
if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe("ner")

for _, annotations in train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [127]:
optimizer = nlp.resume_training()
move_names = list(ner.move_names)
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [128]:
scores = []

In [129]:
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():

    warnings.filterwarnings("once", category=UserWarning, module='spacy')
    sizes = compounding(1.0, 16.0, 1.001)

    for _ in tqdm(range(30)):
        random.shuffle(train_data)
        batches = minibatch(train_data, size=sizes)
        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
        logger.info(f"Losses: {losses}")
            
        scores.append(evaluate(nlp, test_data))

  0%|          | 0/30 [00:00<?, ?it/s]2020-12-29 20:41:58.850 | INFO     | __main__:<module>:13 - Losses: {'ner': 5521.989167222448}
  3%|▎         | 1/30 [00:12<06:03, 12.54s/it]2020-12-29 20:42:10.725 | INFO     | __main__:<module>:13 - Losses: {'ner': 5074.476464488759}
  7%|▋         | 2/30 [00:24<05:45, 12.33s/it]2020-12-29 20:42:22.641 | INFO     | __main__:<module>:13 - Losses: {'ner': 4665.722264235562}
 10%|█         | 3/30 [00:36<05:29, 12.20s/it]2020-12-29 20:42:34.743 | INFO     | __main__:<module>:13 - Losses: {'ner': 4692.655696151522}
 13%|█▎        | 4/30 [00:48<05:16, 12.17s/it]2020-12-29 20:42:45.000 | INFO     | __main__:<module>:13 - Losses: {'ner': 5125.704040696715}
 17%|█▋        | 5/30 [00:58<04:49, 11.60s/it]2020-12-29 20:42:52.884 | INFO     | __main__:<module>:13 - Losses: {'ner': 4448.686472449401}
 20%|██        | 6/30 [01:06<04:11, 10.48s/it]2020-12-29 20:43:00.972 | INFO     | __main__:<module>:13 - Losses: {'ner': 4662.792940271889}
 23%|██▎       | 7/30

In [130]:
#last iteration score
index = -1
print(scores[index]['ents_p'], scores[index]['ents_r'], scores[index]['ents_f'])

100.0 100.0 100.0


In [134]:
scores[-1]

{'uas': 0.0,
 'las': 0.0,
 'las_per_type': {'': {'p': 0.0, 'r': 0.0, 'f': 0.0}},
 'ents_p': 100.0,
 'ents_r': 100.0,
 'ents_f': 100.0,
 'ents_per_type': {'COMPANY_ADDRESS': {'p': 100.0, 'r': 100.0, 'f': 100.0},
  'COMPANY_NAME': {'p': 100.0, 'r': 100.0, 'f': 100.0}},
 'tags_acc': 0.0,
 'token_acc': 100.0,
 'textcat_score': 0.0,
 'textcats_per_cat': {}}

In [131]:
## save model
nlp.meta['name'] = 'Registration Docs Parser'
nlp.meta['version'] = '1'
nlp.remove_pipe(CUSTOM_SEG)
nlp.to_disk('model/')

In [132]:
## load model
nlp = spacy.load('model/')
if CUSTOM_SEG in nlp.pipe_names:
    nlp.remove_pipe(CUSTOM_SEG)
nlp.add_pipe(custom_seg, name=CUSTOM_SEG, before='parser')

In [133]:
## test data on independent data
scorer_test = [[x['text'], x['labels']] for x in data[30:]]
score = evaluate(nlp, scorer_test)
print(score['ents_p'], score['ents_r'], score['ents_f'])

90.48991354466858 92.3529411764706 91.41193595342067
