In [22]:
import glob
import os

from enrich.tei import teis_to_traindata, teis_to_traindata_sents
from enrich.spacy_utils.data_prep import clean_train_data, traindata_to_csv, csv_to_traindata

In [None]:
# Glob pattern selecting the TEI/XML files that are converted into train data.
# The default is the original author's local checkout of the thun project; set
# the TEI_SOURCE_GLOB environment variable to use another location instead of
# editing this cell.
# Patterns used for other corpora:
#   r"C:\Users\pandorfer\Documents\Redmine\konde\schnitzler-tagebuch\data\editions\*.xml"
#   r"C:\Users\pandorfer\Documents\Redmine\konde\Hermann-Bahr_Arthur-Schnitzler\app\data\letters\**\*.xml"
#   (note: `**` only spans nested directories when glob is called with recursive=True)
source_dir = os.environ.get(
    "TEI_SOURCE_GLOB",
    r"C:\Users\pandorfer\Documents\Redmine\konde\thun-project\thun-data\editions\*.xml",
)

In [None]:
# Collect every path matching the glob pattern (non-recursive) and show the count.
files = glob.glob(source_dir, recursive=False)
len(files)

In [None]:
# Generate train data from the collected files (thun configuration).
# Configuration used for the "habs" corpus instead:
#   teis_to_traindata(files, ne_xpath='.//tei:body//tei:*[@key]', verbose=False)
TRAIN_DATA = teis_to_traindata(
    files,
    parent_node='.//tei:div[@type="transcript"]//tei:p',
    ne_xpath='.//tei:rs',
    verbose=False,
)  # thun
len(TRAIN_DATA)

In [None]:
# Generate train data on a sentence basis (thun configuration).
# Running this cell rebinds TRAIN_DATA.
# Configuration used for the "habs" corpus (non-sentence variant):
#   teis_to_traindata(files, ne_xpath='.//tei:body//tei:*[@key]', verbose=False)
TRAIN_DATA = teis_to_traindata_sents(
    files,
    parent_node='.//tei:div[@type="transcript"]//tei:p',
    ne_xpath='.//tei:rs',
    verbose=False,
    model='de_core_news_sm',
)  # thun
len(TRAIN_DATA)

In [None]:
# Export the generated samples via traindata_to_csv to data/thun_sents_all.csv;
# the helper's return value is kept in `out`.
out = traindata_to_csv(TRAIN_DATA, 'data/thun_sents_all.csv')

In [None]:
# Clean the train data. The filter is configured with min_ents=1 and
# min_text_len=10 (see clean_train_data for the exact semantics of each option).
# The input has to be the freshly generated TRAIN_DATA: CLEAN_TRAIN_DATA is only
# defined by this very statement, so passing it as the argument raises a
# NameError on a fresh kernel.
CLEAN_TRAIN_DATA = clean_train_data(TRAIN_DATA, min_ents=1, min_text_len=10, lang=[])
len(CLEAN_TRAIN_DATA)

In [None]:
# Save the cleaned train data as CSV; `out` is rebound to this call's return value.
out = traindata_to_csv(CLEAN_TRAIN_DATA, 'data/thun_sents_filtered.csv')

In [None]:
# Load the saved train data back from CSV.
# NOTE(review): `out` is whatever traindata_to_csv returned; elsewhere in this
# notebook csv_to_traindata is called with a file path string - confirm that
# `out` is such a path.
loaded_train_data = csv_to_traindata(out)

In [None]:
# ===== Compare models: score each candidate on the same set of examples =====

In [1]:
import spacy

In [24]:
# Load the annotated samples used for the model comparison.
data = csv_to_traindata('data/vfbr.csv')

In [25]:
# Number of loaded samples.
len(data)

106

In [26]:
import spacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer

def evaluate(ner_model, examples):
    """Score a pipeline's predictions against annotated samples.

    Args:
        ner_model: loaded spaCy pipeline; its ``make_doc`` tokenizes the text
            for the gold annotation and calling it yields the prediction.
        examples: iterable of ``[text, {'entities': [...]}]`` samples.

    Returns:
        The ``scores`` of the ``Scorer`` after all samples have been scored.
    """
    tally = Scorer()
    for sample in examples:
        raw_text = sample[0]
        # Gold reference is built on an un-annotated doc of the same text.
        reference = GoldParse(ner_model.make_doc(raw_text), entities=sample[1]['entities'])
        tally.score(ner_model(raw_text), reference)
    return tally.scores

In [27]:
# Evaluation subset: every sample from index 60 to the end of `data`.
examples = data[60:]

In [28]:
# Models to compare; each entry is handed to spacy.load() in the comparison loop.
models = [
    'de_core_news_sm',  # spaCy model package name
    'data/vfbr_60_100',  # local path
    'data/vfbr_newlabel_blank_60_100'  # local path
]

In [29]:
# Load each candidate in turn, score it on `examples` and print the
# ents_p / ents_f / ents_r values from the scorer.
for model_name in models:
    ner_model = spacy.load(model_name)
    results = evaluate(ner_model, examples)
    summary = "p: {}; f: {}; r: {}".format(
        results['ents_p'], results['ents_f'], results['ents_r']
    )
    print(model_name, summary)

de_core_news_sm p: 0.0; f: 0.0; r: 0.0
data/vfbr_60_100 p: 0.0; f: 0.0; r: 0.0
data/vfbr_newlabel_blank_60_100 p: 66.66666666666666; f: 65.97938144329896; r: 65.3061224489796


In [30]:
# Inspect one evaluation sample: [text, {'entities': [[start, end, label], ...]}].
examples[1]

['Schuldenübergabe nach dem Tod des Joseph Pruner [Prunner/Brunner] (Antholz) an Peter Pruner betreffs der Schulden gegenüber dem Ehepaar Georg Miller [Müller] (Inhaber des Pruner-Gutes, Antholz-Oberthal)',
 {'entities': [[34, 75, 'OBJECT'], [79, 91, 'OBJECT'], [136, 202, 'OBJECT']]}]

In [2]:
# Sample text for a manual prediction check (same string as the text of examples[1]).
text = "Schuldenübergabe nach dem Tod des Joseph Pruner [Prunner/Brunner] (Antholz) an Peter Pruner betreffs der Schulden gegenüber dem Ehepaar Georg Miller [Müller] (Inhaber des Pruner-Gutes, Antholz-Oberthal)"

In [3]:
# Echo the sample text.
text

'Schuldenübergabe nach dem Tod des Joseph Pruner [Prunner/Brunner] (Antholz) an Peter Pruner betreffs der Schulden gegenüber dem Ehepaar Georg Miller [Müller] (Inhaber des Pruner-Gutes, Antholz-Oberthal)'

In [4]:
# Load the model stored under data/vfbr_newlabel_blank_60_100 for a closer look at its predictions.
ner_model = spacy.load('data/vfbr_newlabel_blank_60_100')

In [8]:
# Run the loaded pipeline on the sample text.
hansi = ner_model(text)

In [10]:
# Print every predicted entity span together with its label.
for ent in hansi.ents:
    print(ent, ent.label_)

Joseph Pruner [Prunner/Brunner] (Antholz) OBJECT
Peter Pruner betreffs der OBJECT
Ehepaar Georg Miller [Müller] (Inhaber des Pruner-Gutes, Antholz-Oberthal) OBJECT
