In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.display import display, HTML

# TEI to spaCy

## Contents

1. Basic pipeline from TEI to spaCy with document-level annotations

    1. Load 50 TEI-encoded XML files from the Deutsches Textarchiv
    2. Extract the plain text and the author's GND identifier
    3. Annotate each document with its author ID

2. A slightly more advanced pipeline from TEI to spaCy with character-level annotations
    1. Load 50 TEI-encoded XML files from Berliner Intellektuelle
    2. Extract the initial and the final version of each text
    3. Annotate sub-tokens that have been added or deleted

In [2]:
from tei_dataloader import dta_loader, bi_loader, extract_text_versions_from_etree as extract_versions
import numpy as np
from lxml import etree

# Prefix-to-URI mapping that lets XPath / ElementPath queries use the "tei:" prefix.
spec = dict(tei="http://www.tei-c.org/ns/1.0")

## 1. Basic pipeline from TEI to spaCy with document-level annotations

In [3]:
# Parallel lists: txts[i] is the plain text of one <body>, labels[i] the
# identifier of the author of the document it came from.
txts = []
labels = []

for dta_doc in dta_loader():

    # Author identifier: the "ref" attribute of the first <persName> under
    # <author> that carries one. Documents without any such attribute are
    # labelled "anonymous".
    authors = dta_doc.xpath(".//tei:author/tei:persName", namespaces=spec)
    author_gnd = next(
        (author.attrib["ref"] for author in authors if "ref" in author.attrib),
        "anonymous",
    )

    # Plain text: concatenate every text node below each <body> and trim the
    # surrounding whitespace. A document with several <body> elements
    # contributes one (text, label) pair per body.
    for body in dta_doc.findall(".//tei:body", namespaces=spec):
        txts.append(''.join(body.itertext()).strip())
        labels.append(author_gnd)

In [4]:
# Sanity check: label and first 100 characters of the first text, then the
# number of extracted texts as the cell's output value.
print(labels[0], txts[0][:100], sep="\n")
len(txts)

http://d-nb.info/gnd/181074907
Virginia,
oder
die Kolonie von Kentucky.

Erſter Theil.



Virginia an Adele.
Am Bord des Waſhington


15

In [5]:
import spacy

# German pipeline with the parser, tagger and NER components switched off.
nlp = spacy.load("de", disable=["parser", "tagger", "ner"])

# Register a document-level custom attribute, reachable as `doc._.author`;
# `force=True` allows re-running this cell without a "already registered" error.
spacy.tokens.Doc.set_extension("author", default=None, force=True)

In [6]:
# Run every text through the pipeline and attach the matching author label
# to the resulting Doc via the custom `author` extension.
docs = []
for author, doc in zip(labels, nlp.pipe(txts)):
    doc._.author = author
    docs.append(doc)

## 2. spaCy annotations at character level

In [7]:
import spacy
from spacy.tokens import Token, Span

# Per-token storage for two masks, one entry per character of the token.
# They are presumably membership flags for the initial and the final version
# of the text (filled in by the annotation loop) -- confirm against
# `extract_text_versions_from_etree`.
Token.set_extension('initial', default=[], force=True)
Token.set_extension('final', default=[], force=True)


def _token_has_been_modified(token):
    """Return True if the token's `initial` and `final` masks differ anywhere.

    Both masks are coerced to boolean arrays first, so the comparison also
    works for tokens that still hold the (empty list) defaults and for empty
    masks, which would otherwise make `^` raise a TypeError.
    """
    in_initial = np.asarray(token._.initial, dtype=bool)
    in_final = np.asarray(token._.final, dtype=bool)
    return bool(np.any(in_initial ^ in_final))


def _span_has_been_modified(span):
    """Return True if at least one token of the span has been modified."""
    return any(t._.has_been_modified() for t in span)


# Registered as methods, i.e. called as `token._.has_been_modified()`.
Token.set_extension(
    'has_been_modified',
    method=_token_has_been_modified,
    force=True
)

Span.set_extension(
    'has_been_modified',
    method=_span_has_been_modified,
    force=True
)

# Fresh German pipeline with parser, tagger and NER disabled, plus a
# sentencizer component for sentence boundaries.
nlp = spacy.load("de", disable=['parser', 'tagger', 'ner'])
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [8]:
docs = []
# Lift spaCy's per-text length limit far above any expected document size.
# An integer is used because the limit is a character count.
nlp.max_length = 10**10
for bi_doc in bi_loader():
    for body in bi_doc.findall(".//tei:body", namespaces=spec):
        # `extract_versions` yields the text plus two sequences that are
        # indexed by character offset below; presumably they flag which
        # characters belong to the initial / final version -- TODO confirm.
        text, in_init, in_final = extract_versions(body)
        spacified = nlp(text)
        for token in spacified:
            # Character range of this token within `text`.
            char_span = slice(token.idx, token.idx + len(token))
            # Force boolean masks: later cells apply `^` and `~` to them,
            # which are only logical operations on bool arrays (and fail on
            # the float64 array numpy would create from an empty slice).
            token._.initial = np.array(in_init[char_span], dtype=bool)
            token._.final = np.array(in_final[char_span], dtype=bool)
        docs.append(spacified)

In [9]:
# Collect highlight spans for the first document only. Taking `docs[0]`
# explicitly fails loudly on an empty `docs` instead of silently reusing a
# stale `doc` variable left over from an earlier cell.
doc = docs[0]

# One dict per highlighted token, in document order, with character offsets
# into the document text. A token that both gained and lost characters
# produces an "ADD" entry followed by a "DEL" entry.
modified_tokens = []
for token in doc:
    if not token._.has_been_modified():
        continue
    # Skip whitespace-only tokens: there is nothing visible to highlight.
    if not token.text.strip():
        continue

    # Coerce to bool so that `~` is a logical (not bitwise integer) negation.
    in_initial = np.asarray(token._.initial, dtype=bool)
    in_final = np.asarray(token._.final, dtype=bool)
    start, end = token.idx, token.idx + len(token)

    # Flagged in the final mask but not in the initial one.
    if np.any(~in_initial & in_final):
        modified_tokens.append({"label": "ADD", "start": start, "end": end})
    # Flagged in the initial mask but not in the final one.
    if np.any(in_initial & ~in_final):
        modified_tokens.append({"label": "DEL", "start": start, "end": end})


In [10]:
from spacy.displacy.render import EntityRenderer

# Colour scheme for the two labels: green for "ADD", red for "DEL".
options = dict(
    colors={'ADD': '#2ca02c', 'DEL': '#d62728'},
    ents=['ADD', 'DEL'],
)

# Render the document text with the collected spans highlighted (empty title)
# and show the resulting HTML inline.
renderer = EntityRenderer(options=options)
markup = renderer.render_ents(doc.text, modified_tokens, "")
display(HTML(markup))