In [None]:
%load_ext autoreload
%autoreload 2
from IPython.core.display import display, HTML

# TEI to spaCy

## Contents

1. Basic Pipeline from TEI to Spacy with annotations on document-level

    1. load 50 TEI encoded XMLs from Deutsches Textarchiv
    2. Extract plain text and author GND
    3. Annotate each document with its author id

2. A slightly more advanced pipeline from TEI to spaCy with annotations on character level
    1. load 50 TEI encoded XMLs from Berliner Intellektuelle
    2. Extract initial and last version
    3. annotate sub-tokens that have been added or deleted
    
3. Loading word vectors from fasttext

In [None]:
from tei_dataloader import dta_loader, bi_loader, wilhelmus_loader
from tei_dataloader import extract_text_versions_from_etree as extract_versions
import numpy as np
from lxml import etree

spec = {"tei":"http://www.tei-c.org/ns/1.0"}

## 1. Basic Pipeline from TEI to Spacy with annotations on document-level

### 1.1 Deutsches Textarchiv

In [None]:
# Collect one plain-text string and one author label per DTA document; both
# lists stay aligned by position.
txts, labels = [], []

for dta_doc in dta_loader():

    # Author label: the "ref" attribute of the first <persName> inside an
    # <author> element that carries one, or "anonymous" when none does.
    authors = dta_doc.xpath(".//tei:author/tei:persName", namespaces=spec)
    author_gnd = next(
        (person.attrib["ref"] for person in authors if "ref" in person.attrib),
        "anonymous",
    )

    # Plain text: the stripped text content of every <body> element,
    # concatenated without a separator.
    new_txt = [
        ''.join(body.itertext()).strip()
        for body in dta_doc.findall(".//tei:body", namespaces=spec)
    ]
    txts.append(''.join(new_txt))
    labels.append(author_gnd)
        

In [None]:
# Sanity check: author label of the first document ...
print(labels[0])
# ... the first 100 characters of its extracted text ...
print(txts[0][:100])
# ... and the number of extracted texts (shown as the cell's output).
len(txts)

In [None]:
from collections import Counter
# The two most frequent author labels; their counts are discarded.
authors, _ = zip(*Counter(labels).most_common(2))

# Keep only the documents labelled with one of those two authors, preserving
# the original document order.
two_class_pairs = [
    (txt, label) for txt, label in zip(txts, labels) if label in authors
]
two_class_txts = [txt for txt, _label in two_class_pairs]
two_class_labels = [label for _txt, label in two_class_pairs]

In [None]:
import spacy

# Document-level custom attribute that will hold the author label.
# force=True overwrites an existing registration, so the cell can be re-run.
spacy.tokens.Doc.set_extension('author', default=None, force=True)


def _load_without_parser_tagger_ner(lang_code):
    """Load the spaCy model `lang_code` with parser, tagger and NER disabled."""
    return spacy.load(lang_code, disable=['parser', 'tagger', 'ner'])


nlp_de = _load_without_parser_tagger_ner("de")
nlp_nl = _load_without_parser_tagger_ner("nl")

In [None]:
# Run the German pipeline over the two-class texts, then tag every resulting
# Doc with the author label at the same position.
docs = list(nlp_de.pipe(two_class_txts))
for author, doc in zip(two_class_labels, docs):
    doc._.author = author

In [None]:
# Author label stored on the first processed document (the cell's output).
docs[0]._.author

### Example for text classification

In [None]:
def evaluate(tokenizer, textcat, docs, cats):
    """Compute precision, recall and F-score of a text categorizer.

    Args:
        tokenizer: Unused; kept so existing callers can keep passing it.
        textcat: Object whose ``pipe(docs)`` yields documents exposing the
            predicted scores as a ``cats`` mapping (label -> score).
        docs: Documents to score.
        cats: Gold annotations aligned with ``docs`` by position; each entry
            maps a label to a truth value (values >= 0.5 count as positive).

    Returns:
        dict with the keys 'textcat_p', 'textcat_r' and 'textcat_f'. All
        three are 0.0 when there is not a single true positive.
    """
    tp = 0.0   # True positives
    fp = 1e-8  # False positives (epsilon keeps the precision denominator non-zero)
    fn = 1e-8  # False negatives (epsilon keeps the recall denominator non-zero)
    tn = 0.0   # True negatives (counted, but not part of the returned scores)
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            # Labels without a gold annotation cannot be judged.
            if label not in gold:
                continue
            predicted_positive = score >= 0.5
            gold_positive = gold[label] >= 0.5
            if predicted_positive and gold_positive:
                tp += 1.
            elif predicted_positive:
                fp += 1.
            elif gold_positive:
                fn += 1
            else:
                tn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    # Without any true positive both values are exactly 0.0 and the harmonic
    # mean would divide by zero; report 0.0 instead.
    if precision + recall == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

In [None]:
from sklearn.model_selection import train_test_split

def create_textcat(docs, nlp, random_state=None):
    """Prepare a 'textcat' component and a train/test split for author classification.

    Args:
        docs: Documents carrying their author label in ``doc._.author``.
        nlp: Pipeline that receives a 'textcat' component (added in place,
            as last pipe) unless it already has one.
        random_state: Forwarded to ``train_test_split``; pass an int for a
            reproducible split. ``None`` (default) keeps the split random.

    Returns:
        Tuple ``(textcat, train_data, (X_test, y_test))``: the text
        categorizer with one label per author, the training examples as
        ``(doc, {'cats': {author: bool}})`` pairs, and the held-out documents
        with their ``{author: bool}`` gold dicts.
    """
    doc_authors = [doc._.author for doc in docs]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the label
    # order is identical on every run (the iteration order of a set is not).
    authors = list(dict.fromkeys(doc_authors))
    gold_cats = [
        {author: label == author for author in authors}
        for label in doc_authors
    ]

    X_train, X_test, y_train, y_test = train_test_split(
        docs, gold_cats, random_state=random_state
    )

    train_data = list(zip(X_train, [{'cats': cats} for cats in y_train]))

    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    else:
        # otherwise, get it, so we can add labels to it
        textcat = nlp.get_pipe('textcat')

    for author in authors:
        textcat.add_label(author)

    return textcat, train_data, (X_test, y_test)

In [None]:
from spacy.util import minibatch, compounding

def train(train_data, test_data, nlp, textcat, n_iter=5, drop=0.2):
    """Train the 'textcat' component and print loss and scores after every epoch.

    Args:
        train_data: Sequence of ``(doc, annotation)`` pairs passed in
            mini-batches to ``nlp.update``.
        test_data: Pair ``(docs, cats)`` that is unpacked into ``evaluate``.
        nlp: Pipeline containing the 'textcat' component; all other pipes are
            disabled while training.
        textcat: The text categorizer being trained and evaluated.
        n_iter: Number of passes over ``train_data`` (default 5).
        drop: Dropout rate passed to ``nlp.update`` (default 0.2).

    Returns:
        None. Progress is printed as a tab-separated table.
    """
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for _ in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=drop,
                           losses=losses)
            # score the held-out data with the optimizer's averaged parameters
            with textcat.model.use_params(optimizer.averages):
                scores = evaluate(nlp.tokenizer, textcat, *test_data)
            # `losses` stays empty when there was nothing to train on
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses.get('textcat', 0.0), scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

#textcat, train_data, test_data = create_textcat(docs, nlp_de)
#train(train_data, test_data, nlp_de, textcat)

### 1.2 Wilhelmus Challenge

In [None]:
wilhelminus5135 = 'Wilhelmus van Nassouwe\n        Ben ick van Duytschen bloet,\n        Den Vaderlant ghetrouwe\n        Blijf ick tot inden doot:\n        Een Prince van Oraengien\n        Ben ick vrij onverveert,\n        Den Coninck van Hispaengien\n        Heb ick altijt gheeert.\n      \n      \n        In Godes vrees te leven\n        Heb ick altijt betracht,\n        Daerom ben ick verdreven\n        Om Landt om Luyd ghebracht:\n        Maer Godt sal my regeren\n        Als een goet Instrument,\n        Dat ick sal wederkeeren\n        In mijnen Regiment.\n      \n      \n        Lijdt u mijn Ondersaten\n        Die oprecht zijn van aert,\n        Godt sal u niet verlaten,\n        Al zijt ghy nu beswaert:\n        Die vroom begheert te leven\n        Bidt Godt nacht ende dach,\n        Dat hy my cracht wil gheven\n        Dat ick u helpen mach.\n      \n      \n        Lijf en goet al te samen\n        Heb ick u niet verschoont,\n        Mijn Broeders hooch van Namen\n        Hebbent u oock vertoont:\n        Graef Adolff is ghebleven,\n        In Vrieslandt in den Slach,\n        Zijn Siel int eewich Leven\n        Verwacht den Jongsten dach.\n      \n      \n        Edel en Hooch gheboren\n        Van Keyserlicken Stam:\n        Een Vorst des Rijcks vercoren\n        Als een vroom Christen Man,\n        Voor Godes Woort ghepreesen,\n        Heb ick vrij onversaecht,\n        Als een Helt sonder vreesen\n        Mijn Edel bloet ghewaecht.\n      \n      \n        Mijn Schilt ende betrouwen\n        Sijt ghy, o Godt mijn Heer,\n        Op u soo wil ick bouwen\n        Verlaet my nemmermeer:\n        Dat ick doch vroom mach blijven\n        U dienaer taller stondt\n        Die Tyranny verdrijven,\n        Die my mijn hert doorwondt.\n      \n      \n        Van al die my beswaren,\n        End mijn Vervolghers zijn,\n        Mijn Godt wilt doch bewaren\n        Den trouwen dienaer dijn:\n        Dat sy my niet verrasschen\n        In haren boosen moet,\n    
    Haer handen niet en wasschen\n        In mijn onschuldich bloet.\n      \n      \n        Als David moeste vluchten\n        Voor Saul den Tyran:\n        Soo heb ick moeten suchten\n        Met menich Edelman:\n        Maer Godt heeft hem verheven,\n        Verlost uut alder noot,\n        Een Coninckrijck ghegheven\n        In Israel seer groot.\n      \n      \n        Nae tsuer sal ick ontfanghen:\n        Van Godt mijn Heer dat soet,\n        Daer na so doet verlanghen\n        Mijn Vorstelick ghemoet,\n        Dat is dat ick mach sterven\n        Met eeren in dat Velt,\n        Een eewich Rijck verwerven\n        Als een ghetrouwe Helt.\n      \n      \n        Niet doet my meer erbarmen\n        In mijnen wederspoet,\n        Dan datmen siet verarmen\n        Des Conincks Landen goet,\n        Dat u de Spaengiaerts crencken\n        O Edel Neerlandt soet,\n        Als ick daer aen ghedencke\n        Mijn Edel hert dat bloet.\n      \n      \n        Als een Prins op gheseten\n        Met mijner Heyres cracht,\n        Vanden Tyran vermeten\n        Heb ick den Slach verwacht,\n        Die by Maestricht begraven\n        Bevreesde mijn ghewelt,\n        Mijn Ruyters sachmen draven\n        Seer moedich door dat Velt.\n      \n      \n        Soo het den wille des Heeren\n        Op die tijt had gheweest,\n        Had ick gheern willen keeren\n        Van u dit swaer tempeest:\n        Maer de Heer van hier boven\n        Die alle dinck regeert,\n        Diemen altijt moet loven\n        En heeftet niet begheert.\n      \n      \n        Seer Prinslick was ghedreven\n        Mijn Princelick ghemoet,\n        Stantvastich is ghebleven\n        Mijn hert in teghenspoet,\n        Den Heer heb ick ghebeden\n        Van mijnes herten gront,\n        Dat hy mijn saeck wil reden,\n        Mijn onschult doen bekant.\n      \n      \n        Oorlof mijn arme Schapen\n        Die zijt in grooten noot,\n        U Herder sal niet slapen\n        Al zijt ghy nu 
verstroyt:\n        Tot Godt wilt u begheven,\n        Sijn heylsaem Woort neemt aen,\n        Als vrome Christen leven,\n        Tsal hier haest zijn ghedaan.\n      \n      \n        Voor Godt wil ick belijden\n        End zijner grooter Macht,\n        Dat ick tot gheenen tijden\n        Den Coninck heb veracht:\n        Dan dat ick Godt den Heere\n        Der Hoochster Majesteyt,\n        Heb moeten obedieren,\n        Inder gherechticheyt.'
print(wilhelminus5135)

In [None]:
def get_author(tree):
    """Return the text of the single ``author`` element found below ``tree``.

    When the XPath ``.//author`` matches zero or several elements, a notice
    is printed and ``None`` is returned.
    """
    matches = tree.xpath(".//author")
    if len(matches) != 1:
        print("Author tag not found")
        return None
    return matches[0].text

def get_text(tree):
    """Return the stripped, concatenated text of the single ``text`` element below ``tree``.

    When the XPath ``.//text`` matches zero or several elements, a notice is
    printed and ``None`` is returned.
    """
    matches = tree.xpath(".//text")
    if len(matches) != 1:
        print("Text tag not found")
        return None
    fragments = matches[0].itertext()
    return ''.join(fragments).strip()

        
# Extract the text and the author of every document yielded by the loader;
# both lists stay aligned by position (entries may be None when a tag is
# missing or ambiguous).
txts, labels = [], []
for wilheminus_doc in wilhelmus_loader():
    doc_text, doc_author = get_text(wilheminus_doc), get_author(wilheminus_doc)
    txts.append(doc_text)
    labels.append(doc_author)

In [None]:
# Number of collected author labels (one is appended per loaded document).
len(labels)

In [None]:
# Candidate authors used for the classification task below.
chosen_authors = [
    "Johan Fruytiers",
    "D.V. Coornhert",
    "Philips van Marnix van Sint Aldegonde",
    "Pieter Datheen",
]

# Show how many documents carry each candidate's label.
docs_per_author = {name: labels.count(name) for name in chosen_authors}
for cauthor, n_docs in docs_per_author.items():
    print(cauthor, n_docs)
    
#chosen_authors = set()
#for author in labels:
#    if labels.count(author) > 60 and author not in ["Anonymous", "anoniem", "8904", None] and ", " not in author:
#        chosen_authors.add(author)    

In [None]:
# Process the texts of the chosen authors with the Dutch pipeline and store
# the author label on each resulting Doc.
docs = []

for author, txt in zip(labels, txts):
    if author not in chosen_authors:
        continue
    doc = nlp_nl(txt)
    doc._.author = author
    docs.append(doc)

print("number of documents, ", len(docs))

In [None]:
# Build the text categorizer plus the train/test split, report the split
# sizes, then train.
textcat, train_data, test_data = create_textcat(docs, nlp_nl)
# train_data is a list of (doc, annotation) pairs, so its own length is the
# number of training examples; test_data is an (X_test, y_test) tuple, so the
# number of test examples is the length of one of its members.
print("len(train_data)", len(train_data))
print("len(test_data)", len(test_data[1]))
train(train_data, test_data, nlp_nl, textcat)

In [None]:
# Process the Wilhelmus text with the Dutch pipeline, apply the text
# categorizer to the resulting Doc and show the score assigned to each label.
wilhelminus5135_doc = nlp_nl(wilhelminus5135)
prediction = textcat(wilhelminus5135_doc)
prediction.cats

## 2. Spacy annotations on character-level

In [None]:
import spacy
from spacy.tokens import Token, Span

# Per-token flag sequences, one entry per character of the token. They are
# presumably "character belongs to the initial / final text version" masks;
# verify against extract_text_versions_from_etree.
Token.set_extension('initial', default=list(), force=True)
Token.set_extension('final', default=list(), force=True)


def _token_has_been_modified(token):
    """Return True if the token's 'initial' and 'final' flags differ anywhere.

    Both attributes are coerced to boolean arrays first, so tokens that still
    hold the default empty lists count as unmodified instead of raising a
    TypeError on ``[] ^ []``.
    """
    initial = np.asarray(token._.initial, dtype=bool)
    final = np.asarray(token._.final, dtype=bool)
    return bool(np.any(initial ^ final))


def _span_has_been_modified(span):
    """Return True if at least one token of the span has been modified."""
    return any(token._.has_been_modified() for token in span)


Token.set_extension('has_been_modified', method=_token_has_been_modified, force=True)
Span.set_extension('has_been_modified', method=_span_has_been_modified, force=True)

# German pipeline without parser, tagger and NER, plus a sentencizer component.
nlp = spacy.load("de", disable=['parser', 'tagger', 'ner'])
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [None]:
# Raise spaCy's text-length limit so that long document bodies are accepted.
nlp.max_length = 1e10

# One spaCy Doc per <body>; every token receives the slices of the two flag
# sequences that cover its own characters.
docs = []
for bi_doc in bi_loader():
    for body in bi_doc.findall(".//tei:body", namespaces=spec):
        doc, in_init, in_final = extract_versions(body)
        spacified = nlp(doc)
        for token in spacified:
            char_span = slice(token.idx, token.idx + len(token))
            token._.initial = np.array(in_init[char_span])
            token._.final = np.array(in_final[char_span])
        docs.append(spacified)

In [None]:
# Highlight the modified tokens of the first document only.
doc = docs[0]

# One entry per highlighted token: "ADD" when the token has characters flagged
# in `final` but not in `initial`, "DEL" for the opposite case. A token can
# receive both entries.
modified_tokens = []
for token in doc:
    # Skip unchanged tokens and whitespace-only tokens.
    if not token._.has_been_modified() or not token.text.strip():
        continue

    start, end = token.idx, token.idx + len(token)
    initial, final = token._.initial, token._.final
    if np.any(np.logical_and(~initial, final)):
        modified_tokens.append({"label": "ADD", "start": start, "end": end})
    if np.any(np.logical_and(initial, ~final)):
        modified_tokens.append({"label": "DEL", "start": start, "end": end})


from spacy.displacy.render import EntityRenderer

# Green for additions, red for deletions.
options = {
    'colors': {'ADD': '#2ca02c', 'DEL': '#d62728'},
    'ents': ['ADD', 'DEL']
}

renderer = EntityRenderer(options=options)
display(HTML(renderer.render_ents(doc.text, modified_tokens, "")))

## 3. Loading word vectors from fasttext
In order to run the following cell, choose your model in the preferred language from https://fasttext.cc/docs/en/crawl-vectors.html#models and change the path accordingly. 

In [None]:
# Path to the downloaded fastText vectors in text format. Leave it as None to
# skip this cell; set it to your local file to build (or reload) the model.
path_to_cc_XX_300_vec = None  # e.g. "/datasets/text/fasttext/cc.de.300.vec"

if path_to_cc_XX_300_vec is not None:
    try:
        # Reuse the model saved by a previous run of this cell, if there is one.
        nlp = spacy.load("fasttext_model_de")
    except OSError:
        # No saved model yet: build a blank German pipeline and fill its
        # vocabulary with the fastText vectors.
        from spacy.lang.de import German
        nlp = German()

        with open(path_to_cc_XX_300_vec, 'rb') as file_:
            # The header line holds two fields: row count and vector width.
            header = file_.readline()
            nr_row, nr_dim = header.split()
            nr_dim = int(nr_dim)
            nlp.vocab.reset_vectors(width=nr_dim)
            for line in file_:
                line = line.rstrip().decode('utf8')
                # Split from the right so that the word itself may contain spaces.
                pieces = line.rsplit(' ', nr_dim)
                word = pieces[0]
                vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
                nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab

        nlp.to_disk('fasttext_model_de')
    
