In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from IPython.core.display import display, HTML
import standoffconverter

# DH application with spacy

## Table of Contents
1. Custom Spacy attributes

    1. Document level attributes 
        Dataset: Wilhelmus
    2. Document level attributes (optional)
        Dataset: Deutsches Textarchiv
    3. Span level attributes 
        Dataset: Berliner Intellektuelle
    4. Span level attributes (optional)
        Dataset: Wilhelmus
    
2. Text categorization
    1. Authorship attribution 
        Dataset: Wilhelmus    
3. Attribute visualization with Displacy
    1. Document variants
        Dataset: Berliner Intellektuelle

4. Multi-lingual Word Representations (optional)
    

In [None]:
from tei_dataloader import dta_loader, bi_loader, wilhelmus_loader
from tei_dataloader import extract_text_versions_from_etree as extract_versions
from textcat_utils import split_train_test, get_textcat, evaluate
import numpy as np
from lxml import etree
import spacy
from spacy.util import minibatch, compounding

# Namespace prefix map passed as `namespaces=` to the XPath / findall queries on TEI documents.
spec = {"tei":"http://www.tei-c.org/ns/1.0"}

# German and Dutch pipelines with parser, tagger and NER disabled.
# NOTE(review): the "de" / "nl" shortcut names presumably require spaCy 2.x shortcut links — confirm the installed version.
nlp_de = spacy.load("de", disable=['parser', 'tagger', 'ner'])
nlp_nl = spacy.load("nl", disable=['parser', 'tagger', 'ner'])

## 1. Custom Spacy attributes

This section demonstrates the use of the custom attribute namespace `._.`. In DH, corpora are often encoded in XML (TEI). When a text is parsed with spaCy, its metadata is not automatically preserved. The custom namespace can represent any TEI metadata inside the spaCy objects.

### 1.A Document level attributes 
How this works is demonstrated using the Wilhelmus data set from the DH2019 Authorship Attribution challenge
(https://dh2019.adho.org/wilhelmus-challenge/).
The task is to predict the author of Dutch songs. Thus, for each song, we would like to have an `author` attribute.

In [None]:
# Register a document-level custom attribute `author` (read via `doc._.author`, None by default).
# force=True overwrites an existing registration, so this cell can be re-run safely.
spacy.tokens.Doc.set_extension('author', default=None, force=True)

We start by loading the text of the XML files and the author from the training set of the corpus. In our simplified showcase, we only focus on the authors that are mentioned in the challenge description.

In [None]:
# Authors to keep: the loading loops only process documents whose author label is in this list.
chosen_authors = [
    "Johan Fruytiers",
    "D.V. Coornhert",
    "Philips van Marnix van Sint Aldegonde",
    "Pieter Datheen"
]

In [None]:
def get_author(tree):
    """Return the text of the document's single ``author`` element.

    ``tree`` must offer an ``xpath`` method. If the query ``.//author`` does
    not match exactly one element, a message is printed and ``None`` is
    returned.
    """
    matches = tree.xpath(".//author")
    if len(matches) != 1:
        print("Author tag not found")
        return None
    return matches[0].text

def get_text(tree):
    """Return the stripped plain text of the document's single ``text`` element.

    All text fragments below the element are concatenated. If the query
    ``.//text`` does not match exactly one element, a message is printed and
    ``None`` is returned.
    """
    matches = tree.xpath(".//text")
    if len(matches) != 1:
        print("Text tag not found")
        return None
    fragments = matches[0].itertext()
    return ''.join(fragments).strip()

# Parse every song by one of the chosen authors and attach the author label to the Doc.
wilhelmus_docs = []

for wilhelmus_xml_tree in wilhelmus_loader():
    author_label = get_author(wilhelmus_xml_tree)
    if author_label not in chosen_authors:
        continue

    text = get_text(wilhelmus_xml_tree)
    if text is None:
        # get_text has already reported the problem; nlp_nl(None) would raise.
        continue

    doc = nlp_nl(text)
    doc._.set("author", author_label)
    wilhelmus_docs.append(doc)

In [None]:
len(wilhelmus_docs)

In [None]:
# Inspect the first loaded song: its custom author attribute and the first 200 characters of its text.
sample_doc = wilhelmus_docs[0]
print(sample_doc._.author)
print(sample_doc.text[:200])

### 1.B Document level attributes (optional)
The Deutsches Textarchiv example is very similar to the Wilhelmus example; in this case we use the GND reference stored in the `ref` attribute of the author's `persName` element, if it is present.

In [None]:
MAX_DTA_DOCS = 6  # a handful of documents is enough for the demonstration

dta_docs = []

for dta_xml_tree in dta_loader():
    if len(dta_docs) >= MAX_DTA_DOCS:
        break

    # Author label: the `ref` attribute of the first <persName> inside <author>
    # that carries one, or "anonymous" if there is none.
    authors = dta_xml_tree.xpath(".//tei:author/tei:persName", namespaces=spec)
    author_gnd = next(
        (author.attrib["ref"] for author in authors if "ref" in author.attrib),
        "anonymous",
    )

    # Retrieve the plain text of all <body> elements.
    text = ''.join(
        ''.join(body.itertext()).strip()
        for body in dta_xml_tree.findall(".//tei:body", namespaces=spec)
    )

    if len(text) > nlp_de.max_length:
        # Too long for the pipeline. Skip the file instead of re-using the
        # `doc` of the previous iteration (which would be stored a second
        # time under the wrong author).
        print("Skipping document: text is longer than nlp_de.max_length")
        continue

    doc = nlp_de(text)
    doc._.set("author", author_gnd)
    dta_docs.append(doc)

In [None]:
# Inspect the fifth loaded DTA document: its author label and the first 200 characters of its text.
sample_doc = dta_docs[4]
print(sample_doc._.author)
print(sample_doc.text[:200])

### 1.C Span level attributes 
In this example we consider the edition Berliner Intellektuelle, which encompasses letters and manuscripts by a group of people who influenced intellectual Berlin around 1800. The letters contain many alterations that have been encoded very accurately in the TEI with `<add>` and `<del>`. We introduce four custom attributes on spaCy tokens and spans that encode the initial and the "final" version of the document.

In [None]:
from spacy.tokens import Token, Span

# Per-character masks of a token, filled in when a document is loaded.
# NOTE(review): presumably boolean, one entry per character, telling whether the
# character belongs to the initial / final version — confirm against extract_versions.
Token.set_extension('initial', default=list(), force=True)
Token.set_extension('final', default=list(), force=True)


def _token_has_been_modified(token):
    """Return True if the token's `initial` and `final` masks differ anywhere.

    The masks are coerced to boolean arrays first, so tokens that still hold
    the default empty lists count as unmodified instead of raising a TypeError
    (`list ^ list` is not defined).
    """
    initial = np.asarray(token._.initial, dtype=bool)
    final = np.asarray(token._.final, dtype=bool)
    return bool(np.logical_xor(initial, final).any())


def _span_has_been_modified(span):
    """Return True if at least one token of the span has been modified."""
    return any(t._.has_been_modified() for t in span)


# Registered as method extensions: call them as `token._.has_been_modified()`.
Token.set_extension('has_been_modified', method=_token_has_been_modified, force=True)
Span.set_extension('has_been_modified', method=_span_has_been_modified, force=True)

In [None]:
# Parse every <body> of the Berliner Intellektuelle files and store, per token,
# the slice of the initial / final masks that covers the token's characters.
bi_docs = []

for bi_doc in bi_loader():
    for body in bi_doc.findall(".//tei:body", namespaces=spec):
        doc, in_init, in_final = extract_versions(body)
        if len(doc) > nlp_de.max_length:
            continue  # text is too long for the pipeline

        spacified = nlp_de(doc)
        for token in spacified:
            char_range = slice(token.idx, token.idx + len(token))
            token._.initial = np.array(in_init[char_range])
            token._.final = np.array(in_final[char_range])
        bi_docs.append(spacified)

In [None]:
# Find the first alphabetic token of the first document that has been modified.
# `token` stays bound after the loop and is inspected in the following cells;
# if no token matches, it simply ends up as the last token of the document.
for token in bi_docs[0]:
    if token.is_alpha and token._.has_been_modified():
        break

In [None]:
token

In [None]:
token._.has_been_modified()

In [None]:
token._.initial

In [None]:
token._.final

### 1.D Span level attributes (optional)
In this example, we show how to incorporate the verse meter as custom information into the lines of the songs of the Wilhelmus challenge.

In [None]:
# Span-level custom attribute holding the meter of a verse line (None if unknown).
spacy.tokens.Span.set_extension('verse_meter', default=None, force=True)

# `get_author` from section 1.A is reused here; it is not defined a second time.

lines = []  # one list of verse-line Spans per song
for wilhelmus_xml_tree in wilhelmus_loader():

    author_label = get_author(wilhelmus_xml_tree)
    if author_label not in chosen_authors:
        continue

    text = wilhelmus_xml_tree.xpath(".//text")[0]
    plain, standoffs = standoffconverter.tree_to_standoff(text)
    doc = nlp_nl(plain)

    # Map every character offset that lies inside a token to the index of that token.
    # NOTE(review): offsets that fall outside every token (e.g. the very end of the
    # text) are not in this map and would raise a KeyError below — confirm that the
    # standoff offsets always hit a token.
    tokenidx2i = {t.idx + tchar: t.i for t in doc for tchar in range(len(t.text))}

    song_lines = []
    for standoff in standoffs:
        if standoff["tag"] != "l":
            continue
        verse = doc[tokenidx2i[standoff["begin"]]:tokenidx2i[standoff["end"]]]
        attrib = standoff["attrib"]
        verse._.verse_meter = attrib["met"] if "met" in attrib else None
        song_lines.append(verse)
    lines.append(song_lines)

In [None]:
# Show the first 20 verse lines of the fifth song together with their meter annotation.
lines_example = lines[4]

for line in lines_example[:20]:
    print(line, line._.verse_meter)

## 2. Text categorization

### 2.A Authorship attribution
In this section, the `textcat` pipeline component is introduced to classify documents. We choose the above subset of the Wilhelmus data set for illustration purposes and then classify the Wilhelmus itself using the resulting model.

In [None]:
textcat = get_textcat(nlp_nl)
# One label per author present in the corpus. Sorted so that the labels are
# registered in the same order on every run; the iteration order of a set of
# strings is not stable across interpreter sessions.
for author in sorted({d._.author for d in wilhelmus_docs}):
    textcat.add_label(author)

In [None]:
def train(train_data, test_data, nlp, textcat, n_iter=10, drop=0.2):
    """Train the text categorizer and print loss and scores after every epoch.

    Args:
        train_data: list of ``(text, annotations)`` pairs, where ``annotations``
            is a dict such as ``{'cats': {...}}``.
        test_data: tuple that is unpacked into ``evaluate`` after the tokenizer
            and the text categorizer.
        nlp: the pipeline that is updated.
        textcat: the text categorizer component whose model is evaluated.
        n_iter: number of passes over ``train_data`` (default 10).
        drop: dropout rate passed to ``nlp.update`` (default 0.2).
    """
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for _ in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=drop,
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the held-out data with the averaged parameters
                scores = evaluate(nlp.tokenizer, textcat, *test_data)
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

            
X_train, X_test, y_train, y_test = split_train_test(wilhelmus_docs)

# Pair every training document with its categories in the {'cats': ...} annotation format.
train_examples = [(sample, {'cats': cats}) for sample, cats in zip(X_train, y_train)]

train(train_examples, (X_test, y_test), nlp_nl, textcat)

In [None]:
wilhelminus5135 = 'Wilhelmus van Nassouwe\n        Ben ick van Duytschen bloet,\n        Den Vaderlant ghetrouwe\n        Blijf ick tot inden doot:\n        Een Prince van Oraengien\n        Ben ick vrij onverveert,\n        Den Coninck van Hispaengien\n        Heb ick altijt gheeert.\n      \n      \n        In Godes vrees te leven\n        Heb ick altijt betracht,\n        Daerom ben ick verdreven\n        Om Landt om Luyd ghebracht:\n        Maer Godt sal my regeren\n        Als een goet Instrument,\n        Dat ick sal wederkeeren\n        In mijnen Regiment.\n      \n      \n        Lijdt u mijn Ondersaten\n        Die oprecht zijn van aert,\n        Godt sal u niet verlaten,\n        Al zijt ghy nu beswaert:\n        Die vroom begheert te leven\n        Bidt Godt nacht ende dach,\n        Dat hy my cracht wil gheven\n        Dat ick u helpen mach.\n      \n      \n        Lijf en goet al te samen\n        Heb ick u niet verschoont,\n        Mijn Broeders hooch van Namen\n        Hebbent u oock vertoont:\n        Graef Adolff is ghebleven,\n        In Vrieslandt in den Slach,\n        Zijn Siel int eewich Leven\n        Verwacht den Jongsten dach.\n      \n      \n        Edel en Hooch gheboren\n        Van Keyserlicken Stam:\n        Een Vorst des Rijcks vercoren\n        Als een vroom Christen Man,\n        Voor Godes Woort ghepreesen,\n        Heb ick vrij onversaecht,\n        Als een Helt sonder vreesen\n        Mijn Edel bloet ghewaecht.\n      \n      \n        Mijn Schilt ende betrouwen\n        Sijt ghy, o Godt mijn Heer,\n        Op u soo wil ick bouwen\n        Verlaet my nemmermeer:\n        Dat ick doch vroom mach blijven\n        U dienaer taller stondt\n        Die Tyranny verdrijven,\n        Die my mijn hert doorwondt.\n      \n      \n        Van al die my beswaren,\n        End mijn Vervolghers zijn,\n        Mijn Godt wilt doch bewaren\n        Den trouwen dienaer dijn:\n        Dat sy my niet verrasschen\n        In haren boosen moet,\n    
    Haer handen niet en wasschen\n        In mijn onschuldich bloet.\n      \n      \n        Als David moeste vluchten\n        Voor Saul den Tyran:\n        Soo heb ick moeten suchten\n        Met menich Edelman:\n        Maer Godt heeft hem verheven,\n        Verlost uut alder noot,\n        Een Coninckrijck ghegheven\n        In Israel seer groot.\n      \n      \n        Nae tsuer sal ick ontfanghen:\n        Van Godt mijn Heer dat soet,\n        Daer na so doet verlanghen\n        Mijn Vorstelick ghemoet,\n        Dat is dat ick mach sterven\n        Met eeren in dat Velt,\n        Een eewich Rijck verwerven\n        Als een ghetrouwe Helt.\n      \n      \n        Niet doet my meer erbarmen\n        In mijnen wederspoet,\n        Dan datmen siet verarmen\n        Des Conincks Landen goet,\n        Dat u de Spaengiaerts crencken\n        O Edel Neerlandt soet,\n        Als ick daer aen ghedencke\n        Mijn Edel hert dat bloet.\n      \n      \n        Als een Prins op gheseten\n        Met mijner Heyres cracht,\n        Vanden Tyran vermeten\n        Heb ick den Slach verwacht,\n        Die by Maestricht begraven\n        Bevreesde mijn ghewelt,\n        Mijn Ruyters sachmen draven\n        Seer moedich door dat Velt.\n      \n      \n        Soo het den wille des Heeren\n        Op die tijt had gheweest,\n        Had ick gheern willen keeren\n        Van u dit swaer tempeest:\n        Maer de Heer van hier boven\n        Die alle dinck regeert,\n        Diemen altijt moet loven\n        En heeftet niet begheert.\n      \n      \n        Seer Prinslick was ghedreven\n        Mijn Princelick ghemoet,\n        Stantvastich is ghebleven\n        Mijn hert in teghenspoet,\n        Den Heer heb ick ghebeden\n        Van mijnes herten gront,\n        Dat hy mijn saeck wil reden,\n        Mijn onschult doen bekant.\n      \n      \n        Oorlof mijn arme Schapen\n        Die zijt in grooten noot,\n        U Herder sal niet slapen\n        Al zijt ghy nu 
verstroyt:\n        Tot Godt wilt u begheven,\n        Sijn heylsaem Woort neemt aen,\n        Als vrome Christen leven,\n        Tsal hier haest zijn ghedaan.\n      \n      \n        Voor Godt wil ick belijden\n        End zijner grooter Macht,\n        Dat ick tot gheenen tijden\n        Den Coninck heb veracht:\n        Dan dat ick Godt den Heere\n        Der Hoochster Majesteyt,\n        Heb moeten obedieren,\n        Inder gherechticheyt.'
print(wilhelminus5135[:200] , "...")



In [None]:
# Run the trained categorizer on the Wilhelmus and print one score per author label.
wilhelminus5135_doc = nlp_nl(wilhelminus5135)
prediction = textcat(wilhelminus5135_doc)
for label, score in prediction.cats.items():
    print(label, format(score, "0.4f"))

## 3. Attribute visualization with Displacy

### 3.A Document variants
We already extracted the document variants in 1.C. Now we would like to visualize them with displacy.

In [None]:
from spacy.displacy.render import EntityRenderer

sample_doc = bi_docs[3]

# Collect one highlight span per modified, non-whitespace token: "ADD" if some
# character is only in the final version, "DEL" if some character is only in
# the initial version. A token can receive both labels.
modified_tokens = []
for token in sample_doc:
    if not token._.has_been_modified() or not token.text.strip():
        continue

    start, end = token.idx, token.idx + len(token)
    if np.any(np.logical_and(~token._.initial, token._.final)):
        modified_tokens.append({"label": "ADD", "start": start, "end": end})
    if np.any(np.logical_and(token._.initial, ~token._.final)):
        modified_tokens.append({"label": "DEL", "start": start, "end": end})

options = {
    'colors': {'ADD': '#2ca02c', 'DEL': '#d62728'},
    'ents': ['ADD', 'DEL']
}

renderer = EntityRenderer(options=options)
# Render the text of `sample_doc` itself: the character offsets collected above
# refer to this document, not to whatever `doc` was left over from an earlier cell.
display(HTML(renderer.render_ents(sample_doc.text, modified_tokens, "")))

## 4. Multi-lingual word representations (optional)
fastText provides word vectors for 157 different languages (https://fasttext.cc/docs/en/crawl-vectors.html ). In this example, we show how to load the vectors for Dutch and store them as a spaCy model.

In [None]:
path_to_cc_XX_300_vec = "downloads/cc.nl.300.vec"

# Re-use the converted model if it is already on disk; otherwise build it from
# the fastText .vec file and save it.
try:
    nlp = spacy.load("fasttext_model_nl")
except OSError:
    from spacy.lang.nl import Dutch

    nlp = Dutch()
    with open(path_to_cc_XX_300_vec, 'rb') as vec_file:
        # The first line holds two numbers; the second one is the vector width.
        n_rows, n_dims = vec_file.readline().split()
        width = int(n_dims)
        nlp.vocab.reset_vectors(width=width)
        for raw_line in vec_file:
            # Split from the right so that only the last `width` fields are values.
            word, *values = raw_line.rstrip().decode('utf8').rsplit(' ', width)
            vector = np.asarray([float(v) for v in values], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab

    nlp.to_disk('fasttext_model_nl')
    