# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Sprachübergreifende-Textalignierung" data-toc-modified-id="Sprachübergreifende-Textalignierung-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Sprachübergreifende Textalignierung</a></div><div class="lev1 toc-item"><a href="#Manuales-(sprachübergreifende-Alignierung)" data-toc-modified-id="Manuales-(sprachübergreifende-Alignierung)-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Manuales (sprachübergreifende Alignierung)</a></div><div class="lev2 toc-item"><a href="#LXML" data-toc-modified-id="LXML-21"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>LXML</a></div>

# Sprachübergreifende Textalignierung

Blockseminar Studiengang "Digitale Methodik in den Geistes- und Kulturwissenschaften" (18.1.2020, 8.2.2020, 15.2.2020)




In [None]:
import os

in_dir = "./data/constitutions/"

# we create a dictionary with our constitutions:
sources = {}

for file in sorted(os.listdir(in_dir)):
    key = os.path.basename(file).split(os.extsep)[0]
    with open(in_dir + '/' + file, encoding="utf-8") as f:
        sources[key] = f.read()

# and a list of available constitutions for quick lookup:
constitutions = list(sources.keys())

print ("{} files read:".format(len(constitutions)))
print (constitutions)

In [None]:
from nltk import tokenize
# nltk.download('punkt')

sentences = {}
nos = {}
for c in constitutions:
    t = tokenize.sent_tokenize(sources[c])
    nos[c] = len(t)
    for i, s in enumerate(t):
        sentences[c + '_' + str(i)] = s

boundary = len(sentences) - nos['1948_-_BU_-_Burma_-_constitution_of_burma']
print("Corpus has {} sentences.".format(len(sentences)))
print("1948_-_BU_-_Burma_-_constitution_of_burma has {}.\n".format(nos['1948_-_BU_-_Burma_-_constitution_of_burma']))

print("Its first 3 sentences are:\n{}".format([sentences['1948_-_BU_-_Burma_-_constitution_of_burma_0'],\
                                              sentences['1948_-_BU_-_Burma_-_constitution_of_burma_1'],\
                                              sentences['1948_-_BU_-_Burma_-_constitution_of_burma_2']]))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='word', strip_accents='unicode', stop_words=["the", "of", "and"])
dfm = vectorizer.fit_transform(sentences.values())

print(dfm.shape)
print(type(dfm))
print(dfm.toarray())

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

target = dfm[boundary:]
sources = dfm[:boundary,]
print(target.shape)
print(sources.shape)

simils = cosine_similarity(target, sources)
print(simils.shape)

In [None]:
import numpy as np

np.argmax(simils, axis=1)

# Manuales (sprachübergreifende Alignierung)

In [None]:
import os

in_dir = "./data/manual/"

# we create a dictionary with our manuales:
sources = {}

for file in sorted(os.listdir(in_dir)):
    key = os.path.basename(file).split(os.extsep)[0]
    with open(in_dir + '/' + file, encoding="utf-8") as f:
        sources[key] = f.read()

# and a list of available constitutions for quick lookup:
manuales = list(sources.keys())

print ("{} files read:".format(len(manuales)))
print (manuales)

In [None]:
from nltk import tokenize
# nltk.download('punkt')

sentences = {}
nos = {}
for c in manuales:
    t = tokenize.sent_tokenize(sources[c])
    nos[c] = len(t)
    for i, s in enumerate(t):
        sentences[c + '_' + str(i)] = s

print("Corpus has {} sentences.".format(len(sentences)))
print("azp1552_ch17 {}.\n".format(nos['azp1552_ch17']))

print("Its first 3 sentences are:\n{}".format([sentences['azp1552_ch17_2'],\
                                              sentences['azp1552_ch17_3'],\
                                              sentences['azp1552_ch17_4']]))

## LXML

In [2]:
import lxml
from lxml import etree
import glob
import re
import os

in_dir = "./data/manual/"

sources = glob.glob(in_dir + '*.xml')

parsed = { os.path.basename(file).split(os.extsep)[0] :
                 (etree.parse(file))
                     for file in sorted(sources)
         }

manuales = list(parsed.keys())
print (manuales)

nsmap = {"tei": "http://www.tei-c.org/ns/1.0"}

def flatten(element):
    t = ""
    # Dagger milestones
    if element.get("rendition")=="#dagger":
        t += "†"
        if element.tail:
            t += str.replace(element.tail, "\n", " ")
    # asterisk milestones (additions in the 1556 ed.) - create temporary marker
    elif element.get("rendition")=="#asterisk":
        t += "*"
        if element.tail:
            t += str.replace(element.tail, "\n", " ")
    # Unanchored milestones - create temporary marker
    elif element.get("rendition")=="#unanchored":
        t += "‡"
        if element.tail:
            t += str.replace(element.tail, "\n", " ")
    else:
        for c in element.iter("expan"):
            flatten(c)
            if element.tail:
                t += str.replace(element.tail, "\n", " ")
        for c in element.iter("corr"):
            flatten(c)
            if element.tail:
                t += str.replace(element.tail, "\n", " ")
        if element.text:
            t += str.replace(element.text, "\n", " ")
        if element.getchildren():
            t += " ".join((flatten(child)) for child in element.getchildren())
        if element.tail:
            t += str.replace(element.tail, "\n", " ")

    return t

xp_divs = etree.XPath("(//tei:body/tei:div[@type = 'chapter'][not(@n = '0')])", namespaces = nsmap)

divs = {}
text = {}

for ed in manuales:
    t1 = ""
    divs[ed] = xp_divs(parsed[ed])
    t1  = "".join("++div--" + re.sub('\s+', ' ', '<p>' + flatten(div)) for div in divs[ed])
    t2  = re.sub(r'¶', '++break--¶',                       t1)       # where pilcrow signs are
    t3  = re.sub(r'([:\.\?\]])\s+([A-Z])(?!([CIJLVX]+|.)?\.)(?![^†‡*]{0,80}[:\.\?\]][^a-z]*[A-Z])(?=.{0,80}[†‡*])',
                     r'\1 ++break-- \2',                   t2)       # sentences beginning
                                                                     # with punctuation, whitespace, and a
                                                                     # capital letter (not immediately followed by
                                                                     # an abbreviation period)
                                                                     # and a milestone follows within 80 characters
                                                                     # (that do not contain a punctuation character)
    t4  = re.sub(r'\b([A-Z]{2}\s*[a-z])', r'++break-- \1', t3)       # two capital letters
    t5  = t4[::-1]                                                   # reverse the string
    t6  = re.sub(r'([†‡*])(?!.{0,100}--kaerb)', r'\1--kaerb++', t5)  # daggers without sentence boundaries, i.e. not covered above
    t7  = t6[::-1]                                                   # reverse the string
    t8  = re.sub(r'‡', '',                                 t7)       # Eliminate temporary markers: unanchored milestones

    # Concat everything and do a final removal of redundant breaks.
    t9 = re.sub(r'\+\+break--\s*\+\+break--', '++break--', " ".join(t8.strip().split()))
    
    t10 = re.sub(r'\+\+break--', r'<milestone type="lera-segment"/>', t9)
    t11 = re.sub(r'\+\+div--', r'</div><div type="chapter">', t10)
    text[ed] = '<root>' + re.sub(r'&', '&amp;', t11)[6:] + '</div></root>'


print("text['azp1552_ch17'] is:\n{}...".format(text['azp1552_ch17'][:400]))

['azp1552_ch17', 'azp1556_ch17', 'azp1573_ch17']
text['azp1552_ch17'] is:
<root><div type="chapter"><p> <milestone type="lera-segment"/>¶ Do ſeptimo mandamento. N ão furtaras. Capit. xvi j̈. <milestone type="lera-segment"/> PEra fundamento † das preguntas de ſte mãdam ẽto mandamento di ʒemos. Ho pri meyro que ha hi furto m ẽtal, ⁊ fur to real. Ho m ẽtal he võtade vontade de co meter ho real. Eho real he ſeg ũdo segundo Paulo l. 1. ff. đ fur. .§. 1. In ſtit. de obliga...


In [8]:
sentences = {}
nos = {}
for ed in manuales:
    sentences[ed] = {}
    segments = text[ed].split('<milestone type="lera-segment"/>')
    nos[ed] = len(segments)
    for i, s in enumerate(segments):
        sentences[ed][ed + '_' + str(i)] = s.strip()

print("Corpus has {} sentences.".format(len(sentences)))
print("azp1552_ch17 has {}.\n".format(nos['azp1552_ch17']))

print("Its first 5 sentences are:\n{}".format([sentences['azp1552_ch17']['azp1552_ch17_0'],\
                                               sentences['azp1552_ch17']['azp1552_ch17_1'],\
                                               sentences['azp1552_ch17']['azp1552_ch17_2'],\
                                               sentences['azp1552_ch17']['azp1552_ch17_3'],\
                                               sentences['azp1552_ch17']['azp1552_ch17_4']]))

Corpus has 3 sentences.
azp1552_ch17 has 387.

Its first 5 sentences are:
['<root><div type="chapter"><p>', '¶ Do ſeptimo mandamento. N ão furtaras. Capit. xvi j̈.', 'PEra fundamento † das preguntas de ſte mãdam ẽto mandamento di ʒemos. Ho pri meyro que ha hi furto m ẽtal, ⁊ fur to real. Ho m ẽtal he võtade vontade de co meter ho real. Eho real he ſeg ũdo segundo Paulo l. 1. ff. đ fur. .§. 1. In ſtit. de obligat. qu æ ex de li. na ſc. contrata ç ã, ou tratam ẽto engano ſa do alheo cõtra contra võtade vontade do ſe ñor, pera auer a ꝓpriedade, ou po ſ ſi ſ ſam, ou ho v ſo della. Di ſ ſemos (c õtrata ç ã) porq̃ porque ſem ella n ão ha furto real in. d. author="bragagnolo" timestamp="20190408T191343+0200" comment="check" : ainda q̃ que ho ha metal. Di ſ ſemos (do alheo) por q̃ ho tratam ẽto do ſeu, em q̃nto quanto ho he, ou cõ com re ʒ ão cree q̃ que he ſeu, n ão he furto. l. Inter o ẽs .§. Recte. ff. de furt. Acrec ẽtamos ( cõtra contra a võtade vontade do ſe ñor) porq̃ porque ſend

Save sentences as plaintext files.

In [9]:
import csv

for ed in manuales:
    with open('./data/manual/' + ed + '_seg.csv', 'w', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, lineterminator="\n")
        for key, value in sentences[ed].items():
           writer.writerow([key, value])