# Description 
This notebook transforms the Reference Corpus of Middle High German (REM) from its XML files into a single table of texts with a coarse category for each text.

# Imports and settings

In [1]:
from lxml import etree
import os
import pandas as pd
import re
import json

In [2]:
from formutils import *

In [3]:
from tqdm.notebook import tqdm
tqdm.pandas()

Generate paths and transform .xml to dataframe

In [4]:
# Directory that is scanned (non-recursively) for the corpus files.
directory_path = "../corpora/REM/rem-corralled-20161222"
# Only files whose name ends with this suffix are picked up.
file_extension = ".xml"

In [5]:
# Collect every regular file directly inside `directory_path` whose name ends
# with `file_extension`.
# os.scandir yields entries in arbitrary (filesystem-dependent) order, so the
# result is sorted to make the row order of everything derived from `paths`
# reproducible across machines and runs.
paths = sorted(
    entry.path
    for entry in os.scandir(directory_path)
    if entry.is_file() and entry.name.endswith(file_extension)
)

In [6]:
def get_xml_token(token):
    """Return the text to use for a single ``<token>`` element.

    The value is looked up on the first ``<tok_anno>`` child of ``token``:

    * the ``tag`` attribute of its ``<norm>`` child, when that child exists
      and carries a ``tag`` attribute (author's note: normalized simplified
      MHG);
    * otherwise the ``ascii`` attribute of ``<tok_anno>`` itself (author's
      note: ascii chars).

    Parameters
    ----------
    token : element
        An XML element offering ``find()``; its children offer ``attrib``.

    Returns
    -------
    str or list
        The selected attribute value, or an empty list when ``token`` has no
        ``<tok_anno>`` child.

    Raises
    ------
    KeyError
        If the ``ascii`` fallback is needed but ``<tok_anno>`` has no
        ``ascii`` attribute.
    """
    annotation = token.find("tok_anno")
    if annotation is None:
        # No annotation layer at all: signal "nothing" with an empty list.
        return []

    normalized = annotation.find("norm")
    if normalized is None or "tag" not in normalized.attrib:
        # No usable normalization, fall back to the ascii rendering.
        return annotation.attrib["ascii"]
    return normalized.attrib["tag"]

In [7]:
# Parse every corpus file into one record (dict) and collect the records in a
# DataFrame. Each xpath query returns a list, so the metadata fields are lists
# at this stage.
contents = []
for file in tqdm(paths):
    with open(file, "rb") as f:
        root = etree.parse(f).getroot()
        tokens = root.findall(".//token")
        content = {
            "id": root.xpath("/text/@id", smart_strings=False),
            "name": root.xpath("/text/header/text/text()", smart_strings=False),
            "text_type": root.xpath("/text/header/text-type/text()", smart_strings=False),
            "tokens_anno_ascii": root.xpath("/text/token/tok_anno/@ascii", smart_strings=False),
            "tokens_anno_norm": root.xpath("/text/token/tok_anno/norm/@tag", smart_strings=False)
        }
        token_texts = (get_xml_token(token) for token in tokens)
        # Keep only real strings: get_xml_token returns an empty list for a
        # token without <tok_anno>, and a single non-string item would make
        # the later `.str.join` return NaN for the whole document.
        # Token texts containing "--" are dropped as before
        # (presumably the corpus marker for "no normalized form" - confirm).
        content["text"] = [
            text
            for text in token_texts
            if isinstance(text, str) and "--" not in text
        ]
        contents.append(content)
        # Release the parsed tree before moving on to the next file.
        root.clear()

contents_full = pd.DataFrame(contents)

  0%|          | 0/398 [00:00<?, ?it/s]

Build strings from tokens

In [8]:
# A period that stands alone as its own token (" . "). The dot must be
# escaped: an unescaped "." matches ANY character, which would replace every
# single-character token with a period.
lone_period = re.compile(r" \. ")

# For each token-list column, add a "<col>_as_string" column holding the tokens
# joined by single spaces, with free-standing periods attached to the
# preceding token.
for col in ["tokens_anno_ascii", "tokens_anno_norm", "text"]:
    col_str = f"{col}_as_string"
    joined = contents_full[col].str.join(sep=" ")
    # str(x) keeps the original handling of missing values (they become "nan").
    contents_full[col_str] = [lone_period.sub(". ", str(x)) for x in joined]

Filter, explode, and rename

In [9]:
# Keep only identifier, title, text type and the joined text, and pass them
# through explode_columns (comes from the star import of formutils).
# NOTE(review): explode_columns is not defined in this notebook; presumably it
# unwraps the list-valued cells produced by the xpath queries - confirm.
df_ref = explode_columns(contents_full[["id", "name", "text_type", "text_as_string"]])
# Positional rename ("text_as_string" -> "text"); assumes explode_columns
# returns exactly these four columns in the same order - TODO confirm.
df_ref.columns = ["id", "name", "text_type", "text"]

Charters contained in the REM are excluded entirely, for two reasons: they are part of text collections, and there is no quick and reliable way to tell whether they duplicate charters in my main corpus.

In [10]:
# Flag every row whose text type contains the literal substring "Urkunde"
# (German for charter) and keep an independent copy of all other rows.
is_charter = df_ref["text_type"].astype(str).str.contains("Urkunde", regex=False)
df_ref = df_ref.loc[~is_charter].copy()

In [11]:
# Show the complete, untruncated frequency table of the raw text types.
text_type_counts = df_ref["text_type"].value_counts()
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    display(text_type_counts)

Predigt                                               35
Segen                                                 30
Bibeldichtung                                         29
Gebet                                                 19
-                                                     19
Heiligenlegende                                       18
Mariendichtung                                        16
Legenden- und Geschichtsdichtung                      16
Psalmenübersetzung                                    14
Lehrdichtung                                          11
Credo, Beichte                                        10
höfischer Roman                                        8
Gebetsanweisung                                        7
Ordensregel                                            7
Antikenroman                                           6
frühhöfische Epik                                      5
Bibelübersetzung                                       5
Arzneibuch                     

This makes it clear that the categories need to be consolidated into fewer, broader groups.

Categorize and translate texts


In [12]:
# Load the two lookup tables used to categorize the texts:
#   category_mapping     - applied to the raw text type
#   translations_mapping - applied to the result of category_mapping
# The encoding is given explicitly: without it open() uses the platform
# default, and the German text-type keys (umlauts) would be decoded
# incorrectly on systems whose default is not UTF-8, breaking the lookups.
with open("../data-push/0a-transform-reference/category-mapping.json", "r", encoding="utf-8") as f:
    category_mapping = json.load(f)
with open("../data-push/0a-transform-reference/translation-mapping.json", "r", encoding="utf-8") as f:
    translations_mapping = json.load(f)

Restructure and rename

In [13]:
# Derive the final category in two lookups (raw text type -> category_mapping
# -> translations_mapping) and drop the raw text type afterwards. Values
# missing from either mapping end up as NaN.
df_ref = (
    df_ref
    .assign(
        category=df_ref["text_type"]
        .map(category_mapping)
        .map(translations_mapping)
    )
    .drop(columns=["text_type"])
)

In [14]:
# Show the complete, untruncated frequency table of the final categories.
category_counts = df_ref["category"].value_counts()
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    display(category_counts)

Prayer             74
Miscellaneous      58
Biblical poetry    54
Legend             51
Sermon             40
Didactic poetry    33
Epic               30
Translation        23
Recipe             13
Rhyme               7
Name: category, dtype: int64

# Export

In [15]:
# Write the transformed reference corpus to disk using pandas' default JSON
# settings.
export_path = "../data-push/0a-transform-reference/df-ref.json"
df_ref.to_json(export_path)