# Description
This notebook serves as a part of the pre-processing, mostly filtering charters by language and date.

# Imports and settings

In [1]:
import pandas as pd
import numpy as np
from transformers import pipeline
from datetime import datetime
import langid

In [2]:
from formutils import *

In [3]:
from tqdm.notebook import tqdm
# Register tqdm's `progress_apply` on pandas objects; it is used for the
# row-wise language detection further down.
tqdm.pandas()

# Filter by given metadata
❗DISCLAIMER❗
The full Monasterium.net data is not provided with this repository; skip this section and continue at "Filter by enriched metadata".

In [4]:
# Load the raw charter export (one record per charter).
df = pd.read_json("../data/0-base/all-charters-mom.json")

Explode nested lists, check lengths

In [5]:
# The date columns hold nested lists; flatten them, then verify that the
# number of rows did not change.
date_columns = [
    "cei_date_ATTRIBUTE_value",
    "cei_dateRange_ATTRIBUTE_from",
    "cei_dateRange_ATTRIBUTE_to",
]
df_prep = explode_columns(df, date_columns)
has_same_length(df_prep, df)

True

Sort, drop duplicates

In [6]:
# Order rows by the length of their atom_id so that, among charters sharing
# the same tenor, the one with the longest id is kept (`keep="last"`).
# The intermediate is deliberately NOT called `sorted`: that name would shadow
# the Python builtin for every later cell in the kernel.
charters_by_id_length = df_prep.sort_values(by="atom_id", key=lambda ids: ids.str.len())
nodupl = charters_by_id_length.drop_duplicates(subset="cei_tenor_joined", keep="last")

Filter by tenor length, arbitrary anti-clutter threshold

In [7]:
# Keep only charters whose tenor (as a string) has at least 10 characters.
tenor_lengths = nodupl["cei_tenor_joined"].astype(str).str.len()
with_tenor = nodupl[tenor_lengths >= 10]

Align dates; using workaround filter for Monasterium.net's charter dating convention

In [8]:
def get_cei_date_value(row):
    """Return the first non-missing date found in a charter row.

    The columns are checked in this order: ``cei_date_ATTRIBUTE_value``,
    ``cei_dateRange_ATTRIBUTE_from``, ``cei_dateRange_ATTRIBUTE_to``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) holding the three
            date columns.

    Returns:
        The first value for which ``pd.notna`` is true, or ``None`` when all
        three are missing.
    """
    date_columns = (
        "cei_date_ATTRIBUTE_value",
        "cei_dateRange_ATTRIBUTE_from",
        "cei_dateRange_ATTRIBUTE_to",
    )
    # Read all three columns up front so a missing column always raises.
    candidates = [row[column] for column in date_columns]
    for candidate in candidates:
        if pd.notna(candidate):
            return candidate
    return None


def is_valid_date(date_value):
    """Tell whether a date value is a usable date string.

    A value is accepted only if it is a ``str`` that contains neither
    ``"9999"`` nor ``"010101"``. Both checks are plain substring tests, so a
    string such as ``"13010101"`` is rejected as well.

    Args:
        date_value: Candidate date; anything that is not a ``str`` is invalid.

    Returns:
        ``True`` when the value passes all checks, ``False`` otherwise.
    """
    if not isinstance(date_value, str):
        return False
    if "9999" in date_value:
        return False
    return "010101" not in date_value

In [9]:
# Work on a copy so `with_tenor` itself is left untouched.
df_prep_applied = with_tenor.copy()
joined_dates = df_prep_applied.apply(get_cei_date_value, axis=1)
df_prep_applied["date_joined"] = joined_dates

# Drop rows whose joined date fails the validity check.
has_valid_date = df_prep_applied["date_joined"].apply(is_valid_date)
df_prep_filtered = df_prep_applied.loc[has_valid_date]

Filter by date

In [10]:
# Keep charters dated from "11000000" to "14000000", both bounds inclusive.
# `date_joined` holds strings here, so the comparison is lexicographic.
in_date_range = df_prep_filtered["date_joined"].between("11000000", "14000000")
df_prep_filtered_mhg = df_prep_filtered[in_date_range]
len(df_prep_filtered_mhg)

21776

Filter by language

In [11]:
# Find existing languages
df_prep_filtered_mhg_exploded = explode_columns(df_prep_filtered_mhg, ["cei_lang_MOM"])

# Sort the counts while they are still a Series. Sorting the framed result by
# the column name "cei_lang_MOM" only works on pandas < 2.0; newer versions
# name the counts column "count", which makes that lookup raise a KeyError.
language_counts = df_prep_filtered_mhg_exploded["cei_lang_MOM"].value_counts()
language_counts.sort_values(ascending=False).to_frame()

Unnamed: 0,cei_lang_MOM
Deutsch,9232
Latein,5905
lat.,168
latinsky,38
dt.,21
Altserbisch.,11
Niederdeutsch,11
Ndt.,9
j. łaciński,7
nemški,6


In [12]:
# Select languages and filter
# Spellings of "German" found in the metadata language column.
included_languages = ["Deutsch", "dt.", "deutsch", "nemški", "německy", "nemščina"]

has_included_language = df_prep_filtered_mhg_exploded["cei_lang_MOM"].isin(included_languages)
df_filtered = df_prep_filtered_mhg_exploded[has_included_language]
len(df_filtered)

9262

In [13]:
# Persist the metadata-filtered charters; the next section reloads this file.
df_filtered.to_json("../data-push/0b-preprocessing-main/charters-filtered-simple.json")

# Filter by enriched metadata

Detect languages

In [14]:
# Reload the metadata-filtered charters from disk.
df_filtered = pd.read_json("../data-push/0b-preprocessing-main/charters-filtered-simple.json")

In [15]:
# Load language detection model
model_checkpoint = "../models/mom-langdetect/langdetect_20221121"
pipe = pipeline(task="text-classification", model=model_checkpoint)

In [16]:
def get_detection_result(string):
    """Run the language-detection pipeline on the start of a text.

    Args:
        string: Text to classify; it is converted with ``str`` first.

    Returns:
        Whatever ``pipe`` returns for the first 512 characters of the text.
    """
    snippet = str(string)[:512]
    return pipe(snippet)


def get_detection_label(detection_result):
    """Return the ``"label"`` entry of the first item in a detection result."""
    first_prediction = detection_result[0]
    return first_prediction["label"]


def get_detection_score(detection_result):
    """Return the ``"score"`` entry of the first item in a detection result."""
    first_prediction = detection_result[0]
    return first_prediction["score"]


def detect_language(row):
    """Add the detected language label and score to a charter row.

    Args:
        row: Row-like object with a ``cei_tenor_joined`` entry; it is modified
            in place.

    Returns:
        The same row, with ``language_label`` and ``language_score`` set from
        the detection result for its tenor.
    """
    prediction = get_detection_result(row["cei_tenor_joined"])
    row["language_label"], row["language_score"] = (
        get_detection_label(prediction),
        get_detection_score(prediction),
    )
    return row

Detect and count language label and score

In [17]:
# Classify every charter row by row; `detect_language` adds the
# `language_label` and `language_score` columns.
detected = df_filtered.progress_apply(detect_language, axis=1)

  0%|          | 0/9262 [00:00<?, ?it/s]

In [18]:
# How often each detected language label occurs.
detected["language_label"].value_counts()

fnhd    9191
la        33
ca        12
sv         8
cs         5
pt         4
de         2
fi         2
he         2
mhd        1
ro         1
zh         1
Name: language_label, dtype: int64

Filter by selected languages

In [19]:
# Keep both German labels; a wrong classification between the two is
# resolved by dating.
# The detector emits "mhd" (see the label counts above), not "mhg", so the
# previous list ["fnhd", "mhg"] silently dropped every "mhd" row.
include_languages = ["fnhd", "mhd"]

# Exact label match instead of a regex substring search, so that no label is
# kept merely because it contains one of the codes.
has_included_label = detected["language_label"].astype(str).isin(include_languages)
filtered = detected[has_included_label]
filtered.to_json("../data-push/0b-preprocessing-main/charters-filtered-languages.json")

In [20]:
# Reload the language-filtered charters written by the previous cell.
filtered = pd.read_json("../data-push/0b-preprocessing-main/charters-filtered-languages.json")

In [21]:
# Show the tenor of the charter with the lowest detection score.
lowest_score_row = filtered.sort_values(by="language_score").iloc[0]
lowest_score_row["cei_tenor_joined"]

'Allen, die disen brief sehent oder horent lesen, kúnden wir Johans von Tengen, Walther von der Alten Klingen, Albrecht von Bussnang rittere, Lútolt von Arburg, || alle vier frij herren, Egbrecht von Goldenberg, Gotfrit von Húnaberg, Fridrich von Húnwile, Johans der Giel rittere, Johans Hofmeister von Frowenfeld senger || der stift des thůms ze Costentz, Johans vnd Růd. von Bonstetten gebrůdere, Peter von Ebersperg, Herrnan von Landenberg genant von Werdegg, Vlr. von || Aspermunt der elter, Růdolf von Goldenberg vnd Johans von Seon burger Zúrich vnd veriechen offenlich mit disem brief, als Růd. von Landenberg genant von Werdegg Griffense die burg die statt den se vnd die hofreiti mit lút, gúlt gůt vogteyen gericht twing vnd bann mit aller zů gehoret den edlen wolerbornen herren graf Fridrich, graf Tonat vnd graf Diethelm von Tokkenburg allen drin gebrůdern ze v´nser wegen recht vnd redlich ze koffen geben hat, als die kof brief wisent vnd sagent, die dar v´ber geben vnd besigelt sint, 

Detect languages 2.0

This is done because chances are that the language model does not catch all Latin charters due to its architecture.

In [22]:
# Second language check with langid.py; `norm_probs=True` requests normalised
# probabilities as the confidence value.
identifier = langid.langid.LanguageIdentifier.from_modelstring(langid.langid.model, norm_probs=True)

filtered["language_langid"] = None
filtered["confidence_langid"] = None

# `iterrows()` is a generator without a length, so tqdm needs an explicit
# `total` to show real progress instead of a bare iteration counter.
for index, row in tqdm(filtered.iterrows(), total=len(filtered)):
    language, confidence = identifier.classify(row["cei_tenor_joined"])
    filtered.at[index, "language_langid"] = language
    filtered.at[index, "confidence_langid"] = confidence

# Distinct languages reported by langid, most frequent first.
filtered.language_langid.value_counts().keys().to_list()

0it [00:00, ?it/s]

['de', 'lb', 'la', 'nl']

In [23]:
# Keep only the charters that langid classifies as German ("de").
is_langid_german = filtered["language_langid"] == "de"
final_language_filtered = filtered[is_langid_german]

Check detection scores

In [24]:
# NOTE(review): this rebinding replaces the langid == "de" subset assigned in
# the previous cell, so the final data set is NOT restricted by langid. The
# prose below states that no further filtering is done, so this looks
# intentional -- confirm.
# Take a copy rather than an alias: the "year" column added later would
# otherwise be written into `filtered` as well.
final_language_filtered = filtered.copy()
final_language_filtered.sort_values("language_score", ascending=True)

Unnamed: 0,atom_id,cei_abstract_joined,cei_abstract_foreign,cei_tenor_joined,cei_pTenor,cei_placeName,cei_lang_MOM,cei_date,cei_dateRange,cei_date_ATTRIBUTE_value,cei_dateRange_ATTRIBUTE_from,cei_dateRange_ATTRIBUTE_to,cei_graphic_ATTRIBUTE_url_orig,cei_graphic_ATTRIBUTE_url_copy,date_joined,language_label,language_score,language_langid,confidence_langid
172016,"[tag:www.monasterium.net,2011:/charter/CH-StiA...",Sechzehn Mitschuldner der Herren von Landenber...,[],"Allen, die disen brief sehent oder horent lese...",[],[],Deutsch,[],[5. Dezember 1369],,13691205.0,13691205.0,"[StiASG_13691205_GG2-T6_v.jpg, StiASG_13691205...",[],13691205,fnhd,0.521548,de,1.0
184861,"[tag:www.monasterium.net,2011:/charter/CSGVI/1...",Peter von Diemberg und seine Verwandten verzic...,[],"Allen den, die disen brief an sehent lesent od...",[],[],Deutsch,[],[1. September 1337],,13370901.0,13370901.0,[],[],13370901,fnhd,0.595243,de,1.0
184064,"[tag:www.monasterium.net,2011:/charter/CSGV/1314]",Abt Heinrich von St. Gallen verleiht Eglolf vo...,[],"Item ain lehenbrieflin on ain sigel, wie herr ...",[],[],Deutsch,[],[1314],,13140101.0,13141231.0,[],[],13140101,fnhd,0.818374,de,1.0
173885,"[tag:www.monasterium.net,2011:/charter/CH-StiA...",Ulrich Gamps von Kalcheren räumt dem Kloster S...,[],Ich Vli Gamps von Kalcherren burger ze Veltkil...,[],[],Deutsch,[],[4. Februar 1385],,13850204.0,13850204.0,"[StiASG_13850204_RR2-A3_v.jpg, StiASG_13850204...",[],13850204,fnhd,0.832712,de,1.0
186703,"[tag:www.monasterium.net,2011:/charter/CSGX/13...",Ulrich von Hardegg verleiht an Heinrich Salzma...,[],Ich Vllrich von Hardegg vergich offentlich für...,[],[],Deutsch,[],[21. Juni 1384],,13840621.0,13840621.0,[],[],13840621,fnhd,0.888878,de,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186908,"[tag:www.monasterium.net,2011:/charter/CSGX/13...","Johann Midensun, Bürger zu St. Gallen, quittie...",[],"Allen den, die disen brief ansehent lesent ode...",[],[],Deutsch,[],[4. Oktober 1387],,13871004.0,13871004.0,[],[],13871004,fnhd,0.999971,de,1.0
183726,"[tag:www.monasterium.net,2011:/charter/CSGV/13...",Meister und Pfleger des Spitals Lindau verpfli...,[],Wir ... der maister vnd die pfleger dez spital...,[],[],Deutsch,[],[25. November 1301],,13011125.0,13011125.0,[],[],13011125,fnhd,0.999971,de,1.0
183738,"[tag:www.monasterium.net,2011:/charter/CSGV/13...",Abt Heinrich von St. Gallen verpfändet an Hein...,[],Wir von gottis gnadin abt Heinrich von sante G...,[],[St. Gallen],Deutsch,[],[18. April 1302],,13020418.0,13020418.0,[],[],13020418,fnhd,0.999971,de,1.0
251841,"[tag:www.monasterium.net,2011:/charter/DE-BayH...",Probst Chunrat und der Convent zu Ranshofen ge...,[],Wir Chunrat uon gotes gnaden propst ze Ranshou...,[],[Ranshofen],Deutsch,[],[28. Januar 1292],,12920128.0,12920128.0,[],[],12920128,fnhd,0.999971,de,1.0


Sorting the charters by language_score shows that practically all pieces have a score between 0.80 and 1, which indicates a high certainty regarding the classification.

In [25]:
# Least confident langid classifications first.
final_language_filtered.sort_values(by="confidence_langid")

Unnamed: 0,atom_id,cei_abstract_joined,cei_abstract_foreign,cei_tenor_joined,cei_pTenor,cei_placeName,cei_lang_MOM,cei_date,cei_dateRange,cei_date_ATTRIBUTE_value,cei_dateRange_ATTRIBUTE_from,cei_dateRange_ATTRIBUTE_to,cei_graphic_ATTRIBUTE_url_orig,cei_graphic_ATTRIBUTE_url_copy,date_joined,language_label,language_score,language_langid,confidence_langid
218288,"[tag:www.monasterium.net,2011:/charter/DE-AEK/...",Die Äbtissin Irmgard von Schoeneck verpachtet ...,[],"Wir Irmegart van Schonecgen, van der genaiden ...",[],[Köln],Deutsch,[1384 Februar 21],[],13840221.0,,,"[DE_AEK_MiK_AI_135_001.jpg, DE_AEK_MiK_AI_135_...",[],13840221,fnhd,0.999968,de,0.959485
168004,"[tag:www.monasterium.net,2011:/charter/CH-StaA...","Eglolf von Rorschach beurkundet, dass bei der ...",[],Ich Eglolff von Roschach herr Eglolfs salgen s...,[],[],Deutsch,[],[11. September 1366],,13660911.0,13660911.0,"[StiASG_13660911_MM3-A4_v.jpg, StiASG_13660911...",[],13660911,fnhd,0.999968,de,0.999995
168953,"[tag:www.monasterium.net,2011:/charter/CH-StaA...",Johann von Bodman quittiert der Stadt St. Gall...,[],Ich Hans von Bodmun ritter gesesen ze Kúnseg v...,[],[],Deutsch,[],[undatiert (um 11. November 1385)],,13851111.0,13851111.0,"[StadtASG_13851111_StadtASG-V-5b_v.jpg, StadtA...",[],13851111,fnhd,0.999968,lb,0.999999
185592,"[tag:www.monasterium.net,2011:/charter/CSGVII/...",Herzog Albrecht von Österreich verpflichtet si...,[],"Wir Albr. etc. tun chunt, daz wir vnserm getre...",[],[Waldsee],Deutsch,[],[17. November 1354],,13541117.0,13541117.0,[],[],13541117,fnhd,0.999967,de,1.0
184906,"[tag:www.monasterium.net,2011:/charter/CSGVI/1...","Graf Johann von Habsburg beurkundet, dass Adel...",[],Ich graue Johans von Habspurg kúnden allen den...,[],[],Deutsch,[],[30. März 1339],,13390330.0,13390330.0,[],[],13390330,fnhd,0.999968,de,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123359,"[tag:www.monasterium.net,2011:/charter/AT-StiA...",Propst Wernher von St. Florian verleiht das Ka...,[],Wjr Wernher von Gotes verhengnu°zze Probst des...,[],[St. Florian],Deutsch,[],[15. August 1327],,13270815.0,13270815.0,[],[],13270815,fnhd,0.999967,de,1.0
123364,"[tag:www.monasterium.net,2011:/charter/AT-StiA...","Chunrat von Law, Pfarrer zu St. Marienkirchen ...",[],Ich Chunrat von Law genant Pharrer datz sand M...,[],[o. O.],Deutsch,[],[4. Mai 1327],,13270504.0,13270504.0,[K.._MOM-Bilddateien._~StFlorianjpgweb._~Urkun...,[],13270504,fnhd,0.999968,de,1.0
123366,"[tag:www.monasterium.net,2011:/charter/AT-StiA...",Thomas von Lav und seine Söhne verzichten auf ...,[],Ich thomas von Lav vnd Elspet mein havsvrov vn...,[],[o. O.],Deutsch,[],[4. Mai 1327],,13270504.0,13270504.0,[K.._MOM-Bilddateien._~StFlorianjpgweb._~Urkun...,[],13270504,fnhd,0.999967,de,1.0
123355,"[tag:www.monasterium.net,2011:/charter/AT-StiA...",Friedrich Zwickel gibt eine Wiese in Lugheim z...,[],Ich fridreich zwikchel vergich vnd tvn chvnt a...,[],[o. O.],Deutsch,[],[25. Mai 1326],,13260525.0,13260525.0,[K.._MOM-Bilddateien._~StFlorianjpgweb._~Urkun...,[],13260525,fnhd,0.999964,de,1.0


Using the implementation's probability normalization (see https://github.com/saffsd/langid.py), sorting the charters by confidence_langid shows that practically all pieces have a score between 0.95 and 1, which indicates a high certainty regarding the classification.

For these reasons, no further filtering is done. Further narrowing down the corpus by adding manual restrictions would not be helpful, as it might count as some form of manipulation.
Since some Latin will always be present in the charters, given the development of charter language in the Middle High German (MHG) period, this suffices.

# Finalize and export

In [26]:
def _year_from_date(date):
    """Return the first four digits of a numeric date as an integer."""
    digits = str(int(round(date)))
    return int(digits[:4])


final_language_filtered["year"] = final_language_filtered["date_joined"].apply(_year_from_date)

In [27]:
# Write the final filtered charters, including the derived `year` column.
final_language_filtered.to_json("../data-push/0b-preprocessing-main/charters-filtered-final.json")