In [1]:
import pandas as pd
import re
import logging
import glob

In [2]:
# Funktionen zur Umfangsauswertung von https://github.com/hbeyer/pylib/blob/main/lib/pica.py

class Numeral:
    """Lookup tables for converting between Roman and Arabic numerals.

    All letters are lower-case. The tables are plain class attributes; the
    class is never instantiated in this file.
    """

    # Roman letter -> integer value.
    conc = dict(zip("ivxlcdm", (1, 5, 10, 50, 100, 500, 1000)))

    # Integer value -> Roman letter, ordered from the largest value down to
    # the smallest.
    conca = {value: letter for letter, value in reversed(list(conc.items()))}

    # Additive run -> subtractive spelling. The order matters when the rules
    # are applied one after another: the longer runs ("viiii") are listed
    # before the shorter runs they contain ("iiii").
    subst = dict((
        ("dcccc", "cm"),
        ("lxxxx", "xc"),
        ("viiii", "ix"),
        ("cccc", "cd"),
        ("xxxx", "xl"),
        ("iiii", "iv"),
    ))

def to_arabic(lett):
    """Convert a Roman numeral string to an integer.

    The input is stripped, lower-cased, and freed of dots and spaces before
    parsing, so "XII." and "x i i" are both read as 12.

    Args:
        lett: Roman numeral as a string (any case).

    Returns:
        The integer value, or None when the cleaned string is empty or
        contains any character other than i, v, x, l, c, d, m.
    """
    lett = lett.strip().lower().replace(".", "").replace(" ", "")
    # fullmatch: the *whole* string must be numeral letters. A prefix-only
    # check would let "ix2" through and fail later with a KeyError.
    if re.fullmatch(r"[ivxlcdm]+", lett) is None:
        return None
    total = 0
    subtracting = False
    last = 0
    # Walk from the least significant letter: a letter smaller than its right
    # neighbour is subtracted, a larger one is added.
    for let in reversed(lett):
        val = Numeral.conc[let]
        if val > last:
            total += val
            subtracting = False
        elif val < last:
            total -= val
            subtracting = True
        elif subtracting:
            # Same letter repeated inside a subtractive prefix ("iix" = 8):
            # keep subtracting.
            total -= val
        else:
            total += val
        last = val
    return total
    
def to_roman(num):
    """Render an integer as a lower-case Roman numeral.

    Args:
        num: The number to convert.

    Returns:
        The Roman numeral as a string (empty for zero and negative numbers),
        or None when ``num`` is not an int.
    """
    if not isinstance(num, int):
        return None
    pieces = []
    if num > 0:
        remainder = num
        # Greedy additive spelling: take as many of each letter as fit,
        # walking the values from largest to smallest.
        for value, letter in Numeral.conca.items():
            count, remainder = divmod(remainder, value)
            pieces.append(letter * count)
    roman = "".join(pieces)
    # Collapse additive runs such as "viiii" into subtractive forms ("ix").
    for additive, subtractive in Numeral.subst.items():
        roman = roman.replace(additive, subtractive)
    return roman

def get_norm_p(pages):
    """Compute a normalised page count from an extent statement.

    The string is cut into chunks that end in one of the unit markers
    "Bl", "S"/"Bo" or "Sp". The numbers of each chunk are summed by
    ``get_number`` and weighted by unit: "Bl" counts double, "S" and "Bo"
    count once, "Sp" counts half. Chunks containing a hyphen are skipped in
    this first pass; ranges of the form "S. 12-34" are picked up separately
    in a second pass.

    Args:
        pages: The extent statement as a string.

    Returns:
        The normalised page count as an int (0 when nothing matches).
    """
    total = 0
    unit_chunks = re.findall(r"(([^BS]+) (Bl)|([^BS]+) (S$|S[^p]|Bo)|([^BS]+) Sp)", pages)
    for whole, leaves, _bl, sides, _s_or_bo, columns in unit_chunks:
        if "-" in whole:
            # Ranges are handled by the second pass below.
            continue
        if leaves:
            total += get_number(leaves, 2)
        elif sides:
            total += get_number(sides)
        elif columns:
            total += get_number(columns, 0.5)
    # Second pass: explicit page ranges such as "S. 12-34".
    total += sum(get_number(page_range) for page_range in re.findall(r"S\.? \d+ ?- ?\d+", pages))
    return total

def get_number(page_string, mult=1):
    """Sum all page numbers found in a fragment of an extent statement.

    Processing steps:
      1. A count directly followed by "das heißt" / "i. e." is dropped, so
         only the number after the marker is counted.
      2. Every span "a-b" (optionally bracketed, e.g. "[10]-[20]") is
         replaced by ``b - a``.
      3. All remaining Arabic numbers are added up.
      4. Roman numerals that are followed by a space are converted with
         ``to_arabic`` and added as well.

    Args:
        page_string: The text fragment to evaluate.
        mult: Weight applied to the sum (e.g. 2 for leaves, 0.5 for columns).

    Returns:
        ``int(total * mult)``; the product is truncated towards zero.
    """
    clean = re.sub(r"[\divxdclmIVXDCLM]+,? \[?(das heißt|i. ?e.)", "", page_string)

    def _span_length(match):
        # NOTE(review): the length is end - start, not end - start + 1 —
        # confirm that this is the intended way to count a page range.
        return str(int(match.group(2)) - int(match.group(1)))

    # Substitute each span where it was matched. Re-using the matched text as
    # a regex pattern breaks on brackets ("[10]" is a character class) and can
    # also rewrite parts of unrelated numbers.
    clean = re.sub(r"\[?(\d+)\]? ?- ?\[?(\d+)\]?", _span_length, clean)

    res = sum(int(num) for num in re.findall(r"\d+", clean))

    # NOTE(review): only letter runs followed by a space are treated as Roman
    # numerals, so "XII," or a numeral at the very end is not counted, while
    # the tail of a word such as "und " is — confirm whether this is intended.
    for num in re.findall(r"([ivxdclm]+) ", clean.lower()):
        arab = to_arabic(num)
        if arab is None:
            logging.error(f"Nicht zu parsen: {num}")
        else:
            res += arab
    return int(res * mult)

In [3]:
# Collect the CSV dumps; sorted so that the row order of the combined frame
# does not depend on the order in which the file system lists the files.
files = sorted(glob.glob("../abzug/*.csv"))
if not files:
    # Fail here with a clear message instead of later on a missing column.
    raise FileNotFoundError("No CSV files found matching ../abzug/*.csv")

# Read every file first and concatenate once, rather than growing the frame
# file by file.
df = pd.concat([pd.read_csv(file) for file in files], ignore_index=True)

# Normalised page count per record; rows without an extent statement stay NaN.
df["normpages"] = df.umfang.map(get_norm_p, na_action='ignore')

In [None]:
# Display the combined frame, including the normpages column computed above.
df

In [5]:
# Open df in an interactive Mito spreadsheet widget.
# NOTE(review): analysis_to_replay presumably re-applies the saved Mito
# analysis with this id — confirm against the Mito documentation.
import mitosheet
mitosheet.sheet(df, analysis_to_replay="id-fgxykrqafp")

MitoWidget(analysis_data_json='{"analysisName": "id-fgxykrqafp", "analysisToReplay": null, "code": [], "stepSu…

In [None]:
# Code originally generated by Mito for analysis "id-fgxykrqafp", condensed:
# the repeated column moves and the superseded descending sort are removed.
from mitosheet import register_analysis

register_analysis("id-fgxykrqafp")

# Move normpages to column position 9.
df_columns = [col for col in df.columns if col != 'normpages']
df_columns.insert(9, 'normpages')

# Missing page counts become 0 so that the column can be stored as int.
df = df[df_columns].assign(
    normpages=lambda frame: frame['normpages'].fillna(0).astype('int')
)

# Keep only rows whose signatur_a does not contain 'Frag', whose bbg does not
# contain 'Qd', and whose normalised page count is positive.
keep = (
    ~df['signatur_a'].str.contains('Frag', na=False)
    & ~df['bbg'].str.contains('Qd', na=False)
    & (df['normpages'] > 0)
)

# Ascending by page count; the stable sort keeps the load order among ties.
df = df[keep].sort_values(by='normpages', ascending=True, kind='stable')
