In [None]:
import json
import pandas as pd

from notarius.shared.constants import DATA_DIR, MAPPINGS_DIR

In [None]:
# Load the existing dedication mapping from JSON.
# The encoding is spelled out because the mapping holds Polish text and the
# notebook writes its result as UTF-8; without it, open() falls back to the
# platform's locale encoding and may mis-decode non-ASCII characters.
with open(MAPPINGS_DIR / "dedication.json", encoding="utf-8") as f:
    dedication = json.load(f)

In [None]:
# Read the dedication dictionary CSV.
slownik_path = DATA_DIR / "csv" / "slownik_wezwan.csv"
df = pd.read_csv(slownik_path)

# Keep only the first two columns and discard rows where either one is missing.
# NOTE(review): a later cell unpacks these rows as (short, extended) pairs -
# confirm that matches the CSV layout.
first_two_columns = df.iloc[:, :2]
extended_dedications_comp = first_two_columns.dropna()
extended_dedications_comp

In [None]:
# Keep only the last two columns of the CSV and discard incomplete rows,
# then show the surviving rows as a list of {column: value} records.
last_two_columns = df.iloc[:, -2:]
new_mapping = last_two_columns.dropna()
new_mapping.to_dict(orient="records")

In [None]:
# Build the short-form -> expansion lookup from the "skrót" / "rozwinięcie"
# columns, leaving out pairs where both sides are equal (nothing to expand).
# If a short form occurs more than once, the last row wins.
mappings = {
    short: expanded
    for short, expanded in zip(new_mapping["skrót"], new_mapping["rozwinięcie"])
    if short != expanded
}

print(f"Total mappings: {len(mappings)}")
mappings

In [None]:
# Build the word-level lookup from the two-column frame: each row unpacks
# into a (short, extended) pair, and pairs whose two sides are equal are
# skipped. If a short form occurs more than once, the last row wins.
words_mapping = {
    short: extended
    for _, (short, extended) in extended_dedications_comp.iterrows()
    if short != extended
}

print(f"Total word mappings: {len(words_mapping)}")
words_mapping

In [None]:
# First pass: replace a dedication outright when its whole value is a key of
# words_mapping. `dedication` is updated in place; only existing keys are
# reassigned, and the loop walks a snapshot of the entries.
for key, current in list(dedication.items()):
    if current not in words_mapping:
        print("---")
        print(f"Couldn't find rep for {current}")
        continue

    replacement = words_mapping[current]
    print(f"Replacing {current} for {replacement}")
    dedication[key] = replacement

In [None]:
import re
import string

# Second pass: expand abbreviations that appear as words inside longer
# dedication strings. Results go into new_dedication_mapping; `dedication`
# itself is only read here.

# Translation table that deletes ASCII punctuation before splitting into words.
translator = str.maketrans("", "", string.punctuation)

new_dedication_mapping = dedication.copy()

# The combined lookup does not depend on the entry being processed, so it is
# merged and sorted once. On duplicate keys words_mapping overrides mappings.
# Longest short forms come first so that e.g. "AAp" is tried before "Ap".
all_mappings = {**mappings, **words_mapping}
sorted_mappings = sorted(
    all_mappings.items(), key=lambda x: len(x[0]), reverse=True
)

for key, value in dedication.items():
    # Only touch values containing a token with two consecutive ASCII
    # uppercase letters ([A-Z] does not cover letters with diacritics).
    if not re.search(r"\b\w*[A-Z]{2}\w*\b", value):
        continue

    original_value = value

    # Words of the ORIGINAL value with punctuation stripped. This list is not
    # refreshed after a replacement, so membership is always tested against
    # the text as it was before this pass.
    words_in_value = value.translate(translator).split(" ")

    for short, expanded in sorted_mappings:
        # Replace only whole-word occurrences, and skip the replacement when
        # the expanded form is already present in the (current) value.
        if short in words_in_value and expanded not in value:
            pattern = r"\b" + re.escape(short) + r"\b"
            # A callable replacement inserts `expanded` literally; a plain
            # string would be parsed as a template (backslash escapes, \1 ...).
            value = re.sub(pattern, lambda _match: expanded, value)

    if original_value != value:
        print(f"Replacing {original_value} with {value}")
        new_dedication_mapping[key] = value

In [None]:
# Display the current contents of new_dedication_mapping for inspection.
new_dedication_mapping

In [None]:
import re

# Third pass: go over every entry of new_dedication_mapping (not only those
# with upper-case abbreviations) and expand at most one whole-word occurrence
# of each words_mapping key per entry.

# The lookup is the same for every entry, so sort it and compile the
# whole-word patterns once. Longest keys first to avoid partial matches.
sorted_mappings = sorted(
    words_mapping.items(), key=lambda x: len(x[0]), reverse=True
)
word_patterns = [
    (re.compile(r"\b" + re.escape(m_key) + r"\b"), m_value)
    for m_key, m_value in sorted_mappings
    if m_key != m_value  # identity mappings would be no-ops
]

# Iterate over a snapshot so entries can be reassigned inside the loop.
for key, original_value in list(new_dedication_mapping.items()):
    value = original_value

    for pattern, m_value in word_patterns:
        # Only replace if the short form exists AND the expansion is not
        # already part of the value.
        if pattern.search(value) and m_value not in value:
            # Callable replacement: insert m_value literally instead of
            # letting re interpret it as a replacement template.
            value = pattern.sub(lambda _match: m_value, value, count=1)

    if value != original_value:
        print(f"Replacing {original_value} with {value}")
        new_dedication_mapping[key] = value

In [None]:
# Summary: pair up the values of `dedication` and `new_dedication_mapping`
# position by position and keep the pairs that differ.
# Note: the first-pass cell rewrites `dedication` in place, so replacements
# made there are not reported as differences here.
changes = [
    (before, after)
    for before, after in zip(dedication.values(), new_dedication_mapping.values())
    if before != after
]

for before, after in changes:
    print(f"Replacing {before} with {after}")

print(f"\nTotal changes: {len(changes)}")

In [None]:
# Cleanup phase: regex -> replacement fixes applied to new_dedication_mapping
# after the expansion passes, mostly collapsing duplicated or leftover tokens.
# Patterns are applied in the order listed here.
cleanup_patterns = {
    r"\bApostoł Ap\b": "Apostoł",
    r"\bApostołowie AAp\b": "Apostołowie",
    r"\bWyznawca W\b": "Wyznawca",
    r"\bBiskup Biskup\b": "Biskup",
    r"\bMęczennica Męczennica\b": "Męczennica",
    r"Najświętsza Maryja Panna Królowa Najświętsza Maryja Panna": "Najświętsza Maryja Panna Królowa",
    r"\bChrystus Chrystus\b": "Chrystus",
    r"\bAndrzej Andrzej Świerad": "Andrzej Świerad",
    r"\bBiskup i Doktor Kościoła Biskup\b": "Biskup i Doktor Kościoła",
    r"\bOblNMP\b": "Oblubieniec Najświętszej Maryi Panny",  # Expand any leftover OblNMP
    r"\bPan Chrystus": "Pan",  # Fix "Chrystus Pan Chrystus"
}

print("\n=== CLEANUP PHASE ===")
cleanup_count = 0

# Outer loop over patterns, inner loop over entries: each pattern is applied
# to the whole mapping before the next one is tried.
for pattern, replacement in cleanup_patterns.items():
    compiled = re.compile(pattern)
    for key in list(new_dedication_mapping):
        before = new_dedication_mapping[key]
        after, hits = compiled.subn(replacement, before)
        # Nothing to do when the pattern did not match or left the text as is.
        if hits == 0 or after == before:
            continue
        print(f"Cleanup: {before} → {after}")
        new_dedication_mapping[key] = after
        cleanup_count += 1

print(f"\n=== Total cleanup fixes: {cleanup_count} ===")

In [None]:
# Persist the final mapping next to the other mapping files as UTF-8 JSON,
# keeping non-ASCII characters readable (ensure_ascii=False), 2-space indent.
output_path = MAPPINGS_DIR / "dedication_updated.json"
serialized = json.dumps(new_dedication_mapping, ensure_ascii=False, indent=2)
with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.write(serialized)

print("Saved to dedication_updated.json")