In [None]:
import json
import pandas as pd
from six import text_type

from core.utils.shared import MAPPINGS_DIR, DATA_DIR

In [None]:
# Load the existing dedication mapping. The encoding is pinned to UTF-8 so the
# read does not depend on the platform's locale default and matches the
# encoding used when the updated mapping is written back out.
with open(MAPPINGS_DIR / "dedication.json", encoding="utf-8") as f:
    dedication = json.load(f)

In [None]:
# Read the CSV and keep only its first two columns, dropping every row that
# has a missing value in either of them. These pairs feed `words_mapping`.
csv_path = DATA_DIR / "csv" / "slownik_wezwan.csv"
df = pd.read_csv(csv_path)
extended_dedications_comp = df.iloc[:, 0:2].dropna(axis=0, how="any")
extended_dedications_comp

In [None]:
# Keep only the last two columns of the CSV (read by name as "skrót" and
# "rozwinięcie" further down), dropping rows with a missing value in either.
last_two_columns = df.iloc[:, -2:]
new_mapping = last_two_columns.dropna(axis=0, how="any")
# Show the surviving rows as a list of {column: value} records for inspection.
new_mapping.to_dict(orient="records")

In [None]:
# Build the "skrót" -> "rozwinięcie" lookup, skipping rows where both columns
# hold the same text (an identity mapping would be a no-op replacement).
# When a "skrót" value occurs more than once, the last row wins.
mappings = {
    record["skrót"]: record["rozwinięcie"]
    for _, record in new_mapping.iterrows()
    if record["skrót"] != record["rozwinięcie"]
}

print(f"Total mappings: {len(mappings)}")
mappings

In [None]:
# Build the lookup from the first CSV column to the second one. Each row of
# `extended_dedications_comp` has exactly two cells, so it unpacks directly
# into (abbreviated, full_form). Identity pairs are skipped; later rows
# overwrite earlier ones that share the same key.
words_mapping = {}
for _, (abbreviated, full_form) in extended_dedications_comp.iterrows():
    if abbreviated == full_form:
        continue
    words_mapping[abbreviated] = full_form

print(f"Total word mappings: {len(words_mapping)}")
words_mapping

In [None]:
# First pass: replace a dedication outright when its entire text is a key of
# `words_mapping`. Only values of existing keys are reassigned, so mutating
# `dedication` while iterating over it is safe (the dict never changes size).
for dedication_key, current_text in dedication.items():
    if current_text not in words_mapping:
        print("---")
        print(f"Couldn't find rep for {current_text}")
        continue
    print(f"Replacing {current_text} for {words_mapping[current_text]}")
    dedication[dedication_key] = words_mapping[current_text]

In [None]:
import re
import string

# Translation table that deletes every ASCII punctuation character. It is used
# only to tokenize a value into words, never to alter the stored text.
translator = str.maketrans('', '', string.punctuation)

# A value is processed only if some word in it contains two consecutive
# ASCII capital letters.
# NOTE(review): [A-Z] does not cover non-ASCII capitals (e.g. "Ś", "Ł") and a
# lone capitalized abbreviation such as "Ap" does not trigger processing --
# confirm that this is the intended filter.
ABBREVIATION_RE = re.compile(r'\b\w*[A-Z]{2}\w*\b')

# Merge both lookups once (entries from `words_mapping` win on key clashes) and
# order them longest key first, so that e.g. "AAp" is tried before "Ap".
# These are loop-invariant, so they are built a single time up front.
all_mappings = {**mappings, **words_mapping}
sorted_mappings = sorted(all_mappings.items(),
                         key=lambda x: len(x[0]),
                         reverse=True)


def _expand_abbreviations(text, replacements):
    """Expand whole-word abbreviations found in ``text``.

    Args:
        text: The dedication string to process.
        replacements: Iterable of ``(short, expanded)`` pairs, tried in the
            order given.

    Returns:
        ``text`` unchanged when it contains no word with two consecutive ASCII
        capitals; otherwise ``text`` with every applicable ``short`` replaced
        by its ``expanded`` form.
    """
    if not ABBREVIATION_RE.search(text):
        return text

    # Tokens are taken from the text as it was before any replacement, with
    # punctuation removed, so "AAp," still yields the token "AAp".
    # NOTE(review): keys that themselves contain punctuation or spaces can
    # never equal one of these tokens and are therefore skipped -- verify
    # against the CSV contents.
    tokens = text.translate(translator).split(" ")

    for short, expanded in replacements:
        # Skip when the abbreviation is not a standalone word, or when the
        # expanded form is already present in the (possibly updated) text.
        if short not in tokens or expanded in text:
            continue
        pattern = r'\b' + re.escape(short) + r'\b'
        # A function replacement inserts `expanded` literally; passing the
        # string directly would make re.sub interpret backslashes and group
        # references inside it.
        text = re.sub(pattern, lambda _match: expanded, text)
    return text


# Second pass: expand abbreviations inside each dedication. The result goes
# into a copy so that `dedication` itself is left untouched by this cell.
new_dedication_mapping = dedication.copy()

for key, value in dedication.items():
    updated_value = _expand_abbreviations(value, sorted_mappings)
    if updated_value != value:
        print(f"Replacing {value} with {updated_value}")
    new_dedication_mapping[key] = updated_value

In [None]:
# Display the mapping produced by the abbreviation-expansion pass for inspection.
new_dedication_mapping

In [None]:
# Summary of changes: pair up the values of `dedication` and
# `new_dedication_mapping` position by position (both dicts share the same
# key order, the latter being a copy of the former) and keep the pairs that
# differ.
changes = [
    (before, after)
    for before, after in zip(dedication.values(), new_dedication_mapping.values())
    if before != after
]
for before, after in changes:
    print(f"Replacing {before} with {after}")

print(f"\nTotal changes: {len(changes)}")

In [None]:
# Write the expanded mapping to a new file in MAPPINGS_DIR, leaving the
# original dedication.json in place. ensure_ascii=False keeps non-ASCII
# characters readable in the output instead of \uXXXX escapes.
output_path = MAPPINGS_DIR / "dedication_updated.json"
with open(output_path, mode="w", encoding="utf-8") as out_file:
    json.dump(new_dedication_mapping, out_file, indent=2, ensure_ascii=False)

print("Saved to dedication_updated.json")