In [None]:
import os

import pandas as pd

from notarius.shared.constants import MAPPINGS_DIR

In [None]:
# Read the dedication mappings CSV written by run 20250802_135757.
# The path is handed straight to pandas with an explicit UTF-8 encoding; the previous
# bare open() decoded with the platform's locale encoding, unlike the JSON loaders in
# this notebook, which all pin encoding="utf-8".
df = pd.read_csv(
    os.path.join(
        MAPPINGS_DIR, "generated/run_20250802_135757", "dedication_mappings.csv"
    ),
    encoding="utf-8",
)

In [None]:
# Quick orientation: frame dimensions, column labels, then a ten-row preview
shape = df.shape
column_names = df.columns.tolist()
print("Dataset Shape:", shape)
print("\nColumn Names:", column_names)
print("\nFirst few rows:")
df.head(10)

In [None]:
# Load the JSON mapping files for comparison
import json

# Load dedication mappings JSON from the run directory under MAPPINGS_DIR instead of a
# machine-specific absolute path, so the notebook is not tied to one workstation.
# NOTE(review): the previous hard-coded location was
# "/Volumes/T7/AI_Osrodek/tmp/mappings/generated/run_20250802_135757/dedication.json";
# this assumes MAPPINGS_DIR resolves to that same mappings root — confirm.
with open(
    os.path.join(MAPPINGS_DIR, "generated/run_20250802_135757", "dedication.json"),
    "r",
    encoding="utf-8",
) as f:
    dedication_json = json.load(f)

print(f"Dedication JSON entries: {len(dedication_json)}")
print(f"CSV rows: {len(df)}")

# Show the first ten key -> value pairs of the JSON mapping
print("\nSample dedication JSON entries:")
for key, value in list(dedication_json.items())[:10]:
    print(f"{key} -> {value}")

In [None]:
# Invert the dedication mapping (value -> key). When several keys share a value only
# the last one seen survives, so the size of the inverted dict is the number of
# distinct values in dedication_json.
pol2other = dict((polish, latin) for latin, polish in dedication_json.items())
len(pol2other)

In [None]:
# Read the saints mapping JSON; the path is relative to the notebook's working directory
with open("../data/mappings/saints_mapping.json", encoding="utf-8", mode="r") as f:
    saints_json = json.load(f)

In [None]:
# Count total vs. distinct Polish-side values in both mappings and list the most
# frequent ones
from collections import Counter

polish_translations_dedication = [*dedication_json.values()]
polish_translations_saints = [*saints_json.values()]

print("=== POLISH TRANSLATIONS ANALYSIS ===")
print(f"Total dedication entries: {len(polish_translations_dedication)}")
print(f"Unique Polish dedications: {len(set(polish_translations_dedication))}")
print(f"Total saints entries: {len(polish_translations_saints)}")
print(f"Unique Polish saints: {len(set(polish_translations_saints))}")

# Both mappings pooled together
all_polish = [*polish_translations_dedication, *polish_translations_saints]
print(f"\nCombined total entries: {len(all_polish)}")
print(f"Combined unique Polish translations: {len(set(all_polish))}")

# Frequency tables: how many keys map onto each Polish value
polish_counts_dedication = Counter(polish_translations_dedication)
polish_counts_saints = Counter(polish_translations_saints)

# One shared report loop; only the banner, the counter and the cut-off differ
for header, counter, top_n in (
    ("\n=== TOP 15 MOST COMMON POLISH DEDICATIONS ===", polish_counts_dedication, 15),
    ("\n=== TOP 10 MOST COMMON POLISH SAINTS ===", polish_counts_saints, 10),
):
    print(header)
    for polish, count in counter.most_common(top_n):
        print(f"{polish}: {count} occurrences")

In [None]:
# Gather the key side of both mappings and report how many entries each has.
# `re` is imported here because the pattern helpers in the following cells use it.
import re

latin_keys_dedication = [*dedication_json]
latin_keys_saints = [*saints_json]

print("=== LATIN PATTERN ANALYSIS ===")
print(f"Total Latin dedication variations: {len(latin_keys_dedication)}")
print(f"Total Latin saints variations: {len(latin_keys_saints)}")


# Extract common Latin prefixes and patterns
def analyze_latin_patterns(keys):
    """Bucket mapping keys by the textual markers they contain.

    Args:
        keys: Iterable of strings. It is materialised once up front, so one-shot
            iterables (generators, ``dict.keys()`` views, iterators) are handled
            correctly even though eight separate filters are applied.

    Returns:
        dict mapping a pattern name to the list of keys matching it, in input order.
        A key may appear under several patterns. The patterns are:

        - ``St_prefix`` / ``S_prefix`` / ``SS_prefix``: key starts with "St.", "S."
          or "SS." respectively.
        - ``B_M_V``: key contains "B. M. V".
        - ``Tit_Ecclesiae``: key contains "Tit. Ecclesiae".
        - ``contains_ad``: key contains " ad " (surrounded by spaces).
        - ``contains_E_M``: key contains "E. M".
        - ``contains_Ap``: key contains "Ap.".
    """
    # Snapshot the input: each comprehension below walks the whole collection.
    keys = list(keys)
    patterns = {
        "St_prefix": [k for k in keys if k.startswith("St.")],
        "S_prefix": [k for k in keys if k.startswith("S.")],
        "SS_prefix": [k for k in keys if k.startswith("SS.")],
        "B_M_V": [k for k in keys if "B. M. V" in k],
        "Tit_Ecclesiae": [k for k in keys if "Tit. Ecclesiae" in k],
        "contains_ad": [k for k in keys if " ad " in k],
        "contains_E_M": [k for k in keys if "E. M" in k],
        # "Ap." already implies "Ap", so a single containment test is enough.
        "contains_Ap": [k for k in keys if "Ap." in k],
    }
    return patterns


dedication_patterns = analyze_latin_patterns(latin_keys_dedication)
saints_patterns = analyze_latin_patterns(latin_keys_saints)

# Same report layout for both mappings: match count per pattern, then up to three
# example keys when the pattern matched anything
for banner, pattern_groups in (
    ("\n=== DEDICATION LATIN PATTERNS ===", dedication_patterns),
    ("\n=== SAINTS LATIN PATTERNS ===", saints_patterns),
):
    print(banner)
    for pattern, matches in pattern_groups.items():
        print(f"{pattern}: {len(matches)} matches")
        if matches:
            print(f"  Examples: {matches[:3]}")

In [None]:
# Section banner for the abbreviation / special-case checks on the Polish-side values
print("=== POLISH ABBREVIATIONS ANALYSIS ===")


# Find entries with potential abbreviations (short words, capital letters)
def find_abbreviations(translations):
    """Return the distinct strings in ``translations`` that look like abbreviations.

    A string qualifies when either
    - it is at most 6 characters long and ``str.isupper()`` is true for it, or
    - it begins with an ASCII capital, followed by zero or more ASCII lowercase
      letters, followed by another ASCII capital (e.g. "McA...", but also any string
      whose first two characters are ASCII capitals, whatever its length).

    NOTE: the regex character classes are ASCII-only, so capitals such as "Ś" or "Ł"
    never satisfy the second rule.

    Returns:
        list of unique matching strings; the order is that of a ``set`` and therefore
        not guaranteed.
    """
    found = {
        text
        for text in translations
        if (len(text) <= 6 and text.isupper())
        or re.match(r"^[A-Z][a-z]*[A-Z]", text)
    }
    return list(found)


dedication_abbrevs = find_abbreviations(polish_translations_dedication)
saints_abbrevs = find_abbreviations(polish_translations_saints)

# Alphabetical listing of each abbreviation with how often it occurs as a value
for title, abbrevs, counts in (
    ("Dedication abbreviations found:", dedication_abbrevs, polish_counts_dedication),
    ("\nSaints abbreviations found:", saints_abbrevs, polish_counts_saints),
):
    print(title)
    for abbrev in sorted(abbrevs):
        count = counts[abbrev]
        print(f"  {abbrev} ({count} times)")

# Distinct values containing a period or a comma, treated here as multi-saint entries
multi_saint_dedication = [
    trans
    for trans in set(polish_translations_dedication)
    if any(mark in trans for mark in (".", ","))
]
multi_saint_saints = [
    trans
    for trans in set(polish_translations_saints)
    if any(mark in trans for mark in (".", ","))
]

# Count plus the first five entries of each list
for header, entries in (
    (f"\nMulti-saint dedication entries: {len(multi_saint_dedication)}", multi_saint_dedication),
    (f"\nMulti-saint saints entries: {len(multi_saint_saints)}", multi_saint_saints),
):
    print(header)
    for entry in entries[:5]:
        print(f"  {entry}")

In [None]:
# Section banner for the translation-quality checks (many keys sharing one value)
print("=== TRANSLATION QUALITY ANALYSIS ===")


# Find entries where multiple Latin forms map to the same Polish
def find_many_to_one_mappings(mapping_dict, threshold=5):
    """Group the keys of a mapping by value and single out heavily shared values.

    Args:
        mapping_dict: Mapping whose keys are the source (Latin) forms and whose values
            are the Polish translations.
        threshold: A translation is reported as having "many variants" when strictly
            more than this number of keys map to it. Defaults to 5, the cut-off that
            was previously hard-coded.

    Returns:
        Tuple ``(many_variants, polish_to_latin)``:

        - ``polish_to_latin``: dict of translation -> list of all keys mapping to it,
          in the iteration order of ``mapping_dict``.
        - ``many_variants``: the subset of ``polish_to_latin`` whose lists are longer
          than ``threshold``. The lists are shared with ``polish_to_latin``, not copied.
    """
    polish_to_latin = {}
    for latin, polish in mapping_dict.items():
        # setdefault creates the bucket on first sight of a translation
        polish_to_latin.setdefault(polish, []).append(latin)

    # Keep only translations reached from more than `threshold` keys
    many_variants = {
        polish: latin_list
        for polish, latin_list in polish_to_latin.items()
        if len(latin_list) > threshold
    }
    return many_variants, polish_to_latin


dedication_many_variants, dedication_polish_to_latin = find_many_to_one_mappings(
    dedication_json
)
saints_many_variants, saints_polish_to_latin = find_many_to_one_mappings(saints_json)

print("Polish entries with many Latin variants (>5):")
print(f"Dedication: {len(dedication_many_variants)} entries")
print(f"Saints: {len(saints_many_variants)} entries")

# Rank the heavily shared translations by how many keys map to them, largest first
sorted_variants = sorted(
    dedication_many_variants.items(), key=lambda pair: len(pair[1]), reverse=True
)
sorted_variants_saints = sorted(
    saints_many_variants.items(), key=lambda pair: len(pair[1]), reverse=True
)

# Dedications list the top five, saints the top three; each shows at most three keys
for heading, ranked, limit in (
    ("\nTop entries with most Latin variants (Dedications):", sorted_variants, 5),
    ("\nTop entries with most Latin variants (Saints):", sorted_variants_saints, 3),
):
    print(heading)
    for polish, latin_list in ranked[:limit]:
        print(f"  '{polish}' has {len(latin_list)} variants:")
        for latin in latin_list[:3]:
            print(f"    {latin}")
        if len(latin_list) > 3:
            print(f"    ... and {len(latin_list) - 3} more")

In [None]:
# Section banner for the heuristic language / formatting checks on the mapping keys
print("=== LANGUAGE VARIATION ANALYSIS ===")


# Analyze what languages/patterns appear in the Latin keys
def detect_language_patterns(keys):
    """Sort keys into heuristic language / formatting buckets.

    The buckets are independent: a key is appended to every bucket whose test it
    passes, so one key can appear in several lists (or in none).

    Args:
        keys: Iterable of strings (traversed once).

    Returns:
        dict with these lists, each in input order:

        - ``pure_latin``: after replacing "," and "." with spaces, the key has at
          least one token and every token is alphabetic per ``str.isalpha()``.
          ``isalpha`` accepts any Unicode letter, so this is only a rough label.
        - ``german_mixed``: lower-cased key contains the whole word "der" or "des",
          the abbreviation "hl." at a word start, or the substring "kirche".
        - ``polish_mixed``: lower-cased key contains "św." or "kościół".
        - ``has_numbers``: key contains a digit.
        - ``very_long``: key is longer than 100 characters.
        - ``contains_punctuation``: key has more than three of ``. ; , :``.
    """
    patterns = {
        "pure_latin": [],
        "german_mixed": [],
        "polish_mixed": [],
        "has_numbers": [],
        "very_long": [],
        "contains_punctuation": [],
    }

    for key in keys:
        lowered = key.lower()

        if len(key) > 100:
            patterns["very_long"].append(key)

        # "der"/"des" must be whole words: as plain substrings they also fire on
        # Latin words such as "Desiderii". "kirche" stays a substring match so that
        # compounds are still caught.
        if re.search(r"\b(?:der|des)\b|\bhl\.|kirche", lowered):
            patterns["german_mixed"].append(key)

        if any(marker in lowered for marker in ("św.", "kościół")):
            patterns["polish_mixed"].append(key)

        if re.search(r"\d", key):
            patterns["has_numbers"].append(key)

        if len(re.findall(r"[.;,:]", key)) > 3:
            patterns["contains_punctuation"].append(key)

        # Periods are stripped before splitting, so no token can ever equal a dotted
        # abbreviation such as "S." — an alphabetic check on the tokens is the whole
        # test. Keys with no tokens at all (empty or punctuation-only) are excluded
        # rather than passing vacuously.
        words = key.replace(",", " ").replace(".", " ").split()
        if words and all(word.isalpha() for word in words):
            patterns["pure_latin"].append(key)

    return patterns


dedication_language_patterns = detect_language_patterns(latin_keys_dedication)

# Per-bucket count; small buckets are printed whole, larger ones show two examples
print("Language pattern analysis for dedications:")
for pattern, matches in dedication_language_patterns.items():
    print(f"{pattern}: {len(matches)} entries")
    if not matches:
        continue
    if len(matches) <= 3:
        print(f"  Examples: {matches}")
    else:
        print(f"  Examples: {matches[:2]} ... (and {len(matches) - 2} more)")

# Spot-check up to three keys from two buckets together with their mapped values
print("\n=== POTENTIALLY PROBLEMATIC ENTRIES ===")
print("Very long entries (>100 chars):")
for entry in dedication_language_patterns["very_long"][:3]:
    print(f"  {entry[:100]}...", f"  -> {dedication_json[entry]}", sep="\n")

print("\nGerman mixed entries:")
for entry in dedication_language_patterns["german_mixed"][:3]:
    print(f"  {entry}", f"  -> {dedication_json[entry]}", sep="\n")

In [None]:
# Compare the (source, target) pairs of the CSV with the (key, value) pairs of the JSON
print("=== CSV vs JSON COMPARISON ===")

if not ("source" in df.columns and "target" in df.columns):
    # Expected columns are missing: show what the CSV actually contains instead
    print("CSV columns:", df.columns.tolist())
    print("CSV sample data:")
    print(df.head())
else:
    print("CSV structure found with source/target columns")
    csv_source_target_pairs = set(zip(df["source"], df["target"]))
    json_pairs = set(dedication_json.items())

    print(f"CSV unique pairs: {len(csv_source_target_pairs)}")
    print(f"JSON unique pairs: {len(json_pairs)}")

    # Pairs present on one side only
    csv_only = csv_source_target_pairs.difference(json_pairs)
    json_only = json_pairs.difference(csv_source_target_pairs)

    print(f"Pairs only in CSV: {len(csv_only)}")
    print(f"Pairs only in JSON: {len(json_only)}")

    # Up to three examples per side, printed only when that side has differences
    for heading, pairs in (
        ("Sample CSV-only pairs:", csv_only),
        ("Sample JSON-only pairs:", json_only),
    ):
        if pairs:
            print(heading)
            for pair in list(pairs)[:3]:
                print(f"  {pair[0]} -> {pair[1]}")

In [None]:
# Four-panel overview of the dedication mapping, followed by headline numbers
import matplotlib.pyplot as plt

# Plain matplotlib defaults on a 2x2 grid; each panel gets its own named handle
plt.style.use("default")
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
ax_freq, ax_length = axes[0]
ax_pattern, ax_top = axes[1]

# Panel 1: how many keys share each Polish value (log-scaled counts)
polish_freq_dedication = list(polish_counts_dedication.values())
ax_freq.hist(
    polish_freq_dedication, bins=30, alpha=0.7, color="skyblue", edgecolor="black"
)
ax_freq.set_title("Distribution of Polish Translation Frequencies\n(Dedications)")
ax_freq.set_xlabel("Frequency")
ax_freq.set_ylabel("Count")
ax_freq.set_yscale("log")

# Panel 2: character length of the dedication keys
latin_lengths = list(map(len, latin_keys_dedication))
ax_length.hist(
    latin_lengths, bins=30, alpha=0.7, color="lightcoral", edgecolor="black"
)
ax_length.set_title("Distribution of Latin Entry Lengths")
ax_length.set_xlabel("Character Length")
ax_length.set_ylabel("Count")

# Panel 3: number of keys matched by each pattern from analyze_latin_patterns
pattern_names = list(dedication_patterns.keys())
pattern_counts = [len(found) for found in dedication_patterns.values()]
pattern_positions = range(len(pattern_names))
ax_pattern.bar(pattern_positions, pattern_counts, color="lightgreen", alpha=0.7)
ax_pattern.set_title("Latin Pattern Occurrences")
ax_pattern.set_xlabel("Pattern Type")
ax_pattern.set_ylabel("Count")
ax_pattern.set_xticks(pattern_positions)
ax_pattern.set_xticklabels(pattern_names, rotation=45, ha="right")

# Panel 4: the ten most frequent Polish values; labels are cut to 20 chars plus "..."
top_polish = polish_counts_dedication.most_common(10)
polish_names = [
    name if len(name) <= 20 else name[:20] + "..." for name, _ in top_polish
]
polish_counts = [freq for _, freq in top_polish]
bar_positions = range(len(polish_names))
ax_top.barh(bar_positions, polish_counts, color="gold", alpha=0.7)
ax_top.set_title("Top 10 Most Common Polish Dedications")
ax_top.set_xlabel("Frequency")
ax_top.set_yticks(bar_positions)
ax_top.set_yticklabels(polish_names)

plt.tight_layout()
plt.show()

# Headline numbers for the dedication mapping
print("\n=== SUMMARY STATISTICS ===")
print(f"Total Latin variations: {len(dedication_json)}")
print(f"Unique Polish translations: {len(set(dedication_json.values()))}")
print(
    f"Compression ratio: {len(set(dedication_json.values())) / len(dedication_json):.3f}"
)
print(
    f"Average Latin entry length: {sum(latin_lengths) / len(latin_lengths):.1f} characters"
)
print(f"Max frequency Polish entry: {max(polish_freq_dedication)} occurrences")
print(f"Entries with frequency = 1: {sum(1 for x in polish_freq_dedication if x == 1)}")
print(
    f"Percentage single occurrence: {100 * sum(1 for x in polish_freq_dedication if x == 1) / len(set(dedication_json.values())):.1f}%"
)

In [None]:
# Show the DataFrame loaded from the CSV as this cell's output
df