In [1]:
from __future__ import annotations

from pathlib import Path

# Directory scanned for raw level files, and directory that receives the
# translated copies (same file names).
IN_DIR = Path("./levels_raw")
OUT_DIR = Path("./levels")

# Input symbol (MarioGPT/VGLC) -> output symbol (target notation).
# Every entry except the trailing "x" maps a symbol onto itself; listing them
# explicitly documents which input symbols are recognised.
MAP = {
    "-": "-",   # air
    "X": "X",   # ground
    "S": "S",   # solid/brick
    "?": "?",   # question (powerup)
    "Q": "Q",   # question (coin)
    "o": "o",   # coin
    "E": "E",   # enemy (generic)
    "<": "<",   # pipe TL
    ">": ">",   # pipe TR
    "[": "[",   # pipe body L
    "]": "]",   # pipe body R
    "T": "T",   # flower pipe (if present)
    "B": "B",   # bullet head
    "b": "b",   # bullet body
    "M": "M",   # Mario start (if present)
    "F": "F",   # flag/end (if present)

    # IMPORTANT: the MarioGPT trace overlay is not a tile; render it as air.
    "x": "-",
}

# Every symbol the target notation accepts. Used both to let already-valid
# symbols pass through untranslated and to validate translated output.
# A str is already an iterable of characters, so no intermediate list is needed.
ALLOWED_OUT = set("MF-X#SD%|?@Q!CUL12oEgGkKrRyYtT<>[]*Bb")

def translate_level_text(txt: str) -> tuple[str, set[str], set[str]]:
    """Translate a level, tile by tile, into the output notation.

    Each character of each line is resolved in this order:

    1. a symbol listed in ``MAP`` is replaced by its mapped value;
    2. a symbol already in ``ALLOWED_OUT`` is kept unchanged;
    3. anything else is recorded as unknown and replaced by air (``"-"``).

    Args:
        txt: Raw level text, one row of tiles per line.

    Returns:
        A 3-tuple ``(translated_text, unknown_in, illegal_out)``:
        the translated rows joined by ``"\\n"`` with one trailing newline
        (an empty ``txt`` therefore yields ``"\\n"``), the set of input
        characters that matched neither table, and the set of produced
        characters that are not in ``ALLOWED_OUT``.
    """
    unknown_in: set[str] = set()
    illegal_out: set[str] = set()

    def convert(symbol: str) -> str:
        # Resolve one tile and record anything suspicious about it.
        if symbol in MAP:
            result = MAP[symbol]
        elif symbol in ALLOWED_OUT:
            # Already a valid output symbol: pass it through.
            result = symbol
        else:
            # Unrecognised input: air is the safest replacement.
            unknown_in.add(symbol)
            result = "-"

        # Sanity check on the mapping table itself; air is always accepted.
        if result != "-" and result not in ALLOWED_OUT:
            illegal_out.add(result)
        return result

    rows = [
        "".join(convert(symbol) for symbol in row)
        for row in txt.splitlines()
    ]
    return "\n".join(rows) + "\n", unknown_in, illegal_out


def main() -> None:
    """Translate every ``*.txt`` level in ``IN_DIR`` and write it to ``OUT_DIR``.

    Each input file is translated with ``translate_level_text`` and written
    under the same file name. A summary is printed at the end, followed by
    warnings listing any unknown input characters or illegal output
    characters collected across all files.

    Raises:
        SystemExit: If ``IN_DIR`` is not an existing directory, or if it
            contains no ``*.txt`` files.
    """
    # is_dir() also rejects the case where the path exists but is a file.
    if not IN_DIR.is_dir():
        raise SystemExit(f"Input directory not found: {IN_DIR.resolve()}")

    files = sorted(IN_DIR.glob("*.txt"))
    if not files:
        raise SystemExit(f"No .txt files found in {IN_DIR.resolve()}")

    # Only create the output directory once there is something to write.
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    all_unknown: set[str] = set()
    all_illegal: set[str] = set()

    for fp in files:
        # "utf-8-sig" drops a leading byte-order mark if present; read as
        # plain "utf-8" the BOM would count as an unknown tile and add a
        # spurious air column to the first row of the level.
        raw = fp.read_text(encoding="utf-8-sig", errors="replace")
        translated, unknown_in, illegal_out = translate_level_text(raw)

        all_unknown |= unknown_in
        all_illegal |= illegal_out

        out_fp = OUT_DIR / fp.name
        out_fp.write_text(translated, encoding="utf-8")

    print(f"Translated {len(files)} levels:")
    print(f"  in:  {IN_DIR.resolve()}")
    print(f"  out: {OUT_DIR.resolve()}")

    # ascii() keeps whitespace and non-ASCII characters visible in the
    # warnings and avoids encoding errors on consoles that cannot print them.
    if all_unknown:
        print("\n[WARN] Unknown input characters encountered (mapped to '-'):",
              " ".join(ascii(ch) for ch in sorted(all_unknown)))

    if all_illegal:
        print("\n[WARN] Illegal output characters produced (should not happen):",
              " ".join(ascii(ch) for ch in sorted(all_illegal)))


# Run the translation when this file is executed directly rather than imported.
if __name__ == "__main__":
    main()


Translated 100 levels:
  in:  C:\Users\user\Studia\DataScience\Semestr_V\pcg-arena\generators\MarioGAN\levels_raw
  out: C:\Users\user\Studia\DataScience\Semestr_V\pcg-arena\generators\MarioGAN\levels
