In [1]:
from __future__ import annotations

import shutil
import zipfile
from pathlib import Path


def unique_path(dest: Path) -> Path:
    """Return a path that does not exist yet, derived from ``dest``.

    ``dest`` is returned unchanged when nothing exists at that location.
    Otherwise siblings named ``<stem>_NNNN<suffix>`` are probed for
    NNNN = 0001 .. 99999 and the first one that does not exist is returned.

    Raises:
        RuntimeError: if every numbered candidate already exists.
    """
    if dest.exists():
        base, ext = dest.stem, dest.suffix
        # Lazy stream of numbered siblings; only probed until one is free.
        numbered = (
            dest.with_name(f"{base}_{n:04d}{ext}") for n in range(1, 100_000)
        )
        free = next((path for path in numbered if not path.exists()), None)
        if free is None:
            raise RuntimeError(f"Could not find a unique filename for {dest}")
        return free
    return dest


def main() -> None:
    """Unpack every ``*.zip`` in the CWD and collect the ``.txt`` files inside.

    For each archive, in sorted order:
      1. extract it into a scratch directory ``__unzipped__<stem>`` in the CWD
         (a pre-existing directory of that name is deleted first),
      2. move every regular ``*.txt`` file found recursively into
         ``./levels_raw``, renaming on collision via ``unique_path``,
      3. delete the scratch directory,
      4. move the archive itself into ``./levels_raw_zip``.

    Archives that raise ``zipfile.BadZipFile`` are reported and left where
    they are. Any other error propagates to the caller, but the scratch
    directory is still removed first.
    """
    cwd = Path.cwd()
    levels_raw = cwd / "levels_raw"
    levels_raw_zip = cwd / "levels_raw_zip"
    levels_raw.mkdir(parents=True, exist_ok=True)
    levels_raw_zip.mkdir(parents=True, exist_ok=True)

    # glob() also matches directories; only regular files can be archives.
    zip_files = sorted(p for p in cwd.glob("*.zip") if p.is_file())
    if not zip_files:
        print("No .zip files found in CWD.")
        return

    moved_txt = 0
    for zip_path in zip_files:
        print(f"Processing: {zip_path.name}")

        # Extract into a controlled scratch directory so cleanup is simple/safe.
        extract_dir = cwd / f"__unzipped__{zip_path.stem}"
        if extract_dir.exists():
            shutil.rmtree(extract_dir)
        extract_dir.mkdir(parents=True, exist_ok=True)

        try:
            try:
                with zipfile.ZipFile(zip_path, "r") as zf:
                    zf.extractall(extract_dir)
            except zipfile.BadZipFile:
                print(f"  !! Bad zip file, skipping: {zip_path.name}")
                continue  # the ``finally`` below still removes extract_dir

            # Regular files only: a directory whose name ends in ".txt" would
            # otherwise be moved wholesale and break later moves of its
            # children. Sorting makes the _NNNN collision suffixes
            # reproducible, since rglob() order depends on the filesystem.
            txt_files = sorted(
                p for p in extract_dir.rglob("*.txt") if p.is_file()
            )
            for txt in txt_files:
                # Only the bare filename is kept; collisions get a _NNNN suffix.
                dest = unique_path(levels_raw / txt.name)
                shutil.move(str(txt), str(dest))
                moved_txt += 1
        finally:
            # Always drop the scratch directory, including when extraction or
            # a move fails with an unexpected error.
            shutil.rmtree(extract_dir, ignore_errors=True)

        # Move the zip into ./levels_raw_zip (handle name collisions).
        zip_dest = unique_path(levels_raw_zip / zip_path.name)
        shutil.move(str(zip_path), str(zip_dest))

        print(f"  Moved {len(txt_files)} .txt files; zip -> {zip_dest.relative_to(cwd)}")

    print(f"Done. Total .txt files moved: {moved_txt}")


# Run the unzip/collect step only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Processing: batch_000_004.zip
  Moved 5 .txt files; zip -> levels_raw_zip\batch_000_004.zip
Processing: batch_005_009.zip
  Moved 5 .txt files; zip -> levels_raw_zip\batch_005_009.zip
Processing: batch_010_014.zip
  Moved 5 .txt files; zip -> levels_raw_zip\batch_010_014.zip
Processing: batch_015_019.zip
  Moved 5 .txt files; zip -> levels_raw_zip\batch_015_019.zip
Processing: batch_020_024.zip
  Moved 5 .txt files; zip -> levels_raw_zip\batch_020_024.zip
Processing: batch_025_029.zip
  Moved 5 .txt files; zip -> levels_raw_zip\batch_025_029.zip
Processing: batch_030_034.zip
  Moved 5 .txt files; zip -> levels_raw_zip\batch_030_034.zip
Processing: batch_035_039.zip
  Moved 5 .txt files; zip -> levels_raw_zip\batch_035_039.zip
Processing: batch_040_044.zip
  Moved 5 .txt files; zip -> levels_raw_zip\batch_040_044.zip
Processing: batch_045_049.zip
  Moved 5 .txt files; zip -> levels_raw_zip\batch_045_049.zip
Processing: batch_050_054.zip
  Moved 5 .txt files; zip -> levels_raw_zip\batch_

In [None]:
from __future__ import annotations

from pathlib import Path

# Folder the raw levels are read from, and folder the translated levels go to.
IN_DIR = Path("./levels_raw")
OUT_DIR = Path("./levels")

# MarioGPT/VGLC -> your notation.
# Every tile listed in the comprehension is spelled the same on both sides,
# so it simply maps to itself.
MAP = {
    tile: tile
    for tile in (
        "-"  # air
        "X"  # ground
        "S"  # solid/brick
        "?"  # question (powerup)
        "Q"  # question (coin)
        "o"  # coin
        "E"  # enemy (generic)
        "<"  # pipe TL
        ">"  # pipe TR
        "["  # pipe body L
        "]"  # pipe body R
        "T"  # flower pipe (if present)
        "B"  # bullet head
        "b"  # bullet body
        "M"  # Mario start (if present)
        "F"  # flag/end (if present)
    )
}
# IMPORTANT: the MarioGPT trace overlay ("x") is not a tile; it becomes air.
MAP["x"] = "-"

# Your allowed alphabet (from your spec); used to validate output.
ALLOWED_OUT = set("MF-X#SD%|?@Q!CUL12oEgGkKrRyYtT<>[]*Bb")

def translate_level_text(
    txt: str,
    *,
    mapping: dict[str, str] | None = None,
    allowed: set[str] | None = None,
) -> tuple[str, set[str], set[str]]:
    """Translate a level, character by character, into the output notation.

    Each character is resolved in this order:
      1. if it is a key of ``mapping``, the mapped value is used;
      2. otherwise, if it is already in ``allowed``, it passes through;
      3. otherwise it is recorded as unknown and replaced by ``"-"`` (air).

    Lines are split with ``str.splitlines()`` and re-joined with ``"\\n"``,
    and the result always ends with exactly one ``"\\n"``. Empty input
    therefore yields ``"\\n"``.

    Args:
        txt: Raw level text.
        mapping: Input-character -> output-character table. Defaults to the
            module-level ``MAP``.
        allowed: Characters permitted in the output. Defaults to the
            module-level ``ALLOWED_OUT``.

    Returns:
        ``(translated_text, unknown_in, illegal_out)`` where ``unknown_in``
        holds input characters resolved by rule 3 and ``illegal_out`` holds
        produced characters that are neither in ``allowed`` nor ``"-"``.
    """
    # Module constants are looked up only when the caller supplies no table.
    table = MAP if mapping is None else mapping
    alphabet = ALLOWED_OUT if allowed is None else allowed

    unknown_in: set[str] = set()
    illegal_out: set[str] = set()

    def convert(ch: str) -> str:
        """Resolve one character and record it if unknown or illegal."""
        if ch in table:
            mapped = table[ch]
        elif ch in alphabet:
            # Pass-through: already one of the output symbols.
            mapped = ch
        else:
            unknown_in.add(ch)
            # Safest fallback is air.
            mapped = "-"

        # The air fallback is always accepted, even if ``allowed`` omits "-".
        if mapped != "-" and mapped not in alphabet:
            illegal_out.add(mapped)
        return mapped

    out_lines = ["".join(convert(ch) for ch in line) for line in txt.splitlines()]
    return "\n".join(out_lines) + "\n", unknown_in, illegal_out


def main() -> None:
    """Translate every ``*.txt`` level in ``IN_DIR`` and write it to ``OUT_DIR``.

    Each input file is run through ``translate_level_text`` and written under
    the same filename in ``OUT_DIR`` (created if missing). A summary is
    printed afterwards, followed by warnings listing any unknown input
    characters or illegal output characters seen across all files.

    Raises:
        SystemExit: if ``IN_DIR`` does not exist or holds no ``.txt`` files.
    """
    if not IN_DIR.exists():
        raise SystemExit(f"Input directory not found: {IN_DIR.resolve()}")

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    all_unknown: set[str] = set()
    all_illegal: set[str] = set()

    # glob() also matches directories; read_text() would fail on those.
    files = sorted(p for p in IN_DIR.glob("*.txt") if p.is_file())
    if not files:
        raise SystemExit(f"No .txt files found in {IN_DIR.resolve()}")

    for fp in files:
        # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM.
        # With plain "utf-8" the BOM survives as "\ufeff", is treated as an
        # unknown tile and becomes "-", shifting the first row by one column.
        raw = fp.read_text(encoding="utf-8-sig", errors="replace")
        translated, unknown_in, illegal_out = translate_level_text(raw)

        all_unknown |= unknown_in
        all_illegal |= illegal_out

        out_fp = OUT_DIR / fp.name
        out_fp.write_text(translated, encoding="utf-8")

    print(f"Translated {len(files)} levels:")
    print(f"  in:  {IN_DIR.resolve()}")
    print(f"  out: {OUT_DIR.resolve()}")

    if all_unknown:
        # repr() keeps whitespace and control characters visible in the list.
        print("\n[WARN] Unknown input characters encountered (mapped to '-'):",
              " ".join(repr(ch) for ch in sorted(all_unknown)))

    if all_illegal:
        print("\n[WARN] Illegal output characters produced (should not happen):",
              " ".join(repr(ch) for ch in sorted(all_illegal)))


# Run the translation step only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Translated 100 levels:
  in:  C:\Users\user\Studia\DataScience\Semestr_V\pcg-arena\generators\MarioGPT\levels_raw
  out: C:\Users\user\Studia\DataScience\Semestr_V\pcg-arena\generators\MarioGPT\levels
