In [1]:
# Reconstruct YOLO-style train/ from local ZIP parts inside this notebook's folder.
# Expected ZIP layout: images/<file>, labels/<file>  (from the splitter you used)

from pathlib import Path
import zipfile, shutil, hashlib

# --- params you may tweak ---
# Output dataset root; the images/ and labels/ sub-folders are created beneath it.
DEST_TRAIN = Path("./train")     # where to reconstruct
# Glob pattern, evaluated against the current working directory, that selects
# the ZIP parts to merge. Matches are processed in sorted (name) order.
ZIP_GLOB   = "train_part_*.zip"  # which zip files to merge (all in this folder)
# ----------------------------

def sha1(p: Path) -> str:
    """Return the hexadecimal SHA-1 digest of the file at ``p``.

    The file is read in 1 MiB pieces, so its full content is never held in
    memory at once.
    """
    digest = hashlib.sha1()
    block_size = 1 << 20
    with p.open("rb") as handle:
        while True:
            block = handle.read(block_size)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()

def _sha1_stream(stream) -> str:
    """Return the SHA-1 hex digest of a readable binary stream (1 MiB reads)."""
    digest = hashlib.sha1()
    for chunk in iter(lambda: stream.read(1 << 20), b""):
        digest.update(chunk)
    return digest.hexdigest()


def _is_inside(path: Path, folder: Path) -> bool:
    """Return True when ``path`` resolves to a location underneath ``folder``.

    ``folder`` must already be an absolute, resolved path.
    """
    try:
        path.resolve().relative_to(folder)
    except ValueError:
        return False
    return True


# Collect the ZIP parts from the current working directory, in name order.
root = Path(".").resolve()
zips = sorted(root.glob(ZIP_GLOB))
if not zips:
    raise SystemExit(f"No ZIPs found matching pattern: {ZIP_GLOB}")

(DEST_TRAIN / "images").mkdir(parents=True, exist_ok=True)
(DEST_TRAIN / "labels").mkdir(parents=True, exist_ok=True)
dest_root = DEST_TRAIN.resolve()

print("Merging ZIPs:")
for z in zips:
    print(f"  - {z.name}")
print()

for z in zips:
    print(f"Processing {z.name} ...")
    with zipfile.ZipFile(z) as zf:
        for info in zf.infolist():
            if info.is_dir():
                continue
            rel = Path(info.filename)
            parts = rel.parts
            # Require "<images|labels>/<something>"; a bare entry named
            # "images" would otherwise collide with the directory itself.
            if len(parts) < 2 or parts[0].lower() not in {"images", "labels"}:
                continue  # ignore anything not under images/ or labels/
            target = DEST_TRAIN.joinpath(parts[0].lower(), *parts[1:])
            # Zip-slip guard: member names come from the archive and must not
            # be able to climb out of the destination folder.
            if ".." in parts or not _is_inside(target, dest_root):
                print(f"  ! skipping unsafe entry: {info.filename}")
                continue
            target.parent.mkdir(parents=True, exist_ok=True)

            if target.exists():
                # Name collision. Hash the archive member straight from the
                # ZIP (no temp file on disk) and compare it with the existing
                # file and with every existing "__dupN" sibling. Identical
                # content is skipped, so re-running the cell adds nothing new;
                # different content goes to the first free "__dupN" name.
                with zf.open(info) as src:
                    member_digest = _sha1_stream(src)
                base, suff = target.stem, target.suffix
                cand, i = target, 0
                already_present = False
                while cand.exists():
                    if (
                        cand.is_file()
                        # cheap size comparison first; hash only on a size match
                        and cand.stat().st_size == info.file_size
                        and sha1(cand) == member_digest
                    ):
                        already_present = True
                        break
                    i += 1
                    cand = target.with_name(f"{base}__dup{i}{suff}")
                if already_present:
                    continue
                target = cand

            with zf.open(info) as src, target.open("wb") as out:
                shutil.copyfileobj(src, out)

# --- quick summary & basic integrity check ---
# Count regular files only: glob("*") also yields sub-directories, which would
# otherwise be counted as images and reported as "images without labels".
imgs = sorted(p for p in (DEST_TRAIN / "images").glob("*") if p.is_file())
labs = sorted(p for p in (DEST_TRAIN / "labels").glob("*.txt") if p.is_file())
# Images and labels are paired by file stem (name without extension).
img_stems = {p.stem for p in imgs}
lab_stems = {p.stem for p in labs}
missing_labels = sorted(img_stems - lab_stems)
orphan_labels  = sorted(lab_stems - img_stems)

print("\nDone.")
print(f"Reconstructed at: {DEST_TRAIN.resolve()}")
print(f"Images: {len(imgs)} | Labels: {len(labs)}")

def preview(lst, n=10):
    """Join the first ``n`` strings of ``lst`` with ", ".

    A trailing " ..." is appended when ``lst`` holds more than ``n`` items.
    """
    text = ", ".join(lst[:n])
    if len(lst) > n:
        text += " ..."
    return text

# Report each kind of stem mismatch (count plus a short preview of the names);
# print the all-clear line only when neither list has entries.
if missing_labels:
    print(f"\nImages without labels ({len(missing_labels)}): {preview(missing_labels)}")
if orphan_labels:
    print(f"Labels without images ({len(orphan_labels)}): {preview(orphan_labels)}")
if not (missing_labels or orphan_labels):
    print("All image/label stems look matched.")


Merging ZIPs:
  - train_part_A.zip
  - train_part_B.zip

Processing train_part_A.zip ...
Processing train_part_B.zip ...

Done.
Reconstructed at: /cluster/home/henrban/SOLAQUA-UOD/uw_yolov8/data/train
Images: 17956 | Labels: 17956
All image/label stems look matched.


In [3]:
# Unzip val.zip (in the same folder as this notebook) into ./val/

from pathlib import Path
import zipfile, shutil

ZIP_PATH = Path("val.zip")       # your zip file name
DEST_DIR = Path("./val")         # where to extract

if not ZIP_PATH.exists():
    raise FileNotFoundError(f"❌ Can't find {ZIP_PATH.name} in {Path('.').resolve()}")

# Open the archive *before* deleting anything: ZipFile raises BadZipFile for a
# file that is not a readable ZIP, so a broken download can no longer wipe a
# previous good extraction and leave nothing behind.
with zipfile.ZipFile(ZIP_PATH, "r") as zf:
    # Remove old folder if it exists, so stale files from an earlier
    # extraction cannot linger next to the new ones.
    if DEST_DIR.exists():
        print(f"⚠️ Removing existing {DEST_DIR} ...")
        shutil.rmtree(DEST_DIR)

    print(f"Extracting {ZIP_PATH.name} → {DEST_DIR}/ ...")
    zf.extractall(DEST_DIR)

print("✅ Done.")


Extracting val.zip → val/ ...
✅ Done.
