In [1]:
# === Split CFC/images/train into 4 ZIPs, keeping paired labels ===
from pathlib import Path
import zipfile, random, math

# --- CONFIG ---
DATA_ROOT  = Path("../data/CFC")              # dataset root (relative to the working directory)
IMG_ROOT   = DATA_ROOT / "images" / "train"   # images to be split
LAB_ROOT   = DATA_ROOT / "labels" / "train"   # labels paired with those images
OUT_PREFIX = "cfc_train_part"                 # stem of each output ZIP name
NUM_PARTS  = 4                                # how many ZIPs to produce
SHUFFLE    = True                             # False → order pairs by image path instead
SEED       = 42                               # RNG seed used when SHUFFLE is True
IMG_EXTS   = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}  # lowercase, dot included
# --------------

# Fail fast when either input folder is missing.
for required_dir in (IMG_ROOT, LAB_ROOT):
    assert required_dir.is_dir(), f"❌ Missing: {required_dir}"

def find_images(folder: Path, exts=None):
    """Recursively collect image files under ``folder``.

    Args:
        folder: Directory to walk; every nesting level is searched.
        exts: Optional iterable of suffixes, dot included (e.g.
            ``{".jpg", ".png"}``). When omitted, the module-level
            ``IMG_EXTS`` set is used.

    Returns:
        A list of ``Path`` objects for regular files whose suffix, compared
        case-insensitively, is one of the allowed extensions. The order is
        whatever ``rglob`` yields; it is not sorted here.
    """
    allowed = IMG_EXTS if exts is None else {e.lower() for e in exts}
    # is_file() keeps directories that merely look like images (e.g. a folder
    # named "frames.jpg") out of the result; without it they would be counted
    # and zipped as if they were images.
    return [p for p in folder.rglob("*")
            if p.suffix.lower() in allowed and p.is_file()]

def find_label(img_path: Path):
    """Return the label file paired with ``img_path``, or ``None`` if absent.

    Two locations are probed, in this order:
      1. the mirrored path: the image's path relative to ``IMG_ROOT``,
         re-rooted under ``LAB_ROOT`` with its suffix replaced by ``.txt``;
      2. a flat fallback, ``LAB_ROOT/<image stem>.txt``.

    ``img_path`` must lie under ``IMG_ROOT``; otherwise ``relative_to``
    raises ``ValueError``.
    """
    mirrored = (LAB_ROOT / img_path.relative_to(IMG_ROOT)).with_suffix(".txt")
    if mirrored.exists():
        return mirrored

    flat = LAB_ROOT / f"{img_path.stem}.txt"
    if flat.exists():
        return flat
    return None

# ---- collect pairs ----
# rglob order depends on the filesystem, so sort by path first: the seeded
# shuffle below is only reproducible when it starts from the same ordering.
imgs = sorted(find_images(IMG_ROOT), key=lambda p: p.as_posix())
print(f"Found {len(imgs)} images")

# (image, label-or-None) tuples; images without a label file are kept.
pairs = [(img, find_label(img)) for img in imgs]

# ---- order: seeded shuffle, or stay sorted by image path ----
if SHUFFLE:
    # A private RNG instance leaves the global `random` state untouched.
    random.Random(SEED).shuffle(pairs)

# ---- split into NUM_PARTS near-equal chunks ----
if NUM_PARTS < 1:
    raise ValueError(f"NUM_PARTS must be >= 1, got {NUM_PARTS}")

n = len(pairs)
base_size, remainder = divmod(n, NUM_PARTS)
# The first `remainder` chunks take one extra pair so sizes differ by at most 1.
sizes = [base_size + (1 if i < remainder else 0) for i in range(NUM_PARTS)]

chunks = []
start = 0
for sz in sizes:
    chunks.append(pairs[start:start + sz])
    start += sz

# ---- write ZIPs ----
def write_zip(chunk, out_name):
    """Write one chunk of (image, label) pairs into a ZIP archive.

    Archive layout mirrors the dataset: images are stored under
    ``images/<path relative to IMG_ROOT>`` and labels under
    ``labels/<path relative to LAB_ROOT>``.

    Args:
        chunk: Iterable of ``(img_path, label_path_or_None)`` tuples. A pair
            whose label is ``None`` contributes only its image.
        out_name: Path of the ZIP file to create (opened in ``"w"`` mode).
    """
    # Several images can resolve to the same label file (flat fallback, or
    # same stem with different extensions). Track stored names so each archive
    # member is written once instead of producing duplicate entries.
    stored = set()

    with zipfile.ZipFile(out_name, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zf:
        for img, lbl in chunk:
            entries = [(img, Path("images") / img.relative_to(IMG_ROOT))]
            if lbl is not None:
                # mirrored labels/<same subfolder>/file.txt
                entries.append((lbl, Path("labels") / lbl.relative_to(LAB_ROOT)))

            for src, arc_path in entries:
                arcname = arc_path.as_posix()
                if arcname in stored:
                    continue
                stored.add(arcname)
                zf.write(src, arcname=arcname)

# One archive per chunk; the name suffix walks A, B, C, ... in chunk order.
out_paths = []
for part_idx, part in enumerate(chunks):
    letter = chr(ord("A") + part_idx)
    zipname = f"{OUT_PREFIX}_{letter}.zip"
    write_zip(part, zipname)
    out_paths.append(zipname)

    # Count the pairs in this part that actually carry a label file.
    labelled = sum(1 for _, label in part if label is not None)
    print(f"✅ wrote {zipname}: {len(part)} images, "
          f"{labelled} labels")



Found 76619 images
✅ wrote cfc_train_part_A.zip: 19155 images, 17447 labels
✅ wrote cfc_train_part_B.zip: 19155 images, 17489 labels
✅ wrote cfc_train_part_C.zip: 19155 images, 17348 labels
✅ wrote cfc_train_part_D.zip: 19154 images, 17370 labels


In [None]:
# Reconstruct CFC/{images,labels}/train from multiple ZIP parts.
# Drop this in a server .ipynb that sits near your ZIPs or adjust ZIP_DIR/ZIP_GLOB.

from pathlib import Path
import zipfile, shutil, hashlib, re

# --- CONFIG (edit if needed) ---
ZIP_DIR   = Path(".")                 # directory holding the ZIP parts
ZIP_GLOB  = "cfc_train_part_*.zip"    # glob that matches the parts (A..D)
DEST_ROOT = Path("./data/CFC")        # dataset root to rebuild on the server
SPLIT     = "train"                   # name of the split being reconstructed
# --------------------------------

# Destination folders follow the <root>/{images,labels}/<split> layout.
DEST_IMG = DEST_ROOT / "images" / SPLIT
DEST_LAB = DEST_ROOT / "labels" / SPLIT
for dest_dir in (DEST_IMG, DEST_LAB):
    dest_dir.mkdir(parents=True, exist_ok=True)

# Sorted so the parts are always merged in the same (lexicographic) order.
zips = sorted(ZIP_DIR.glob(ZIP_GLOB))
if len(zips) == 0:
    raise SystemExit(f"No ZIPs found matching {ZIP_GLOB} in {ZIP_DIR.resolve()}")

print("Merging ZIPs:")
for z in zips:
    print("  -", z.name)

def sha1(path: Path) -> str:
    """Return the hexadecimal SHA-1 digest of the file at ``path``.

    The file is consumed in 1 MiB reads rather than loaded in one go.
    """
    digest = hashlib.sha1()
    block_size = 1 << 20  # bytes per read (1 MiB)
    with path.open("rb") as fh:
        while True:
            block = fh.read(block_size)
            if not block:  # b"" signals end of file
                break
            digest.update(block)
    return digest.hexdigest()

def safe_extract_file(zf: zipfile.ZipFile, info: zipfile.ZipInfo, target: Path):
    """Extract a single archive member to ``target`` without clobbering data.

    Behaviour when ``target`` already exists:
      - identical content  -> nothing is written, ``target`` is returned;
      - different content  -> the member is stored next to it as
        ``<stem>__dupN<suffix>``, using the first ``N`` that is free. If an
        existing ``__dupN`` file already holds the same content it is reused,
        so re-running a merge does not keep adding copies.

    Args:
        zf: Open archive that contains ``info``.
        info: The member to extract.
        target: Desired destination path; parent folders are created.

    Returns:
        The path that holds the member's content after the call (``target``
        or one of its ``__dupN`` siblings).
    """
    target.parent.mkdir(parents=True, exist_ok=True)

    if not target.exists():
        with zf.open(info) as src, target.open("wb") as out:
            shutil.copyfileobj(src, out)
        return target

    # Target is taken: decompress once into a sibling temp file, then decide
    # what to do with it based on content hashes.
    tmp = target.with_suffix(target.suffix + ".tmp_merge")
    try:
        with zf.open(info) as src, tmp.open("wb") as out:
            shutil.copyfileobj(src, out)

        new_digest = sha1(tmp)
        if new_digest == sha1(target):
            return target  # identical, skip

        base, suf = target.stem, target.suffix
        i = 1
        while True:
            cand = target.with_name(f"{base}__dup{i}{suf}")
            if not cand.exists():
                # Move the already-extracted bytes into place instead of
                # decompressing the member a second time.
                tmp.replace(cand)
                return cand
            if sha1(cand) == new_digest:
                return cand  # this variant was stored by an earlier run
            i += 1
    finally:
        # Covers the "identical" paths and any exception raised above; after
        # a successful replace() the temp file no longer exists.
        if tmp.exists():
            tmp.unlink()

# Members processed per kind (members skipped as identical are counted too).
img_count = lab_count = 0

for zpath in zips:
    with zipfile.ZipFile(zpath) as zf:
        for info in zf.infolist():
            if info.is_dir():
                continue

            # Expect archive paths like: images/<...> or labels/<...>
            parts = Path(info.filename).parts
            if len(parts) < 2:
                # empty name, or a bare top-level file with nothing below it
                continue
            top = parts[0].lower()
            if top not in {"images", "labels"}:
                # ignore anything outside expected folders
                continue

            rel_path = Path(*parts[1:])  # keep original subfolder structure

            # Zip-slip guard: members are written via zf.open() to a path built
            # from the archive name, so a name containing ".." (or a drive /
            # root anchor) would land outside the destination folder.
            if ".." in parts or rel_path.anchor:
                print(f"⚠️ Skipping unsafe member {info.filename!r} in {zpath.name}")
                continue

            if top == "images":
                dst = DEST_IMG / rel_path
                safe_extract_file(zf, info, dst)
                img_count += 1
            else:
                dst = DEST_LAB / rel_path.with_suffix(".txt")
                safe_extract_file(zf, info, dst)
                lab_count += 1

print("\n✅ Done reconstructing.")
print("Images written to :", DEST_IMG.resolve())
print("Labels written to :", DEST_LAB.resolve())

# --- Quick integrity summary ---
# Every regular file under DEST_IMG is counted as an image; labels are the
# regular *.txt files under DEST_LAB.
img_files = [p for p in DEST_IMG.rglob("*") if p.is_file()]
lab_files = [p for p in DEST_LAB.rglob("*.txt") if p.is_file()]
print(f"Found {len(img_files)} image files and {len(lab_files)} label files on disk.")

def stem_set(paths, root=None):
    """Build comparable keys ``(relative_subdir, stem)`` for ``paths``.

    Args:
        paths: Iterable of file paths located under ``root``.
        root: Folder the sub-directory is made relative to; defaults to
            ``DEST_IMG`` when omitted.

    Returns:
        A set of ``(posix sub-directory relative to root, file stem)`` tuples.
        Files that share folder and stem but differ in extension collapse
        into a single key.
    """
    base = DEST_IMG if root is None else root
    return {(p.parent.relative_to(base).as_posix(), p.stem) for p in paths}

# An image and a label match when they sit in the same relative sub-folder
# of their respective roots and share a stem.
img_keys = stem_set(img_files, DEST_IMG)
lab_keys = stem_set(lab_files, DEST_LAB)

with_label    = len(img_keys & lab_keys)
without_label = len(img_keys - lab_keys)
orphan_labels = len(lab_keys - img_keys)

print(f"Images with labels   : {with_label}")
print(f"Images without labels: {without_label}")
print(f"Labels without images: {orphan_labels}")

if without_label:
    print("Note: some images have no matching label (expected if backgrounds were included).")


Merging ZIPs:
  - train_part_A.zip
  - train_part_B.zip

Processing train_part_A.zip ...
Processing train_part_B.zip ...

Done.
Reconstructed at: /cluster/home/henrban/SOLAQUA-UOD/uw_yolov8/data/train
Images: 17956 | Labels: 17956
All image/label stems look matched.


In [3]:
# Unzip val.zip (in the same folder as this notebook) into ./val/

from pathlib import Path
import zipfile, shutil

ZIP_PATH = Path("val.zip")       # archive to unpack
DEST_DIR = Path("./val")         # folder the archive is unpacked into

# Stop early, naming the working directory, when the archive is not there.
if not ZIP_PATH.exists():
    raise FileNotFoundError(f"❌ Can't find {ZIP_PATH.name} in {Path('.').resolve()}")

# Start from a clean folder: anything already in DEST_DIR is deleted first.
if DEST_DIR.exists():
    print(f"⚠️ Removing existing {DEST_DIR} ...")
    shutil.rmtree(DEST_DIR)

print(f"Extracting {ZIP_PATH.name} → {DEST_DIR}/ ...")
with zipfile.ZipFile(ZIP_PATH, mode="r") as archive:
    archive.extractall(path=DEST_DIR)

print("✅ Done.")


Extracting val.zip → val/ ...
✅ Done.
