In [None]:
# original
import os, json, random, shutil
from pathlib import Path
from collections import defaultdict
import numpy as np
import cv2

from pathlib import Path

# Weather tags recognised as a file-name suffix by weather_from_filename();
# any other suffix is reported as "clear".
WEATHER_LABELS = ("clear", "fog", "rain", "snow")

def with_weather_suffix(orig_name: str, tag: str) -> str:
    """Build ``<stem>_<tag>.jpg`` from ``orig_name``.

    Only the stem of ``orig_name`` is kept: any directory part and the
    original extension are dropped, and the result always ends in ``.jpg``.
    Example: ``'000123.jpg'`` with tag ``'fog'`` gives ``'000123_fog.jpg'``.
    """
    stem = Path(orig_name).stem
    return "{}_{}.jpg".format(stem, tag)

def weather_from_filename(file_name: str) -> str:
    """Return the weather tag encoded as the last ``_``-separated stem token.

    The token is lower-cased before lookup; if it is not one of
    ``WEATHER_LABELS`` the function returns ``'clear'``.
    """
    suffix = Path(file_name).stem.rsplit("_", 1)[-1].lower()
    if suffix in WEATHER_LABELS:
        return suffix
    return "clear"


# ========= CONFIG =========
# Input dataset: image folders and their COCO-format annotation files.
COCO_ROOT = "/nas.dbms/asera/PROJECTS/DATASET/ACDC-1"  # adjust
TRAIN_IM_DIR = f"{COCO_ROOT}/train"
VAL_IM_DIR   = f"{COCO_ROOT}/valid"
TRAIN_ANN_IN = f"{COCO_ROOT}/train/_annotations.coco.json"
VAL_ANN_IN   = f"{COCO_ROOT}/valid/_annotations.coco.json"

# Output locations for the selected images and the rewritten annotation JSONs.
OUT_ROOT     = f"{COCO_ROOT}/ACDC-1-NEW"  # will be created
OUT_TRAIN_IM = f"{OUT_ROOT}/images/train"
OUT_VAL_IM   = f"{OUT_ROOT}/images/val"
OUT_TRAIN_JS = f"{OUT_ROOT}/annotations/mini_train.json"
OUT_VAL_JS   = f"{OUT_ROOT}/annotations/mini_val.json"


# Upper bound on the number of source images sampled per split
# (all eligible images are used when fewer are available).
TRAIN_TARGET = 3000
VAL_TARGET   = 500


# size selection: "all", "small", "medium", "large"
SIZE_FILTER = "all"   # set to "all" | "small" | "medium" | "large"

# COCO size rules: small < 32^2, medium in [32^2, 96^2), large >= 96^2
# or strict max side rule if USE_COCO_AREA=False
USE_COCO_AREA = True
AREA_SMALL_MAX  = 32 * 32   # area thresholds, used when USE_COCO_AREA is True
AREA_MED_MAX    = 96 * 96
MAX_SIDE_SMALL  = 48        # longest-side thresholds, used when USE_COCO_AREA is False
MAX_SIDE_MEDMAX = 96

# keep only annotations that match SIZE_FILTER within selected images
# (has no effect when SIZE_FILTER is "all")
KEEP_ONLY_MATCHING_ANNS = True

# weather settings
APPLY_WEATHER = False   # master switch. False means no weather, images copied as-is
APPLY_FOG  = False
APPLY_RAIN = False
APPLY_SNOW = False
# 3 returns every enabled effect; any other value picks one enabled effect at random
WEATHER_PER_IMAGE = 3   # 1 means pick one at random, 3 means generate fog+rain+snow

# DAWN-like classes: category names kept by filter_to_targets()
TARGET_CLASS_NAMES = ["person","bicycle","car","motorcycle","bus","truck"]
# TARGET_CLASS_NAMES = ["0","1","2","3","4","5"]  # DAWN-1 classes
# Select all COCO classes without listing them one by one
class _All:
    def __contains__(self, _):
        return True

# TARGET_CLASS_NAMES = _All()  # means all 80 COCO classes

# reproducibility: seed both generators used in this script
# (`random` drives image sampling and weather choice, `np.random` drives
# the rain streak and snow flake placement)
RNG_SEED = 42
random.seed(RNG_SEED)
np.random.seed(RNG_SEED)

# ========= WEATHER OPS =========
def _gaussian_mask(h, w, sigma=0.35):
    yy, xx = np.mgrid[0:h, 0:w]
    yy = (yy - h/2) / (sigma*h)
    xx = (xx - w/2) / (sigma*w)
    m = np.exp(-(xx**2 + yy**2))
    m = (m - m.min()) / (m.max() - m.min() + 1e-8)
    return m[..., None].astype(np.float32)

def add_fog(img, strength=0.55):
    """Blend a brightened, blurred copy of ``img`` into it with a centred mask.

    The blur sigma is 2% of the longer image side (at least 3). The blurred
    image is lifted by ``strength * 255`` and clipped to 0..255, then mixed
    with the original using the weights from ``_gaussian_mask``.
    Returns a uint8 array.
    """
    height, width = img.shape[:2]
    blur_sigma = max(3, int(0.02 * max(height, width)))
    blurred = cv2.GaussianBlur(img, (0, 0), sigmaX=blur_sigma)
    haze = np.clip(blurred + strength * 255, 0, 255).astype(np.uint8)
    weight = _gaussian_mask(height, width, sigma=0.38)
    blended = img * (1 - weight) + haze * weight
    return blended.astype(np.uint8)

def add_rain(img, streaks=1100, alpha=0.20, angle_deg=18, length=(8,18)):
    """Overlay ``streaks`` white anti-aliased line segments on ``img``.

    Each segment starts at a random pixel, has a random length in
    ``[length[0], length[1]]`` and points along ``angle_deg``. The streak
    layer is box-blurred (3x3) and mixed with the input using weight ``alpha``.
    Random numbers come from ``np.random``.
    """
    rows, cols = img.shape[:2]
    theta = np.deg2rad(angle_deg)
    dir_x, dir_y = np.cos(theta), np.sin(theta)
    white = (255, 255, 255)

    layer = img.copy()
    for _ in range(streaks):
        # Draw order (length, x, y) is kept so the random stream is unchanged.
        seg_len = np.random.randint(length[0], length[1] + 1)
        x0 = np.random.randint(0, cols)
        y0 = np.random.randint(0, rows)
        end = (int(x0 + seg_len * dir_x), int(y0 + seg_len * dir_y))
        cv2.line(layer, (x0, y0), end, white, 1, cv2.LINE_AA)

    layer = cv2.blur(layer, (3, 3))
    return cv2.addWeighted(layer, alpha, img, 1 - alpha, 0)

def add_snow(img, flakes=2000, alpha=0.30):
    """Overlay ``flakes`` filled white discs of radius 1 or 2 on ``img``.

    Disc centres are drawn uniformly over the image with ``np.random``. The
    flake layer is Gaussian-blurred (3x3) and mixed with the input using
    weight ``alpha``.
    """
    rows, cols = img.shape[:2]
    white = (255, 255, 255)

    layer = img.copy()
    for _ in range(flakes):
        # Draw order (x, y, radius) is kept so the random stream is unchanged.
        cx = np.random.randint(0, cols)
        cy = np.random.randint(0, rows)
        radius = np.random.randint(1, 3)
        cv2.circle(layer, (cx, cy), radius, white, -1, cv2.LINE_AA)

    layer = cv2.GaussianBlur(layer, (3, 3), 0)
    return cv2.addWeighted(layer, alpha, img, 1 - alpha, 0)

def weather_once(img, which):
    """Apply the effect named ``which`` ('fog', 'rain' or 'snow') to ``img``.

    Any other value of ``which`` returns ``img`` itself, unmodified.
    """
    if which == "fog":
        effect = add_fog
    elif which == "rain":
        effect = add_rain
    elif which == "snow":
        effect = add_snow
    else:
        return img
    return effect(img)

# ========= HELPERS =========
def load_json(p):
    """Read the JSON file at path ``p`` and return the parsed object.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.
    """
    with open(p, "r", encoding="utf-8") as f:
        return json.load(f)

def ensure_dir(p):
    """Create directory ``p`` and any missing parents; no error if it exists."""
    directory = Path(p)
    directory.mkdir(exist_ok=True, parents=True)

def filter_to_targets(coco):
    """Select the categories named in ``TARGET_CLASS_NAMES`` and renumber them.

    Returns ``(keep_cat, oldid2new)``:
      - ``keep_cat``: copies of the selected category dicts, in their original
        order, with ``id`` replaced by 1..K.
      - ``oldid2new``: mapping from each selected category's original id to
        its new id.

    The dicts inside ``coco["categories"]`` are left untouched, so ``coco``
    stays consistent with its own annotations and the function can be called
    again on the same object with the same result.
    """
    selected = [c for c in coco["categories"] if c["name"] in TARGET_CLASS_NAMES]
    oldid2new = {c["id"]: new_id for new_id, c in enumerate(selected, start=1)}
    # Copy each category instead of rewriting the caller's dict in place.
    keep_cat = [{**c, "id": new_id} for new_id, c in enumerate(selected, start=1)]
    return keep_cat, oldid2new

def coco_size_bucket(area):
    """Classify an area as 'small', 'medium' or 'large'.

    'small' is below ``AREA_SMALL_MAX``, 'medium' is below ``AREA_MED_MAX``,
    everything else is 'large'.
    """
    for label, upper in (("small", AREA_SMALL_MAX), ("medium", AREA_MED_MAX)):
        if area < upper:
            return label
    return "large"

def side_size_bucket(w, h):
    """Classify a box by its longer side as 'small', 'medium' or 'large'.

    'small' is below ``MAX_SIDE_SMALL``, 'medium' is below
    ``MAX_SIDE_MEDMAX``, everything else is 'large'.
    """
    longest = max(w, h)
    for label, upper in (("small", MAX_SIDE_SMALL), ("medium", MAX_SIDE_MEDMAX)):
        if longest < upper:
            return label
    return "large"

def ann_size_bucket(ann):
    """Return the size bucket of one annotation dict.

    With ``USE_COCO_AREA`` the bucket is based on ``ann["area"]`` (falling
    back to bbox width * height when the key is absent); otherwise it is
    based on the longer bbox side.
    """
    _, _, box_w, box_h = ann["bbox"]
    area = ann.get("area", box_w * box_h)
    if USE_COCO_AREA:
        return coco_size_bucket(area)
    return side_size_bucket(box_w, box_h)

def matches_size_filter(size_name):
    """Return True when ``SIZE_FILTER`` is "all" or equals ``size_name``."""
    return SIZE_FILTER == "all" or size_name == SIZE_FILTER

def choose_weathers():
    """Return the list of weather effects to apply to one image.

    Returns ``[]`` when ``APPLY_WEATHER`` is off or no individual effect is
    enabled. Otherwise returns every enabled effect (in fog, rain, snow
    order) when ``WEATHER_PER_IMAGE == 3``, or a single enabled effect chosen
    with ``random.choice`` for any other value.
    """
    if not APPLY_WEATHER:
        return []

    switches = (("fog", APPLY_FOG), ("rain", APPLY_RAIN), ("snow", APPLY_SNOW))
    enabled = [name for name, is_on in switches if is_on]
    if not enabled:
        return []

    if WEATHER_PER_IMAGE != 3:
        return [random.choice(enabled)]
    return enabled

def sample_images(kept_images, target_n):
    """Return at most ``target_n`` images from ``kept_images``.

    When the list already has ``target_n`` items or fewer it is returned
    as-is (the same list object); otherwise ``random.sample`` draws
    ``target_n`` items without replacement.
    """
    if len(kept_images) > target_n:
        return random.sample(kept_images, target_n)
    return kept_images

# ========= CORE SELECTION =========
def collect_images_by_size(coco, img_dir):
    """Pick the images of ``coco`` that qualify under the class and size filters.

    An image qualifies when it has at least one annotation of a target
    category, its file exists under ``img_dir`` and, unless ``SIZE_FILTER``
    is "all", at least one of those annotations falls in the requested size
    bucket.

    Returns ``(kept_images, matched_anns_by_img, keep_cats, old2new)`` where
    ``matched_anns_by_img`` maps image id to the annotations to export: all
    target-category annotations, or only the size-matching ones when a size
    filter is active and ``KEEP_ONLY_MATCHING_ANNS`` is set.
    """
    keep_cats, old2new = filter_to_targets(coco)

    # Group target-category annotations by the image they belong to.
    target_anns = defaultdict(list)
    for ann in coco["annotations"]:
        if ann["category_id"] in old2new:
            target_anns[ann["image_id"]].append(ann)

    select_all = SIZE_FILTER == "all"
    kept_images = []
    matched_anns_by_img = defaultdict(list)
    for img in coco["images"]:
        iid = img["id"]
        if iid not in target_anns:
            continue
        if not os.path.isfile(os.path.join(img_dir, img["file_name"])):
            continue

        anns = target_anns[iid]
        size_matched = [a for a in anns if matches_size_filter(ann_size_bucket(a))]

        if select_all:
            # any target class is fine
            chosen = anns
        elif size_matched:
            chosen = size_matched if KEEP_ONLY_MATCHING_ANNS else anns
        else:
            continue

        kept_images.append(img)
        matched_anns_by_img[iid] = chosen

    return kept_images, matched_anns_by_img, keep_cats, old2new

# ========= JSON REWRITE =========
def rewrite_json(coco, kept_images, anns_by_img, keep_cats, old2new, new_filenames):
    """Build the output COCO dict for the selected images.

    Args:
        coco: source COCO dict; only its ``info`` and ``licenses`` are reused.
        kept_images: image dicts to export (ids are preserved).
        anns_by_img: mapping image id -> list of source annotation dicts.
        keep_cats: category list written out unchanged as ``categories``.
        old2new: mapping source category id -> new category id; annotations
            whose category is not mapped are dropped.
        new_filenames: mapping image id -> output file name; images without
            an entry keep their original ``file_name``.

    Returns:
        A dict with ``info``, ``licenses``, ``images``, ``annotations`` and
        ``categories``. Annotations are emitted in the order of
        ``kept_images`` and numbered from 1, so annotation ids follow the
        image list rather than set-iteration order. Images left without any
        annotation are removed.
    """
    out_imgs = [
        {
            "id": im["id"],
            "file_name": new_filenames.get(im["id"], im["file_name"]),
            "width": im.get("width"),
            "height": im.get("height"),
        }
        for im in kept_images
    ]

    out_anns = []
    # dict.fromkeys removes duplicate image ids while keeping list order.
    for iid in dict.fromkeys(im["id"] for im in kept_images):
        for a in anns_by_img[iid]:
            cid_new = old2new.get(a["category_id"])
            if not cid_new:
                continue
            x, y, w, h = a["bbox"]
            out_anns.append({
                "id": len(out_anns) + 1,
                "image_id": iid,
                "category_id": cid_new,
                "bbox": [float(x), float(y), float(w), float(h)],
                "area": float(a.get("area", w * h)),
                "iscrowd": int(a.get("iscrowd", 0)),
            })

    # safety: drop images that lost all anns
    valid_ids = {a["image_id"] for a in out_anns}
    out_imgs = [im for im in out_imgs if im["id"] in valid_ids]

    return {
        "info": coco.get("info", {"description": "COCO size-filtered mini"}),
        "licenses": coco.get("licenses", []),
        "images": out_imgs,
        "annotations": out_anns,
        "categories": keep_cats,
    }

# ========= PIPELINE =========
def process_split(img_dir, ann_in, out_img_dir, out_json_path, target_n):
    """Select, optionally weather-augment, and export one dataset split.

    Args:
        img_dir: directory holding the source images.
        ann_in: path of the source COCO annotation JSON.
        out_img_dir: directory that receives the output images (created).
        out_json_path: path of the rewritten annotation JSON (parent created).
        target_n: maximum number of source images to sample.

    Raises:
        RuntimeError: if no image passes the class/size selection.

    With weather off (or no effect enabled) each image is copied as
    ``<stem>.jpg``. With weather on, one file ``<stem>_<effect>.jpg`` is
    written per chosen effect, but only the first written file is referenced
    in the JSON. Images that cannot be decoded, or for which no output file
    could be written, are reported and left out of the JSON so that every
    listed ``file_name`` exists in ``out_img_dir``.
    """
    print(f"[SPLIT] {img_dir} size={SIZE_FILTER} weather={'on' if APPLY_WEATHER else 'off'}")
    coco = load_json(ann_in)
    kept_images, anns_by_img, keep_cats, old2new = collect_images_by_size(coco, img_dir)
    if not kept_images:
        raise RuntimeError("No images matched the size filter with target classes.")

    print(f"  eligible images: {len(kept_images)}")
    samp = sample_images(kept_images, target_n)
    ensure_dir(out_img_dir)

    new_names = {}
    written = []  # images that really have an output file
    for im in samp:
        in_path = os.path.join(img_dir, im["file_name"])
        base = Path(im["file_name"]).stem

        weathers = []
        img = None
        if APPLY_WEATHER:
            img = cv2.imread(in_path)
            if img is None:
                print(f"  warning: could not read {in_path}, skipping")
                continue
            weathers = choose_weathers()

        if not weathers:
            # weather off (or nothing enabled): copy original
            first_name = f"{base}.jpg"
            shutil.copyfile(in_path, os.path.join(out_img_dir, first_name))
        else:
            first_name = None
            for which in weathers:
                out_name = f"{base}_{which}.jpg"
                out_path = os.path.join(out_img_dir, out_name)
                # cv2.imwrite reports failure through its return value.
                if not cv2.imwrite(out_path, weather_once(img, which)):
                    print(f"  warning: failed to write {out_path}")
                    continue
                if first_name is None:
                    first_name = out_name
            if first_name is None:
                continue

        new_names[im["id"]] = first_name
        written.append(im)

    out_json = rewrite_json(coco, written, anns_by_img, keep_cats, old2new, new_names)
    ensure_dir(Path(out_json_path).parent)
    with open(out_json_path, "w") as f:
        json.dump(out_json, f)
    print(f"  -> wrote {len(out_json['images'])} images to {out_img_dir}")
    print(f"  -> {out_json_path}")
    if len(out_json["annotations"]) == 0:
        print("  warning: zero annotations after filtering")

def main():
    """Create the output root, then process the train and validation splits."""
    ensure_dir(OUT_ROOT)
    splits = (
        (TRAIN_IM_DIR, TRAIN_ANN_IN, OUT_TRAIN_IM, OUT_TRAIN_JS, TRAIN_TARGET),
        (VAL_IM_DIR, VAL_ANN_IN, OUT_VAL_IM, OUT_VAL_JS, VAL_TARGET),
    )
    for split_args in splits:
        process_split(*split_args)


if __name__ == "__main__":
    main()


In [None]:
# weather label tags added
import os, json, random, shutil
from pathlib import Path
from collections import defaultdict
import numpy as np
import cv2

from pathlib import Path

# Weather tags recognised as a file-name suffix by weather_from_filename();
# any other suffix is reported as "clear".
WEATHER_LABELS = ("clear", "fog", "rain", "snow")

def with_weather_suffix(orig_name: str, tag: str) -> str:
    """Build ``<stem>_<tag>.jpg`` from ``orig_name``.

    Only the stem of ``orig_name`` is kept: any directory part and the
    original extension are dropped, and the result always ends in ``.jpg``.
    Example: ``'000123.jpg'`` with tag ``'fog'`` gives ``'000123_fog.jpg'``.
    """
    stem = Path(orig_name).stem
    return "{}_{}.jpg".format(stem, tag)

def weather_from_filename(file_name: str) -> str:
    """Return the weather tag encoded as the last ``_``-separated stem token.

    The token is lower-cased before lookup; if it is not one of
    ``WEATHER_LABELS`` the function returns ``'clear'``.
    """
    suffix = Path(file_name).stem.rsplit("_", 1)[-1].lower()
    if suffix in WEATHER_LABELS:
        return suffix
    return "clear"


# ========= CONFIG =========
# Input dataset: image folders and their COCO-format annotation files.
COCO_ROOT = "/nas.dbms/asera/PROJECTS/DATASET/COCO"  # adjust
TRAIN_IM_DIR = f"{COCO_ROOT}/images/train2017"
VAL_IM_DIR   = f"{COCO_ROOT}/images/val2017"
TRAIN_ANN_IN = f"{COCO_ROOT}/annotations/instances_train2017.json"
VAL_ANN_IN   = f"{COCO_ROOT}/annotations/instances_val2017.json"

# Output locations for the selected images and the rewritten annotation JSONs.
OUT_ROOT     = f"{COCO_ROOT}/weather-mini-tag"  # will be created
OUT_TRAIN_IM = f"{OUT_ROOT}/images/train2017_weather_tag_clear-2400k6c"
OUT_VAL_IM   = f"{OUT_ROOT}/images/val2017_weather_tag_clear-500k6c"
OUT_TRAIN_JS = f"{OUT_ROOT}/annotations/mini_train2017_weather_tag_clear-2400k6c.json"
OUT_VAL_JS   = f"{OUT_ROOT}/annotations/mini_val2017_weather_tag_clear-500k6c.json"


# Upper bound on the number of source images sampled per split
# (all eligible images are used when fewer are available).
TRAIN_TARGET = 2400
VAL_TARGET   = 500


# size selection: "all", "small", "medium", "large"
SIZE_FILTER = "all"   # set to "all" | "small" | "medium" | "large"

# COCO size rules: small < 32^2, medium in [32^2, 96^2), large >= 96^2
# or strict max side rule if USE_COCO_AREA=False
USE_COCO_AREA = True
AREA_SMALL_MAX  = 32 * 32   # area thresholds, used when USE_COCO_AREA is True
AREA_MED_MAX    = 96 * 96
MAX_SIDE_SMALL  = 48        # longest-side thresholds, used when USE_COCO_AREA is False
MAX_SIDE_MEDMAX = 96

# keep only annotations that match SIZE_FILTER within selected images
# (has no effect when SIZE_FILTER is "all")
KEEP_ONLY_MATCHING_ANNS = True

# weather settings
APPLY_WEATHER = False   # master switch. False means no effects; images are copied with a _clear suffix
APPLY_FOG  = False
APPLY_RAIN = False
APPLY_SNOW = False
# 3 returns every enabled effect; any other value picks one enabled effect at random
WEATHER_PER_IMAGE = 3   # 1 means pick one at random, 3 means generate fog+rain+snow

# DAWN-like classes: category names kept by filter_to_targets()
TARGET_CLASS_NAMES = ["person","bicycle","car","motorcycle","bus","truck"]
# Select all COCO classes without listing them one by one
class _All:
    def __contains__(self, _):
        return True

# TARGET_CLASS_NAMES = _All()  # means all 80 COCO classes

# reproducibility: seed both generators used in this script
# (`random` drives image sampling and weather choice, `np.random` drives
# the rain streak and snow flake placement)
RNG_SEED = 42
random.seed(RNG_SEED)
np.random.seed(RNG_SEED)

# ========= WEATHER OPS =========
def _gaussian_mask(h, w, sigma=0.35):
    yy, xx = np.mgrid[0:h, 0:w]
    yy = (yy - h/2) / (sigma*h)
    xx = (xx - w/2) / (sigma*w)
    m = np.exp(-(xx**2 + yy**2))
    m = (m - m.min()) / (m.max() - m.min() + 1e-8)
    return m[..., None].astype(np.float32)

def add_fog(img, strength=0.55):
    """Blend a brightened, blurred copy of ``img`` into it with a centred mask.

    The blur sigma is 2% of the longer image side (at least 3). The blurred
    image is lifted by ``strength * 255`` and clipped to 0..255, then mixed
    with the original using the weights from ``_gaussian_mask``.
    Returns a uint8 array.
    """
    height, width = img.shape[:2]
    blur_sigma = max(3, int(0.02 * max(height, width)))
    blurred = cv2.GaussianBlur(img, (0, 0), sigmaX=blur_sigma)
    haze = np.clip(blurred + strength * 255, 0, 255).astype(np.uint8)
    weight = _gaussian_mask(height, width, sigma=0.38)
    blended = img * (1 - weight) + haze * weight
    return blended.astype(np.uint8)

def add_rain(img, streaks=1100, alpha=0.20, angle_deg=18, length=(8,18)):
    """Overlay ``streaks`` white anti-aliased line segments on ``img``.

    Each segment starts at a random pixel, has a random length in
    ``[length[0], length[1]]`` and points along ``angle_deg``. The streak
    layer is box-blurred (3x3) and mixed with the input using weight ``alpha``.
    Random numbers come from ``np.random``.
    """
    rows, cols = img.shape[:2]
    theta = np.deg2rad(angle_deg)
    dir_x, dir_y = np.cos(theta), np.sin(theta)
    white = (255, 255, 255)

    layer = img.copy()
    for _ in range(streaks):
        # Draw order (length, x, y) is kept so the random stream is unchanged.
        seg_len = np.random.randint(length[0], length[1] + 1)
        x0 = np.random.randint(0, cols)
        y0 = np.random.randint(0, rows)
        end = (int(x0 + seg_len * dir_x), int(y0 + seg_len * dir_y))
        cv2.line(layer, (x0, y0), end, white, 1, cv2.LINE_AA)

    layer = cv2.blur(layer, (3, 3))
    return cv2.addWeighted(layer, alpha, img, 1 - alpha, 0)

def add_snow(img, flakes=2000, alpha=0.30):
    """Overlay ``flakes`` filled white discs of radius 1 or 2 on ``img``.

    Disc centres are drawn uniformly over the image with ``np.random``. The
    flake layer is Gaussian-blurred (3x3) and mixed with the input using
    weight ``alpha``.
    """
    rows, cols = img.shape[:2]
    white = (255, 255, 255)

    layer = img.copy()
    for _ in range(flakes):
        # Draw order (x, y, radius) is kept so the random stream is unchanged.
        cx = np.random.randint(0, cols)
        cy = np.random.randint(0, rows)
        radius = np.random.randint(1, 3)
        cv2.circle(layer, (cx, cy), radius, white, -1, cv2.LINE_AA)

    layer = cv2.GaussianBlur(layer, (3, 3), 0)
    return cv2.addWeighted(layer, alpha, img, 1 - alpha, 0)

def weather_once(img, which):
    """Apply the effect named ``which`` ('fog', 'rain' or 'snow') to ``img``.

    Any other value of ``which`` returns ``img`` itself, unmodified.
    """
    if which == "fog":
        effect = add_fog
    elif which == "rain":
        effect = add_rain
    elif which == "snow":
        effect = add_snow
    else:
        return img
    return effect(img)

# ========= HELPERS =========
def load_json(p):
    """Read the JSON file at path ``p`` and return the parsed object.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.
    """
    with open(p, "r", encoding="utf-8") as f:
        return json.load(f)

def ensure_dir(p):
    """Create directory ``p`` and any missing parents; no error if it exists."""
    directory = Path(p)
    directory.mkdir(exist_ok=True, parents=True)

def filter_to_targets(coco):
    """Select the categories named in ``TARGET_CLASS_NAMES`` and renumber them.

    Returns ``(keep_cat, oldid2new)``:
      - ``keep_cat``: copies of the selected category dicts, in their original
        order, with ``id`` replaced by 1..K.
      - ``oldid2new``: mapping from each selected category's original id to
        its new id.

    The dicts inside ``coco["categories"]`` are left untouched, so ``coco``
    stays consistent with its own annotations and the function can be called
    again on the same object with the same result.
    """
    selected = [c for c in coco["categories"] if c["name"] in TARGET_CLASS_NAMES]
    oldid2new = {c["id"]: new_id for new_id, c in enumerate(selected, start=1)}
    # Copy each category instead of rewriting the caller's dict in place.
    keep_cat = [{**c, "id": new_id} for new_id, c in enumerate(selected, start=1)]
    return keep_cat, oldid2new

def coco_size_bucket(area):
    """Classify an area as 'small', 'medium' or 'large'.

    'small' is below ``AREA_SMALL_MAX``, 'medium' is below ``AREA_MED_MAX``,
    everything else is 'large'.
    """
    for label, upper in (("small", AREA_SMALL_MAX), ("medium", AREA_MED_MAX)):
        if area < upper:
            return label
    return "large"

def side_size_bucket(w, h):
    """Classify a box by its longer side as 'small', 'medium' or 'large'.

    'small' is below ``MAX_SIDE_SMALL``, 'medium' is below
    ``MAX_SIDE_MEDMAX``, everything else is 'large'.
    """
    longest = max(w, h)
    for label, upper in (("small", MAX_SIDE_SMALL), ("medium", MAX_SIDE_MEDMAX)):
        if longest < upper:
            return label
    return "large"

def ann_size_bucket(ann):
    """Return the size bucket of one annotation dict.

    With ``USE_COCO_AREA`` the bucket is based on ``ann["area"]`` (falling
    back to bbox width * height when the key is absent); otherwise it is
    based on the longer bbox side.
    """
    _, _, box_w, box_h = ann["bbox"]
    area = ann.get("area", box_w * box_h)
    if USE_COCO_AREA:
        return coco_size_bucket(area)
    return side_size_bucket(box_w, box_h)

def matches_size_filter(size_name):
    """Return True when ``SIZE_FILTER`` is "all" or equals ``size_name``."""
    return SIZE_FILTER == "all" or size_name == SIZE_FILTER

def choose_weathers():
    """Return the list of weather effects to apply to one image.

    Returns ``[]`` when ``APPLY_WEATHER`` is off or no individual effect is
    enabled. Otherwise returns every enabled effect (in fog, rain, snow
    order) when ``WEATHER_PER_IMAGE == 3``, or a single enabled effect chosen
    with ``random.choice`` for any other value.
    """
    if not APPLY_WEATHER:
        return []

    switches = (("fog", APPLY_FOG), ("rain", APPLY_RAIN), ("snow", APPLY_SNOW))
    enabled = [name for name, is_on in switches if is_on]
    if not enabled:
        return []

    if WEATHER_PER_IMAGE != 3:
        return [random.choice(enabled)]
    return enabled

def sample_images(kept_images, target_n):
    """Return at most ``target_n`` images from ``kept_images``.

    When the list already has ``target_n`` items or fewer it is returned
    as-is (the same list object); otherwise ``random.sample`` draws
    ``target_n`` items without replacement.
    """
    if len(kept_images) > target_n:
        return random.sample(kept_images, target_n)
    return kept_images

# ========= CORE SELECTION =========
def collect_images_by_size(coco, img_dir):
    """Pick the images of ``coco`` that qualify under the class and size filters.

    An image qualifies when it has at least one annotation of a target
    category, its file exists under ``img_dir`` and, unless ``SIZE_FILTER``
    is "all", at least one of those annotations falls in the requested size
    bucket.

    Returns ``(kept_images, matched_anns_by_img, keep_cats, old2new)`` where
    ``matched_anns_by_img`` maps image id to the annotations to export: all
    target-category annotations, or only the size-matching ones when a size
    filter is active and ``KEEP_ONLY_MATCHING_ANNS`` is set.
    """
    keep_cats, old2new = filter_to_targets(coco)

    # Group target-category annotations by the image they belong to.
    target_anns = defaultdict(list)
    for ann in coco["annotations"]:
        if ann["category_id"] in old2new:
            target_anns[ann["image_id"]].append(ann)

    select_all = SIZE_FILTER == "all"
    kept_images = []
    matched_anns_by_img = defaultdict(list)
    for img in coco["images"]:
        iid = img["id"]
        if iid not in target_anns:
            continue
        if not os.path.isfile(os.path.join(img_dir, img["file_name"])):
            continue

        anns = target_anns[iid]
        size_matched = [a for a in anns if matches_size_filter(ann_size_bucket(a))]

        if select_all:
            # any target class is fine
            chosen = anns
        elif size_matched:
            chosen = size_matched if KEEP_ONLY_MATCHING_ANNS else anns
        else:
            continue

        kept_images.append(img)
        matched_anns_by_img[iid] = chosen

    return kept_images, matched_anns_by_img, keep_cats, old2new

# ========= JSON REWRITE =========
def rewrite_json(coco, variants, anns_by_img, keep_cats, old2new):
    """Build the output COCO dict with one image entry per written variant.

    Args:
        coco: source COCO dict; only its ``info`` and ``licenses`` are reused.
        variants: list of dicts with keys
            ``orig_id``, ``new_id``, ``file_name``, ``width``, ``height``,
            ``weather``.
        anns_by_img: mapping source image id -> list of annotation dicts.
        keep_cats: category list written out unchanged as ``categories``.
        old2new: mapping source category id -> new category id; annotations
            whose category is not mapped are dropped.

    Every variant receives a copy of its source image's annotations, attached
    to the variant's ``new_id`` and numbered consecutively from 1. Variants
    that end up with no annotation are removed from ``images``.
    """
    image_entries = []
    annotation_entries = []

    for variant in variants:
        image_entries.append({
            "id": variant["new_id"],
            # "weather" is a custom field; most COCO readers ignore unknown keys
            **{key: variant[key] for key in ("file_name", "width", "height", "weather")},
        })

        for src in anns_by_img[variant["orig_id"]]:
            mapped_cat = old2new.get(src["category_id"])
            if not mapped_cat:
                continue
            left, top, box_w, box_h = src["bbox"]
            annotation_entries.append({
                "id": len(annotation_entries) + 1,
                "image_id": variant["new_id"],
                "category_id": mapped_cat,
                "bbox": [float(left), float(top), float(box_w), float(box_h)],
                "area": float(src.get("area", box_w * box_h)),
                "iscrowd": int(src.get("iscrowd", 0)),
            })

    # drop images that ended up annotation-less
    annotated = {entry["image_id"] for entry in annotation_entries}
    image_entries = [entry for entry in image_entries if entry["id"] in annotated]

    default_info = {"description": "COCO size-filtered mini with weather tags"}
    return {
        "info": coco.get("info", default_info),
        "licenses": coco.get("licenses", []),
        "images": image_entries,
        "annotations": annotation_entries,
        "categories": keep_cats,
    }

# ========= PIPELINE =========
def process_split(img_dir, ann_in, out_img_dir, out_json_path, target_n):
    """Select, weather-tag, and export one dataset split.

    Args:
        img_dir: directory holding the source images.
        ann_in: path of the source COCO annotation JSON.
        out_img_dir: directory that receives the output images (created).
        out_json_path: path of the rewritten annotation JSON (parent created).
        target_n: maximum number of source images to sample.

    Raises:
        RuntimeError: if no image passes the class/size selection.

    Each sampled image yields one output file per weather tag, named with
    ``with_weather_suffix``: a plain copy for "clear" (used when weather is
    off or no effect is enabled) and a synthesised image otherwise. Every
    file that was written gets its own image id (1, 2, ...) in the JSON.
    Images that cannot be decoded and variants whose write fails are reported
    and left out, so the JSON only lists files that exist.
    """
    print(f"[SPLIT] {img_dir} size={SIZE_FILTER} weather={'on' if APPLY_WEATHER else 'off'}")
    coco = load_json(ann_in)
    kept_images, anns_by_img, keep_cats, old2new = collect_images_by_size(coco, img_dir)
    if not kept_images:
        raise RuntimeError("No images matched the size filter with target classes.")

    print(f"  eligible images: {len(kept_images)}")
    samp = sample_images(kept_images, target_n)
    ensure_dir(out_img_dir)

    variants = []
    next_img_id = 1  # re-id images in the new JSON

    for im in samp:
        in_path = os.path.join(img_dir, im["file_name"])

        if APPLY_WEATHER:
            img = cv2.imread(in_path)
            if img is None:
                print(f"  warning: could not read {in_path}, skipping")
                continue
            weathers = choose_weathers() or ["clear"]  # safety
        else:
            # weather off: still tag filenames as _clear so you can FiLM by label
            img = None
            weathers = ["clear"]

        for which in weathers:
            out_name = with_weather_suffix(im["file_name"], which)
            out_path = os.path.join(out_img_dir, out_name)
            if which == "clear":
                shutil.copyfile(in_path, out_path)
            elif not cv2.imwrite(out_path, weather_once(img, which)):
                # cv2.imwrite reports failure through its return value.
                print(f"  warning: failed to write {out_path}, skipping")
                continue

            variants.append({
                "orig_id": im["id"],
                "new_id": next_img_id,
                "file_name": out_name,
                "width": im.get("width"),
                "height": im.get("height"),
                "weather": which,
            })
            next_img_id += 1

    out_json = rewrite_json(coco, variants, anns_by_img, keep_cats, old2new)
    ensure_dir(Path(out_json_path).parent)
    with open(out_json_path, "w") as f:
        json.dump(out_json, f)
    print(f"  -> wrote {len(out_json['images'])} images to {out_img_dir}")
    print(f"  -> {out_json_path}")
    if len(out_json["annotations"]) == 0:
        print("  warning: zero annotations after filtering")


def main():
    """Create the output root, then process the train and validation splits."""
    ensure_dir(OUT_ROOT)
    splits = (
        (TRAIN_IM_DIR, TRAIN_ANN_IN, OUT_TRAIN_IM, OUT_TRAIN_JS, TRAIN_TARGET),
        (VAL_IM_DIR, VAL_ANN_IN, OUT_VAL_IM, OUT_VAL_JS, VAL_TARGET),
    )
    for split_args in splits:
        process_split(*split_args)


if __name__ == "__main__":
    main()


In [None]:
# Option to remove a class with 0 objects
import os, json, random, shutil
from pathlib import Path
from collections import defaultdict
import numpy as np
import cv2

from pathlib import Path

# Weather tags recognised as a file-name suffix by weather_from_filename();
# any other suffix is reported as "clear".
WEATHER_LABELS = ("clear", "fog", "rain", "snow")

# remove any category that ends up with 0 annotations after filtering/sampling
# (read by rewrite_json when it selects which category ids to keep)
REMOVE_EMPTY_CLASSES = True


def with_weather_suffix(orig_name: str, tag: str) -> str:
    """Build ``<stem>_<tag>.jpg`` from ``orig_name``.

    Only the stem of ``orig_name`` is kept: any directory part and the
    original extension are dropped, and the result always ends in ``.jpg``.
    Example: ``'000123.jpg'`` with tag ``'fog'`` gives ``'000123_fog.jpg'``.
    """
    stem = Path(orig_name).stem
    return "{}_{}.jpg".format(stem, tag)

def weather_from_filename(file_name: str) -> str:
    """Return the weather tag encoded as the last ``_``-separated stem token.

    The token is lower-cased before lookup; if it is not one of
    ``WEATHER_LABELS`` the function returns ``'clear'``.
    """
    suffix = Path(file_name).stem.rsplit("_", 1)[-1].lower()
    if suffix in WEATHER_LABELS:
        return suffix
    return "clear"


# ========= CONFIG =========
# Input dataset: image folders and their COCO-format annotation files.
COCO_ROOT = "/nas.dbms/asera/PROJECTS/DATASET/ACDC-1"  # adjust
TRAIN_IM_DIR = f"{COCO_ROOT}/train"
VAL_IM_DIR   = f"{COCO_ROOT}/valid"
TRAIN_ANN_IN = f"{COCO_ROOT}/train/_annotations.coco.json"
VAL_ANN_IN   = f"{COCO_ROOT}/valid/_annotations.coco.json"

# Output locations for the selected images and the rewritten annotation JSONs.
OUT_ROOT     = f"{COCO_ROOT}/ACDC-1-NEW"  # will be created
OUT_TRAIN_IM = f"{OUT_ROOT}/images/train"
OUT_VAL_IM   = f"{OUT_ROOT}/images/val"
OUT_TRAIN_JS = f"{OUT_ROOT}/annotations/mini_train.json"
OUT_VAL_JS   = f"{OUT_ROOT}/annotations/mini_val.json"


# pick how many final images per split
TRAIN_TARGET = 3000
VAL_TARGET   = 500


# size selection: "all", "small", "medium", "large"
SIZE_FILTER = "all"   # set to "all" | "small" | "medium" | "large"

# COCO size rules: small < 32^2, medium in [32^2, 96^2), large >= 96^2
# or strict max side rule if USE_COCO_AREA=False
USE_COCO_AREA = True
AREA_SMALL_MAX  = 32 * 32   # area thresholds, used when USE_COCO_AREA is True
AREA_MED_MAX    = 96 * 96
MAX_SIDE_SMALL  = 48        # longest-side thresholds, used when USE_COCO_AREA is False
MAX_SIDE_MEDMAX = 96

# keep only annotations that match SIZE_FILTER within selected images
# (has no effect when SIZE_FILTER is "all")
KEEP_ONLY_MATCHING_ANNS = True

# weather settings
APPLY_WEATHER = False   # master switch. False means no weather, images copied as-is
APPLY_FOG  = False
APPLY_RAIN = False
APPLY_SNOW = False
# 3 returns every enabled effect; any other value picks one enabled effect at random
WEATHER_PER_IMAGE = 3   # 1 means pick one at random, 3 means generate fog+rain+snow

# DAWN-like classes: category names kept by filter_to_targets()
TARGET_CLASS_NAMES = ["person","bicycle","car","motorcycle","bus","truck"]
# TARGET_CLASS_NAMES = ["0","1","2","3","4","5"]  # DAWN-1 classes
# Select all COCO classes without listing them one by one
class _All:
    def __contains__(self, _):
        return True

# TARGET_CLASS_NAMES = _All()  # means all 80 COCO classes

# reproducibility: seed both generators used in this script
# (`random` drives image sampling and weather choice, `np.random` drives
# the rain streak and snow flake placement)
RNG_SEED = 42
random.seed(RNG_SEED)
np.random.seed(RNG_SEED)

# ========= WEATHER OPS =========
def _gaussian_mask(h, w, sigma=0.35):
    yy, xx = np.mgrid[0:h, 0:w]
    yy = (yy - h/2) / (sigma*h)
    xx = (xx - w/2) / (sigma*w)
    m = np.exp(-(xx**2 + yy**2))
    m = (m - m.min()) / (m.max() - m.min() + 1e-8)
    return m[..., None].astype(np.float32)

def add_fog(img, strength=0.55):
    """Blend a brightened, blurred copy of ``img`` into it with a centred mask.

    The blur sigma is 2% of the longer image side (at least 3). The blurred
    image is lifted by ``strength * 255`` and clipped to 0..255, then mixed
    with the original using the weights from ``_gaussian_mask``.
    Returns a uint8 array.
    """
    height, width = img.shape[:2]
    blur_sigma = max(3, int(0.02 * max(height, width)))
    blurred = cv2.GaussianBlur(img, (0, 0), sigmaX=blur_sigma)
    haze = np.clip(blurred + strength * 255, 0, 255).astype(np.uint8)
    weight = _gaussian_mask(height, width, sigma=0.38)
    blended = img * (1 - weight) + haze * weight
    return blended.astype(np.uint8)

def add_rain(img, streaks=1100, alpha=0.20, angle_deg=18, length=(8,18)):
    """Overlay ``streaks`` white anti-aliased line segments on ``img``.

    Each segment starts at a random pixel, has a random length in
    ``[length[0], length[1]]`` and points along ``angle_deg``. The streak
    layer is box-blurred (3x3) and mixed with the input using weight ``alpha``.
    Random numbers come from ``np.random``.
    """
    rows, cols = img.shape[:2]
    theta = np.deg2rad(angle_deg)
    dir_x, dir_y = np.cos(theta), np.sin(theta)
    white = (255, 255, 255)

    layer = img.copy()
    for _ in range(streaks):
        # Draw order (length, x, y) is kept so the random stream is unchanged.
        seg_len = np.random.randint(length[0], length[1] + 1)
        x0 = np.random.randint(0, cols)
        y0 = np.random.randint(0, rows)
        end = (int(x0 + seg_len * dir_x), int(y0 + seg_len * dir_y))
        cv2.line(layer, (x0, y0), end, white, 1, cv2.LINE_AA)

    layer = cv2.blur(layer, (3, 3))
    return cv2.addWeighted(layer, alpha, img, 1 - alpha, 0)

def add_snow(img, flakes=2000, alpha=0.30):
    """Overlay ``flakes`` filled white discs of radius 1 or 2 on ``img``.

    Disc centres are drawn uniformly over the image with ``np.random``. The
    flake layer is Gaussian-blurred (3x3) and mixed with the input using
    weight ``alpha``.
    """
    rows, cols = img.shape[:2]
    white = (255, 255, 255)

    layer = img.copy()
    for _ in range(flakes):
        # Draw order (x, y, radius) is kept so the random stream is unchanged.
        cx = np.random.randint(0, cols)
        cy = np.random.randint(0, rows)
        radius = np.random.randint(1, 3)
        cv2.circle(layer, (cx, cy), radius, white, -1, cv2.LINE_AA)

    layer = cv2.GaussianBlur(layer, (3, 3), 0)
    return cv2.addWeighted(layer, alpha, img, 1 - alpha, 0)

def weather_once(img, which):
    """Apply the effect named ``which`` ('fog', 'rain' or 'snow') to ``img``.

    Any other value of ``which`` returns ``img`` itself, unmodified.
    """
    if which == "fog":
        effect = add_fog
    elif which == "rain":
        effect = add_rain
    elif which == "snow":
        effect = add_snow
    else:
        return img
    return effect(img)

# ========= HELPERS =========
def load_json(p):
    """Read the JSON file at path ``p`` and return the parsed object.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.
    """
    with open(p, "r", encoding="utf-8") as f:
        return json.load(f)

def ensure_dir(p):
    """Create directory ``p`` and any missing parents; no error if it exists."""
    directory = Path(p)
    directory.mkdir(exist_ok=True, parents=True)

def filter_to_targets(coco):
    """Select the categories named in ``TARGET_CLASS_NAMES`` and renumber them.

    Returns ``(keep_cat, oldid2new)``:
      - ``keep_cat``: copies of the selected category dicts, in their original
        order, with ``id`` replaced by 1..K.
      - ``oldid2new``: mapping from each selected category's original id to
        its new id.

    The dicts inside ``coco["categories"]`` are left untouched, so ``coco``
    stays consistent with its own annotations and the function can be called
    again on the same object with the same result.
    """
    selected = [c for c in coco["categories"] if c["name"] in TARGET_CLASS_NAMES]
    oldid2new = {c["id"]: new_id for new_id, c in enumerate(selected, start=1)}
    # Copy each category instead of rewriting the caller's dict in place.
    keep_cat = [{**c, "id": new_id} for new_id, c in enumerate(selected, start=1)]
    return keep_cat, oldid2new

def coco_size_bucket(area):
    """Classify an area as 'small', 'medium' or 'large'.

    'small' is below ``AREA_SMALL_MAX``, 'medium' is below ``AREA_MED_MAX``,
    everything else is 'large'.
    """
    for label, upper in (("small", AREA_SMALL_MAX), ("medium", AREA_MED_MAX)):
        if area < upper:
            return label
    return "large"

def side_size_bucket(w, h):
    """Classify a box by its longer side as 'small', 'medium' or 'large'.

    'small' is below ``MAX_SIDE_SMALL``, 'medium' is below
    ``MAX_SIDE_MEDMAX``, everything else is 'large'.
    """
    longest = max(w, h)
    for label, upper in (("small", MAX_SIDE_SMALL), ("medium", MAX_SIDE_MEDMAX)):
        if longest < upper:
            return label
    return "large"

def ann_size_bucket(ann):
    """Return the size bucket of one annotation dict.

    With ``USE_COCO_AREA`` the bucket is based on ``ann["area"]`` (falling
    back to bbox width * height when the key is absent); otherwise it is
    based on the longer bbox side.
    """
    _, _, box_w, box_h = ann["bbox"]
    area = ann.get("area", box_w * box_h)
    if USE_COCO_AREA:
        return coco_size_bucket(area)
    return side_size_bucket(box_w, box_h)

def matches_size_filter(size_name):
    """Return True when ``SIZE_FILTER`` is "all" or equals ``size_name``."""
    return SIZE_FILTER == "all" or size_name == SIZE_FILTER

def choose_weathers():
    """Return the list of weather effects to apply to one image.

    Returns ``[]`` when ``APPLY_WEATHER`` is off or no individual effect is
    enabled. Otherwise returns every enabled effect (in fog, rain, snow
    order) when ``WEATHER_PER_IMAGE == 3``, or a single enabled effect chosen
    with ``random.choice`` for any other value.
    """
    if not APPLY_WEATHER:
        return []

    switches = (("fog", APPLY_FOG), ("rain", APPLY_RAIN), ("snow", APPLY_SNOW))
    enabled = [name for name, is_on in switches if is_on]
    if not enabled:
        return []

    if WEATHER_PER_IMAGE != 3:
        return [random.choice(enabled)]
    return enabled

def sample_images(kept_images, target_n):
    """Return at most ``target_n`` images from ``kept_images``.

    When the list already has ``target_n`` items or fewer it is returned
    as-is (the same list object); otherwise ``random.sample`` draws
    ``target_n`` items without replacement.
    """
    if len(kept_images) > target_n:
        return random.sample(kept_images, target_n)
    return kept_images

# ========= CORE SELECTION =========
def collect_images_by_size(coco, img_dir):
    """Pick the images of ``coco`` that qualify under the class and size filters.

    An image qualifies when it has at least one annotation of a target
    category, its file exists under ``img_dir`` and, unless ``SIZE_FILTER``
    is "all", at least one of those annotations falls in the requested size
    bucket.

    Returns ``(kept_images, matched_anns_by_img, keep_cats, old2new)`` where
    ``matched_anns_by_img`` maps image id to the annotations to export: all
    target-category annotations, or only the size-matching ones when a size
    filter is active and ``KEEP_ONLY_MATCHING_ANNS`` is set.
    """
    keep_cats, old2new = filter_to_targets(coco)

    # Group target-category annotations by the image they belong to.
    target_anns = defaultdict(list)
    for ann in coco["annotations"]:
        if ann["category_id"] in old2new:
            target_anns[ann["image_id"]].append(ann)

    select_all = SIZE_FILTER == "all"
    kept_images = []
    matched_anns_by_img = defaultdict(list)
    for img in coco["images"]:
        iid = img["id"]
        if iid not in target_anns:
            continue
        if not os.path.isfile(os.path.join(img_dir, img["file_name"])):
            continue

        anns = target_anns[iid]
        size_matched = [a for a in anns if matches_size_filter(ann_size_bucket(a))]

        if select_all:
            # any target class is fine
            chosen = anns
        elif size_matched:
            chosen = size_matched if KEEP_ONLY_MATCHING_ANNS else anns
        else:
            continue

        kept_images.append(img)
        matched_anns_by_img[iid] = chosen

    return kept_images, matched_anns_by_img, keep_cats, old2new

# ========= JSON REWRITE =========
def rewrite_json(coco, kept_images, anns_by_img, keep_cats, old2new, new_filenames):
    """
    Builds a new COCO JSON:
      - remaps categories to 1..K
      - optionally removes categories with 0 instances (REMOVE_EMPTY_CLASSES)
      - drops images with no remaining annotations

    Args:
        coco: source COCO dict; only "info" and "licenses" are copied over.
        kept_images: image dicts to include (each needs "id" and "file_name").
            If an id occurs more than once, only its first entry is used.
        anns_by_img: mapping image id -> list of annotation dicts. Images
            missing from the mapping are treated as having no annotations;
            the mapping itself is never modified.
        keep_cats: target categories whose "id" is the intermediate new id
            (expected to be 1..len(keep_cats)).
        old2new: mapping original category id -> intermediate new id.
        new_filenames: mapping image id -> output file name; images absent
            from it keep their original "file_name".

    Returns:
        dict with "info", "licenses", "images", "annotations", "categories".
        Annotations follow the order of kept_images and are numbered from 1.
    """
    # De-duplicate by id while preserving the caller's order, so the output
    # (and the annotation numbering) does not depend on set iteration order.
    unique_images = {}
    for im in kept_images:
        unique_images.setdefault(im["id"], im)

    # stage 1: collect annotations that belong to a target category, once.
    # .get() avoids a KeyError on plain dicts and avoids inserting empty
    # entries into a caller-owned defaultdict.
    remapped = []  # (image_id, intermediate_category_id, annotation)
    for iid in unique_images:
        for a in anns_by_img.get(iid, ()):
            cid_new = old2new.get(a["category_id"])
            if cid_new is not None:
                remapped.append((iid, cid_new, a))

    # stage 2: select category ids to keep
    if REMOVE_EMPTY_CLASSES:
        # only categories that actually occur among the retained annotations
        kept_new_cat_ids = sorted({cid_new for _, cid_new, _ in remapped})
    else:
        # keep everything that was in keep_cats regardless of count
        kept_new_cat_ids = list(range(1, len(keep_cats) + 1))

    # mapping from intermediate id -> final compact id 1..K
    new_to_final = {cid_new: i for i, cid_new in enumerate(kept_new_cat_ids, start=1)}

    # stage 3: categories list with compact ids
    out_cats = [
        {
            "id": new_to_final[c["id"]],
            "name": c["name"],
            "supercategory": c.get("supercategory", ""),
        }
        for c in keep_cats
        if c["id"] in new_to_final
    ]

    # stage 4: annotations with compact category ids and fresh sequential ids
    out_anns = []
    for iid, cid_new, a in remapped:
        final_cid = new_to_final.get(cid_new)
        if final_cid is None:
            continue  # category is not part of the output
        x, y, w, h = a["bbox"]
        out_anns.append({
            "id": len(out_anns) + 1,
            "image_id": iid,
            "category_id": final_cid,
            "bbox": [float(x), float(y), float(w), float(h)],
            "area": float(a.get("area", w * h)),
            "iscrowd": int(a.get("iscrowd", 0)),
        })

    # stage 5: image entries, only for images that still own an annotation
    annotated_ids = {a["image_id"] for a in out_anns}
    out_imgs = [
        {
            "id": iid,
            "file_name": new_filenames.get(iid, im["file_name"]),
            "width": im.get("width"),
            "height": im.get("height"),
        }
        for iid, im in unique_images.items()
        if iid in annotated_ids
    ]

    # optional: simple console summary
    if REMOVE_EMPTY_CLASSES:
        removed = [c["name"] for c in keep_cats if c["id"] not in new_to_final]
        if removed:
            print(f"  removed empty classes: {removed}")

    return {
        "info": coco.get("info", {"description": "COCO size-filtered mini"}),
        "licenses": coco.get("licenses", []),
        "images": out_imgs,
        "annotations": out_anns,
        "categories": out_cats
    }


# ========= PIPELINE =========
def process_split(img_dir, ann_in, out_img_dir, out_json_path, target_n):
    """Build one output split: select images, write them, write the JSON.

    Steps:
      1. load the annotations from ann_in and select eligible images
      2. keep at most target_n of them (random sample)
      3. copy each image, or render weather variants when APPLY_WEATHER is on
      4. write a rewritten COCO JSON to out_json_path

    Only images that were actually written to out_img_dir are listed in the
    JSON; images that could not be read or written are reported and left out.

    NOTE: when several weather variants are rendered for one image, all of
    them are written to disk but only the first one is referenced by the
    JSON (one file name per image id).

    Args:
        img_dir: directory holding the source images.
        ann_in: path of the source COCO annotation file.
        out_img_dir: directory receiving the output images (created if needed).
        out_json_path: path of the output annotation file.
        target_n: maximum number of images to keep.

    Raises:
        RuntimeError: if no image is eligible for this split.
    """
    print(f"[SPLIT] {img_dir} size={SIZE_FILTER} weather={'on' if APPLY_WEATHER else 'off'}")
    coco = load_json(ann_in)
    kept_images, anns_by_img, keep_cats, old2new = collect_images_by_size(coco, img_dir)
    if not kept_images:
        raise RuntimeError("No images matched the size filter with target classes.")

    print(f"  eligible images: {len(kept_images)}")
    samp = sample_images(kept_images, target_n)
    ensure_dir(out_img_dir)

    new_names = {}  # image id -> file name recorded in the output JSON
    for im in samp:
        in_path = os.path.join(img_dir, im["file_name"])
        base = Path(im["file_name"]).stem

        weathers = []
        img = None
        if APPLY_WEATHER:
            img = cv2.imread(in_path)
            if img is None:
                # unreadable image: nothing is written, so it must not be listed
                print(f"  warning: could not read {in_path}; image skipped")
                continue
            weathers = choose_weathers()

        if not weathers:
            # weather off (or no effect enabled): copy the original bytes.
            # NOTE: the output name always ends in .jpg, whatever the source
            # extension was.
            out_name = f"{base}.jpg"
            shutil.copyfile(in_path, os.path.join(out_img_dir, out_name))
            new_names[im["id"]] = out_name
            continue

        recorded = None  # first variant that was written successfully
        for which in weathers:
            out_name = f"{base}_{which}.jpg"
            out_path = os.path.join(out_img_dir, out_name)
            # cv2.imwrite reports failure through its boolean return value
            if not cv2.imwrite(out_path, weather_once(img, which)):
                print(f"  warning: could not write {out_path}")
                continue
            if recorded is None:
                recorded = out_name
        if recorded is not None:
            new_names[im["id"]] = recorded

    # Restrict the JSON to images that really exist in out_img_dir.
    written = [im for im in samp if im["id"] in new_names]
    out_json = rewrite_json(coco, written, anns_by_img, keep_cats, old2new, new_names)
    ensure_dir(Path(out_json_path).parent)
    with open(out_json_path, "w") as f:
        json.dump(out_json, f)
    print(f"  -> wrote {len(out_json['images'])} images to {out_img_dir}")
    print(f"  -> {out_json_path}")
    if len(out_json["annotations"]) == 0:
        print("  warning: zero annotations after filtering")

def main():
    """Create the output root, then process the train split followed by the val split."""
    ensure_dir(OUT_ROOT)
    splits = (
        (TRAIN_IM_DIR, TRAIN_ANN_IN, OUT_TRAIN_IM, OUT_TRAIN_JS, TRAIN_TARGET),
        (VAL_IM_DIR, VAL_ANN_IN, OUT_VAL_IM, OUT_VAL_JS, VAL_TARGET),
    )
    for split_args in splits:
        process_split(*split_args)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
