# In [None]:
import random, re, shutil
import csv
import random, re, shutil
from pathlib import Path

# For each candidate threshold, randomly sample detector result lines and copy the referenced images into anomaly/normal folders, so an appropriate detector threshold can be evaluated and picked.

# In [None]:
# --- Sweep-wide settings: identical for every threshold, so defined once ---
N     = 500   # how many lines to sample from the merged list
SEED  = 42    # random seed for reproducibility
FILTER_SUBSTR = None  # e.g., "rank 0" or "anomaly"; use None to skip filtering

# Example line:
# Rank 0 · line 4, /path/to/file.png: The image is normal
# Regex to pull the PNG path and (optionally) the label
PNG_AND_LABEL = re.compile(r',\s*(/[^:]+?\.png):\s*The image is\s+(normal|anomaly)\s*$', re.IGNORECASE)
PNG_IN_LINE   = re.compile(r',\s*(/[^:]+?\.png):')

# Single source of truth for the experiment directory; every per-threshold
# path below is derived from it so the locations cannot drift apart.
RESULTS_BASE = Path("/scratch/ahmad9/caserm/detect_mse_and_density/model1600/512.256.4")

for threshold in [-346, -200, -100, 10, 100, 250, 350, 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 5000, 8769]:
    # Per-threshold layout:
    #   results_thresh_<t>/                      -> detector .txt reports (TXT_DIR)
    #   results_thresh_<t>/evaluate/             -> everything this script writes (ROOT)
    # The *_DIR names stay plain strings, as before.
    TXT_DIR = str(RESULTS_BASE / f"results_thresh_{threshold}")  # directory containing many .txt files
    ROOT = Path(TXT_DIR) / "evaluate"
    OUT_DIR_ALL = str(ROOT / "randomly_selected_all")
    OUT_DIR_ANOMALY = str(ROOT / "randomly_selected_anomaly")
    OUT_DIR_NORMAL  = str(ROOT / "randomly_selected_normal")
    csv_path = ROOT / "sampled_lines.csv"
    # Make sure the evaluate/ directory exists before anything is written to it.
    csv_path.parent.mkdir(parents=True, exist_ok=True)

    def unique_dest(outdir: Path, src: Path) -> Path:
        """Pick a path inside ``outdir`` for ``src`` that does not exist yet.

        The plain file name of ``src`` is preferred. If it is already taken,
        ``_1``, ``_2``, ... is appended to the stem until a free name is found.
        """
        candidate = outdir / src.name
        counter = 0
        while candidate.exists():
            counter += 1
            candidate = outdir / f"{src.stem}_{counter}{src.suffix}"
        return candidate

    # 1) Read every .txt report in TXT_DIR and pool their lines
    txt_dir = Path(TXT_DIR)
    all_txts = sorted(txt_dir.glob("*.txt"))
    merged_lines = []
    for report in all_txts:
        try:
            report_lines = report.read_text(encoding="utf-8", errors="ignore").splitlines()
        except Exception as err:
            # Best effort: an unreadable report is announced and left out.
            print(f"Skipping {report}: {err}")
        else:
            merged_lines.extend(report_lines)

    # Optional substring filter (disabled when FILTER_SUBSTR is falsy)
    if FILTER_SUBSTR:
        merged_lines = [line for line in merged_lines if FILTER_SUBSTR in line]

    # Only lines that mention a PNG path are eligible for sampling
    candidate_lines = list(filter(PNG_IN_LINE.search, merged_lines))
    if not candidate_lines:
        raise RuntimeError("No candidate lines with .png paths were found.")

    # 2) Seeded sampling, capped at the number of eligible lines
    n = min(N, len(candidate_lines))
    rng = random.Random(SEED)
    sampled_lines = rng.sample(candidate_lines, n)

    # Make sure the three destination folders exist
    out_anom, out_norm, out_all = Path(OUT_DIR_ANOMALY), Path(OUT_DIR_NORMAL), Path(OUT_DIR_ALL)
    for out_dir in (out_anom, out_norm, out_all):
        out_dir.mkdir(parents=True, exist_ok=True)

    # 3) Copy each sampled image into ALL and, when its label is known,
    #    into the matching per-label folder as well
    copied_all, copied_anom, copied_norm, missing = [], [], [], []
    # label -> (destination folder, list recording the (src, dst) pairs)
    label_targets = {
        "anomaly": (out_anom, copied_anom),
        "normal": (out_norm, copied_norm),
    }
    for ln in sampled_lines:
        # Preferred: path and label in one match; otherwise take the path
        # alone and guess the label from how the line ends.
        full = PNG_AND_LABEL.search(ln)
        if full:
            png_path = Path(full.group(1))
            label = full.group(2).lower().strip()
        else:
            partial = PNG_IN_LINE.search(ln)
            if not partial:
                continue
            png_path = Path(partial.group(1))
            tail = ln.strip().lower()
            # "" means unknown label: the image still goes to ALL
            label = next((name for name in ("anomaly", "normal") if tail.endswith(name)), "")

        if not png_path.exists():
            missing.append(str(png_path))
            continue

        # Copy to ALL
        dst_all = unique_dest(out_all, png_path)
        shutil.copy2(png_path, dst_all)
        copied_all.append((png_path, dst_all))

        # Copy by label
        if label in label_targets:
            folder, record = label_targets[label]
            dst = unique_dest(folder, png_path)
            shutil.copy2(png_path, dst)
            record.append((png_path, dst))

    # Manifests: one "src,dst" row per copied image. Written with the csv
    # module so that a path containing a comma or a quote is quoted properly
    # instead of silently producing extra columns. "\n" line endings are kept
    # to match the previous hand-written format.
    for manifest_path, pairs in (
        (out_anom / "copied_manifest_anomaly.csv", copied_anom),
        (out_norm / "copied_manifest_normal.csv", copied_norm),
    ):
        with manifest_path.open("w", newline="", encoding="utf-8") as mf:
            manifest = csv.writer(mf, lineterminator="\n")
            manifest.writerow(["src", "dst"])
            manifest.writerows(pairs)
    # Save the exact sampled lines for traceability
    sampled_text = "\n".join(sampled_lines)
    (ROOT / "sampled_lines.txt").write_text(sampled_text, encoding="utf-8")


    # Build one (full_line, png_path, label, png_name) record per sampled line
    rows = []
    for ln in sampled_lines:
        hit = PNG_AND_LABEL.search(ln)
        if hit:
            png, label = hit.group(1), hit.group(2).lower().strip()
        else:
            # No explicit "The image is ..." suffix: take the path alone and
            # infer the label from the end of the line ("" when unknown).
            path_only = PNG_IN_LINE.search(ln)
            png = path_only.group(1) if path_only else ""
            tail = ln.strip().lower()
            if tail.endswith("anomaly"):
                label = "anomaly"
            elif tail.endswith("normal"):
                label = "normal"
            else:
                label = ""

        png_name = Path(png).name if png else ""
        rows.append((ln, png, label, png_name))

    # (Optional) natural sort so numbers in filenames sort as 1,2,10 not 1,10,2
    def _nkey(s):  # natural sort key
        parts = re.split(r"(\d+)", s or "")
        return [int(p) if p.isdigit() else p.lower() for p in parts]

    # Order the records by image file name (natural order) before writing
    rows_sorted = sorted(rows, key=lambda row: _nkey(row[3]))

    header = ["full_line", "png_path", "label", "png_name"]
    with csv_path.open("w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows([header, *rows_sorted])

    print(f"Wrote sampled-lines CSV → {csv_path} (sorted by png_name)")

    # Record the sampled paths that were not found on disk.
    # NOTE(review): the list covers every label but is stored in the anomaly
    # folder - confirm this location is intended.
    if missing:
        missing_report = out_anom / "missing.txt"
        missing_report.write_text("\n".join(missing), encoding="utf-8")

    # Console summary for this threshold
    merge_summary = (
        f"Merged {len(all_txts)} .txt files; {len(merged_lines)} lines total "
        f"({len(candidate_lines)} with PNGs)."
    )
    copy_summary = (
        f"Sampled {n} lines → copied {len(copied_anom)} anomalies, {len(copied_norm)} normals; "
        f"missing {len(missing)}."
    )
    print(merge_summary)
    print(copy_summary)
    print(f"Anomaly out dir: {out_anom}")
    print(f"Normal  out dir: {out_norm}")
