In [None]:
import os

# Site folder containing the .qmd files and the assets/ directory.
# The SITE_ROOT_DIR environment variable, when set, overrides the literal below,
# so the notebook can run on another machine without editing this cell.
ROOT_DIR = os.environ.get(
    "SITE_ROOT_DIR",
    r"C:\Users\benny\OneDrive\Documents\Github\site\temp3",
)
ASSETS_DIR = os.path.join(ROOT_DIR, "assets")

DRY_RUN = True   # <-- set to False to actually rename + write QMD updates


In [None]:
import os
import re
import csv
from collections import defaultdict
from urllib.parse import unquote

# Matches markdown links/images: (...assets/anything...)
# One level of balanced parentheses is accepted inside the target, so a link such
# as (../assets/file(1).png) is captured whole instead of stopping at "file(1".
MD_ASSET_LINK_RE = re.compile(
    r'\(((?:[^()\n]|\([^()\n]*\))*assets/(?:[^()\n]|\([^()\n]*\))+)\)'
)
# Matches HTML attributes: src="...assets/..." or href="...assets/..."
HTML_ASSET_ATTR_RE = re.compile(r'''(?:src|href)\s*=\s*["']([^"']*assets/[^"']+)["']''', re.IGNORECASE)

def extract_asset_links(text: str):
    """
    Return the unique asset link strings found in QMD text, in document order.

    Both the markdown pattern and the HTML-attribute pattern are applied; their
    hits are merged by the character offset of the captured link, so an HTML
    <img> that precedes a markdown image is also listed first. Duplicates are
    dropped, keeping the first occurrence.

    Args:
        text: QMD source; None or "" yields an empty list.

    Returns:
        list[str]: captured link strings (query strings included, exactly as
        written in the text).
    """
    text = text or ""

    # (offset of the captured group, link string) for every hit of either pattern
    hits = []
    for pattern in (MD_ASSET_LINK_RE, HTML_ASSET_ATTR_RE):
        hits.extend((m.start(1), m.group(1)) for m in pattern.finditer(text))

    # Order by position in the document rather than by which pattern matched.
    hits.sort(key=lambda hit: hit[0])

    # dict.fromkeys keeps insertion order, which gives first-seen de-duplication.
    return list(dict.fromkeys(link for _offset, link in hits))

def find_file_on_disk(qmd_dir: str, link_path: str):
    """
    Resolve a link written in a QMD file to an existing file on disk.

    Everything from the first "?" onward is ignored. The remaining path is
    joined onto qmd_dir and tried in four spellings, in this order: as written,
    URL-decoded, with "+" turned into a space, and URL-decoded with "+" turned
    into a space.

    Example inputs: "../assets/file(1).png?format=1500w", "assets/x.png".

    Returns:
        The normalised path of the first spelling that is a regular file,
        or None when none of them is.
    """
    bare = link_path.partition("?")[0]
    decoded = unquote(bare)

    # Priority order matters: the literal spelling wins over decoded variants.
    spellings = (
        bare,
        decoded,
        bare.replace("+", " "),
        decoded.replace("+", " "),
    )
    for spelling in spellings:
        resolved = os.path.normpath(os.path.join(qmd_dir, spelling))
        if os.path.isfile(resolved):
            return resolved
    return None

def relink_same_prefix(original_link: str, new_filename: str) -> str:
    """
    Build a link that points at new_filename in the same folder as original_link.

    The query string (everything from the first "?") is discarded, the text
    after the last "/" is swapped for new_filename, and the leading directory
    part (e.g. "../assets") is kept verbatim.
    """
    without_query = original_link.partition("?")[0]
    last_slash = without_query.rfind("/")
    # With no "/" at all the whole string is treated as the directory part.
    directory = without_query if last_slash < 0 else without_query[:last_slash]
    return directory + "/" + new_filename

def safe_new_name(base_stem: str, idx: int, ext: str) -> str:
    """
    Compose "<base_stem>-<idx><ext>" with idx zero-padded to at least two digits.

    An empty (falsy) ext falls back to ".jpg".
    """
    suffix = ext or ".jpg"
    return "{}-{:02d}{}".format(base_stem, idx, suffix)

def _path_key(path: str) -> str:
    """Comparison key for a path: absolute, normalised, case-folded where the OS does so."""
    return os.path.normcase(os.path.abspath(path))


def _is_inside(path: str, folder: str) -> bool:
    """True when path is folder itself or lies below it, comparing whole path components."""
    path_key = _path_key(path)
    folder_key = _path_key(folder).rstrip(os.sep)
    # Appending the separator keeps "assets_old" from counting as inside "assets".
    return path_key == folder_key or path_key.startswith(folder_key + os.sep)


def _collect_qmd_files(root_dir: str, assets_dir: str):
    """Return the sorted paths of every *.qmd below root_dir, skipping the assets tree."""
    found = []
    for folder, _subdirs, files in os.walk(root_dir):
        if _is_inside(folder, assets_dir):
            continue
        found.extend(os.path.join(folder, fn) for fn in files if fn.endswith(".qmd"))
    found.sort()  # deterministic order
    return found


def _plan_new_names(refs):
    """
    Choose a new file name for every referenced asset.

    refs maps asset path -> list of (qmd_path, link, position). The canonical
    QMD is the smallest (qmd_path, position) occurrence; the name is
    "<qmd stem>-<position><ext>". The index is bumped while the target already
    exists on disk or has been handed to another asset earlier in this plan.
    The asset stays in its own directory so that links, which keep their
    original directory part, continue to resolve.
    """
    plan = {}
    reserved = set()  # keys of targets already assigned during this run
    for abs_asset_path, occurrences in refs.items():
        canonical_qmd, first_link, pos = min(occurrences, key=lambda o: (o[0], o[2]))
        stem = os.path.splitext(os.path.basename(canonical_qmd))[0]
        ext = (
            os.path.splitext(abs_asset_path)[1]
            or os.path.splitext(first_link.split("?")[0])[1]
            or ".jpg"
        )
        target_dir = os.path.dirname(abs_asset_path)
        own_key = _path_key(abs_asset_path)

        idx = pos
        while True:
            new_name = safe_new_name(stem, idx, ext)
            new_abs = os.path.join(target_dir, new_name)
            new_key = _path_key(new_abs)
            if new_key == own_key:
                break  # the asset already carries this name
            if new_key not in reserved and not os.path.exists(new_abs):
                break  # free on disk and not promised to another asset
            idx += 1
        reserved.add(new_key)

        plan[abs_asset_path] = {
            "canonical_qmd": canonical_qmd,
            "ref_count": len(occurrences),
            "qmd_count": len({o[0] for o in occurrences}),
            "new_name": new_name,
            "new_abs": new_abs,
        }
    return plan


def _rewrite_links(text: str, link_map):
    """
    Replace, in one pass, every captured link that is a key of link_map.

    Only the exact spans captured by the two link patterns are touched, so a
    link that happens to be a substring of another link is never rewritten by
    accident. Returns (new_text, number_of_spans_replaced).
    """
    spans = []
    for pattern in (MD_ASSET_LINK_RE, HTML_ASSET_ATTR_RE):
        for m in pattern.finditer(text):
            if m.group(1) in link_map:
                spans.append((m.start(1), m.end(1), link_map[m.group(1)]))
    spans.sort()

    pieces = []
    cursor = 0
    replaced = 0
    for start, end, new_link in spans:
        if start < cursor:
            continue  # overlaps a span that was already rewritten
        pieces.append(text[cursor:start])
        pieces.append(new_link)
        cursor = end
        replaced += 1
    pieces.append(text[cursor:])
    return "".join(pieces), replaced


def rename_assets_rename_only(root_dir: str, assets_dir: str, dry_run: bool = True):
    """
    Rename referenced asset files after the QMD page that uses them and update the links.

    Steps:
      1. Read every *.qmd under root_dir (the assets tree is skipped) and
         resolve each asset link to a file on disk.
      2. Plan a name "<qmd stem>-<NN><ext>" per asset, based on the first QMD
         (sorted by path) that references it and the link's position there.
      3. Rewrite the links of renamed assets in the in-memory QMD text.
      4. When dry_run is False: rename the files, write the changed QMDs and
         write "_rename_map.csv" / "_shared_assets.csv" into assets_dir.

    Args:
        root_dir: site folder that is walked for *.qmd files.
        assets_dir: assets folder; must exist. It is excluded from the walk and
            receives the CSV logs.
        dry_run: when True nothing on disk is modified; only the summary is printed.

    Returns:
        dict with the keys "qmd_files", "unique_assets_referenced",
        "shared_assets", "planned_renames", "qmd_files_to_update",
        "total_replacements".

    Raises:
        FileNotFoundError: assets_dir is not a directory.
        FileExistsError: a planned target appeared on disk before the renames
            were applied (checked before any file is touched).
    """
    if not os.path.isdir(assets_dir):
        raise FileNotFoundError(f"Assets dir not found: {assets_dir}")

    qmd_files = _collect_qmd_files(root_dir, assets_dir)
    print(f"Found {len(qmd_files)} QMD files")

    # 1) Read each QMD once and build:
    #    refs:         asset path -> list of (qmd_path, original_link, position)
    #    link_targets: qmd_path   -> {original_link: asset path}
    refs = defaultdict(list)
    link_targets = {}
    qmd_text = {}
    spelling = {}  # path key -> first spelling seen, so one file is one entry

    for qmd_path in qmd_files:
        qmd_dir = os.path.dirname(qmd_path)
        # newline="" keeps the file's own line endings so a later write does
        # not convert CRLF files to LF.
        with open(qmd_path, "r", encoding="utf-8", newline="") as f:
            text = f.read()
        qmd_text[qmd_path] = text

        resolved = {}
        for pos, link in enumerate(extract_asset_links(text), start=1):
            abs_path = find_file_on_disk(qmd_dir, link)
            if not abs_path:
                continue
            abs_path = spelling.setdefault(_path_key(abs_path), abs_path)
            refs[abs_path].append((qmd_path, link, pos))
            resolved[link] = abs_path
        link_targets[qmd_path] = resolved

    print(f"Referenced unique asset files: {len(refs)}")

    # 2) Decide the canonical name per asset (rename-only).
    asset_plan = _plan_new_names(refs)

    # "Shared" means referenced from more than one QMD file, not merely through
    # more than one link spelling inside the same file.
    shared_assets = [p for p, info in asset_plan.items() if info["qmd_count"] > 1]
    print(f"Shared assets (used by >1 QMD): {len(shared_assets)}")
    print(f"DRY_RUN={dry_run}")

    # 3) Renames to apply (each asset once); assets already named correctly are skipped.
    rename_map = {
        old_abs: info["new_abs"]
        for old_abs, info in asset_plan.items()
        if _path_key(old_abs) != _path_key(info["new_abs"])
    }

    # 4) Update QMD contents. The mapping is per QMD and per original link
    #    string, so URL-encoded links and equal basenames in different folders
    #    are each sent to the right new name.
    updated_text = {}
    total_replacements = 0
    for qmd_path in qmd_files:
        link_map = {
            link: relink_same_prefix(link, os.path.basename(rename_map[target]))
            for link, target in link_targets[qmd_path].items()
            if target in rename_map
        }
        if not link_map:
            continue
        original = qmd_text[qmd_path]
        new_text, replaced = _rewrite_links(original, link_map)
        if replaced and new_text != original:
            updated_text[qmd_path] = new_text
            total_replacements += replaced
    updated_files = len(updated_text)

    # 5) Build log rows
    rename_csv = os.path.join(assets_dir, "_rename_map.csv")
    shared_csv = os.path.join(assets_dir, "_shared_assets.csv")

    rows = []
    shared_rows = []
    for old_abs, info in asset_plan.items():
        row = [
            os.path.basename(old_abs),
            os.path.basename(info["new_abs"]),
            os.path.relpath(info["canonical_qmd"], root_dir).replace("\\", "/"),
            info["ref_count"],
        ]
        rows.append(row)
        if info["qmd_count"] > 1:
            shared_rows.append(row)

    if not dry_run:
        # Verify every target is still free before touching anything, so a
        # conflict cannot leave the tree half-renamed.
        for new_abs in rename_map.values():
            if os.path.exists(new_abs):
                raise FileExistsError(f"Refusing to overwrite existing file: {new_abs}")

        # apply filesystem renames
        for old_abs, new_abs in rename_map.items():
            os.rename(old_abs, new_abs)

        # write only the QMDs whose text changed, with line endings untouched
        for qmd_path, text in updated_text.items():
            with open(qmd_path, "w", encoding="utf-8", newline="") as f:
                f.write(text)

        # write csv logs
        header = ["old_filename", "new_filename", "canonical_qmd", "ref_count"]
        for csv_path, csv_rows in ((rename_csv, rows), (shared_csv, shared_rows)):
            with open(csv_path, "w", newline="", encoding="utf-8") as f:
                w = csv.writer(f)
                w.writerow(header)
                w.writerows(csv_rows)

    print("\n--- SUMMARY ---")
    print(f"Planned renames: {len(rename_map)}")
    print(f"QMD files to update: {updated_files}")
    print(f"Total link replacements: {total_replacements}")
    print(f"Rename log (will be written when DRY_RUN=False): {rename_csv}")
    print(f"Shared assets report (will be written when DRY_RUN=False): {shared_csv}")

    # return some quick stats for notebook inspection
    return {
        "qmd_files": len(qmd_files),
        "unique_assets_referenced": len(refs),
        "shared_assets": len(shared_assets),
        "planned_renames": len(rename_map),
        "qmd_files_to_update": updated_files,
        "total_replacements": total_replacements,
    }


In [None]:
# Use the DRY_RUN flag from the configuration cell rather than overriding it
# here: with the default (True) a fresh "Run All" only prints the plan, and the
# files are renamed only after that flag has been set to False on purpose.
stats = rename_assets_rename_only(ROOT_DIR, ASSETS_DIR, dry_run=DRY_RUN)
stats
