In [2]:
import os
import re
import requests
import mimetypes
import time
from urllib.parse import urlparse, unquote
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# --- CONFIGURATION ---
# Root directory that download_assets() walks recursively looking for .qmd files.
SEARCH_DIR = '.'
# Folder name joined onto the current working directory to hold downloaded files;
# directories whose path contains this name are skipped during the walk.
ASSETS_DIR_NAME = 'assets'
# Seconds slept after each successful download.
DELAY_SECONDS = 1.0  # Wait time between downloads to prevent crashing

# Default request headers installed on the session built by create_session().
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Regex for Squarespace + Custom Domain.
# The "any *squarespace.com host" branch only accepts characters that can sit in
# the host part of a URL (no whitespace, quotes, ')' or '/'). A plain `.*` there
# would let a match start at an unrelated https:// link earlier on the line and
# swallow everything up to the next squarespace.com/static occurrence.
URL_PATTERN = re.compile(
    r'(https?://(?:static1\.squarespace\.com|images\.squarespace-cdn\.com|benny\.istan\.to/s/|[^\s"\'\)/]*squarespace\.com/static)[^\s"\'\)]+)',
    re.IGNORECASE
)

def create_session():
    """
    Build the shared HTTP session.

    The session sends HEADERS on every request and routes both https:// and
    http:// traffic through one adapter that retries HEAD/GET/OPTIONS up to
    5 times (backoff factor 1) on 429 and 5xx gateway-style statuses.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],
        )
    )

    browser_session = requests.Session()
    browser_session.headers.update(HEADERS)
    for scheme_prefix in ("https://", "http://"):
        browser_session.mount(scheme_prefix, retrying_adapter)
    return browser_session

def fix_extension(ext):
    """Return '.jpg' for a missing extension or a JPEG alias; otherwise return ``ext`` unchanged."""
    if ext:
        is_jpeg_alias = ext.lower() in ('.jpe', '.jpeg')
        return '.jpg' if is_jpeg_alias else ext
    return '.jpg'

def smart_sort_key(filepath):
    """
    Chronological sort key taken from a file name.

    Returns the eight leading digits (YYYYMMDD) of the base name as an int.
    Names that do not start with eight digits (e.g. index.qmd) get the
    sentinel 99999999 so they sort last; the caller adds the path itself as
    a tie-breaker.
    """
    leading_date = re.match(r'^(\d{8})', os.path.basename(filepath))
    return int(leading_date.group(1)) if leading_date else 99999999

def download_assets():
    """
    Mirror remote assets referenced by .qmd files and relink them locally.

    Walks SEARCH_DIR for .qmd files, processes them in smart_sort_key order
    (path as tie-breaker), downloads every URL matched by URL_PATTERN into
    <cwd>/ASSETS_DIR_NAME and rewrites each file in place so the URLs become
    paths relative to that file.

    URLs that differ only in their query string share one download: the cache
    ``url_map`` is keyed on the URL with everything from the first '?' removed.
    Failed downloads are reported on stdout and their links are left untouched.
    Returns None; all results are side effects (files written, stdout).
    """
    print(f"--- STARTING SMART SCAN in: {os.getcwd()} ---")

    session = create_session()
    assets_path = os.path.join(os.getcwd(), ASSETS_DIR_NAME)
    os.makedirs(assets_path, exist_ok=True)

    # 1. Gather ALL files
    all_qmd_files = []
    print("Gathering file list...")

    for root, dirs, files in os.walk(SEARCH_DIR):
        # Skip any directory whose path mentions the assets folder name.
        if ASSETS_DIR_NAME in root: continue
        for file in files:
            if file.endswith('.qmd'):
                all_qmd_files.append(os.path.join(root, file))

    # 2. Sort them (dated files first, undated last, path as tie-breaker)
    all_qmd_files.sort(key=lambda x: (smart_sort_key(x), x))

    total_files = len(all_qmd_files)
    print(f"Found {total_files} files. Processing chronologically...")

    # 3. Process
    url_map = {}  # query-less URL -> local file name inside assets_path

    for idx, file_path in enumerate(all_qmd_files, 1):
        filename = os.path.basename(file_path)

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        urls = set(URL_PATTERN.findall(content))

        if not urls: continue

        print(f"\n[{idx}/{total_files}] Processing {filename} ({len(urls)} links)")

        new_content = content
        files_changed = False

        qmd_dir = os.path.dirname(file_path)
        rel_dir = os.path.relpath(assets_path, qmd_dir)

        # Longest URL first: when one URL is a prefix of another (same image
        # with and without "?format=..."), replacing the short one first would
        # leave the long one half-rewritten with a dangling query string.
        for url in sorted(urls, key=lambda u: (-len(u), u)):
            clean_key = url.split('?')[0]

            if clean_key in url_map:
                local_filename = url_map[clean_key]
            else:
                # --- DOWNLOAD ---
                # Last non-empty path segment, without its extension; an empty
                # path falls back to "asset" instead of raising IndexError.
                segments = [s for s in unquote(urlparse(url).path).split('/') if s]
                base_name = os.path.splitext(segments[-1])[0] if segments else ""
                if len(base_name) < 2: base_name = "asset"

                print(f"  -> Downloading: {base_name}...", end=" ", flush=True)

                response = None
                partial_path = None  # set only while a file is being written
                try:
                    response = session.get(url, stream=True, allow_redirects=True, timeout=30)

                    if response.status_code != 200:
                        print(f"[FAILED] HTTP {response.status_code}")
                        continue

                    # Extension: Content-Disposition file name first, then the
                    # MIME type. The capture stops at a quote or ';' so the
                    # closing quote / following parameters never leak into it.
                    cd = response.headers.get("Content-Disposition", "")
                    fname_match = re.search(r'filename="?([^";]+)', cd)
                    ext = os.path.splitext(fname_match.group(1).strip())[1] if fname_match else ""

                    if not ext:
                        ctype = response.headers.get('content-type', '').split(';')[0].strip()
                        ext = mimetypes.guess_extension(ctype)

                    ext = fix_extension(ext)

                    # Never overwrite an existing file: append _1, _2, ...
                    final_name = f"{base_name}{ext}"
                    save_path = os.path.join(assets_path, final_name)
                    uniq_counter = 1
                    while os.path.exists(save_path):
                        final_name = f"{base_name}_{uniq_counter}{ext}"
                        save_path = os.path.join(assets_path, final_name)
                        uniq_counter += 1

                    partial_path = save_path
                    with open(save_path, 'wb') as f_out:
                        for chunk in response.iter_content(chunk_size=8192):
                            f_out.write(chunk)
                    partial_path = None

                except Exception as e:
                    print(f"[ERROR] {e}")
                    continue

                finally:
                    # A streamed response keeps its connection checked out
                    # until it is fully read or closed.
                    if response is not None:
                        response.close()
                    # Do not leave a truncated file that a later run would
                    # treat as an occupied name.
                    if partial_path is not None and os.path.exists(partial_path):
                        try:
                            os.remove(partial_path)
                        except OSError:
                            pass  # best-effort cleanup only

                url_map[clean_key] = final_name
                local_filename = final_name
                print(f"[OK]")
                time.sleep(DELAY_SECONDS)

            # --- REPLACE LINKS ---
            local_rel_path = os.path.join(rel_dir, local_filename).replace("\\", "/")
            new_content = new_content.replace(url, local_rel_path)
            files_changed = True

        if files_changed:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(new_content)
            print(f"  -> Links updated.")

# Entry point: run the scan/download/relink pass when executed as the main module.
if __name__ == "__main__":
    download_assets()

--- STARTING SMART SCAN in: C:\Users\benny\OneDrive\Documents\Github\site ---
Gathering file list...
Found 138 files. Processing chronologically...

[29/138] Processing 20101215-using-google-to-disseminate-information-during-2010-tsunami-in-mentawai-islands.qmd (12 links)
  -> Downloading: Screen+Shot+2020-12-09+at+9.55.02+PM... [OK]
  -> Downloading: Screen+Shot+2020-12-09+at+9.52.33+PM... [OK]
  -> Downloading: Screen+Shot+2020-12-09+at+9.52.48+PM... [OK]
  -> Downloading: Screen+Shot+2020-12-09+at+9.53.09+PM... [OK]
  -> Downloading: Screen+Shot+2020-12-09+at+9.53.17+PM... [OK]
  -> Downloading: Screen+Shot+2020-12-09+at+9.51.51+PM... [OK]
  -> Downloading: Screen+Shot+2020-12-09+at+9.52.10+PM... [OK]
  -> Downloading: Screen+Shot+2020-12-09+at+9.53.31+PM... [OK]
  -> Downloading: Screen+Shot+2020-12-09+at+9.53.00+PM... [OK]
  -> Downloading: Screen+Shot+2020-12-09+at+9.52.22+PM... [OK]
  -> Downloading: Screen+Shot+2020-12-09+at+9.54.51+PM... [OK]
  -> Downloading: Screen+Shot+2020

In [2]:
import os
import re
import csv
import time
import html
import mimetypes
import requests
from urllib.parse import urlparse, unquote, parse_qs
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import xml.etree.ElementTree as ET

# ---------------- CONFIG ----------------
# Directory walked recursively for .qmd files (see collect_qmd_files).
SEARCH_DIR = "../temp3"
# Folder name joined onto the current working directory for downloads; also
# used as a substring filter on walked directory paths.
ASSETS_DIR_NAME = "assets"
# Seconds slept after each successful download.
DELAY_SECONDS = 1.0

# If you have root-relative /s/... links, we need a base domain:
BASE_SITE_URL = "https://benny.istan.to"  # change if needed

# Optional: parse WP XML export too (captures wp:attachment_url etc.)
# A path that does not exist is skipped without error by collect_urls_from_xml.
XML_PATH = "/mnt/data/Squarespace-Wordpress-Export-01-31-2026.xml"  # set to None to disable

# Default request headers installed on the session built by create_session().
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
}

# Target hosts (extend if you have others)
# Compared against the lower-cased netloc of each URL, so entries must be lower case.
TARGET_HOSTS = {
    "images.squarespace-cdn.com",
    "static1.squarespace.com",
    "benny.istan.to",
    "bennyistanto.squarespace.com",
}

# ---------------- URL EXTRACTION ----------------
# Remove fenced code blocks to avoid downloading URLs in code examples
# (DOTALL lets one match span the lines between the opening and closing fence).
FENCED_CODE_RE = re.compile(r"```.*?```", re.DOTALL)

# Markdown image/link targets: ![alt](URL)  or [text](URL)
# Group 1 is filled for the image form, group 2 for the link form; the capture
# is everything between the parentheses.
MD_LINK_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)|\[[^\]]*\]\(([^)]+)\)")

# HTML attributes likely to contain assets
# Captures the quoted attribute value (single or double quotes).
HTML_ATTR_RE = re.compile(
    r"""(?:src|href|data-src|data-image|poster)\s*=\s*["']([^"']+)["']""",
    re.IGNORECASE
)

# srcset="url 500w, url 1000w"
# Captures the whole attribute value; the caller splits it on commas.
SRCSET_RE = re.compile(r"""srcset\s*=\s*["']([^"']+)["']""", re.IGNORECASE)

# CSS url(...)
# Captures the target with optional surrounding quotes removed.
CSS_URL_RE = re.compile(r"""url\(\s*['"]?([^'")]+)['"]?\s*\)""", re.IGNORECASE)

# Raw URLs fallback
# Any http(s) URL up to whitespace, a quote, an angle bracket or a parenthesis.
RAW_URL_RE = re.compile(r"""https?://[^\s"'<>()]+""", re.IGNORECASE)


def create_session():
    """
    Build the shared HTTP session.

    Sends HEADERS with every request and mounts one retrying adapter for both
    https:// and http://: up to 6 retries (backoff factor 1) of HEAD/GET/OPTIONS
    on 429, 500, 502, 503 and 504.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=6,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],
        )
    )

    browser_session = requests.Session()
    browser_session.headers.update(HEADERS)
    for scheme_prefix in ("https://", "http://"):
        browser_session.mount(scheme_prefix, retrying_adapter)
    return browser_session


def strip_trailing_punct(u: str) -> str:
    """Remove any run of closing punctuation, quote or bracket characters from the end of ``u``."""
    trailing_chars = ').,;:\'"!?]>'
    return u.rstrip(trailing_chars)

def normalize_url(u: str) -> str:
    """
    Turn a link as written in the document into an absolute URL.

    Steps, in order: trim whitespace and surrounding angle brackets, decode
    HTML entities, drop trailing punctuation, then
    - "//host/..."  becomes "https://host/..."
    - "/s/..."      is prefixed with BASE_SITE_URL (its trailing "/" removed).
    Anything else is returned as cleaned. ``None`` is treated as "".
    """
    cleaned = strip_trailing_punct(html.unescape((u or "").strip().strip("<>")))

    if cleaned.startswith("//"):
        cleaned = "https:" + cleaned
    elif cleaned.startswith("/s/"):
        cleaned = BASE_SITE_URL.rstrip("/") + cleaned

    return cleaned


def is_target_asset_url(u: str) -> bool:
    """True when ``u`` has a network location and, lower-cased, it is listed in TARGET_HOSTS."""
    try:
        netloc = urlparse(u).netloc
    except Exception:
        # Unparseable input is simply not a target.
        return False
    return bool(netloc) and netloc.lower() in TARGET_HOSTS


def asset_key(u: str) -> str:
    """
    De-duplication key for a URL: lower-cased host followed by the path.

    Scheme, query string and fragment do not take part in the key.
    """
    parts = urlparse(u)
    return "{}{}".format(parts.netloc.lower(), parts.path)


def best_quality_score(u: str, width_hint: int = 0) -> int:
    """
    Rank a candidate URL; higher means preferred for its asset key.

    Starts from ``width_hint`` (0 when falsy). A ``format`` query value holding
    "<3-5 digits>w" raises the score to that width, and a value of "original"
    or "raw" raises it to 99999. Any error while parsing leaves the score
    as computed so far.
    """
    score = width_hint or 0
    try:
        format_values = parse_qs(urlparse(u).query).get("format", [""])
        fmt = (format_values[0] or "").lower()

        width_match = re.search(r"(\d{3,5})w", fmt)
        if width_match:
            score = max(score, int(width_match.group(1)))

        if fmt in ("original", "raw"):
            score = max(score, 99999)
    except Exception:
        pass
    return score


def extract_urls_from_text(text: str):
    """
    Find asset links in ``text`` (fenced code blocks are ignored).

    Returns a list of ``(raw, normalized, width_hint)`` tuples in discovery
    order: markdown links/images, HTML attributes, srcset entries, CSS url(),
    then bare URLs. ``raw`` is the string as found (after trimming),
    ``normalized`` is normalize_url(raw), and ``width_hint`` is the "NNNw"
    descriptor of a srcset entry (0 everywhere else). Only entries whose
    normalized URL passes is_target_asset_url are kept; the same URL may
    appear more than once.
    """
    body = FENCED_CODE_RE.sub("", text or "")
    candidates = []

    def remember(raw_value, width=0):
        candidates.append((raw_value, normalize_url(raw_value), width))

    # Markdown links/images: each match is (image_target, link_target), one of them empty.
    for groups in MD_LINK_RE.findall(body):
        for target in groups:
            if target:
                remember(target.strip().strip('"').strip("'"))

    # HTML attributes
    for attr_value in HTML_ATTR_RE.findall(body):
        remember(attr_value.strip())

    # srcset: comma-separated "URL 500w" / "URL 2x" entries
    for srcset_value in SRCSET_RE.findall(body):
        for entry in srcset_value.split(","):
            entry = entry.strip()
            if not entry:
                continue
            pieces = entry.split()
            width = 0
            if len(pieces) > 1:
                descriptor = re.match(r"(\d{2,5})w", pieces[1].lower())
                if descriptor:
                    width = int(descriptor.group(1))
            remember(pieces[0].strip(), width)

    # CSS url(...)
    for css_target in CSS_URL_RE.findall(body):
        remember(css_target.strip())

    # Raw URL fallback
    for bare_url in RAW_URL_RE.findall(body):
        remember(bare_url.strip())

    # Filter targets
    return [
        (raw, norm, width)
        for raw, norm, width in candidates
        if norm and is_target_asset_url(norm)
    ]


# ---------------- DOWNLOAD ----------------
# 13 digits, a hyphen, then 8 or more upper-case letters/digits; choose_base_name
# uses a match in the URL path as the local file stem.
# NOTE(review): the 13-digit part looks like a millisecond timestamp — confirm.
ASSET_ID_RE = re.compile(r"(\d{13}-[A-Z0-9]{8,})")

def safe_filename(name: str) -> str:
    """
    Reduce ``name`` to characters that are safe in a file name.

    Percent-escapes are decoded, spaces become underscores, everything outside
    ``A-Za-z0-9._-`` is removed, runs of underscores collapse to one, and
    leading/trailing ``.``, ``_`` and ``-`` are trimmed. Falls back to
    "asset" when nothing is left.
    """
    decoded = unquote(name).replace(" ", "_")
    allowed_only = re.sub(r"[^A-Za-z0-9._-]+", "", decoded)
    tidy = re.sub(r"_{2,}", "_", allowed_only).strip("._-")
    return tidy if tidy else "asset"

def fix_extension(ext: str) -> str:
    """Lower-case ``ext``; map a missing extension and the JPEG aliases .jpe/.jpeg to ".jpg"."""
    lowered = (ext or "").lower()
    if lowered == "":
        return ".jpg"
    return ".jpg" if lowered in (".jpe", ".jpeg") else lowered

def choose_base_name(final_url: str) -> str:
    """
    Pick the local file stem for a downloaded URL.

    If the URL path contains an ASSET_ID_RE match, that id is returned as is.
    Otherwise the last path segment, without its extension, is passed through
    safe_filename ("asset" stands in for an empty segment or stem).
    """
    url_path = urlparse(final_url).path
    asset_id = ASSET_ID_RE.search(url_path)
    if asset_id is not None:
        return asset_id.group(1)

    # fallback to filename stem
    last_segment = os.path.basename(url_path) or "asset"
    stem, _ext = os.path.splitext(last_segment)
    return safe_filename(stem or "asset")

def download_one(session, url, assets_path):
    """
    Download ``url`` into ``assets_path`` under a name that is not yet taken.

    Parameters:
        session: object with a requests-style ``get(url, stream=..., ...)``.
        url: absolute URL to fetch (redirects are followed).
        assets_path: existing directory that receives the file.

    Returns ``(ok, local_filename, final_url, error_msg)``:
        - success: ``(True, <file name>, <URL after redirects>, None)``
        - HTTP status other than 200, or an HTML body:
          ``(False, None, <URL after redirects>, <reason>)``
        - any exception: ``(False, None, url, str(exception))``

    The response is always closed, and a file that was only partly written
    is removed, so a failed attempt leaves nothing behind.
    """
    response = None
    partial_path = None  # set only while the output file is being written
    try:
        response = session.get(url, stream=True, allow_redirects=True, timeout=45)
        if response.status_code != 200:
            return False, None, response.url, f"HTTP {response.status_code}"

        ctype = (response.headers.get("content-type") or "").split(";")[0].strip().lower()
        if ctype.startswith("text/html"):
            return False, None, response.url, "Got HTML (likely blocked/redirected)"

        # decide extension: URL path first, then the MIME type
        ext = os.path.splitext(urlparse(response.url).path)[1]
        if not ext:
            ext = mimetypes.guess_extension(ctype) or ""
        ext = fix_extension(ext)

        base = choose_base_name(response.url)

        # Never overwrite: append _1, _2, ... until the name is free.
        final_name = f"{base}{ext}"
        save_path = os.path.join(assets_path, final_name)
        uniq_counter = 1
        while os.path.exists(save_path):
            final_name = f"{base}_{uniq_counter}{ext}"
            save_path = os.path.join(assets_path, final_name)
            uniq_counter += 1

        partial_path = save_path
        with open(save_path, "wb") as f_out:
            for chunk in response.iter_content(chunk_size=1024 * 64):
                if chunk:
                    f_out.write(chunk)
        partial_path = None

        # Fixed polite delay between downloads (retry back-off is handled
        # separately by the session's adapter).
        time.sleep(DELAY_SECONDS)
        return True, final_name, response.url, None

    except Exception as e:
        return False, None, url, str(e)

    finally:
        # With stream=True the connection stays checked out until the body is
        # fully read or the response is closed; the early returns above read nothing.
        if response is not None:
            response.close()
        if partial_path is not None and os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass  # best-effort cleanup only


# ---------------- MAIN WORKFLOW ----------------
def smart_sort_key(filepath):
    """
    Chronological sort key taken from a file name.

    Returns the eight leading digits (YYYYMMDD) of the base name as an int,
    or the sentinel 99999999 when the name does not start with eight digits,
    so undated files sort last.
    """
    leading_date = re.match(r"^(\d{8})", os.path.basename(filepath))
    return int(leading_date.group(1)) if leading_date else 99999999


def collect_qmd_files():
    """
    List every .qmd file under SEARCH_DIR, oldest dated file first.

    Directories whose path contains ASSETS_DIR_NAME are skipped. The result is
    ordered by smart_sort_key, with the path itself as tie-breaker.
    """
    qmd_paths = [
        os.path.join(folder, entry)
        for folder, _subdirs, entries in os.walk(SEARCH_DIR)
        if ASSETS_DIR_NAME not in folder
        for entry in entries
        if entry.endswith(".qmd")
    ]
    return sorted(qmd_paths, key=lambda path: (smart_sort_key(path), path))


def collect_urls_from_xml(xml_path):
    """
    Collect asset URLs from an XML export file.

    For every <item> under the root's <channel>, the text of
    ``content:encoded`` and ``wp:attachment_url`` is scanned with
    extract_urls_from_text.

    Returns the concatenated list of ``(raw, normalized, width_hint)`` tuples,
    or ``[]`` when ``xml_path`` is falsy, the file does not exist, or the
    document has no <channel>. A malformed file raises the parser's error.
    """
    if not xml_path or not os.path.exists(xml_path):
        return []

    # Prefix -> namespace URI, used by the findtext() lookups below.
    ns = {
        "content": "http://purl.org/rss/1.0/modules/content/",
        "wp": "http://wordpress.org/export/1.2/",
    }

    channel = ET.parse(xml_path).getroot().find("channel")
    if channel is None:
        return []

    extracted = []
    for item in channel.findall("item"):
        # `or ""` also covers elements that are present but empty.
        content = item.findtext("content:encoded", default="", namespaces=ns) or ""
        attach = item.findtext("wp:attachment_url", default="", namespaces=ns) or ""
        extracted.extend(extract_urls_from_text(content + "\n" + attach))
    return extracted


def download_assets():
    """
    Scan, download and relink assets in four passes.

    1. Extract asset URLs from every .qmd file returned by collect_qmd_files
       and keep, per asset_key, the candidate with the highest
       best_quality_score.
    2. Add candidates found in the XML export at XML_PATH (if any).
    3. Download each unique asset into <cwd>/ASSETS_DIR_NAME, writing
       ``_download_success.csv`` and, when something failed,
       ``_download_failed.txt`` into that folder.
    4. Rewrite the .qmd files so every link whose asset was downloaded points
       to the local copy, relative to the file.

    Returns None; progress and a summary are printed.
    """
    print(f"--- STARTING SCAN in: {os.getcwd()} ---")

    session = create_session()
    assets_path = os.path.join(os.getcwd(), ASSETS_DIR_NAME)
    os.makedirs(assets_path, exist_ok=True)

    # For each asset_key, keep best candidate (highest score)
    best_url_for_key = {}   # key -> (best_url, score)

    def keep_best(norm_u, width_hint):
        k = asset_key(norm_u)
        score = best_quality_score(norm_u, width_hint=width_hint)
        prev = best_url_for_key.get(k)
        if (prev is None) or (score > prev[1]):
            best_url_for_key[k] = (norm_u, score)

    # 1) Collect QMD URLs
    qmd_files = collect_qmd_files()
    print(f"Found {len(qmd_files)} .qmd files (chronological order).")

    for file_path in qmd_files:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
        for _raw, norm_u, w in extract_urls_from_text(content):
            keep_best(norm_u, w)

    print(f"Discovered {len(best_url_for_key)} unique asset targets from QMD.")

    # 2) Optionally extend with XML export
    xml_triples = collect_urls_from_xml(XML_PATH)
    if xml_triples:
        before = len(best_url_for_key)
        for _raw, norm_u, w in xml_triples:
            keep_best(norm_u, w)
        after = len(best_url_for_key)
        print(f"Extended with XML export: {before} -> {after} unique assets.")

    # 3) Download unique assets
    url_map = {}  # key -> local_filename
    success_csv = os.path.join(assets_path, "_download_success.csv")
    failed_txt = os.path.join(assets_path, "_download_failed.txt")

    success_rows = []
    failed = []

    keys = list(best_url_for_key.keys())
    print(f"Downloading {len(keys)} unique assets...")

    for i, k in enumerate(keys, 1):
        url = best_url_for_key[k][0]
        ok, local_name, final_url, err = download_one(session, url, assets_path)
        if ok:
            url_map[k] = local_name
            success_rows.append([k, url, final_url, local_name])
            print(f"[{i}/{len(keys)}] OK  -> {local_name}")
        else:
            failed.append(f"{url}\t{err}")
            print(f"[{i}/{len(keys)}] FAIL -> {url} ({err})")

    # write logs
    with open(success_csv, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["asset_key", "requested_url", "final_url", "local_filename"])
        w.writerows(success_rows)

    if failed:
        with open(failed_txt, "w", encoding="utf-8") as f:
            f.write("\n".join(failed) + "\n")

    # 4) Replace links in QMD
    print("Updating QMD links...")
    changed_files = 0

    for file_path in qmd_files:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        triples = extract_urls_from_text(content)
        if not triples:
            continue

        qmd_dir = os.path.dirname(file_path)
        rel_dir = os.path.relpath(assets_path, qmd_dir).replace("\\", "/")

        # raw link text -> local relative path, for links that were downloaded
        replacements = {}
        for raw, norm_u, _w in triples:
            k = asset_key(norm_u)
            if raw and k in url_map:
                replacements[raw] = f"{rel_dir}/{url_map[k]}".replace("\\", "/")

        # Longest raw string first: a short link that is a prefix or substring
        # of a longer one (same image with and without "?format=...", or
        # "/s/x.pdf" inside "https://host/s/x.pdf") must not be substituted
        # before the longer one, or the longer link ends up half-rewritten.
        new_content = content
        for raw in sorted(replacements, key=len, reverse=True):
            new_content = new_content.replace(raw, replacements[raw])

        if new_content != content:
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(new_content)
            changed_files += 1

    print(f"Done.")
    print(f"- Downloaded: {len(url_map)}")
    print(f"- Failed: {len(failed)} (see {failed_txt})" if failed else "- Failed: 0")
    print(f"- QMD files updated: {changed_files}")
    print(f"- Logs: {success_csv}")


# Entry point: run the scan/download/relink passes when executed as the main module.
if __name__ == "__main__":
    download_assets()


--- STARTING SCAN in: C:\Users\benny\OneDrive\Documents\Github\site\notebook ---
Found 139 .qmd files (chronological order).
Discovered 1013 unique asset targets from QMD.
Downloading 1013 unique assets...
[1/1013] OK  -> 1589970243551-TI4HHMOJFZOGLUJB19WX.jpg
[2/1013] OK  -> 1589970591143-YHG1TDVA2R9YHBJI0CFX.jpg
[3/1013] OK  -> 1607418713378-6IIY5BYUD6NFY2MOVF5G.jpg
[4/1013] OK  -> 1607419487327-WSBT820AUOYD80WO83IL.jpg
[5/1013] OK  -> 1607421400770-X17HT6PKV206P10JL1XE.png
[6/1013] OK  -> 1607421402584-P8RGCU47EXZAG0XENSGR.png
[7/1013] OK  -> 1607421404422-R2U9KRA8KXHX6HKG5XAI.png
[8/1013] OK  -> 1607421407557-MLPAOSAGN3MUKRWXSJND.png
[9/1013] OK  -> 1607421411007-0UTKLSL9VD6T4SQAIT7W.png
[10/1013] OK  -> 1607421414781-9XNFEM9CRZS02O1AAMNU.png
[11/1013] OK  -> 1607421418259-FYGE7IB89QD2T4MAAUSX.png
[12/1013] OK  -> 1607421421371-D0OYPIUHRZF7LHXINKHX.png
[13/1013] OK  -> 1607421425745-FISBMKO0KWDVGSMD8UWJ.png
[14/1013] OK  -> 1607421431834-UAIW48FPGME5X9LZNXP7.png
[15/1013] OK  -> 16

In [3]:
# Parameters for the run_download(...) call further down.
# NOTE(review): this cell uses `os` but does not import it; it relies on an
# earlier cell (or the imports cell that follows) having been run first.
SITE_DIR = r"C:\Users\benny\OneDrive\Documents\Github\site\temp3"   # where your .qmd are
# Downloads go into an "assets" folder inside the site directory.
ASSETS_DIR = os.path.join(SITE_DIR, "assets")
# Text file read line by line, one URL per line, as extra download candidates.
MANIFEST_PATH = os.path.join(SITE_DIR, "_asset_manifest.txt")

BASE_SITE_URL = "https://benny.istan.to"  # for /s/... links if any
# Seconds slept after each downloaded file.
DELAY_SECONDS = 1.0
OVERWRITE_EXISTING = False  # if True, redownload even if file exists


In [4]:
import os, re, csv, time, html, mimetypes
from urllib.parse import urlparse, unquote, parse_qs, urljoin

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Default request headers installed on the session built by create_session().
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
}

# Domains that usually host actual files
# (compared against the lower-cased netloc, so entries must be lower case).
ASSET_HOSTS = {
    "images.squarespace-cdn.com",
    "static1.squarespace.com",
    "images.squarespace.com",
    "static.squarespace.com",
}

# Your site domain CAN be kept, but only for /s/ file URLs
SITE_HOST_ALLOW = {"benny.istan.to", "bennyistanto.squarespace.com"}

# Lower-case extensions that mark a CDN URL as a downloadable file.
DOWNLOADABLE_EXTS = {
    ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg",
    ".pdf", ".zip", ".mp4", ".mov", ".mp3", ".wav",
    ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
}

# Fenced code blocks (possibly multi-line); removed before URL extraction.
FENCED_CODE_RE = re.compile(r"```.*?```", re.DOTALL)
# Markdown ![alt](target) -> group 1, [text](target) -> group 2.
MD_LINK_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)|\[[^\]]*\]\(([^)]+)\)")
# Quoted value of HTML attributes that commonly reference assets.
HTML_ATTR_RE = re.compile(r"""(?:src|href|data-src|data-image|data-original|poster)\s*=\s*["']([^"']+)["']""", re.IGNORECASE)
# Whole srcset attribute value; the caller splits it on commas.
SRCSET_RE = re.compile(r"""srcset\s*=\s*["']([^"']+)["']""", re.IGNORECASE)
# Target of a CSS url(...), without optional quotes.
CSS_URL_RE = re.compile(r"""url\(\s*['"]?([^'")]+)['"]?\s*\)""", re.IGNORECASE)
# Any bare http(s) URL up to whitespace, a quote, an angle bracket or a parenthesis.
RAW_URL_RE = re.compile(r"""https?://[^\s"'<>()]+""", re.IGNORECASE)

# 13 digits, a hyphen, then 8+ upper-case letters/digits; used as the local file stem when present.
ASSET_ID_RE = re.compile(r"(\d{13}-[A-Z0-9]{8,})")


def create_session():
    """
    Build the shared HTTP session.

    Sends HEADERS with every request and mounts one retrying adapter for both
    https:// and http://: up to 6 retries (backoff factor 1) of HEAD/GET/OPTIONS
    on 429, 500, 502, 503 and 504.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=6,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],
        )
    )

    browser_session = requests.Session()
    browser_session.headers.update(HEADERS)
    for scheme_prefix in ("https://", "http://"):
        browser_session.mount(scheme_prefix, retrying_adapter)
    return browser_session


def strip_trailing_punct(u: str) -> str:
    """Remove any run of closing punctuation, quote or bracket characters from the end of ``u``."""
    trailing_chars = ').,;:\'"!?]>'
    return u.rstrip(trailing_chars)

def normalize_url(u: str, base_site_url: str) -> str:
    """
    Turn a link as written in a document into an absolute URL.

    The input is trimmed of whitespace and surrounding angle brackets,
    HTML-unescaped and stripped of trailing punctuation. Then "//host/..."
    gets an "https:" scheme, and any other value starting with "/" is joined
    onto ``base_site_url``. Everything else is returned as cleaned; ``None``
    is treated as "".
    """
    cleaned = strip_trailing_punct(html.unescape((u or "").strip().strip("<>")))

    if cleaned.startswith("//"):
        return "https:" + cleaned
    if cleaned.startswith("/"):
        site_root = base_site_url.rstrip("/") + "/"
        return urljoin(site_root, cleaned.lstrip("/"))
    return cleaned


def looks_like_downloadable_file(u: str) -> bool:
    """
    Decide whether ``u`` points at a file worth downloading rather than a page.

    - Host in ASSET_HOSTS: the path extension must be in DOWNLOADABLE_EXTS, or
      the query string must carry a ``content-type`` parameter.
    - Host in SITE_HOST_ALLOW: the path must start with "/s/" and have a
      (any) extension.
    - Every other host, and any input that fails to parse: False.
    """
    try:
        parsed = urlparse(u)
        host = parsed.netloc.lower()
        path = parsed.path or ""
        ext = os.path.splitext(path)[1].lower()

        # 1) Asset CDN hosts: known file extension OR content-type query
        if host in ASSET_HOSTS:
            return ext in DOWNLOADABLE_EXTS or "content-type" in parse_qs(parsed.query)

        # 2) Own domain: only Squarespace file links (/s/...) that carry an extension
        if host in SITE_HOST_ALLOW:
            return path.startswith("/s/") and bool(ext)

        return False
    except Exception:
        return False


def asset_key(u: str) -> str:
    """De-duplication key for a URL: lower-cased host followed by the path (scheme, query and fragment ignored)."""
    parts = urlparse(u)
    return "{}{}".format(parts.netloc.lower(), parts.path)


def best_quality_score(u: str, width_hint: int = 0) -> int:
    """
    Rank a candidate URL; higher means preferred for its asset key.

    Starts from ``width_hint`` (0 when falsy). A ``format`` query value holding
    "<3-5 digits>w" raises the score to that width, and a value of "original"
    or "raw" raises it to 99999. Any error while parsing leaves the score
    as computed so far.
    """
    score = width_hint or 0
    try:
        format_values = parse_qs(urlparse(u).query).get("format", [""])
        fmt = (format_values[0] or "").lower()

        width_match = re.search(r"(\d{3,5})w", fmt)
        if width_match:
            score = max(score, int(width_match.group(1)))

        if fmt in ("original", "raw"):
            score = max(score, 99999)
    except Exception:
        pass
    return score


def safe_filename(name: str) -> str:
    """
    Reduce ``name`` to characters that are safe in a file name.

    Percent-escapes are decoded, spaces become underscores, everything outside
    ``A-Za-z0-9._-`` is removed, runs of underscores collapse to one, and
    leading/trailing ``.``, ``_`` and ``-`` are trimmed. Falls back to
    "asset" when nothing is left.
    """
    decoded = unquote(name)
    underscored = decoded.replace(" ", "_")
    allowed_only = re.sub(r"[^A-Za-z0-9._-]+", "", underscored)
    tidy = re.sub(r"_{2,}", "_", allowed_only).strip("._-")
    return tidy if tidy else "asset"


def fix_extension(ext: str) -> str:
    """Lower-case ``ext``; map a missing extension and the JPEG aliases .jpe/.jpeg to ".jpg"."""
    lowered = (ext or "").lower()
    if lowered == "":
        return ".jpg"
    return ".jpg" if lowered in (".jpe", ".jpeg") else lowered


def choose_base_name(final_url: str) -> str:
    """
    Pick the local file stem for a downloaded URL.

    If the URL path contains an ASSET_ID_RE match, that id is returned as is.
    Otherwise the last path segment, without its extension, is passed through
    safe_filename ("asset" stands in for an empty segment or stem).
    """
    url_path = urlparse(final_url).path
    asset_id = ASSET_ID_RE.search(url_path or "")
    if asset_id is not None:
        return asset_id.group(1)

    last_segment = os.path.basename(url_path) or "asset"
    stem, _ext = os.path.splitext(last_segment)
    return safe_filename(stem or "asset")


def extract_urls_from_text(text: str, base_site_url: str):
    """
    Find downloadable asset links in ``text`` (fenced code blocks are ignored).

    Returns a list of ``(raw, normalized, width_hint)`` tuples in discovery
    order: markdown links/images, HTML attributes, srcset entries, CSS url(),
    then bare URLs. ``raw`` is the string as found (after trimming),
    ``normalized`` is normalize_url(raw, base_site_url), and ``width_hint`` is
    the "NNNw" descriptor of a srcset entry (0 everywhere else). Only entries
    accepted by looks_like_downloadable_file are kept; the same URL may appear
    more than once.
    """
    body = FENCED_CODE_RE.sub("", text or "")
    candidates = []

    def remember(raw_value, width=0):
        candidates.append((raw_value, normalize_url(raw_value, base_site_url), width))

    # Markdown links/images: each match is (image_target, link_target), one of them empty.
    for groups in MD_LINK_RE.findall(body):
        for target in groups:
            if target:
                remember(target.strip().strip('"').strip("'"))

    # HTML attributes
    for attr_value in HTML_ATTR_RE.findall(body):
        remember(attr_value.strip())

    # srcset: comma-separated "URL 500w" / "URL 2x" entries
    for srcset_value in SRCSET_RE.findall(body):
        for entry in srcset_value.split(","):
            entry = entry.strip()
            if not entry:
                continue
            pieces = entry.split()
            width = 0
            if len(pieces) > 1:
                descriptor = re.match(r"(\d{2,5})w", pieces[1].lower())
                if descriptor:
                    width = int(descriptor.group(1))
            remember(pieces[0].strip(), width)

    # CSS url(...)
    for css_target in CSS_URL_RE.findall(body):
        remember(css_target.strip())

    # Raw URLs
    for bare_url in RAW_URL_RE.findall(body):
        remember(bare_url.strip())

    # Keep only downloadables
    return [
        (raw, norm, width)
        for raw, norm, width in candidates
        if norm and looks_like_downloadable_file(norm)
    ]


def collect_qmd_files(site_dir: str, assets_dir: str):
    """
    Return the sorted paths of all .qmd files under ``site_dir``.

    ``assets_dir`` and everything below it is excluded. The exclusion compares
    whole path components, so a sibling folder whose name merely starts with
    the same characters (e.g. "assets_old" next to "assets") is still scanned.
    """
    assets_abs = os.path.normcase(os.path.abspath(assets_dir)).rstrip(os.sep)
    assets_prefix = assets_abs + os.sep

    qmds = []
    for root, dirs, files in os.walk(site_dir):
        root_abs = os.path.normcase(os.path.abspath(root))
        # skip the assets dir itself and its subtree
        if root_abs == assets_abs or root_abs.startswith(assets_prefix):
            dirs[:] = []  # prune in place so os.walk does not descend further
            continue
        for fn in files:
            if fn.endswith(".qmd"):
                qmds.append(os.path.join(root, fn))
    qmds.sort()
    return qmds


def download_one(session, url, assets_path, delay_seconds: float, overwrite: bool):
    """
    Download ``url`` into ``assets_path``.

    Parameters:
        session: object with a requests-style ``get(url, stream=..., ...)``.
        url: absolute URL to fetch (redirects are followed).
        assets_path: existing directory that receives the file.
        delay_seconds: pause after a file has actually been written.
        overwrite: when falsy, an existing file of the same name is kept and
            reported as success without being rewritten; when truthy it is
            replaced.

    Returns ``(ok, local_filename, final_url, error_msg)``:
        - success or kept existing file: ``(True, <name>, <final URL>, None)``
        - HTTP status other than 200, or an HTML body:
          ``(False, None, <final URL>, <reason>)``
        - any exception: ``(False, None, url, str(exception))``

    The file name is derived from the final URL only, so the request is made
    even when the file turns out to exist already. The response is always
    closed, and a partly written file is removed on failure.
    """
    response = None
    partial_path = None  # set only while the output file is being written
    try:
        response = session.get(url, stream=True, allow_redirects=True, timeout=45)
        if response.status_code != 200:
            return False, None, response.url, f"HTTP {response.status_code}"

        ctype = (response.headers.get("content-type") or "").split(";")[0].strip().lower()
        if ctype.startswith("text/html"):
            return False, None, response.url, "Got HTML (page/blocked/redirected)"

        # extension: URL path first, then the MIME type
        ext = os.path.splitext(urlparse(response.url).path)[1]
        if not ext:
            ext = mimetypes.guess_extension(ctype) or ""
        ext = fix_extension(ext)

        final_name = f"{choose_base_name(response.url)}{ext}"
        save_path = os.path.join(assets_path, final_name)

        # Already present and not asked to overwrite: reuse the file. (This is
        # the only case in which an existing name is encountered, so no
        # "_1", "_2" suffixing is needed afterwards.)
        if os.path.exists(save_path) and not overwrite:
            return True, final_name, response.url, None

        partial_path = save_path
        with open(save_path, "wb") as f_out:
            for chunk in response.iter_content(chunk_size=1024 * 64):
                if chunk:
                    f_out.write(chunk)
        partial_path = None

        time.sleep(delay_seconds)
        return True, final_name, response.url, None

    except Exception as e:
        return False, None, url, str(e)

    finally:
        # With stream=True the connection stays checked out until the body is
        # fully read or the response is closed; the early returns above read nothing.
        if response is not None:
            response.close()
        # A truncated file would otherwise be taken for a finished download
        # by the "already present" check on the next run.
        if partial_path is not None and os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass  # best-effort cleanup only


def run_download(site_dir, assets_dir, manifest_path, base_site_url, delay_seconds=1.0, overwrite=False):
    """
    Scan, download and relink assets for the site under ``site_dir``.

    Parameters:
        site_dir: directory walked for .qmd files.
        assets_dir: directory that receives downloads and the two log files
            (created if missing; never scanned).
        manifest_path: optional text file with one URL per line whose entries
            are added to the download candidates; ignored if falsy or missing.
        base_site_url: base used to resolve root-relative links.
        delay_seconds: pause after each written file.
        overwrite: passed to download_one; when falsy, existing files are kept.

    Steps: collect candidates from the .qmd files and the manifest (best
    best_quality_score wins per asset_key), download them, write
    ``_download_success.csv`` / ``_download_failed.txt`` into ``assets_dir``,
    then rewrite every .qmd link whose asset was downloaded to a path relative
    to that file. Returns None; progress and a summary are printed.
    """
    print(f"--- STARTING SCAN in: {site_dir} ---")
    os.makedirs(assets_dir, exist_ok=True)

    session = create_session()
    qmd_files = collect_qmd_files(site_dir, assets_dir)
    print(f"Found {len(qmd_files)} .qmd files.")

    # key -> (best_url, score)
    best = {}

    def keep_best(candidate_url, width_hint):
        k = asset_key(candidate_url)
        score = best_quality_score(candidate_url, width_hint)
        prev = best.get(k)
        if prev is None or score > prev[1]:
            best[k] = (candidate_url, score)

    # QMD scan
    qmd_count = 0
    for fp in qmd_files:
        with open(fp, "r", encoding="utf-8") as f:
            text = f.read()
        triples = extract_urls_from_text(text, base_site_url)
        for _raw, norm, w in triples:
            keep_best(norm, w)
        qmd_count += len(triples)

    print(f"QMD scan: {len(best)} unique downloadable assets (from {qmd_count} URL hits).")

    # Manifest scan
    if manifest_path and os.path.exists(manifest_path):
        before = len(best)
        with open(manifest_path, "r", encoding="utf-8") as f:
            for line in f:
                u = normalize_url(line.strip(), base_site_url)
                if u and looks_like_downloadable_file(u):
                    keep_best(u, 0)
        after = len(best)
        print(f"Manifest added: {before} -> {after} unique downloadable assets.")
    else:
        print("Manifest not found / not used (this is why you only saw ~1013 previously).")

    keys = list(best.keys())
    print(f"Downloading {len(keys)} unique assets...")

    success_csv = os.path.join(assets_dir, "_download_success.csv")
    failed_txt = os.path.join(assets_dir, "_download_failed.txt")
    url_map = {}  # asset_key -> local file name
    success_rows = []
    failed = []

    for i, k in enumerate(keys, 1):
        url = best[k][0]
        ok, local_name, final_url, err = download_one(session, url, assets_dir, delay_seconds, overwrite)
        # Only the first 20 and every 200th result are echoed to keep output short.
        if ok:
            url_map[k] = local_name
            success_rows.append([k, url, final_url, local_name])
            if i <= 20 or i % 200 == 0:
                print(f"[{i}/{len(keys)}] OK  -> {local_name}")
        else:
            failed.append(f"{url}\t{err}")
            if i <= 20 or i % 200 == 0:
                print(f"[{i}/{len(keys)}] FAIL -> {url} ({err})")

    with open(success_csv, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["asset_key", "requested_url", "final_url", "local_filename"])
        w.writerows(success_rows)

    if failed:
        with open(failed_txt, "w", encoding="utf-8") as f:
            f.write("\n".join(failed) + "\n")

    # Rewrite QMD links
    changed_files = 0
    for fp in qmd_files:
        with open(fp, "r", encoding="utf-8") as f:
            text = f.read()

        triples = extract_urls_from_text(text, base_site_url)
        if not triples:
            continue

        qmd_dir = os.path.dirname(fp)
        rel_assets = os.path.relpath(assets_dir, qmd_dir).replace("\\", "/")

        # raw link text -> local relative path, for links that were downloaded
        replacements = {}
        for raw, norm, _w in triples:
            k = asset_key(norm)
            if raw and k in url_map:
                replacements[raw] = f"{rel_assets}/{url_map[k]}".replace("\\", "/")

        # Longest raw string first: a short link that is a prefix or substring
        # of a longer one (same image with and without "?format=...", or
        # "/s/x.pdf" inside "https://host/s/x.pdf") must not be substituted
        # before the longer one, or the longer link ends up half-rewritten.
        new_text = text
        for raw in sorted(replacements, key=len, reverse=True):
            new_text = new_text.replace(raw, replacements[raw])

        if new_text != text:
            with open(fp, "w", encoding="utf-8", newline="\n") as f:
                f.write(new_text)
            changed_files += 1

    print("\nDone.")
    print(f"- Downloaded: {len(url_map)}")
    print(f"- Failed: {len(failed)} (see {failed_txt})" if failed else "- Failed: 0")
    print(f"- Updated QMD files: {changed_files}")
    print(f"- Logs: {success_csv}")


In [5]:
# Execute the scan/download/relink run with the settings from the configuration cell.
run_download(
    site_dir=SITE_DIR,
    assets_dir=ASSETS_DIR,
    manifest_path=MANIFEST_PATH,
    base_site_url=BASE_SITE_URL,
    delay_seconds=DELAY_SECONDS,
    overwrite=OVERWRITE_EXISTING
)


--- STARTING SCAN in: C:\Users\benny\OneDrive\Documents\Github\site\temp3 ---
Found 139 .qmd files.
QMD scan: 0 unique downloadable assets (from 0 URL hits).
Manifest added: 0 -> 951 unique downloadable assets.
Downloading 951 unique assets...
[1/951] OK  -> CV_ISTANTO_Benny_20250516.pdf
[2/951] OK  -> 15325333_10211141699402159_1841558926548787098_o_10211141699402159.jpg
[3/951] OK  -> mmr_pheno_eos1_2022.png
[4/951] OK  -> Screenshot2023-08-12141329.png
[5/951] OK  -> ScreenShot2022-03-19at9.42.39PM.png
[6/951] OK  -> ukr_pop_landscan_population_2021_wbg_A4L.png
[7/951] OK  -> ScreenShot2022-10-18at5.00.48PM.png
[8/951] OK  -> ScreenShot2022-10-18at4.36.45PM.png
[9/951] OK  -> 1692902253633.jpg
[10/951] OK  -> Screenshot2023-06-15075323.png
[11/951] OK  -> Heatwave_CA_2016a.png
[12/951] OK  -> IMG_2664.jpg
[13/951] OK  -> idn_annualrain_2021.png
[14/951] OK  -> viz4.png
[15/951] OK  -> Screenshot2023-05-09224753.png
[16/951] OK  -> 1589366320149-LLK82OODORI0BZMMZ0LA.png
[17/951] OK  