In [2]:
# ---------- CONFIG ----------
# Every value a reader may need to change before running the notebook lives here.
# NOTE(review): the two paths below are absolute and machine-specific; adjust them
# before running on another computer.

# WXR (WordPress eXtended RSS) export file to convert.
INPUT_XML_FILE = r"C:\Users\benny\Downloads\Squarespace-Wordpress-Export-01-31-2026.xml"
# Folder that receives the generated .qmd files and the asset manifests.
OUTPUT_DIR     = r"C:\Users\benny\OneDrive\Documents\Github\site\temp3"   # or just "site" for relative
# Site origin prepended to root-relative links.
BASE_SITE_URL  = "https://benny.istan.to"  # used to resolve /s/... or /... links

PRESERVE_HIERARCHY = True     # nest pages by wp:post_parent
# True: replace existing .qmd files. False: keep them and write "-2", "-3", ... suffixed copies.
# The asset manifests are rewritten on every run regardless of this flag.
OVERWRITE          = False
# Applies to a PAGE whose slug is "blog".
# False: it is written under pages/blog/. True: it is written into blog/ as index.qmd.
ALLOW_BLOG_PAGE_OVERWRITE = False


In [3]:
import csv
import os
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from email.utils import parsedate_to_datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Prefix -> namespace URI map passed to ElementTree lookups so that prefixed
# tags such as "wp:post_type" or "content:encoded" can be resolved.
NAMESPACES = {
    "wp": "http://wordpress.org/export/1.2/",
    "content": "http://purl.org/rss/1.0/modules/content/",
    "excerpt": "http://wordpress.org/export/1.2/excerpt/",
    "dc": "http://purl.org/dc/elements/1.1/",
}

# Matches an http:// or https:// URL (case-insensitive) and stops at the first
# whitespace, quote character or angle bracket.
RAW_URL_RE = re.compile(r"https?://[^\s\"\'<>]+", re.IGNORECASE)


def get_text(element: ET.Element, tag: str, ns: dict | None = None) -> str:
    try:
        found = element.find(tag, namespaces=ns) if ns else element.find(tag)
        return (found.text or "").strip() if found is not None else ""
    except Exception:
        return ""


def parse_rfc822_to_iso(pub_date: str) -> str:
    """Convert an RFC 822/2822 date string (e.g. an RSS ``pubDate``) to ``YYYY-MM-DD``.

    Parsing is delegated to ``email.utils.parsedate_to_datetime`` instead of
    ``datetime.strptime``: it does not depend on the process locale for day and
    month names, and it accepts named zones such as ``GMT`` as well as numeric
    offsets such as ``+0000``.

    Args:
        pub_date: Date text, for example ``"Mon, 01 Jan 2024 12:00:00 +0000"``.

    Returns:
        The calendar date as written in the input (no timezone conversion),
        or ``""`` when the input is empty, not a string, or cannot be parsed.
    """
    if not pub_date or not isinstance(pub_date, str):
        return ""
    try:
        return parsedate_to_datetime(pub_date).strftime("%Y-%m-%d")
    except (TypeError, ValueError):
        # Unparseable dates are reported as "no date" rather than aborting the export.
        return ""


def yaml_escape(s: str) -> str:
    """Make ``s`` safe to place between double quotes in YAML front matter.

    - Backslashes are doubled, because a backslash starts an escape sequence
      inside a double-quoted YAML scalar.
    - Double quotes become single quotes so they cannot terminate the scalar.
    - Line breaks (``\\r\\n``, ``\\r``, ``\\n``) become single spaces.

    Args:
        s: Raw text; ``None`` or an empty value is treated as ``""``.

    Returns:
        The sanitized text with leading and trailing whitespace removed.
    """
    text = s or ""
    # Escape backslashes first so no later step can introduce new ones.
    text = text.replace("\\", "\\\\")
    text = text.replace('"', "'")
    text = text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
    return text.strip()


def format_yaml(title: str, date_rfc822: str, categories: list[str], tags: list[str],
                author: str, summary: str = "") -> str:
    """Build the YAML front-matter block for one document.

    Args:
        title: Document title; a falsy value becomes ``"Untitled"``.
        date_rfc822: Publication date, converted with ``parse_rfc822_to_iso``;
            the ``date`` key is omitted when the conversion yields ``""``.
        categories: Category names (may be ``None`` or empty).
        tags: Tag names (may be ``None`` or empty). Categories and tags are
            merged, de-duplicated, sorted and emitted under ``categories``.
        author: Author name; the ``author`` key is omitted when falsy.
        summary: Optional text emitted as ``description``.

    Returns:
        The front matter, opened and closed by ``---`` lines and ending with a
        newline.
    """
    heading = yaml_escape(title or "Untitled")
    lines = ["---", f'title: "{heading}"']

    if author:
        lines.append(f'author: "{yaml_escape(author)}"')

    iso_date = parse_rfc822_to_iso(date_rfc822)
    if iso_date:
        lines.append(f'date: "{iso_date}"')

    # Categories and tags share one de-duplicated, alphabetically sorted list.
    labels = set()
    for label in (categories or []) + (tags or []):
        if label:
            labels.add(label)
    if labels:
        lines.append("categories:")
        lines.extend(f'  - "{yaml_escape(label)}"' for label in sorted(labels))

    if summary:
        lines.append(f'description: "{yaml_escape(summary)}"')

    return "\n".join(lines + ["---"]) + "\n"


def pick_best_from_srcset(srcset: str) -> str:
    """Return the URL of the highest-ranked candidate in a ``srcset`` value.

    Candidates are ranked by their descriptor:

    - ``<n>w`` (width): ranked by the integer width.
    - ``<n>x`` (pixel density): ranked by the density.
    - no descriptor: ranked as ``1x``, the implicit density for a bare candidate.
    - anything unparseable: ranked as 0.

    When several candidates share the top rank, the last one listed wins.

    Args:
        srcset: Raw attribute value, e.g. ``"a.jpg 300w, b.jpg 1500w"``.

    Returns:
        The chosen URL, or ``""`` when ``srcset`` is empty or holds no candidate.
    """
    if not srcset:
        return ""

    best_url, best_rank = "", -1.0
    for candidate in srcset.split(","):
        tokens = candidate.split()
        if not tokens:
            continue  # empty segment, e.g. from a trailing comma
        url = tokens[0]
        descriptor = tokens[1].lower() if len(tokens) > 1 else ""

        rank = 0.0
        try:
            if descriptor.endswith("w"):
                rank = int(descriptor[:-1])
            elif descriptor.endswith("x"):
                rank = float(descriptor[:-1])
            elif not descriptor:
                rank = 1.0
        except ValueError:
            rank = 0.0
        if rank != rank:
            # NaN never compares >=; fall back to the neutral rank instead.
            rank = 0.0

        if rank >= best_rank:
            best_url, best_rank = url, rank
    return best_url


def normalize_url(u: str, base_site_url: str) -> str:
    """Turn protocol-relative and root-relative URLs into absolute ones.

    Args:
        u: URL as found in the markup; ``None`` or blank yields ``""``.
        base_site_url: Site origin used for root-relative URLs.

    Returns:
        - ``"https:" + u`` when ``u`` starts with ``//``;
        - ``u`` joined onto ``base_site_url`` when it starts with a single ``/``;
        - the trimmed input otherwise.
    """
    cleaned = (u or "").strip()
    if not cleaned:
        return ""
    if cleaned.startswith("//"):
        return "https:" + cleaned
    if cleaned.startswith("/"):
        # Guarantee exactly one slash between the site root and the path.
        site_root = base_site_url.rstrip("/") + "/"
        return urljoin(site_root, cleaned.lstrip("/"))
    return cleaned


def normalize_media_attributes(soup: BeautifulSoup, base_site_url: str) -> None:
    """Rewrite image, link and media URLs in ``soup`` in place.

    For each ``<img>`` the source is taken from the first available of:
    ``data-src``, ``data-image``, ``data-original``, the best ``srcset``
    candidate, then the existing ``src``. The normalized result is stored in
    ``src`` and a non-empty ``srcset`` is removed. ``href`` on ``<a>`` and
    ``src`` on ``<source>``, ``<video>``, ``<audio>`` and ``<iframe>`` are
    normalized as well.

    Args:
        soup: Parsed document; modified in place.
        base_site_url: Site origin passed on to ``normalize_url``.
    """
    lazy_attrs = ("data-src", "data-image", "data-original")

    for image in soup.find_all("img"):
        # First non-empty lazy-loading attribute wins.
        source = next((image.get(name) for name in lazy_attrs if image.get(name)), "")
        srcset = image.get("srcset")
        if not source and srcset:
            source = pick_best_from_srcset(srcset)
        if not source and image.get("src"):
            source = image.get("src")

        resolved = normalize_url(source, base_site_url)
        if resolved:
            image["src"] = resolved
        if srcset:
            del image["srcset"]

    for anchor in soup.find_all("a"):
        resolved = normalize_url(anchor.get("href", ""), base_site_url)
        if resolved:
            anchor["href"] = resolved

    for tag_name in ("source", "video", "audio", "iframe"):
        for element in soup.find_all(tag_name):
            resolved = normalize_url(element.get("src", ""), base_site_url)
            if resolved:
                element["src"] = resolved


def clean_content_to_markdown(html_content: str, base_site_url: str) -> str:
    """Convert an HTML fragment to Markdown.

    Args:
        html_content: Raw HTML; a falsy value yields ``""``.
        base_site_url: Site origin used when normalizing URLs.

    Returns:
        Markdown with ATX headings and ``-`` bullets, runs of three or more
        newlines collapsed to two, and surrounding whitespace removed.
    """
    if not html_content:
        return ""

    document = BeautifulSoup(html_content, "lxml")

    # Drop the wrapper divs but keep their children.
    for wrapper in document.find_all("div", class_="sqs-html-content"):
        wrapper.unwrap()

    # URLs must be fixed before conversion so the Markdown carries the final links.
    normalize_media_attributes(document, base_site_url)

    converted = md(str(document), heading_style="ATX", bullets="-")
    collapsed = re.sub(r"\n{3,}", "\n\n", converted)
    return collapsed.strip()


def extract_asset_urls_from_raw_html(html_content: str, base_site_url: str) -> set[str]:
    """Collect every URL referenced by an HTML fragment.

    Three sources are combined: URL-bearing attributes on any tag, each
    candidate of every ``<img srcset>``, and any ``http(s)://`` text matched by
    ``RAW_URL_RE`` in the raw markup. Each hit goes through ``normalize_url``.

    Args:
        html_content: Raw HTML; a falsy value yields an empty set.
        base_site_url: Site origin used when normalizing URLs.

    Returns:
        The set of non-empty normalized URLs.
    """
    found: set[str] = set()
    if not html_content:
        return found

    soup = BeautifulSoup(html_content, "lxml")

    url_attrs = ("src", "href", "data-src", "data-image", "data-original", "poster")
    for element in soup.find_all(True):
        found.update(
            normalize_url(element.get(name), base_site_url)
            for name in url_attrs
            if element.get(name)
        )

    # Every srcset candidate is recorded, not only the best one.
    for image in soup.find_all("img"):
        srcset = image.get("srcset")
        if not srcset:
            continue
        for candidate in srcset.split(","):
            candidate = candidate.strip()
            if candidate:
                found.add(normalize_url(candidate.split()[0].strip(), base_site_url))

    # Fallback: URLs that appear anywhere in the raw markup.
    found.update(normalize_url(match, base_site_url) for match in RAW_URL_RE.findall(html_content))

    found.discard("")
    return found


def write_blog_index(out_root: str, overwrite: bool = False) -> None:
    """Create ``<out_root>/blog/index.qmd`` containing a Quarto listing page.

    Args:
        out_root: Output root; the ``blog`` sub-folder is created if needed.
        overwrite: When ``False`` an existing ``index.qmd`` is left untouched.
    """
    listing_dir = os.path.join(out_root, "blog")
    os.makedirs(listing_dir, exist_ok=True)

    index_path = os.path.join(listing_dir, "index.qmd")
    keep_existing = not overwrite and os.path.exists(index_path)
    if keep_existing:
        return

    page_source = """---
title: "Blog"
listing:
  contents: .
  sort: "date desc"
  type: default
  categories: true
  page-size: 20
  exclude: "index.qmd"
---

Welcome to the blog.
"""
    with open(index_path, "w", encoding="utf-8", newline="\n") as handle:
        handle.write(page_source)


def build_pages_map(channel: ET.Element) -> dict[str, dict]:
    """Index the items of a WXR ``<channel>`` by ``wp:post_id``.

    Items whose ``wp:post_type`` is ``attachment`` and items without a post id
    are left out.

    Args:
        channel: The ``<channel>`` element of the export.

    Returns:
        ``{post_id: {"slug": <wp:post_name>, "parent_id": <wp:post_parent>}}``.
    """
    lookup: dict[str, dict] = {}
    for entry in channel.findall("item"):
        if get_text(entry, "wp:post_type", NAMESPACES) == "attachment":
            continue
        entry_id = get_text(entry, "wp:post_id", NAMESPACES)
        if not entry_id:
            continue
        lookup[entry_id] = {
            "slug": get_text(entry, "wp:post_name", NAMESPACES),
            "parent_id": get_text(entry, "wp:post_parent", NAMESPACES),
        }
    return lookup


def get_parent_slug_path(pages_map: dict[str, dict], parent_id: str) -> str:
    """Build the slug path of ``parent_id`` and all of its ancestors.

    The chain is walked upward iteratively and stops at the first id that is
    empty, ``"0"``, missing from ``pages_map``, or already visited. The last
    condition keeps a cyclic or self-referencing ``parent_id`` chain from
    looping forever (the recursive form would raise ``RecursionError``).

    Args:
        pages_map: ``{post_id: {"slug": ..., "parent_id": ...}}``.
        parent_id: Id of the immediate parent.

    Returns:
        The slugs joined with ``os.path.join`` from the outermost ancestor down
        to ``parent_id``, or ``""`` when there is no usable parent.
    """
    slugs: list[str] = []
    visited: set[str] = set()
    current = parent_id
    while current and current != "0" and current in pages_map and current not in visited:
        visited.add(current)
        node = pages_map[current]
        slugs.append(node["slug"])
        current = node["parent_id"]

    if not slugs:
        return ""
    # Slugs were collected child-first; the path runs ancestor-first.
    return os.path.join(*reversed(slugs))


def _reserve_output_path(path: str, reserved: set[str], overwrite: bool) -> str:
    """Pick a writable variant of ``path`` and record it in ``reserved``.

    A path counts as taken when it was already handed out during this run, or,
    when ``overwrite`` is false, when it exists on disk. Taken paths get a
    ``-2``, ``-3``, ... suffix before the extension. Paths are compared through
    ``os.path.normcase``.
    """
    def is_taken(candidate: str) -> bool:
        if os.path.normcase(candidate) in reserved:
            return True
        return (not overwrite) and os.path.exists(candidate)

    chosen = path
    if is_taken(chosen):
        base, ext = os.path.splitext(path)
        i = 2
        while is_taken(f"{base}-{i}{ext}"):
            i += 1
        chosen = f"{base}-{i}{ext}"
    reserved.add(os.path.normcase(chosen))
    return chosen


def convert_wxr_to_quarto(
    input_xml: str,
    out_root: str,
    base_site_url: str,
    preserve_hierarchy: bool = True,
    overwrite: bool = False,
    allow_blog_page_overwrite: bool = False
):
    """Convert a WXR export into Quarto ``.qmd`` files plus an asset manifest.

    Every ``<item>`` with status ``publish`` (except attachments and nav menu
    items) becomes one ``.qmd`` file made of YAML front matter and the
    Markdown rendering of its ``content:encoded`` HTML:

    - ``post`` items go to ``<out_root>/blog/<slug>.qmd``
      (``untitled.qmd`` when the slug is empty);
    - the item with slug ``home`` goes to ``<out_root>/index.qmd``;
    - the item with slug ``blog`` goes to ``<out_root>/blog/index.qmd`` when
      ``allow_blog_page_overwrite`` is true, else ``<out_root>/pages/blog/index.qmd``;
    - any other item goes to ``<out_root>/[<parent path>/]<slug>/index.qmd``.

    Two items that resolve to the same file never overwrite each other within
    one run: the later one receives a ``-2``, ``-3``, ... suffix.

    All URLs found in the converted items are written to
    ``_asset_manifest.txt`` (one URL per line) and ``_asset_manifest.csv``
    (URL plus the first item that referenced it). Both manifests are rewritten
    on every run, whatever the value of ``overwrite``.

    Args:
        input_xml: Path of the WXR file.
        out_root: Output folder; created if missing.
        base_site_url: Site origin used to resolve root-relative URLs.
        preserve_hierarchy: Nest pages below their ``wp:post_parent`` chain.
        overwrite: Replace files left by earlier runs instead of writing
            suffixed copies next to them.
        allow_blog_page_overwrite: See the ``blog`` slug rule above.

    Returns:
        ``(number of .qmd files written, number of unique asset URLs)``.

    Raises:
        FileNotFoundError: ``input_xml`` does not exist.
        ValueError: The document has no ``<channel>`` element.
    """
    if not os.path.exists(input_xml):
        raise FileNotFoundError(f"XML not found: {input_xml}")

    os.makedirs(out_root, exist_ok=True)

    manifest_txt = os.path.join(out_root, "_asset_manifest.txt")
    manifest_csv = os.path.join(out_root, "_asset_manifest.csv")

    if overwrite:
        for p in (manifest_txt, manifest_csv):
            if os.path.exists(p):
                os.remove(p)

    write_blog_index(out_root, overwrite=overwrite)

    tree = ET.parse(input_xml)
    root = tree.getroot()
    channel = root.find("channel")
    if channel is None:
        raise ValueError("Invalid WXR: missing <channel>")

    pages_map = build_pages_map(channel)

    # url -> CSV row; the first item that references a URL owns its row.
    manifest_rows: dict[str, list[str]] = {}
    # Normalized paths written during this run, used to avoid clobbering them.
    written_paths: set[str] = set()
    count_written = 0

    for item in channel.findall("item"):
        post_type = get_text(item, "wp:post_type", NAMESPACES)
        status = get_text(item, "wp:status", NAMESPACES)

        if status != "publish":
            continue
        if post_type in {"attachment", "nav_menu_item"}:
            continue

        title = get_text(item, "title")
        post_name = get_text(item, "wp:post_name", NAMESPACES)
        post_id = get_text(item, "wp:post_id", NAMESPACES)
        parent_id = get_text(item, "wp:post_parent", NAMESPACES)
        pub_date = get_text(item, "pubDate")
        author = get_text(item, "dc:creator", NAMESPACES)
        excerpt = get_text(item, "excerpt:encoded", NAMESPACES)

        categories, tags = [], []
        for cat in item.findall("category"):
            domain = (cat.get("domain") or "").strip()
            txt = (cat.text or "").strip()
            if not txt:
                continue
            if domain == "category":
                categories.append(txt)
            elif domain == "post_tag":
                tags.append(txt)

        raw_html = get_text(item, "content:encoded", NAMESPACES)

        # ---- Asset manifest extraction ----
        asset_urls = extract_asset_urls_from_raw_html(raw_html, base_site_url)

        # NOTE(review): attachment items are skipped above, so this only fires
        # for a non-attachment item that carries wp:attachment_url - confirm
        # whether attachment URLs were meant to be listed as well.
        attach_url = get_text(item, "wp:attachment_url", NAMESPACES)
        if attach_url:
            asset_urls.add(normalize_url(attach_url, base_site_url))

        for u in asset_urls:
            if u and u not in manifest_rows:
                manifest_rows[u] = [u, post_type, post_name or "", post_id or ""]

        # ---- HTML -> Markdown ----
        body_md = clean_content_to_markdown(raw_html, base_site_url)
        front = format_yaml(title, pub_date, categories, tags, author, excerpt)
        final_content = front + body_md + "\n"

        # ---- Output path ----
        filename = "index.qmd"

        if post_type == "post":
            target_dir = os.path.join(out_root, "blog")
            filename = f"{post_name}.qmd" if post_name else "untitled.qmd"
        elif post_name == "home":
            target_dir = out_root
        elif post_name == "blog":
            target_dir = os.path.join(out_root, "blog") if allow_blog_page_overwrite else os.path.join(out_root, "pages", "blog")
        elif preserve_hierarchy:
            parent_path = get_parent_slug_path(pages_map, parent_id)
            target_dir = os.path.join(out_root, parent_path, post_name) if parent_path else os.path.join(out_root, post_name)
        else:
            target_dir = os.path.join(out_root, post_name)

        os.makedirs(target_dir, exist_ok=True)
        full_path = _reserve_output_path(
            os.path.join(target_dir, filename), written_paths, overwrite
        )

        with open(full_path, "w", encoding="utf-8", newline="\n") as f:
            f.write(final_content)

        count_written += 1

    # ---- Write manifest files ----
    sorted_urls = sorted(manifest_rows)

    with open(manifest_txt, "w", encoding="utf-8", newline="\n") as f:
        for u in sorted_urls:
            f.write(u + "\n")

    with open(manifest_csv, "w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(["url", "post_type", "slug", "post_id"])
        for u in sorted_urls:
            w.writerow(manifest_rows[u])

    print(f"Extraction complete! {count_written} QMD files created.")
    print(f"Asset manifest: {manifest_txt} ({len(manifest_rows)} unique URLs)")
    return count_written, len(manifest_rows)


In [4]:
# Run the conversion with the values defined in the CONFIG cell.
count_qmd, count_assets = convert_wxr_to_quarto(
    input_xml=INPUT_XML_FILE,
    out_root=OUTPUT_DIR,
    base_site_url=BASE_SITE_URL,
    preserve_hierarchy=PRESERVE_HIERARCHY,
    overwrite=OVERWRITE,
    allow_blog_page_overwrite=ALLOW_BLOG_PAGE_OVERWRITE,
)

# Show (files written, unique asset URLs) as the cell result.
count_qmd, count_assets


Extraction complete! 138 QMD files created.
Asset manifest: C:\Users\benny\OneDrive\Documents\Github\site\temp3\_asset_manifest.txt (1969 unique URLs)


(138, 1969)

## End of code