In [9]:
# | default_exp content_mapper

In [10]:
# | export
from pathlib import Path
from typing import Optional
from seo_rat.index_tracking import fetch_sitemap_urls


In [11]:
# | export
def url_to_relpath(url: str, base_path: str, site_url: str) -> Optional[Path]:
    """Translate a site URL into an extension-less path under ``base_path``.

    The ``site_url`` prefix and a trailing ``.html`` are stripped from ``url``
    and the remainder is joined onto ``base_path``.

    Args:
        url: Absolute URL of a published page.
        base_path: Local directory that holds the site's sources.
        site_url: Root URL of the site, with or without a trailing slash.

    Returns:
        The candidate path (without a source extension), or ``None`` when
        ``url`` does not start with ``site_url`` or an argument is not a string.
    """
    try:
        # A URL from another site cannot be mapped; without this check the
        # whole URL would be appended to base_path as a bogus relative path.
        if site_url and not url.startswith(site_url):
            return None
        url_cleaned = url.removeprefix(site_url).removesuffix(".html")
    except (AttributeError, TypeError) as e:
        # Only non-string inputs (e.g. None) can fail here; stay best-effort.
        print(f"Error processing URL: {url} with site URL: {site_url}. Error: {e}")
        return None
    # When site_url has no trailing slash the remainder starts with "/", and
    # joining an absolute component would silently discard base_path.
    return Path(base_path) / url_cleaned.lstrip("/")


In [12]:
# | export
# Source extensions to probe, in order of preference.
SOURCE_EXTS = (".qmd", ".md", ".ipynb")


def find_source_file(rel_path: Path, exts=SOURCE_EXTS) -> Optional[str]:
    """Return the first existing source file for an extension-less path.

    Each extension in ``exts`` is first *appended* to the file name, so names
    that contain dots (``post.v2`` -> ``post.v2.qmd``) are matched exactly.
    If nothing is found and ``rel_path`` has a suffix, the suffix is
    *replaced* instead (``feed.xml`` -> ``feed.qmd``), which preserves the
    previous lookup behaviour.

    Args:
        rel_path: Path of the page without its source extension.
        exts: Extensions to try, in priority order.

    Returns:
        The matching path as a string, or ``None`` if no candidate exists.
    """
    # Pass 1: append the extension; with_suffix() would clobber anything after
    # the last dot of the name and miss files such as "post.v2.qmd".
    for ext in exts:
        candidate = rel_path.with_name(rel_path.name + ext)
        if candidate.exists():
            return str(candidate)
    # Pass 2: suffix replacement, only meaningful when there is a suffix
    # (otherwise it yields the same candidates as pass 1).
    if rel_path.suffix:
        for ext in exts:
            candidate = rel_path.with_suffix(ext)
            if candidate.exists():
                return str(candidate)
    return None


In [13]:
# |export
def url_to_file_path(url: str, base_path: str, site_url: str) -> Optional[str]:
    """Resolve a website URL to the local source file behind it.

    The URL is first converted to a candidate path with ``url_to_relpath``;
    that candidate is then handed to ``find_source_file``. ``None`` is
    returned when either step yields nothing.
    """
    relative = url_to_relpath(url, base_path, site_url)
    if not relative:
        return None
    return find_source_file(relative)


In [14]:
# | export
from seo_rat.gsc_storage import normalize_url

def map_all_urls_to_files(
    base_path: str, site_url: str, urls: list[str]
) -> dict[str, str]:
    """Build a mapping from normalized URL to local source file path.

    URLs for which ``url_to_file_path`` finds no source file are left out.
    Keys are passed through ``normalize_url``; if two URLs normalize to the
    same key, the later one in ``urls`` wins.
    """
    url_to_source: dict[str, str] = {}
    for page_url in urls:
        source_file = url_to_file_path(page_url, base_path, site_url)
        if not source_file:
            continue
        url_to_source[normalize_url(page_url)] = source_file
    return url_to_source


In [15]:
# | hide
#| eval: false
import os

from dotenv import load_dotenv

# Load variables from a .env file into the process environment, then read
# the local site root from SEO_RAT_BASE_PATH (KeyError if it is not set).
load_dotenv()
BASE_PATH = os.environ["SEO_RAT_BASE_PATH"]

# Collect the sitemap's URLs and pair each with its local source file; the
# resulting dict is the cell's displayed output.
urls = fetch_sitemap_urls("https://kareemai.com/sitemap.xml")
map_all_urls_to_files(
    base_path=BASE_PATH,
    site_url="https://kareemai.com/",
    urls=urls,
)


{'https://kareemai.com/papers.html': '/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/papers.qmd',
 'https://kareemai.com/oss/opensource.html': '/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/oss/opensource.qmd',
 'https://kareemai.com/til/tils/2025 12 15.html': '/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/til/tils/2025-12-15.md',
 'https://kareemai.com/til/tils/2025 12 13.html': '/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/til/tils/2025-12-13.md',
 'https://kareemai.com/til/tils/2025 06 06 til.html': '/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/til/tils/2025-06-06-til.qmd',
 'https://kareemai.com/til/tils/2025 05 25 til.html': '/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/til/tils/2025-05-25-til.qmd',
 'https://kareemai.com/til/tils/2025 05 21 til.html': '/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/til/tils/2025-05-21-til.qmd',
 'https://kareemai.com/til/tils/2025 05 19 til.html': '/home/kobo/Desktop/obsidian_valuts/logseq/karem-s

In [16]:
# Sanity check: show the base path read from SEO_RAT_BASE_PATH.
BASE_PATH

'/home/kobo/Desktop/obsidian_valuts/logseq/karem-site/'