In [9]:
import json
from pathlib import Path

# -----------------------------
# CONFIG
# -----------------------------
# Project root; every input and output path below is derived from it.
BASE_DIR = Path("/content/drive/MyDrive/Upwork/david-grass/pilot_folder")

# Input JSON files, both located in the "inputs" subfolder.
BUNDLE_MANIFEST_PATH = BASE_DIR.joinpath("inputs", "bundle_manifest.json")
SITEMAP_ANALYSIS_PATH = BASE_DIR.joinpath("inputs", "sitemap_analysis.json")

# Destination of the generated processing index.
OUTPUT_PATH = BASE_DIR.joinpath("processing_index.json")

# Page types that are always excluded from processing.
SKIP_PAGE_TYPES = {"homepage", "category_landing", "blog", "cms"}

# -----------------------------
# LOAD INPUT FILES
# -----------------------------
# Both inputs are read as UTF-8 text and parsed as JSON.
bundle_manifest = json.loads(BUNDLE_MANIFEST_PATH.read_text(encoding="utf-8"))
sitemap_analysis = json.loads(SITEMAP_ANALYSIS_PATH.read_text(encoding="utf-8"))

# De-duplicate the manifest's URL list; a manifest without a
# "selected_urls" key yields an empty set.
selected_urls = set(bundle_manifest.get("selected_urls", []))

# -----------------------------
# BUILD URL -> SITEMAP LOOKUP
# -----------------------------
# Index the sitemap rows by their "url" value. Rows whose "url" is missing
# or empty are left out; if a URL appears more than once, the last row wins.
sitemap_by_url = {
    entry["url"]: entry
    for entry in sitemap_analysis
    if entry.get("url")
}

# -----------------------------
# PROCESS SELECTED URLS
# -----------------------------
# Build one index record per selected URL that survives the skip rules.
# Problems are collected in `errors` instead of being raised, so a single
# bad URL does not abort the whole run.
processing_index = []
errors = []

# Sorted so the order of the index (and of the error log) is deterministic.
for url in sorted(selected_urls):
    sitemap_row = sitemap_by_url.get(url)

    if not sitemap_row:
        errors.append({
            "url": url,
            "reason": "URL not found in sitemap_analysis.json"
        })
        continue

    status = sitemap_row.get("status")
    page_type = sitemap_row.get("page_type")

    # Skip explicitly marked or unsupported pages. These are dropped
    # silently: they are neither indexed nor logged as errors.
    if status == "skip" or page_type in SKIP_PAGE_TYPES:
        continue

    # `or ""` covers both a missing "dir" key and an explicit null value;
    # joining a Path with None would raise TypeError and stop the run.
    dir_path = BASE_DIR / (sitemap_row.get("dir") or "")
    page_html_path = dir_path / "page.html"
    tables_json_path = dir_path / "tables.json"

    record = {
        "source_url": url,
        "slug": sitemap_row.get("slug"),
        "page_type": page_type,
        "status": status,
        "dir": str(dir_path),
        "page_html_path": str(page_html_path),
        "tables_json_path": str(tables_json_path),
        "has_page_html": page_html_path.exists(),
        "has_tables_json": tables_json_path.exists(),
    }

    # Track missing critical files. Only page.html is treated as critical;
    # a missing tables.json is recorded via "has_tables_json" but not logged.
    if not record["has_page_html"]:
        errors.append({
            "url": url,
            "reason": "Missing page.html",
            "expected_path": str(page_html_path)
        })

    # The record is kept even when page.html is missing, so the index and
    # the error log can both contain the same URL.
    processing_index.append(record)

# -----------------------------
# WRITE OUTPUTS
# -----------------------------
# The index is always written, even when it is empty.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(processing_index, f, indent=2, ensure_ascii=False)

# The error log is only written when there is something to report.
if errors:
    # Write the log next to the index rather than into the current working
    # directory, so both outputs of this step end up in the project folder.
    errors_path = OUTPUT_PATH.with_name("processing_index_errors.json")
    with open(errors_path, "w", encoding="utf-8") as f:
        json.dump(errors, f, indent=2, ensure_ascii=False)

# Summary. Selected URLs that were skipped appear in neither count below,
# so entries + errors can be smaller than the number of selected URLs.
print("Step 1 complete.")
print(f"Total selected URLs: {len(selected_urls)}")
print(f"Processing index entries: {len(processing_index)}")
print(f"Errors logged: {len(errors)}")

Step 1 complete.
Total selected URLs: 386
Processing index entries: 144
Errors logged: 0
