In [9]:
import json
from pathlib import Path

# -----------------------------
# CONFIG
# -----------------------------
# Project root: every input and output path of this step is derived from it.
BASE_DIR = Path("/content/drive/MyDrive/Upwork/david-grass/pilot_folder")

# Both input JSON files sit in the same "inputs" sub-folder of the project root.
INPUTS_DIR = BASE_DIR / "inputs"
BUNDLE_MANIFEST_PATH = INPUTS_DIR / "bundle_manifest.json"
SITEMAP_ANALYSIS_PATH = INPUTS_DIR / "sitemap_analysis.json"

# The processing index is written directly into the project root.
OUTPUT_PATH = BASE_DIR / "processing_index.json"

# Page types that are always left out of the processing index.
SKIP_PAGE_TYPES = {"homepage", "category_landing", "blog", "cms"}

# -----------------------------
# LOAD INPUT FILES
# -----------------------------
# Parse both input files; each is read as UTF-8 text and decoded as JSON.
bundle_manifest = json.loads(BUNDLE_MANIFEST_PATH.read_text(encoding="utf-8"))
sitemap_analysis = json.loads(SITEMAP_ANALYSIS_PATH.read_text(encoding="utf-8"))

# De-duplicate the manifest's URL list; a manifest without the key gives an empty set.
selected_urls = set(bundle_manifest.get("selected_urls", []))

# -----------------------------
# BUILD URL → SITEMAP LOOKUP
# -----------------------------
# Rows whose "url" is missing or empty are left out; when the same URL appears
# more than once, the row that comes last in the file is the one kept.
sitemap_by_url = {
    row.get("url"): row
    for row in sitemap_analysis
    if row.get("url")
}

# -----------------------------
# PROCESS SELECTED URLS
# -----------------------------
processing_index = []
errors = []

# Iterate in sorted order so the index comes out in the same order on every run.
for url in sorted(selected_urls):
    sitemap_row = sitemap_by_url.get(url)

    # A selected URL without a sitemap row cannot be mapped to a folder: log it and move on.
    if not sitemap_row:
        errors.append({"url": url, "reason": "URL not found in sitemap_analysis.json"})
        continue

    status, page_type = sitemap_row.get("status"), sitemap_row.get("page_type")

    # Pages flagged "skip" and pages of an excluded type are dropped without an error entry.
    if status == "skip" or page_type in SKIP_PAGE_TYPES:
        continue

    # The row's "dir" is joined onto the project root; both artefacts are looked up inside it.
    dir_path = BASE_DIR / sitemap_row.get("dir", "")
    page_html_path = dir_path / "page.html"
    tables_json_path = dir_path / "tables.json"

    has_page_html = page_html_path.exists()
    has_tables_json = tables_json_path.exists()

    record = {
        "source_url": url,
        "slug": sitemap_row.get("slug"),
        "page_type": page_type,
        "status": status,
        "dir": str(dir_path),
        "page_html_path": str(page_html_path),
        "tables_json_path": str(tables_json_path),
        "has_page_html": has_page_html,
        "has_tables_json": has_tables_json,
    }
    processing_index.append(record)

    # Only page.html is treated as critical: the page stays in the index, but its
    # absence is logged. A missing tables.json is recorded in the flag alone.
    if not has_page_html:
        errors.append({
            "url": url,
            "reason": "Missing page.html",
            "expected_path": str(page_html_path),
        })

# -----------------------------
# WRITE OUTPUTS
# -----------------------------
# Keep the error log next to the index under BASE_DIR. A bare relative filename
# would be resolved against the kernel's current working directory, which separates
# the log from the other artefacts of this step.
ERRORS_PATH = BASE_DIR / "processing_index_errors.json"

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(processing_index, f, indent=2, ensure_ascii=False)

if errors:
    with open(ERRORS_PATH, "w", encoding="utf-8") as f:
        json.dump(errors, f, indent=2, ensure_ascii=False)
elif ERRORS_PATH.exists():
    # A clean run must not leave an earlier run's error log behind, otherwise the
    # stale file reads as if it described the current index.
    ERRORS_PATH.unlink()

# Summary of this run. Selected URLs that were skipped (status "skip" or an excluded
# page type) are counted in the first figure only.
print("Step 1 complete.")
print(f"Total selected URLs: {len(selected_urls)}")
print(f"Processing index entries: {len(processing_index)}")
print(f"Errors logged: {len(errors)}")

Step 1 complete.
Total selected URLs: 386
Processing index entries: 144
Errors logged: 0


In [15]:
import json
import re
from pathlib import Path
from typing import Optional, Tuple

# -----------------------------
# CONFIG
# -----------------------------
# NOTE: BASE_DIR is not assigned in this cell. It has to exist in the kernel
# namespace already (the Step 1 cell assigns it), so run that cell first. To run
# this cell on its own, define it here instead, e.g. BASE_DIR = Path(".").
# Input: the processing index JSON located directly under BASE_DIR.
PROCESSING_INDEX_PATH = BASE_DIR / "processing_index.json"
# Output: the SKU index. This rebinds OUTPUT_PATH, which the Step 1 cell also assigns.
OUTPUT_PATH = BASE_DIR / "sku_index.json"
# Output: QA warnings; the file is only written when at least one warning exists.
WARNINGS_PATH = BASE_DIR / "sku_index_warnings.json"

# -----------------------------
# BULLETPROOF SKU REGEXES
# -----------------------------

# Capture group shared by all three patterns: 5 to 30 characters drawn from letters,
# digits, underscore and hyphen. Every pattern is compiled with IGNORECASE, so
# lowercase letters are accepted as well.
SKU_VALUE_PATTERN = r'([A-Z0-9_-]{5,30})'

# A JSON-style pair: "SKU": "<value>"
RE_KLAVIYO_SKU = re.compile(rf'"SKU"\s*:\s*"{SKU_VALUE_PATTERN}"', re.IGNORECASE)

# An HTML attribute: data-product-sku="<value>" (double quotes only)
RE_DATA_PRODUCT_SKU = re.compile(rf'data-product-sku\s*=\s*"{SKU_VALUE_PATTERN}"', re.IGNORECASE)

# Visible markup: <label ...>SKU:</label> followed by <span ...><value></span>
RE_VISIBLE_SKU = re.compile(
    rf'<label[^>]*>\s*SKU\s*:\s*</label>\s*'
    rf'<span[^>]*>\s*{SKU_VALUE_PATTERN}\s*</span>',
    re.IGNORECASE | re.DOTALL,
)


# -----------------------------
# SKU EXTRACTION
# -----------------------------
def extract_sterlitech_sku(html: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Return the first SKU found in a product page's HTML and the source it came from.

    The sources are tried in a fixed order and the first one that matches wins:
      1. Klaviyo JS              -> "klaviyo_js"
      2. data-product-sku        -> "data_product_sku"
      3. Visible attribute block -> "visible_attribute"

    Args:
        html: Full HTML text of the page.

    Returns:
        A ``(sku, source)`` pair, or ``(None, None)`` when no pattern matches.
    """
    # Ordered by priority; built on each call so the module-level patterns are
    # looked up at call time, just as separate search statements would do.
    sources_by_priority = (
        (RE_KLAVIYO_SKU, "klaviyo_js"),
        (RE_DATA_PRODUCT_SKU, "data_product_sku"),
        (RE_VISIBLE_SKU, "visible_attribute"),
    )

    for pattern, source_name in sources_by_priority:
        found = pattern.search(html)
        if found:
            return found.group(1), source_name

    return None, None


# -----------------------------
# LOAD INPUT
# -----------------------------
# Read the processing index as UTF-8 text and decode it as JSON.
processing_index = json.loads(PROCESSING_INDEX_PATH.read_text(encoding="utf-8"))

# Accumulators: one entry per page with a SKU, and one per page that produced a warning.
sku_index, qa_warnings = [], []


# -----------------------------
# MAIN LOOP
# -----------------------------
for page in processing_index:
    source_url, slug = page["source_url"], page["slug"]
    page_html_path = Path(page["page_html_path"])

    print(f"Processing page: {source_url}")

    # Guard: without the saved HTML there is nothing to scan, so warn and move on.
    if not page_html_path.exists():
        qa_warnings.append({"source_url": source_url, "reason": "page.html missing"})
        print("  ⚠ page.html missing")
        continue

    # Bytes that are not valid UTF-8 are dropped instead of raising.
    html = page_html_path.read_text(encoding="utf-8", errors="ignore")
    sku, sku_source = extract_sterlitech_sku(html)

    # Guard: a page where no pattern matched is reported as a warning, not indexed.
    if not sku:
        qa_warnings.append({"source_url": source_url, "reason": "No SKU found in HTML"})
        print("  ⚠ No SKU found")
        continue

    sku_index.append({
        "source_url": source_url,
        "page_slug": slug,
        "sku": sku,
        "sku_source": sku_source,
    })
    print(f"  → SKU found: {sku} ({sku_source})")


# -----------------------------
# WRITE OUTPUTS
# -----------------------------
# The SKU index is always written, even when it is empty.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(sku_index, f, indent=2, ensure_ascii=False)

if qa_warnings:
    with open(WARNINGS_PATH, "w", encoding="utf-8") as f:
        json.dump(qa_warnings, f, indent=2, ensure_ascii=False)
elif WARNINGS_PATH.exists():
    # A clean run must not leave an earlier run's warnings behind, otherwise the
    # stale file reads as if it described the current SKU index.
    WARNINGS_PATH.unlink()

# Summary of this run. "Total pages processed" counts every index entry, including
# the ones that only produced a warning.
print("\nStep 2 complete.")
print(f"Total pages processed: {len(processing_index)}")
print(f"SKUs extracted: {len(sku_index)}")
print(f"Warnings: {len(qa_warnings)}")

Processing page: https://www.sterlitech.com/50mm-disc-capsule-glass-fiber-5-um-prefilter-over-0-20um-pes-final-filter-1-8-barb-inlet-outlet.html
  → SKU found: 1470016 (klaviyo_js)
Processing page: https://www.sterlitech.com/50mm-disc-capsule-glass-fiber-5-um-prefilter-over-0-45um-pes-final-filter-1-8-barb-inlet-outlet.html
  → SKU found: 1470017 (klaviyo_js)
Processing page: https://www.sterlitech.com/50mm-disc-capsule-glass-fiber-5-um-prefilter-over-0-80um-pes-final-filter-1-8-barb-inlet-outlet.html
  → SKU found: 1470018 (klaviyo_js)
Processing page: https://www.sterlitech.com/aluminum-oxide-membrane-filters-0-2-micron-13-mm-10-pk.html
  → SKU found: 1360007 (klaviyo_js)
Processing page: https://www.sterlitech.com/aluminum-oxide-membrane-filters-1360001.html
  → SKU found: 1360001 (klaviyo_js)
Processing page: https://www.sterlitech.com/aluminum-oxide-membrane-filters-1360002.html
  → SKU found: 1360002 (klaviyo_js)
Processing page: https://www.sterlitech.com/aluminum-oxide-membrane