In [None]:
# lambda_function.py
# -*- coding: utf-8 -*-

import os
import json
import time
import base64
import mimetypes
import logging
import re
from typing import Optional, Dict, List, Any

import boto3
from botocore.config import Config
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# ───────────────────────── config / logging ─────────────────────────
log = logging.getLogger()
log.setLevel(logging.INFO)


def _env_text(name: str) -> str:
    """Return environment variable *name* with surrounding whitespace removed ("" if unset)."""
    return os.getenv(name, "").strip()


def _env_int(name: str, default: str) -> int:
    """Return environment variable *name* parsed as an int (*default* if unset)."""
    return int(os.getenv(name, default))


def _env_flag(name: str, default: str) -> bool:
    """Return True only when environment variable *name* equals "true" (case-insensitive)."""
    return os.getenv(name, default).lower() == "true"


# Required (DO NOT hardcode real values in public repo)
S3_BUCKET = _env_text("S3_BUCKET")
SN_INSTANCE = _env_text("SN_INSTANCE")
SN_USER = _env_text("SN_USER")

# Credentials (prefer Secrets Manager)
SECRET_ID = _env_text("SECRET_ID")  # preferred
SN_PASS = _env_text("SN_PASS")      # fallback (discouraged for production)

# KB restriction (no client-specific default in public repo)
KNOWLEDGE_BASE_NAME = _env_text("KNOWLEDGE_BASE_NAME")

# Optional
DEST_PREFIX = _env_text("DEST_PREFIX")
PAGE_SIZE = _env_int("PAGE_SIZE", "200")
ARTICLES_LIMIT = _env_int("ARTICLES_LIMIT", "0")  # 0 = no cap
STRICT_PUBLISHED = _env_flag("STRICT_PUBLISHED", "true")
STOP_AFTER_SECONDS = _env_int("STOP_AFTER_SECONDS", "840")  # ~14m

# File/sidecar controls
MAX_FILE_BYTES = _env_int("MAX_FILE_BYTES", "52428800")  # 50 MB
_DEFAULT_ATTACH_EXTS = (
    "pdf,doc,docx,xls,xlsx,ppt,pptx,html,jpg,jpeg,png,gif,webp,tif,tiff,svg,csv,txt,rtf,odt,ods,odp,zip"
)
# Comma-separated list; blank entries are dropped and the rest lower-cased.
ATTACH_EXT_WHITELIST = [
    ext.lower()
    for ext in (
        raw.strip()
        for raw in os.getenv("ATTACH_EXT_WHITELIST", _DEFAULT_ATTACH_EXTS).split(",")
    )
    if ext
]

TITLE_MAX_CHARS = _env_int("TITLE_MAX_CHARS", "200")
LOG_METADATA_BYTES = _env_flag("LOG_METADATA_BYTES", "false")
GENERATE_TEXT_FROM_HTML = _env_flag("GENERATE_TEXT_FROM_HTML", "false")
MAX_METADATA_BYTES = _env_int("MAX_METADATA_BYTES", "1800")  # budget for metadataAttributes JSON

# Public-safe URL template (no client paths)
SN_URL_TEMPLATE = os.getenv(
    "SN_URL_TEMPLATE",
    "{base}/kb_view.do?sysparm_article={number}",
)

# ───────────────────────── endpoints/clients ─────────────────────
SN_BASE = f"https://{SN_INSTANCE}.service-now.com" if SN_INSTANCE else ""


def _sn_endpoint(path: str) -> str:
    """Return SN_BASE joined with *path*, or "" when no instance is configured."""
    if not SN_BASE:
        return ""
    return f"{SN_BASE}{path}"


KB_TABLE_API = _sn_endpoint("/api/now/table/kb_knowledge")
KB_BASE_TABLE = _sn_endpoint("/api/now/table/kb_knowledge_base")
ARTICLE_CONTENT_API = _sn_endpoint("/api/sn_km_api/v1/knowledge/articles/")
ATTACH_LIST_API = _sn_endpoint("/api/now/attachment")
ATTACH_DOWNLOAD = _sn_endpoint("/api/now/attachment/")

# Both AWS clients share one retry configuration.
boto_cfg = Config(retries=dict(max_attempts=5, mode="standard"))
s3 = boto3.client("s3", config=boto_cfg)
secrets = boto3.client("secretsmanager", config=boto_cfg)

mimetypes.init()

# ───────────────────────── HTTP (retrying session) ──────────────
def _requests_session():
    """Create a requests session whose HTTPS GET calls are retried.

    Up to 5 retries with a 0.5 backoff factor are attempted for responses
    with status 429, 500, 502, 503 or 504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=("GET",),
    )
    adapter = HTTPAdapter(max_retries=retry_policy)

    session = requests.Session()
    session.mount("https://", adapter)
    return session


# Shared module-level session used by http_get().
HTTP = _requests_session()

# ───────────────────────── safety helpers ─────────────────────────
def validate_runtime_config() -> Optional[dict]:
    """
    Check that the mandatory settings are present.

    Returns None when everything is configured, otherwise an error dict
    naming the missing settings. Only setting *names* are reported, never
    their values, so the result is safe to log or return.
    """
    required = (
        ("S3_BUCKET", S3_BUCKET),
        ("SN_INSTANCE", SN_INSTANCE),
        ("SN_USER", SN_USER),
        # Either credential source satisfies this requirement.
        ("SECRET_ID (preferred) or SN_PASS", SECRET_ID or SN_PASS),
    )
    missing = [label for label, value in required if not value]

    if missing:
        return {
            "status": "error",
            "type": "config_error",
            "message": "Missing required configuration.",
            "missing": missing,
        }

    if KNOWLEDGE_BASE_NAME:
        return None

    return {
        "status": "error",
        "type": "config_error",
        "message": "KNOWLEDGE_BASE_NAME must be provided (no default in public repo).",
        "missing": ["KNOWLEDGE_BASE_NAME"],
    }

def mask(s: str, keep: int = 4) -> str:
    """Return the first *keep* characters of *s* followed by "...".

    Empty input yields ""; strings no longer than *keep* are returned as-is.
    """
    if not s:
        return ""
    if len(s) <= keep:
        return s
    return s[:keep] + "..."

# ───────────────────────── helpers ──────────────────────────────
def _prefix() -> str:
    """Return DEST_PREFIX without trailing slashes ("" when it is not set)."""
    if not DEST_PREFIX:
        return ""
    return DEST_PREFIX.rstrip("/")

def make_key(filename: str, number: Optional[str] = None) -> str:
    """Build the S3 object key for *filename*.

    The key is "<prefix>/<number>/<filename>", where the prefix and the
    number segment are each omitted when empty.
    """
    folder = _prefix()
    if number:
        folder = f"{folder}/{number}" if folder else number
    return f"{folder}/{filename}" if folder else filename

def s3_put_bytes(key: str, body: bytes, content_type: Optional[str] = None):
    """Upload *body* to S3_BUCKET under *key*, setting ContentType only when given."""
    put_kwargs = {"Bucket": S3_BUCKET, "Key": key, "Body": body}
    if content_type:
        put_kwargs["ContentType"] = content_type
    s3.put_object(**put_kwargs)

def s3_put_json(key: str, obj: dict):
    """Serialize *obj* as compact UTF-8 JSON and upload it to S3_BUCKET under *key*."""
    # Compact separators and raw (non-escaped) Unicode keep the object small.
    payload = json.dumps(obj, ensure_ascii=False, separators=(",", ":"))
    s3.put_object(
        Bucket=S3_BUCKET,
        Key=key,
        Body=payload.encode("utf-8"),
        ContentType="application/json",
    )

# Password resolved by get_password(); kept for the lifetime of the process so
# that each outgoing HTTP request does not trigger a Secrets Manager call.
_PASSWORD_CACHE: Optional[str] = None


def get_password() -> str:
    """
    Return the ServiceNow password.

    When SECRET_ID is set, the secret's SecretString is read from Secrets
    Manager:
      * a JSON object is searched for "SN_PASS", then "password", then the
        first non-empty string value;
      * anything else (plain text, or text that happens to parse as a JSON
        number/list/string) is used verbatim as the password.
    If no SECRET_ID is configured or the secret has no SecretString, SN_PASS
    is returned.

    The resolved value is cached in-process; a rotated secret is picked up on
    the next cold start.
    """
    global _PASSWORD_CACHE
    if _PASSWORD_CACHE is not None:
        return _PASSWORD_CACHE

    password = SN_PASS
    if SECRET_ID:
        response = secrets.get_secret_value(SecretId=SECRET_ID)
        secret_string = response.get("SecretString")
        if secret_string:
            try:
                parsed = json.loads(secret_string)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                password = (
                    parsed.get("SN_PASS")
                    or parsed.get("password")
                    or next(
                        (v for v in parsed.values() if isinstance(v, str) and v),
                        SN_PASS,
                    )
                )
            else:
                # Not a JSON object (e.g. an all-digit password parses as an
                # int): treat the raw secret text as the password.
                password = secret_string

    _PASSWORD_CACHE = password
    return password

def auth_header() -> Dict[str, str]:
    """Return HTTP Basic auth headers for SN_USER that also request a JSON response."""
    credentials = f"{SN_USER}:{get_password()}".encode()
    encoded = base64.b64encode(credentials).decode()
    return {
        "Authorization": f"Basic {encoded}",
        "Accept": "application/json",
    }

def http_get(url: str, params: Optional[dict] = None, stream: bool = False, timeout=(10, 60)) -> requests.Response:
    """Perform an authenticated GET through the shared session.

    Raises requests.HTTPError (via raise_for_status) for 4xx/5xx responses;
    otherwise returns the response object.
    """
    request_options = {
        "headers": auth_header(),
        "params": params,
        "stream": stream,
        "timeout": timeout,
    }
    response = HTTP.get(url, **request_options)
    response.raise_for_status()
    return response

# ── KB id handling ───────────────────────────────────────────────
KB_RE = re.compile(r"^KB\d{4,}$", re.IGNORECASE)


def coerce_kb_number(*candidates: Optional[str]) -> Optional[str]:
    """Return the first candidate that can be turned into a KB number.

    Each candidate is stripped and upper-cased. A value already shaped like
    "KB" + at least four digits is returned as-is; otherwise the first run of
    four or more digits is zero-padded to seven digits and prefixed with "KB".
    Returns None when no candidate qualifies.
    """
    # Lazy generator: candidates are normalized only as they are reached.
    normalized = ((candidate or "").strip().upper() for candidate in candidates)
    for text in filter(None, normalized):
        if KB_RE.match(text):
            return text
        digit_run = re.search(r"(\d{4,})", text)
        if digit_run:
            return f"KB{digit_run.group(1).zfill(7)}"
    return None

def build_sn_url(number: str) -> str:
    """Fill SN_URL_TEMPLATE with SN_BASE and the stripped *number*.

    Best-effort: any failure while formatting yields "".
    """
    try:
        article_number = (number or "").strip()
        url = SN_URL_TEMPLATE.format(base=SN_BASE, number=article_number)
    except Exception:
        return ""
    else:
        return url

def guess_content_type(name: str) -> str:
    """Return the lower-cased MIME type guessed from *name*.

    Falls back to "application/octet-stream" when the type is unknown.
    """
    guessed, _encoding = mimetypes.guess_type(name)
    if not guessed:
        guessed = "application/octet-stream"
    return guessed.lower()

def to_text(v: Any) -> str:
    """Flatten an arbitrary value into a display string.

    * None -> ""
    * str/int/float/bool -> str(v)
    * dict -> the first truthy value among the well-known display keys,
      else the first truthy scalar value, else ""
    * list/tuple/set -> up to five non-empty element texts joined by ", "
    * bytes/bytearray -> UTF-8 decoded, undecodable bytes dropped
    * anything else -> str(v), or "" if that raises
    """
    if v is None:
        return ""

    scalar_types = (str, int, float, bool)
    if isinstance(v, scalar_types):
        return str(v)

    if isinstance(v, dict):
        display_keys = ("display_value", "display", "name", "label", "text", "value")
        preferred = next(
            (str(v[key]) for key in display_keys if key in v and v[key]),
            None,
        )
        if preferred is not None:
            return preferred
        return next(
            (str(item) for item in v.values() if isinstance(item, scalar_types) and item),
            "",
        )

    if isinstance(v, (list, tuple, set)):
        pieces = []
        for element in v:
            # Stop before converting further elements once five were collected.
            if len(pieces) >= 5:
                break
            piece = to_text(element)
            if piece:
                pieces.append(piece)
        return ", ".join(pieces)

    try:
        if isinstance(v, (bytes, bytearray)):
            return v.decode("utf-8", errors="ignore")
        return str(v)
    except Exception:
        return ""

def clamp(v: Any, n: int) -> str:
    """Return the text form of *v* (see to_text) cut to at most *n* characters."""
    text = to_text(v)
    return text[:n]

def display_only(v: Any) -> str:
    """Return a human-readable value, never a link.

    For a dict, the first truthy entry among the display keys is returned
    (stripped), or "" if none is set. For anything else the stripped text
    form is returned, unless it starts with "http", in which case "" is
    returned.
    """
    if not isinstance(v, dict):
        text = to_text(v).strip()
        if text.startswith("http"):
            return ""
        return text

    display_keys = ("display_value", "display", "name", "label", "value", "text")
    chosen = next((v[key] for key in display_keys if v.get(key)), None)
    if chosen is None:
        return ""
    return str(chosen).strip()

def enforce_metadata_budget(meta_blocks: dict, budget_bytes: int) -> dict:
    """Return a copy of *meta_blocks* shrunk towards *budget_bytes*.

    Size is measured as the compact JSON encoding of
    {"metadataAttributes": <blocks>}. Reductions are applied in order, and
    the function returns as soon as the copy fits:
      1. truncate nonFilterable title (256), author (120), created_by (120);
      2. drop nonFilterable author, then created_by;
      3. drop filterable kb_category (returned afterwards without re-checking).
    The input dict is never modified.
    """
    # JSON round-trip gives an independent deep copy to trim.
    trimmed = json.loads(json.dumps(meta_blocks))

    def fits() -> bool:
        wrapped = {"metadataAttributes": trimmed}
        encoded = json.dumps(wrapped, separators=(",", ":")).encode("utf-8")
        return len(encoded) <= budget_bytes

    if fits():
        return trimmed

    extras = trimmed.get("nonFilterable", {})
    length_limits = (("title", 256), ("author", 120), ("created_by", 120))
    for field, limit in length_limits:
        if field not in extras:
            continue
        current = extras[field]
        if isinstance(current, str) and len(current) > limit:
            extras[field] = current[:limit].rstrip()
    if fits():
        return trimmed

    for field in ("author", "created_by"):
        if field not in extras:
            continue
        del extras[field]
        if fits():
            return trimmed

    filterable = trimmed.get("filterable", {})
    if "kb_category" in filterable:
        filterable.pop("kb_category", None)
    return trimmed

def build_metadata_blocks(
    *,
    number: str,
    kb_category: Optional[str],
    content_type: str,
    title: str,
    author: Optional[str],
    sys_id: str,
    created_by: Optional[str],
    sn_url: str,
) -> dict:
    """Assemble the {"filterable": ..., "nonFilterable": ...} metadata blocks.

    Filterable values are length-capped; empty non-filterable values are
    omitted. The result is passed through enforce_metadata_budget with
    MAX_METADATA_BYTES before being returned.
    """

    def compact_size(obj: dict) -> int:
        return len(json.dumps(obj, separators=(",", ":")).encode("utf-8"))

    filterable = {
        "number": (number or "").strip()[:80],
        "kb_category": display_only(kb_category)[:80],
        "content_type": (content_type or "").strip()[:40],
    }

    # Filterable safety (keep tiny): tighten kb_category first, then number.
    if compact_size(filterable) > 2048:
        category = filterable.get("kb_category", "")
        if len(category) > 40:
            filterable["kb_category"] = category[:40]
        if compact_size(filterable) > 2048:
            filterable["number"] = filterable["number"][:40]

    candidates = {
        "sn_url": (sn_url or "").strip(),
        "title": (title or "").strip()[:512],
        "author": display_only(author)[:256],
        "sys_id": (sys_id or "").strip(),
        "created_by": display_only(created_by)[:256],
    }
    non_filterable = {field: value for field, value in candidates.items() if value}

    return enforce_metadata_budget(
        {"filterable": filterable, "nonFilterable": non_filterable},
        MAX_METADATA_BYTES,
    )

# ───────────────────────── KB restriction ──────────────────────
def get_kb_sys_id_by_name(name: str) -> Optional[str]:
    """
    Resolve a knowledge base name to its sys_id.

    An exact match on title or name is tried first; if nothing is found a
    LIKE (contains) query is used. Within each result set, rows that actually
    match *name* (case-insensitively) are preferred over the rest.

    The query is sent through ``params`` so the name is URL-encoded; names
    containing characters such as ``&``, ``#``, ``+`` or ``%`` no longer
    corrupt the request.

    Returns None when no knowledge base matches. HTTP errors from http_get
    propagate to the caller.
    """
    # NOTE: No debug printing of URL or full SN responses (public-safe)
    # NOTE(review): a "^" inside the name is still read by ServiceNow as an
    # encoded-query operator; confirm whether such names need escaping.
    wanted = name.lower()

    def _search(encoded_query: str, limit: int) -> List[dict]:
        params = {
            "sysparm_query": encoded_query,
            "sysparm_fields": "sys_id,title,name",
            "sysparm_limit": str(limit),
        }
        return http_get(KB_BASE_TABLE, params=params).json().get("result", []) or []

    def _is_exact(row: dict) -> bool:
        return wanted in (
            str(row.get("title", "")).lower(),
            str(row.get("name", "")).lower(),
        )

    def _contains_name(row: dict) -> bool:
        combined = (str(row.get("title", "")) + str(row.get("name", ""))).lower()
        return wanted in combined

    rows = _search(f"title={name}^ORname={name}", 5)
    if rows:
        # First exact match wins; otherwise keep the server's first row.
        best = next((row for row in rows if _is_exact(row)), rows[0])
        return best.get("sys_id")

    rows = _search(f"titleLIKE{name}^ORnameLIKE{name}", 10)
    if not rows:
        return None

    best = next((row for row in rows if _contains_name(row)), rows[0])
    return best.get("sys_id")

# ─────────────────────── SN fetchers (articles) ─────────────────
def list_kb_articles(kb_sys_id: str, page_size: int, cap: int, strict_published: bool) -> List[Dict[str, str]]:
    """
    List the articles of one knowledge base, page by page.

    Args:
        kb_sys_id: sys_id of the knowledge base to list.
        page_size: rows requested per page; must be positive.
        cap: maximum number of articles to return; 0 or less means no cap.
        strict_published: when true, only rows with workflow_state=published
            are requested.

    Returns:
        A list of dicts with the keys "sys_id", "number" and
        "short_description".

    Raises:
        ValueError: if page_size is not positive (the paging loop could not
            terminate).
        requests exceptions raised by http_get propagate unchanged.
    """
    if page_size <= 0:
        raise ValueError("page_size must be a positive integer")

    clauses = [f"kb_knowledge_base={kb_sys_id}"]
    if strict_published:
        clauses.append("workflow_state=published")
    # A fixed sort order keeps offset-based paging from skipping or repeating
    # rows between requests.
    clauses.append("ORDERBYsys_id")

    params = {
        "sysparm_query": "^".join(clauses),
        "sysparm_fields": "sys_id,number,short_description,kb_knowledge_base,workflow_state",
        "sysparm_limit": str(page_size),
    }

    results: List[Dict[str, str]] = []
    offset = 0
    while True:
        params["sysparm_offset"] = str(offset)
        batch = http_get(KB_TABLE_API, params=params).json().get("result", []) or []
        if not batch:
            break
        for row in batch:
            results.append({
                "sys_id": row.get("sys_id", ""),
                "number": row.get("number", "") or "",
                "short_description": row.get("short_description", "") or "",
            })
            if cap > 0 and len(results) >= cap:
                return results
        if len(batch) < page_size:
            # Short page: treated as the last one.
            break
        offset += page_size

    return results

def get_article_content(sys_id: str) -> dict:
    """Fetch one article from the knowledge articles API and return its "result" dict.

    Returns {} when the response has no (or an empty) "result".
    """
    endpoint = f"{ARTICLE_CONTENT_API}{sys_id}?fields=kb_knowledge_base"
    payload = http_get(endpoint).json()
    return payload.get("result", {}) or {}

def get_article_table_row(sys_id: str) -> dict:
    """Fetch the kb_knowledge table row for *sys_id* with display values.

    Returns the "result" dict ({} if absent); "sys_view_count" is set to 0
    when the row does not carry it.
    """
    wanted_fields = (
        "sys_id", "number", "short_description", "workflow_state",
        "sys_created_on", "sys_created_by", "sys_updated_on", "sys_updated_by",
        "kb_category", "kb_knowledge_base", "valid_from", "valid_to", "author", "sys_view_count",
    )
    query = {
        "sysparm_fields": ",".join(wanted_fields),
        "sysparm_display_value": "true",
        "sysparm_limit": "1",
    }
    row = http_get(f"{KB_TABLE_API}/{sys_id}", params=query).json().get("result") or {}
    if "sys_view_count" not in row:
        row["sys_view_count"] = 0
    return row

def list_attachments_for_article(article_sys_id: str) -> List[dict]:
    """Return every attachment row linked to the given kb_knowledge article.

    Rows are requested 200 at a time; paging stops at the first page that
    comes back with fewer rows than requested.
    """
    page_limit = 200
    query = f"table_name=kb_knowledge^table_sys_id={article_sys_id}"

    collected: List[dict] = []
    offset = 0
    while True:
        page_url = (
            f"{ATTACH_LIST_API}?sysparm_query={query}"
            f"&sysparm_fields=sys_id,file_name,content_type,sys_updated_on"
            f"&sysparm_limit={page_limit}&sysparm_offset={offset}"
        )
        page = http_get(page_url).json().get("result", []) or []
        collected += page
        if len(page) < page_limit:
            return collected
        offset += page_limit

def download_attachment_bytes(attach_sys_id: str) -> bytes:
    """Download the file of one attachment and return its full body as bytes."""
    file_url = f"{ATTACH_DOWNLOAD}{attach_sys_id}/file"
    response = http_get(file_url, stream=True)
    # Reading .content pulls the whole body into memory.
    return response.content

# ─────────────────────── writers (HTML + attachments) ───────────────────────
_TAG_RE = re.compile(r"<[^>]+>")

def _html_to_plain_text(html: str) -> str:
    txt = _TAG_RE.sub(" ", html or "")
    return re.sub(r"\s+", " ", txt).strip()

def _simple_meta(file_name: str, number: str, short_desc: str, sys_id: str, link: str) -> dict:
    return {
        "file_name": file_name,
        "metadata": {
            "number": number or "",
            "short_description": short_desc or "",
            "sys_id": sys_id or "",
            "file_name": file_name,
            "link": link or ""
        }
    }

def _parse_view_count(raw: Any) -> int:
    """Parse a view count that may be a display-formatted string such as "1,234"."""
    try:
        return int(raw or 0)
    except (TypeError, ValueError):
        # Display values can carry grouping separators; keep only the digits.
        digits = re.sub(r"\D", "", to_text(raw))
        return int(digits) if digits else 0


def _article_identity(table_row: dict, detail: dict):
    """Return (number, sys_id, short_description), preferring *detail* over *table_row*."""
    number = coerce_kb_number(detail.get("number"), table_row.get("number"))
    sys_id = detail.get("sys_id") or table_row.get("sys_id") or ""
    short = detail.get("short_description") or table_row.get("short_description") or ""
    return number, sys_id, short


def _build_sidecar(
    *,
    table_row: dict,
    number: Optional[str],
    sys_id: str,
    short: str,
    title: str,
    content_type: str,
    file_name: str,
    key_field: str,
) -> dict:
    """Build the sidecar document stored next to an uploaded file.

    *key_field* is the sidecar key ("html_key" or "file_key") under which
    *file_name* is recorded.
    """
    link = build_sn_url(number or "")
    meta_blocks = build_metadata_blocks(
        number=number or "",
        kb_category=table_row.get("kb_category"),
        content_type=content_type,
        title=title,
        author=table_row.get("author"),
        sys_id=sys_id,
        created_by=table_row.get("sys_created_by"),
        sn_url=link,
    )
    return {
        "number": number or "",
        "sys_id": sys_id,
        "title": title,
        "sn_url": link,
        "sys_updated_on": table_row.get("sys_updated_on"),
        "sys_created_on": table_row.get("sys_created_on"),
        "author": display_only(table_row.get("author")),
        "created_by": display_only(table_row.get("sys_created_by")),
        "workflow_state": table_row.get("workflow_state"),
        "valid_from": table_row.get("valid_from") or "",
        "valid_to": table_row.get("valid_to"),
        "kb_category": display_only(table_row.get("kb_category")),
        "sys_view_count": _parse_view_count(table_row.get("sys_view_count")),
        key_field: file_name,
        "metadataAttributes": meta_blocks,
        "simple_metadata": _simple_meta(
            file_name=file_name,
            number=number or "",
            short_desc=short or "",
            sys_id=sys_id,
            link=link,
        ),
    }


def _log_sidecar_size(kind: str, sidecar: dict) -> None:
    """Log the compact JSON size of the sidecar's metadataAttributes when enabled."""
    if not LOG_METADATA_BYTES:
        return
    packed = json.dumps({"metadataAttributes": sidecar["metadataAttributes"]}, separators=(",", ":"))
    sz = len(packed.encode("utf-8"))
    log.info(f"[sidecar] {kind} metaBytes={sz}")


def write_html_and_sidecar(table_row: dict, detail: dict) -> bool:
    """Upload an article's HTML body and its ".metadata.json" sidecar to S3.

    Args:
        table_row: kb_knowledge table row (display values) for the article.
        detail: article payload whose "content" holds the HTML.

    Returns:
        True when the HTML and sidecar were written; False when the article
        has no content or neither a KB number nor a sys_id is available.
    """
    content = detail.get("content")
    if not content:
        return False

    number, sys_id, short = _article_identity(table_row, detail)
    ident = number or sys_id
    if not ident:
        return False

    html_name = f"{ident}.html"
    html_key = make_key(html_name, ident)
    s3_put_bytes(html_key, content.encode("utf-8"), "text/html; charset=utf-8")

    if GENERATE_TEXT_FROM_HTML:
        text_key = make_key(f"{ident}.txt", ident)
        s3_put_bytes(text_key, _html_to_plain_text(content).encode("utf-8"), "text/plain; charset=utf-8")

    sidecar = _build_sidecar(
        table_row=table_row,
        number=number,
        sys_id=sys_id,
        short=short,
        title=clamp(short or "Untitled", TITLE_MAX_CHARS),
        content_type="text/html",
        file_name=html_name,
        key_field="html_key",
    )
    _log_sidecar_size("html", sidecar)

    s3_put_json(html_key + ".metadata.json", sidecar)
    return True


def write_attachment_and_sidecar(table_row: dict, article_detail: dict, att_row: dict) -> bool:
    """Download one attachment and upload it with its ".metadata.json" sidecar.

    Args:
        table_row: kb_knowledge table row (display values) for the article.
        article_detail: article payload the attachment belongs to.
        att_row: attachment record; needs "sys_id" and "file_name".

    Returns:
        True when the file and sidecar were written. False when the article
        cannot be identified, the attachment record is incomplete, the file
        extension is not in ATTACH_EXT_WHITELIST, or the body exceeds
        MAX_FILE_BYTES.
    """
    number, sys_id, short = _article_identity(table_row, article_detail)
    ident = number or sys_id
    if not ident:
        return False

    att_id = att_row.get("sys_id", "")
    fname = (att_row.get("file_name") or "").strip()
    if not (att_id and fname):
        return False

    ext = fname.rsplit(".", 1)[-1].lower() if "." in fname else ""
    if ATTACH_EXT_WHITELIST and ext not in ATTACH_EXT_WHITELIST:
        return False

    body = download_attachment_bytes(att_id)
    if MAX_FILE_BYTES > 0 and len(body) > MAX_FILE_BYTES:
        log.info(f"[skip oversize] ({len(body)} > {MAX_FILE_BYTES})")
        return False

    ctype = guess_content_type(fname)
    key = make_key(fname, ident)
    s3_put_bytes(key, body, ctype)

    sidecar = _build_sidecar(
        table_row=table_row,
        number=number,
        sys_id=sys_id,
        short=short,
        title=clamp(short or fname, TITLE_MAX_CHARS),
        content_type=ctype,
        file_name=fname,
        key_field="file_key",
    )
    _log_sidecar_size("att", sidecar)

    s3_put_json(key + ".metadata.json", sidecar)
    return True

# ─────────────────────── core pipeline ────────────────────────
def process_article(sys_id: str, start_ts: float) -> int:
    """Export one article (HTML plus attachments) and return how many files were written.

    Attachments come from three sources, in priority order: the article
    payload's "attachments", the attachment table, and the payload's
    "embedded_content". Entries are de-duplicated by sys_id across all three
    so no file is downloaded twice; entries without a sys_id are skipped.

    Args:
        sys_id: sys_id of the article to export.
        start_ts: time.time() value at the start of the run; attachment
            processing stops once STOP_AFTER_SECONDS have elapsed since then.
    """
    total = 0
    table_row = get_article_table_row(sys_id)
    detail = get_article_content(sys_id)

    if detail.get("content") and write_html_and_sidecar(table_row, detail):
        total += 1

    candidates: List[dict] = list(detail.get("attachments") or [])
    candidates.extend(list_attachments_for_article(sys_id))
    for emb in detail.get("embedded_content") or []:
        emb_id = emb.get("sys_id") or emb.get("id") or emb.get("attachment_sys_id")
        emb_fn = (emb.get("file_name") or emb.get("name") or "").strip()
        if emb_id and emb_fn:
            candidates.append({"sys_id": emb_id, "file_name": emb_fn})

    seen_ids = set()
    all_atts: List[dict] = []
    for att in candidates:
        att_id = att.get("sys_id")
        if not att_id or att_id in seen_ids:
            continue
        seen_ids.add(att_id)
        all_atts.append(att)

    for att in all_atts:
        if time.time() - start_ts > STOP_AFTER_SECONDS:
            log.info("Time guard hit during attachments; stopping.")
            break
        try:
            if write_attachment_and_sidecar(table_row, detail, att):
                total += 1
        except requests.RequestException as e:
            # Covers HTTPError as well as RetryError/timeouts/connection
            # errors raised once the session's retries are exhausted; one bad
            # attachment must not abort the rest of the article.
            log.warning("[att-dl-failed] %s", e.__class__.__name__)

    return total

# ─────────────────────── lambda entry ──────────────────────────
def handler(event, context):
    """
    Resolve the knowledge base and list its articles.

    Example event:
      {"limit": 0, "offset": 0, "page_size": 200, "knowledgeBaseName": "Your KB Name"}

    Event values override the environment defaults (KNOWLEDGE_BASE_NAME,
    PAGE_SIZE, ARTICLES_LIMIT). A non-dict event is treated as empty.

    Returns a status dict: {"status": "ok", ...} on success, or
    {"status": "error", "type": ..., "message": ...} when configuration is
    invalid or the knowledge base cannot be found.
    """
    cfg_err = validate_runtime_config()
    if cfg_err:
        return cfg_err

    options = event if isinstance(event, dict) else {}
    wanted_name = options.get("knowledgeBaseName") or KNOWLEDGE_BASE_NAME
    page_size = int(options.get("page_size") or PAGE_SIZE)
    cap_env = int(options.get("limit") or ARTICLES_LIMIT or 0)
    offset = int(options.get("offset") or 0)

    if page_size <= 0:
        return {
            "status": "error",
            "type": "config_error",
            "message": "page_size must be a positive integer.",
        }

    # Public-safe log (no bucket name, no instance URL, no secrets)
    log.info(f"Start sync: KB='{wanted_name}', page_size={page_size}, cap={cap_env or 'ALL'}, offset={offset}")

    kb_sys_id = get_kb_sys_id_by_name(wanted_name)
    if not kb_sys_id:
        return {"status": "error", "type": "kb_not_found", "message": "Knowledge base not found."}

    # Use the resolved page size rather than a fixed value.
    rows_all = list_kb_articles(
        kb_sys_id,
        page_size=page_size,
        cap=(cap_env or 0),
        strict_published=STRICT_PUBLISHED,
    )
    log.info(f"[stats] articles_total={len(rows_all)} strict_published={STRICT_PUBLISHED}")

    # NOTE: The per-article processing loop is intentionally not run here
    # (POC behavior): articles are only listed and counted, and `offset` is
    # only logged.
    return {"status": "ok", "message": "done"}

def lambda_handler(event, context):
    """Lambda entry point; delegates to handler() and returns its result."""
    result = handler(event, context)
    return result
