In [15]:
import re, time, json, os
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple, Optional, Iterable, Dict

In [16]:
# ----------------------------
# Paths (EDIT THESE)
# ----------------------------
DOCS_ROOT     = Path("/Users/clockorangezoe/Documents/phd_projects/code/geoAI/RDProLLMagent/doc/docs")
EXAMPLES_ROOT = Path("/Users/clockorangezoe/Documents/phd_projects/code/geoAI/RDProLLMagent/doc/docs/examples")
PY_PATH       = Path("/Users/clockorangezoe/Documents/phd_projects/code/geoAI/RDProLLMagent/python/ndvi.py")

assert DOCS_ROOT.exists(), f"Missing docs root: {DOCS_ROOT}"
assert EXAMPLES_ROOT.exists(), f"Missing examples root: {EXAMPLES_ROOT}"
assert PY_PATH.exists(), f"Missing python script: {PY_PATH}"

py_code = PY_PATH.read_text(encoding="utf-8")


In [17]:
# ----------------------------
# Markdown chunking utilities
# ----------------------------
@dataclass
class Chunk:
    chunk_id: str
    heading_path: str
    level: int
    text: str

def chunk_markdown_by_headings(md: str) -> List[Chunk]:
    lines = md.splitlines()
    starts = []
    for i, line in enumerate(lines):
        m = re.match(r"^(#{2,3})\s+(.*\S)\s*$", line)
        if m:
            starts.append((i, len(m.group(1)), m.group(2).strip()))

    if not starts:
        return [Chunk("c0000", "DOC", 1, md.strip() + "\n")]

    chunks: List[Chunk] = []
    current_h2: Optional[str] = None

    for idx, (start_i, level, title) in enumerate(starts):
        end_i = starts[idx + 1][0] if idx + 1 < len(starts) else len(lines)
        body = "\n".join(lines[start_i:end_i]).strip() + "\n"

        if level == 2:
            current_h2 = title
            heading_path = current_h2
        else:
            heading_path = f"{current_h2} / {title}" if current_h2 else title

        chunks.append(Chunk(f"c{idx:04d}", heading_path, level, body))

    return chunks

def make_doc_pack(selected: List[Chunk]) -> str:
    out = []
    for ch in selected:
        out.append(f"### DOC CHUNK {ch.chunk_id}: {ch.heading_path}\n{ch.text}\n")
    return "\n".join(out).strip() + "\n"

In [18]:
# ----------------------------
# Docs loader: key -> .md file(s)
# ----------------------------
def load_docs_for_keys_with_aliases(
    root: Path,
    keys: Iterable[str],
    *,
    encoding: str = "utf-8",
    aliases: Optional[Dict[str, str]] = None,
) -> List[Tuple[Path, str]]:
    """
    keys: things you care about (API names or concepts).
    aliases: maps key -> filename_stem (e.g., 'mapPixels' -> 'mappixels').
    Returns unique docs in first-seen order.
    """
    if not root.exists():
        raise FileNotFoundError(f"DOCS_ROOT does not exist: {root}")

    aliases = aliases or {}

    def norm(s: str) -> str:
        return s.strip().lower()

    # index: stem -> path
    md_index: Dict[str, Path] = {}
    for p in root.rglob("*.md"):
        md_index[norm(p.stem)] = p

    resolved_paths: List[Path] = []
    seen = set()

    for k in keys:
        stem = norm(aliases.get(k, k))  # apply alias if any

        # direct stem match
        path = md_index.get(stem)

        # fallback: normalize to alnum only (handles camelCase-ish inputs)
        if path is None:
            stem2 = "".join(ch for ch in stem if ch.isalnum())
            path = md_index.get(stem2)

        if path is None:
            available = sorted(md_index.keys())
            raise FileNotFoundError(
                f"No .md doc for key '{k}' (resolved stem '{stem}') under {root}\n"
                f"Available docs: {available}"
            )

        if path not in seen:
            resolved_paths.append(path)
            seen.add(path)

    return [(p, p.read_text(encoding=encoding)) for p in resolved_paths]

In [19]:
# ----------------------------
# Examples loader: operation -> (scala?, python?)
# ----------------------------
def load_examples_for_operations(
    examples_root: Path,
    operations: Iterable[str],
    *,
    encoding: str = "utf-8",
) -> Dict[str, Dict[str, str]]:
    """
    For each operation (e.g. '00_dataloading'):
      - load examples/<op>.scala if it exists and is non-empty
      - load examples/<op>.py if it exists and is non-empty

    Missing/empty files are skipped.
    """
    if not examples_root.exists():
        raise FileNotFoundError(f"EXAMPLES_ROOT does not exist: {examples_root}")

    results: Dict[str, Dict[str, str]] = {}

    for op in operations:
        op_key = op.strip()
        bucket: Dict[str, str] = {}

        scala_path = examples_root / f"{op_key}.scala"
        if scala_path.exists():
            text = scala_path.read_text(encoding=encoding).strip()
            if text:
                bucket["scala"] = text

        py_path = examples_root / f"{op_key}.py"
        if py_path.exists():
            text = py_path.read_text(encoding=encoding).strip()
            if text:
                bucket["python"] = text

        if bucket:
            results[op_key] = bucket

    return results


In [20]:
def make_examples_pack(examples: Dict[str, Dict[str, str]]) -> str:
    if not examples:
        return ""
    parts: List[str] = []
    for op, langs in examples.items():
        parts.append(f"## EXAMPLE OPERATION: {op}")
        if "scala" in langs:
            parts.append("### Scala example\n" + langs["scala"].strip() + "\n")
        if "python" in langs:
            parts.append("### Python example\n" + langs["python"].strip() + "\n")
    return "\n".join(parts).strip() + "\n"


In [21]:
MODEL = "gpt-5"

SYSTEM_PROMPT = """
You are a geospatial data engineer and Spark systems expert.

Task: Convert a given geospatial Python script into Scala code that runs on RDPro (Spark-based raster processing) on Apache Spark.

You must understand Spark execution and produce distributed, RDD-based Scala.

AUTHORITATIVE EXAMPLES RULE:
- For each operation, Scala/Python examples may exist under examples/.
- If an example file is missing or empty, ignore it.
- Use only APIs/signatures shown in DOC CHUNKS OR non-empty examples.
- Never assume Scala and Python examples are symmetric.

Environment & paths:
- Determine whether output paths should be treated as local or distributed based on Spark configuration and the URI scheme.
- You MAY use standard Spark/Scala APIs for this (SparkConf, SparkContext.hadoopConfiguration, java.net.URI, java.nio.file).
- You MUST NOT invent any RDPro path utilities.

FILESYSTEM & PATH NORMALIZATION (MANDATORY):
- Detect Spark local mode using SparkContext:
  - Treat as local if `sc.master` starts with "local" (case-insensitive).
- Before calling any RDPro IO API (e.g., geoTiff read/write), normalize ALL input/output paths:
  1) If the path already has a URI scheme (file:, hdfs:, s3a:, gs:, http:, etc.), use it as-is.
  2) If the path has NO scheme AND Spark is local AND the path looks like a local filesystem path
     (e.g., starts with "/" on Unix/macOS, or has a Windows drive like "C:\\"), convert it to an
     absolute `file:///...` URI using standard Java APIs (java.net.URI + java.nio.file.Paths).
  3) If Spark is NOT local, do NOT prepend file:///; leave scheme-less paths unchanged so they
     resolve against the cluster filesystem config (fs.defaultFS).
- This rule exists to prevent Hadoop from interpreting local absolute paths as HDFS
  (e.g., hdfs://localhost:9000).

Hard rules:
1) Output MUST be valid Scala and compile as an RDPro operation module.
   Required structure:
   - `object <OperationName> { def run(sc: SparkContext): <ReturnType> = ... }`
   - Include all necessary imports
   - Do NOT define `main` and do NOT use `extends App`
   - Do NOT create or stop SparkSession or SparkContext inside `run`
   - Assume SparkContext `sc` is provided by the caller
2) Use ONLY RDPro APIs that appear in the provided DOC CHUNKS OR non-empty examples.
   - If a method signature is not shown, do NOT guess.
3) Do NOT invent RDPro APIs, overloads, implicits, or helper utilities.
4) Preserve semantics of the Python.
5) Distributed correctness: avoid driver-side collect unless required.
6) Raster alignment: if required but no API exists in docs/examples, throw a clear runtime error.
7) Performance guidance: only safe Spark-level optimizations.
8) Lambdas: add explicit parameter/return types.
9) CLI args:
   - If the Python has input/output paths, read them from args with safe defaults and validation.
   - Do not introduce extra parameters not implied by the Python.

Output format (strict):
- First: Scala file content only (NO markdown fences).
- After the Scala: a "NOTES" section listing:
  (a) RDPro APIs used (names only)
  (b) Unsupported operations and why
  (c) Assumptions about IO paths / bands / nodata / CRS / environment detection logic
""".strip()


In [22]:
def build_user_prompt(doc_pack: str, examples_pack: str, py_code: str) -> str:
    return f"""
RDPro documentation (relevant DOC CHUNKS only):
{doc_pack}

Non-empty code examples (authoritative; missing/empty files are omitted):
{examples_pack}

Python script:
{py_code}

Task:
Translate the Python script into Scala targeting RDPro on Spark.
Use ONLY APIs/signatures described in the DOC CHUNKS or shown in examples.
""".strip()

In [23]:
# ----------------------------
# Manual "keys" (what you used before)
# These are NOT necessarily filenames; aliases map them to doc files.
# ----------------------------
NDVI_KEYS = [
    "datamodel",
    "setup",
    "dataloading",
    "rastermetadata",
    "overlay",
    "mapPixels",
    "saveAsGeoTiff",
    "GeoTiffWriter",
    "Compression",
]

ALIASES = {
    # docs tree uses lowercase stems like mappixels.md
    "mapPixels": "mappixels",

    # concepts that live inside these docs (not separate files)
    "saveAsGeoTiff": "rasterwriting",
    "GeoTiffWriter": "rasterwriting",
    "Compression": "rasterwriting",

    # "geoTiff" read API is typically documented in dataloading
    "geoTiff": "dataloading",
}


In [24]:
EXAMPLE_OPERATIONS = [
    "00_dataloading",
    "02_mappixels",
    "03_overlay"
]

In [25]:
# ----------------------------
# Load docs -> chunk -> pack
# ----------------------------
doc_files = load_docs_for_keys_with_aliases(DOCS_ROOT, NDVI_KEYS, aliases=ALIASES)

print("Loaded docs:")
for p, _ in doc_files:
    print(" -", p)

all_chunks: List[Chunk] = []
for path, text in doc_files:
    all_chunks.extend(chunk_markdown_by_headings(text))

doc_pack = make_doc_pack(all_chunks)

# ----------------------------
# Load examples -> pack (skips empty/missing)
# ----------------------------
examples = load_examples_for_operations(EXAMPLES_ROOT, EXAMPLE_OPERATIONS)
examples_pack = make_examples_pack(examples)

print("\nLoaded examples (non-empty only):")
for op, langs in examples.items():
    print(" -", op, "->", ",".join(sorted(langs.keys())))

# ----------------------------
# Build final prompt
# ----------------------------
user_prompt = build_user_prompt(doc_pack, examples_pack, py_code)



Loaded docs:
 - /Users/clockorangezoe/Documents/phd_projects/code/geoAI/RDProLLMagent/doc/docs/common/datamodel.md
 - /Users/clockorangezoe/Documents/phd_projects/code/geoAI/RDProLLMagent/doc/docs/common/setup.md
 - /Users/clockorangezoe/Documents/phd_projects/code/geoAI/RDProLLMagent/doc/docs/data/dataloading.md
 - /Users/clockorangezoe/Documents/phd_projects/code/geoAI/RDProLLMagent/doc/docs/common/rastermetadata.md
 - /Users/clockorangezoe/Documents/phd_projects/code/geoAI/RDProLLMagent/doc/docs/process/overlay.md
 - /Users/clockorangezoe/Documents/phd_projects/code/geoAI/RDProLLMagent/doc/docs/process/mappixels.md
 - /Users/clockorangezoe/Documents/phd_projects/code/geoAI/RDProLLMagent/doc/docs/data/rasterwriting.md

Loaded examples (non-empty only):
 - 00_dataloading -> python,scala
 - 02_mappixels -> python,scala
 - 03_overlay -> python,scala


In [26]:
# ----------------------------
# Save prompt + selections
# ----------------------------
OUT_DIR = Path("./runs/workspace")
OUT_DIR.mkdir(parents=True, exist_ok=True)

(OUT_DIR / "prompt_manual_multi.txt").write_text(user_prompt, encoding="utf-8")
(OUT_DIR / "doc_selection_multi.json").write_text(
    json.dumps([str(p) for p, _ in doc_files], indent=2),
    encoding="utf-8",
)
(OUT_DIR / "example_selection_multi.json").write_text(
    json.dumps({op: list(langs.keys()) for op, langs in examples.items()}, indent=2),
    encoding="utf-8",
)

print("\nPrompt chars:", len(user_prompt))
print("Saved:", OUT_DIR / "prompt_manual_multi.txt")



Prompt chars: 19517
Saved: runs/workspace/prompt_manual_multi.txt


In [27]:
import os

if "OPENAI_API_KEY" in os.environ:
    print("OPENAI_API_KEY is set")
else:
    print("OPENAI_API_KEY is NOT set")

OPENAI_API_KEY is set


In [28]:
from openai import OpenAI
client = OpenAI()

def run_llm(prompt: str) -> Tuple[str, float]:
    t0 = time.time()
    resp = client.responses.create(
        model=MODEL,
        input=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
    )
    return resp.output_text.strip(), time.time() - t0

scala_out, dt = run_llm(user_prompt)
(OUT_DIR / "Job.manual.codeExample.scala").write_text(scala_out, encoding="utf-8")

print("Wrote:", OUT_DIR / "ndvi_doc_multi_codeExample_GPT.scala")
print("LLM latency:", round(dt, 2), "s")
print("\n--- Preview ---\n", scala_out[:900])

Wrote: runs/workspace/ndvi_doc_multi_codeExample_GPT.scala
LLM latency: 116.94 s

--- Preview ---
 import edu.ucr.cs.bdlab.beast._
import edu.ucr.cs.bdlab.raptor.RasterOperationsLocal
import org.apache.spark.SparkContext

import java.net.URI
import java.nio.file.{Paths, Path => JPath}

object ndvi {

  private def hasScheme(p: String): Boolean = {
    try {
      val u = new URI(p)
      u.getScheme != null
    } catch {
      case _: Exception => false
    }
  }

  // FILESYSTEM & PATH NORMALIZATION (MANDATORY)
  private def normalizePath(rawPath: String, sc: SparkContext): String = {
    if (rawPath == null || rawPath.trim.isEmpty) {
      throw new IllegalArgumentException("Path must be non-empty")
    }
    val isLocal: Boolean = sc.master != null && sc.master.toLowerCase.startsWith("local")
    if (hasScheme(rawPath)) {
      rawPath
    } else {
      // No scheme
      val jpath: JPath = Paths.get(rawPath)
      val looksLocalAbsolute: Boolean = {
        try {
          jpath.
