In [14]:
import re, time, json
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple

In [15]:
DOC_PATH = Path("/Users/clockorangezoe/Documents/phd_projects/code/geoAI/RDProLLMagent/doc/RDPro.md")
PY_PATH  = Path("/Users/clockorangezoe/Documents/phd_projects/code/geoAI/RDProLLMagent/python/ndvi.py")
assert DOC_PATH.exists(), f"Missing: {DOC_PATH}"
assert PY_PATH.exists(), f"Missing: {PY_PATH}"

rdpro_text = DOC_PATH.read_text(encoding="utf-8")
py_code = PY_PATH.read_text(encoding="utf-8")

In [16]:
import re
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Chunk:
    chunk_id: str
    heading_path: str
    level: int
    text: str

def chunk_markdown_by_headings(md: str) -> List[Chunk]:
    lines = md.splitlines()
    starts = []
    for i, line in enumerate(lines):
        m = re.match(r"^(#{2,3})\s+(.*\S)\s*$", line)
        if m:
            starts.append((i, len(m.group(1)), m.group(2).strip()))

    if not starts:
        return [Chunk("c0000", "DOC", 1, md.strip() + "\n")]

    chunks: List[Chunk] = []
    current_h2: Optional[str] = None

    for idx, (start_i, level, title) in enumerate(starts):
        end_i = starts[idx + 1][0] if idx + 1 < len(starts) else len(lines)
        body = "\n".join(lines[start_i:end_i]).strip() + "\n"

        if level == 2:
            current_h2 = title
            heading_path = current_h2
        else:
            heading_path = f"{current_h2} / {title}" if current_h2 else title

        chunks.append(Chunk(f"c{idx:04d}", heading_path, level, body))

    return chunks

def select_chunks_manual(chunks: List[Chunk], include_keywords: List[str]) -> List[Chunk]:
    inc = [k.lower() for k in include_keywords]
    selected: List[Chunk] = []
    for ch in chunks:
        hay = (ch.heading_path + "\n" + ch.text).lower()
        if any(k in hay for k in inc):
            selected.append(ch)

    # de-dupe (preserve order)
    seen = set()
    uniq: List[Chunk] = []
    for ch in selected:
        if ch.chunk_id not in seen:
            uniq.append(ch)
            seen.add(ch.chunk_id)
    return uniq

def select_chunks_manual_api_keys(chunks: List[Chunk], api_keys: List[str]) -> List[Chunk]:
    """
    Strict manual selection: only select chunks that contain at least one explicit API key.
    This prevents pulling unrelated chunks like Flatten/Explode/Reshape unless you include them.
    """
    keys = [k.lower() for k in api_keys]

    selected: List[Chunk] = []
    for ch in chunks:
        hay = (ch.heading_path + "\n" + ch.text).lower()
        if any(key in hay for key in keys):
            selected.append(ch)

    # de-dupe (preserve order)
    seen = set()
    uniq: List[Chunk] = []
    for ch in selected:
        if ch.chunk_id not in seen:
            uniq.append(ch)
            seen.add(ch.chunk_id)
    return uniq

def make_doc_pack(selected: List[Chunk]) -> str:
    out = []
    for ch in selected:
        out.append(f"### DOC CHUNK {ch.chunk_id}: {ch.heading_path}\n{ch.text}\n")
    return "\n".join(out).strip() + "\n"


In [17]:
MODEL = "gpt-5"

SYSTEM_PROMPT = """
You are a geospatial data engineer and Spark systems expert.

Task: Convert a given geospatial Python script into Scala code that runs on RDPro (Spark-based raster processing) on Apache Spark.

You must understand Spark execution and produce distributed, RDD-based Scala.

Environment & paths:
- Determine whether output paths should be treated as local or distributed based on Spark configuration and the URI scheme.
- You MAY use standard Spark/Scala APIs for this (SparkConf, SparkContext.hadoopConfiguration, java.net.URI, java.nio.file).
- You MUST NOT invent any RDPro path utilities.

Hard rules:
1) Output MUST be valid Scala that compiles as a Spark job (a complete file). Include:
   - necessary imports
   - a runnable entrypoint: `object JobName { def main(args:Array[String]):Unit = ... }` (or `extends App`)
   - SparkSession initialization
   - spark.stop() at the end (in finally or equivalent)
2) Use ONLY RDPro APIs that appear in the provided DOC CHUNKS.
   - If a method signature is not shown in DOC CHUNKS, do NOT guess.
3) Do NOT invent RDPro APIs, overloads, implicits, or helper utilities. No hidden "magic" conversions.
4) Preserve semantics of the Python: raster IO, pixel math, focal ops, masking/nodata, reprojection/resample if present.
5) Distributed correctness:
   - Avoid driver-side operations: do NOT call collect/toLocalIterator unless required by the Python semantics.
   - Prefer RDPro RasterRDD end-to-end when available in DOC CHUNKS.
6) Raster alignment robustness:
   - If not in DOC CHUNKS, fail fast: throw a clear runtime error explaining alignment is required but unsupported with available APIs.
7) Performance guidance (Spark-level only):
   - You MAY set Spark SQL / Spark configs and use standard Spark operations (repartition/coalesce/cache/persist) ONLY when:
     (a) it does not change semantics, and
     (b) it is justified by an obvious pipeline boundary (e.g., before a wide op / expensive reuse).
8) Lambdas:
   - When passing lambdas to RDPro functions (e.g., mapPixels), add explicit parameter and return types so Scala compiles.
9) CLI args:
   - If the Python has input/output paths, read them from args with safe defaults and validation.
   - Do not introduce extra parameters not implied by the Python.

Output format (strict):
- First: Scala file content only (NO markdown fences).
- After the Scala: a "NOTES" section listing:
  (a) RDPro APIs used (names only)
  (b) Unsupported operations and why (especially if missing alignment/warp APIs)
  (c) Assumptions about IO paths / bands / nodata / CRS / environment detection logic
""".strip()

In [18]:
def build_user_prompt(doc_pack: str, py_code: str) -> str:
    return f"""
RDPro documentation (relevant DOC CHUNKS only):
{doc_pack}

Python script:
{py_code}

Task:
Translate the Python script into Scala targeting RDPro on Spark.
Use ONLY APIs described in the DOC CHUNKS.
""".strip()


In [19]:
# Cell 7 â€” LLM call stub (you plug in your model call here)
def call_llm(prompt: str) -> str:
    """
    Replace this with your real LLM call.
    It should return Scala code as a string.
    """
    raise NotImplementedError("Plug in your LLM API call here.")

# Example usage:
# scala_code_manual = call_llm(prompt_manual)
# scala_code_auto   = call_llm(prompt_auto)


In [20]:
# ---- MANUAL KEYWORDS (edit this list) ----
NDVI_KEYS = [
    "setup",
    "geoTiff",
    "rastermetadata",
    "overlay",          # stack red + nir
    "mapPixels",        # compute NDVI
    "saveAsGeoTiff",    # write output
    "GeoTiffWriter",    # compression + write options
    "Compression",      # (optional) forces pulling compression option lines
]


In [21]:
chunks = chunk_markdown_by_headings(rdpro_text)
manual_selected = select_chunks_manual_api_keys(chunks, NDVI_KEYS)
doc_pack = make_doc_pack(manual_selected)
user_prompt = build_user_prompt(doc_pack, py_code)

In [22]:
OUT_DIR = Path("./runs/workspace")
OUT_DIR.mkdir(parents=True, exist_ok=True)

(OUT_DIR / "prompt_manual.txt").write_text(user_prompt, encoding="utf-8")
(OUT_DIR / "doc_selection.json").write_text(
    json.dumps([{"id": c.chunk_id, "heading": c.heading_path} for c in manual_selected], indent=2),
    encoding="utf-8"
)

print("Manual chunks:", len(manual_selected))
print("Prompt chars:", len(user_prompt))
print("Saved:", OUT_DIR / "prompt_manual.txt")


Manual chunks: 14
Prompt chars: 19393
Saved: runs/workspace/prompt_manual.txt


In [23]:
import os

if "OPENAI_API_KEY" in os.environ:
    print("OPENAI_API_KEY is set")
else:
    print("OPENAI_API_KEY is NOT set")

OPENAI_API_KEY is set


In [24]:
from openai import OpenAI
client = OpenAI()

def run_llm(prompt: str) -> Tuple[str, float]:
    t0 = time.time()
    resp = client.responses.create(
        model=MODEL,
        input=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
    )
    return resp.output_text.strip(), time.time() - t0

scala_out, dt = run_llm(user_prompt)
(OUT_DIR / "Job.manual.scala").write_text(scala_out, encoding="utf-8")

print("Wrote:", OUT_DIR / "Job.manual.GPT.scala")
print("LLM latency:", round(dt, 2), "s")
print("\n--- Preview ---\n", scala_out[:900])

Wrote: runs/workspace/Job.manual.GPT.scala
LLM latency: 68.52 s

--- Preview ---
 import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import edu.ucr.cs.bdlab.beast._
import edu.ucr.cs.bdlab.raptor.GeoTiffWriter
import edu.ucr.cs.bdlab.beast.io.tiff.TiffConstants

import java.net.URI
import scala.util.Try

object NDVIJob {
  def main(args: Array[String]): Unit = {
    val defaultRed = "/Users/clockorangezoe/Documents/phd_projects/code/geoAI/RDProLLMagent/data/landsat8/LA/B4/LC08_L2SP_040037_20250827_20250903_02_T1_SR_B4.TIF"
    val defaultNir = "/Users/clockorangezoe/Documents/phd_projects/code/geoAI/RDProLLMagent/data/landsat8/LA/B5/LC08_L2SP_040037_20250827_20250903_02_T1_SR_B5.TIF"
    val defaultOut = "/Users/clockorangezoe/Documents/phd_projects/code/geoAI/RDProLLMagent/python/ndvi.tif"

    val redPath = if (args.length > 0) args(0) else defaultRed
    val nirPath = if (args.length > 1) args(1) else defaultN
