From bb24060f2ac3dc946cbd9aa076e552e081079e06 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Thu, 9 Oct 2025 13:43:31 -0500 Subject: [PATCH 1/7] Create apptainer-sipnet-carb.yml --- .github/workflows/apptainer-sipnet-carb.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 .github/workflows/apptainer-sipnet-carb.yml diff --git a/.github/workflows/apptainer-sipnet-carb.yml b/.github/workflows/apptainer-sipnet-carb.yml new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/.github/workflows/apptainer-sipnet-carb.yml @@ -0,0 +1 @@ + From 83a23da5ee0b30126f9710b9a049f81a8ab763c5 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Thu, 11 Dec 2025 13:19:21 -0600 Subject: [PATCH 2/7] Create run-workflow-examples.yml --- .github/workflows/run-workflow-examples.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 .github/workflows/run-workflow-examples.yml diff --git a/.github/workflows/run-workflow-examples.yml b/.github/workflows/run-workflow-examples.yml new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/.github/workflows/run-workflow-examples.yml @@ -0,0 +1 @@ + From c0f8255ddff2b56b34815fbb92e74b0ba747b51c Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Fri, 20 Feb 2026 11:52:10 -0600 Subject: [PATCH 3/7] Add first iteration of workflow CLI with config files and data prep shell --- 2a_grass/00_fetch_s3_and_prepare_run_dir.sh | 113 +++++++ 2a_grass/example_user_config.yaml | 29 ++ 2a_grass/workflow_manifest.yaml | 87 +++++ magic-ensemble | 338 ++++++++++++++++++++ 4 files changed, 567 insertions(+) create mode 100755 2a_grass/00_fetch_s3_and_prepare_run_dir.sh create mode 100644 2a_grass/example_user_config.yaml create mode 100644 2a_grass/workflow_manifest.yaml create mode 100755 magic-ensemble diff --git a/2a_grass/00_fetch_s3_and_prepare_run_dir.sh b/2a_grass/00_fetch_s3_and_prepare_run_dir.sh new file mode 100755 index 0000000..16a1172 --- /dev/null +++ b/2a_grass/00_fetch_s3_and_prepare_run_dir.sh @@ -0,0 +1,113 @@ 
+#!/usr/bin/env bash +# 00_fetch_s3_and_prepare_run_dir.sh: fetch demo data from S3 and prepare run directory. +# Invoked by the 'get-demo-data' command (for users who do not have local data). +# All configuration is read from the workflow manifest or from environment variables set by the CLI. +# +# Required env (from CLI): +# RUN_DIR run directory (e.g. 2a_grass/run), relative to REPO_ROOT +# REPO_ROOT repo root (workflows directory) +# MANIFEST path to workflow_manifest.yaml +# COMMAND command name (e.g. get-demo-data) +# STEP_INDEX step index in that command (e.g. 0) +# +# Requires: yq (mikefarah/yq), aws CLI + +set -euo pipefail + +RUN_DIR="${RUN_DIR:?RUN_DIR is required}" +REPO_ROOT="${REPO_ROOT:?REPO_ROOT is required}" +MANIFEST="${MANIFEST:?MANIFEST is required}" +COMMAND="${COMMAND:-prepare}" +STEP_INDEX="${STEP_INDEX:-0}" + +if [[ ! -f "$MANIFEST" ]]; then + echo "00_fetch_s3_and_prepare_run_dir: Manifest not found: $MANIFEST" >&2 + exit 1 +fi + +if ! command -v yq &>/dev/null; then + echo "00_fetch_s3_and_prepare_run_dir: yq is required to read the manifest." >&2 + exit 1 +fi + +cd "$REPO_ROOT" + +# Resolve a path relative to run_dir (RUN_DIR may be absolute or relative to REPO_ROOT). 
+resolve_run_path() { + if [[ "$RUN_DIR" == /* ]]; then + echo "${RUN_DIR}/${1}" + else + echo "${REPO_ROOT}/${RUN_DIR}/${1}" + fi +} + +# --- Read from manifest --- +s3_endpoint=$(yq eval '.s3.endpoint_url' "$MANIFEST") + +# Artifact: url + filename from s3.artifact_02 +artifact_url=$(yq eval '.s3.artifact_02.url' "$MANIFEST") +artifact_filename=$(yq eval '.s3.artifact_02.filename' "$MANIFEST") +artifact_s3_uri="${artifact_url}/${artifact_filename}" + +# LandTrendr TIFs: two S3 resources and two local path segments from paths.landtrendr_raw_files +median_url=$(yq eval '.s3.median_tif.url' "$MANIFEST") +median_filename=$(yq eval '.s3.median_tif.filename' "$MANIFEST") +stdv_url=$(yq eval '.s3.stdv_tif.url' "$MANIFEST") +stdv_filename=$(yq eval '.s3.stdv_tif.filename' "$MANIFEST") +median_s3_uri="${median_url}/${median_filename}" +stdv_s3_uri="${stdv_url}/${stdv_filename}" + +landtrendr_paths_raw=$(yq eval '.paths.landtrendr_raw_files' "$MANIFEST") +# Split comma-separated; first segment = median, second = stdv +landtrendr_segment_1="${landtrendr_paths_raw%%,*}" +landtrendr_segment_2="${landtrendr_paths_raw#*,}" + +# Output path keys for this step: create these dirs (from manifest step.outputs) +output_keys=$(yq eval '.steps["'"$COMMAND"'"] | .['"$STEP_INDEX"'].outputs | .[]' "$MANIFEST" 2>/dev/null || true) + +# --- Create run directory and output dirs from manifest --- +echo "00_fetch_s3_and_prepare_run_dir: Creating run directory and output dirs from manifest" +mkdir -p "$RUN_DIR" + +while IFS= read -r path_key; do + [[ -z "$path_key" ]] && continue + path_value=$(yq eval '.paths["'"$path_key"'"]' "$MANIFEST" 2>/dev/null) + [[ -z "$path_value" || "$path_value" == "null" ]] && continue + resolved=$(resolve_run_path "$path_value") + mkdir -p "$resolved" +done <<< "$output_keys" + +# --- Download and extract artifact --- +if [[ -f "$artifact_filename" ]]; then + echo "00_fetch_s3_and_prepare_run_dir: Artifact tarball already present: $artifact_filename" +else + echo 
"00_fetch_s3_and_prepare_run_dir: Downloading artifact from S3" + aws s3 cp --endpoint-url "$s3_endpoint" "$artifact_s3_uri" "./$artifact_filename" +fi + +RUN_DIR_ABS=$(if [[ "$RUN_DIR" = /* ]]; then echo "$RUN_DIR"; else echo "$REPO_ROOT/$RUN_DIR"; fi) +echo "00_fetch_s3_and_prepare_run_dir: Extracting artifact into run directory" +tar -xzf "$artifact_filename" -C "$RUN_DIR_ABS" + +# --- Download LandTrendr TIFs if not present (paths from manifest: first=median, second=stdv) --- +seg1=$(echo "$landtrendr_segment_1" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') +seg2=$(echo "$landtrendr_segment_2" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') + +download_tif() { + local seg="$1" + local s3_uri="$2" + local label="$3" + [[ -z "$seg" ]] && return 0 + resolved=$(resolve_run_path "$seg") + if [[ -f "$resolved" ]]; then + echo "00_fetch_s3_and_prepare_run_dir: Already present: $resolved" + else + mkdir -p "$(dirname "$resolved")" + echo "00_fetch_s3_and_prepare_run_dir: Downloading $label from S3" + aws s3 cp --endpoint-url "$s3_endpoint" "$s3_uri" "$resolved" + fi +} +download_tif "$seg1" "$median_s3_uri" "median TIF" +download_tif "$seg2" "$stdv_s3_uri" "stdv TIF" + +echo "00_fetch_s3_and_prepare_run_dir: Done." diff --git a/2a_grass/example_user_config.yaml b/2a_grass/example_user_config.yaml new file mode 100644 index 0000000..f1fcbfb --- /dev/null +++ b/2a_grass/example_user_config.yaml @@ -0,0 +1,29 @@ +# Example user-facing config for 2a_grass workflows. +# Pass with: ./magic-ensemble --config workflows/2a_grass/example_user_config.yaml +# +# This file contains only overridable settings. Fixed paths, S3 resources, and +# step I/O are defined in workflow_manifest.yaml (do not put those here). + +# Run directory: where outputs and run-specific data live. +# Relative to the CWD where you invoke the CLI, unless you use an absolute path. +run_dir: "2a_grass/run" + +# Dates used by prepare and run-ensembles. 
+start_date: "2016-01-01" +end_date: "2023-12-31" +run_LAI_date: "2016-07-01" + +# Ensemble sizes. +n_ens: 20 +n_met: 10 +ic_ensemble_size: 100 + +# Parallelism (e.g. for step 01 --n_cores). +n_workers: 1 + +# Optional: distributed compute adapter (for future use with Slurm/Apptainer). +# distributed_compute_adapter: +# name: "localhost" +# qsub: "sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif" +# qsub_jobid: "Submitted batch job ([0-9]+)" +# qstat: 'if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi' diff --git a/2a_grass/workflow_manifest.yaml b/2a_grass/workflow_manifest.yaml new file mode 100644 index 0000000..ef48585 --- /dev/null +++ b/2a_grass/workflow_manifest.yaml @@ -0,0 +1,87 @@ +# Workflow manifest: fixed paths and step I/O (internal, not user-facing). +# CLI loads this automatically; do not pass via --config. +# +# Paths: All entries under 'paths' are inside the run directory (no paths outside run_dir). +# Keys are referenced by name in steps (inputs/outputs). At runtime the CLI resolves +# each path as run_dir + "/" + value. +# +# Steps: Each command has a list of step objects. Each step has: +# script: R script path (relative to repo root) +# r_libraries: R packages to check before running this script +# inputs: List of path keys (from 'paths') this script reads (local paths only) +# outputs: List of path keys this script creates or writes + +# S3 resources (not in user config). Remote resources are localized before R runs. 
+s3: + endpoint_url: "https://s3.garage.ccmmf.ncsa.cloud" + bucket: "carb" + artifact_02: + url: "s3://carb/data/workflows/phase_2a" + filename: "ccmmf_phase_2a_input_artifacts.tgz" + median_tif: + url: "s3://carb/data_raw" + filename: "ca_biomassfiaald_2016_median.tif" + stdv_tif: + url: "s3://carb/data_raw" + filename: "ca_biomassfiaald_2016_stdv.tif" + +# Apptainer (not in user config) +apptainer: + remote: + url: "docker://hdpriest0uiuc/" + container: + name: "sipnet-carb" + tag: "develop" + sif: "sipnet-carb_develop.sif" + +# Path definitions: all contained within the run directory. +# Values are relative to run_dir; CLI resolves as run_dir + "/" + value. +paths: + site_info_file: "site_info.csv" + site_sipnet_met_path: "data/ERA5_SIPNET" + site_era5_path: "data_raw/ERA5_nc" + field_shape_path: "data_raw/dwr_map/i15_Crop_Mapping_2018.gdb" + data_dir: "data/IC_prep" + ic_outdir: "IC_files" + pft_dir: "pfts" + landtrendr_raw_files: "data_raw/ca_biomassfiaald_2016_median.tif,data_raw/ca_biomassfiaald_2016_stdv.tif" + site_file: "site_info.csv" + template_file: "template.xml" + output_file: "settings.xml" + met_dir: "data/ERA5_SIPNET" + ic_dir: "IC_files" + settings_xml: "settings.xml" + +# Fixed workflow values (not user overrides) +params_from_pft: "SLA,leafC" +additional_params: "varname=wood_carbon_fraction,distn=norm,parama=0.48,paramb=0.005" + +# Steps per command: script path, R libs to check (empty for shell scripts), input/output path keys +steps: + get-demo-data: + - script: "2a_grass/00_fetch_s3_and_prepare_run_dir.sh" + r_libraries: [] + inputs: [] + outputs: [data_dir, ic_outdir, site_sipnet_met_path] + + prepare: + - script: "2a_grass/01_ERA5_nc_to_clim.R" + r_libraries: [future, furrr] + inputs: [site_info_file, site_era5_path] + outputs: [site_sipnet_met_path] + + - script: "2a_grass/02_ic_build.R" + r_libraries: [tidyverse] + inputs: [site_info_file, field_shape_path, pft_dir, data_dir, landtrendr_raw_files] + outputs: [ic_outdir, data_dir] + + - 
script: "2a_grass/03_xml_build.R" + r_libraries: [PEcAn.settings] + inputs: [site_file, template_file, ic_dir, met_dir] + outputs: [output_file] + + run-ensembles: + - script: "2a_grass/04_run_model.R" + r_libraries: [PEcAn.all] + inputs: [settings_xml] + outputs: [] diff --git a/magic-ensemble b/magic-ensemble new file mode 100755 index 0000000..96c05df --- /dev/null +++ b/magic-ensemble @@ -0,0 +1,338 @@ +#!/usr/bin/env bash +# magic-ensemble: minimal CLI for workflows (2a_grass). +# Usage: ./magic-ensemble [--dry-run] [--verbose] [--config ] +# Commands: help | get-demo-data | prepare | run-ensembles + +set -euo pipefail + +# --- Repo root, manifest, and invocation CWD --- +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$SCRIPT_DIR" +MANIFEST="${REPO_ROOT}/2a_grass/workflow_manifest.yaml" +INVOCATION_CWD="${INVOCATION_CWD:-$(pwd)}" + +usage() { + cat <<'EOF' +Usage: ./magic-ensemble [global options] + +Commands: + help Print this usage and help (no scripts run). + get-demo-data Fetch demo data from S3 and create run directory (for users without local data). + prepare Run preparation steps: 01 (ERA5→clim), 02 (IC build), 03 (XML build). + run-ensembles Run step 04 (run model) using existing settings.xml and prepared inputs. + +Global options (after command): + --dry-run Do not run R scripts; print what would be run and run pre-execution checks. + --verbose Echo each Rscript command before running. + --config Path to user YAML config (overridable keys only; fixed paths are in workflow manifest). + +Examples: + ./magic-ensemble help + ./magic-ensemble get-demo-data --config my_config.yaml + ./magic-ensemble prepare --dry-run + ./magic-ensemble prepare --config my_config.yaml --verbose + ./magic-ensemble run-ensembles --config my_config.yaml +EOF +} + +# --- Require yq (mikefarah/yq, jq-style) --- +require_yq() { + if ! command -v yq &>/dev/null; then + echo "magic-ensemble: yq is required to read YAML. 
Install mikefarah/yq: https://github.com/mikefarah/yq" >&2 + exit 1 + fi + if ! yq eval '.' "$MANIFEST" &>/dev/null; then + echo "magic-ensemble: Could not parse manifest with yq. This CLI requires mikefarah/yq (jq-style). Your 'yq' may be a different implementation." >&2 + exit 1 + fi +} + +# --- Parse arguments: command first, then global options --- +COMMAND="" +DRY_RUN=0 +VERBOSE=0 +CONFIG_FILE="" +while [[ $# -gt 0 ]]; do + case "$1" in + help|get-demo-data|prepare|run-ensembles) + if [[ -z "$COMMAND" ]]; then COMMAND="$1"; shift; continue; fi + ;; + --dry-run) DRY_RUN=1; shift; continue ;; + --verbose) VERBOSE=1; shift; continue ;; + --config) + if [[ $# -lt 2 ]]; then echo "magic-ensemble: --config requires ." >&2; usage >&2; exit 1; fi + CONFIG_FILE="$2"; shift 2; continue + ;; + -*) + echo "magic-ensemble: Unknown option: $1" >&2; usage >&2; exit 1 + ;; + *) + if [[ -z "$COMMAND" ]]; then COMMAND="$1"; shift; continue; fi + echo "magic-ensemble: Unexpected argument: $1" >&2; usage >&2; exit 1 + ;; + esac + shift +done + +# --- Help or no command --- +if [[ -z "$COMMAND" || "$COMMAND" == "help" ]]; then + usage + exit 0 +fi + +if [[ "$COMMAND" != "get-demo-data" && "$COMMAND" != "prepare" && "$COMMAND" != "run-ensembles" ]]; then + echo "magic-ensemble: Unknown command: $COMMAND" >&2 + usage >&2 + exit 1 +fi + +require_yq +if [[ ! 
-f "$MANIFEST" ]]; then + echo "magic-ensemble: Workflow manifest not found: $MANIFEST" >&2 + exit 1 +fi + +# --- Load effective config: manifest + optional user overrides --- +# User config may contain: run_dir, start_date, end_date, run_LAI_date, n_ens, n_met, ic_ensemble_size, n_workers +get_val() { + local key="$1" + local from_manifest="$2" + if [[ -n "$CONFIG_FILE" && -f "$CONFIG_FILE" ]]; then + local u + u=$(yq eval ".$key // .paths.$key // .dates.$key // .ensemble.$key // empty" "$CONFIG_FILE" 2>/dev/null) + if [[ -n "$u" && "$u" != "null" ]]; then + echo "$u" + return + fi + fi + echo "$from_manifest" +} + +# Read manifest paths and fixed values +p_site_info_file=$(yq eval '.paths.site_info_file' "$MANIFEST") +p_site_sipnet_met_path=$(yq eval '.paths.site_sipnet_met_path' "$MANIFEST") +p_site_era5_path=$(yq eval '.paths.site_era5_path' "$MANIFEST") +p_field_shape_path=$(yq eval '.paths.field_shape_path' "$MANIFEST") +p_data_dir=$(yq eval '.paths.data_dir' "$MANIFEST") +p_ic_outdir=$(yq eval '.paths.ic_outdir' "$MANIFEST") +p_pft_dir=$(yq eval '.paths.pft_dir' "$MANIFEST") +p_landtrendr_raw_files=$(yq eval '.paths.landtrendr_raw_files' "$MANIFEST") +p_site_file=$(yq eval '.paths.site_file' "$MANIFEST") +p_template_file=$(yq eval '.paths.template_file' "$MANIFEST") +p_output_file=$(yq eval '.paths.output_file' "$MANIFEST") +p_met_dir=$(yq eval '.paths.met_dir' "$MANIFEST") +p_ic_dir=$(yq eval '.paths.ic_dir' "$MANIFEST") +p_settings_xml=$(yq eval '.paths.settings_xml' "$MANIFEST") +params_from_pft=$(yq eval '.params_from_pft' "$MANIFEST") +additional_params=$(yq eval '.additional_params' "$MANIFEST") + +# Overridable defaults (manifest may not have these; use script defaults if not in user config) +run_dir_default="magic-ensemble-run-directory/" +start_date_default="2016-01-01" +end_date_default="2023-12-31" +run_LAI_date_default="2016-07-01" +n_ens_default="20" +n_met_default="10" +ic_ensemble_size_default="100" +n_workers_default="1" + +run_dir=$(get_val 
"run_dir" "$run_dir_default") +# If run_dir is not absolute, resolve relative to CWD where the CLI was invoked +if [[ "$run_dir" != /* ]]; then + run_dir="${INVOCATION_CWD}/${run_dir}" +fi +start_date=$(get_val "start_date" "$start_date_default") +end_date=$(get_val "end_date" "$end_date_default") +run_LAI_date=$(get_val "run_LAI_date" "$run_LAI_date_default") +n_ens=$(get_val "n_ens" "$n_ens_default") +n_met=$(get_val "n_met" "$n_met_default") +ic_ensemble_size=$(get_val "ic_ensemble_size" "$ic_ensemble_size_default") +n_workers=$(get_val "n_workers" "$n_workers_default") + +# Resolve manifest paths relative to run_dir (then relative to repo root). +# Effective path = run_dir / manifest_path so R (CWD=REPO_ROOT) sees the correct file. +resolve_path() { echo "${run_dir}/${1}"; } +site_info_file=$(resolve_path "$p_site_info_file") +site_sipnet_met_path=$(resolve_path "$p_site_sipnet_met_path") +site_era5_path=$(resolve_path "$p_site_era5_path") +field_shape_path=$(resolve_path "$p_field_shape_path") +data_dir=$(resolve_path "$p_data_dir") +ic_outdir=$(resolve_path "$p_ic_outdir") +pft_dir=$(resolve_path "$p_pft_dir") +landtrendr_raw_files=$(resolve_path "$p_landtrendr_raw_files") +site_file=$(resolve_path "$p_site_file") +template_file=$(resolve_path "$p_template_file") +output_file=$(resolve_path "$p_output_file") +met_dir=$(resolve_path "$p_met_dir") +ic_dir=$(resolve_path "$p_ic_dir") +settings_xml=$(resolve_path "$p_settings_xml") +# landtrendr_raw_files is comma-separated; resolve each segment +landtrendr_raw_files="" +while IFS= read -r segment; do + segment=$(echo "$segment" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') + [[ -z "$segment" ]] && continue + [[ -n "$landtrendr_raw_files" ]] && landtrendr_raw_files="${landtrendr_raw_files}," + landtrendr_raw_files="${landtrendr_raw_files}${run_dir}/${segment}" +done < <(yq eval '.paths.landtrendr_raw_files' "$MANIFEST" | tr ',' '\n') + +# --- Pre-execution: AWS S3 tools check --- +check_aws() { + if ! 
command -v aws &>/dev/null; then + echo "magic-ensemble: AWS CLI (aws) not found on PATH; required for S3 access." >&2 + exit 1 + fi +} + +# --- Get list of script paths for current command (from manifest steps) --- +get_steps() { + yq eval '.steps["'"$COMMAND"'"] | .[].script' "$MANIFEST" +} + +# --- Populate STEPS array for current command (from manifest) --- +get_steps_array() { + STEPS=() + while IFS= read -r s; do + [[ -n "$s" ]] && STEPS+=("$s") + done < <(get_steps) +} + +# --- R library check for step at index i (reads r_libraries from manifest step; skip if empty or .sh) --- +check_r_libs_for_step() { + local i="$1" + local script="${STEPS[i]}" + [[ "$script" == *.sh ]] && return 0 + local lib + while IFS= read -r lib; do + [[ -z "$lib" || "$lib" == "null" ]] && continue + if ! (cd "$REPO_ROOT" && Rscript -e "library(\"$lib\")") 2>/dev/null; then + echo "magic-ensemble: R library check failed: library(\"$lib\") not available. Install it or activate the correct environment." >&2 + exit 1 + fi + done < <(yq eval '.steps["'"$COMMAND"'"] | .['"$i"'].r_libraries | .[]?' "$MANIFEST" 2>/dev/null || true) +} + +# --- Dry-run: print scripts and optionally run checks --- +do_dry_run() { + echo "magic-ensemble: dry-run for command: $COMMAND" + echo "Would run the following scripts (CWD = $REPO_ROOT):" + while IFS= read -r script; do + [[ -z "$script" ]] && continue + script_path="${REPO_ROOT}/${script}" + if [[ -f "$script_path" ]]; then + echo " - $script (exists)" + else + echo " - $script (MISSING)" + fi + done < <(get_steps) + echo "" + echo "Pre-execution checks (R libraries, AWS CLI) can be run when not in dry-run." + exit 0 +} + +# --- Run R script with args; CWD = REPO_ROOT --- +run_script() { + local script="$1" + shift + local script_path="${REPO_ROOT}/${script}" + if [[ ! 
-f "$script_path" ]]; then + echo "magic-ensemble: Script not found: $script_path" >&2 + exit 1 + fi + if [[ $VERBOSE -eq 1 ]]; then + echo "Rscript $script_path $*" >&2 + fi + (cd "$REPO_ROOT" && Rscript "$script_path" "$@") +} + +# --- Run shell script; CWD = REPO_ROOT. Pass COMMAND and STEP_INDEX for manifest lookups. --- +run_shell_script() { + local script="$1" + local step_index="${2:-0}" + local script_path="${REPO_ROOT}/${script}" + if [[ ! -f "$script_path" ]]; then + echo "magic-ensemble: Script not found: $script_path" >&2 + exit 1 + fi + if [[ $VERBOSE -eq 1 ]]; then + echo "RUN_DIR=$run_dir REPO_ROOT=$REPO_ROOT MANIFEST=$MANIFEST COMMAND=$COMMAND STEP_INDEX=$step_index bash $script_path" >&2 + fi + (cd "$REPO_ROOT" && RUN_DIR="$run_dir" REPO_ROOT="$REPO_ROOT" MANIFEST="$MANIFEST" COMMAND="$COMMAND" STEP_INDEX="$step_index" bash "$script_path") +} + +# --- Get-demo-data: run steps from manifest (shell script only) --- +run_get_demo_data() { + get_steps_array + check_aws + for i in "${!STEPS[@]}"; do + check_r_libs_for_step "$i" + run_shell_script "${STEPS[i]}" "$i" + done +} + +# --- Prepare: run steps from manifest (01, 02, 03 with R args) --- +run_prepare() { + get_steps_array + check_aws + for i in "${!STEPS[@]}"; do + check_r_libs_for_step "$i" + done + + for i in "${!STEPS[@]}"; do + case "$i" in + 0) run_script "${STEPS[i]}" \ + --site_era5_path "$site_era5_path" \ + --site_sipnet_met_path "$site_sipnet_met_path" \ + --site_info_file "$site_info_file" \ + --start_date "$start_date" \ + --end_date "$end_date" \ + --n_cores "$n_workers" \ + --parallel_strategy "multisession" ;; + 1) run_script "${STEPS[i]}" \ + --site_info_path "$site_info_file" \ + --field_shape_path "$field_shape_path" \ + --ic_ensemble_size "$ic_ensemble_size" \ + --run_start_date "$start_date" \ + --run_LAI_date "$run_LAI_date" \ + --ic_outdir "$ic_outdir" \ + --data_dir "$data_dir" \ + --pft_dir "$pft_dir" \ + --params_read_from_pft "$params_from_pft" \ + --landtrendr_raw_files 
"$landtrendr_raw_files" \ + --additional_params "$additional_params" ;; + 2) run_script "${STEPS[i]}" \ + --n_ens "$n_ens" \ + --n_met "$n_met" \ + --start_date "$start_date" \ + --end_date "$end_date" \ + --ic_dir "$ic_dir" \ + --met_dir "$met_dir" \ + --site_file "$site_file" \ + --template_file "$template_file" \ + --output_file "$output_file" ;; + *) echo "magic-ensemble: No argument mapping for prepare step index $i" >&2; exit 1 ;; + esac + done +} + +# --- Run-ensembles: run single step from manifest (04) --- +run_run_ensembles() { + get_steps_array + check_aws + check_r_libs_for_step 0 + + run_script "${STEPS[0]}" \ + --settings "$settings_xml" \ + --continue "FALSE" +} + +# --- Main --- +if [[ $DRY_RUN -eq 1 ]]; then + do_dry_run +fi + +case "$COMMAND" in + get-demo-data) run_get_demo_data ;; + prepare) run_prepare ;; + run-ensembles) run_run_ensembles ;; + *) echo "magic-ensemble: Unknown command: $COMMAND" >&2; exit 1 ;; +esac From f6f0f7bd5cbb4957ee7c39cbc1ff9d8fc753e463 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Fri, 20 Feb 2026 16:06:45 -0600 Subject: [PATCH 4/7] Enhance run directory handling in scripts: update magic-ensemble to display run directory during dry-run and modify 00_fetch_s3_and_prepare_run_dir.sh to resolve and use absolute run directory for artifact downloads and extractions. 
--- 2a_grass/00_fetch_s3_and_prepare_run_dir.sh | 20 +++++++++++--------- magic-ensemble | 1 + 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/2a_grass/00_fetch_s3_and_prepare_run_dir.sh b/2a_grass/00_fetch_s3_and_prepare_run_dir.sh index 16a1172..d4a4bb9 100755 --- a/2a_grass/00_fetch_s3_and_prepare_run_dir.sh +++ b/2a_grass/00_fetch_s3_and_prepare_run_dir.sh @@ -65,9 +65,12 @@ landtrendr_segment_2="${landtrendr_paths_raw#*,}" # Output path keys for this step: create these dirs (from manifest step.outputs) output_keys=$(yq eval '.steps["'"$COMMAND"'"] | .['"$STEP_INDEX"'].outputs | .[]' "$MANIFEST" 2>/dev/null || true) +# --- Resolve absolute run directory (for downloads and extract) --- +RUN_DIR_ABS=$(if [[ "$RUN_DIR" = /* ]]; then echo "$RUN_DIR"; else echo "$REPO_ROOT/$RUN_DIR"; fi) + # --- Create run directory and output dirs from manifest --- echo "00_fetch_s3_and_prepare_run_dir: Creating run directory and output dirs from manifest" -mkdir -p "$RUN_DIR" +mkdir -p "$RUN_DIR_ABS" while IFS= read -r path_key; do [[ -z "$path_key" ]] && continue @@ -77,17 +80,16 @@ while IFS= read -r path_key; do mkdir -p "$resolved" done <<< "$output_keys" -# --- Download and extract artifact --- -if [[ -f "$artifact_filename" ]]; then - echo "00_fetch_s3_and_prepare_run_dir: Artifact tarball already present: $artifact_filename" +# --- Download artifact tarball into run directory and extract --- +artifact_local="${RUN_DIR_ABS}/${artifact_filename}" +if [[ -f "$artifact_local" ]]; then + echo "00_fetch_s3_and_prepare_run_dir: Artifact tarball already present in run dir: $artifact_local" else - echo "00_fetch_s3_and_prepare_run_dir: Downloading artifact from S3" - aws s3 cp --endpoint-url "$s3_endpoint" "$artifact_s3_uri" "./$artifact_filename" + echo "00_fetch_s3_and_prepare_run_dir: Downloading artifact from S3 into run directory" + aws s3 cp --endpoint-url "$s3_endpoint" "$artifact_s3_uri" "$artifact_local" fi - -RUN_DIR_ABS=$(if [[ "$RUN_DIR" = /* ]]; then echo 
"$RUN_DIR"; else echo "$REPO_ROOT/$RUN_DIR"; fi) echo "00_fetch_s3_and_prepare_run_dir: Extracting artifact into run directory" -tar -xzf "$artifact_filename" -C "$RUN_DIR_ABS" +tar -xzf "$artifact_local" -C "$RUN_DIR_ABS" # --- Download LandTrendr TIFs if not present (paths from manifest: first=median, second=stdv) --- seg1=$(echo "$landtrendr_segment_1" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') diff --git a/magic-ensemble b/magic-ensemble index 96c05df..bd534e8 100755 --- a/magic-ensemble +++ b/magic-ensemble @@ -214,6 +214,7 @@ check_r_libs_for_step() { # --- Dry-run: print scripts and optionally run checks --- do_dry_run() { echo "magic-ensemble: dry-run for command: $COMMAND" + echo "Run directory (for this execution): $run_dir" echo "Would run the following scripts (CWD = $REPO_ROOT):" while IFS= read -r script; do [[ -z "$script" ]] && continue From 2a4c4a647921e4555cf21fb41d5f2335c14e85fd Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Wed, 18 Mar 2026 19:41:16 +0000 Subject: [PATCH 5/7] Add Apptainer support, dispatch configuration, and external input staging to workflow CLI - magic-ensemble: --config is now required; supports use_apptainer (run prepare steps inside a container) and pecan_dispatch (select how ensemble members are submitted/executed) - workflow_manifest.yaml: defines available dispatch modes (local-gnu-parallel, slurm-dispatch) with appropriate host XML for native and apptainer execution; S3 resources consolidated - Prep scripts: accept CLI flags instead of env vars; stage user-provided external files (e.g. 
template.xml) into the run directory before prepare steps run - tools/patch_xml.py: utility to patch elements in PEcAn XML config files in-place - 01_ERA5_nc_to_clim.R: ERA5 met inputs now looked up by grid cell center rather than site id - example_user_config.yaml: documents new user-facing options (use_apptainer, pecan_dispatch, external_paths) Relates to: https://github.com/orgs/ccmmf/discussions/182 --- .gitignore | 2 + 2a_grass/00_fetch_s3_and_prepare_run_dir.sh | 154 +++++++-- 2a_grass/00_stage_external_inputs.sh | 194 +++++++++++ 2a_grass/01_ERA5_nc_to_clim.R | 21 +- 2a_grass/example_user_config.yaml | 28 +- 2a_grass/template.xml | 14 +- 2a_grass/workflow_manifest.yaml | 70 +++- magic-ensemble | 354 +++++++++++++++----- tools/patch_xml.py | 97 ++++++ 9 files changed, 798 insertions(+), 136 deletions(-) create mode 100644 2a_grass/00_stage_external_inputs.sh create mode 100644 tools/patch_xml.py diff --git a/.gitignore b/.gitignore index 0558174..540bcd8 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ # data files not marked test **/IC_files/** **/data/** +**/data_raw/** **/pfts/** # R-specific files @@ -43,3 +44,4 @@ Thumbs.db # Temporary files *.tmp *.log +**/local_dev_notes/** diff --git a/2a_grass/00_fetch_s3_and_prepare_run_dir.sh b/2a_grass/00_fetch_s3_and_prepare_run_dir.sh index d4a4bb9..13afd3a 100755 --- a/2a_grass/00_fetch_s3_and_prepare_run_dir.sh +++ b/2a_grass/00_fetch_s3_and_prepare_run_dir.sh @@ -1,24 +1,98 @@ #!/usr/bin/env bash # 00_fetch_s3_and_prepare_run_dir.sh: fetch demo data from S3 and prepare run directory. # Invoked by the 'get-demo-data' command (for users who do not have local data). -# All configuration is read from the workflow manifest or from environment variables set by the CLI. -# -# Required env (from CLI): -# RUN_DIR run directory (e.g. 2a_grass/run), relative to REPO_ROOT -# REPO_ROOT repo root (workflows directory) -# MANIFEST path to workflow_manifest.yaml -# COMMAND command name (e.g. 
get-demo-data) -# STEP_INDEX step index in that command (e.g. 0) +# S3 URLs and path keys come from the workflow manifest; run dir and paths are passed as arguments. # # Requires: yq (mikefarah/yq), aws CLI +# +# Options (see --help): --repo-root (required); --manifest optional, defaults to /2a_grass/workflow_manifest.yaml set -euo pipefail -RUN_DIR="${RUN_DIR:?RUN_DIR is required}" -REPO_ROOT="${REPO_ROOT:?REPO_ROOT is required}" -MANIFEST="${MANIFEST:?MANIFEST is required}" -COMMAND="${COMMAND:-prepare}" -STEP_INDEX="${STEP_INDEX:-0}" +usage() { + cat <<'EOF' +Usage: 00_fetch_s3_and_prepare_run_dir.sh [OPTIONS] + +Fetch demo data from S3 and prepare the run directory. S3 URLs and path keys are +read from the workflow manifest. Run directory is either from --run-dir or from +run_dir in the file given by --config (relative paths resolved with --invocation-cwd). + +Required: + --repo-root PATH Repo root (workflows directory). Script changes to this directory. + +Run directory (one of): + --run-dir PATH Run directory (absolute, or relative to --repo-root). + --config PATH User YAML config file; script reads run_dir from it (use with --invocation-cwd). + +Optional: + --manifest PATH Path to workflow_manifest.yaml (default: /2a_grass/workflow_manifest.yaml). + --invocation-cwd PATH Required when using --config with a relative run_dir. Paths reported relative to this. + --command NAME Command name for manifest step lookup (default: get-demo-data). + --step-index N Step index in that command (default: 0). + -h, --help Print this help and exit. +EOF +} + +RUN_DIR="" +CONFIG_FILE="" +REPO_ROOT="" +MANIFEST="" +COMMAND="get-demo-data" +STEP_INDEX="0" +INVOCATION_CWD="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --run-dir) [[ $# -lt 2 ]] && { echo "00_fetch_s3_and_prepare_run_dir: --run-dir requires PATH." >&2; usage >&2; exit 1; }; RUN_DIR="$2"; shift 2 ;; + --config) [[ $# -lt 2 ]] && { echo "00_fetch_s3_and_prepare_run_dir: --config requires PATH." 
>&2; usage >&2; exit 1; }; CONFIG_FILE="$2"; shift 2 ;; + --repo-root) [[ $# -lt 2 ]] && { echo "00_fetch_s3_and_prepare_run_dir: --repo-root requires PATH." >&2; usage >&2; exit 1; }; REPO_ROOT="$2"; shift 2 ;; + --manifest) [[ $# -lt 2 ]] && { echo "00_fetch_s3_and_prepare_run_dir: --manifest requires PATH." >&2; usage >&2; exit 1; }; MANIFEST="$2"; shift 2 ;; + --command) [[ $# -lt 2 ]] && { echo "00_fetch_s3_and_prepare_run_dir: --command requires NAME." >&2; usage >&2; exit 1; }; COMMAND="$2"; shift 2 ;; + --step-index) [[ $# -lt 2 ]] && { echo "00_fetch_s3_and_prepare_run_dir: --step-index requires N." >&2; usage >&2; exit 1; }; STEP_INDEX="$2"; shift 2 ;; + --invocation-cwd) [[ $# -lt 2 ]] && { echo "00_fetch_s3_and_prepare_run_dir: --invocation-cwd requires PATH." >&2; usage >&2; exit 1; }; INVOCATION_CWD="$2"; shift 2 ;; + -h|--help) usage; exit 0 ;; + *) echo "00_fetch_s3_and_prepare_run_dir: Unknown option: $1" >&2; usage >&2; exit 1 ;; + esac +done + +if [[ -z "$REPO_ROOT" ]]; then echo "00_fetch_s3_and_prepare_run_dir: --repo-root is required." >&2; usage >&2; exit 1; fi +if [[ -z "$MANIFEST" ]]; then + MANIFEST="${REPO_ROOT}/2a_grass/workflow_manifest.yaml" +fi + +# Run directory: from --run-dir or from config file +if [[ -n "$CONFIG_FILE" ]]; then + if [[ ! -f "$CONFIG_FILE" ]]; then + echo "00_fetch_s3_and_prepare_run_dir: Config file not found: $CONFIG_FILE" >&2 + exit 1 + fi + RUN_DIR=$(yq eval '.run_dir' "$CONFIG_FILE") || { echo "00_fetch_s3_and_prepare_run_dir: yq failed to read .run_dir from config: $CONFIG_FILE" >&2; exit 1; } + if [[ -z "$RUN_DIR" || "$RUN_DIR" == "null" ]]; then + echo "00_fetch_s3_and_prepare_run_dir: run_dir not found or empty in config (expected .run_dir): $CONFIG_FILE" >&2 + exit 1 + fi + if [[ "$RUN_DIR" != /* ]]; then + if [[ -z "$INVOCATION_CWD" ]]; then + echo "00_fetch_s3_and_prepare_run_dir: --invocation-cwd is required when run_dir in config is relative." 
>&2 + exit 1 + fi + RUN_DIR="${INVOCATION_CWD}/${RUN_DIR}" + fi +elif [[ -z "$RUN_DIR" ]]; then + echo "00_fetch_s3_and_prepare_run_dir: Provide --run-dir or --config (with run_dir in the config file)." >&2 + usage >&2 + exit 1 +fi + +# Show path for user: relative to INVOCATION_CWD if under it, else absolute +report_path() { + local abs_path="$1" + if [[ -n "$INVOCATION_CWD" && "$abs_path" == "$INVOCATION_CWD"/* ]]; then + echo "${abs_path#"$INVOCATION_CWD"/}" + else + echo "$abs_path" + fi +} if [[ ! -f "$MANIFEST" ]]; then echo "00_fetch_s3_and_prepare_run_dir: Manifest not found: $MANIFEST" >&2 @@ -41,21 +115,37 @@ resolve_run_path() { fi } -# --- Read from manifest --- +# --- Read from manifest (endpoint, bucket, and per-resource key_prefix + filename) --- s3_endpoint=$(yq eval '.s3.endpoint_url' "$MANIFEST") +s3_bucket=$(yq eval '.s3.bucket' "$MANIFEST") + +# Build S3 key from key_prefix + filename (key_prefix may be empty or null from yq) +s3_key() { + local prefix="$1" + local name="$2" + [[ "$prefix" == "null" || -z "$prefix" ]] && prefix="" + if [[ -n "$prefix" ]]; then + echo "${prefix}/${name}" + else + echo "$name" + fi +} -# Artifact: url + filename from s3.artifact_02 -artifact_url=$(yq eval '.s3.artifact_02.url' "$MANIFEST") +# Artifact: bucket + key from s3.artifact_02 +artifact_key_prefix=$(yq eval '.s3.artifact_02.key_prefix' "$MANIFEST") artifact_filename=$(yq eval '.s3.artifact_02.filename' "$MANIFEST") -artifact_s3_uri="${artifact_url}/${artifact_filename}" +artifact_s3_key=$(s3_key "$artifact_key_prefix" "$artifact_filename") +artifact_s3_uri="s3://${s3_bucket}/${artifact_s3_key}" -# LandTrendr TIFs: two S3 resources and two local path segments from paths.landtrendr_raw_files -median_url=$(yq eval '.s3.median_tif.url' "$MANIFEST") +# LandTrendr TIFs: bucket + key from s3.median_tif and s3.stdv_tif +median_key_prefix=$(yq eval '.s3.median_tif.key_prefix' "$MANIFEST") median_filename=$(yq eval '.s3.median_tif.filename' "$MANIFEST") 
-stdv_url=$(yq eval '.s3.stdv_tif.url' "$MANIFEST") +stdv_key_prefix=$(yq eval '.s3.stdv_tif.key_prefix' "$MANIFEST") stdv_filename=$(yq eval '.s3.stdv_tif.filename' "$MANIFEST") -median_s3_uri="${median_url}/${median_filename}" -stdv_s3_uri="${stdv_url}/${stdv_filename}" +median_s3_key=$(s3_key "$median_key_prefix" "$median_filename") +stdv_s3_key=$(s3_key "$stdv_key_prefix" "$stdv_filename") +median_s3_uri="s3://${s3_bucket}/${median_s3_key}" +stdv_s3_uri="s3://${s3_bucket}/${stdv_s3_key}" landtrendr_paths_raw=$(yq eval '.paths.landtrendr_raw_files' "$MANIFEST") # Split comma-separated; first segment = median, second = stdv @@ -68,9 +158,11 @@ output_keys=$(yq eval '.steps["'"$COMMAND"'"] | .['"$STEP_INDEX"'].outputs | .[] # --- Resolve absolute run directory (for downloads and extract) --- RUN_DIR_ABS=$(if [[ "$RUN_DIR" = /* ]]; then echo "$RUN_DIR"; else echo "$REPO_ROOT/$RUN_DIR"; fi) -# --- Create run directory and output dirs from manifest --- +# --- Create run directory and canonicalize so paths have no ".." 
(clean aws/tar output) --- echo "00_fetch_s3_and_prepare_run_dir: Creating run directory and output dirs from manifest" mkdir -p "$RUN_DIR_ABS" +RUN_DIR_ABS=$(cd "$RUN_DIR_ABS" && pwd) +RUN_DIR="$RUN_DIR_ABS" while IFS= read -r path_key; do [[ -z "$path_key" ]] && continue @@ -82,11 +174,13 @@ done <<< "$output_keys" # --- Download artifact tarball into run directory and extract --- artifact_local="${RUN_DIR_ABS}/${artifact_filename}" +artifact_report=$(report_path "$artifact_local") if [[ -f "$artifact_local" ]]; then - echo "00_fetch_s3_and_prepare_run_dir: Artifact tarball already present in run dir: $artifact_local" + echo "00_fetch_s3_and_prepare_run_dir: Artifact tarball already present in run dir: $artifact_report" else echo "00_fetch_s3_and_prepare_run_dir: Downloading artifact from S3 into run directory" - aws s3 cp --endpoint-url "$s3_endpoint" "$artifact_s3_uri" "$artifact_local" + echo "00_fetch_s3_and_prepare_run_dir: Saving to: $artifact_report" + (cd "$RUN_DIR_ABS" && aws s3 cp --endpoint-url "$s3_endpoint" "$artifact_s3_uri" "$artifact_filename") fi echo "00_fetch_s3_and_prepare_run_dir: Extracting artifact into run directory" tar -xzf "$artifact_local" -C "$RUN_DIR_ABS" @@ -102,11 +196,15 @@ download_tif() { [[ -z "$seg" ]] && return 0 resolved=$(resolve_run_path "$seg") if [[ -f "$resolved" ]]; then - echo "00_fetch_s3_and_prepare_run_dir: Already present: $resolved" + echo "00_fetch_s3_and_prepare_run_dir: Already present: $(report_path "$resolved")" else - mkdir -p "$(dirname "$resolved")" + local dest_dir dest_name + dest_dir=$(dirname "$resolved") + dest_name=$(basename "$resolved") + mkdir -p "$dest_dir" echo "00_fetch_s3_and_prepare_run_dir: Downloading $label from S3" - aws s3 cp --endpoint-url "$s3_endpoint" "$s3_uri" "$resolved" + echo "00_fetch_s3_and_prepare_run_dir: Saving to: $(report_path "$resolved")" + (cd "$dest_dir" && aws s3 cp --endpoint-url "$s3_endpoint" "$s3_uri" "$dest_name") fi } download_tif "$seg1" "$median_s3_uri" 
"median TIF" diff --git a/2a_grass/00_stage_external_inputs.sh b/2a_grass/00_stage_external_inputs.sh new file mode 100644 index 0000000..51617ef --- /dev/null +++ b/2a_grass/00_stage_external_inputs.sh @@ -0,0 +1,194 @@ +#!/usr/bin/env bash +# 00_stage_external_inputs.sh: create run directory and stage user external inputs. +# Invoked as step 00 of the 'prepare' command. It: +# - Ensures the run directory exists. +# - Copies user-provided external files (from config.external_paths) into +# the run directory so they are available to the workflow. +# +# Requires: yq (mikefarah/yq) +# +# Options (see --help): --repo-root (required); --manifest optional, currently +# unused for staging; defaults to /2a_grass/workflow_manifest.yaml. +# Run directory is either from --run-dir or from run_dir in the file given by +# --config (relative paths resolved with --invocation-cwd). external_paths +# entries are resolved from --invocation-cwd when relative. + +set -euo pipefail + +usage() { + cat <<'EOF' +Usage: 00_stage_external_inputs.sh [OPTIONS] + +Create the run directory (if needed) and copy user-provided external files +from the config's external_paths section into the run directory so they are +available to the workflow. + +Required: + --repo-root PATH Repo root (workflows directory). Script changes to this directory. + +Run directory (one of): + --run-dir PATH Run directory (absolute, or relative to --repo-root). + --config PATH User YAML config file; script reads run_dir from it (use with --invocation-cwd). + +Optional: + --manifest PATH Path to workflow_manifest.yaml (default: /2a_grass/workflow_manifest.yaml). (Currently unused.) + --invocation-cwd PATH Required when using --config with a relative run_dir or relative external_paths. + -h, --help Print this help and exit. 
+EOF +} + +RUN_DIR="" +CONFIG_FILE="" +REPO_ROOT="" +MANIFEST="" +INVOCATION_CWD="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --run-dir) + [[ $# -lt 2 ]] && { echo "00_stage_external_inputs: --run-dir requires PATH." >&2; usage >&2; exit 1; } + RUN_DIR="$2"; shift 2 ;; + --config) + [[ $# -lt 2 ]] && { echo "00_stage_external_inputs: --config requires PATH." >&2; usage >&2; exit 1; } + CONFIG_FILE="$2"; shift 2 ;; + --repo-root) + [[ $# -lt 2 ]] && { echo "00_stage_external_inputs: --repo-root requires PATH." >&2; usage >&2; exit 1; } + REPO_ROOT="$2"; shift 2 ;; + --manifest) + [[ $# -lt 2 ]] && { echo "00_stage_external_inputs: --manifest requires PATH." >&2; usage >&2; exit 1; } + MANIFEST="$2"; shift 2 ;; + --invocation-cwd) + [[ $# -lt 2 ]] && { echo "00_stage_external_inputs: --invocation-cwd requires PATH." >&2; usage >&2; exit 1; } + INVOCATION_CWD="$2"; shift 2 ;; + -h|--help) + usage; exit 0 ;; + *) + echo "00_stage_external_inputs: Unknown option: $1" >&2 + usage >&2 + exit 1 ;; + esac +done + +if [[ -z "$REPO_ROOT" ]]; then + echo "00_stage_external_inputs: --repo-root is required." >&2 + usage >&2 + exit 1 +fi + +if [[ -z "$MANIFEST" ]]; then + MANIFEST="${REPO_ROOT}/2a_grass/workflow_manifest.yaml" +fi + +# Run directory: from --run-dir or from config file +if [[ -n "$CONFIG_FILE" ]]; then + if [[ ! -f "$CONFIG_FILE" ]]; then + echo "00_stage_external_inputs: Config file not found: $CONFIG_FILE" >&2 + exit 1 + fi + RUN_DIR=$(yq eval '.run_dir' "$CONFIG_FILE") || { + echo "00_stage_external_inputs: yq failed to read .run_dir from config: $CONFIG_FILE" >&2 + exit 1 + } + if [[ -z "$RUN_DIR" || "$RUN_DIR" == "null" ]]; then + echo "00_stage_external_inputs: run_dir not found or empty in config (expected .run_dir): $CONFIG_FILE" >&2 + exit 1 + fi + if [[ "$RUN_DIR" != /* ]]; then + if [[ -z "$INVOCATION_CWD" ]]; then + echo "00_stage_external_inputs: --invocation-cwd is required when run_dir in config is relative." 
>&2 + exit 1 + fi + RUN_DIR="${INVOCATION_CWD}/${RUN_DIR}" + fi +elif [[ -z "$RUN_DIR" ]]; then + echo "00_stage_external_inputs: Provide --run-dir or --config (with run_dir in the config file)." >&2 + usage >&2 + exit 1 +fi + +if [[ ! -f "$MANIFEST" ]]; then + echo "00_stage_external_inputs: Manifest not found: $MANIFEST" >&2 + exit 1 +fi + +if ! command -v yq &>/dev/null; then + echo "00_stage_external_inputs: yq is required to read the manifest and config." >&2 + exit 1 +fi + +cd "$REPO_ROOT" + +# Show path for user: relative to INVOCATION_CWD if under it, else absolute +report_path() { + local abs_path="$1" + if [[ -n "$INVOCATION_CWD" && "$abs_path" == "$INVOCATION_CWD"/* ]]; then + echo "${abs_path#"$INVOCATION_CWD"/}" + else + echo "$abs_path" + fi +} + +# Resolve an absolute run directory for staging. +RUN_DIR_ABS=$(if [[ "$RUN_DIR" = /* ]]; then echo "$RUN_DIR"; else echo "$REPO_ROOT/$RUN_DIR"; fi) + +echo "00_stage_external_inputs: Ensuring run directory exists" +mkdir -p "$RUN_DIR_ABS" +RUN_DIR_ABS=$(cd "$RUN_DIR_ABS" && pwd) +RUN_DIR="$RUN_DIR_ABS" +echo "00_stage_external_inputs: Run directory: $(report_path "$RUN_DIR_ABS")" + +# If no config or no external_paths, nothing more to do. +if [[ -z "$CONFIG_FILE" || ! -f "$CONFIG_FILE" ]]; then + echo "00_stage_external_inputs: No config file provided; only run directory was created." + echo "00_stage_external_inputs: Done." + exit 0 +fi + +# external_paths is a mapping from arbitrary keys to source file paths. +# We do not depend on manifest paths here; we simply copy each source file +# into the run directory (flattened by basename). +# Parse the YAML block output of .external_paths line by line (yq v4 outputs plain +# scalars without quotes). Split on first ": " to get key and value. 
+external_block=$(yq eval '.external_paths' "$CONFIG_FILE" 2>/dev/null || echo "null") +if [[ -z "$external_block" || "$external_block" == "null" || "$external_block" == "{}" ]]; then + echo "00_stage_external_inputs: No external_paths configured; nothing to copy." + echo "00_stage_external_inputs: Done." + exit 0 +fi + +echo "00_stage_external_inputs: Staging external inputs into run directory" + +while IFS= read -r line; do + [[ -z "$line" ]] && continue + # Split on first ": " — key is everything before, value everything after. + key="${line%%: *}" + src="${line#*: }" + [[ -z "$key" || "$key" == "$line" ]] && continue # no ": " found + [[ -z "$src" || "$src" == "null" ]] && continue + # Strip surrounding quotes that yq may preserve from the YAML source. + src="${src#\"}" ; src="${src%\"}" + + # Resolve source: absolute as-is, relative to INVOCATION_CWD otherwise. + if [[ "$src" != /* ]]; then + if [[ -z "$INVOCATION_CWD" ]]; then + echo "00_stage_external_inputs: --invocation-cwd is required when external_paths entries are relative." >&2 + exit 1 + fi + src="${INVOCATION_CWD}/${src}" + fi + if [[ ! -f "$src" ]]; then + echo "00_stage_external_inputs: external_paths.${key}: source file not found: ${src}" >&2 + exit 1 + fi + + # Destination: copy into the run directory using the source basename. + dest="${RUN_DIR_ABS}/$(basename "$src")" + dest_dir=$(dirname "$dest") + mkdir -p "$dest_dir" + + echo "00_stage_external_inputs: Copying $(report_path "$src") -> $(report_path "$dest")" + cp -f "$src" "$dest" +done <<< "$external_block" + +echo "00_stage_external_inputs: Done." 
+ diff --git a/2a_grass/01_ERA5_nc_to_clim.R b/2a_grass/01_ERA5_nc_to_clim.R index 0fc5e51..56ebf16 100755 --- a/2a_grass/01_ERA5_nc_to_clim.R +++ b/2a_grass/01_ERA5_nc_to_clim.R @@ -73,21 +73,36 @@ file_info <- site_info |> dplyr::rename(site_id = id) |> dplyr::cross_join(data.frame(ens_id = 1:10)) +# stopifnot( +# length(unique(file_info$id)) == nrow(file_info), +# all(file_info$lat > 0), # just to simplify grid naming below +# all(file_info$lon < 0) +# ) +file_info <- file_info |> + dplyr::mutate( + # match locations to half-degree ERA5 grid cell centers + # CAUTION: Calculation only correct when all lats are N and all lons are W! + ERA5_grid_cell = paste0( + ((lat + 0.25) %/% 0.5) * 0.5, "N_", + ((abs(lon) + 0.25) %/% 0.5) * 0.5, "W" + ) + ) if (!dir.exists(args$site_sipnet_met_path)) { dir.create(args$site_sipnet_met_path, recursive = TRUE) } furrr::future_pwalk( file_info, - function(site_id, start_date, end_date, ens_id, ...) { + function(site_id, start_date, end_date, ens_id, ERA5_grid_cell, ...) { PEcAn.SIPNET::met2model.SIPNET( in.path = file.path( args$site_era5_path, - paste("ERA5", site_id, ens_id, sep = "_") + # paste("ERA5", site_id, ens_id, sep = "_") + paste("ERA5", ERA5_grid_cell, ens_id, sep = "_") ), start_date = args$start_date, end_date = args$end_date, in.prefix = paste0("ERA5.", ens_id), - outfolder = file.path(args$site_sipnet_met_path, site_id) + outfolder = file.path(args$site_sipnet_met_path, ERA5_grid_cell) ) } ) diff --git a/2a_grass/example_user_config.yaml b/2a_grass/example_user_config.yaml index f1fcbfb..05fc1fe 100644 --- a/2a_grass/example_user_config.yaml +++ b/2a_grass/example_user_config.yaml @@ -1,12 +1,13 @@ # Example user-facing config for 2a_grass workflows. # Pass with: ./magic-ensemble --config workflows/2a_grass/example_user_config.yaml # -# This file contains only overridable settings. Fixed paths, S3 resources, and -# step I/O are defined in workflow_manifest.yaml (do not put those here). 
+# This file contains only overridable settings and user-provided external +# resources. Fixed paths, S3 resources, and step I/O are defined in +# workflow_manifest.yaml and are not overridden here. # Run directory: where outputs and run-specific data live. # Relative to the CWD where you invoke the CLI, unless you use an absolute path. -run_dir: "2a_grass/run" +run_dir: "config-based-rundir/" # Dates used by prepare and run-ensembles. start_date: "2016-01-01" @@ -18,12 +19,21 @@ n_ens: 20 n_met: 10 ic_ensemble_size: 100 +# User-provided external resources. + +# Absolute paths are used as-is; relative paths are resolved from the +# directory where you invoke ./magic-ensemble. +# paths in external_paths will be localized to the run_dir before the workflow is run. +external_paths: + template_file: "2a_grass/template.xml" + # Parallelism (e.g. for step 01 --n_cores). n_workers: 1 -# Optional: distributed compute adapter (for future use with Slurm/Apptainer). -# distributed_compute_adapter: -# name: "localhost" -# qsub: "sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif" -# qsub_jobid: "Submitted batch job ([0-9]+)" -# qstat: 'if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi' +# Run prepare steps inside Apptainer (single image from workflow manifest; pull-if-not-present). +# When true: 'module load apptainer' is attempted, then apptainer must be on PATH; SIF is +# pulled from manifest remote if not present. No user override of remote; local image only. +use_apptainer: true + +# Switch dispatch method for parallel execution (local-gnu-parallel, slurm-dispatch, etc.). 
+pecan_dispatch: local-gnu-parallel diff --git a/2a_grass/template.xml b/2a_grass/template.xml index 6d91f23..75c2c24 100644 --- a/2a_grass/template.xml +++ b/2a_grass/template.xml @@ -45,7 +45,7 @@ SIPNET git TRUE - sipnet.git + /usr/local/bin/sipnet.git cp data/events.in @RUNDIR@ @@ -67,15 +67,11 @@ localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi output/out output/run - - - squeue -j @JOBID@ || echo DONE - - parallel -j ${NCPUS:-1} --skip-first-line '{}/job.sh' :::: - - 1000 - diff --git a/2a_grass/workflow_manifest.yaml b/2a_grass/workflow_manifest.yaml index ef48585..e0b859e 100644 --- a/2a_grass/workflow_manifest.yaml +++ b/2a_grass/workflow_manifest.yaml @@ -3,7 +3,9 @@ # # Paths: All entries under 'paths' are inside the run directory (no paths outside run_dir). # Keys are referenced by name in steps (inputs/outputs). At runtime the CLI resolves -# each path as run_dir + "/" + value. +# each path as run_dir + "/" + value. These are fixed, non-overrideable locations; +# user configs may only supply external source files that are copied into these +# paths before 'prepare' runs. # # Steps: Each command has a list of step objects. Each step has: # script: R script path (relative to repo root) @@ -12,19 +14,70 @@ # outputs: List of path keys this script creates or writes # S3 resources (not in user config). Remote resources are localized before R runs. +# Stored as endpoint + bucket + per-resource key_prefix and filename (no full URLs). 
s3: endpoint_url: "https://s3.garage.ccmmf.ncsa.cloud" bucket: "carb" artifact_02: - url: "s3://carb/data/workflows/phase_2a" - filename: "ccmmf_phase_2a_input_artifacts.tgz" + key_prefix: "data_raw" + filename: "ensembles_data_artifact.tar.gz" median_tif: - url: "s3://carb/data_raw" + key_prefix: "data_raw" filename: "ca_biomassfiaald_2016_median.tif" stdv_tif: - url: "s3://carb/data_raw" + key_prefix: "data_raw" filename: "ca_biomassfiaald_2016_stdv.tif" +# Dispatch options for run-ensembles. The user config selects one by name via pecan_dispatch. +# host_xml is the complete ... block to inject into the staged template.xml +# before step 03 (xml_build.R) runs. Valid values for pecan_dispatch in user config are the +# keys listed here. +pecan_dispatch: + local-gnu-parallel: + description: "Run ensemble members locally using GNU parallel (no Slurm required)" + host_xml: | + + localhost + output/out + output/run + squeue -j @JOBID@ || echo DONE + + parallel -j ${NCPUS:-1} --skip-first-line '{}/job.sh' :::: + 1000 + + + host_xml_apptainer: | + + localhost + output/out + output/run + squeue -j @JOBID@ || echo DONE + + parallel -j ${NCPUS:-1} --skip-first-line 'apptainer run @SIF@ {}/job.sh' :::: + 1000 + + + slurm-dispatch: + description: "Submit ensemble members to Slurm via sbatch" + host_xml: | + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + host_xml_apptainer: | + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run @SIF@ + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + # Apptainer (not in user config) apptainer: remote: @@ -35,6 +88,8 @@ apptainer: sif: "sipnet-carb_develop.sif" # Path definitions: all contained within the run directory. +# note that these paths are the internal-workflow expected I/O paths. 
+# Users should not modify these values unless you know what you are doing. # Values are relative to run_dir; CLI resolves as run_dir + "/" + value. paths: site_info_file: "site_info.csv" @@ -65,6 +120,11 @@ steps: outputs: [data_dir, ic_outdir, site_sipnet_met_path] prepare: + - script: "2a_grass/00_stage_external_inputs.sh" + r_libraries: [] + inputs: [] + outputs: [] + - script: "2a_grass/01_ERA5_nc_to_clim.R" r_libraries: [future, furrr] inputs: [site_info_file, site_era5_path] diff --git a/magic-ensemble b/magic-ensemble index bd534e8..63894f4 100755 --- a/magic-ensemble +++ b/magic-ensemble @@ -1,6 +1,6 @@ #!/usr/bin/env bash # magic-ensemble: minimal CLI for workflows (2a_grass). -# Usage: ./magic-ensemble [--dry-run] [--verbose] [--config ] +# Usage: ./magic-ensemble [--verbose] --config # Commands: help | get-demo-data | prepare | run-ensembles set -euo pipefail @@ -18,18 +18,17 @@ Usage: ./magic-ensemble [global options] Commands: help Print this usage and help (no scripts run). get-demo-data Fetch demo data from S3 and create run directory (for users without local data). - prepare Run preparation steps: 01 (ERA5→clim), 02 (IC build), 03 (XML build). + prepare Run preparation steps: 00 (stage external inputs), 01 (ERA5→clim), 02 (IC build), 03 (XML build). run-ensembles Run step 04 (run model) using existing settings.xml and prepared inputs. Global options (after command): - --dry-run Do not run R scripts; print what would be run and run pre-execution checks. - --verbose Echo each Rscript command before running. - --config Path to user YAML config (overridable keys only; fixed paths are in workflow manifest). + --verbose Echo each command before running (including apptainer run when use_apptainer is true). + --config REQUIRED. Path to user YAML config (overridable scalar keys only; fixed paths are in workflow manifest). + Config may set use_apptainer: true to run prepare inside Apptainer (pull-if-not-present; apptainer only). 
Examples: ./magic-ensemble help ./magic-ensemble get-demo-data --config my_config.yaml - ./magic-ensemble prepare --dry-run ./magic-ensemble prepare --config my_config.yaml --verbose ./magic-ensemble run-ensembles --config my_config.yaml EOF @@ -49,7 +48,6 @@ require_yq() { # --- Parse arguments: command first, then global options --- COMMAND="" -DRY_RUN=0 VERBOSE=0 CONFIG_FILE="" while [[ $# -gt 0 ]]; do @@ -57,7 +55,6 @@ while [[ $# -gt 0 ]]; do help|get-demo-data|prepare|run-ensembles) if [[ -z "$COMMAND" ]]; then COMMAND="$1"; shift; continue; fi ;; - --dry-run) DRY_RUN=1; shift; continue ;; --verbose) VERBOSE=1; shift; continue ;; --config) if [[ $# -lt 2 ]]; then echo "magic-ensemble: --config requires ." >&2; usage >&2; exit 1; fi @@ -74,6 +71,11 @@ while [[ $# -gt 0 ]]; do shift done +# Resolve paths passed on the command line relative to CWD (use actual pwd so config is found) +if [[ -n "$CONFIG_FILE" && "$CONFIG_FILE" != /* ]]; then + CONFIG_FILE="$(pwd)/${CONFIG_FILE}" +fi + # --- Help or no command --- if [[ -z "$COMMAND" || "$COMMAND" == "help" ]]; then usage @@ -92,23 +94,37 @@ if [[ ! -f "$MANIFEST" ]]; then exit 1 fi +if [[ -z "$CONFIG_FILE" ]]; then + echo "magic-ensemble: --config is required for command '$COMMAND'." >&2 + usage >&2 + exit 1 +fi + +if [[ ! 
-f "$CONFIG_FILE" ]]; then + echo "magic-ensemble: Config file not found: $CONFIG_FILE" >&2 + exit 1 +fi + # --- Load effective config: manifest + optional user overrides --- -# User config may contain: run_dir, start_date, end_date, run_LAI_date, n_ens, n_met, ic_ensemble_size, n_workers +# User config may contain: run_dir, start_date, end_date, run_LAI_date, n_ens, n_met, ic_ensemble_size, n_workers, pecan_dispatch get_val() { local key="$1" local from_manifest="$2" if [[ -n "$CONFIG_FILE" && -f "$CONFIG_FILE" ]]; then local u - u=$(yq eval ".$key // .paths.$key // .dates.$key // .ensemble.$key // empty" "$CONFIG_FILE" 2>/dev/null) - if [[ -n "$u" && "$u" != "null" ]]; then - echo "$u" - return + u=$(yq eval ".$key" "$CONFIG_FILE" 2>/dev/null) + if [[ -z "$u" || "$u" == "null" ]]; then + echo "magic-ensemble: Config key '$key' is missing or empty in $CONFIG_FILE" >&2 + exit 1 fi + echo "$u" + return fi echo "$from_manifest" } # Read manifest paths and fixed values +# Manifest paths are internal-workflow I/O connections. they should not be altered. p_site_info_file=$(yq eval '.paths.site_info_file' "$MANIFEST") p_site_sipnet_met_path=$(yq eval '.paths.site_sipnet_met_path' "$MANIFEST") p_site_era5_path=$(yq eval '.paths.site_era5_path' "$MANIFEST") @@ -135,12 +151,22 @@ n_ens_default="20" n_met_default="10" ic_ensemble_size_default="100" n_workers_default="1" +use_apptainer_default="false" run_dir=$(get_val "run_dir" "$run_dir_default") + +if [[ "$run_dir" == "/" ]]; then + echo "magic-ensemble: run_dir cannot be the root directory (/)." 
>&2 + exit 1 +fi + # If run_dir is not absolute, resolve relative to CWD where the CLI was invoked if [[ "$run_dir" != /* ]]; then run_dir="${INVOCATION_CWD}/${run_dir}" fi +# Normalize run_dir to avoid trailing slashes so joined paths do not contain "//" +run_dir="${run_dir%/}" + start_date=$(get_val "start_date" "$start_date_default") end_date=$(get_val "end_date" "$end_date_default") run_LAI_date=$(get_val "run_LAI_date" "$run_LAI_date_default") @@ -148,10 +174,20 @@ n_ens=$(get_val "n_ens" "$n_ens_default") n_met=$(get_val "n_met" "$n_met_default") ic_ensemble_size=$(get_val "ic_ensemble_size" "$ic_ensemble_size_default") n_workers=$(get_val "n_workers" "$n_workers_default") +use_apptainer_raw=$(get_val "use_apptainer" "$use_apptainer_default") +# Normalize: true/yes/1 (case-insensitive) => 1; else 0 +use_apptainer=0 +case "$(echo "$use_apptainer_raw" | tr '[:upper:]' '[:lower:]')" in + true|yes|1) use_apptainer=1 ;; +esac + +pecan_dispatch=$(get_val "pecan_dispatch" "") -# Resolve manifest paths relative to run_dir (then relative to repo root). +# Resolve manifest paths relative to run_dir. # Effective path = run_dir / manifest_path so R (CWD=REPO_ROOT) sees the correct file. resolve_path() { echo "${run_dir}/${1}"; } + +# All workflow paths come from the manifest only (no user overrides). 
site_info_file=$(resolve_path "$p_site_info_file") site_sipnet_met_path=$(resolve_path "$p_site_sipnet_met_path") site_era5_path=$(resolve_path "$p_site_era5_path") @@ -159,14 +195,13 @@ field_shape_path=$(resolve_path "$p_field_shape_path") data_dir=$(resolve_path "$p_data_dir") ic_outdir=$(resolve_path "$p_ic_outdir") pft_dir=$(resolve_path "$p_pft_dir") -landtrendr_raw_files=$(resolve_path "$p_landtrendr_raw_files") site_file=$(resolve_path "$p_site_file") template_file=$(resolve_path "$p_template_file") output_file=$(resolve_path "$p_output_file") met_dir=$(resolve_path "$p_met_dir") ic_dir=$(resolve_path "$p_ic_dir") settings_xml=$(resolve_path "$p_settings_xml") -# landtrendr_raw_files is comma-separated; resolve each segment +# landtrendr_raw_files is comma-separated; resolve each segment (manifest only for now) landtrendr_raw_files="" while IFS= read -r segment; do segment=$(echo "$segment" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') @@ -183,6 +218,46 @@ check_aws() { fi } +# --- Apptainer: ensure apptainer is available (module load then PATH); apptainer only, no singularity --- +ensure_apptainer_available() { + if command -v apptainer &>/dev/null; then + return 0 + fi + if command -v module &>/dev/null; then + if module load apptainer 2>/dev/null; then + if command -v apptainer &>/dev/null; then + return 0 + fi + fi + fi + echo "magic-ensemble: use_apptainer is true but apptainer is not available. Run 'module load apptainer' or ensure apptainer is on PATH. 
(Singularity is not supported.)" >&2 + exit 1 +} + +# --- Apptainer: resolve SIF path from manifest (in run_dir); pull from remote if not present (no user override of remote) --- +ensure_sif_present() { + local sif_name sif_path remote_base container_name tag uri + sif_name=$(yq eval '.apptainer.sif' "$MANIFEST") + sif_path="${run_dir}/${sif_name}" + if [[ -f "$sif_path" ]]; then + APPTAINER_SIF="$sif_path" + return 0 + fi + ensure_apptainer_available + remote_base=$(yq eval '.apptainer.remote.url' "$MANIFEST") + remote_base="${remote_base%/}" + container_name=$(yq eval '.apptainer.container.name' "$MANIFEST") + tag=$(yq eval '.apptainer.tag' "$MANIFEST") + uri="${remote_base}/${container_name}:${tag}" + echo "magic-ensemble: SIF not found at $sif_path; pulling from $uri" >&2 + mkdir -p "$run_dir" + if ! apptainer pull "$sif_path" "$uri"; then + echo "magic-ensemble: Failed to pull container to $sif_path" >&2 + exit 1 + fi + APPTAINER_SIF="$sif_path" +} + # --- Get list of script paths for current command (from manifest steps) --- get_steps() { yq eval '.steps["'"$COMMAND"'"] | .[].script' "$MANIFEST" @@ -211,27 +286,39 @@ check_r_libs_for_step() { done < <(yq eval '.steps["'"$COMMAND"'"] | .['"$i"'].r_libraries | .[]?' 
"$MANIFEST" 2>/dev/null || true) } -# --- Dry-run: print scripts and optionally run checks --- -do_dry_run() { - echo "magic-ensemble: dry-run for command: $COMMAND" - echo "Run directory (for this execution): $run_dir" - echo "Would run the following scripts (CWD = $REPO_ROOT):" - while IFS= read -r script; do - [[ -z "$script" ]] && continue - script_path="${REPO_ROOT}/${script}" - if [[ -f "$script_path" ]]; then - echo " - $script (exists)" - else - echo " - $script (MISSING)" +# --- R library check for step at index i inside Apptainer (APPTAINER_SIF must be set) --- +check_r_libs_for_step_in_apptainer() { + local i="$1" + local script="${STEPS[i]}" + [[ "$script" == *.sh ]] && return 0 + local lib + while IFS= read -r lib; do + [[ -z "$lib" || "$lib" == "null" ]] && continue + if ! apptainer run --bind "$REPO_ROOT:$REPO_ROOT" --bind "$run_dir:$run_dir" --pwd "$REPO_ROOT" "$APPTAINER_SIF" Rscript -e "library(\"$lib\")" 2>/dev/null; then + echo "magic-ensemble: R library check failed inside container: library(\"$lib\") not available in image $APPTAINER_SIF" >&2 + exit 1 fi - done < <(get_steps) - echo "" - echo "Pre-execution checks (R libraries, AWS CLI) can be run when not in dry-run." - exit 0 + done < <(yq eval '.steps["'"$COMMAND"'"] | .['"$i"'].r_libraries | .[]?' "$MANIFEST" 2>/dev/null || true) } -# --- Run R script with args; CWD = REPO_ROOT --- +# --- Run R script with args. Optional leading args: --apptainer, --cwd DIR (default DIR = REPO_ROOT). --- run_script() { + local use_apptainer=0 + local script_cwd="$REPO_ROOT" + while [[ $# -gt 0 ]]; do + case "${1:-}" in + --apptainer) use_apptainer=1; shift ;; + --cwd) + if [[ $# -lt 2 ]]; then + echo "magic-ensemble: run_script --cwd requires DIR." 
>&2 + exit 1 + fi + script_cwd="$2" + shift 2 + ;; + *) break ;; + esac + done local script="$1" shift local script_path="${REPO_ROOT}/${script}" @@ -239,13 +326,23 @@ run_script() { echo "magic-ensemble: Script not found: $script_path" >&2 exit 1 fi - if [[ $VERBOSE -eq 1 ]]; then - echo "Rscript $script_path $*" >&2 + if [[ $use_apptainer -eq 1 ]]; then + # APPTAINER_SIF and run_dir must be set (ensure_sif_present and run_dir resolved earlier). + echo "magic-ensemble: Rscript (inside apptainer: $APPTAINER_SIF)" + if [[ $VERBOSE -eq 1 ]]; then + echo "apptainer run --bind \"$REPO_ROOT:$REPO_ROOT\" --bind \"$run_dir:$run_dir\" --cwd \"$script_cwd\" \"$APPTAINER_SIF\" Rscript \"$script_path\" $*" >&2 + fi + apptainer run --bind "$REPO_ROOT:$REPO_ROOT" --bind "$run_dir:$run_dir" --cwd "$script_cwd" "$APPTAINER_SIF" Rscript "$script_path" "$@" + else + echo "magic-ensemble: Rscript: $(command -v Rscript)" + if [[ $VERBOSE -eq 1 ]]; then + echo "(cd \"$script_cwd\" && Rscript \"$script_path\" $*)" >&2 + fi + (cd "$script_cwd" && Rscript "$script_path" "$@") fi - (cd "$REPO_ROOT" && Rscript "$script_path" "$@") } -# --- Run shell script; CWD = REPO_ROOT. Pass COMMAND and STEP_INDEX for manifest lookups. --- +# --- Run shell script; CWD = REPO_ROOT. Step scripts receive documented CLI arguments. --- run_shell_script() { local script="$1" local step_index="${2:-0}" @@ -254,10 +351,74 @@ run_shell_script() { echo "magic-ensemble: Script not found: $script_path" >&2 exit 1 fi + local script_basename="${script##*/}" + if [[ "$script_basename" == "00_fetch_s3_and_prepare_run_dir.sh" || "$script_basename" == "00_stage_external_inputs.sh" ]]; then + # Step 00 helpers: documented arguments (see each script's --help). 
+ local args=(--repo-root "$REPO_ROOT") + if [[ "$script_basename" == "00_fetch_s3_and_prepare_run_dir.sh" ]]; then + args+=(--command "$COMMAND" --step-index "$step_index") + fi + if [[ -n "$CONFIG_FILE" && -f "$CONFIG_FILE" ]]; then + args+=(--config "$CONFIG_FILE" --invocation-cwd "$INVOCATION_CWD") + else + args+=(--run-dir "$run_dir" --invocation-cwd "$INVOCATION_CWD") + fi + if [[ $VERBOSE -eq 1 ]]; then + echo "bash $script_path ${args[*]}" >&2 + fi + (cd "$REPO_ROOT" && bash "$script_path" "${args[@]}") + else + # Other shell steps: pass args if/when they are added; no env vars + if [[ $VERBOSE -eq 1 ]]; then + echo "bash $script_path" >&2 + fi + (cd "$REPO_ROOT" && bash "$script_path") + fi +} + +# --- Validate pecan_dispatch value against manifest options --- +validate_pecan_dispatch() { + if ! yq eval ".pecan_dispatch | has(\"$pecan_dispatch\")" "$MANIFEST" | grep -q '^true$'; then + echo "magic-ensemble: Unknown pecan_dispatch value '$pecan_dispatch'. Valid options:" >&2 + yq eval '.pecan_dispatch | keys | .[]' "$MANIFEST" >&2 + exit 1 + fi +} + +# --- Patch ... block in staged template.xml with chosen dispatch XML --- +# Selects host_xml_apptainer when use_apptainer=1 (with @SIF@ substituted); falls back to host_xml. +patch_dispatch() { + if ! command -v python3 &>/dev/null; then + echo "magic-ensemble: python3 is required to patch dispatch in template.xml." >&2 + exit 1 + fi + local template_path="${run_dir}/$(yq eval '.paths.template_file' "$MANIFEST")" + if [[ ! -f "$template_path" ]]; then + echo "magic-ensemble: staged template.xml not found at $template_path" >&2 + exit 1 + fi + + # Select apptainer variant when available and requested; otherwise plain host_xml. 
+ local host_xml_key="host_xml" + if [[ $use_apptainer -eq 1 ]]; then + local has_apptainer_variant + has_apptainer_variant=$(yq eval ".pecan_dispatch[\"$pecan_dispatch\"] | has(\"host_xml_apptainer\")" "$MANIFEST") + if [[ "$has_apptainer_variant" == "true" ]]; then + host_xml_key="host_xml_apptainer" + fi + fi + if [[ $VERBOSE -eq 1 ]]; then - echo "RUN_DIR=$run_dir REPO_ROOT=$REPO_ROOT MANIFEST=$MANIFEST COMMAND=$COMMAND STEP_INDEX=$step_index bash $script_path" >&2 + echo "magic-ensemble: patching block in $template_path (pecan_dispatch=$pecan_dispatch, xml_key=$host_xml_key)" >&2 fi - (cd "$REPO_ROOT" && RUN_DIR="$run_dir" REPO_ROOT="$REPO_ROOT" MANIFEST="$MANIFEST" COMMAND="$COMMAND" STEP_INDEX="$step_index" bash "$script_path") + + local sif_name host_xml + sif_name=$(yq eval '.apptainer.sif' "$MANIFEST") + # Substitute @SIF@ with the SIF filename (relative to run_dir, as jobs execute there). + host_xml=$(yq eval ".pecan_dispatch[\"$pecan_dispatch\"].$host_xml_key" "$MANIFEST" \ + | sed "s|@SIF@|./${sif_name}|g") + + python3 "${REPO_ROOT}/tools/patch_xml.py" "$template_path" "host" "$host_xml" --block } # --- Get-demo-data: run steps from manifest (shell script only) --- @@ -270,67 +431,96 @@ run_get_demo_data() { done } -# --- Prepare: run steps from manifest (01, 02, 03 with R args) --- +# --- Prepare: run steps from manifest (hard-coded sequence for this workflow); optionally inside Apptainer --- run_prepare() { get_steps_array check_aws - for i in "${!STEPS[@]}"; do - check_r_libs_for_step "$i" - done + validate_pecan_dispatch + + local apptainer_arg="" + if [[ $use_apptainer -eq 1 ]]; then + ensure_apptainer_available + ensure_sif_present + apptainer_arg="--apptainer" + for i in "${!STEPS[@]}"; do + check_r_libs_for_step_in_apptainer "$i" + done + else + for i in "${!STEPS[@]}"; do + check_r_libs_for_step "$i" + done + fi for i in "${!STEPS[@]}"; do - case "$i" in - 0) run_script "${STEPS[i]}" \ - --site_era5_path "$site_era5_path" \ - 
--site_sipnet_met_path "$site_sipnet_met_path" \ - --site_info_file "$site_info_file" \ - --start_date "$start_date" \ - --end_date "$end_date" \ - --n_cores "$n_workers" \ - --parallel_strategy "multisession" ;; - 1) run_script "${STEPS[i]}" \ - --site_info_path "$site_info_file" \ - --field_shape_path "$field_shape_path" \ - --ic_ensemble_size "$ic_ensemble_size" \ - --run_start_date "$start_date" \ - --run_LAI_date "$run_LAI_date" \ - --ic_outdir "$ic_outdir" \ - --data_dir "$data_dir" \ - --pft_dir "$pft_dir" \ - --params_read_from_pft "$params_from_pft" \ - --landtrendr_raw_files "$landtrendr_raw_files" \ - --additional_params "$additional_params" ;; - 2) run_script "${STEPS[i]}" \ - --n_ens "$n_ens" \ - --n_met "$n_met" \ - --start_date "$start_date" \ - --end_date "$end_date" \ - --ic_dir "$ic_dir" \ - --met_dir "$met_dir" \ - --site_file "$site_file" \ - --template_file "$template_file" \ - --output_file "$output_file" ;; - *) echo "magic-ensemble: No argument mapping for prepare step index $i" >&2; exit 1 ;; - esac + step_num=$((i + 1)) + script="${STEPS[i]}" + echo "magic-ensemble: prepare step $step_num of ${#STEPS[@]}: $script" + if [[ "$script" == *.sh ]]; then + run_shell_script "$script" "$i" + if [[ "$i" -eq 0 ]]; then + echo "magic-ensemble: patching template.xml with dispatch: $pecan_dispatch" + patch_dispatch + fi + else + case "$i" in + 1) run_script $apptainer_arg "$script" \ + --site_era5_path "$site_era5_path" \ + --site_sipnet_met_path "$site_sipnet_met_path" \ + --site_info_file "$site_info_file" \ + --start_date "$start_date" \ + --end_date "$end_date" \ + --n_cores "$n_workers" \ + --parallel_strategy "multisession" ;; + 2) run_script $apptainer_arg "$script" \ + --site_info_path "$site_info_file" \ + --field_shape_path "$field_shape_path" \ + --ic_ensemble_size "$ic_ensemble_size" \ + --run_start_date "$start_date" \ + --run_LAI_date "$run_LAI_date" \ + --ic_outdir "$ic_outdir" \ + --data_dir "$data_dir" \ + --pft_dir "$pft_dir" \ + 
--params_read_from_pft "$params_from_pft" \ + --landtrendr_raw_files "$landtrendr_raw_files" \ + --additional_params "$additional_params" ;; + 3) run_script $apptainer_arg "$script" \ + --n_ens "$n_ens" \ + --n_met "$n_met" \ + --start_date "$start_date" \ + --end_date "$end_date" \ + --ic_dir "$ic_dir" \ + --met_dir "$met_dir" \ + --site_file "$site_file" \ + --template_file "$template_file" \ + --output_file "$output_file" ;; + *) echo "magic-ensemble: No argument mapping for prepare step index $i (script $script)" >&2; exit 1 ;; + esac + fi + echo "magic-ensemble: prepare step $step_num completed" done + echo "magic-ensemble: prepare finished (all ${#STEPS[@]} steps)" } -# --- Run-ensembles: run single step from manifest (04) --- +# --- Run-ensembles: run single step from manifest (04); never inside Apptainer. +# When use_apptainer=1, the SIF must be present for dispatched jobs (already patched +# into template.xml via patch_dispatch during prepare); 04_run_model.R itself always +# runs on the host so it can submit further jobs to Slurm. run_run_ensembles() { get_steps_array check_aws + + if [[ $use_apptainer -eq 1 ]]; then + ensure_apptainer_available + ensure_sif_present + fi check_r_libs_for_step 0 - run_script "${STEPS[0]}" \ + run_script --cwd "$run_dir" "${STEPS[0]}" \ --settings "$settings_xml" \ --continue "FALSE" } # --- Main --- -if [[ $DRY_RUN -eq 1 ]]; then - do_dry_run -fi - case "$COMMAND" in get-demo-data) run_get_demo_data ;; prepare) run_prepare ;; diff --git a/tools/patch_xml.py b/tools/patch_xml.py new file mode 100644 index 0000000..9a346ce --- /dev/null +++ b/tools/patch_xml.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +""" +patch_xml.py — in-place XML element patcher. + +Usage: + patch_xml.py [--block] + +Arguments: + template_path Path to the XML file to patch (modified in-place). + xml_path Slash-separated element path, e.g. "host" or "model/binary". 
+ The last segment is the target tag; an optional leading segment + constrains the match to within that parent element. + new_content Replacement value. Without --block, replaces the text node only. + With --block, replaces the entire element including its tags. + --block Replace the full ... element rather than just its text. + +Exit codes: + 0 Success. + 1 Usage error, file I/O error, or element not found. +""" + +import sys +import re + + +def usage(msg=None): + if msg: + print(f"patch_xml: {msg}", file=sys.stderr) + print(__doc__, file=sys.stderr) + sys.exit(1) + + +def patch_element(text, tag, new_content, replace_block): + # Regex-based: assumes tags have no attributes (e.g. , not ). + # This holds for PEcAn template.xml but would need revision for attributed tags. + if replace_block: + patched, n = re.subn( + r'<' + tag + r'>.*?', + new_content, text, count=1, flags=re.DOTALL, + ) + else: + patched, n = re.subn( + r'(<' + tag + r'>)[^<]*()', + r'\g<1>' + new_content + r'\g<2>', + text, count=1, + ) + return patched, n + + +def main(): + args = sys.argv[1:] + replace_block = '--block' in args + args = [a for a in args if a != '--block'] + + if len(args) != 3: + usage(f"expected 3 positional arguments, got {len(args)}") + + template_path, xml_path, new_content = args + parts = xml_path.split('/') + tag = parts[-1] + parent = parts[0] if len(parts) > 1 else None + + try: + content = open(template_path).read() + except OSError as e: + print(f"patch_xml: {e}", file=sys.stderr) + sys.exit(1) + + if parent: + total_replaced = 0 + + def replacer(m): + nonlocal total_replaced + patched, n = patch_element(m.group(0), tag, new_content, replace_block) + total_replaced += n + return patched + + result = re.sub( + r'<' + parent + r'>.*?', + replacer, content, count=1, flags=re.DOTALL, + ) + else: + result, total_replaced = patch_element(content, tag, new_content, replace_block) + + if total_replaced == 0: + print(f"patch_xml: no element matched path '{xml_path}' in 
{template_path}", file=sys.stderr) + sys.exit(1) + + try: + open(template_path, 'w').write(result) + except OSError as e: + print(f"patch_xml: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == '__main__': + main() From caf7eab5ccf1be3e33058cfde01068622ed00bdc Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Fri, 3 Apr 2026 21:24:54 +0000 Subject: [PATCH 6/7] Fix external_paths staging contract and add documentation - Stage external inputs to manifest-defined destinations rather than source basename; enforce that each external_paths key has a matching manifest.paths entry and error if not - Make get_val() fall through to defaults for missing config keys instead of erroring; add explicit post-resolution required check for run_dir only - Remove spurious check_aws calls from prepare and run-ensembles commands - Reorganize workflow_manifest.yaml: move steps block to top, normalize to 4-space indentation throughout - Add magic-ensemble-DEVELOPERS.md (architecture, internals, dispatch) and magic-ensemble-README.md --- 2a_grass/00_stage_external_inputs.sh | 22 ++- 2a_grass/README.md | 2 +- 2a_grass/workflow_manifest.yaml | 241 +++++++++++++------------ magic-ensemble | 15 +- magic-ensemble-DEVELOPERS.md | 257 +++++++++++++++++++++++++++ magic-ensemble-README.md | 173 ++++++++++++++++++ 6 files changed, 577 insertions(+), 133 deletions(-) create mode 100644 magic-ensemble-DEVELOPERS.md create mode 100644 magic-ensemble-README.md diff --git a/2a_grass/00_stage_external_inputs.sh b/2a_grass/00_stage_external_inputs.sh index 51617ef..6413565 100644 --- a/2a_grass/00_stage_external_inputs.sh +++ b/2a_grass/00_stage_external_inputs.sh @@ -7,8 +7,8 @@ # # Requires: yq (mikefarah/yq) # -# Options (see --help): --repo-root (required); --manifest optional, currently -# unused for staging; defaults to /2a_grass/workflow_manifest.yaml. +# Options (see --help): --repo-root (required); --manifest optional, +# defaults to /2a_grass/workflow_manifest.yaml. 
# Run directory is either from --run-dir or from run_dir in the file given by # --config (relative paths resolved with --invocation-cwd). external_paths # entries are resolved from --invocation-cwd when relative. @@ -31,7 +31,7 @@ Run directory (one of): --config PATH User YAML config file; script reads run_dir from it (use with --invocation-cwd). Optional: - --manifest PATH Path to workflow_manifest.yaml (default: /2a_grass/workflow_manifest.yaml). (Currently unused.) + --manifest PATH Path to workflow_manifest.yaml (default: /2a_grass/workflow_manifest.yaml). --invocation-cwd PATH Required when using --config with a relative run_dir or relative external_paths. -h, --help Print this help and exit. EOF @@ -144,9 +144,9 @@ if [[ -z "$CONFIG_FILE" || ! -f "$CONFIG_FILE" ]]; then exit 0 fi -# external_paths is a mapping from arbitrary keys to source file paths. -# We do not depend on manifest paths here; we simply copy each source file -# into the run directory (flattened by basename). +# external_paths is a mapping from manifest path keys to source file paths. +# Each key must match an entry in manifest.paths; the destination filename is +# derived from that manifest path (basename), not from the source filename. # Parse the YAML block output of .external_paths line by line (yq v4 outputs plain # scalars without quotes). Split on first ": " to get key and value. external_block=$(yq eval '.external_paths' "$CONFIG_FILE" 2>/dev/null || echo "null") @@ -181,8 +181,14 @@ while IFS= read -r line; do exit 1 fi - # Destination: copy into the run directory using the source basename. - dest="${RUN_DIR_ABS}/$(basename "$src")" + # Destination: derived from the manifest path for the same key, not the source basename. + # This enforces the manifest contract so downstream scripts always find files where expected. 
+ manifest_path=$(yq eval ".paths.${key}" "$MANIFEST" 2>/dev/null) + if [[ -z "$manifest_path" || "$manifest_path" == "null" ]]; then + echo "00_stage_external_inputs: external_paths key '${key}' has no corresponding entry in manifest.paths" >&2 + exit 1 + fi + dest="${RUN_DIR_ABS}/$(basename "$manifest_path")" dest_dir=$(dirname "$dest") mkdir -p "$dest_dir" diff --git a/2a_grass/README.md b/2a_grass/README.md index 41c2f74..f95273a 100644 --- a/2a_grass/README.md +++ b/2a_grass/README.md @@ -45,7 +45,7 @@ ln -sf ../sipnet ../sipnet.git ### Install or update PEcAn If this is a brand-new installation, expect this step to take a few hours to download and compile more than 300 R packages. If you've installed PEcAn on this machine before, expect it to be just a few minutes of updating only the PEcAn packages and any dependencies whose version requirement has changed. -Defaults to using 4 CPUs to compile packages in parallel. If you have more cores, adjust `sbatch`'s `--cpus-per-task` parameter. +Defaults to using 4 CPUs to compile packages in parallel. If you have more cores, adjust `sbatch`'s `--cpus-per-task` parameter. ``` sbatch -o install_pecan.out ../tools/install_pecan.sh diff --git a/2a_grass/workflow_manifest.yaml b/2a_grass/workflow_manifest.yaml index e0b859e..d703f96 100644 --- a/2a_grass/workflow_manifest.yaml +++ b/2a_grass/workflow_manifest.yaml @@ -13,135 +13,142 @@ # inputs: List of path keys (from 'paths') this script reads (local paths only) # outputs: List of path keys this script creates or writes -# S3 resources (not in user config). Remote resources are localized before R runs. -# Stored as endpoint + bucket + per-resource key_prefix and filename (no full URLs). 
-s3: - endpoint_url: "https://s3.garage.ccmmf.ncsa.cloud" - bucket: "carb" - artifact_02: - key_prefix: "data_raw" - filename: "ensembles_data_artifact.tar.gz" - median_tif: - key_prefix: "data_raw" - filename: "ca_biomassfiaald_2016_median.tif" - stdv_tif: - key_prefix: "data_raw" - filename: "ca_biomassfiaald_2016_stdv.tif" +# Steps per command: script path, R libs to check (empty for shell scripts), input/output path keys +steps: + get-demo-data: + - script: "2a_grass/00_fetch_s3_and_prepare_run_dir.sh" + r_libraries: [] + inputs: [] + outputs: [data_dir, ic_outdir, site_sipnet_met_path] -# Dispatch options for run-ensembles. The user config selects one by name via pecan_dispatch. -# host_xml is the complete ... block to inject into the staged template.xml -# before step 03 (xml_build.R) runs. Valid values for pecan_dispatch in user config are the -# keys listed here. -pecan_dispatch: - local-gnu-parallel: - description: "Run ensemble members locally using GNU parallel (no Slurm required)" - host_xml: | - - localhost - output/out - output/run - squeue -j @JOBID@ || echo DONE - - parallel -j ${NCPUS:-1} --skip-first-line '{}/job.sh' :::: - 1000 - - - host_xml_apptainer: | - - localhost - output/out - output/run - squeue -j @JOBID@ || echo DONE - - parallel -j ${NCPUS:-1} --skip-first-line 'apptainer run @SIF@ {}/job.sh' :::: - 1000 - - - slurm-dispatch: - description: "Submit ensemble members to Slurm via sbatch" - host_xml: | - - localhost - sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ - Submitted batch job ([0-9]+) - if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi - output/out - output/run - - host_xml_apptainer: | - - localhost - sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run @SIF@ - Submitted batch job ([0-9]+) - if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi - output/out - output/run - + prepare: + - script: "2a_grass/00_stage_external_inputs.sh" + r_libraries: [] + inputs: [] + outputs: [] -# Apptainer (not in user config) 
-apptainer: - remote: - url: "docker://hdpriest0uiuc/" - container: - name: "sipnet-carb" - tag: "develop" - sif: "sipnet-carb_develop.sif" + - script: "2a_grass/01_ERA5_nc_to_clim.R" + r_libraries: [future, furrr] + inputs: [site_info_file, site_era5_path] + outputs: [site_sipnet_met_path] + + - script: "2a_grass/02_ic_build.R" + r_libraries: [tidyverse] + inputs: + [ + site_info_file, + field_shape_path, + pft_dir, + data_dir, + landtrendr_raw_files, + ] + outputs: [ic_outdir, data_dir] + + - script: "2a_grass/03_xml_build.R" + r_libraries: [PEcAn.settings] + inputs: [site_file, template_file, ic_dir, met_dir] + outputs: [output_file] + + run-ensembles: + - script: "2a_grass/04_run_model.R" + r_libraries: [PEcAn.all] + inputs: [settings_xml] + outputs: [] # Path definitions: all contained within the run directory. # note that these paths are the internal-workflow expected I/O paths. # Users should not modify these values unless you know what you are doing. # Values are relative to run_dir; CLI resolves as run_dir + "/" + value. 
paths: - site_info_file: "site_info.csv" - site_sipnet_met_path: "data/ERA5_SIPNET" - site_era5_path: "data_raw/ERA5_nc" - field_shape_path: "data_raw/dwr_map/i15_Crop_Mapping_2018.gdb" - data_dir: "data/IC_prep" - ic_outdir: "IC_files" - pft_dir: "pfts" - landtrendr_raw_files: "data_raw/ca_biomassfiaald_2016_median.tif,data_raw/ca_biomassfiaald_2016_stdv.tif" - site_file: "site_info.csv" - template_file: "template.xml" - output_file: "settings.xml" - met_dir: "data/ERA5_SIPNET" - ic_dir: "IC_files" - settings_xml: "settings.xml" + site_info_file: "site_info.csv" + site_sipnet_met_path: "data/ERA5_SIPNET" + site_era5_path: "data_raw/ERA5_nc" + field_shape_path: "data_raw/dwr_map/i15_Crop_Mapping_2018.gdb" + data_dir: "data/IC_prep" + ic_outdir: "IC_files" + pft_dir: "pfts" + landtrendr_raw_files: "data_raw/ca_biomassfiaald_2016_median.tif,data_raw/ca_biomassfiaald_2016_stdv.tif" + site_file: "site_info.csv" + template_file: "template.xml" + output_file: "settings.xml" + met_dir: "data/ERA5_SIPNET" + ic_dir: "IC_files" + settings_xml: "settings.xml" # Fixed workflow values (not user overrides) params_from_pft: "SLA,leafC" additional_params: "varname=wood_carbon_fraction,distn=norm,parama=0.48,paramb=0.005" -# Steps per command: script path, R libs to check (empty for shell scripts), input/output path keys -steps: - get-demo-data: - - script: "2a_grass/00_fetch_s3_and_prepare_run_dir.sh" - r_libraries: [] - inputs: [] - outputs: [data_dir, ic_outdir, site_sipnet_met_path] - - prepare: - - script: "2a_grass/00_stage_external_inputs.sh" - r_libraries: [] - inputs: [] - outputs: [] - - - script: "2a_grass/01_ERA5_nc_to_clim.R" - r_libraries: [future, furrr] - inputs: [site_info_file, site_era5_path] - outputs: [site_sipnet_met_path] - - - script: "2a_grass/02_ic_build.R" - r_libraries: [tidyverse] - inputs: [site_info_file, field_shape_path, pft_dir, data_dir, landtrendr_raw_files] - outputs: [ic_outdir, data_dir] +# S3 resources (not in user config). 
Remote resources are localized before R runs. +# Stored as endpoint + bucket + per-resource key_prefix and filename (no full URLs). +s3: + endpoint_url: "https://s3.garage.ccmmf.ncsa.cloud" + bucket: "carb" + artifact_02: + key_prefix: "data_raw" + filename: "ensembles_data_artifact.tar.gz" + median_tif: + key_prefix: "data_raw" + filename: "ca_biomassfiaald_2016_median.tif" + stdv_tif: + key_prefix: "data_raw" + filename: "ca_biomassfiaald_2016_stdv.tif" - - script: "2a_grass/03_xml_build.R" - r_libraries: [PEcAn.settings] - inputs: [site_file, template_file, ic_dir, met_dir] - outputs: [output_file] +# Dispatch options for run-ensembles. The user config selects one by name via pecan_dispatch. +# host_xml is the complete ... block to inject into the staged template.xml +# before step 03 (xml_build.R) runs. Valid values for pecan_dispatch in user config are the +# keys listed here. +pecan_dispatch: + local-gnu-parallel: + description: "Run ensemble members locally using GNU parallel (no Slurm required)" + host_xml: | + + localhost + output/out + output/run + squeue -j @JOBID@ || echo DONE + + parallel -j ${NCPUS:-1} --skip-first-line '{}/job.sh' :::: + 1000 + + + host_xml_apptainer: | + + localhost + output/out + output/run + squeue -j @JOBID@ || echo DONE + + parallel -j ${NCPUS:-1} --skip-first-line 'apptainer run @SIF@ {}/job.sh' :::: + 1000 + + + slurm-dispatch: + description: "Submit ensemble members to Slurm via sbatch" + host_xml: | + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + host_xml_apptainer: | + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run @SIF@ + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + - run-ensembles: - - script: "2a_grass/04_run_model.R" - r_libraries: [PEcAn.all] - inputs: [settings_xml] - outputs: [] +# Apptainer 
(not in user config) +apptainer: + remote: + url: "docker://hdpriest0uiuc/" + container: + name: "sipnet-carb" + tag: "develop" + sif: "sipnet-carb_develop.sif" diff --git a/magic-ensemble b/magic-ensemble index 63894f4..5a20008 100755 --- a/magic-ensemble +++ b/magic-ensemble @@ -113,12 +113,10 @@ get_val() { if [[ -n "$CONFIG_FILE" && -f "$CONFIG_FILE" ]]; then local u u=$(yq eval ".$key" "$CONFIG_FILE" 2>/dev/null) - if [[ -z "$u" || "$u" == "null" ]]; then - echo "magic-ensemble: Config key '$key' is missing or empty in $CONFIG_FILE" >&2 - exit 1 + if [[ -n "$u" && "$u" != "null" ]]; then + echo "$u" + return fi - echo "$u" - return fi echo "$from_manifest" } @@ -155,6 +153,11 @@ use_apptainer_default="false" run_dir=$(get_val "run_dir" "$run_dir_default") +if [[ -z "$run_dir" || "$run_dir" == "null" ]]; then + echo "magic-ensemble: run_dir is required in config $CONFIG_FILE" >&2 + exit 1 +fi + if [[ "$run_dir" == "/" ]]; then echo "magic-ensemble: run_dir cannot be the root directory (/)." >&2 exit 1 @@ -434,7 +437,6 @@ run_get_demo_data() { # --- Prepare: run steps from manifest (hard-coded sequence for this workflow); optionally inside Apptainer --- run_prepare() { get_steps_array - check_aws validate_pecan_dispatch local apptainer_arg="" @@ -507,7 +509,6 @@ run_prepare() { # runs on the host so it can submit further jobs to Slurm. run_run_ensembles() { get_steps_array - check_aws if [[ $use_apptainer -eq 1 ]]; then ensure_apptainer_available diff --git a/magic-ensemble-DEVELOPERS.md b/magic-ensemble-DEVELOPERS.md new file mode 100644 index 0000000..f388150 --- /dev/null +++ b/magic-ensemble-DEVELOPERS.md @@ -0,0 +1,257 @@ +# magic-ensemble Developer Guide + +This document covers the internal design of `magic-ensemble` and the +`2a_grass/` workflow: how the pieces fit together, where the boundaries are, +and what to change when adapting this CLI to a different workflow. 
+ +--- + +## Architecture Overview + +The CLI is built on a three-layer configuration model: + +``` +workflow_manifest.yaml — fixed contract: internal paths, step definitions, + S3 coords, dispatch XML, Apptainer image + + +user_config.yaml — runtime overrides: run_dir, dates, ensemble sizes, + dispatch mode, use_apptainer, external_paths + + +external_paths (staged) — user-provided files copied into run_dir before + prepare runs, mapped to manifest-defined destinations +``` + +The manifest is the source of truth for everything that is fixed per workflow. +The user config contains only the values a user legitimately needs to vary +between runs. External paths are the mechanism for injecting user-owned files +(e.g. a custom `template.xml`) without making manifest paths user-overridable. +As written, a user can only inject files that are expected by the pipeline. + +--- + +## Execution Graph + +### `get-demo-data` + +``` +00_fetch_s3_and_prepare_run_dir.sh + → creates run_dir + → downloads and extracts S3 artifact into run_dir +``` + +### `prepare` + +``` +00_stage_external_inputs.sh + → creates run_dir + → copies external_paths files into run_dir (manifest-defined destinations) + → [patch_dispatch() runs after this step] + → reads pecan_dispatch host_xml from manifest + → substitutes @SIF@ if use_apptainer is set + → patches block in run_dir/template.xml via tools/patch_xml.py + +01_ERA5_nc_to_clim.R + reads: run_dir/data_raw/ERA5_nc, run_dir/site_info.csv + writes: run_dir/data/ERA5_SIPNET/ + +02_ic_build.R + reads: run_dir/site_info.csv, run_dir/data_raw/dwr_map/..., + run_dir/data/IC_prep/, run_dir/pfts/, + run_dir/data_raw/ca_biomassfiaald_*.tif + writes: run_dir/IC_files/, run_dir/data/IC_prep/ + +03_xml_build.R + reads: run_dir/site_info.csv, run_dir/template.xml, + run_dir/IC_files/, run_dir/data/ERA5_SIPNET/ + writes: run_dir/settings.xml +``` + +### `run-ensembles` + +``` +04_run_model.R (CWD = run_dir) + reads: run_dir/settings.xml + writes: 
run_dir/output/ (via PEcAn dispatch)
+```
+
+---
+
+## Configuration Contract
+
+### What belongs in the manifest
+
+- `steps`: ordered list of scripts per command, with declared inputs/outputs and R library checks
+- `paths`: all internal file/directory locations relative to `run_dir`
+- `s3`: S3 endpoint, bucket, and per-resource key prefix and filename
+- `pecan_dispatch`: named dispatch modes, each with a `host_xml` (and optionally `host_xml_apptainer`) block
+- `apptainer`: remote registry URL, container name, tag, and SIF filename
+
+None of these are user-overridable. Adding a new workflow means replacing or
+extending the manifest, not the user config. As the underlying R scripts evolve,
+the manifest must be kept in sync with any I/O changes made in the R scripts.
+
+### What belongs in the user config
+
+Scalar values that vary between runs: `run_dir`, dates, ensemble sizes,
+`n_workers`, `use_apptainer`, `pecan_dispatch`. These all have fallback
+defaults in `magic-ensemble`; only `run_dir` is required.
+
+### What belongs in `external_paths`
+
+File paths for user-owned inputs that must be injected into `run_dir` before
+`prepare` runs. Keys must match entries under `manifest.paths`. The destination
+is `run_dir/$(basename manifest.paths.<key>)` — derived from the manifest, not
+from the source filename, so downstream scripts always find files where they
+expect them.
+
+---
+
+## CLI Internals
+
+### Argument parsing (`magic-ensemble` lines 50–77)
+
+Command is the first positional argument. `--config` and `--verbose` are global
+options that may appear in any order after the command. The config path is
+resolved relative to the actual `pwd` at invocation time and stored as an
+absolute path immediately after parsing.
+
+### `get_val()` resolution order
+
+```
+get_val "key" "default"
+  1. If CONFIG_FILE is set and the key is present and non-null → use config value
+  2. 
Otherwise → use the default passed as the second argument
```

Only `run_dir` has an explicit post-resolution check for empty/null; all other
keys silently fall back to their defaults if absent from the config. This makes
the config contract forward-compatible: adding new keys to the CLI does not
break existing user configs.

### Path normalization

`run_dir` is resolved in two steps:
1. If relative, it is prepended with `INVOCATION_CWD` (the directory where the
   CLI was invoked, not `REPO_ROOT`).
2. The trailing slash is stripped so that `run_dir + "/" + manifest_path` never
   produces double slashes.

All manifest paths are then resolved as `run_dir/manifest_path` and passed as
absolute paths to R scripts.

---

## Dispatch and XML Patching

### How dispatch modes work

Each named mode under `manifest.pecan_dispatch` carries a `host_xml` block —
the complete `<host>...</host>` XML to inject into `template.xml`.

When `use_apptainer` is set to `true` and the mode also defines
`host_xml_apptainer`, that variant is used instead. The `@SIF@` string is
substituted with the SIF filename relative to `run_dir` (since dispatched jobs
execute there).

### `patch_dispatch()` (`magic-ensemble` lines 390–422)

Called immediately after step 00 in `prepare`. Steps:
1. Resolve `template_path` as `run_dir + manifest.paths.template_file`.
2. Select `host_xml` or `host_xml_apptainer` based on `use_apptainer` and
   manifest availability.
3. Substitute `@SIF@` via `sed`.
4. Call `tools/patch_xml.py` with `--block` to replace the entire `<host>` element.

### `tools/patch_xml.py`

Regex-based in-place XML patcher. In `--block` mode it replaces the entire
`<tag>...</tag>` element (tags included). Limitations: assumes tags have no
attributes; single substitution only (first match). The tool is intentionally
minimal and workflow-agnostic.

---

## Apptainer Integration

When `use_apptainer: true`:

1.
`ensure_apptainer_available()` — tries `module load apptainer` if not on PATH.
2. `ensure_sif_present()` — looks for the SIF at `run_dir/<sif filename>`. If absent,
   pulls from `manifest.apptainer.remote.url/container.name:tag`. The SIF always
   lives in `run_dir` so it is co-located with the run for reproducibility.
3. R library pre-checks run inside the container (`check_r_libs_for_step_in_apptainer`).
4. Each R step is wrapped: `apptainer run --bind REPO_ROOT --bind run_dir`.

`run-ensembles` always executes `04_run_model.R` on the host (it submits jobs;
it does not run model code itself). When `use_apptainer: true`, the SIF must
be present because the patched `host_xml_apptainer` references it in the
launch command that PEcAn generates for each ensemble member.

---

## External Inputs Staging (`00_stage_external_inputs.sh`)

The script accepts `--repo-root`, `--config`, `--invocation-cwd`, and
optionally `--manifest`. Manifest defaults to
`<repo-root>/2a_grass/workflow_manifest.yaml`.

For each entry in `config.external_paths`:
1. Key must exist under `manifest.paths`; if not, the script exits with an error.
2. Source path is resolved: absolute as-is, relative paths prepended with
   `INVOCATION_CWD`.
3. Destination is `run_dir/$(basename manifest.paths.<key>)` — manifest-derived,
   not source-derived.
4. Parent directories are created if needed; file is copied with `cp -f`.

This staging runs before `patch_dispatch()`, so `template.xml` is guaranteed
to be present when the XML patching step fires.

---

## Adapting to a New Workflow

The CLI skeleton (`magic-ensemble`) and the staging/dispatch infrastructure are
designed to be reused.
When adapting: + +### Replace in the manifest + +- `steps`: update script paths and input/output path keys for the new workflow +- `paths`: replace with the new workflow's internal file layout +- `params_from_pft`, `additional_params`: workflow-specific fixed values +- `s3`: update bucket, key prefixes, and filenames +- `pecan_dispatch`: keep as-is if PEcAn dispatch is reused; otherwise replace +- `apptainer`: update container name and tag + +### Replace the step scripts + +Each script under `steps` should accept its inputs as named CLI arguments (R +scripts via `optparse`; shell scripts via `--flag value`). The CLI passes all +paths as absolute values so scripts do not need to be CWD-aware. + +### Keep in `magic-ensemble` + +- Argument parsing, `get_val()`, path normalization +- `check_aws` (for any command that fetches from S3) +- `ensure_apptainer_available`, `ensure_sif_present`, `check_r_libs_for_step*` +- `run_script`, `run_shell_script`, `patch_dispatch` + +### Update in `magic-ensemble` + +- The argument mappings in `run_prepare()` (the `case "$i"` block) — these are + the per-step CLI arguments passed to each R script and are workflow-specific. +- `usage()` — update command descriptions and examples. +- The manifest path constant (`MANIFEST=`) if the new workflow lives in a + different subdirectory. + +--- + +## Testing + +_(Placeholder — expand on this.)_ + +Proposed tiers: +- **Unit (bats/shunit2):** `get_val()` fallback behavior, `patch_dispatch()` XML + output, path normalization and `external_paths` destination derivation using + fixture configs and manifests. +- **Integration:** End-to-end `prepare` against a minimal fixture that exercises + the full step sequence without live R script execution (mock scripts that + assert their arguments and touch their expected outputs). 
diff --git a/magic-ensemble-README.md b/magic-ensemble-README.md new file mode 100644 index 0000000..7d9cb19 --- /dev/null +++ b/magic-ensemble-README.md @@ -0,0 +1,173 @@ +# magic-ensemble CLI + +`magic-ensemble` is a command-line interface for running the 2a_grass statewide +grassland carbon flux workflow. It fetches or stages input data, builds initial +conditions and model settings, and dispatches ensemble runs locally or via Slurm. + +--- + +## Prerequisites + +| Tool | Notes | +|---|---| +| `yq` | mikefarah/yq v4 (jq-style). Other `yq` implementations are not supported. | +| `aws` | AWS CLI v2; required only for `get-demo-data`. | +| `Rscript` | With packages listed per step (see *Commands* below). | +| `python3` | Required for `prepare` (patches template.xml). | +| `apptainer` | Required only when `use_apptainer: true` in your config. | + +--- + +## Quick Start + +```bash +# 1. Copy and edit the example config +cp 2a_grass/example_user_config.yaml my_config.yaml +$EDITOR my_config.yaml # at minimum, set run_dir + +# 2. Fetch demo data (skip if you have your own inputs — see "Supplying Your Own Data") +./magic-ensemble get-demo-data --config my_config.yaml + +# 3. Prepare: stage inputs, build climate files, ICs, and settings XML +./magic-ensemble prepare --config my_config.yaml + +# 4. Run the ensemble +./magic-ensemble run-ensembles --config my_config.yaml +``` + +Add `--verbose` to any command to echo the exact shell and Rscript calls as +they execute. + +--- + +## Configuration + +Copy `2a_grass/example_user_config.yaml` as a starting point. All keys except +`run_dir` are optional and fall back to the defaults shown below. + +| Key | Default | Description | +|---|---|---| +| `run_dir` | **required** | Directory for all run outputs. Relative paths are resolved from the directory where you invoke `./magic-ensemble`. | +| `start_date` | `2016-01-01` | Run start date (YYYY-MM-DD). | +| `end_date` | `2023-12-31` | Run end date (YYYY-MM-DD). 
| +| `run_LAI_date` | `2016-07-01` | Date used for LAI lookup during IC build. | +| `n_ens` | `20` | Number of parameter ensemble members. | +| `n_met` | `10` | Number of meteorology ensemble members. | +| `ic_ensemble_size` | `100` | IC ensemble draw size. | +| `n_workers` | `1` | Parallel workers for the ERA5 conversion step. | +| `use_apptainer` | `false` | Run R steps inside the workflow Apptainer container. | +| `pecan_dispatch` | _(none)_ | Dispatch mode for `run-ensembles`. Required for `prepare` and `run-ensembles`. | +| `external_paths` | _(none)_ | User-provided input files to stage into `run_dir` before `prepare` runs (see below). | + +Fixed internal paths, S3 coordinates, dispatch XML, and Apptainer image +details are defined in `2a_grass/workflow_manifest.yaml` and are not set in +user configs. + +--- + +## Commands + +### `get-demo-data` + +Downloads demo input data from S3 and creates the run directory. Use this if +you do not have your own ERA5, IC, or site data. + +**Requires:** `aws` CLI; S3 credentials for the CCMMF bucket. + +**Produces:** ERA5 NetCDF files, IC files, and site info CSV inside `run_dir`. + +### `prepare` + +Runs four steps in sequence: + +| Step | Script | R packages | +|---|---|---| +| 00 | Stage external inputs; create run directory | — | +| 01 | Convert ERA5 NetCDF to SIPNET climate format | `future`, `furrr` | +| 02 | Build initial condition ensemble | `tidyverse` | +| 03 | Build PEcAn settings XML | `PEcAn.settings` | + +After step 00, `template.xml` is patched with the `` dispatch block +selected by `pecan_dispatch` (and the Apptainer SIF path when applicable). + +**Requires:** `pecan_dispatch` set in config; `python3` on PATH. + +**Produces:** `settings.xml` in `run_dir`, ready for `run-ensembles`. + +### `run-ensembles` + +Runs `04_run_model.R` using the `settings.xml` produced by `prepare`. 
The R +script runs on the host and dispatches ensemble members to workers (local or +Slurm) as configured in the patched `settings.xml`. + +**Requires:** `PEcAn.all` R package; `settings.xml` present in `run_dir`. + +--- + +## Dispatch Options + +Set `pecan_dispatch` in your config to one of the following: + +| Value | Description | +|---|---| +| `local-gnu-parallel` | Runs ensemble members locally using GNU parallel. No cluster required. | +| `slurm-dispatch` | Submits ensemble members as Slurm batch jobs via `sbatch`. | + +The corresponding `` XML block is injected into `template.xml` during +`prepare` step 00. + +--- + +## Using Apptainer + +Set `use_apptainer: true` in your config to run the R steps inside the +workflow container. The CLI will: + +1. Attempt `module load apptainer` if `apptainer` is not already on PATH. +2. Look for the SIF file in `run_dir`. If absent, pull it from the registry + defined in `workflow_manifest.yaml`. +3. Bind `run_dir` and the repo root into the container for each R step. + +`run-ensembles` always runs `04_run_model.R` on the host, but when +`use_apptainer: true` the SIF must be present in `run_dir` because dispatched +job scripts reference it directly. + +--- + +## Supplying Your Own Data + +If you have your own ERA5, site, or template files, skip `get-demo-data` and +use `external_paths` in your config to inject them: + +```yaml +external_paths: + template_file: /path/to/my-template.xml +``` + +Each key must match a key under `paths` in `workflow_manifest.yaml`. The file +is copied into `run_dir` at the location the workflow expects, before `prepare` +runs. Paths may be absolute or relative to the directory where you invoke +`./magic-ensemble`. + +--- + +## Troubleshooting + +**`yq` not found or manifest parse fails** +Install mikefarah/yq v4. The `yq` distributed with some Linux package managers +is a different tool and is not compatible. + +**`run_dir is required`** +Your config file is missing `run_dir`. 
This is the only key with no default. + +**`Unknown pecan_dispatch value`** +The value of `pecan_dispatch` in your config does not match any key in +`workflow_manifest.yaml`. Valid options are printed when this error occurs. + +**`staged template.xml not found`** +`prepare` could not find `template.xml` in `run_dir`. Either run +`get-demo-data` first, or supply `external_paths.template_file` in your config. + +**`apptainer` not available** +Run `module load apptainer` before invoking the CLI, or ensure `apptainer` is +on your PATH. Singularity is not supported. From 2af7a276a20b4294bd31deb140c37be7ca08eca4 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Tue, 7 Apr 2026 17:04:42 +0000 Subject: [PATCH 7/7] Generalize patch_dispatch into patch_xml_block and add SIPNET model XML Replaces the host-only `patch_dispatch()` function with a generic `patch_xml_block()` that accepts an XML tag name and yq paths for both plain and Apptainer variants. Uses this to patch both the `` block (dispatch config) and the new `` block (SIPNET binary path) in a single prepare pass. Adds `sipnet_model.model_xml` and `sipnet_model.model_xml_apptainer` to the workflow manifest, selecting the Apptainer variant (absolute binary path inside the container) when `use_apptainer=true`. Updates developer docs to reflect the new calling convention and extensibility pattern. --- 2a_grass/workflow_manifest.yaml | 23 +++++++++++++++++ magic-ensemble | 45 ++++++++++++++++++++------------- magic-ensemble-DEVELOPERS.md | 38 ++++++++++++++++++---------- 3 files changed, 75 insertions(+), 31 deletions(-) diff --git a/2a_grass/workflow_manifest.yaml b/2a_grass/workflow_manifest.yaml index d703f96..4b46bff 100644 --- a/2a_grass/workflow_manifest.yaml +++ b/2a_grass/workflow_manifest.yaml @@ -144,6 +144,29 @@ pecan_dispatch: output/run +# SIPNET model XML block injected into template.xml during prepare. +# model_xml is used for host-native runs (sipnet.git must be on PATH). 
+# model_xml_apptainer is used when use_apptainer=true (binary path is inside container). +sipnet_model: + model_xml: | + + 99000000003 + SIPNET + git + TRUE + sipnet.git + cp data/events.in @RUNDIR@ + + model_xml_apptainer: | + + 99000000003 + SIPNET + git + TRUE + /usr/local/bin/sipnet.git + cp data/events.in @RUNDIR@ + + # Apptainer (not in user config) apptainer: remote: diff --git a/magic-ensemble b/magic-ensemble index 5a20008..b516e7b 100755 --- a/magic-ensemble +++ b/magic-ensemble @@ -388,40 +388,44 @@ validate_pecan_dispatch() { fi } -# --- Patch ... block in staged template.xml with chosen dispatch XML --- -# Selects host_xml_apptainer when use_apptainer=1 (with @SIF@ substituted); falls back to host_xml. -patch_dispatch() { +# --- Patch a named XML block in the staged template.xml from manifest values --- +# Usage: patch_xml_block +# +# Reads the XML block from the manifest at . If use_apptainer=1 +# and resolves to a non-null value in the manifest, uses +# that instead. Always runs @SIF@ substitution (no-op when @SIF@ is absent). +patch_xml_block() { + local xml_tag="$1" plain_yq_path="$2" apptainer_yq_path="$3" + if ! command -v python3 &>/dev/null; then - echo "magic-ensemble: python3 is required to patch dispatch in template.xml." >&2 + echo "magic-ensemble: python3 is required to patch <${xml_tag}> in template.xml." >&2 exit 1 fi + local template_path="${run_dir}/$(yq eval '.paths.template_file' "$MANIFEST")" if [[ ! -f "$template_path" ]]; then echo "magic-ensemble: staged template.xml not found at $template_path" >&2 exit 1 fi - # Select apptainer variant when available and requested; otherwise plain host_xml. 
- local host_xml_key="host_xml" + local yq_path="$plain_yq_path" if [[ $use_apptainer -eq 1 ]]; then - local has_apptainer_variant - has_apptainer_variant=$(yq eval ".pecan_dispatch[\"$pecan_dispatch\"] | has(\"host_xml_apptainer\")" "$MANIFEST") - if [[ "$has_apptainer_variant" == "true" ]]; then - host_xml_key="host_xml_apptainer" + local apptainer_val + apptainer_val=$(yq eval "$apptainer_yq_path" "$MANIFEST" 2>/dev/null) + if [[ -n "$apptainer_val" && "$apptainer_val" != "null" ]]; then + yq_path="$apptainer_yq_path" fi fi if [[ $VERBOSE -eq 1 ]]; then - echo "magic-ensemble: patching block in $template_path (pecan_dispatch=$pecan_dispatch, xml_key=$host_xml_key)" >&2 + echo "magic-ensemble: patching <${xml_tag}> block in $template_path (yq_path=${yq_path})" >&2 fi - local sif_name host_xml + local sif_name xml_block sif_name=$(yq eval '.apptainer.sif' "$MANIFEST") - # Substitute @SIF@ with the SIF filename (relative to run_dir, as jobs execute there). - host_xml=$(yq eval ".pecan_dispatch[\"$pecan_dispatch\"].$host_xml_key" "$MANIFEST" \ - | sed "s|@SIF@|./${sif_name}|g") + xml_block=$(yq eval "$yq_path" "$MANIFEST" | sed "s|@SIF@|./${sif_name}|g") - python3 "${REPO_ROOT}/tools/patch_xml.py" "$template_path" "host" "$host_xml" --block + python3 "${REPO_ROOT}/tools/patch_xml.py" "$template_path" "$xml_tag" "$xml_block" --block } # --- Get-demo-data: run steps from manifest (shell script only) --- @@ -461,7 +465,12 @@ run_prepare() { run_shell_script "$script" "$i" if [[ "$i" -eq 0 ]]; then echo "magic-ensemble: patching template.xml with dispatch: $pecan_dispatch" - patch_dispatch + patch_xml_block "host" \ + ".pecan_dispatch[\"$pecan_dispatch\"].host_xml" \ + ".pecan_dispatch[\"$pecan_dispatch\"].host_xml_apptainer" + patch_xml_block "model" \ + ".sipnet_model.model_xml" \ + ".sipnet_model.model_xml_apptainer" fi else case "$i" in @@ -505,7 +514,7 @@ run_prepare() { # --- Run-ensembles: run single step from manifest (04); never inside Apptainer. 
# When use_apptainer=1, the SIF must be present for dispatched jobs (already patched -# into template.xml via patch_dispatch during prepare); 04_run_model.R itself always +# into template.xml via patch_xml_block during prepare); 04_run_model.R itself always # runs on the host so it can submit further jobs to Slurm. run_run_ensembles() { get_steps_array diff --git a/magic-ensemble-DEVELOPERS.md b/magic-ensemble-DEVELOPERS.md index f388150..ab7493c 100644 --- a/magic-ensemble-DEVELOPERS.md +++ b/magic-ensemble-DEVELOPERS.md @@ -45,10 +45,12 @@ As written, a user can only inject files that are expected by the pipeline. 00_stage_external_inputs.sh → creates run_dir → copies external_paths files into run_dir (manifest-defined destinations) - → [patch_dispatch() runs after this step] - → reads pecan_dispatch host_xml from manifest - → substitutes @SIF@ if use_apptainer is set - → patches block in run_dir/template.xml via tools/patch_xml.py + → [patch_xml_block() runs twice after this step] + → patches block: reads pecan_dispatch host_xml from manifest, + substitutes @SIF@ if use_apptainer is set + → patches block: reads sipnet_model model_xml from manifest, + selects model_xml_apptainer variant if use_apptainer is set + → both use tools/patch_xml.py --block 01_ERA5_nc_to_clim.R reads: run_dir/data_raw/ERA5_nc, run_dir/site_info.csv @@ -152,14 +154,24 @@ When `use_apptainer` is set to `true` and the mode also defines `host_xml_apptai variant is used instead. The `@SIF@` string substituted with the SIF filename relative to `run_dir` (since dispatched jobs execute there). -### `patch_dispatch()` (`magic-ensemble` lines 390–422) +### `patch_xml_block()` (`magic-ensemble`) -Called immediately after step 00 in `prepare`. Steps: +Generic XML block patcher called immediately after step 00 in `prepare`. + +``` +patch_xml_block +``` + +Steps: 1. Resolve `template_path` as `run_dir + manifest.paths.template_file`. -2. 
Select `host_xml` or `host_xml_apptainer` based on `use_apptainer` and
-   manifest availability.
-3. Substitute `@SIF@` via `sed`.
-4. Call `tools/patch_xml.py` with `--block` to replace the entire `<host>` element.
+2. If `use_apptainer=1` and `<apptainer_yq_path>` resolves to a non-null value
+   in the manifest, use it; otherwise use `<plain_yq_path>`.
+3. Substitute `@SIF@` via `sed` (no-op when `@SIF@` is absent from the block).
+4. Call `tools/patch_xml.py` with `--block` to replace the entire element.
+
+Called twice in `run_prepare()`: once for `<host>` (dispatch XML) and once for
+`<model>` (SIPNET binary path). Adding a new patched block requires only one
+more `patch_xml_block` call with the appropriate manifest yq paths.
 
 ### `tools/patch_xml.py`
 
@@ -202,7 +214,7 @@ For each entry in `config.external_paths`:
    not source-derived.
 4. Parent directories are created if needed; file is copied with `cp -f`.
 
-This staging runs before `patch_dispatch()`, so `template.xml` is guaranteed
+This staging runs before `patch_xml_block()`, so `template.xml` is guaranteed
 to be present when the XML patching step fires.
 
 ---
@@ -232,7 +244,7 @@ paths as absolute values so scripts do not need to be CWD-aware.
 
 - Argument parsing, `get_val()`, path normalization
 - `check_aws` (for any command that fetches from S3)
 - `ensure_apptainer_available`, `ensure_sif_present`, `check_r_libs_for_step*`
-- `run_script`, `run_shell_script`, `patch_dispatch`
+- `run_script`, `run_shell_script`, `patch_xml_block`
 
 ### Update in `magic-ensemble`
 
@@ -249,7 +261,7 @@ paths as absolute values so scripts do not need to be CWD-aware.
 
 _(Placeholder — expand on this.)_
 
 Proposed tiers:
-- **Unit (bats/shunit2):** `get_val()` fallback behavior, `patch_dispatch()` XML
+- **Unit (bats/shunit2):** `get_val()` fallback behavior, `patch_xml_block()` XML
   output, path normalization and `external_paths` destination derivation using
   fixture configs and manifests.
- **Integration:** End-to-end `prepare` against a minimal fixture that exercises