alez007 · alez007 · Apr 7, 2026 · Apr 7, 2026
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -1,7 +1,8 @@
 {
   "name": "yasha",
   "build": {
-    "dockerfile": "../Dockerfile.dev",
+    "dockerfile": "../Dockerfile",
+    "target": "dev",
     "context": ".."
   },
   "features": {

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -40,7 +40,8 @@ jobs:
         uses: docker/build-push-action@v6
         with:
           context: .
-          file: Dockerfile.prod
+          file: Dockerfile
+          target: prod
           push: true
           build-args: |
             CUDA_VERSION=13.0.2

diff --git a/Dockerfile.prod → Dockerfile b/Dockerfile.prod → Dockerfile
@@ -41,9 +41,6 @@ WORKDIR /yasha
 ADD ./pyproject.toml pyproject.toml
 ADD ./README.md README.md
 ADD ./uv.lock uv.lock
-ADD ./start.py start.py
-ADD ./yasha yasha
-ADD ./config config
 ADD ./plugins plugins
 
 ENV UV_PROJECT_ENVIRONMENT=/.venv
@@ -63,36 +60,32 @@ RUN uv python install ${PYTHON_VERSION}
 
 ENV PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH"
 
+# ---------------------------------------------------------------------------
+# Development target
+# ---------------------------------------------------------------------------
+FROM base AS dev
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv sync --locked --no-install-project --extra dev
+
+# --chmod sets the executable bit at copy time, so no separate chmod layer is needed.
+COPY --chmod=0755 ./scripts/start_ray.sh /yasha/scripts/start_ray.sh
+
+CMD ["/bin/bash"]
+
+# ---------------------------------------------------------------------------
+# Production target
+# ---------------------------------------------------------------------------
+FROM base AS prod
+# Sync dependencies before copying sources so source-only edits reuse this layer.
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv sync --locked --no-install-project
 
+COPY ./start.py start.py
+COPY ./yasha yasha
+COPY ./config config
+COPY ./scripts scripts
+
-WORKDIR /
-COPY <<EOF start.sh
-#!/bin/bash
-
-EXTRAS=""
-if [ -n "\${YASHA_PLUGINS}" ]; then
-    for plugin in \$(echo "\${YASHA_PLUGINS}" | tr ',' ' '); do
-        EXTRAS="\$EXTRAS --extra \$plugin"
-    done
-fi
-uv sync --project /yasha --locked \$EXTRAS
-
-if [ "\${YASHA_USE_EXISTING_RAY_CLUSTER}" != "true" ]; then
-    METRICS_FLAG=""
-    if [ "\${YASHA_METRICS}" = "true" ]; then
-        METRICS_FLAG="--metrics-export-port=\${RAY_METRICS_EXPORT_PORT}"
-    fi
-    cd /yasha && ray start --head --port=\${RAY_REDIS_PORT} --dashboard-host=0.0.0.0 --num-cpus=\${RAY_HEAD_CPU_NUM} --num-gpus=\${RAY_HEAD_GPU_NUM} --disable-usage-stats \${METRICS_FLAG}
-    if ! ray status --address=\${RAY_CLUSTER_ADDRESS}:\${RAY_REDIS_PORT}; then
-        echo "ray cluster failed to start"
-        exit 1
-    fi
-fi
-cd /yasha && uv run start.py
-
-EOF
-RUN chmod +x start.sh
-
-WORKDIR /
-CMD ["uv", "run", "--active", "bash", "start.sh"]
+RUN chmod +x /yasha/scripts/start_ray.sh /yasha/scripts/start.sh
+
+CMD ["uv", "run", "--active", "bash", "/yasha/scripts/start.sh"]
diff --git a/Dockerfile.dev b/Dockerfile.dev
diff --git a/README.md b/README.md
@@ -44,6 +44,7 @@ Each model runs as an isolated Ray Serve deployment with its own lifecycle, heal
 - **Plugin system** — opt-in TTS backends installed as isolated uv workspace packages
 - **Multi-GPU support** — assign models to specific GPUs by index or named Ray resource, with full tensor parallelism support
 - **Client disconnect detection** — cancels in-flight inference when the client disconnects, freeing GPU resources immediately
+- **Prometheus metrics & Grafana dashboard** — built-in observability with custom `yasha:*` metrics, vLLM engine stats, and Ray cluster metrics on a single scrape endpoint; pre-built Grafana dashboard included
 - **Ray dashboard** — monitor deployments, resources, and request logs
 
 ## Supported OpenAI Endpoints
@@ -80,7 +81,7 @@ docker run --rm --shm-size=8g --gpus all \
   -e YASHA_PLUGINS=kokoro \
   -v ./models.yaml:/yasha/config/models.yaml \
   -v ./models-cache:/yasha/.cache/models \
-  -p 8265:8265 -p 8000:8000 ghcr.io/alez007/yasha:latest
+  -p 8265:8265 -p 8000:8000 -p 8079:8079 ghcr.io/alez007/yasha:latest
 ```
 
 Try it out:
@@ -95,6 +96,7 @@ curl http://localhost:8000/v1/chat/completions \
 ```
 
 - API: `http://localhost:8000`
+- Prometheus metrics: `http://localhost:8079`
 - Ray dashboard: `http://localhost:8265`
 
 Example configs are included for 8 GB, 16 GB, 24 GB, and 2×16 GB GPU setups.
@@ -129,7 +131,7 @@ For a full guide on writing your own plugin, see [Plugin Development](docs/plugi
 
 ## Monitoring
 
-Yasha exposes Prometheus metrics (Ray cluster, Ray Serve, vLLM, and custom `yasha:*` metrics) through a single port. Enable with `YASHA_METRICS=true` and scrape port 8079. A pre-built Grafana dashboard is included. See [Monitoring](docs/monitoring.md) for setup details.
+Yasha exposes Prometheus metrics (Ray cluster, Ray Serve, vLLM, and custom `yasha:*` metrics) through a single scrape endpoint on port 8079. Metrics are **enabled by default** — set `YASHA_METRICS=false` to disable. A pre-built Grafana dashboard is included. See [Monitoring](docs/monitoring.md) for setup details.
 
 ## Future Work
 

diff --git a/docs/development.md b/docs/development.md
@@ -42,7 +42,7 @@ The recommended way to develop Yasha is with VS Code Dev Containers. The configu
 > **Why the extra steps?** The Dev Container overrides the image's default `CMD` (which normally runs `start.sh` to sync deps and start Ray). Inside a Dev Container you need to run these steps manually.
 
 The Dev Container automatically:
-- Builds the dev image from `Dockerfile.dev`
+- Builds the dev image from `Dockerfile` (target: `dev`)
 - Bind-mounts the repo to `/yasha` for live editing
 - Forwards ports `8000` (API) and `8265` (Ray Dashboard)
 - Installs extensions: Ruff, Python, Pyright, and Claude Code
@@ -76,7 +76,7 @@ If you prefer not to use Dev Containers, you can build and run the dev image dir
 ### Building the dev image
 
 ```bash
-docker build -t yasha_dev -f Dockerfile.dev .
+docker build -t yasha_dev --target dev .
 ```
 
 ### Running with live source mounting

diff --git a/scripts/start.sh b/scripts/start.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Container entrypoint: sync dependencies (plus any plugin extras), optionally
+# start a local Ray head node, then launch start.py.
+set -e
+
+# YASHA_PLUGINS is a comma-separated list; each entry becomes one `--extra`.
+EXTRAS=""
+if [ -n "${YASHA_PLUGINS}" ]; then
+    for plugin in $(echo "${YASHA_PLUGINS}" | tr ',' ' '); do
+        EXTRAS="$EXTRAS --extra $plugin"
+    done
+fi
+# EXTRAS is intentionally unquoted so it splits into separate arguments.
+uv sync --project /yasha --locked $EXTRAS
+
+if [ "${YASHA_USE_EXISTING_RAY_CLUSTER}" != "true" ]; then
+    # Forward YASHA_METRICS so that YASHA_METRICS=false disables the Ray metrics
+    # export; when the variable is unset or empty, metrics stay enabled.
+    /yasha/scripts/start_ray.sh \
+        --num-cpus "${RAY_HEAD_CPU_NUM}" \
+        --num-gpus "${RAY_HEAD_GPU_NUM}" \
+        --enable-metrics "${YASHA_METRICS:-true}"
+fi
+
+cd /yasha && uv run start.py
diff --git a/scripts/start_ray.sh b/scripts/start_ray.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Start a Ray head node and verify that the cluster is reachable.
+#
+# Options (each takes a value):
+#   --num-cpus <n>                  passed to `ray start --num-cpus` (required)
+#   --num-gpus <n>                  passed to `ray start --num-gpus` (required)
+#   --enable-metrics <true|false>   add --metrics-export-port (default: true)
+# Environment:
+#   RAY_METRICS_EXPORT_PORT         metrics port, 8079 when unset or empty
+set -e
+
+# Print the usage text to stderr and exit with status 1.
+usage() {
+    echo "Usage: start_ray.sh --num-cpus <n> --num-gpus <n> [--enable-metrics <true|false>]" >&2
+    exit 1
+}
+
+ENABLE_METRICS="true"
+
+while [[ $# -gt 0 ]]; do
+    # Every option takes a value. Without this guard a trailing option would make
+    # `shift 2` fail and `set -e` would end the script without any message.
+    [[ $# -ge 2 ]] || usage
+    case "$1" in
+        --num-cpus) NUM_CPUS="$2" ;;
+        --num-gpus) NUM_GPUS="$2" ;;
+        --enable-metrics) ENABLE_METRICS="$2" ;;
+        *) usage ;;
+    esac
+    shift 2
+done
+
+if [ -z "${NUM_CPUS}" ] || [ -z "${NUM_GPUS}" ]; then
+    usage
+fi
+
+METRICS_FLAG=""
+if [ "${ENABLE_METRICS}" = "true" ]; then
+    METRICS_FLAG="--metrics-export-port=${RAY_METRICS_EXPORT_PORT:-8079}"
+fi
+
+# METRICS_FLAG is left unquoted on purpose: when empty it must expand to nothing.
+ray start --head \
+    --dashboard-host=0.0.0.0 \
+    --num-cpus="${NUM_CPUS}" \
+    --num-gpus="${NUM_GPUS}" \
+    --disable-usage-stats \
+    ${METRICS_FLAG}
+
+if ! ray status; then
+    echo "ray cluster failed to start" >&2
+    exit 1
+fi