From ef4170e7b7838f237be6b643616f79e595820f06 Mon Sep 17 00:00:00 2001 From: Alex M Date: Tue, 7 Apr 2026 07:35:48 +0000 Subject: [PATCH] refactor: unify Dockerfiles into multi-stage build and update README Merge Dockerfile.dev and Dockerfile.prod into a single multi-stage Dockerfile with dev/prod targets. Extract entrypoint scripts to scripts/. Update README to reflect metrics enabled by default and add observability to the features list. --- .devcontainer/devcontainer.json | 3 +- .github/workflows/release.yml | 3 +- Dockerfile.prod => Dockerfile | 59 +++++++++----------- Dockerfile.dev | 99 --------------------------------- README.md | 6 +- docs/development.md | 4 +- scripts/start.sh | 16 ++++++ scripts/start_ray.sh | 38 +++++++++++++ 8 files changed, 90 insertions(+), 138 deletions(-) rename Dockerfile.prod => Dockerfile (69%) delete mode 100644 Dockerfile.dev create mode 100755 scripts/start.sh create mode 100755 scripts/start_ray.sh diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 121e13a..5764a43 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,7 +1,8 @@ { "name": "yasha", "build": { - "dockerfile": "../Dockerfile.dev", + "dockerfile": "../Dockerfile", + "target": "dev", "context": ".." }, "features": { diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 3339712..77e82a7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -40,7 +40,8 @@ jobs: uses: docker/build-push-action@v6 with: context: . 
- file: Dockerfile.prod + file: Dockerfile + target: prod push: true build-args: | CUDA_VERSION=13.0.2 diff --git a/Dockerfile.prod b/Dockerfile similarity index 69% rename from Dockerfile.prod rename to Dockerfile index 9a9e2fc..74e0296 100644 --- a/Dockerfile.prod +++ b/Dockerfile @@ -41,9 +41,6 @@ WORKDIR /yasha ADD ./pyproject.toml pyproject.toml ADD ./README.md README.md ADD ./uv.lock uv.lock -ADD ./start.py start.py -ADD ./yasha yasha -ADD ./config config ADD ./plugins plugins ENV UV_PROJECT_ENVIRONMENT=/.venv @@ -63,36 +60,32 @@ RUN uv python install ${PYTHON_VERSION} ENV PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH" +# --------------------------------------------------------------------------- +# Development target +# --------------------------------------------------------------------------- +FROM base AS dev + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv sync --locked --no-install-project --extra dev + +ADD ./scripts/start_ray.sh /yasha/scripts/start_ray.sh +RUN chmod +x /yasha/scripts/start_ray.sh + +CMD ["/bin/bash"] + +# --------------------------------------------------------------------------- +# Production target +# --------------------------------------------------------------------------- +FROM base AS prod + +ADD ./start.py start.py +ADD ./yasha yasha +ADD ./config config +ADD ./scripts scripts + RUN --mount=type=cache,target=/root/.cache/uv \ uv sync --locked --no-install-project -WORKDIR / -COPY < /etc/apt/sources.list.d/cuda.list - -RUN apt-get update -y && \ - apt-get install -y --no-install-recommends \ - build-essential \ - curl \ - espeak-ng \ - git - -RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | cut -d. 
-f1) && \ - apt update -y && \ - apt install -y --no-install-recommends \ - build-essential \ - cuda-nvcc-${CUDA_VERSION_DASH} \ - cuda-cudart-${CUDA_VERSION_DASH} \ - cuda-nvrtc-${CUDA_VERSION_DASH} \ - cuda-cuobjdump-${CUDA_VERSION_DASH} \ - libcurand-dev-${CUDA_VERSION_DASH} \ - libcublas-${CUDA_VERSION_DASH} \ - cudnn9-cuda-${CUDA_MAJOR_VERSION} - -# Install uv -COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ -ENV UV_LINK_MODE=copy - -WORKDIR /yasha - -# Copy dependency manifests and workspace members so uv can resolve and -# pre-install packages during the image build. At runtime the dev container -# bind-mounts the full host repo over /yasha, shadowing everything below. -ADD ./pyproject.toml pyproject.toml -ADD ./README.md README.md -ADD ./uv.lock uv.lock -ADD ./plugins plugins - -ENV UV_PROJECT_ENVIRONMENT=/.venv -ENV VIRTUAL_ENV=/.venv -ENV YASHA_CACHE_DIR=/yasha/.cache/models -ENV RAY_REDIS_PORT=6379 -ENV RAY_CLUSTER_ADDRESS=0.0.0.0 -ENV RAY_HEAD_CPU_NUM=2 -ENV RAY_HEAD_GPU_NUM=1 -ENV YASHA_USE_EXISTING_RAY_CLUSTER=false -ENV YASHA_METRICS=true -ENV RAY_METRICS_EXPORT_PORT=8079 -RUN uv venv - -ARG PYTHON_VERSION -RUN uv python install ${PYTHON_VERSION} - -ENV PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH" - -RUN --mount=type=cache,target=/root/.cache/uv \ - uv sync --locked --no-install-project --extra dev - -WORKDIR / -COPY < **Why the extra steps?** The Dev Container overrides the image's default `CMD` (which normally runs `start.sh` to sync deps and start Ray). Inside a Dev Container you need to run these steps manually. 
The Dev Container automatically: -- Builds the dev image from `Dockerfile.dev` +- Builds the dev image from `Dockerfile` (target: `dev`) - Bind-mounts the repo to `/yasha` for live editing - Forwards ports `8000` (API) and `8265` (Ray Dashboard) - Installs extensions: Ruff, Python, Pyright, and Claude Code @@ -76,7 +76,7 @@ If you prefer not to use Dev Containers, you can build and run the dev image dir ### Building the dev image ```bash -docker build -t yasha_dev -f Dockerfile.dev . +docker build -t yasha_dev --target dev . ``` ### Running with live source mounting diff --git a/scripts/start.sh b/scripts/start.sh new file mode 100755 index 0000000..d9a24b8 --- /dev/null +++ b/scripts/start.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -e + +EXTRAS="" +if [ -n "${YASHA_PLUGINS}" ]; then + for plugin in $(echo "${YASHA_PLUGINS}" | tr ',' ' '); do + EXTRAS="$EXTRAS --extra $plugin" + done +fi +uv sync --project /yasha --locked $EXTRAS + +if [ "${YASHA_USE_EXISTING_RAY_CLUSTER}" != "true" ]; then + /yasha/scripts/start_ray.sh --num-cpus "${RAY_HEAD_CPU_NUM}" --num-gpus "${RAY_HEAD_GPU_NUM}" +fi + +cd /yasha && uv run start.py diff --git a/scripts/start_ray.sh b/scripts/start_ray.sh new file mode 100755 index 0000000..fdba72b --- /dev/null +++ b/scripts/start_ray.sh @@ -0,0 +1,38 @@ +#!/bin/bash +set -e + +usage() { + echo "Usage: start_ray.sh --num-cpus <n> --num-gpus <n> [--enable-metrics <true|false>]" + exit 1 +} + +ENABLE_METRICS="true" + +while [[ $# -gt 0 ]]; do + case "$1" in + --num-cpus) NUM_CPUS="$2"; shift 2 ;; + --num-gpus) NUM_GPUS="$2"; shift 2 ;; + --enable-metrics) ENABLE_METRICS="$2"; shift 2 ;; + *) usage ;; + esac +done + +[ -z "${NUM_CPUS}" ] && usage +[ -z "${NUM_GPUS}" ] && usage + +METRICS_FLAG="" +if [ "${ENABLE_METRICS}" = "true" ]; then + METRICS_FLAG="--metrics-export-port=${RAY_METRICS_EXPORT_PORT:-8079}" +fi + +ray start --head \ + --dashboard-host=0.0.0.0 \ + --num-cpus="${NUM_CPUS}" \ + --num-gpus="${NUM_GPUS}" \ + --disable-usage-stats \ + ${METRICS_FLAG} + +if !
ray status; then + echo "ray cluster failed to start" + exit 1 +fi