Merged
10 changes: 5 additions & 5 deletions .devcontainer/devcontainer.json
@@ -1,5 +1,5 @@
{
"name": "yasha",
"name": "modelship",
"build": {
"dockerfile": "../Dockerfile",
"target": "dev",
@@ -10,8 +10,8 @@
"version": "20"
}
},
"workspaceFolder": "/yasha",
"workspaceMount": "source=${localWorkspaceFolder},target=/yasha,type=bind",
"workspaceFolder": "/modelship",
"workspaceMount": "source=${localWorkspaceFolder},target=/modelship,type=bind",
"runArgs": [
"--shm-size=8g",
"--gpus=all",
@@ -33,8 +33,8 @@
},
"remoteEnv": {
"HF_TOKEN": "${localEnv:HF_TOKEN}",
"YASHA_PLUGINS": "${localEnv:YASHA_PLUGINS}",
"YASHA_METRICS": "true"
"MSHIP_PLUGINS": "${localEnv:MSHIP_PLUGINS}",
"MSHIP_METRICS": "true"
},
"customizations": {
"vscode": {
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug_report.md
@@ -28,7 +28,7 @@ What actually happened. Include error messages or logs if available.
- **GPU**: (e.g. RTX 3090 24GB)
- **Docker version**:
- **NVIDIA driver version**:
-- **Yasha version/tag**:
+- **Modelship version/tag**:
- **OS**:

## Configuration
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
@@ -74,4 +74,4 @@ jobs:

## Configuration

-Create a `models.yaml` file and mount it at `/yasha/config/models.yaml`. Example configs for various GPU sizes are included in the image under `/yasha/config/` — use them as a reference for structure and available options, then tailor the models and GPU fractions to your hardware.
+Create a `models.yaml` file and mount it at `/modelship/config/models.yaml`. Example configs for various GPU sizes are included in the image under `/modelship/config/` — use them as a reference for structure and available options, then tailor the models and GPU fractions to your hardware.
8 changes: 4 additions & 4 deletions CONTRIBUTING.md
@@ -1,6 +1,6 @@
-# Contributing to Yasha
+# Contributing to Modelship

-Thanks for your interest in contributing to Yasha! This document covers the basics for getting started.
+Thanks for your interest in contributing to Modelship! This document covers the basics for getting started.

## Development Setup

@@ -9,7 +9,7 @@ The recommended way to develop is via the VS Code Dev Container. See [docs/devel
**Quick version:**

1. Install Docker + NVIDIA Container Toolkit
-2. Set `HF_TOKEN` and `YASHA_PLUGINS` environment variables
+2. Set `HF_TOKEN` and `MSHIP_PLUGINS` environment variables
3. Open the project in VS Code and select **Dev Containers: Reopen in Container**
4. Inside the container:

@@ -44,7 +44,7 @@ If you're contributing a TTS backend, see [docs/plugins.md](docs/plugins.md) for

## Reporting Issues

-Use [GitHub Issues](https://github.com/alez007/yasha/issues). For bugs, include:
+Use [GitHub Issues](https://github.com/alez007/modelship/issues). For bugs, include:

- GPU model and VRAM
- Your `models.yaml` configuration
22 changes: 11 additions & 11 deletions Dockerfile
@@ -36,7 +36,7 @@ RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && CUDA
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
ENV UV_LINK_MODE=copy

-WORKDIR /yasha
+WORKDIR /modelship

ADD ./pyproject.toml pyproject.toml
ADD ./README.md README.md
@@ -46,16 +46,16 @@ ADD ./plugins plugins
ENV UV_PROJECT_ENVIRONMENT=/.venv
ENV VIRTUAL_ENV=/.venv
ENV CUDA_DEVICE_ORDER=PCI_BUS_ID
-ENV YASHA_CACHE_DIR=/yasha/.cache/models
+ENV MSHIP_CACHE_DIR=/modelship/.cache/models
ENV RAY_REDIS_PORT=6379
ENV RAY_CLUSTER_ADDRESS=0.0.0.0
ENV RAY_HEAD_CPU_NUM=2
ENV RAY_HEAD_GPU_NUM=1
-ENV YASHA_USE_EXISTING_RAY_CLUSTER=false
-ENV YASHA_METRICS=true
+ENV MSHIP_USE_EXISTING_RAY_CLUSTER=false
+ENV MSHIP_METRICS=true
ENV RAY_METRICS_EXPORT_PORT=8079
-ENV YASHA_LOG_LEVEL=INFO
-ENV YASHA_LOG_FORMAT=text
+ENV MSHIP_LOG_LEVEL=INFO
+ENV MSHIP_LOG_FORMAT=text
RUN uv venv

ARG PYTHON_VERSION
@@ -71,8 +71,8 @@ FROM base AS dev
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --locked --no-install-project --extra dev

-ADD ./scripts/start_ray.sh /yasha/scripts/start_ray.sh
-RUN chmod +x /yasha/scripts/start_ray.sh
+ADD ./scripts/start_ray.sh /modelship/scripts/start_ray.sh
+RUN chmod +x /modelship/scripts/start_ray.sh

CMD ["/bin/bash"]

@@ -82,13 +82,13 @@ CMD ["/bin/bash"]
FROM base AS prod

ADD ./start.py start.py
-ADD ./yasha yasha
+ADD ./modelship modelship
ADD ./config config
ADD ./scripts scripts

RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --locked --no-install-project

-RUN chmod +x /yasha/scripts/start_ray.sh /yasha/scripts/start.sh
+RUN chmod +x /modelship/scripts/start_ray.sh /modelship/scripts/start.sh

CMD ["uv", "run", "--active", "bash", "/yasha/scripts/start.sh"]
CMD ["uv", "run", "--active", "bash", "/modelship/scripts/start.sh"]
65 changes: 35 additions & 30 deletions README.md
@@ -1,4 +1,4 @@
-# Yasha
+# Modelship

Self-hosted, multi-model AI inference server. Runs LLMs alongside specialized models (TTS, speech-to-text, embeddings, image generation) on one or more GPUs, exposing an OpenAI-compatible API. Built on [vLLM](https://github.com/vllm-project/vllm) and [Ray](https://github.com/ray-project/ray).

@@ -8,25 +8,30 @@ Self-hosted, multi-model AI inference server. Runs LLMs alongside specialized mo
graph TD
Client["Client (OpenAI SDK / curl)"]
API["FastAPI Gateway<br/>OpenAI-compatible API<br/>:8000"]
Ray["Ray Serve"]

Client -->|HTTP| API
-API --> Ray

-Ray --> LLM["LLM Deployment<br/>e.g. Llama 3.1 8B<br/>70% GPU"]
-Ray --> TTS["TTS Deployment<br/>e.g. Kokoro 82M<br/>5% GPU"]
-Ray --> STT["STT Deployment<br/>e.g. Whisper<br/>10% GPU"]
-Ray --> EMB["Embedding Deployment<br/>e.g. Nomic Embed<br/>5% GPU"]

subgraph GPU["Single GPU"]
LLM
TTS
STT
EMB
API -->|round-robin| LLM_GPU
API -->|round-robin| LLM_CPU
API -->|round-robin| TTS
API -->|round-robin| STT
API -->|round-robin| EMB

+subgraph GPU0["GPU 0"]
+LLM_GPU["LLM Deployment<br/>e.g. Llama 3.1 8B<br/>70% GPU"]
+TTS["TTS Deployment<br/>e.g. Kokoro 82M<br/>5% GPU"]
end

+subgraph GPU1["GPU 1"]
+STT["STT Deployment<br/>e.g. Whisper<br/>50% GPU"]
+EMB["Embedding Deployment<br/>e.g. Nomic Embed<br/>50% GPU"]
+end

+subgraph CPU["CPU-only"]
+LLM_CPU["LLM Deployment<br/>e.g. Llama 3.1 8B<br/>CPU-only replica"]
+end
```

-Each model runs as an isolated Ray Serve deployment with its own lifecycle, health checks, and GPU memory budget.
+Each model runs as an isolated [Ray Serve](https://docs.ray.io/en/latest/serve/index.html) deployment with its own lifecycle, health checks, and resource budget. Models can be deployed across multiple GPUs, run on CPU-only, or both — multiple deployments of the same model (e.g. one on GPU, one on CPU) are load-balanced with round-robin routing. Each deployment can also scale horizontally with `num_replicas`.
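The round-robin load balancing this paragraph describes can be sketched in a few lines. This is purely illustrative — the class and deployment names are assumptions, not Modelship's actual gateway code:

```python
from itertools import cycle

# Minimal sketch of round-robin routing across several deployments of the
# same model (e.g. one GPU replica and one CPU replica). Names are
# hypothetical; Modelship's real router lives behind the FastAPI gateway.
class RoundRobinRouter:
    def __init__(self, deployments):
        self._cycle = cycle(list(deployments))

    def route(self, request):
        # Each incoming request is handed to the next deployment in rotation.
        deployment = next(self._cycle)
        return deployment, request

router = RoundRobinRouter(["llama-gpu", "llama-cpu"])
targets = [router.route(f"req-{i}")[0] for i in range(4)]
print(targets)  # alternates between the two deployments
```

The point of the rotation is that a slow CPU replica and a fast GPU replica can share traffic without any request being pinned to one of them.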

## Requirements

@@ -36,15 +36,15 @@ Each model runs as an isolated Ray Serve deployment with its own lifecycle, heal

## Features

-- **Multi-model on a single GPU** — run chat, embedding, STT, TTS, and image generation models simultaneously with tunable per-model GPU memory allocation
-- **Per-model isolated deployments** — each model runs in its own Ray Serve deployment with independent lifecycle, health checks, and failure isolation
+- **Multi-model, multi-GPU** — run chat, embedding, STT, TTS, and image generation models simultaneously across one or more GPUs with tunable per-model GPU memory allocation; models can also run on CPU-only
+- **Per-model isolated deployments** — each model runs in its own Ray Serve deployment with independent lifecycle, health checks, failure isolation, and configurable replica count
- **OpenAI-compatible API** — drop-in replacement for any OpenAI SDK client
- **Streaming** — SSE streaming for chat completions and TTS audio
- **Tool/function calling** — auto tool choice with configurable parsers
- **Plugin system** — opt-in TTS backends installed as isolated uv workspace packages
-- **Multi-GPU support** — assign models to specific GPUs by index or named Ray resource, with full tensor parallelism support
+- **Multi-GPU & hybrid routing** — assign models to specific GPUs or run them on CPU-only; deploy the same model on both GPU and CPU and requests are load-balanced via round-robin; full tensor parallelism support for large models spanning multiple GPUs
- **Client disconnect detection** — cancels in-flight inference when the client disconnects, freeing GPU resources immediately
-- **Prometheus metrics & Grafana dashboard** — built-in observability with custom `yasha:*` metrics, vLLM engine stats, and Ray cluster metrics on a single scrape endpoint; pre-built Grafana dashboard included
+- **Prometheus metrics & Grafana dashboard** — built-in observability with custom `modelship:*` metrics, vLLM engine stats, and Ray cluster metrics on a single scrape endpoint; pre-built Grafana dashboard included
- **Ray dashboard** — monitor deployments, resources, and request logs

## Supported OpenAI Endpoints
@@ -64,24 +69,24 @@
Pull the latest image from GHCR:

```bash
-docker pull ghcr.io/alez007/yasha:latest
+docker pull ghcr.io/alez007/modelship:latest
```

Grab an example config for your GPU and edit it to your liking:

```bash
-docker run --rm ghcr.io/alez007/yasha:latest cat /yasha/config/models.example.16GB.yaml > models.yaml
+docker run --rm ghcr.io/alez007/modelship:latest cat /modelship/config/models.example.16GB.yaml > models.yaml
```

Start the server:

```bash
docker run --rm --shm-size=8g --gpus all \
-e HF_TOKEN=your_token_here \
--e YASHA_PLUGINS=kokoro \
--v ./models.yaml:/yasha/config/models.yaml \
--v ./models-cache:/yasha/.cache/models \
--p 8265:8265 -p 8000:8000 -p 8079:8079 ghcr.io/alez007/yasha:latest
+-e MSHIP_PLUGINS=kokoro \
+-v ./models.yaml:/modelship/config/models.yaml \
+-v ./models-cache:/modelship/.cache/models \
+-p 8265:8265 -p 8000:8000 -p 8079:8079 ghcr.io/alez007/modelship:latest
```

Try it out:
@@ -103,7 +108,7 @@ Example configs are included for 8 GB, 16 GB, 24 GB, and 2×16 GB GPU setups.

## Plugin Support

-Yasha's TTS system is built around a plugin architecture — each TTS backend is an opt-in package with its own isolated dependencies. Plugins ship inside this repo (`plugins/`) or can be installed from PyPI.
+Modelship's TTS system is built around a plugin architecture — each TTS backend is an opt-in package with its own isolated dependencies. Plugins ship inside this repo (`plugins/`) or can be installed from PyPI.

To enable plugins, pass them as extras at sync time:

@@ -112,10 +117,10 @@
uv sync --extra kokoro --extra orpheus # multiple plugins
```

-When using Docker, set the `YASHA_PLUGINS` environment variable:
+When using Docker, set the `MSHIP_PLUGINS` environment variable:

```
-YASHA_PLUGINS=kokoro,orpheus
+MSHIP_PLUGINS=kokoro,orpheus
```
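The variable holds a plain comma-separated list, which the container entrypoint turns into `uv sync --extra …` flags. A sketch of that parsing (illustrative only — the real logic lives in the startup scripts, not in this form):

```python
import os

# Hypothetical sketch: turn MSHIP_PLUGINS=kokoro,orpheus into uv sync extras.
os.environ["MSHIP_PLUGINS"] = "kokoro,orpheus"

plugins = [p.strip() for p in os.environ.get("MSHIP_PLUGINS", "").split(",") if p.strip()]
extras = [arg for p in plugins for arg in ("--extra", p)]
print(extras)  # e.g. ['--extra', 'kokoro', '--extra', 'orpheus']
```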

For a full guide on writing your own plugin, see [Plugin Development](docs/plugins.md).
@@ -131,9 +136,9 @@

## Monitoring

-Yasha exposes Prometheus metrics (Ray cluster, Ray Serve, vLLM, and custom `yasha:*` metrics) through a single scrape endpoint on port 8079. Metrics are **enabled by default** — set `YASHA_METRICS=false` to disable. A pre-built Grafana dashboard is included.
+Modelship exposes Prometheus metrics (Ray cluster, Ray Serve, vLLM, and custom `modelship:*` metrics) through a single scrape endpoint on port 8079. Metrics are **enabled by default** — set `MSHIP_METRICS=false` to disable. A pre-built Grafana dashboard is included.

-Logging supports structured JSON output (`YASHA_LOG_FORMAT=json`) and request ID correlation across Ray actor boundaries. Set `YASHA_LOG_LEVEL` to `DEBUG` for request bodies or `TRACE` to include library internals.
+Logging supports structured JSON output (`MSHIP_LOG_FORMAT=json`) and request ID correlation across Ray actor boundaries. Set `MSHIP_LOG_LEVEL` to `DEBUG` for request bodies or `TRACE` to include library internals.
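Structured JSON logging with a correlated request ID can be sketched with the standard `logging` module. The field names here are assumptions for illustration, not Modelship's actual log schema:

```python
import json
import logging

# Hypothetical JSON formatter: one JSON object per log line, carrying an
# optional request_id so lines from different actors can be correlated.
class JsonFormatter(logging.Formatter):
    def format(self, record):
        return json.dumps({
            "level": record.levelname,
            "message": record.getMessage(),
            "request_id": getattr(record, "request_id", None),
        })

logger = logging.getLogger("demo")
handler = logging.StreamHandler()
handler.setFormatter(JsonFormatter())
logger.addHandler(handler)
logger.warning("inference cancelled", extra={"request_id": "req-123"})
```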

See [Monitoring & Logging](docs/monitoring.md) for full details.

2 changes: 1 addition & 1 deletion SECURITY.md
@@ -15,7 +15,7 @@ Only the latest release receives security fixes.

Use GitHub's private advisory system to report vulnerabilities confidentially:

-1. Go to the [Security Advisories](https://github.com/alez007/yasha/security/advisories) page
+1. Go to the [Security Advisories](https://github.com/alez007/modelship/security/advisories) page
2. Click **Report a vulnerability**
3. Fill in the details — affected component, reproduction steps, potential impact

16 changes: 8 additions & 8 deletions docs/architecture.md
@@ -2,7 +2,7 @@

## Overview

-Yasha is built on two core technologies:
+Modelship is built on two core technologies:
- **[Ray Serve](https://docs.ray.io/en/latest/serve/)** — manages model deployments as isolated actors with independent scaling and failure handling
- **[vLLM](https://github.com/vllm-project/vllm)** — high-throughput LLM inference engine with continuous batching and PagedAttention

@@ -48,7 +48,7 @@ TTS backends are isolated `uv` workspace packages under `plugins/`. Each plugin:

- Implements `BasePlugin` with `start()` and `generate()` methods
- Has its own dependencies, isolated from the main project
-- Is opt-in via `uv sync --extra <plugin>` or the `YASHA_PLUGINS` env var
+- Is opt-in via `uv sync --extra <plugin>` or the `MSHIP_PLUGINS` env var
- Returns audio as a single response or as an SSE async generator

See [Plugin Development](plugins.md) for details.
@@ -58,10 +58,10 @@ See [Plugin Development](plugins.md) for details.
| File | Purpose |
|------|---------|
| `start.py` | Entry point — initializes Ray, deploys models |
-| `yasha/openai/api.py` | FastAPI gateway with OpenAI endpoints |
-| `yasha/infer/model_deployment.py` | Ray Serve deployment actor |
-| `yasha/infer/infer_config.py` | Pydantic config models and protocols |
-| `yasha/infer/vllm/vllm_infer.py` | vLLM engine wrapper |
-| `yasha/infer/diffusers/diffusers_infer.py` | Diffusers pipeline wrapper |
-| `yasha/plugins/base_plugin.py` | Plugin base classes |
+| `modelship/openai/api.py` | FastAPI gateway with OpenAI endpoints |
+| `modelship/infer/model_deployment.py` | Ray Serve deployment actor |
+| `modelship/infer/infer_config.py` | Pydantic config models and protocols |
+| `modelship/infer/vllm/vllm_infer.py` | vLLM engine wrapper |
+| `modelship/infer/diffusers/diffusers_infer.py` | Diffusers pipeline wrapper |
+| `modelship/plugins/base_plugin.py` | Plugin base classes |
| `config/models.yaml` | Model configuration |
20 changes: 10 additions & 10 deletions docs/development.md
@@ -7,13 +7,13 @@

## Quick start (Dev Container)

-The recommended way to develop Yasha is with VS Code Dev Containers. The configuration in `.devcontainer/` builds the dev image, mounts the repo, forwards ports, and installs all required extensions automatically.
+The recommended way to develop Modelship is with VS Code Dev Containers. The configuration in `.devcontainer/` builds the dev image, mounts the repo, forwards ports, and installs all required extensions automatically.

1. Set required environment variables on your host:

```bash
export HF_TOKEN=your_token_here
-export YASHA_PLUGINS=kokoro # optional — comma-separated list of plugins to install
+export MSHIP_PLUGINS=kokoro # optional — comma-separated list of plugins to install
```

2. Open the repo in VS Code and run **Dev Containers: Reopen in Container** from the command palette (`Ctrl+Shift+P` / `Cmd+Shift+P`).
@@ -43,7 +43,7 @@ The recommended way to develop Yasha is with VS Code Dev Containers. The configu

The Dev Container automatically:
- Builds the dev image from `Dockerfile` (target: `dev`)
-- Bind-mounts the repo to `/yasha` for live editing
+- Bind-mounts the repo to `/modelship` for live editing
- Forwards ports `8000` (API) and `8265` (Ray Dashboard)
- Installs extensions: Ruff, Python, Pyright, and Claude Code
- Configures the Python interpreter and linting to use the container's venv at `/.venv`
@@ -58,8 +58,8 @@ The following environment variables are set in the dev image with sensible defau
| `RAY_CLUSTER_ADDRESS` | `ray://0.0.0.0` | Ray cluster address |
| `RAY_HEAD_CPU_NUM` | `2` | CPUs allocated to Ray head |
| `RAY_HEAD_GPU_NUM` | `1` | GPUs allocated to Ray head |
-| `YASHA_CACHE_DIR` | `/yasha/.cache/models` | Model cache directory |
-| `YASHA_USE_EXISTING_RAY_CLUSTER` | `false` | Set to `true` to skip starting a Ray head node |
+| `MSHIP_CACHE_DIR` | `/modelship/.cache/models` | Model cache directory |
+| `MSHIP_USE_EXISTING_RAY_CLUSTER` | `false` | Set to `true` to skip starting a Ray head node |
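The defaults in the table above can be modeled as an env-with-fallback lookup. A sketch under stated assumptions — this is not Modelship's actual configuration code, just an illustration of the documented defaults:

```python
import os

# Documented dev-image defaults (subset of the table above).
DEFAULTS = {
    "RAY_REDIS_PORT": "6379",
    "RAY_HEAD_CPU_NUM": "2",
    "RAY_HEAD_GPU_NUM": "1",
    "MSHIP_CACHE_DIR": "/modelship/.cache/models",
    "MSHIP_USE_EXISTING_RAY_CLUSTER": "false",
}

# Clear any ambient values so the example is deterministic.
for name in DEFAULTS:
    os.environ.pop(name, None)

def setting(name: str) -> str:
    # Environment wins; otherwise fall back to the documented default.
    return os.environ.get(name, DEFAULTS[name])

use_existing = setting("MSHIP_USE_EXISTING_RAY_CLUSTER").lower() == "true"
print(setting("MSHIP_CACHE_DIR"), use_existing)
```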

### Installing plugin dependencies for IntelliSense

@@ -76,7 +76,7 @@ If you prefer not to use Dev Containers, you can build and run the dev image dir
### Building the dev image

```bash
-docker build -t yasha_dev --target dev .
+docker build -t modelship_dev --target dev .
```

### Running with live source mounting
@@ -86,12 +86,12 @@ The dev image does not bake in source files. Mount the repo root so changes take
```bash
docker run -it --rm --shm-size=8g --gpus all \
-e HF_TOKEN=your_token_here \
-e YASHA_PLUGINS=kokoro \
--mount type=bind,src=./,dst=/yasha \
-p 8265:8265 -p 8000:8000 yasha_dev
-e MSHIP_PLUGINS=kokoro \
--mount type=bind,src=./,dst=/modelship \
-p 8265:8265 -p 8000:8000 modelship_dev
```

The container's entrypoint (`start.sh`) automatically syncs dependencies (including any plugins listed in `YASHA_PLUGINS`), starts the Ray head node, and drops into a shell. Then start the server:
The container's entrypoint (`start.sh`) automatically syncs dependencies (including any plugins listed in `MSHIP_PLUGINS`), starts the Ray head node, and drops into a shell. Then start the server:

```bash
uv run start.py