From 1e96fa1a9b7b4c0c228e39131ba588edc464b2f2 Mon Sep 17 00:00:00 2001
From: Yuan Fang
Date: Wed, 28 Jan 2026 17:34:53 +0800
Subject: [PATCH 1/3] add triton runtime

---
 .../how_to/custom_inference_runtime.mdx | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx b/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
index 3451a9ce..91aedf20 100644
--- a/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
+++ b/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
@@ -282,6 +282,62 @@ spec:
 ```
 
+### Triton Inference Server
+
+The Triton Inference Server runtime is designed for NVIDIA GPUs and supports multiple model formats. As with MLServer, you need to create the `ClusterServingRuntime` resource first and then create your inference service.
+
+```yaml
+apiVersion: serving.kserve.io/v1alpha1
+kind: ClusterServingRuntime
+metadata:
+  annotations:
+    cpaas.io/display-name: triton-cuda12-x86
+  labels:
+    cpaas.io/accelerator-type: nvidia
+    cpaas.io/cuda-version: "12.1"
+    cpaas.io/runtime-class: triton
+  name: aml-triton-cuda-12
+spec:
+  containers:
+  - command:
+    - /bin/bash
+    - -c
+    - >
+      tritonserver --log-verbose=1 --http-port=8080
+      --model-repository=/mnt/models
+    env:
+    - name: OMP_NUM_THREADS
+      value: "1"
+    - name: MODEL_REPO
+      value: '{{ index .Annotations "aml-model-repo" }}'
+    image: 152-231-registry.alauda.cn:60070/mlops/tritonserver:25.02-py3
+    name: kserve-container
+    resources:
+      limits:
+        cpu: 2
+        memory: 6Gi
+      requests:
+        cpu: 2
+        memory: 6Gi
+    securityContext:
+      allowPrivilegeEscalation: false
+      capabilities:
+        drop:
+        - ALL
+      privileged: false
+      runAsNonRoot: true
+      runAsUser: 1000
+  supportedModelFormats:
+  - name: triton
+    version: "1"
+```
+
+**Usage Instructions:**
+
+1. **Create the ClusterServingRuntime**: Apply the YAML configuration above with `kubectl apply -f triton-runtime.yaml`.
+2. **Prepare Your Model**: Ensure your model is in a format supported by Triton (e.g., TensorFlow, PyTorch, ONNX).
+3. **Set Model Framework**: In the model repository, set the framework metadata to `triton` so that it matches the `supportedModelFormats` field.
+4. **Create Inference Service**: When publishing your inference service, select the Triton runtime from the runtime dropdown menu.
 
 ### MindIE (Ascend NPU 310P)
 
@@ -620,4 +676,5 @@ Before proceeding, refer to this table to understand the specific requirements f
 | :--- | :--- | :--- | :--- |
 | **Xinference** | CPU / NVIDIA GPU | transformers, pytorch | **Must** set `MODEL_FAMILY` environment variable |
 | **MLServer** | CPU / NVIDIA GPU | sklearn, xgboost, mlflow | Standard configuration |
+| **Triton** | NVIDIA GPU | triton (TensorFlow, PyTorch, ONNX, etc.) | Standard configuration |
 | **MindIE** | Huawei Ascend NPU | mindspore, transformers | **Must** add NPU required Annotations to InferenceService |
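As context for step 2 of the usage instructions in PATCH 1/3: Triton expects the files under `--model-repository` (here `/mnt/models`, where KServe mounts the model storage) to follow its versioned repository layout. A minimal sketch for a hypothetical ONNX model named `mymodel` (the directory and file names below are illustrative, not part of this patch):

```
/mnt/models/
└── mymodel/
    ├── config.pbtxt        # may be omitted for ONNX when Triton's config auto-complete is enabled
    └── 1/                  # numeric version directory
        └── model.onnx      # default file name Triton expects for an ONNX model
```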
From 56eebf9de54370d6471215809ba6afb29ab28cb0 Mon Sep 17 00:00:00 2001
From: Yuan Fang
Date: Wed, 28 Jan 2026 18:07:05 +0800
Subject: [PATCH 2/3] update image tag

---
 .../inference_service/how_to/custom_inference_runtime.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx b/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
index 91aedf20..bac3566e 100644
--- a/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
+++ b/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
@@ -310,7 +310,7 @@ spec:
       value: "1"
     - name: MODEL_REPO
       value: '{{ index .Annotations "aml-model-repo" }}'
-    image: 152-231-registry.alauda.cn:60070/mlops/tritonserver:25.02-py3
+    image: alaudadockerhub/tritonserver:25.02-py3
     name: kserve-container
     resources:
       limits:

From 217ba4089de25fdf3ca331f86ad4bef2dbfaa037 Mon Sep 17 00:00:00 2001
From: Yuan Fang
Date: Wed, 28 Jan 2026 18:18:16 +0800
Subject: [PATCH 3/3] add startup probe

---
 .../inference_service/how_to/custom_inference_runtime.mdx | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx b/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
index bac3566e..e928ee59 100644
--- a/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
+++ b/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
@@ -327,6 +327,14 @@ spec:
       privileged: false
       runAsNonRoot: true
       runAsUser: 1000
+    startupProbe:
+      failureThreshold: 60
+      httpGet:
+        path: /v2/models/{{ index .Annotations "aml-model-repo" }}/ready
+        port: 8080
+        scheme: HTTP
+      periodSeconds: 10
+      timeoutSeconds: 10
   supportedModelFormats:
   - name: triton
     version: "1"
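For reference, once these patches land, an InferenceService targeting this runtime would look roughly like the sketch below. It is an illustration rather than part of the patch series: the service name, annotation value, and storage URI are hypothetical, and on this platform the service is normally created through the publishing UI (step 4 in PATCH 1/3). Note how the `aml-model-repo` annotation ties the pieces together: with the value `mymodel`, the startup probe added in PATCH 3/3 resolves to Triton's per-model readiness endpoint `/v2/models/mymodel/ready`.

```yaml
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: triton-demo                      # hypothetical service name
  annotations:
    aml-model-repo: mymodel              # consumed by the runtime's MODEL_REPO env var and startup probe path
spec:
  predictor:
    model:
      modelFormat:
        name: triton                     # must match supportedModelFormats in the runtime
      runtime: aml-triton-cuda-12        # the ClusterServingRuntime added in PATCH 1/3
      storageUri: pvc://models/mymodel   # hypothetical; KServe mounts it at /mnt/models
      resources:
        limits:
          nvidia.com/gpu: "1"            # the runtime targets NVIDIA GPUs
```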