From 1e96fa1a9b7b4c0c228e39131ba588edc464b2f2 Mon Sep 17 00:00:00 2001
From: Yuan Fang
Date: Wed, 28 Jan 2026 17:34:53 +0800
Subject: [PATCH 1/3] add triton runtime

---
 .../how_to/custom_inference_runtime.mdx | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx b/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
index 3451a9ce..91aedf20 100644
--- a/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
+++ b/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
@@ -282,6 +282,62 @@ spec:
 ```
 
+### Triton Inference Server
+
+The Triton Inference Server runtime is designed for NVIDIA GPUs and supports multiple model formats. As with MLServer, you need to create the `ClusterServingRuntime` resource first and then create your inference service.
+
+```yaml
+apiVersion: serving.kserve.io/v1alpha1
+kind: ClusterServingRuntime
+metadata:
+  annotations:
+    cpaas.io/display-name: triton-cuda12-x86
+  labels:
+    cpaas.io/accelerator-type: nvidia
+    cpaas.io/cuda-version: "12.1"
+    cpaas.io/runtime-class: triton
+  name: aml-triton-cuda-12
+spec:
+  containers:
+  - command:
+    - /bin/bash
+    - -c
+    - >
+      tritonserver --log-verbose=1 --http-port=8080
+      --model-repository=/mnt/models
+    env:
+    - name: OMP_NUM_THREADS
+      value: "1"
+    - name: MODEL_REPO
+      value: '{{ index .Annotations "aml-model-repo" }}'
+    image: 152-231-registry.alauda.cn:60070/mlops/tritonserver:25.02-py3
+    name: kserve-container
+    resources:
+      limits:
+        cpu: 2
+        memory: 6Gi
+      requests:
+        cpu: 2
+        memory: 6Gi
+    securityContext:
+      allowPrivilegeEscalation: false
+      capabilities:
+        drop:
+        - ALL
+      privileged: false
+      runAsNonRoot: true
+      runAsUser: 1000
+  supportedModelFormats:
+  - name: triton
+    version: "1"
+```
+
+**Usage Instructions:**
+
+1. **Create the ClusterServingRuntime**: Apply the YAML configuration above with `kubectl apply -f triton-runtime.yaml`.
+2. **Prepare Your Model**: Ensure your model is in a format supported by Triton (e.g., TensorFlow, PyTorch, ONNX).
+3. **Set Model Framework**: In the model repository, set the framework metadata to `triton` so that it matches the `supportedModelFormats` field.
+4. **Create Inference Service**: When publishing your inference service, select the Triton runtime from the runtime dropdown menu.
 
 ### MindIE (Ascend NPU 310P)
 
@@ -620,4 +676,5 @@ Before proceeding, refer to this table to understand the specific requirements f
 | :--- | :--- | :--- | :--- |
 | **Xinference** | CPU / NVIDIA GPU | transformers, pytorch | **Must** set `MODEL_FAMILY` environment variable |
 | **MLServer** | CPU / NVIDIA GPU | sklearn, xgboost, mlflow | Standard configuration |
+| **Triton** | NVIDIA GPU | triton (TensorFlow, PyTorch, ONNX, etc.) | Standard configuration |
 | **MindIE** | Huawei Ascend NPU | mindspore, transformers | **Must** add NPU required Annotations to InferenceService |
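As context for step 2 of the usage instructions in PATCH 1/3: Triton expects the files under `--model-repository` (here `/mnt/models`, where KServe mounts the model storage) to follow its versioned repository layout. A minimal sketch for a hypothetical ONNX model named `mymodel` (the directory and file names below are illustrative, not part of this patch):

```
/mnt/models/
└── mymodel/
    ├── config.pbtxt        # may be omitted for ONNX when Triton's config auto-complete is enabled
    └── 1/                  # numeric version directory
        └── model.onnx      # default file name Triton expects for an ONNX model
```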
From 56eebf9de54370d6471215809ba6afb29ab28cb0 Mon Sep 17 00:00:00 2001
From: Yuan Fang
Date: Wed, 28 Jan 2026 18:07:05 +0800
Subject: [PATCH 2/3] update image tag

---
 .../inference_service/how_to/custom_inference_runtime.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx b/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
index 91aedf20..bac3566e 100644
--- a/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
+++ b/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
@@ -310,7 +310,7 @@ spec:
       value: "1"
     - name: MODEL_REPO
       value: '{{ index .Annotations "aml-model-repo" }}'
-    image: 152-231-registry.alauda.cn:60070/mlops/tritonserver:25.02-py3
+    image: alaudadockerhub/tritonserver:25.02-py3
     name: kserve-container
     resources:
       limits:

From 217ba4089de25fdf3ca331f86ad4bef2dbfaa037 Mon Sep 17 00:00:00 2001
From: Yuan Fang
Date: Wed, 28 Jan 2026 18:18:16 +0800
Subject: [PATCH 3/3] add startup probe

---
 .../inference_service/how_to/custom_inference_runtime.mdx | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx b/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
index bac3566e..e928ee59 100644
--- a/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
+++ b/docs/en/model_inference/inference_service/how_to/custom_inference_runtime.mdx
@@ -327,6 +327,14 @@ spec:
       privileged: false
       runAsNonRoot: true
       runAsUser: 1000
+    startupProbe:
+      failureThreshold: 60
+      httpGet:
+        path: /v2/models/{{ index .Annotations "aml-model-repo" }}/ready
+        port: 8080
+        scheme: HTTP
+      periodSeconds: 10
+      timeoutSeconds: 10
   supportedModelFormats:
   - name: triton
     version: "1"
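For reference, once these patches land, an InferenceService targeting this runtime would look roughly like the sketch below. It is an illustration rather than part of the patch series: the service name, annotation value, and storage URI are hypothetical, and on this platform the service is normally created through the publishing UI (step 4 in PATCH 1/3). Note how the `aml-model-repo` annotation ties the pieces together: with the value `mymodel`, the startup probe added in PATCH 3/3 resolves to Triton's per-model readiness endpoint `/v2/models/mymodel/ready`.

```yaml
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: triton-demo                      # hypothetical service name
  annotations:
    aml-model-repo: mymodel              # consumed by the runtime's MODEL_REPO env var and startup probe path
spec:
  predictor:
    model:
      modelFormat:
        name: triton                     # must match supportedModelFormats in the runtime
      runtime: aml-triton-cuda-12        # the ClusterServingRuntime added in PATCH 1/3
      storageUri: pvc://models/mymodel   # hypothetical; KServe mounts it at /mnt/models
      resources:
        limits:
          nvidia.com/gpu: "1"            # the runtime targets NVIDIA GPUs
```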