diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml
new file mode 100644
index 0000000000..6916b80cc0
--- /dev/null
+++ b/.github/actions/docker-build/action.yml
@@ -0,0 +1,85 @@
+name: 'Docker Build'
+description: 'Build Dynamo container images'
+inputs:
+  framework:
+    description: 'Framework to build'
+    required: true
+    default: 'vllm'
+  target:
+    description: 'Target to build'
+    required: false
+    default: 'runtime'
+  image_tag:
+    description: 'Custom image tag (optional, defaults to framework:latest)'
+    required: false
+  ngc_ci_access_token:
+    description: 'NGC CI Access Token'
+    required: false
+  ci_token:
+    description: 'CI Token'
+    required: false
+  aws_default_region:
+    description: 'AWS Default Region'
+    required: false
+  sccache_s3_bucket:
+    description: 'SCCache S3 Bucket'
+    required: false
+  aws_access_key_id:
+    description: 'AWS Access Key ID'
+    required: false
+  aws_secret_access_key:
+    description: 'AWS Secret Access Key'
+    required: false
+
+outputs:
+  image_tag:
+    description: 'Image Tag'
+    value: ${{ steps.build.outputs.image_tag }}
+
+runs:
+  using: "composite"
+  steps:
+    - name: Set up Docker Buildx
+      uses: docker/setup-buildx-action@v3
+    - name: Login to NGC
+      if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
+      shell: bash
+      env:
+        # Pass the token through the environment instead of expanding it into the script text.
+        NGC_CI_ACCESS_TOKEN: ${{ inputs.ngc_ci_access_token }}
+      run: |
+        echo "$NGC_CI_ACCESS_TOKEN" | docker login nvcr.io -u '$oauthtoken' --password-stdin
+    - name: Cleanup
+      if: always()
+      shell: bash
+      run: |
+        docker system prune -af
+    - name: Build image
+      id: build
+      shell: bash
+      env:
+        GITHUB_TOKEN: ${{ inputs.ci_token }}
+        AWS_DEFAULT_REGION: ${{ inputs.aws_default_region }}
+        SCCACHE_S3_BUCKET: ${{ inputs.sccache_s3_bucket }}
+        AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }}
+        AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }}
+        # Action inputs reach the script as environment variables so their
+        # values are never parsed as shell syntax.
+        INPUT_FRAMEWORK: ${{ inputs.framework }}
+        INPUT_TARGET: ${{ inputs.target }}
+        INPUT_IMAGE_TAG: ${{ inputs.image_tag }}
+      run: |
+        # Determine image tag
+        if [ -n "$INPUT_IMAGE_TAG" ]; then
+          IMAGE_TAG="$INPUT_IMAGE_TAG"
+        else
+          IMAGE_TAG="${INPUT_FRAMEWORK}:latest"
+        fi
+        echo "image_tag=$IMAGE_TAG" >> "$GITHUB_OUTPUT"
+
+        ./container/build.sh --tag "$IMAGE_TAG" \
+          --target "$INPUT_TARGET" \
+          --framework "$INPUT_FRAMEWORK" \
+          --use-sccache \
+          --sccache-bucket "$SCCACHE_S3_BUCKET" \
+          --sccache-region "$AWS_DEFAULT_REGION"
diff --git a/.github/actions/pytest/action.yml b/.github/actions/pytest/action.yml
new file mode 100644
index 0000000000..9ffe797d30
--- /dev/null
+++ b/.github/actions/pytest/action.yml
@@ -0,0 +1,33 @@
+name: 'Pytest'
+description: 'Run pytest on pre-built container images'
+inputs:
+  pytest_marks:
+    description: 'Pytest marks'
+    required: true
+    default: 'e2e and vllm and gpu_1 and not slow'
+  image_tag:
+    description: 'Image Tag to run tests on'
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+    - name: Run tests
+      shell: bash
+      env:
+        CONTAINER_ID: test_${{ github.run_id }}_${{ github.run_attempt }}_${{ github.job }}
+        PYTEST_XML_FILE: pytest_test_report.xml
+        HF_HOME: /runner/_work/_temp
+        # Action inputs reach the script as environment variables so their
+        # values are never parsed as shell syntax.
+        IMAGE_TAG: ${{ inputs.image_tag }}
+        PYTEST_MARKS: ${{ inputs.pytest_marks }}
+      run: |
+        # The report path and mark expression are handed to the in-container
+        # shell as positional arguments ($1, $2) rather than spliced into the command.
+        docker run --runtime=nvidia --rm --gpus all -w /workspace \
+          --network host \
+          --name "${CONTAINER_ID}_pytest" \
+          "$IMAGE_TAG" \
+          bash -c 'pytest -xsv --basetemp=/tmp --junitxml="$1" -m "$2"' \
+          pytest "$PYTEST_XML_FILE" "$PYTEST_MARKS"
diff --git a/.github/filters.yaml b/.github/filters.yaml
new file mode 100644
index 0000000000..191d71f63b
--- /dev/null
+++ b/.github/filters.yaml
@@ -0,0 +1,51 @@
+# Path filters consumed by dorny/paths-filter in the CI workflows.
+docs: &docs
+  - 'docs/**'
+  - '**/*.md'
+  - '**/*.rst'
+
+ci: &ci
+  - '.github/workflows/**'
+  - '.github/filters.yaml'
+  - '.github/actions/**'
+
+has_code_changes:
+  - *ci
+  - 'benchmarks/**'
+  - 'components/**'
+  - 'container/**'
+  - 'deploy/**'
+  - 'examples/**'
+  - 'launch/**'
+  - 'lib/**'
+  - 'recipes/**'
+  - 'tests/**'
+  - '*.toml'
+  - '*.lock'
+  - '*.py'
+  - '*.rs'
+
+vllm: &vllm
+  - 'container/Dockerfile.vllm'
+  - 'container/deps/requirements.vllm.txt'
+  - 'container/deps/vllm/**'
+  - 'components/backends/vllm/**'
+  - 'tests/serve/test_vllm.py'
+
+sglang: &sglang
+  - 'container/Dockerfile.sglang'
+  - 'container/Dockerfile.sglang-wideep'
+  - 'components/backends/sglang/**'
+  - 'container/build.sh'
+  - 'tests/serve/test_sglang.py'
+
+trtllm: &trtllm
+  - 'container/Dockerfile.trtllm'
+  - 'components/backends/trtllm/**'
+  - 'container/build.sh'
+  - 'container/build_trtllm_wheel.sh'
+  - 'container/deps/**'
+  - 'tests/serve/test_trtllm.py'
+
+sdk:
+  - 'deploy/**'
diff --git a/.github/workflows/container-validation-backends.yml b/.github/workflows/container-validation-backends.yml
index f0a0a73baf..10b9d220e7 100644
--- a/.github/workflows/container-validation-backends.yml
+++ b/.github/workflows/container-validation-backends.yml
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-name: NVIDIA Dynamo Backends Github Validation
+name: Docker Build and Test
 
 on:
   push:
@@ -9,70 +9,108 @@ on:
       - main
       - "pull-request/[0-9]+"
 
+concurrency:
+  group: ${{ github.workflow }}-build-test-${{ github.ref_name || github.run_id }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
 jobs:
-  build-test:
-    runs-on: gpu-l40-amd64
-    strategy:
-      fail-fast: false
-      matrix:
-        framework: [vllm, sglang, trtllm]
-        include:
-          - framework: vllm
-            target: runtime
-            pytest_marks: "e2e and vllm and gpu_1 and not slow"
-          - framework: sglang
-            target: runtime
-            pytest_marks: "e2e and sglang and gpu_1 and not slow"
-          - framework: trtllm
-            target: runtime
-            pytest_marks: "e2e and trtllm_marker and gpu_1 and not slow"
+  changed-files:
+    runs-on: ubuntu-latest
+    outputs:
+      has_code_changes: ${{ steps.filter.outputs.has_code_changes }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
+      - name: Check for changes
+        uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2
+        id: filter
+        with:
+          filters: .github/filters.yaml
 
-    # Do not cancel main branch runs
-    concurrency:
-      group: ${{ github.workflow }}-${{ matrix.framework }}-build-test-${{ github.ref_name || github.run_id }}
-      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+  backend-status-check:
+    runs-on: ubuntu-latest
+    # changed-files is listed so that a failure while detecting changes fails
+    # this check instead of surfacing as three skipped backend jobs.
+    needs: [changed-files, vllm, sglang, trtllm]
+    if: always()
+    steps:
+      - name: "Check all dependent jobs"
+        env:
+          NEEDS_JSON: ${{ toJson(needs) }}
+        run: |
+          echo "$NEEDS_JSON" | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'
 
-    name: Build and Test - ${{ matrix.framework }}
-    env:
-      CONTAINER_ID: test_${{ github.run_id }}_${{ github.run_attempt }}_${{ github.job }}_${{ matrix.framework }}
-      PYTEST_XML_FILE: pytest_test_report.xml
-      FRAMEWORK: ${{ matrix.framework }}
-      TARGET: ${{ matrix.target }}
-      PYTEST_MARKS: ${{ matrix.pytest_marks }}
+  vllm:
+    runs-on: gpu-l40-amd64
+    needs: changed-files
+    if: needs.changed-files.outputs.has_code_changes == 'true'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
+      - name: Build Container
+        id: build-image
+        uses: ./.github/actions/docker-build
+        with:
+          framework: vllm
+          target: runtime
+          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
+          ci_token: ${{ secrets.CI_TOKEN }}
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      - name: Run tests
+        uses: ./.github/actions/pytest
+        with:
+          image_tag: ${{ steps.build-image.outputs.image_tag }}
+          pytest_marks: "e2e and vllm and gpu_1 and not slow"
 
+  sglang:
+    runs-on: gpu-l40-amd64
+    needs: changed-files
+    if: needs.changed-files.outputs.has_code_changes == 'true'
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Login to NGC
-        if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
-        run: |
-          echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
-      - name: Cleanup
-        if: always()
-        run: |
-          docker system prune -af
-      - name: Build image
-        env:
-          GITHUB_TOKEN: ${{ secrets.CI_TOKEN }}
-          AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
-          SCCACHE_S3_BUCKET: ${{ secrets.SCCACHE_S3_BUCKET }}
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-        run: |
-          ./container/build.sh --tag ${{ matrix.framework }}:latest \
-            --target ${{ matrix.target }} \
-            --framework ${{ matrix.framework }} \
-            --use-sccache \
-            --sccache-bucket "$SCCACHE_S3_BUCKET" \
-            --sccache-region "$AWS_DEFAULT_REGION"
-      - name: Run pytest
-        env:
-          HF_HOME: /runner/_work/_temp
-        run: |
-          docker run --runtime=nvidia --rm --gpus all -w /workspace \
-            --network host \
-            --name ${{ env.CONTAINER_ID }}_pytest \
-            ${{ matrix.framework }}:latest \
-            bash -c "pytest -xsv --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m \"${{ env.PYTEST_MARKS }}\""
+        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
+      - name: Build Container
+        id: build-image
+        uses: ./.github/actions/docker-build
+        with:
+          framework: sglang
+          target: runtime
+          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
+          ci_token: ${{ secrets.CI_TOKEN }}
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      - name: Run tests
+        uses: ./.github/actions/pytest
+        with:
+          image_tag: ${{ steps.build-image.outputs.image_tag }}
+          pytest_marks: "e2e and sglang and gpu_1 and not slow"
+
+  trtllm:
+    runs-on: gpu-l40-amd64
+    needs: changed-files
+    if: needs.changed-files.outputs.has_code_changes == 'true'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
+      - name: Build Container
+        id: build-image
+        uses: ./.github/actions/docker-build
+        with:
+          framework: trtllm
+          target: runtime
+          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
+          ci_token: ${{ secrets.CI_TOKEN }}
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      - name: Run tests
+        uses: ./.github/actions/pytest
+        with:
+          image_tag: ${{ steps.build-image.outputs.image_tag }}
+          pytest_marks: "e2e and trtllm_marker and gpu_1 and not slow"
diff --git a/.github/workflows/trigger_ci.yml b/.github/workflows/trigger_ci.yml
index d4b1145a00..8df9298263 100644
--- a/.github/workflows/trigger_ci.yml
+++ b/.github/workflows/trigger_ci.yml
@@ -51,28 +51,7 @@ jobs:
         id: src_changes
         uses: dorny/paths-filter@v3
         with:
-          filters: |
-            vllm:
-              - 'container/Dockerfile.vllm'
-              - 'container/deps/requirements.vllm.txt'
-              - 'container/deps/vllm/**'
-              - 'components/backends/vllm/**'
-              - 'tests/serve/test_vllm.py'
-            trtllm:
-              - 'container/Dockerfile.trtllm'
-              - 'components/backends/trtllm/**'
-              - 'container/build.sh'
-              - 'container/build_trtllm_wheel.sh'
-              - 'container/deps/**'
-              - 'tests/serve/test_trtllm.py'
-            sdk:
-              - 'deploy/**'
-            sglang:
-              - 'container/Dockerfile.sglang'
-              - 'container/Dockerfile.sglang-deepep'
-              - 'components/backends/sglang/**'
-              - 'container/build.sh'
-              - 'tests/serve/test_sglang.py'
+          filters: .github/filters.yaml
       - name: Check if Validation Workflow has run
         id: check_workflow
         uses: actions/github-script@v6