[huggingface] Update build workflow to 0.9.3 #21

Merged: 1 commit, Aug 2, 2023
107 changes: 98 additions & 9 deletions .github/workflows/build-huggingface.yml
@@ -1,16 +1,19 @@
name: Build and push HuggingFace TGI docker image

on:
push:
branches:
- 'main'
workflow_dispatch:
inputs:
tgi-version:
description: 'tgi version'
required: true
- default: '0.8.2'
+ default: '0.9.3'
pytorch-version:
description: 'pytorch version'
required: true
- default: '2.0.0'
+ default: '2.0.1'
cuda-version:
description: 'cuda version'
required: true
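Note: with the workflow_dispatch trigger above, the build can also be started by hand. A minimal sketch using the GitHub CLI (input names come from the workflow; omitted inputs fall back to their defaults):

  gh workflow run build-huggingface.yml \
    -f tgi-version=0.9.3 \
    -f pytorch-version=2.0.1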
@@ -24,20 +27,20 @@ jobs:
create-runner:
runs-on: [ self-hosted, scheduler ]
steps:
- - name: Create new CPU instance
- id: create_cpu
+ - name: Create new G5 instance
+ id: create_gpu
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/awslabs/llm-hosting-container/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
- ./start_instance.sh action_cpu $token awslabs/llm-hosting-container
+ ./start_instance.sh action_g5 $token awslabs/llm-hosting-container
outputs:
- cpu_instance_id: ${{ steps.create_cpu.outputs.action_cpu_instance_id }}
+ gpu_instance_id: ${{ steps.create_gpu.outputs.action_g5_instance_id }}
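Note: the gpu_instance_id output above assumes start_instance.sh writes a step output under the create_gpu step id. A hedged sketch of that mechanism (this echo line is an assumption, not part of this PR):

  # inside start_instance.sh, after the G5 instance is launched
  echo "action_g5_instance_id=$instance_id" >> "$GITHUB_OUTPUT"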

build-and-push-image:
- runs-on: [ self-hosted, cpu ]
+ runs-on: [ self-hosted, g5 ]
timeout-minutes: 150
needs: create-runner
env:
@@ -87,13 +90,99 @@ jobs:
cache-from: type=gha
cache-to: type=gha,mode=max

+ run-tests:
+ runs-on: [ self-hosted, g5 ]
+ timeout-minutes: 30
+ needs: [build-and-push-image, create-runner]
+ env:
+ TGI_VERSION: ${{github.event.inputs.tgi-version}}
+ REPOSITORY: djl-serving
+ TAG: ${{github.event.inputs.pytorch-version}}-tgi${{github.event.inputs.tgi-version}}-gpu-py39-cu${{github.event.inputs.cuda-version}}-ubuntu${{github.event.inputs.ubuntu-version}}
+ steps:
+ - uses: actions/checkout@v3
+ - name: Clean env
+ run: |
+ yes | docker system prune -a --volumes
+ sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
+ echo "wait dpkg lock..."
+ while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
+ - name: Configure AWS Credentials
+ uses: aws-actions/configure-aws-credentials@v2
+ with:
+ aws-region: us-east-1
+ - name: Login to Amazon ECR
+ id: login-ecr
+ uses: aws-actions/amazon-ecr-login@v1
+ with:
+ registries: "125045733377"
+ - name: Pull docker
+ env:
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
+ run: |
+ docker pull ${REGISTRY}/${REPOSITORY}:${TAG}
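Note: the amazon-ecr-login step above is roughly equivalent to a manual ECR login; a sketch assuming the us-east-1 region configured earlier:

  aws ecr get-login-password --region us-east-1 \
    | docker login --username AWS --password-stdin 125045733377.dkr.ecr.us-east-1.amazonaws.com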
+ - name: Test bloom-560m
+ env:
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
+ run: |
+ set -ex
+ HF_MODEL_ID=bigscience/bloom-560m && \
+ SM_NUM_GPUS=4 && \
+ TGI_VERSION=$TGI_VERSION && \
+ docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
+ -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
+ ${REGISTRY}/${REPOSITORY}:${TAG}
+ sleep 30
+ ret=$(curl http://localhost:8080/invocations -X POST \
+ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
+ -H 'Content-Type: application/json')
+ [[ $ret != "[{\"generated_text\":\"What is Deep Learning?"* ]] && exit 1
+ docker rm -f $(docker ps -aq)
+ - name: Test gpt-neox-20b
+ env:
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
+ run: |
+ set -ex
+ HF_MODEL_ID=EleutherAI/gpt-neox-20b && \
+ SM_NUM_GPUS=4 && \
+ TGI_VERSION=$TGI_VERSION && \
+ docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
+ -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
+ ${REGISTRY}/${REPOSITORY}:${TAG}
+ sleep 150
+ ret=$(curl http://localhost:8080/invocations -X POST \
+ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
+ -H 'Content-Type: application/json')
+ [[ $ret != "[{\"generated_text\":\"What is Deep Learning?"* ]] && exit 1
+ docker rm -f $(docker ps -aq)
+ - name: Test flan-t5-xxl
+ env:
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
+ run: |
+ set -ex
+ HF_MODEL_ID=google/flan-t5-xxl && \
+ SM_NUM_GPUS=4 && \
+ TGI_VERSION=$TGI_VERSION && \
+ docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
+ -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
+ ${REGISTRY}/${REPOSITORY}:${TAG}
+ sleep 300
+ ret=$(curl http://localhost:8080/invocations -X POST \
+ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
+ -H 'Content-Type: application/json')
+ [[ $ret != "[{\"generated_text\""* ]] && exit 1
+ docker rm -f $(docker ps -aq)
+ - name: On fail step
+ if: ${{ failure() }}
+ run: |
+ docker rm -f $(docker ps -aq) || true
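Note: the three smoke tests above wait with fixed sleeps (30 s, 150 s, 300 s) sized to each model's load time. A hedged alternative is to retry the request until the server answers; the retry budget here is an assumption:

  # poll instead of a fixed sleep: curl fails while the port is still closed
  for i in $(seq 1 60); do
    curl -s -o /dev/null -X POST http://localhost:8080/invocations \
      -d '{"inputs":"ping","parameters":{"max_new_tokens":1}}' \
      -H 'Content-Type: application/json' && break
    sleep 5
  done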

stop-runner:
if: always()
runs-on: [ self-hosted, scheduler ]
- needs: [build-and-push-image, create-runner]
+ needs: [run-tests, build-and-push-image, create-runner]
steps:
- name: Stop all instances
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
- instance_id=${{ needs.create-runner.outputs.cpu_instance_id }}
+ instance_id=${{ needs.create-runner.outputs.gpu_instance_id }}
./stop_instance.sh $instance_id
134 changes: 0 additions & 134 deletions .github/workflows/test-huggingface.yml

This file was deleted.

10 changes: 5 additions & 5 deletions huggingface/pytorch/tgi/docker/0.9.3/py3/cu118/Dockerfile.gpu
Original file line number Diff line number Diff line change
@@ -38,10 +38,10 @@ RUN cargo build --release
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM debian:bullseye-slim as pytorch-install

- ARG PYTORCH_VERSION=2.0.0
+ ARG PYTORCH_VERSION=2.0.1
ARG PYTHON_VERSION=3.9
ARG CUDA_VERSION=11.8
- ARG MAMBA_VERSION=23.1.0-1
+ ARG MAMBA_VERSION=23.1.0-4
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
@@ -168,7 +168,7 @@ COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/co

# Install flash-attention dependencies
RUN pip install einops --no-cache-dir
- RUN rm -r transformers/examples

# Install server
COPY proto proto
COPY server server
@@ -178,6 +178,7 @@ RUN cd server && \
pip install -r requirements.txt && \
pip install ".[bnb, accelerate]" --no-cache-dir
RUN rm -r proto server

# Install benchmarker
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
@@ -204,8 +205,7 @@ RUN HOME_DIR=/root && \
chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \
rm -rf ${HOME_DIR}/oss_compliance*
- RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-0.8.2/THIRD-PARTY-LICENSES
+ RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-0.9.3/THIRD-PARTY-LICENSES

ENTRYPOINT ["./entrypoint.sh"]
CMD ["--json-output"]