[huggingface] Update build workflow to 0.9.3 #21

Merged: 1 commit, Aug 2, 2023
107 changes: 98 additions & 9 deletions .github/workflows/build-huggingface.yml
@@ -1,16 +1,19 @@
name: Build and push HuggingFace TGI docker image

on:
push:
branches:
- 'main'
workflow_dispatch:
inputs:
tgi-version:
description: 'tgi version'
required: true
- default: '0.8.2'
+ default: '0.9.3'
pytorch-version:
description: 'pytorch version'
required: true
- default: '2.0.0'
+ default: '2.0.1'
cuda-version:
description: 'cuda version'
required: true
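Note: with the workflow_dispatch trigger above, the build can also be started by hand. A minimal sketch using the GitHub CLI (input names come from the workflow; omitted inputs fall back to their defaults):

  gh workflow run build-huggingface.yml \
    -f tgi-version=0.9.3 \
    -f pytorch-version=2.0.1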
@@ -24,20 +27,20 @@ jobs:
create-runner:
runs-on: [ self-hosted, scheduler ]
steps:
- - name: Create new CPU instance
- id: create_cpu
+ - name: Create new G5 instance
+ id: create_gpu
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/awslabs/llm-hosting-container/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
- ./start_instance.sh action_cpu $token awslabs/llm-hosting-container
+ ./start_instance.sh action_g5 $token awslabs/llm-hosting-container
outputs:
- cpu_instance_id: ${{ steps.create_cpu.outputs.action_cpu_instance_id }}
+ gpu_instance_id: ${{ steps.create_gpu.outputs.action_g5_instance_id }}
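Note: the gpu_instance_id output above assumes start_instance.sh writes a step output under the create_gpu step id. A hedged sketch of that mechanism (this echo line is an assumption, not part of this PR):

  # inside start_instance.sh, after the G5 instance is launched
  echo "action_g5_instance_id=$instance_id" >> "$GITHUB_OUTPUT"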

build-and-push-image:
- runs-on: [ self-hosted, cpu ]
+ runs-on: [ self-hosted, g5 ]
timeout-minutes: 150
needs: create-runner
env:
@@ -87,13 +90,99 @@ jobs:
cache-from: type=gha
cache-to: type=gha,mode=max

+ run-tests:
+ runs-on: [ self-hosted, g5 ]
+ timeout-minutes: 30
+ needs: [build-and-push-image, create-runner]
+ env:
+ TGI_VERSION: ${{github.event.inputs.tgi-version}}
+ REPOSITORY: djl-serving
+ TAG: ${{github.event.inputs.pytorch-version}}-tgi${{github.event.inputs.tgi-version}}-gpu-py39-cu${{github.event.inputs.cuda-version}}-ubuntu${{github.event.inputs.ubuntu-version}}
+ steps:
+ - uses: actions/checkout@v3
+ - name: Clean env
+ run: |
+ yes | docker system prune -a --volumes
+ sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
+ echo "wait dpkg lock..."
+ while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
+ - name: Configure AWS Credentials
+ uses: aws-actions/configure-aws-credentials@v2
+ with:
+ aws-region: us-east-1
+ - name: Login to Amazon ECR
+ id: login-ecr
+ uses: aws-actions/amazon-ecr-login@v1
+ with:
+ registries: "125045733377"
+ - name: Pull docker
+ env:
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
+ run: |
+ docker pull ${REGISTRY}/${REPOSITORY}:${TAG}
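Note: the amazon-ecr-login step above is roughly equivalent to a manual ECR login; a sketch assuming the us-east-1 region configured earlier:

  aws ecr get-login-password --region us-east-1 \
    | docker login --username AWS --password-stdin 125045733377.dkr.ecr.us-east-1.amazonaws.com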
+ - name: Test bloom-560m
+ env:
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
+ run: |
+ set -ex
+ HF_MODEL_ID=bigscience/bloom-560m && \
+ SM_NUM_GPUS=4 && \
+ TGI_VERSION=$TGI_VERSION && \
+ docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
+ -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
+ ${REGISTRY}/${REPOSITORY}:${TAG}
+ sleep 30
+ ret=$(curl http://localhost:8080/invocations -X POST \
+ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
+ -H 'Content-Type: application/json')
+ [[ $ret != "[{\"generated_text\":\"What is Deep Learning?"* ]] && exit 1
+ docker rm -f $(docker ps -aq)
+ - name: Test gpt-neox-20b
+ env:
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
+ run: |
+ set -ex
+ HF_MODEL_ID=EleutherAI/gpt-neox-20b && \
+ SM_NUM_GPUS=4 && \
+ TGI_VERSION=$TGI_VERSION && \
+ docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
+ -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
+ ${REGISTRY}/${REPOSITORY}:${TAG}
+ sleep 150
+ ret=$(curl http://localhost:8080/invocations -X POST \
+ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
+ -H 'Content-Type: application/json')
+ [[ $ret != "[{\"generated_text\":\"What is Deep Learning?"* ]] && exit 1
+ docker rm -f $(docker ps -aq)
+ - name: Test flan-t5-xxl
+ env:
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
+ run: |
+ set -ex
+ HF_MODEL_ID=google/flan-t5-xxl && \
+ SM_NUM_GPUS=4 && \
+ TGI_VERSION=$TGI_VERSION && \
+ docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
+ -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
+ ${REGISTRY}/${REPOSITORY}:${TAG}
+ sleep 300
+ ret=$(curl http://localhost:8080/invocations -X POST \
+ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
+ -H 'Content-Type: application/json')
+ [[ $ret != "[{\"generated_text\""* ]] && exit 1
+ docker rm -f $(docker ps -aq)
+ - name: On fail step
+ if: ${{ failure() }}
+ run: |
+ docker rm -f $(docker ps -aq) || true
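Note: the three smoke tests above wait with fixed sleeps (30 s, 150 s, 300 s) sized to each model's load time. A hedged alternative is to retry the request until the server answers; the retry budget here is an assumption:

  # poll instead of a fixed sleep: curl fails while the port is still closed
  for i in $(seq 1 60); do
    curl -s -o /dev/null -X POST http://localhost:8080/invocations \
      -d '{"inputs":"ping","parameters":{"max_new_tokens":1}}' \
      -H 'Content-Type: application/json' && break
    sleep 5
  done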

stop-runner:
if: always()
runs-on: [ self-hosted, scheduler ]
- needs: [build-and-push-image, create-runner]
+ needs: [run-tests, build-and-push-image, create-runner]
steps:
- name: Stop all instances
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
- instance_id=${{ needs.create-runner.outputs.cpu_instance_id }}
+ instance_id=${{ needs.create-runner.outputs.gpu_instance_id }}
./stop_instance.sh $instance_id
134 changes: 0 additions & 134 deletions .github/workflows/test-huggingface.yml

This file was deleted.

10 changes: 5 additions & 5 deletions huggingface/pytorch/tgi/docker/0.9.3/py3/cu118/Dockerfile.gpu
Original file line number Diff line number Diff line change
@@ -38,10 +38,10 @@ RUN cargo build --release
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM debian:bullseye-slim as pytorch-install

- ARG PYTORCH_VERSION=2.0.0
+ ARG PYTORCH_VERSION=2.0.1
ARG PYTHON_VERSION=3.9
ARG CUDA_VERSION=11.8
- ARG MAMBA_VERSION=23.1.0-1
+ ARG MAMBA_VERSION=23.1.0-4
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
@@ -168,7 +168,7 @@ COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/co

# Install flash-attention dependencies
RUN pip install einops --no-cache-dir
- RUN rm -r transformers/examples

# Install server
COPY proto proto
COPY server server
@@ -178,6 +178,7 @@ RUN cd server && \
pip install -r requirements.txt && \
pip install ".[bnb, accelerate]" --no-cache-dir
RUN rm -r proto server

# Install benchmarker
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
@@ -204,8 +205,7 @@ RUN HOME_DIR=/root && \
chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \
rm -rf ${HOME_DIR}/oss_compliance*
- RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-0.8.2/THIRD-PARTY-LICENSES
+ RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-0.9.3/THIRD-PARTY-LICENSES

ENTRYPOINT ["./entrypoint.sh"]
CMD ["--json-output"]