From ae1db51d4c9663f1dca1ee79e6822632428937de Mon Sep 17 00:00:00 2001 From: philschmid Date: Mon, 14 Aug 2023 11:27:01 +0000 Subject: [PATCH 1/8] init sagemaker 0.9.3 --- .../tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu | 215 ++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu diff --git a/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu b/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu new file mode 100644 index 0000000..4ec2312 --- /dev/null +++ b/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu @@ -0,0 +1,215 @@ +FROM lukemathwalker/cargo-chef:latest-rust-1.70 AS chef +WORKDIR /usr/src + +ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse + +FROM chef as planner +COPY Cargo.toml Cargo.toml +COPY rust-toolchain.toml rust-toolchain.toml +COPY proto proto +COPY benchmark benchmark +COPY router router +COPY launcher launcher +RUN cargo chef prepare --recipe-path recipe.json + +FROM chef AS builder + +ARG GIT_SHA +ARG DOCKER_LABEL + +RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ + curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \ + unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \ + unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \ + rm -f $PROTOC_ZIP + +COPY --from=planner /usr/src/recipe.json recipe.json +RUN cargo chef cook --release --recipe-path recipe.json + +COPY Cargo.toml Cargo.toml +COPY rust-toolchain.toml rust-toolchain.toml +COPY proto proto +COPY benchmark benchmark +COPY router router +COPY launcher launcher +RUN cargo build --release + +# Python builder +# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile +FROM debian:bullseye-slim as pytorch-install + +ARG PYTORCH_VERSION=2.0.1 +ARG PYTHON_VERSION=3.9 +ARG CUDA_VERSION=11.8 +ARG MAMBA_VERSION=23.1.0-4 +ARG CUDA_CHANNEL=nvidia +ARG INSTALL_CHANNEL=pytorch +# Automatically set by buildx +ARG TARGETPLATFORM + +ENV PATH /opt/conda/bin:$PATH + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + ccache \ + curl \ + git && \ + rm -rf /var/lib/apt/lists/* + +# Install conda +# translating Docker's TARGETPLATFORM into mamba arches +RUN case ${TARGETPLATFORM} in \ + "linux/arm64") MAMBA_ARCH=aarch64 ;; \ + *) MAMBA_ARCH=x86_64 ;; \ + esac && \ + curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" +RUN chmod +x ~/mambaforge.sh && \ + bash ~/mambaforge.sh -b -p /opt/conda && \ + rm ~/mambaforge.sh + +# Install pytorch +# On arm64 we exit with an error code +RUN case ${TARGETPLATFORM} in \ + "linux/arm64") exit 1 ;; \ + *) /opt/conda/bin/conda update -y conda && \ + /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \ + esac && \ + /opt/conda/bin/conda clean -ya + +# CUDA kernels builder image +FROM pytorch-install as kernel-builder + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ninja-build \ + && rm -rf /var/lib/apt/lists/* + +RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0" -y cuda==11.8 && \ + /opt/conda/bin/conda clean -ya + +# Build Flash Attention CUDA kernels +FROM kernel-builder as flash-att-builder + +WORKDIR /usr/src + +COPY server/Makefile-flash-att Makefile + +# Build specific version of flash attention +RUN make build-flash-attention + +# Build Flash Attention v2 CUDA kernels +FROM kernel-builder as flash-att-v2-builder + +WORKDIR /usr/src + +COPY server/Makefile-flash-att-v2 Makefile + +# Build specific version of flash attention v2 +RUN make build-flash-attention-v2 + +# Build Transformers CUDA kernels +FROM kernel-builder as custom-kernels-builder + +WORKDIR /usr/src + +COPY server/custom_kernels/ . + +# Build specific version of transformers +RUN python setup.py build + +# Build vllm CUDA kernels +FROM kernel-builder as vllm-builder + +WORKDIR /usr/src + +COPY server/Makefile-vllm Makefile + +# Build specific version of vllm +RUN make build-vllm + +# Text Generation Inference base image +FROM nvidia/cuda:11.8.0-base-ubuntu20.04 as base + +# Conda env +ENV PATH=/opt/conda/bin:$PATH \ + CONDA_PREFIX=/opt/conda + +# Text Generation Inference base env +ENV HUGGINGFACE_HUB_CACHE=/tmp \ + HF_HUB_ENABLE_HF_TRANSFER=1 \ + PORT=80 + +WORKDIR /usr/src + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + libssl-dev \ + ca-certificates \ + make \ + unzip \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy conda with PyTorch installed +COPY --from=pytorch-install /opt/conda /opt/conda + +# Copy build artifacts from flash attention builder +COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages +COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages +COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages + +# Copy build artifacts from flash attention v2 builder +COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages + +# Copy build artifacts from custom kernels builder +COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages + +# Copy builds artifacts from vllm builder +COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages + +# Install flash-attention dependencies +RUN pip install einops --no-cache-dir + +# Install server +COPY proto proto +COPY server server +COPY server/Makefile server/Makefile +RUN cd server && \ + make gen-server && \ + pip install -r requirements.txt && \ + pip install ".[bnb, accelerate]" --no-cache-dir +RUN rm -r proto server + +# Install benchmarker +COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark +# Install router +COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router +# Install launcher +COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + build-essential \ + g++ \ + && rm -rf /var/lib/apt/lists/* + +# AWS Sagemaker compatbile image +FROM base as sagemaker + +COPY --chmod=775 sagemaker-entrypoint.sh entrypoint.sh + +RUN HOME_DIR=/root && \ + pip install requests && \ + curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \ + unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \ + cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \ + chmod +x /usr/local/bin/testOSSCompliance && \ + chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \ + ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \ + rm -rf ${HOME_DIR}/oss_compliance* +RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-0.9.3/THIRD-PARTY-LICENSES + +ENTRYPOINT ["./entrypoint.sh"] +CMD ["--json-output"] + +LABEL dlc_major_version="1" +LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true" +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true" From 2604f3c248213037165249f479e8ce250fe5304b Mon Sep 17 00:00:00 2001 From: philschmid Date: Mon, 14 Aug 2023 11:27:55 +0000 Subject: [PATCH 2/8] v1.0.1 changes --- .../tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu b/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu index 4ec2312..5b2198f 100644 --- a/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu +++ b/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu @@ -1,4 +1,5 @@ -FROM lukemathwalker/cargo-chef:latest-rust-1.70 AS chef +# Rust builder +FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse @@ -40,6 +41,7 @@ FROM debian:bullseye-slim as pytorch-install ARG PYTORCH_VERSION=2.0.1 ARG PYTHON_VERSION=3.9 +# Keep in sync with `server/pyproject.toml ARG CUDA_VERSION=11.8 ARG MAMBA_VERSION=23.1.0-4 ARG CUDA_CHANNEL=nvidia @@ -107,6 +109,17 @@ COPY server/Makefile-flash-att-v2 Makefile # Build specific version of flash attention v2 RUN make build-flash-attention-v2 +# Build Transformers exllama kernels +FROM kernel-builder as exllama-kernels-builder + +WORKDIR /usr/src + +COPY server/exllama_kernels/ . + + +# Build specific version of transformers +RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build + # Build Transformers CUDA kernels FROM kernel-builder as custom-kernels-builder @@ -162,6 +175,8 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86 # Copy build artifacts from custom kernels builder COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages +# Copy build artifacts from exllama kernels builder +COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages # Copy builds artifacts from vllm builder COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages @@ -176,7 +191,7 @@ COPY server/Makefile server/Makefile RUN cd server && \ make gen-server && \ pip install -r requirements.txt && \ - pip install ".[bnb, accelerate]" --no-cache-dir + pip install ".[bnb, accelerate, quantize]" --no-cache-dir RUN rm -r proto server # Install benchmarker @@ -205,7 +220,7 @@ RUN HOME_DIR=/root && \ chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \ ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \ rm -rf ${HOME_DIR}/oss_compliance* -RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-0.9.3/THIRD-PARTY-LICENSES +RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-1.0.1/THIRD-PARTY-LICENSES ENTRYPOINT ["./entrypoint.sh"] CMD ["--json-output"] From bac11323f4779e82d4bb53a44c5e02956f98d1f1 Mon Sep 17 00:00:00 2001 From: philschmid Date: Mon, 14 Aug 2023 11:28:23 +0000 Subject: [PATCH 3/8] change actions --- .github/workflows/build-huggingface.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-huggingface.yml b/.github/workflows/build-huggingface.yml index 8c4aaae..3690aa4 100644 --- a/.github/workflows/build-huggingface.yml +++ b/.github/workflows/build-huggingface.yml @@ -9,7 +9,7 @@ on: tgi-version: description: 'tgi version' required: true - default: '0.9.3' + default: '1.0.1' pytorch-version: description: 'pytorch version' required: true From 80a95b4c87b5ba628762ee704b84f6a2de7ac767 Mon Sep 17 00:00:00 2001 From: philschmid Date: Wed, 23 Aug 2023 12:10:04 +0000 Subject: [PATCH 4/8] update to 1.0.2 --- .../pytorch/tgi/docker/{1.0.1 => 1.0.2}/py3/cu118/Dockerfile.gpu | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename huggingface/pytorch/tgi/docker/{1.0.1 => 1.0.2}/py3/cu118/Dockerfile.gpu (100%) diff --git a/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu b/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu similarity index 100% rename from huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu rename to huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu From 9c6f1c175115a3b8acb562ed3bc5e8241c4fb3a1 Mon Sep 17 00:00:00 2001 From: amzn-choeric <105388439+amzn-choeric@users.noreply.github.com> Date: Wed, 23 Aug 2023 10:19:33 -0400 Subject: [PATCH 5/8] Update build-huggingface.yml --- .github/workflows/build-huggingface.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-huggingface.yml b/.github/workflows/build-huggingface.yml index 3690aa4..e0e8353 100644 --- a/.github/workflows/build-huggingface.yml +++ b/.github/workflows/build-huggingface.yml @@ -9,7 +9,7 @@ on: tgi-version: description: 'tgi version' required: true - default: '1.0.1' + default: '1.0.2' pytorch-version: description: 'pytorch version' required: true @@ -185,4 +185,4 @@ jobs: run: | cd /home/ubuntu/djl_benchmark_script/scripts instance_id=${{ needs.create-runner.outputs.gpu_instance_id }} - ./stop_instance.sh $instance_id \ No newline at end of file + ./stop_instance.sh $instance_id From bbef65e412e3101bd459501ee003dd433b415073 Mon Sep 17 00:00:00 2001 From: amzn-choeric <105388439+amzn-choeric@users.noreply.github.com> Date: Wed, 23 Aug 2023 10:20:00 -0400 Subject: [PATCH 6/8] Update Dockerfile.gpu --- huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu b/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu index 5b2198f..a1631d4 100644 --- a/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu +++ b/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu @@ -220,7 +220,7 @@ RUN HOME_DIR=/root && \ chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \ ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \ rm -rf ${HOME_DIR}/oss_compliance* -RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-1.0.1/THIRD-PARTY-LICENSES +RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-1.0.2/THIRD-PARTY-LICENSES ENTRYPOINT ["./entrypoint.sh"] CMD ["--json-output"] From c28e9253a83b1aa5cfe6c66adfe862d4c607848d Mon Sep 17 00:00:00 2001 From: amzn-choeric <105388439+amzn-choeric@users.noreply.github.com> Date: Mon, 28 Aug 2023 09:05:18 -0400 Subject: [PATCH 7/8] Update Dockerfile.gpu --- huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu | 1 + 1 file changed, 1 insertion(+) diff --git a/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu b/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu index a1631d4..687e492 100644 --- a/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu +++ b/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu @@ -221,6 +221,7 @@ RUN HOME_DIR=/root && \ ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \ rm -rf ${HOME_DIR}/oss_compliance* RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-1.0.2/THIRD-PARTY-LICENSES +RUN /opt/conda/bin/conda clean -py ENTRYPOINT ["./entrypoint.sh"] CMD ["--json-output"] From 8470a07a20c678532e1bed6a72e223c1a19a9f3a Mon Sep 17 00:00:00 2001 From: amzn-choeric <105388439+amzn-choeric@users.noreply.github.com> Date: Mon, 28 Aug 2023 09:05:53 -0400 Subject: [PATCH 8/8] Update build-huggingface.yml --- .github/workflows/build-huggingface.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-huggingface.yml b/.github/workflows/build-huggingface.yml index e0e8353..a61d60e 100644 --- a/.github/workflows/build-huggingface.yml +++ b/.github/workflows/build-huggingface.yml @@ -148,7 +148,7 @@ jobs: docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \ -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \ ${REGISTRY}/${REPOSITORY}:${TAG} - sleep 150 + sleep 200 ret=$(curl http://localhost:8080/invocations -X POST \ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \ -H 'Content-Type: application/json')