V1.0.3 (#30)
* init sagemaker 0.9.3

* v1.0.1 changes

* change actions

* update to 1.0.2

* Update build-huggingface.yml

* Update Dockerfile.gpu

* Update Dockerfile.gpu

* Update build-huggingface.yml

* update to 1.0.3

* update action

* add license & increase sleep time in tests

* Update Dockerfile.gpu

---------

Co-authored-by: amzn-choeric <105388439+amzn-choeric@users.noreply.github.com>
Co-authored-by: haixiw <haixiw@amazon.com>
3 people committed Sep 7, 2023
1 parent 40df5f9 commit 8bc04e0
Showing 3 changed files with 972 additions and 3 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/build-huggingface.yml
@@ -6,7 +6,7 @@ on:
       tgi-version:
         description: 'tgi version'
         required: true
-        default: '1.0.2'
+        default: '1.0.3'
       pytorch-version:
         description: 'pytorch version'
         required: true
@@ -145,7 +145,7 @@ jobs:
           docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
             -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
             ${REGISTRY}/${REPOSITORY}:${TAG}
-          sleep 200
+          sleep 400
           ret=$(curl http://localhost:8080/invocations -X POST \
             -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
             -H 'Content-Type: application/json')
@@ -162,7 +162,7 @@ jobs:
           docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
             -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
             ${REGISTRY}/${REPOSITORY}:${TAG}
-          sleep 300
+          sleep 400
           ret=$(curl http://localhost:8080/invocations -X POST \
             -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
             -H 'Content-Type: application/json')
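For context, the bumped sleeps give larger models more time to download and shard before the test sends its request. A local reproduction of this smoke test might look like the sketch below; the registry/tag and model ID are placeholders, and the retry loop is an alternative to the fixed sleep, not what the workflow actually does:

# Start the container the same way the workflow does (image tag and model are placeholders).
docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
  -e SM_NUM_GPUS=1 -e HF_MODEL_ID=bigscience/bloom-560m \
  my-registry/tgi-sagemaker:1.0.3

# Poll the endpoint for up to ~400s instead of sleeping a fixed amount.
for i in $(seq 1 40); do
  ret=$(curl -s --fail http://localhost:8080/invocations -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
    -H 'Content-Type: application/json') && break
  sleep 10
done
echo "$ret"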
233 changes: 233 additions & 0 deletions huggingface/pytorch/tgi/docker/1.0.3/py3/cu118/Dockerfile.gpu
@@ -0,0 +1,233 @@
# Rust builder
FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef as planner
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

ARG GIT_SHA
ARG DOCKER_LABEL

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
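# `cargo chef cook` compiles only the dependencies captured in recipe.json, so this
# expensive layer stays cached across source-only changes to the router and launcher.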
RUN cargo chef cook --release --recipe-path recipe.json

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo build --release

# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM debian:bullseye-slim as pytorch-install

ARG PYTORCH_VERSION=2.0.1
ARG PYTHON_VERSION=3.9
# Keep in sync with `server/pyproject.toml`
ARG CUDA_VERSION=11.8
ARG MAMBA_VERSION=23.1.0-4
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

ENV PATH /opt/conda/bin:$PATH

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
ccache \
curl \
git && \
rm -rf /var/lib/apt/lists/*

# Install conda
# translating Docker's TARGETPLATFORM into mamba arches
RUN case ${TARGETPLATFORM} in \
"linux/arm64") MAMBA_ARCH=aarch64 ;; \
*) MAMBA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
RUN chmod +x ~/mambaforge.sh && \
bash ~/mambaforge.sh -b -p /opt/conda && \
rm ~/mambaforge.sh

# Install pytorch
# On arm64 we exit with an error code
RUN case ${TARGETPLATFORM} in \
"linux/arm64") exit 1 ;; \
*) /opt/conda/bin/conda update -y conda && \
/opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
esac && \
/opt/conda/bin/conda clean -ya

# CUDA kernels builder image
FROM pytorch-install as kernel-builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ninja-build \
&& rm -rf /var/lib/apt/lists/*

RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0" -y cuda==11.8 && \
/opt/conda/bin/conda clean -ya

# Build Flash Attention CUDA kernels
FROM kernel-builder as flash-att-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN make build-flash-attention

# Build Flash Attention v2 CUDA kernels
FROM kernel-builder as flash-att-v2-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN make build-flash-attention-v2

# Build Transformers exllama kernels
FROM kernel-builder as exllama-kernels-builder

WORKDIR /usr/src

COPY server/exllama_kernels/ .


# Build exllama kernels
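# SM 8.0/8.6 covers A100/A10G-class GPUs; +PTX keeps forward compatibility with newer architectures.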
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Transformers CUDA kernels
FROM kernel-builder as custom-kernels-builder

WORKDIR /usr/src

COPY server/custom_kernels/ .

# Build custom CUDA kernels for transformers models
RUN python setup.py build

# Build vllm CUDA kernels
FROM kernel-builder as vllm-builder

WORKDIR /usr/src

COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN make build-vllm

# Text Generation Inference base image
FROM nvidia/cuda:11.8.0-base-ubuntu20.04 as base
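# Runtime stage: start from the slim CUDA base image; the conda env, compiled kernels,
# and Rust binaries are copied in from the builder stages to keep the final image small.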

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
CONDA_PREFIX=/opt/conda

# Text Generation Inference base env
ENV HUGGINGFACE_HUB_CACHE=/tmp \
HF_HUB_ENABLE_HF_TRANSFER=1 \
PORT=80

WORKDIR /usr/src

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
libssl-dev \
ca-certificates \
make \
unzip \
curl \
&& rm -rf /var/lib/apt/lists/*

# Copy conda with PyTorch installed
COPY --from=pytorch-install /opt/conda /opt/conda

# Copy build artifacts from flash attention builder
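# NOTE: the lib.linux-x86_64-cpython-39 paths below must match PYTHON_VERSION=3.9 set in the pytorch-install stage.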
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
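# `make gen-server` generates the Python gRPC stubs from the proto files before the server package is installed.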
RUN cd server && \
make gen-server && \
pip install -r requirements.txt && \
pip install ".[bnb, accelerate, quantize]" --no-cache-dir
RUN rm -r proto server

# Install benchmarker
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
g++ \
&& rm -rf /var/lib/apt/lists/*

# AWS SageMaker compatible image
FROM base as sagemaker

COPY --chmod=775 sagemaker-entrypoint.sh entrypoint.sh
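# sagemaker-entrypoint.sh is expected to translate SageMaker environment variables
# (e.g. SM_NUM_GPUS, HF_MODEL_ID, as exercised by the workflow test above) into
# text-generation-launcher arguments.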

RUN HOME_DIR=/root && \
pip install requests && \
curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \
unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \
cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \
chmod +x /usr/local/bin/testOSSCompliance && \
chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \
rm -rf ${HOME_DIR}/oss_compliance*

COPY llm-hosting-container/huggingface/pytorch/tgi/docker/1.0.3/py3/cu118/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES

RUN /opt/conda/bin/conda clean -py

ENTRYPOINT ["./entrypoint.sh"]
CMD ["--json-output"]

LABEL dlc_major_version="1"
LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true"
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true"
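To build and try the SageMaker target by hand, something like the following sketch should work. It assumes the build context root is the upstream text-generation-inference 1.0.3 source tree (so the COPY Cargo.toml, proto, router, and launcher steps resolve) with an llm-hosting-container checkout inside it; the tag and model ID are placeholders:

# Build only the sagemaker stage from a context containing both source trees.
docker build --target sagemaker \
  -f llm-hosting-container/huggingface/pytorch/tgi/docker/1.0.3/py3/cu118/Dockerfile.gpu \
  -t tgi-sagemaker:1.0.3 .

# Serve a small model and expose the SageMaker-style endpoint on port 8080.
docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
  -e SM_NUM_GPUS=1 -e HF_MODEL_ID=bigscience/bloom-560m tgi-sagemaker:1.0.3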
