V1.0.3 (#30)
* init sagemaker 0.9.3

* v1.0.1 changes

* change actions

* update to 1.0.2

* Update build-huggingface.yml

* Update Dockerfile.gpu

* Update Dockerfile.gpu

* Update build-huggingface.yml

* update to 1.0.3

* update action

* add license & increase sleep time in tests

* Update Dockerfile.gpu

---------

Co-authored-by: amzn-choeric <105388439+amzn-choeric@users.noreply.github.com>
Co-authored-by: haixiw <haixiw@amazon.com>
3 people committed Sep 7, 2023
1 parent 40df5f9 commit 8bc04e0
Showing 3 changed files with 972 additions and 3 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/build-huggingface.yml
@@ -6,7 +6,7 @@ on:
       tgi-version:
         description: 'tgi version'
         required: true
-        default: '1.0.2'
+        default: '1.0.3'
       pytorch-version:
         description: 'pytorch version'
         required: true
@@ -145,7 +145,7 @@ jobs:
           docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
             -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
             ${REGISTRY}/${REPOSITORY}:${TAG}
-          sleep 200
+          sleep 400
           ret=$(curl http://localhost:8080/invocations -X POST \
             -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
             -H 'Content-Type: application/json')
@@ -162,7 +162,7 @@ jobs:
           docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
             -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
             ${REGISTRY}/${REPOSITORY}:${TAG}
-          sleep 300
+          sleep 400
           ret=$(curl http://localhost:8080/invocations -X POST \
             -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
             -H 'Content-Type: application/json')
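For context, the bumped sleeps give larger models more time to download and shard before the test sends its request. A local reproduction of this smoke test might look like the sketch below; the registry/tag and model ID are placeholders, and the retry loop is an alternative to the fixed sleep, not what the workflow actually does:

# Start the container the same way the workflow does (image tag and model are placeholders).
docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
  -e SM_NUM_GPUS=1 -e HF_MODEL_ID=bigscience/bloom-560m \
  my-registry/tgi-sagemaker:1.0.3

# Poll the endpoint for up to ~400s instead of sleeping a fixed amount.
for i in $(seq 1 40); do
  ret=$(curl -s --fail http://localhost:8080/invocations -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
    -H 'Content-Type: application/json') && break
  sleep 10
done
echo "$ret"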
233 changes: 233 additions & 0 deletions huggingface/pytorch/tgi/docker/1.0.3/py3/cu118/Dockerfile.gpu
@@ -0,0 +1,233 @@
# Rust builder
FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef as planner
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

ARG GIT_SHA
ARG DOCKER_LABEL

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
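# `cargo chef cook` compiles only the dependencies captured in recipe.json, so this
# expensive layer stays cached across source-only changes to the router and launcher.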
RUN cargo chef cook --release --recipe-path recipe.json

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo build --release

# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM debian:bullseye-slim as pytorch-install

ARG PYTORCH_VERSION=2.0.1
ARG PYTHON_VERSION=3.9
# Keep in sync with `server/pyproject.toml`
ARG CUDA_VERSION=11.8
ARG MAMBA_VERSION=23.1.0-4
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

ENV PATH /opt/conda/bin:$PATH

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
ccache \
curl \
git && \
rm -rf /var/lib/apt/lists/*

# Install conda
# translating Docker's TARGETPLATFORM into mamba arches
RUN case ${TARGETPLATFORM} in \
"linux/arm64") MAMBA_ARCH=aarch64 ;; \
*) MAMBA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
RUN chmod +x ~/mambaforge.sh && \
bash ~/mambaforge.sh -b -p /opt/conda && \
rm ~/mambaforge.sh

# Install pytorch
# On arm64 we exit with an error code
RUN case ${TARGETPLATFORM} in \
"linux/arm64") exit 1 ;; \
*) /opt/conda/bin/conda update -y conda && \
/opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
esac && \
/opt/conda/bin/conda clean -ya

# CUDA kernels builder image
FROM pytorch-install as kernel-builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ninja-build \
&& rm -rf /var/lib/apt/lists/*

RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0" -y cuda==11.8 && \
/opt/conda/bin/conda clean -ya

# Build Flash Attention CUDA kernels
FROM kernel-builder as flash-att-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN make build-flash-attention

# Build Flash Attention v2 CUDA kernels
FROM kernel-builder as flash-att-v2-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN make build-flash-attention-v2

# Build Transformers exllama kernels
FROM kernel-builder as exllama-kernels-builder

WORKDIR /usr/src

COPY server/exllama_kernels/ .


# Build exllama kernels
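# SM 8.0/8.6 covers A100/A10G-class GPUs; +PTX keeps forward compatibility with newer architectures.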
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Transformers CUDA kernels
FROM kernel-builder as custom-kernels-builder

WORKDIR /usr/src

COPY server/custom_kernels/ .

# Build custom CUDA kernels for transformers models
RUN python setup.py build

# Build vllm CUDA kernels
FROM kernel-builder as vllm-builder

WORKDIR /usr/src

COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN make build-vllm

# Text Generation Inference base image
FROM nvidia/cuda:11.8.0-base-ubuntu20.04 as base
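# Runtime stage: start from the slim CUDA base image; the conda env, compiled kernels,
# and Rust binaries are copied in from the builder stages to keep the final image small.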

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
CONDA_PREFIX=/opt/conda

# Text Generation Inference base env
ENV HUGGINGFACE_HUB_CACHE=/tmp \
HF_HUB_ENABLE_HF_TRANSFER=1 \
PORT=80

WORKDIR /usr/src

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
libssl-dev \
ca-certificates \
make \
unzip \
curl \
&& rm -rf /var/lib/apt/lists/*

# Copy conda with PyTorch installed
COPY --from=pytorch-install /opt/conda /opt/conda

# Copy build artifacts from flash attention builder
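# NOTE: the lib.linux-x86_64-cpython-39 paths below must match PYTHON_VERSION=3.9 set in the pytorch-install stage.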
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
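# `make gen-server` generates the Python gRPC stubs from the proto files before the server package is installed.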
RUN cd server && \
make gen-server && \
pip install -r requirements.txt && \
pip install ".[bnb, accelerate, quantize]" --no-cache-dir
RUN rm -r proto server

# Install benchmarker
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
g++ \
&& rm -rf /var/lib/apt/lists/*

# AWS SageMaker compatible image
FROM base as sagemaker

COPY --chmod=775 sagemaker-entrypoint.sh entrypoint.sh
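# sagemaker-entrypoint.sh is expected to translate SageMaker environment variables
# (e.g. SM_NUM_GPUS, HF_MODEL_ID, as exercised by the workflow test above) into
# text-generation-launcher arguments.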

RUN HOME_DIR=/root && \
pip install requests && \
curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \
unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \
cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \
chmod +x /usr/local/bin/testOSSCompliance && \
chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \
rm -rf ${HOME_DIR}/oss_compliance*

COPY llm-hosting-container/huggingface/pytorch/tgi/docker/1.0.3/py3/cu118/THIRD-PARTY-LICENSES /root/THIRD-PARTY-LICENSES

RUN /opt/conda/bin/conda clean -py

ENTRYPOINT ["./entrypoint.sh"]
CMD ["--json-output"]

LABEL dlc_major_version="1"
LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true"
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true"
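To build and try the SageMaker target by hand, something like the following sketch should work. It assumes the build context root is the upstream text-generation-inference 1.0.3 source tree (so the COPY Cargo.toml, proto, router, and launcher steps resolve) with an llm-hosting-container checkout inside it; the tag and model ID are placeholders:

# Build only the sagemaker stage from a context containing both source trees.
docker build --target sagemaker \
  -f llm-hosting-container/huggingface/pytorch/tgi/docker/1.0.3/py3/cu118/Dockerfile.gpu \
  -t tgi-sagemaker:1.0.3 .

# Serve a small model and expose the SageMaker-style endpoint on port 8080.
docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
  -e SM_NUM_GPUS=1 -e HF_MODEL_ID=bigscience/bloom-560m tgi-sagemaker:1.0.3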
