From ae1db51d4c9663f1dca1ee79e6822632428937de Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Mon, 14 Aug 2023 11:27:01 +0000
Subject: [PATCH 1/8] init sagemaker 0.9.3

---
 .../tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu | 215 ++++++++++++++++++
 1 file changed, 215 insertions(+)
 create mode 100644 huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu

diff --git a/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu b/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu
new file mode 100644
index 0000000..4ec2312
--- /dev/null
+++ b/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu
@@ -0,0 +1,215 @@
+FROM lukemathwalker/cargo-chef:latest-rust-1.70 AS chef
+WORKDIR /usr/src
+
+ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
+
+FROM chef as planner
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY launcher launcher
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM chef AS builder
+
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+RUN cargo chef cook --release --recipe-path recipe.json
+
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY launcher launcher
+RUN cargo build --release
+
+# Python builder
+# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
+FROM debian:bullseye-slim as pytorch-install
+
+ARG PYTORCH_VERSION=2.0.1
+ARG PYTHON_VERSION=3.9
+ARG CUDA_VERSION=11.8
+ARG MAMBA_VERSION=23.1.0-4
+ARG CUDA_CHANNEL=nvidia
+ARG INSTALL_CHANNEL=pytorch
+# Automatically set by buildx
+ARG TARGETPLATFORM
+
+ENV PATH=/opt/conda/bin:$PATH
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        ccache \
+        curl \
+        git && \
+        rm -rf /var/lib/apt/lists/*
+
+# Install conda
+# translating Docker's TARGETPLATFORM into mamba arches
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
+         *)              MAMBA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+RUN chmod +x ~/mambaforge.sh && \
+    bash ~/mambaforge.sh -b -p /opt/conda && \
+    rm ~/mambaforge.sh
+
+# Install pytorch
+# On arm64 we exit with an error code
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  exit 1 ;; \
+         *)              /opt/conda/bin/conda update -y conda &&  \
+                         /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
+    esac && \
+    /opt/conda/bin/conda clean -ya
+
+# CUDA kernels builder image
+FROM pytorch-install as kernel-builder
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        ninja-build \
+        && rm -rf /var/lib/apt/lists/*
+
+RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0" -y cuda==11.8 && \
+    /opt/conda/bin/conda clean -ya
+
+# Build Flash Attention CUDA kernels
+FROM kernel-builder as flash-att-builder
+
+WORKDIR /usr/src
+
+COPY server/Makefile-flash-att Makefile
+
+# Build specific version of flash attention
+RUN make build-flash-attention
+
+# Build Flash Attention v2 CUDA kernels
+FROM kernel-builder as flash-att-v2-builder
+
+WORKDIR /usr/src
+
+COPY server/Makefile-flash-att-v2 Makefile
+
+# Build specific version of flash attention v2
+RUN make build-flash-attention-v2
+
+# Build Transformers CUDA kernels
+FROM kernel-builder as custom-kernels-builder
+
+WORKDIR /usr/src
+
+COPY server/custom_kernels/ .
+
+# Build the custom CUDA kernels copied from server/custom_kernels
+RUN python setup.py build
+
+# Build vllm CUDA kernels
+FROM kernel-builder as vllm-builder
+
+WORKDIR /usr/src
+
+COPY server/Makefile-vllm Makefile
+
+# Build specific version of vllm
+RUN make build-vllm
+
+# Text Generation Inference base image
+FROM nvidia/cuda:11.8.0-base-ubuntu20.04 as base
+
+# Conda env
+ENV PATH=/opt/conda/bin:$PATH \
+    CONDA_PREFIX=/opt/conda
+
+# Text Generation Inference base env
+ENV HUGGINGFACE_HUB_CACHE=/tmp \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PORT=80
+
+WORKDIR /usr/src
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        libssl-dev \
+        ca-certificates \
+        make \
+        unzip \
+        curl \
+        && rm -rf /var/lib/apt/lists/*
+
+# Copy conda with PyTorch installed
+COPY --from=pytorch-install /opt/conda /opt/conda
+
+# Copy build artifacts from flash attention builder
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+
+# Copy build artifacts from flash attention v2 builder
+COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+
+# Copy build artifacts from custom kernels builder
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+
+# Copy builds artifacts from vllm builder
+COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+
+# Install flash-attention dependencies
+RUN pip install einops --no-cache-dir
+
+# Install server
+COPY proto proto
+COPY server server
+COPY server/Makefile server/Makefile
+RUN cd server && \
+    make gen-server && \
+    pip install -r requirements.txt && \
+    pip install ".[bnb, accelerate]" --no-cache-dir
+RUN rm -r proto server
+
+# Install benchmarker
+COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        g++ \
+        && rm -rf /var/lib/apt/lists/*
+
+# AWS Sagemaker compatible image
+FROM base as sagemaker
+
+COPY --chmod=775 sagemaker-entrypoint.sh entrypoint.sh
+
+RUN HOME_DIR=/root && \
+    pip install --no-cache-dir requests && \
+    curl -f -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip && \
+    unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ && \
+    cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance && \
+    chmod +x /usr/local/bin/testOSSCompliance && \
+    chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
+    ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \
+    rm -rf ${HOME_DIR}/oss_compliance*
+RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-0.9.3/THIRD-PARTY-LICENSES
+
+ENTRYPOINT ["./entrypoint.sh"]
+CMD ["--json-output"]
+
+LABEL dlc_major_version="1"
+LABEL com.amazonaws.ml.engines.sagemaker.dlc.framework.huggingface.tgi="true"
+LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port="true"

From 2604f3c248213037165249f479e8ce250fe5304b Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Mon, 14 Aug 2023 11:27:55 +0000
Subject: [PATCH 2/8] v1.0.1 changes

---
 .../tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu | 21 ++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu b/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu
index 4ec2312..5b2198f 100644
--- a/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu
+++ b/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu
@@ -1,4 +1,5 @@
-FROM lukemathwalker/cargo-chef:latest-rust-1.70 AS chef
+# Rust builder
+FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef
 WORKDIR /usr/src
 
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
@@ -40,6 +41,7 @@ FROM debian:bullseye-slim as pytorch-install
 
 ARG PYTORCH_VERSION=2.0.1
 ARG PYTHON_VERSION=3.9
+# Keep in sync with `server/pyproject.toml`
 ARG CUDA_VERSION=11.8
 ARG MAMBA_VERSION=23.1.0-4
 ARG CUDA_CHANNEL=nvidia
@@ -107,6 +109,17 @@ COPY server/Makefile-flash-att-v2 Makefile
 # Build specific version of flash attention v2
 RUN make build-flash-attention-v2
 
+# Build Transformers exllama kernels
+FROM kernel-builder as exllama-kernels-builder
+
+WORKDIR /usr/src
+
+COPY server/exllama_kernels/ .
+
+
+# Build the exllama kernels, limited to the listed CUDA architectures
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+
 # Build Transformers CUDA kernels
 FROM kernel-builder as custom-kernels-builder
 
@@ -162,6 +175,8 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86
 
 # Copy build artifacts from custom kernels builder
 COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+# Copy build artifacts from exllama kernels builder
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 
 # Copy builds artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
@@ -176,7 +191,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
     pip install -r requirements.txt && \
-    pip install ".[bnb, accelerate]" --no-cache-dir
+    pip install ".[bnb, accelerate, quantize]" --no-cache-dir
 RUN rm -r proto server
 
 # Install benchmarker
@@ -205,7 +220,7 @@ RUN HOME_DIR=/root && \
     chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
     ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \
     rm -rf ${HOME_DIR}/oss_compliance*
-RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-0.9.3/THIRD-PARTY-LICENSES
+RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-1.0.1/THIRD-PARTY-LICENSES
 
 ENTRYPOINT ["./entrypoint.sh"]
 CMD ["--json-output"]

From bac11323f4779e82d4bb53a44c5e02956f98d1f1 Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Mon, 14 Aug 2023 11:28:23 +0000
Subject: [PATCH 3/8] change actions

---
 .github/workflows/build-huggingface.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-huggingface.yml b/.github/workflows/build-huggingface.yml
index 8c4aaae..3690aa4 100644
--- a/.github/workflows/build-huggingface.yml
+++ b/.github/workflows/build-huggingface.yml
@@ -9,7 +9,7 @@ on:
       tgi-version:
         description: 'tgi version'
         required: true
-        default: '0.9.3'
+        default: '1.0.1'
       pytorch-version:
         description: 'pytorch version'
         required: true

From 80a95b4c87b5ba628762ee704b84f6a2de7ac767 Mon Sep 17 00:00:00 2001
From: philschmid <schmidphilipp1995@gmail.com>
Date: Wed, 23 Aug 2023 12:10:04 +0000
Subject: [PATCH 4/8] update to 1.0.2

---
 .../pytorch/tgi/docker/{1.0.1 => 1.0.2}/py3/cu118/Dockerfile.gpu  | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename huggingface/pytorch/tgi/docker/{1.0.1 => 1.0.2}/py3/cu118/Dockerfile.gpu (100%)

diff --git a/huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu b/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu
similarity index 100%
rename from huggingface/pytorch/tgi/docker/1.0.1/py3/cu118/Dockerfile.gpu
rename to huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu

From 9c6f1c175115a3b8acb562ed3bc5e8241c4fb3a1 Mon Sep 17 00:00:00 2001
From: amzn-choeric <105388439+amzn-choeric@users.noreply.github.com>
Date: Wed, 23 Aug 2023 10:19:33 -0400
Subject: [PATCH 5/8] Update build-huggingface.yml

---
 .github/workflows/build-huggingface.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-huggingface.yml b/.github/workflows/build-huggingface.yml
index 3690aa4..e0e8353 100644
--- a/.github/workflows/build-huggingface.yml
+++ b/.github/workflows/build-huggingface.yml
@@ -9,7 +9,7 @@ on:
       tgi-version:
         description: 'tgi version'
         required: true
-        default: '1.0.1'
+        default: '1.0.2'
       pytorch-version:
         description: 'pytorch version'
         required: true
@@ -185,4 +185,4 @@ jobs:
         run: |
           cd /home/ubuntu/djl_benchmark_script/scripts
           instance_id=${{ needs.create-runner.outputs.gpu_instance_id }}
-          ./stop_instance.sh $instance_id
\ No newline at end of file
+          ./stop_instance.sh $instance_id

From bbef65e412e3101bd459501ee003dd433b415073 Mon Sep 17 00:00:00 2001
From: amzn-choeric <105388439+amzn-choeric@users.noreply.github.com>
Date: Wed, 23 Aug 2023 10:20:00 -0400
Subject: [PATCH 6/8] Update Dockerfile.gpu

---
 huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu b/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu
index 5b2198f..a1631d4 100644
--- a/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu
+++ b/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu
@@ -220,7 +220,7 @@ RUN HOME_DIR=/root && \
     chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh && \
     ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \
     rm -rf ${HOME_DIR}/oss_compliance*
-RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-1.0.1/THIRD-PARTY-LICENSES
+RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-1.0.2/THIRD-PARTY-LICENSES
 
 ENTRYPOINT ["./entrypoint.sh"]
 CMD ["--json-output"]

From c28e9253a83b1aa5cfe6c66adfe862d4c607848d Mon Sep 17 00:00:00 2001
From: amzn-choeric <105388439+amzn-choeric@users.noreply.github.com>
Date: Mon, 28 Aug 2023 09:05:18 -0400
Subject: [PATCH 7/8] Update Dockerfile.gpu

---
 huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu b/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu
index a1631d4..687e492 100644
--- a/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu
+++ b/huggingface/pytorch/tgi/docker/1.0.2/py3/cu118/Dockerfile.gpu
@@ -221,6 +221,7 @@ RUN HOME_DIR=/root && \
     ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} python && \
     rm -rf ${HOME_DIR}/oss_compliance*
 RUN curl -o /root/THIRD-PARTY-LICENSES https://publish.djl.ai/dlc-licenses/huggingface/tgi-1.0.2/THIRD-PARTY-LICENSES
+RUN /opt/conda/bin/conda clean -py
 
 ENTRYPOINT ["./entrypoint.sh"]
 CMD ["--json-output"]

From 8470a07a20c678532e1bed6a72e223c1a19a9f3a Mon Sep 17 00:00:00 2001
From: amzn-choeric <105388439+amzn-choeric@users.noreply.github.com>
Date: Mon, 28 Aug 2023 09:05:53 -0400
Subject: [PATCH 8/8] Update build-huggingface.yml

---
 .github/workflows/build-huggingface.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-huggingface.yml b/.github/workflows/build-huggingface.yml
index e0e8353..a61d60e 100644
--- a/.github/workflows/build-huggingface.yml
+++ b/.github/workflows/build-huggingface.yml
@@ -148,7 +148,7 @@ jobs:
           docker run --gpus all --shm-size 2g -itd --rm -p 8080:8080 \
               -e SM_NUM_GPUS=$SM_NUM_GPUS -e HF_MODEL_ID=$HF_MODEL_ID \
               ${REGISTRY}/${REPOSITORY}:${TAG}
-          sleep 150
+          sleep 200
           ret=$(curl http://localhost:8080/invocations -X POST \
               -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
               -H 'Content-Type: application/json')