From 1dbcf1c4e85c0da778c710f9a1bdf488454139db Mon Sep 17 00:00:00 2001
From: Bryant Biggs
Date: Thu, 3 Oct 2024 20:34:27 -0500
Subject: [PATCH] chore: Update GPU Dockerfile versions

Parameterize the Ubuntu and CUDA versions with build ARGs (declared
before FROM for the base image tag and redeclared after FROM so later
instructions can use them), and move the base image to the .1 patch
release of CUDA 12.5.

Trim the apt package list, and fetch the EFA installer, aws-ofi-nccl
and nccl-tests as release tarballs with curl instead of cloning with
git. Builds now happen under /tmp and only the install prefix
(/opt/aws-ofi-nccl/install) and the nccl-tests build directory
(/opt/nccl-tests/build) are kept.

Version defaults: NCCL 2.22.3, aws-ofi-nccl 1.11.0-aws,
nccl-tests 2.13.10. The EFA installer is asked for Open MPI 5 only, so
PATH, LD_LIBRARY_PATH, the plugin's --with-mpi and the nccl-tests
MPI_HOME all point at /opt/amazon/openmpi5.

---
Reviewer notes (ignored by git-am):
 * `cd tmp` -> `cd /tmp` in the aws-ofi-nccl step; the relative form
   only worked while the working directory happened to be `/`.
 * Open MPI prefix unified on /opt/amazon/openmpi5 to match
   `--mpi openmpi5`; previously only MPI_HOME used it.
 * apt lists are removed again in the NCCL layer, since this patch
   drops the trailing `RUN rm -rf /var/lib/apt/lists/*`.
 * `apt` -> `apt-get`, `curl -sL` -> `curl -fsSL`.
 * Context whitespace was reconstructed; the post-image blob id on the
   `index` line predates these changes. Regenerate with
   `git format-patch` after applying.

 e2e2/test/images/nvidia/Dockerfile | 150 +++++++++++++++--------------
 1 file changed, 76 insertions(+), 74 deletions(-)

diff --git a/e2e2/test/images/nvidia/Dockerfile b/e2e2/test/images/nvidia/Dockerfile
index 7753c9f6b..b947df5ed 100644
--- a/e2e2/test/images/nvidia/Dockerfile
+++ b/e2e2/test/images/nvidia/Dockerfile
@@ -1,14 +1,20 @@
+ARG UBUNTU_MAJOR_VERSION=22
+
+ARG CUDA_MAJOR_VERSION=12
+ARG CUDA_MINOR_VERSION=5
+
 # Start with the NVIDIA CUDA base image
-FROM nvidia/cuda:12.5.0-devel-ubuntu22.04
+FROM nvidia/cuda:${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}.1-devel-ubuntu${UBUNTU_MAJOR_VERSION}.04
 
-ARG EFA_INSTALLER_VERSION=latest
-# 1.7.4+ is required, to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
-ARG AWS_OFI_NCCL_VERSION=1.9.1
-ARG NCCL_TESTS_VERSION=master
+ARG UBUNTU_MAJOR_VERSION
+ARG CUDA_MAJOR_VERSION
+ARG CUDA_MINOR_VERSION
+
+ENV DEBIAN_FRONTEND=noninteractive
 
 # Install necessary dependencies
-RUN apt-get update -y
-RUN apt-get remove -y --allow-change-held-packages \
+RUN apt-get update -y \
+  && apt-get remove -y --allow-change-held-packages \
     libmlx5-1 \
     ibverbs-utils \
     libibverbs-dev \
@@ -17,84 +23,80 @@ RUN apt-get remove -y --allow-change-held-packages \
     libnccl-dev
 
 RUN rm -rf /opt/hpcx \
-    && rm -rf /usr/local/mpi \
-    && rm -rf /usr/local/ucx \
-    && rm -f /etc/ld.so.conf.d/hpcx.conf \
-    && ldconfig
+  && rm -rf /usr/local/mpi \
+  && rm -rf /usr/local/ucx \
+  && rm -f /etc/ld.so.conf.d/hpcx.conf \
+  && ldconfig
 
-RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
-    sudo \
-    git \
-    gcc \
-    vim \
-    kmod \
-    openssh-client \
-    openssh-server \
-    build-essential \
-    wget curl \
-    autoconf \
-    libtool \
-    gdb \
-    automake \
-    python3-distutils \
-    cmake \
-    apt-utils \
-    devscripts \
-    debhelper \
-    libsubunit-dev \
-    check \
-    pkg-config \
-    libhwloc-dev \
-    datacenter-gpu-manager \
-    cloud-utils \
-    cuda-demo-suite-12-5
+RUN apt-get install -y \
+    git \
+    gcc \
+    openssh-client \
+    openssh-server \
+    build-essential \
+    curl \
+    autoconf \
+    libtool \
+    automake \
+    cmake \
+    apt-utils \
+    libhwloc-dev \
+    cuda-demo-suite-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
 
-RUN mkdir -p /var/run/sshd
-RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
-    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
-    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
-ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
-ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH
+RUN mkdir -p /var/run/sshd \
+  && sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config \
+  && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
+  && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
+
+ENV LD_LIBRARY_PATH=/opt/amazon/openmpi5/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
+ENV PATH=/usr/local/cuda/bin:/opt/amazon/openmpi5/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH
 
 # Install EFA
-RUN cd $HOME \
-    && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
-    && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
-    && cd aws-efa-installer \
-    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
-    && rm -rf $HOME/aws-efa-installer
+ARG EFA_INSTALLER_VERSION=latest
+RUN cd /tmp \
+  && curl -fsSL https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | tar xvz \
+  && cd aws-efa-installer \
+  && ./efa_installer.sh --yes --enable-gdr --skip-kmod --skip-limit-conf --no-verify --mpi openmpi5 \
+  && rm -rf /tmp/* \
+    /var/lib/apt/lists/*
 
 # Install NCCL
-RUN apt-key del 7fa2af80 \
-    && curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
-    && dpkg -i cuda-keyring_1.0-1_all.deb \
-    && sudo apt install libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2
+ARG NCCL_VERSION=2.22.3-1+cuda${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}
+RUN apt-get update \
+  && apt-get install -y \
+    libnccl2=${NCCL_VERSION} \
+    libnccl-dev=${NCCL_VERSION} \
+  && rm -rf /var/lib/apt/lists/*
 
-## Install AWS-OFI-NCCL plugin
-RUN export OPAL_PREFIX="" \
-    && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
-    && cd /opt/aws-ofi-nccl \
-    && git checkout v${AWS_OFI_NCCL_VERSION}-aws \
-    && ./autogen.sh \
-    && ./configure --prefix=/opt/aws-ofi-nccl/install \
-    --with-libfabric=/opt/amazon/efa/ \
-    --with-cuda=/usr/local/cuda \
-    --with-mpi=/opt/amazon/openmpi/ \
-    && make && make install
-
-# Install NCCL Tests
-RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
-    && cd /opt/nccl-tests \
-    && git checkout ${NCCL_TESTS_VERSION} \
-    && make MPI=1 \
-    MPI_HOME=/opt/amazon/openmpi/ \
-    CUDA_HOME=/usr/local/cuda
+# Install AWS-OFI-NCCL plugin
+ARG AWS_OFI_NCCL_VERSION=1.11.0-aws
+RUN cd /tmp \
+  && curl -fsSL https://github.com/aws/aws-ofi-nccl/releases/download/v${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}.tar.gz | tar xvz \
+  && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} \
+  && ./configure --prefix=/opt/aws-ofi-nccl/install \
+    --with-mpi=/opt/amazon/openmpi5 \
+    --with-libfabric=/opt/amazon/efa \
+    --with-cuda=/usr/local/cuda \
+    --enable-platform-aws \
+    --disable-tests \
+  && make -j $(nproc) \
+  && make install
 
+# Install NCCL Tests
+ARG NCCL_TESTS_VERSION=2.13.10
+RUN cd /tmp \
+  && curl -fsSL https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${NCCL_TESTS_VERSION}.tar.gz | tar xvz \
+  && cd nccl-tests-${NCCL_TESTS_VERSION} \
+  && make MPI=1 \
+    MPI_HOME=/opt/amazon/openmpi5/ \
+    CUDA_HOME=/usr/local/cuda \
+  && mkdir -p /opt/nccl-tests \
+  && cp -r build /opt/nccl-tests/build \
+  && rm -rf /tmp/*
 
 # Set a default command for debugging or modify as per requirements
 ENV NCCL_PROTO simple
-RUN rm -rf /var/lib/apt/lists/*
 ENV LD_PRELOAD /usr/lib/x86_64-linux-gnu/libnccl.so:$LD_PRELOAD
 
 COPY e2e2/test/images/nvidia/gpu_unit_tests ./gpu_unit_tests
-RUN chmod +x ./gpu_unit_tests/unit_test
\ No newline at end of file
+RUN chmod +x ./gpu_unit_tests/unit_test