-
Notifications
You must be signed in to change notification settings - Fork 445
/
Dockerfile.trcomp.gpu
155 lines (119 loc) · 5.83 KB
/
Dockerfile.trcomp.gpu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# Base image: an AWS Deep Learning Containers PyTorch training image.
# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
# Refer to the above page to pull the latest PyTorch image.
# The image is pulled from the ECR registry in region us-west-2. Per its tag it
# targets PyTorch 1.11.0, GPU, Python 3.8, CUDA 11.3, Ubuntu 20.04 and SageMaker.
FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.11.0-gpu-py38-cu113-ubuntu20.04-sagemaker
########
### :::Update NCCL:::
### This is a temporary workaround to
### upgrade NCCL
########
# Remove the older NCCL: delete every path under /usr/local that contains
# "nccl", plus /usr/local/obj. -v lists each removed path in the build log.
RUN rm -rvf $(find /usr/local | grep nccl) /usr/local/obj
# NCCL release to build; the git tag checked out is v<NCCL_VERSION>-1.
ENV NCCL_VERSION=2.12.12
# Install the newer NCCL: clone the tagged sources into /tmp/nccl, build the
# src.build target with one job per CPU and BUILDDIR=/usr/local, then remove
# the checkout in the same layer.
RUN git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 /tmp/nccl \
 && make -C /tmp/nccl -j $(nproc) src.build BUILDDIR=/usr/local \
 && rm -rf /tmp/nccl
# Reinstall AWS OFI NCCL: build the plugin from the v<BRANCH_OFI> tag against
# the libfabric/Open MPI under /opt/amazon, CUDA under /usr/local/cuda and the
# NCCL under /usr/local, then install it into /usr/local.
ENV BRANCH_OFI=1.4.0-aws
RUN mkdir /tmp/efa-ofi-nccl \
 && cd /tmp/efa-ofi-nccl \
 && git clone https://github.com/aws/aws-ofi-nccl.git -b v${BRANCH_OFI} \
 && cd aws-ofi-nccl \
 && ./autogen.sh \
 && ./configure \
      --with-libfabric=/opt/amazon/efa \
      --with-mpi=/opt/amazon/openmpi \
      --with-cuda=/usr/local/cuda \
      --with-nccl=/usr/local \
      --prefix=/usr/local \
 && make \
 && make install \
 && rm -rf /tmp/efa-ofi-nccl \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean
########
### :::Update NCCL:::
########
# Image metadata: maintainer and DLC major version.
LABEL maintainer="Amazon AI" \
      dlc_major_version="1"
# Version args - overwritten by args specified in buildspec
# Versions of the Hugging Face libraries installed from PyPI in a later step.
ARG TRANSFORMERS_VERSION=4.21.1
ARG DATASETS_VERSION=1.18.4
# Name of the Python interpreter invoked from RUN steps.
ARG PYTHON=python3
# Base URL of the S3 bucket from which every wheel URL below is derived.
ARG PT_BUCKET=https://sm-training-comp-pytorch-binaries.s3.us-west-2.amazonaws.com
# Prefix within the bucket that holds the torch, torch_xla, torchvision and torchaudio wheels.
ARG PT_BINARY_PATH=${PT_BUCKET}/bdd09cad-152b-42bd-993c-16e49d7af027/20220819-180620/54da28ad4c33c2f259969f74364149ecc58672e0
# Wheel URLs (cp38, linux_x86_64). "%2B" is the URL-encoded "+" of the wheel's local version suffix.
ARG PT_URL=${PT_BINARY_PATH}/torch-1.11.0%2Bcu113-cp38-cp38-linux_x86_64.whl
ARG PT_XLA_URL=${PT_BINARY_PATH}/torch_xla-1.11.0-cp38-cp38-linux_x86_64.whl
ARG TORCHVISION_URL=${PT_BINARY_PATH}/torchvision-0.12.0a0%2B9b5a3fe-cp38-cp38-linux_x86_64.whl
ARG TORCHAUDIO_URL=${PT_BINARY_PATH}/torchaudio-0.11.0%2B6297e97-cp38-cp38-linux_x86_64.whl
# Prefix within the bucket that holds the transformers wheel.
ARG HF_BINARY_PATH=${PT_BUCKET}/3464575b-920a-48e6-b964-4d4c5fe2acae/20220816-021125/7780d990d11d9552e913cae06c7315fa8545648b
# Wheel URL for transformers; the file name follows TRANSFORMERS_VERSION.
ARG HF_TRANSFORMERS_URL=${HF_BINARY_PATH}/transformers-${TRANSFORMERS_VERSION}-py3-none-any.whl
# Upgrade the `sagemaker` package to the newest available release.
RUN pip install --no-cache-dir --upgrade sagemaker
# Install Hugging Face libraries and their dependencies.
# The requirement specifiers are quoted so the shell cannot interpret the
# "[...]" extras list as a glob pattern.
RUN pip install -U --no-cache-dir \
    "transformers[sklearn,sentencepiece,audio,vision]==${TRANSFORMERS_VERSION}" \
    "datasets==${DATASETS_VERSION}"
# Upgrade selected OS packages and install git-lfs and libomp5.
# apt-get is used throughout because the `apt` front end does not offer a
# stable command-line interface for scripts. The package lists are removed in
# the same layer so they do not end up in the image.
# TODO: Remove upgrade statements once packages are updated in base image
RUN apt-get update \
 && apt-get -y install --only-upgrade systemd openssl cryptsetup \
 && apt-get install -y git-lfs libomp5 \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*
# Install PyTorch
# Replace any installed torch with the wheel at PT_URL. --no-deps leaves the
# other installed packages alone. pip downloads the wheel itself with
# --no-cache-dir, so no local file is left behind to delete afterwards.
RUN pip uninstall -y torch \
 && pip install --no-deps --no-cache-dir -U --force-reinstall "${PT_URL}"
# Install PyTorch XLA
# Replace any installed torch_xla with the wheel at PT_XLA_URL. pip downloads
# the wheel itself with --no-cache-dir, so there is no local file to delete.
RUN pip uninstall -y torch_xla \
 && pip install --no-deps --no-cache-dir -U --force-reinstall "${PT_XLA_URL}"
# Install TorchVision
# Replace any installed torchvision with the wheel at TORCHVISION_URL. pip
# downloads the wheel itself with --no-cache-dir, so there is no local file to delete.
RUN pip uninstall -y torchvision \
 && pip install --no-deps --no-cache-dir -U --force-reinstall "${TORCHVISION_URL}"
# Install TorchAudio
# Replace any installed torchaudio with the wheel at TORCHAUDIO_URL. pip
# downloads the wheel itself with --no-cache-dir, so there is no local file to delete.
RUN pip uninstall -y torchaudio \
 && pip install --no-deps --no-cache-dir -U --force-reinstall "${TORCHAUDIO_URL}"
# Install Transformers
# Replace the installed transformers with the wheel at HF_TRANSFORMERS_URL,
# then delete the "examples" directory that sits next to the package's
# __init__.py. pip downloads the wheel itself with --no-cache-dir, so there is
# no local wheel file to delete.
RUN pip uninstall -y transformers \
 && pip install --no-deps --no-cache-dir --force-reinstall -U "${HF_TRANSFORMERS_URL}" \
 && rm -rf $(${PYTHON} -c "import transformers; print(transformers.__file__.replace('__init__.py', 'examples'))")
# Install SageMaker Training Toolkit
# Remove any installed copy, then force-install a release >= 4.2.6 without
# touching its dependencies.
RUN pip uninstall --yes sagemaker-training \
 && pip install --no-deps --no-cache-dir --upgrade --force-reinstall "sagemaker-training>=4.2.6"
# Install SageMaker PyTorch Training Toolkit
# Remove any installed copy, then force-install a 2.x release >= 2.6.1 without
# touching its dependencies.
RUN pip uninstall --yes sagemaker-pytorch-training \
 && pip install --no-deps --no-cache-dir --upgrade --force-reinstall "sagemaker-pytorch-training>=2.6.1,<3"
# Root directory of the conda installation; RUN steps in this file reference
# ${CONDA_PREFIX}/bin/conda and libraries under ${CONDA_PREFIX}/lib.
ARG CONDA_PREFIX=/opt/conda
# remove micromamba folder at default installation path
RUN rm -rf /root/micromamba/
# Install common training dependencies
# librosa is taken from the conda-forge channel. The conda package cache is
# purged in the same layer so downloaded archives do not stay in the image.
RUN ${CONDA_PREFIX}/bin/conda install -y -c conda-forge librosa \
 && ${CONDA_PREFIX}/bin/conda clean -ya
# Additional Python packages for training jobs (listed alphabetically).
RUN pip install --no-cache-dir \
    fugashi \
    ipadic \
    nltk \
    rouge-score \
    sacrebleu \
    sentence-transformers \
    soundfile
# Numpy version gets downgraded and becomes incompatible with PyTorch wheel in the above conda command.
# Re-pin it here; --no-cache-dir keeps pip's download cache out of this layer.
RUN pip install --no-cache-dir -U numpy==1.22.2
# Fixing identified CVEs
# Constrain the affected packages; --no-cache-dir keeps pip's download cache
# out of this layer.
RUN pip install --no-cache-dir -U \
    "protobuf<3.20.0" \
    "importlib-metadata<5.0,>=1.4.0" \
    "wheel>=0.38.0"
# Fix library links
# For each listed MKL library, create a "<name>.so.1" symlink pointing at the
# unversioned "<name>.so" in ${CONDA_PREFIX}/lib. The build stops at the first
# link that cannot be created.
RUN for mkl_lib in intel_lp64 intel_thread core gnu_thread; do \
        ln -s ${CONDA_PREFIX}/lib/libmkl_${mkl_lib}.so ${CONDA_PREFIX}/lib/libmkl_${mkl_lib}.so.1 || exit 1; \
    done
# Install Horovod
# The CUDA stub libraries are registered with ldconfig for the duration of the
# build; the final plain `ldconfig` refreshes the linker cache without that
# extra directory. The HOROVOD_* variables are passed only to the
# `pip install` command.
ENV HOROVOD_VERSION=0.24.3
RUN pip uninstall -y horovod \
 && ldconfig /usr/local/cuda-11.3/targets/x86_64-linux/lib/stubs \
 && HOROVOD_GPU_ALLREDUCE=NCCL \
    HOROVOD_CUDA_HOME=/usr/local/cuda-11.3 \
    HOROVOD_WITH_PYTORCH=1 \
    pip install --no-cache-dir horovod==${HOROVOD_VERSION} \
 && ldconfig
# Removing the cache as it is needed for security verification
# `|| true` keeps this step best-effort: a failed removal does not fail the build.
RUN rm -rf /root/.cache || true
# OSS compliance: download the oss_compliance bundle, install its
# testOSSCompliance file into /usr/local/bin, run the bundled
# generate_oss_compliance.sh with HOME_DIR and the Python interpreter name,
# then delete the bundle in the same layer.
# --fail makes curl exit non-zero on an HTTP error instead of saving the error
# page as the zip file.
RUN HOME_DIR=/root \
 && curl --fail -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance*