Skip to content
This repository has been archived by the owner on Dec 1, 2021. It is now read-only.

Use CUDA stubs instead of specifying LD_LIBRARY_PATH #425

Merged
merged 2 commits into from
Sep 18, 2019
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 4 additions & 3 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,11 @@ ENV HOROVOD_GPU_ALLREDUCE NCCL
ENV HOROVOD_WITH_TENSORFLOW 1

ENV CUDA_HOME /usr/local/cuda-10.0
ENV LD_LIBRARY_PATH /usr/local/cuda-10.0/compat/:/usr/local/cuda-10.0/targets/x86_64-linux/lib/:$LD_LIBRARY_PATH

# Install requirements for distributed training
RUN pip install -r /tmp/requirements/dist.requirements.txt
# Install requirements for distributed training temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs && \
pip install -r /tmp/requirements/dist.requirements.txt && \
ldconfig
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hadusam Is this second ldconfig there to reset the linker cache?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@iizukak Yes, exactly!

root@c6dc85e0cb06:/home/blueoil# ldconfig -p > ldconfig.bef
# Add CUDA stubs temporarily
root@c6dc85e0cb06:/home/blueoil# ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs
root@c6dc85e0cb06:/home/blueoil# ldconfig -p > ldconfig.cuda
root@c6dc85e0cb06:/home/blueoil# diff ldconfig.bef ldconfig.cuda
1c1
< 420 libs found in cache `/etc/ld.so.cache'
---
> 443 libs found in cache `/etc/ld.so.cache'
104a105
> 	libnvrtc.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnvrtc.so.10.0
108a110
> 	libnvjpeg.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnvjpeg.so.10.0
112a115
> 	libnvidia-ml.so.1 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnvidia-ml.so.1
116a120
> 	libnvgraph.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnvgraph.so.10.0
136a141
> 	libnpps.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnpps.so.10.0
138a144
> 	libnppitc.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnppitc.so.10.0
140a147
> 	libnppisu.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnppisu.so.10.0
142a150
> 	libnppist.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnppist.so.10.0
144a153
> 	libnppim.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnppim.so.10.0
146a156
> 	libnppig.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnppig.so.10.0
148a159
> 	libnppif.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnppif.so.10.0
150a162
> 	libnppidei.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnppidei.so.10.0
152a165
> 	libnppicom.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnppicom.so.10.0
154a168
> 	libnppicc.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnppicc.so.10.0
156a171
> 	libnppial.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnppial.so.10.0
158a174
> 	libnppc.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libnppc.so.10.0
299a316
> 	libcusparse.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libcusparse.so.10.0
301a319
> 	libcusolver.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libcusolver.so.10.0
305a324
> 	libcurand.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libcurand.so.10.0
312a332
> 	libcufftw.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libcufftw.so.10.0
314a335
> 	libcufft.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libcufft.so.10.0
320a342
> 	libcuda.so.1 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libcuda.so.1
322a345
> 	libcublas.so.10.0 (libc6,x86-64) => /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libcublas.so.10.0

# Reset ldconfig
root@c6dc85e0cb06:/home/blueoil# ldconfig
root@c6dc85e0cb06:/home/blueoil# ldconfig -p > ldconfig.aft
# There is no diff between before and after
root@c6dc85e0cb06:/home/blueoil# diff ldconfig.bef ldconfig.aft
root@c6dc85e0cb06:/home/blueoil#


# Configure OpenMPI to run with good defaults:
# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 --mca btl_vader_single_copy_mechanism none
Expand Down