Skip to content

Commit

Permalink
move forward kernels in elastic.c into separate functions so that they are vectorized even when using OpenMP
Browse files Browse the repository at this point in the history
  • Loading branch information
ar4 committed Jul 21, 2023
1 parent 3e6de3c commit 22d838a
Show file tree
Hide file tree
Showing 9 changed files with 332 additions and 257 deletions.
185 changes: 100 additions & 85 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
@@ -1,54 +1,65 @@
name: Build and test
on: push
jobs:
Linux-build:
runs-on: ubuntu-latest
container: quay.io/pypa/manylinux2014_x86_64
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Install NVCC
run: |
yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
yum install -y cuda-nvcc-11-1-11.1.105-1 cuda-cudart-devel-11-1-11.1.74-1
- name: Compile
run: |
PATH=$PATH:/usr/local/cuda-11.1/bin
CUDA_HOME=/usr/local/cuda-11.1
CUDA_ROOT=/usr/local/cuda-11.1
CUDA_PATH=/usr/local/cuda-11.1
CUDADIR=/usr/local/cuda-11.1
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.1/lib64
cd src/deepwave
cp /lib64/libgomp.so.1 .
./build_linux.sh
- name: Archive built libraries
uses: actions/upload-artifact@v3
with:
name: linux_libraries
path: src/deepwave/*.so*
# Linux-build:
# runs-on: ubuntu-latest
# container: quay.io/pypa/manylinux2014_x86_64
# steps:
# - name: Checkout
# uses: actions/checkout@v3
# - name: Install NVCC
# run: |
# yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
# yum install -y cuda-nvcc-11-1-11.1.105-1 cuda-cudart-devel-11-1-11.1.74-1
# - name: Compile
# run: |
# PATH=$PATH:/usr/local/cuda-11.1/bin
# CUDA_HOME=/usr/local/cuda-11.1
# CUDA_ROOT=/usr/local/cuda-11.1
# CUDA_PATH=/usr/local/cuda-11.1
# CUDADIR=/usr/local/cuda-11.1
# LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.1/lib64
# cd src/deepwave
# cp /lib64/libgomp.so.1 .
# ./build_linux.sh
# - name: Archive built libraries
# uses: actions/upload-artifact@v3
# with:
# name: linux_libraries
# path: src/deepwave/*.so*
MacOS-build:
runs-on: macos-11
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v3
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install torch
# - name: Install dependencies
# run: |
# #ls -R
# #python -m pip install torch
- name: Compile
run: |
cd src/deepwave
cp `python -c "import torch; print(torch.__path__[0])"`/lib/libiomp5.dylib .
#cp `python -c "import torch; print(torch.__path__[0])"`/lib/libiomp5.dylib .
nuget install intelopenmp.devel.osx -DirectDownload -NonInteractive
cp intelopenmp.devel.osx*/lib/native/osx-x64/libiomp5.dylib .
brew install libomp
./build_macos.sh
- name: Archive built libraries
uses: actions/upload-artifact@v3
with:
name: macos_libraries
path: src/deepwave/*.dylib
cd ../../
python -m pip install --upgrade pip
python -m pip install pytest scipy
python -m pip install .
- name: Test with pytest
run: |
#pytest -s
cd tests
PYTHONVERBOSE=3 python -c "import test_elastic; test_elastic.test_wavefield_decays()"
# - name: Archive built libraries
# uses: actions/upload-artifact@v3
# with:
# name: macos_libraries
# path: src/deepwave/*.dylib
Windows-build:
runs-on: windows-2019
defaults:
Expand All @@ -57,58 +68,62 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v3
- name: Install NVCC
run: |
curl https://developer.download.nvidia.com/compute/cuda/11.1.1/network_installers/cuda_11.1.1_win10_network.exe -o cuda_11.1.1_win10_network.exe
chmod +x ./cuda_11.1.1_win10_network.exe
./cuda_11.1.1_win10_network.exe -s nvcc_11.1 cudart_11.1
echo "CUDA_PATH=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1" >> $GITHUB_ENV
echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1\\bin" >> $GITHUB_PATH
# - name: Set up Python
# uses: actions/setup-python@v3
# - name: Install NVCC
# run: |
# curl https://developer.download.nvidia.com/compute/cuda/11.1.1/network_installers/cuda_11.1.1_win10_network.exe -o cuda_11.1.1_win10_network.exe
# chmod +x ./cuda_11.1.1_win10_network.exe
# ./cuda_11.1.1_win10_network.exe -s nvcc_11.1 cudart_11.1
# echo "CUDA_PATH=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1" >> $GITHUB_ENV
# echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1\\bin" >> $GITHUB_PATH
- name: Setup MSVC
uses: ilammy/msvc-dev-cmd@v1
- name: Compile
run: |
cd src/deepwave
nuget install intelopenmp.devel.win -DirectDownload -NonInteractive
nuget install intelopenmp.redist.win -DirectDownload -NonInteractive
cp intelopenmp.devel.win*/lib/native/win-x64/libiomp5md.lib .
cp intelopenmp.redist.win*/runtimes/win-x86/native/libiomp5md.dll .
./build_windows.sh
- name: Archive built libraries
uses: actions/upload-artifact@v3
with:
name: windows_libraries
path: src/deepwave/*.dll
Test:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
fail-fast: false
runs-on: ${{ matrix.os }}
needs: [Linux-build, MacOS-build, Windows-build]
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Download built Linux libraries
uses: actions/download-artifact@v3
with:
name: linux_libraries
path: src/deepwave/
- name: Download built MacOS libraries
uses: actions/download-artifact@v3
with:
name: macos_libraries
path: src/deepwave/
- name: Download built Windows libraries
uses: actions/download-artifact@v3
with:
name: windows_libraries
path: src/deepwave/
- name: Set up Python
uses: actions/setup-python@v3
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install pytest scipy
python -m pip install .
- name: Test with pytest
run: |
pytest
# - name: Archive built libraries
# uses: actions/upload-artifact@v3
# with:
# name: windows_libraries
# path: src/deepwave/*.dll
# Test:
# strategy:
# matrix:
# os: [ubuntu-latest, macos-latest, windows-latest]
# fail-fast: false
# runs-on: ${{ matrix.os }}
# needs: [Linux-build, MacOS-build, Windows-build]
# steps:
# - name: Checkout
# uses: actions/checkout@v3
# - name: Download built Linux libraries
# uses: actions/download-artifact@v3
# with:
# name: linux_libraries
# path: src/deepwave/
# - name: Download built MacOS libraries
# uses: actions/download-artifact@v3
# with:
# name: macos_libraries
# path: src/deepwave/
# - name: Download built Windows libraries
# uses: actions/download-artifact@v3
# with:
# name: windows_libraries
# path: src/deepwave/
# - name: Set up Python
# uses: actions/setup-python@v3
# - name: Install dependencies
# run: |
# python -m pip install --upgrade pip
# python -m pip install pytest scipy
# python -m pip install .
# - name: Test with pytest
# run: |
# PYTHONVERBOSE=3 pytest
3 changes: 3 additions & 0 deletions src/deepwave/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,11 @@
try:
dll_cpu.omp_get_num_threads
use_openmp = True
import torch
print('USING OPENMP', torch.get_num_threads())
except AttributeError:
use_openmp = False
print('NOT USING OPENMP')
dll_cpu.scalar_iso_2_float_forward.restype = None
dll_cpu.scalar_iso_4_float_forward.restype = None
dll_cpu.scalar_iso_6_float_forward.restype = None
Expand Down
2 changes: 1 addition & 1 deletion src/deepwave/build_linux.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
set -e

DW_OMP_NAME=libgomp.so.1
CFLAGS="-Wall -Wextra -pedantic -DDW_USE_OPENMP -fPIC -fopenmp -Ofast -mavx2"
CFLAGS="-Wall -Wextra -pedantic -fPIC -fopenmp -Ofast -mavx2"
CUDAFLAGS="--restrict --use_fast_math -O3 -gencode=arch=compute_52,code=sm_52, -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 --compiler-options -fPIC"
gcc $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o
gcc $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o
Expand Down
48 changes: 24 additions & 24 deletions src/deepwave/build_macos.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
set -e

DW_OMP_NAME=iomp5
CFLAGS="-Wall -Wextra -pedantic -DDW_USE_OPENMP -fPIC -Ofast -Xpreprocessor -fopenmp -I`brew --prefix libomp`/include"
CFLAGS="-Wall -Wextra -pedantic -fPIC -Ofast -Xpreprocessor -fopenmp -I`brew --prefix libomp`/include"
clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o
clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o
clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_6_float.o
Expand All @@ -26,26 +26,26 @@ clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_
clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_4_double.o
clang $CFLAGS -dynamiclib scalar_born_cpu_iso_2_float.o scalar_born_cpu_iso_4_float.o scalar_born_cpu_iso_6_float.o scalar_born_cpu_iso_8_float.o scalar_born_cpu_iso_2_double.o scalar_born_cpu_iso_4_double.o scalar_born_cpu_iso_6_double.o scalar_born_cpu_iso_8_double.o scalar_cpu_iso_2_float.o scalar_cpu_iso_4_float.o scalar_cpu_iso_6_float.o scalar_cpu_iso_8_float.o scalar_cpu_iso_2_double.o scalar_cpu_iso_4_double.o scalar_cpu_iso_6_double.o scalar_cpu_iso_8_double.o elastic_cpu_iso_2_float.o elastic_cpu_iso_4_float.o elastic_cpu_iso_2_double.o elastic_cpu_iso_4_double.o -L. -l$DW_OMP_NAME -rpath @loader_path/ -o libdeepwave_cpu_macos_x86_64.dylib
rm *.o
CFLAGS="-Wall -Wextra -pedantic -fPIC -Ofast -arch arm64"
clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o
clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o
clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_6_float.o
clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_8_float.o
clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_2_double.o
clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_4_double.o
clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_6_double.o
clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_8_double.o
clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_2_float.o
clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_4_float.o
clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_6_float.o
clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_8_float.o
clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_2_double.o
clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_4_double.o
clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_6_double.o
clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_8_double.o
clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c elastic.c -o elastic_cpu_iso_2_float.o
clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c elastic.c -o elastic_cpu_iso_4_float.o
clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_2_double.o
clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_4_double.o
clang $CFLAGS -shared scalar_born_cpu_iso_2_float.o scalar_born_cpu_iso_4_float.o scalar_born_cpu_iso_6_float.o scalar_born_cpu_iso_8_float.o scalar_born_cpu_iso_2_double.o scalar_born_cpu_iso_4_double.o scalar_born_cpu_iso_6_double.o scalar_born_cpu_iso_8_double.o scalar_cpu_iso_2_float.o scalar_cpu_iso_4_float.o scalar_cpu_iso_6_float.o scalar_cpu_iso_8_float.o scalar_cpu_iso_2_double.o scalar_cpu_iso_4_double.o scalar_cpu_iso_6_double.o scalar_cpu_iso_8_double.o elastic_cpu_iso_2_float.o elastic_cpu_iso_4_float.o elastic_cpu_iso_2_double.o elastic_cpu_iso_4_double.o -o libdeepwave_cpu_macos_arm64.dylib
rm *.o
#CFLAGS="-Wall -Wextra -pedantic -fPIC -Ofast -arch arm64"
#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o
#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o
#clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_6_float.o
#clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_8_float.o
#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_2_double.o
#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_4_double.o
#clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_6_double.o
#clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_8_double.o
#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_2_float.o
#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_4_float.o
#clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_6_float.o
#clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_8_float.o
#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_2_double.o
#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_4_double.o
#clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_6_double.o
#clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_8_double.o
#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c elastic.c -o elastic_cpu_iso_2_float.o
#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c elastic.c -o elastic_cpu_iso_4_float.o
#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_2_double.o
#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_4_double.o
#clang $CFLAGS -shared scalar_born_cpu_iso_2_float.o scalar_born_cpu_iso_4_float.o scalar_born_cpu_iso_6_float.o scalar_born_cpu_iso_8_float.o scalar_born_cpu_iso_2_double.o scalar_born_cpu_iso_4_double.o scalar_born_cpu_iso_6_double.o scalar_born_cpu_iso_8_double.o scalar_cpu_iso_2_float.o scalar_cpu_iso_4_float.o scalar_cpu_iso_6_float.o scalar_cpu_iso_8_float.o scalar_cpu_iso_2_double.o scalar_cpu_iso_4_double.o scalar_cpu_iso_6_double.o scalar_cpu_iso_8_double.o elastic_cpu_iso_2_float.o elastic_cpu_iso_4_float.o elastic_cpu_iso_2_double.o elastic_cpu_iso_4_double.o -o libdeepwave_cpu_macos_arm64.dylib
#rm *.o
Loading

0 comments on commit 22d838a

Please sign in to comment.