diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7b59686..a472b3b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,32 +1,32 @@ name: Build and test on: push jobs: - Linux-build: - runs-on: ubuntu-latest - container: quay.io/pypa/manylinux2014_x86_64 - steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Install NVCC - run: | - yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo - yum install -y cuda-nvcc-11-1-11.1.105-1 cuda-cudart-devel-11-1-11.1.74-1 - - name: Compile - run: | - PATH=$PATH:/usr/local/cuda-11.1/bin - CUDA_HOME=/usr/local/cuda-11.1 - CUDA_ROOT=/usr/local/cuda-11.1 - CUDA_PATH=/usr/local/cuda-11.1 - CUDADIR=/usr/local/cuda-11.1 - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.1/lib64 - cd src/deepwave - cp /lib64/libgomp.so.1 . - ./build_linux.sh - - name: Archive built libraries - uses: actions/upload-artifact@v3 - with: - name: linux_libraries - path: src/deepwave/*.so* +# Linux-build: +# runs-on: ubuntu-latest +# container: quay.io/pypa/manylinux2014_x86_64 +# steps: +# - name: Checkout +# uses: actions/checkout@v3 +# - name: Install NVCC +# run: | +# yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo +# yum install -y cuda-nvcc-11-1-11.1.105-1 cuda-cudart-devel-11-1-11.1.74-1 +# - name: Compile +# run: | +# PATH=$PATH:/usr/local/cuda-11.1/bin +# CUDA_HOME=/usr/local/cuda-11.1 +# CUDA_ROOT=/usr/local/cuda-11.1 +# CUDA_PATH=/usr/local/cuda-11.1 +# CUDADIR=/usr/local/cuda-11.1 +# LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.1/lib64 +# cd src/deepwave +# cp /lib64/libgomp.so.1 . 
+# ./build_linux.sh +# - name: Archive built libraries +# uses: actions/upload-artifact@v3 +# with: +# name: linux_libraries +# path: src/deepwave/*.so* MacOS-build: runs-on: macos-11 steps: @@ -36,79 +36,94 @@ jobs: uses: actions/setup-python@v3 - name: Install dependencies run: | - python -m pip install --upgrade pip - python -m pip install torch + nuget install intelopenmp.devel.osx -DirectDownload -NonInteractive + #ls -R + #python -m pip install torch - name: Compile run: | cd src/deepwave - cp `python -c "import torch; print(torch.__path__[0])"`/lib/libiomp5.dylib . + #cp `python -c "import torch; print(torch.__path__[0])"`/lib/libiomp5.dylib . + mv ../../intelopenmp.devel.osx*/lib/native/osx-x64/libiomp5.dylib . brew install libomp ./build_macos.sh - - name: Archive built libraries - uses: actions/upload-artifact@v3 - with: - name: macos_libraries - path: src/deepwave/*.dylib - Windows-build: - runs-on: windows-2019 - defaults: - run: - shell: bash - steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v3 - - name: Install NVCC - run: | - curl https://developer.download.nvidia.com/compute/cuda/11.1.1/network_installers/cuda_11.1.1_win10_network.exe -o cuda_11.1.1_win10_network.exe - chmod +x ./cuda_11.1.1_win10_network.exe - ./cuda_11.1.1_win10_network.exe -s nvcc_11.1 cudart_11.1 - echo "CUDA_PATH=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1" >> $GITHUB_ENV - echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1\\bin" >> $GITHUB_PATH - - name: Setup MSVC - uses: ilammy/msvc-dev-cmd@v1 - - name: Compile - run: | - cd src/deepwave - ./build_windows.sh - - name: Archive built libraries - uses: actions/upload-artifact@v3 - with: - name: windows_libraries - path: src/deepwave/*.dll - Test: - strategy: - matrix: - os: [ubuntu-latest, macos-latest, windows-latest] - fail-fast: false - runs-on: ${{ matrix.os }} - needs: [Linux-build, MacOS-build, Windows-build] - steps: - - name: Checkout 
- uses: actions/checkout@v3 - - name: Download built Linux libraries - uses: actions/download-artifact@v3 - with: - name: linux_libraries - path: src/deepwave/ - - name: Download built MacOS libraries - uses: actions/download-artifact@v3 - with: - name: macos_libraries - path: src/deepwave/ - - name: Download built Windows libraries - uses: actions/download-artifact@v3 - with: - name: windows_libraries - path: src/deepwave/ - - name: Set up Python - uses: actions/setup-python@v3 - - name: Install dependencies - run: | + cd ../../ python -m pip install --upgrade pip python -m pip install pytest scipy python -m pip install . - name: Test with pytest run: | - pytest + #pytest -s + cd tests + PYTHONVERBOSE=3 python -c "import test_elastic; test_elastic.test_wavefield_decays()" +# - name: Archive built libraries +# uses: actions/upload-artifact@v3 +# with: +# name: macos_libraries +# path: src/deepwave/*.dylib +# Windows-build: +# runs-on: windows-2019 +# defaults: +# run: +# shell: bash +# steps: +# - name: Checkout +# uses: actions/checkout@v3 +# - name: Set up Python +# uses: actions/setup-python@v3 +# - name: Install NVCC +# run: | +# curl https://developer.download.nvidia.com/compute/cuda/11.1.1/network_installers/cuda_11.1.1_win10_network.exe -o cuda_11.1.1_win10_network.exe +# chmod +x ./cuda_11.1.1_win10_network.exe +# ./cuda_11.1.1_win10_network.exe -s nvcc_11.1 cudart_11.1 +# echo "CUDA_PATH=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1" >> $GITHUB_ENV +# echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1\\bin" >> $GITHUB_PATH +# - name: Setup MSVC +# uses: ilammy/msvc-dev-cmd@v1 +# - name: Compile +# run: | +# cd src/deepwave +# nuget install intelopenmp.devel.win -DirectDownload -NonInteractive +# nuget install intelopenmp.redist.win -DirectDownload -NonInteractive +# mv intelopenmp.devel.win*/lib/native/win-x64/libiomp5md.lib . +# mv intelopenmp.redist.win*/runtimes/win-x64/native/libiomp5md.dll . 
+# ./build_windows.sh +# - name: Archive built libraries +# uses: actions/upload-artifact@v3 +# with: +# name: windows_libraries +# path: src/deepwave/*.dll +# Test: +# strategy: +# matrix: +# os: [ubuntu-latest, macos-latest, windows-latest] +# fail-fast: false +# runs-on: ${{ matrix.os }} +# needs: [Linux-build, MacOS-build, Windows-build] +# steps: +# - name: Checkout +# uses: actions/checkout@v3 +# - name: Download built Linux libraries +# uses: actions/download-artifact@v3 +# with: +# name: linux_libraries +# path: src/deepwave/ +# - name: Download built MacOS libraries +# uses: actions/download-artifact@v3 +# with: +# name: macos_libraries +# path: src/deepwave/ +# - name: Download built Windows libraries +# uses: actions/download-artifact@v3 +# with: +# name: windows_libraries +# path: src/deepwave/ +# - name: Set up Python +# uses: actions/setup-python@v3 +# - name: Install dependencies +# run: | +# python -m pip install --upgrade pip +# python -m pip install pytest scipy +# python -m pip install . 
+# - name: Test with pytest +# run: | +# PYTHONVERBOSE=3 pytest diff --git a/src/deepwave/__init__.py b/src/deepwave/__init__.py index 87373db..ceb02eb 100644 --- a/src/deepwave/__init__.py +++ b/src/deepwave/__init__.py @@ -56,8 +56,11 @@ try: dll_cpu.omp_get_num_threads use_openmp = True + import torch + print('USING OPENMP', torch.get_num_threads()) except AttributeError: use_openmp = False + print('NOT USING OPENMP') dll_cpu.scalar_iso_2_float_forward.restype = None dll_cpu.scalar_iso_4_float_forward.restype = None dll_cpu.scalar_iso_6_float_forward.restype = None diff --git a/src/deepwave/build_linux.sh b/src/deepwave/build_linux.sh index 6e5efcf..c0f35c2 100755 --- a/src/deepwave/build_linux.sh +++ b/src/deepwave/build_linux.sh @@ -3,7 +3,7 @@ set -e DW_OMP_NAME=libgomp.so.1 -CFLAGS="-Wall -Wextra -pedantic -DDW_USE_OPENMP -fPIC -fopenmp -Ofast -mavx2" +CFLAGS="-Wall -Wextra -pedantic -fPIC -fopenmp -Ofast -mavx2" CUDAFLAGS="--restrict --use_fast_math -O3 -gencode=arch=compute_52,code=sm_52, -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 --compiler-options -fPIC" gcc $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o gcc $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o diff --git a/src/deepwave/build_macos.sh b/src/deepwave/build_macos.sh index db3f959..5a86f16 100755 --- a/src/deepwave/build_macos.sh +++ b/src/deepwave/build_macos.sh @@ -3,7 +3,7 @@ set -e DW_OMP_NAME=iomp5 -CFLAGS="-Wall -Wextra -pedantic -DDW_USE_OPENMP -fPIC -Ofast -Xpreprocessor -fopenmp -I`brew --prefix libomp`/include" +CFLAGS="-Wall -Wextra -pedantic -fPIC -Ofast -Xpreprocessor -fopenmp -I`brew --prefix libomp`/include" clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o clang $CFLAGS -DDW_ACCURACY=6 
-DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_6_float.o @@ -26,26 +26,26 @@ clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_ clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_4_double.o clang $CFLAGS -dynamiclib scalar_born_cpu_iso_2_float.o scalar_born_cpu_iso_4_float.o scalar_born_cpu_iso_6_float.o scalar_born_cpu_iso_8_float.o scalar_born_cpu_iso_2_double.o scalar_born_cpu_iso_4_double.o scalar_born_cpu_iso_6_double.o scalar_born_cpu_iso_8_double.o scalar_cpu_iso_2_float.o scalar_cpu_iso_4_float.o scalar_cpu_iso_6_float.o scalar_cpu_iso_8_float.o scalar_cpu_iso_2_double.o scalar_cpu_iso_4_double.o scalar_cpu_iso_6_double.o scalar_cpu_iso_8_double.o elastic_cpu_iso_2_float.o elastic_cpu_iso_4_float.o elastic_cpu_iso_2_double.o elastic_cpu_iso_4_double.o -L. -l$DW_OMP_NAME -rpath @loader_path/ -o libdeepwave_cpu_macos_x86_64.dylib rm *.o -CFLAGS="-Wall -Wextra -pedantic -fPIC -Ofast -arch arm64" -clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o -clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o -clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_6_float.o -clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_8_float.o -clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_2_double.o -clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_4_double.o -clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_6_double.o -clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_8_double.o -clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_2_float.o -clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_4_float.o -clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_6_float.o -clang $CFLAGS 
-DDW_ACCURACY=8 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_8_float.o -clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_2_double.o -clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_4_double.o -clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_6_double.o -clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_8_double.o -clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c elastic.c -o elastic_cpu_iso_2_float.o -clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c elastic.c -o elastic_cpu_iso_4_float.o -clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_2_double.o -clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_4_double.o -clang $CFLAGS -shared scalar_born_cpu_iso_2_float.o scalar_born_cpu_iso_4_float.o scalar_born_cpu_iso_6_float.o scalar_born_cpu_iso_8_float.o scalar_born_cpu_iso_2_double.o scalar_born_cpu_iso_4_double.o scalar_born_cpu_iso_6_double.o scalar_born_cpu_iso_8_double.o scalar_cpu_iso_2_float.o scalar_cpu_iso_4_float.o scalar_cpu_iso_6_float.o scalar_cpu_iso_8_float.o scalar_cpu_iso_2_double.o scalar_cpu_iso_4_double.o scalar_cpu_iso_6_double.o scalar_cpu_iso_8_double.o elastic_cpu_iso_2_float.o elastic_cpu_iso_4_float.o elastic_cpu_iso_2_double.o elastic_cpu_iso_4_double.o -o libdeepwave_cpu_macos_arm64.dylib -rm *.o +#CFLAGS="-Wall -Wextra -pedantic -fPIC -Ofast -arch arm64" +#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o +#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o +#clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_6_float.o +#clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_8_float.o +#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_2_double.o +#clang $CFLAGS 
-DDW_ACCURACY=4 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_4_double.o +#clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_6_double.o +#clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_8_double.o +#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_2_float.o +#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_4_float.o +#clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_6_float.o +#clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_8_float.o +#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_2_double.o +#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_4_double.o +#clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_6_double.o +#clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_8_double.o +#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c elastic.c -o elastic_cpu_iso_2_float.o +#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c elastic.c -o elastic_cpu_iso_4_float.o +#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_2_double.o +#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_4_double.o +#clang $CFLAGS -shared scalar_born_cpu_iso_2_float.o scalar_born_cpu_iso_4_float.o scalar_born_cpu_iso_6_float.o scalar_born_cpu_iso_8_float.o scalar_born_cpu_iso_2_double.o scalar_born_cpu_iso_4_double.o scalar_born_cpu_iso_6_double.o scalar_born_cpu_iso_8_double.o scalar_cpu_iso_2_float.o scalar_cpu_iso_4_float.o scalar_cpu_iso_6_float.o scalar_cpu_iso_8_float.o scalar_cpu_iso_2_double.o scalar_cpu_iso_4_double.o scalar_cpu_iso_6_double.o scalar_cpu_iso_8_double.o elastic_cpu_iso_2_float.o elastic_cpu_iso_4_float.o elastic_cpu_iso_2_double.o 
elastic_cpu_iso_4_double.o -o libdeepwave_cpu_macos_arm64.dylib +#rm *.o diff --git a/src/deepwave/build_windows.sh b/src/deepwave/build_windows.sh index fa64b3e..6028256 100755 --- a/src/deepwave/build_windows.sh +++ b/src/deepwave/build_windows.sh @@ -2,7 +2,7 @@ set -e -CFLAGS="-Wall -O2 -fp:fast -arch:AVX2" +CFLAGS="-Wall -O2 -fp:fast -arch:AVX2 -openmp" CUDAFLAGS="--restrict --use_fast_math -O3 -gencode=arch=compute_52,code=sm_52, -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80" cl $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -Foscalar_cpu_iso_2_float.obj cl $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -Foscalar_cpu_iso_4_float.obj @@ -24,7 +24,7 @@ cl $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c elastic.c -Foelastic_cpu_iso_2_fl cl $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c elastic.c -Foelastic_cpu_iso_4_float.obj cl $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c elastic.c -Foelastic_cpu_iso_2_double.obj cl $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c elastic.c -Foelastic_cpu_iso_4_double.obj -cl $CFLAGS -LD scalar_born_cpu_iso_2_float.obj scalar_born_cpu_iso_4_float.obj scalar_born_cpu_iso_6_float.obj scalar_born_cpu_iso_8_float.obj scalar_born_cpu_iso_2_double.obj scalar_born_cpu_iso_4_double.obj scalar_born_cpu_iso_6_double.obj scalar_born_cpu_iso_8_double.obj scalar_cpu_iso_2_float.obj scalar_cpu_iso_4_float.obj scalar_cpu_iso_6_float.obj scalar_cpu_iso_8_float.obj scalar_cpu_iso_2_double.obj scalar_cpu_iso_4_double.obj scalar_cpu_iso_6_double.obj scalar_cpu_iso_8_double.obj elastic_cpu_iso_2_float.obj elastic_cpu_iso_4_float.obj elastic_cpu_iso_2_double.obj elastic_cpu_iso_4_double.obj -Felibdeepwave_cpu_windows_x86_64.dll +cl $CFLAGS -LD scalar_born_cpu_iso_2_float.obj scalar_born_cpu_iso_4_float.obj scalar_born_cpu_iso_6_float.obj scalar_born_cpu_iso_8_float.obj scalar_born_cpu_iso_2_double.obj scalar_born_cpu_iso_4_double.obj 
scalar_born_cpu_iso_6_double.obj scalar_born_cpu_iso_8_double.obj scalar_cpu_iso_2_float.obj scalar_cpu_iso_4_float.obj scalar_cpu_iso_6_float.obj scalar_cpu_iso_8_float.obj scalar_cpu_iso_2_double.obj scalar_cpu_iso_4_double.obj scalar_cpu_iso_6_double.obj scalar_cpu_iso_8_double.obj elastic_cpu_iso_2_float.obj elastic_cpu_iso_4_float.obj elastic_cpu_iso_2_double.obj elastic_cpu_iso_4_double.obj -Felibdeepwave_cpu_windows_x86_64.dll -link -nodefaultlib:vcomp libiomp5md.lib nvcc $CUDAFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.cu -o scalar_cuda_iso_2_float.obj nvcc $CUDAFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.cu -o scalar_cuda_iso_4_float.obj nvcc $CUDAFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar.cu -o scalar_cuda_iso_6_float.obj diff --git a/src/deepwave/common_cpu.h b/src/deepwave/common_cpu.h index 07f794b..30478c9 100644 --- a/src/deepwave/common_cpu.h +++ b/src/deepwave/common_cpu.h @@ -1,12 +1,6 @@ #ifndef DW_COMMON_CPU_H #define DW_COMMON_CPU_H -//#ifdef DW_USE_OPENMP -//int dw_use_openmp = 1; -//#else -//int dw_use_openmp = 0; -//#endif /* DW_USE_OPENMP */ - static void add_sources(DW_DTYPE *__restrict const wf, DW_DTYPE const *__restrict const f, int64_t const *__restrict const sources_i, diff --git a/src/deepwave/elastic.c b/src/deepwave/elastic.c index bb75b77..43dde83 100644 --- a/src/deepwave/elastic.c +++ b/src/deepwave/elastic.c @@ -9,9 +9,9 @@ VX SII | VX SII | VX sii SXY-VY--|-SXY-VY--|-SXY vy */ -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #include -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ #include #include #include "common.h" @@ -66,7 +66,7 @@ SXY-VY--|-SXY-VY--|-SXY vy for (y = y_begin_y; y < y_end_y; ++y) {\ int64_t yi = y * nx;\ for (x = x_begin_y; x < x_end_y; ++x) {\ - int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, k;\ + int64_t i = yi + x, j, k;\ DW_DTYPE dsigmayydy = 0;\ DW_DTYPE dsigmaxydx = 0;\ \ @@ -118,15 +118,15 @@ SXY-VY--|-SXY-VY--|-SXY vy {\ DW_DTYPE buoyancyyhxh;\ if (pml_y == 2 && y == ny - 1) {\ - 
buoyancyyhxh = (buoyancy[i_noshot] + buoyancy[i_noshot + 1]) / 2;\ + buoyancyyhxh = (buoyancy[i] + buoyancy[i + 1]) / 2;\ } else {\ - buoyancyyhxh = (buoyancy[i_noshot] + buoyancy[i_noshot + 1] + buoyancy[i_noshot + nx] +\ - buoyancy[i_noshot + nx + 1]) /\ + buoyancyyhxh = (buoyancy[i] + buoyancy[i + 1] + buoyancy[i + nx] +\ + buoyancy[i + nx + 1]) /\ 4;\ }\ vy[i] += buoyancyyhxh * dt * (dsigmayydy + dsigmaxydx);\ if (buoyancy_requires_grad) {\ - dvydbuoyancy[store_i + i_noshot] = dt * (dsigmayydy + dsigmaxydx);\ + dvydbuoyancy[i] = dt * (dsigmayydy + dsigmaxydx);\ }\ }\ }\ @@ -135,7 +135,7 @@ SXY-VY--|-SXY-VY--|-SXY vy for (y = y_begin_x; y < y_end_x; ++y) {\ int64_t yi = y * nx;\ for (x = x_begin_x; x < x_end_x; ++x) {\ - int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, k;\ + int64_t i = yi + x, j, k;\ DW_DTYPE dsigmaxydy = 0;\ DW_DTYPE dsigmaxxdx = 0;\ \ @@ -184,9 +184,9 @@ SXY-VY--|-SXY-VY--|-SXY vy m_sigmaxxx[i] = ax[x] * m_sigmaxxx[i] + bx[x] * dsigmaxxdx;\ dsigmaxxdx += m_sigmaxxx[i];\ }\ - vx[i] += buoyancy[i_noshot] * dt * (dsigmaxxdx + dsigmaxydy);\ + vx[i] += buoyancy[i] * dt * (dsigmaxxdx + dsigmaxydy);\ if (buoyancy_requires_grad) {\ - dvxdbuoyancy[store_i + i_noshot] = dt * (dsigmaxxdx + dsigmaxydy);\ + dvxdbuoyancy[i] = dt * (dsigmaxxdx + dsigmaxydy);\ }\ }\ \ @@ -231,7 +231,7 @@ SXY-VY--|-SXY-VY--|-SXY vy for (y = y_begin_ii; y < y_end_ii; ++y) {\ int64_t yi = y * nx;\ for (x = x_begin_ii; x < x_end_ii; ++x) {\ - int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, k;\ + int64_t i = yi + x, j, k;\ DW_DTYPE dvydy = 0;\ DW_DTYPE dvxdx = 0;\ \ @@ -280,13 +280,13 @@ SXY-VY--|-SXY-VY--|-SXY vy dvxdx += m_vxx[i];\ }\ {\ - DW_DTYPE lambyxh = (lamb[i_noshot] + lamb[i_noshot + 1]) / 2;\ - DW_DTYPE muyxh = (mu[i_noshot] + mu[i_noshot + 1]) / 2;\ + DW_DTYPE lambyxh = (lamb[i] + lamb[i + 1]) / 2;\ + DW_DTYPE muyxh = (mu[i] + mu[i + 1]) / 2;\ sigmayy[i] += dt * ((lambyxh + 2 * muyxh) * dvydy + lambyxh * dvxdx);\ sigmaxx[i] += dt * ((lambyxh + 
2 * muyxh) * dvxdx + lambyxh * dvydy);\ if (lamb_requires_grad || mu_requires_grad) {\ - dvydy_store[store_i + i_noshot] = dt * dvydy;\ - dvxdx_store[store_i + i_noshot] = dt * dvxdx;\ + dvydy_store[i] = dt * dvydy;\ + dvxdx_store[i] = dt * dvxdx;\ }\ }\ }\ @@ -295,7 +295,7 @@ SXY-VY--|-SXY-VY--|-SXY vy for (y = y_begin_xy; y < y_end_xy; ++y) {\ int64_t yi = y * nx;\ for (x = x_begin_xy; x < x_end_xy; ++x) {\ - int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, jp, k;\ + int64_t i = yi + x, j, jp, k;\ DW_DTYPE dvydx = 0;\ DW_DTYPE dvxdy = 0;\ \ @@ -421,10 +421,10 @@ SXY-VY--|-SXY-VY--|-SXY vy dvydx += m_vyx[i];\ }\ {\ - DW_DTYPE muyhx = (mu[i_noshot] + mu[i_noshot + nx]) / 2;\ + DW_DTYPE muyhx = (mu[i] + mu[i + nx]) / 2;\ sigmaxy[i] += dt * muyhx * (dvydx + dvxdy);\ if (mu_requires_grad) {\ - dvydxdvxdy_store[store_i + i_noshot] = dt * (dvydx + dvxdy);\ + dvydxdvxdy_store[i] = dt * (dvydx + dvxdy);\ }\ }\ }\ @@ -1385,6 +1385,109 @@ static void combine_grad_elastic(DW_DTYPE *__restrict const grad, } } +#ifdef _WIN32 +__declspec(noinline) +#else +__attribute__ ((noinline)) +#endif +static void forward_shot_v(DW_DTYPE const *__restrict const buoyancy, DW_DTYPE *__restrict const vy, DW_DTYPE *__restrict const vx, DW_DTYPE const *__restrict const sigmayy, + DW_DTYPE const *__restrict const sigmaxy, DW_DTYPE const *__restrict const sigmaxx, + DW_DTYPE *__restrict const m_sigmayyy, DW_DTYPE *__restrict const m_sigmaxyy, + DW_DTYPE *__restrict const m_sigmaxyx, DW_DTYPE *__restrict const m_sigmaxxx, + DW_DTYPE *__restrict const dvydbuoyancy, DW_DTYPE *__restrict const dvxdbuoyancy, + DW_DTYPE const *__restrict const ay, + DW_DTYPE const *__restrict const ayh, DW_DTYPE const *__restrict const ax, DW_DTYPE const *__restrict const axh, + DW_DTYPE const *__restrict const by, DW_DTYPE const *__restrict const byh, DW_DTYPE const *__restrict const bx, + DW_DTYPE const *__restrict const bxh, + DW_DTYPE const dt, int64_t const ny, int64_t const nx, + bool const 
buoyancy_requires_grad, int64_t const pml_y0, + int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1) { + int64_t y, x, y_begin_y, y_end_y, x_begin_y, x_end_y, y_begin_x, y_end_x, x_begin_x, x_end_x; + if (buoyancy_requires_grad) { + FORWARD_KERNEL_V(0, 0, 1) + FORWARD_KERNEL_V(0, 1, 1) + FORWARD_KERNEL_V(0, 2, 1) + FORWARD_KERNEL_V(1, 0, 1) + FORWARD_KERNEL_V(1, 1, 1) + FORWARD_KERNEL_V(1, 2, 1) + FORWARD_KERNEL_V(2, 0, 1) + FORWARD_KERNEL_V(2, 1, 1) + FORWARD_KERNEL_V(2, 2, 1) + } else { + FORWARD_KERNEL_V(0, 0, 0) + FORWARD_KERNEL_V(0, 1, 0) + FORWARD_KERNEL_V(0, 2, 0) + FORWARD_KERNEL_V(1, 0, 0) + FORWARD_KERNEL_V(1, 1, 0) + FORWARD_KERNEL_V(1, 2, 0) + FORWARD_KERNEL_V(2, 0, 0) + FORWARD_KERNEL_V(2, 1, 0) + FORWARD_KERNEL_V(2, 2, 0) + } +} + +#ifdef _WIN32 +__declspec(noinline) +#else +__attribute__ ((noinline)) +#endif +static void forward_shot_sigma( + DW_DTYPE const *__restrict const lamb, + DW_DTYPE const *__restrict const mu, DW_DTYPE const *__restrict const vy, DW_DTYPE const *__restrict const vx, DW_DTYPE *__restrict const sigmayy, + DW_DTYPE *__restrict const sigmaxy, DW_DTYPE *__restrict const sigmaxx, DW_DTYPE *__restrict const m_vyy, + DW_DTYPE *__restrict const m_vyx, DW_DTYPE *__restrict const m_vxy, DW_DTYPE *__restrict const m_vxx, + DW_DTYPE *__restrict const dvydy_store, DW_DTYPE *__restrict const dvxdx_store, + DW_DTYPE *__restrict const dvydxdvxdy_store, DW_DTYPE const *__restrict const ay, DW_DTYPE const *__restrict const ayh, + DW_DTYPE const *__restrict const ax, DW_DTYPE const *__restrict const axh, DW_DTYPE const *__restrict const by, + DW_DTYPE const *__restrict const byh, DW_DTYPE const *__restrict const bx, DW_DTYPE const *__restrict const bxh, + DW_DTYPE const dt, int64_t const ny, int64_t const nx, + bool const lamb_requires_grad, bool const mu_requires_grad, int64_t const pml_y0, + int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1) { + int64_t y, x, y_begin_ii, y_end_ii, x_begin_ii, x_end_ii, y_begin_xy, 
y_end_xy, x_begin_xy, x_end_xy; + if (lamb_requires_grad && mu_requires_grad) { + FORWARD_KERNEL_SIGMA(0, 0, 1, 1) + FORWARD_KERNEL_SIGMA(0, 1, 1, 1) + FORWARD_KERNEL_SIGMA(0, 2, 1, 1) + FORWARD_KERNEL_SIGMA(1, 0, 1, 1) + FORWARD_KERNEL_SIGMA(1, 1, 1, 1) + FORWARD_KERNEL_SIGMA(1, 2, 1, 1) + FORWARD_KERNEL_SIGMA(2, 0, 1, 1) + FORWARD_KERNEL_SIGMA(2, 1, 1, 1) + FORWARD_KERNEL_SIGMA(2, 2, 1, 1) + } else if (lamb_requires_grad) { + FORWARD_KERNEL_SIGMA(0, 0, 1, 0) + FORWARD_KERNEL_SIGMA(0, 1, 1, 0) + FORWARD_KERNEL_SIGMA(0, 2, 1, 0) + FORWARD_KERNEL_SIGMA(1, 0, 1, 0) + FORWARD_KERNEL_SIGMA(1, 1, 1, 0) + FORWARD_KERNEL_SIGMA(1, 2, 1, 0) + FORWARD_KERNEL_SIGMA(2, 0, 1, 0) + FORWARD_KERNEL_SIGMA(2, 1, 1, 0) + FORWARD_KERNEL_SIGMA(2, 2, 1, 0) + } else if (mu_requires_grad) { + FORWARD_KERNEL_SIGMA(0, 0, 0, 1) + FORWARD_KERNEL_SIGMA(0, 1, 0, 1) + FORWARD_KERNEL_SIGMA(0, 2, 0, 1) + FORWARD_KERNEL_SIGMA(1, 0, 0, 1) + FORWARD_KERNEL_SIGMA(1, 1, 0, 1) + FORWARD_KERNEL_SIGMA(1, 2, 0, 1) + FORWARD_KERNEL_SIGMA(2, 0, 0, 1) + FORWARD_KERNEL_SIGMA(2, 1, 0, 1) + FORWARD_KERNEL_SIGMA(2, 2, 0, 1) + } else { + FORWARD_KERNEL_SIGMA(0, 0, 0, 0) + FORWARD_KERNEL_SIGMA(0, 1, 0, 0) + FORWARD_KERNEL_SIGMA(0, 2, 0, 0) + FORWARD_KERNEL_SIGMA(1, 0, 0, 0) + FORWARD_KERNEL_SIGMA(1, 1, 0, 0) + FORWARD_KERNEL_SIGMA(1, 2, 0, 0) + FORWARD_KERNEL_SIGMA(2, 0, 0, 0) + FORWARD_KERNEL_SIGMA(2, 1, 0, 0) + FORWARD_KERNEL_SIGMA(2, 2, 0, 0) + } +} + + static void backward_shot(DW_DTYPE const *__restrict const lamb, DW_DTYPE const *__restrict const mu, DW_DTYPE const *__restrict const buoyancy, DW_DTYPE const *__restrict const grad_r_y, DW_DTYPE const *__restrict const grad_r_x, DW_DTYPE const *__restrict const grad_r_p, @@ -1562,8 +1665,6 @@ static void set_fd_coeffs_x(DW_DTYPE const dx) { } } - - #ifdef _WIN32 __declspec(dllexport) #endif @@ -1597,9 +1698,9 @@ __declspec(dllexport) int64_t shot; set_fd_coeffs_y(dy); set_fd_coeffs_x(dx); -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #pragma omp parallel for 
num_threads(n_threads) -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ for (shot = 0; shot < n_shots; ++shot) { int64_t const si = shot * ny * nx; int64_t const siy = shot * n_sources_y_per_shot; @@ -1607,7 +1708,7 @@ __declspec(dllexport) int64_t const riy = shot * n_receivers_y_per_shot; int64_t const rix = shot * n_receivers_x_per_shot; int64_t const rip = shot * n_receivers_p_per_shot; - int64_t t, y, x, y_begin_y, y_end_y, x_begin_y, x_end_y, y_begin_x, y_end_x, x_begin_x, x_end_x, y_begin_ii, y_end_ii, x_begin_ii, x_end_ii, y_begin_xy, y_end_xy, x_begin_xy, x_end_xy; + int64_t t; for (t = 0; t < nt; ++t) { int64_t store_i = shot * (nt / step_ratio) * ny * nx + (t / step_ratio) * ny * nx; @@ -1623,27 +1724,18 @@ __declspec(dllexport) record_pressure_receivers(r_p + rip * nt + t * n_receivers_p_per_shot, sigmayy + si, sigmaxx + si, receivers_p_i + rip, n_receivers_p_per_shot); } - if (buoyancy_requires_grad && ((t % step_ratio) == 0)) { - FORWARD_KERNEL_V(0, 0, 1) - FORWARD_KERNEL_V(0, 1, 1) - FORWARD_KERNEL_V(0, 2, 1) - FORWARD_KERNEL_V(1, 0, 1) - FORWARD_KERNEL_V(1, 1, 1) - FORWARD_KERNEL_V(1, 2, 1) - FORWARD_KERNEL_V(2, 0, 1) - FORWARD_KERNEL_V(2, 1, 1) - FORWARD_KERNEL_V(2, 2, 1) - } else { - FORWARD_KERNEL_V(0, 0, 0) - FORWARD_KERNEL_V(0, 1, 0) - FORWARD_KERNEL_V(0, 2, 0) - FORWARD_KERNEL_V(1, 0, 0) - FORWARD_KERNEL_V(1, 1, 0) - FORWARD_KERNEL_V(1, 2, 0) - FORWARD_KERNEL_V(2, 0, 0) - FORWARD_KERNEL_V(2, 1, 0) - FORWARD_KERNEL_V(2, 2, 0) - } + forward_shot_v(buoyancy, vy + si, vx + si, sigmayy + si, + sigmaxy + si, sigmaxx + si, + m_sigmayyy + si, m_sigmaxyy + si, + m_sigmaxyx + si, m_sigmaxxx + si, + dvydbuoyancy + store_i, dvxdbuoyancy + store_i, + ay, + ayh, ax, axh, + by, byh, bx, + bxh, + dt, ny, nx, + buoyancy_requires_grad && ((t % step_ratio) == 0), pml_y0, + pml_y1, pml_x0, pml_x1); if (n_sources_y_per_shot > 0) { add_sources(vy + si, f_y + siy * nt + t * n_sources_y_per_shot, sources_y_i + siy, n_sources_y_per_shot); @@ -1652,47 +1744,18 @@ 
__declspec(dllexport) add_sources(vx + si, f_x + six * nt + t * n_sources_x_per_shot, sources_x_i + six, n_sources_x_per_shot); } - if (lamb_requires_grad && mu_requires_grad && ((t % step_ratio) == 0)) { - FORWARD_KERNEL_SIGMA(0, 0, 1, 1) - FORWARD_KERNEL_SIGMA(0, 1, 1, 1) - FORWARD_KERNEL_SIGMA(0, 2, 1, 1) - FORWARD_KERNEL_SIGMA(1, 0, 1, 1) - FORWARD_KERNEL_SIGMA(1, 1, 1, 1) - FORWARD_KERNEL_SIGMA(1, 2, 1, 1) - FORWARD_KERNEL_SIGMA(2, 0, 1, 1) - FORWARD_KERNEL_SIGMA(2, 1, 1, 1) - FORWARD_KERNEL_SIGMA(2, 2, 1, 1) - } else if (lamb_requires_grad && ((t % step_ratio) == 0)) { - FORWARD_KERNEL_SIGMA(0, 0, 1, 0) - FORWARD_KERNEL_SIGMA(0, 1, 1, 0) - FORWARD_KERNEL_SIGMA(0, 2, 1, 0) - FORWARD_KERNEL_SIGMA(1, 0, 1, 0) - FORWARD_KERNEL_SIGMA(1, 1, 1, 0) - FORWARD_KERNEL_SIGMA(1, 2, 1, 0) - FORWARD_KERNEL_SIGMA(2, 0, 1, 0) - FORWARD_KERNEL_SIGMA(2, 1, 1, 0) - FORWARD_KERNEL_SIGMA(2, 2, 1, 0) - } else if (mu_requires_grad && ((t % step_ratio) == 0)) { - FORWARD_KERNEL_SIGMA(0, 0, 0, 1) - FORWARD_KERNEL_SIGMA(0, 1, 0, 1) - FORWARD_KERNEL_SIGMA(0, 2, 0, 1) - FORWARD_KERNEL_SIGMA(1, 0, 0, 1) - FORWARD_KERNEL_SIGMA(1, 1, 0, 1) - FORWARD_KERNEL_SIGMA(1, 2, 0, 1) - FORWARD_KERNEL_SIGMA(2, 0, 0, 1) - FORWARD_KERNEL_SIGMA(2, 1, 0, 1) - FORWARD_KERNEL_SIGMA(2, 2, 0, 1) - } else { - FORWARD_KERNEL_SIGMA(0, 0, 0, 0) - FORWARD_KERNEL_SIGMA(0, 1, 0, 0) - FORWARD_KERNEL_SIGMA(0, 2, 0, 0) - FORWARD_KERNEL_SIGMA(1, 0, 0, 0) - FORWARD_KERNEL_SIGMA(1, 1, 0, 0) - FORWARD_KERNEL_SIGMA(1, 2, 0, 0) - FORWARD_KERNEL_SIGMA(2, 0, 0, 0) - FORWARD_KERNEL_SIGMA(2, 1, 0, 0) - FORWARD_KERNEL_SIGMA(2, 2, 0, 0) - } +forward_shot_sigma( + lamb, + mu, vy + si, vx + si, sigmayy + si, + sigmaxy + si, sigmaxx + si, m_vyy + si, + m_vyx + si, m_vxy + si, m_vxx + si, + dvydy_store + store_i, dvxdx_store + store_i, + dvydxdvxdy_store + store_i, ay, ayh, + ax, axh, by, + byh, bx, bxh, + dt, ny, nx, + lamb_requires_grad && ((t % step_ratio) == 0), mu_requires_grad && ((t % step_ratio) == 0), pml_y0, + pml_y1, 
pml_x0, pml_x1); } if (n_receivers_y_per_shot > 0) { record_receivers(r_y + riy * (nt + 1) + t * n_receivers_y_per_shot, vy + si, receivers_y_i + riy, @@ -1739,9 +1802,9 @@ __declspec(dllexport) int64_t shot; set_fd_coeffs_y(dy); set_fd_coeffs_x(dx); -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #pragma omp parallel for num_threads(n_threads) -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ for (shot = 0; shot < n_shots; ++shot) { int64_t const si = shot * ny * nx; int64_t const siy = shot * n_sources_y_per_shot; @@ -1749,11 +1812,11 @@ __declspec(dllexport) int64_t const riy = shot * n_receivers_y_per_shot; int64_t const rix = shot * n_receivers_x_per_shot; int64_t const rip = shot * n_receivers_p_per_shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP int64_t const threadi = omp_get_thread_num() * ny * nx; #else int64_t const threadi = 0; -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ int64_t t; if (n_receivers_y_per_shot > 0 && nt > 0) { add_sources(vy + si, grad_r_y + riy * (nt+1) + nt * n_receivers_y_per_shot, receivers_y_i + riy, @@ -1826,7 +1889,7 @@ __declspec(dllexport) } } } -#ifdef DW_USE_OPENMP +#ifdef _OPENMP if (lamb_requires_grad && n_threads > 1) { combine_grad_elastic(grad_lamb, grad_lamb_thread, n_threads, ny, nx); } @@ -1836,5 +1899,5 @@ __declspec(dllexport) if (buoyancy_requires_grad && n_threads > 1) { combine_grad_elastic(grad_buoyancy, grad_buoyancy_thread, n_threads, ny, nx); } -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ } diff --git a/src/deepwave/scalar.c b/src/deepwave/scalar.c index 887e4f6..f975de0 100644 --- a/src/deepwave/scalar.c +++ b/src/deepwave/scalar.c @@ -17,9 +17,9 @@ * backward. 
*/ -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #include -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ #include #include #include "common.h" @@ -241,9 +241,9 @@ __declspec(dllexport) bool const v_requires_grad, int64_t const pml_y0, int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1, int64_t const n_threads) { int64_t shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #pragma omp parallel for num_threads(n_threads) -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ for (shot = 0; shot < n_shots; ++shot) { int64_t const i = shot * ny * nx; int64_t const si = shot * n_sources_per_shot; @@ -308,18 +308,18 @@ __declspec(dllexport) int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1, int64_t const n_threads) { int64_t shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #pragma omp parallel for num_threads(n_threads) -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ for (shot = 0; shot < n_shots; ++shot) { int64_t const i = shot * ny * nx; int64_t const si = shot * n_sources_per_shot; int64_t const ri = shot * n_receivers_per_shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP int64_t const threadi = omp_get_thread_num() * ny * nx; #else int64_t const threadi = 0; -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ int64_t t; for (t = nt - 1; t >= 0; --t) { if ((nt - 1 - t) & 1) { @@ -351,9 +351,9 @@ __declspec(dllexport) } } } -#ifdef DW_USE_OPENMP +#ifdef _OPENMP if (v_requires_grad && n_threads > 1) { combine_grad(grad_v, grad_v_thread, n_threads, ny, nx); } -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ } diff --git a/src/deepwave/scalar_born.c b/src/deepwave/scalar_born.c index 16320f0..eefd0da 100644 --- a/src/deepwave/scalar_born.c +++ b/src/deepwave/scalar_born.c @@ -17,9 +17,9 @@ * backward. 
*/ -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #include -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ #include #include #include "common.h" @@ -477,9 +477,9 @@ __declspec(dllexport) bool const v_requires_grad, bool const scatter_requires_grad, int64_t const pml_y0, int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1, int64_t const n_threads) { int64_t shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #pragma omp parallel for num_threads(n_threads) -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ for (shot = 0; shot < n_shots; ++shot) { int64_t const i = shot * ny * nx; int64_t const si = shot * n_sources_per_shot; @@ -569,20 +569,20 @@ __declspec(dllexport) int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1, int64_t const n_threads) { int64_t shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #pragma omp parallel for num_threads(n_threads) -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ for (shot = 0; shot < n_shots; ++shot) { int64_t const i = shot * ny * nx; int64_t const si = shot * n_sources_per_shot; int64_t const sisc = shot * n_sourcessc_per_shot; int64_t const ri = shot * n_receivers_per_shot; int64_t const risc = shot * n_receiverssc_per_shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP int64_t const threadi = omp_get_thread_num() * ny * nx; #else int64_t const threadi = 0; -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ int64_t t; for (t = nt - 1; t >= 0; --t) { if ((nt - 1 - t) & 1) { @@ -631,14 +631,14 @@ __declspec(dllexport) } } } -#ifdef DW_USE_OPENMP +#ifdef _OPENMP if (v_requires_grad && n_threads > 1) { combine_grad(grad_v, grad_v_thread, n_threads, ny, nx); } if (scatter_requires_grad && n_threads > 1) { combine_grad(grad_scatter, grad_scatter_thread, n_threads, ny, nx); } -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ } #ifdef _WIN32 @@ -670,18 +670,18 @@ __declspec(dllexport) int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1, int64_t const n_threads) { int64_t shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #pragma omp 
parallel for num_threads(n_threads) -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ for (shot = 0; shot < n_shots; ++shot) { int64_t const i = shot * ny * nx; int64_t const sisc = shot * n_sourcessc_per_shot; int64_t const risc = shot * n_receiverssc_per_shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP int64_t const threadi = omp_get_thread_num() * ny * nx; #else int64_t const threadi = 0; -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ int64_t t; for (t = nt - 1; t >= 0; --t) { if ((nt - 1 - t) & 1) { @@ -715,9 +715,9 @@ __declspec(dllexport) } } } -#ifdef DW_USE_OPENMP +#ifdef _OPENMP if (scatter_requires_grad && n_threads > 1) { combine_grad(grad_scatter, grad_scatter_thread, n_threads, ny, nx); } -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ }