From e5c54c765a548662cb5a3c06caa08d496b4fbb51 Mon Sep 17 00:00:00 2001 From: Alan Richardson Date: Fri, 21 Jul 2023 17:39:48 +0100 Subject: [PATCH] move forward kernels in elastic.c into separate functions so that they are vectorized even when using OpenMP --- src/deepwave/build_linux.sh | 2 +- src/deepwave/build_macos.sh | 2 +- src/deepwave/common_cpu.h | 6 - src/deepwave/elastic.c | 245 ++++++++++++++++++++++-------------- src/deepwave/scalar.c | 20 +-- src/deepwave/scalar_born.c | 32 ++--- 6 files changed, 182 insertions(+), 125 deletions(-) diff --git a/src/deepwave/build_linux.sh b/src/deepwave/build_linux.sh index 6e5efcf..c0f35c2 100755 --- a/src/deepwave/build_linux.sh +++ b/src/deepwave/build_linux.sh @@ -3,7 +3,7 @@ set -e DW_OMP_NAME=libgomp.so.1 -CFLAGS="-Wall -Wextra -pedantic -DDW_USE_OPENMP -fPIC -fopenmp -Ofast -mavx2" +CFLAGS="-Wall -Wextra -pedantic -fPIC -fopenmp -Ofast -mavx2" CUDAFLAGS="--restrict --use_fast_math -O3 -gencode=arch=compute_52,code=sm_52, -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 --compiler-options -fPIC" gcc $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o gcc $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o diff --git a/src/deepwave/build_macos.sh b/src/deepwave/build_macos.sh index db3f959..083a391 100755 --- a/src/deepwave/build_macos.sh +++ b/src/deepwave/build_macos.sh @@ -3,7 +3,7 @@ set -e DW_OMP_NAME=iomp5 -CFLAGS="-Wall -Wextra -pedantic -DDW_USE_OPENMP -fPIC -Ofast -Xpreprocessor -fopenmp -I`brew --prefix libomp`/include" +CFLAGS="-Wall -Wextra -pedantic -fPIC -Ofast -Xpreprocessor -fopenmp -I`brew --prefix libomp`/include" clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c 
scalar.c -o scalar_cpu_iso_6_float.o diff --git a/src/deepwave/common_cpu.h b/src/deepwave/common_cpu.h index 07f794b..30478c9 100644 --- a/src/deepwave/common_cpu.h +++ b/src/deepwave/common_cpu.h @@ -1,12 +1,6 @@ #ifndef DW_COMMON_CPU_H #define DW_COMMON_CPU_H -//#ifdef DW_USE_OPENMP -//int dw_use_openmp = 1; -//#else -//int dw_use_openmp = 0; -//#endif /* DW_USE_OPENMP */ - static void add_sources(DW_DTYPE *__restrict const wf, DW_DTYPE const *__restrict const f, int64_t const *__restrict const sources_i, diff --git a/src/deepwave/elastic.c b/src/deepwave/elastic.c index bb75b77..43dde83 100644 --- a/src/deepwave/elastic.c +++ b/src/deepwave/elastic.c @@ -9,9 +9,9 @@ VX SII | VX SII | VX sii SXY-VY--|-SXY-VY--|-SXY vy */ -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #include <omp.h> -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ #include <stdbool.h> #include <stdint.h> #include "common.h" @@ -66,7 +66,7 @@ SXY-VY--|-SXY-VY--|-SXY vy for (y = y_begin_y; y < y_end_y; ++y) {\ int64_t yi = y * nx;\ for (x = x_begin_y; x < x_end_y; ++x) {\ - int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, k;\ + int64_t i = yi + x, j, k;\ DW_DTYPE dsigmayydy = 0;\ DW_DTYPE dsigmaxydx = 0;\ \ @@ -118,15 +118,15 @@ SXY-VY--|-SXY-VY--|-SXY vy {\ DW_DTYPE buoyancyyhxh;\ if (pml_y == 2 && y == ny - 1) {\ - buoyancyyhxh = (buoyancy[i_noshot] + buoyancy[i_noshot + 1]) / 2;\ + buoyancyyhxh = (buoyancy[i] + buoyancy[i + 1]) / 2;\ } else {\ - buoyancyyhxh = (buoyancy[i_noshot] + buoyancy[i_noshot + 1] + buoyancy[i_noshot + nx] +\ - buoyancy[i_noshot + nx + 1]) /\ + buoyancyyhxh = (buoyancy[i] + buoyancy[i + 1] + buoyancy[i + nx] +\ + buoyancy[i + nx + 1]) /\ 4;\ }\ vy[i] += buoyancyyhxh * dt * (dsigmayydy + dsigmaxydx);\ if (buoyancy_requires_grad) {\ - dvydbuoyancy[store_i + i_noshot] = dt * (dsigmayydy + dsigmaxydx);\ + dvydbuoyancy[i] = dt * (dsigmayydy + dsigmaxydx);\ }\ }\ }\ @@ -135,7 +135,7 @@ SXY-VY--|-SXY-VY--|-SXY vy for (y = y_begin_x; y < y_end_x; ++y) {\ int64_t yi = y * nx;\ for (x = x_begin_x; x < 
x_end_x; ++x) {\ - int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, k;\ + int64_t i = yi + x, j, k;\ DW_DTYPE dsigmaxydy = 0;\ DW_DTYPE dsigmaxxdx = 0;\ \ @@ -184,9 +184,9 @@ SXY-VY--|-SXY-VY--|-SXY vy m_sigmaxxx[i] = ax[x] * m_sigmaxxx[i] + bx[x] * dsigmaxxdx;\ dsigmaxxdx += m_sigmaxxx[i];\ }\ - vx[i] += buoyancy[i_noshot] * dt * (dsigmaxxdx + dsigmaxydy);\ + vx[i] += buoyancy[i] * dt * (dsigmaxxdx + dsigmaxydy);\ if (buoyancy_requires_grad) {\ - dvxdbuoyancy[store_i + i_noshot] = dt * (dsigmaxxdx + dsigmaxydy);\ + dvxdbuoyancy[i] = dt * (dsigmaxxdx + dsigmaxydy);\ }\ }\ \ @@ -231,7 +231,7 @@ SXY-VY--|-SXY-VY--|-SXY vy for (y = y_begin_ii; y < y_end_ii; ++y) {\ int64_t yi = y * nx;\ for (x = x_begin_ii; x < x_end_ii; ++x) {\ - int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, k;\ + int64_t i = yi + x, j, k;\ DW_DTYPE dvydy = 0;\ DW_DTYPE dvxdx = 0;\ \ @@ -280,13 +280,13 @@ SXY-VY--|-SXY-VY--|-SXY vy dvxdx += m_vxx[i];\ }\ {\ - DW_DTYPE lambyxh = (lamb[i_noshot] + lamb[i_noshot + 1]) / 2;\ - DW_DTYPE muyxh = (mu[i_noshot] + mu[i_noshot + 1]) / 2;\ + DW_DTYPE lambyxh = (lamb[i] + lamb[i + 1]) / 2;\ + DW_DTYPE muyxh = (mu[i] + mu[i + 1]) / 2;\ sigmayy[i] += dt * ((lambyxh + 2 * muyxh) * dvydy + lambyxh * dvxdx);\ sigmaxx[i] += dt * ((lambyxh + 2 * muyxh) * dvxdx + lambyxh * dvydy);\ if (lamb_requires_grad || mu_requires_grad) {\ - dvydy_store[store_i + i_noshot] = dt * dvydy;\ - dvxdx_store[store_i + i_noshot] = dt * dvxdx;\ + dvydy_store[i] = dt * dvydy;\ + dvxdx_store[i] = dt * dvxdx;\ }\ }\ }\ @@ -295,7 +295,7 @@ SXY-VY--|-SXY-VY--|-SXY vy for (y = y_begin_xy; y < y_end_xy; ++y) {\ int64_t yi = y * nx;\ for (x = x_begin_xy; x < x_end_xy; ++x) {\ - int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, jp, k;\ + int64_t i = yi + x, j, jp, k;\ DW_DTYPE dvydx = 0;\ DW_DTYPE dvxdy = 0;\ \ @@ -421,10 +421,10 @@ SXY-VY--|-SXY-VY--|-SXY vy dvydx += m_vyx[i];\ }\ {\ - DW_DTYPE muyhx = (mu[i_noshot] + mu[i_noshot + nx]) / 2;\ + DW_DTYPE muyhx = 
(mu[i] + mu[i + nx]) / 2;\ sigmaxy[i] += dt * muyhx * (dvydx + dvxdy);\ if (mu_requires_grad) {\ - dvydxdvxdy_store[store_i + i_noshot] = dt * (dvydx + dvxdy);\ + dvydxdvxdy_store[i] = dt * (dvydx + dvxdy);\ }\ }\ }\ @@ -1385,6 +1385,109 @@ static void combine_grad_elastic(DW_DTYPE *__restrict const grad, } } +#ifdef _WIN32 +__declspec(noinline) +#else +__attribute__ ((noinline)) +#endif +static void forward_shot_v(DW_DTYPE const *__restrict const buoyancy, DW_DTYPE *__restrict const vy, DW_DTYPE *__restrict const vx, DW_DTYPE const *__restrict const sigmayy, + DW_DTYPE const *__restrict const sigmaxy, DW_DTYPE const *__restrict const sigmaxx, + DW_DTYPE *__restrict const m_sigmayyy, DW_DTYPE *__restrict const m_sigmaxyy, + DW_DTYPE *__restrict const m_sigmaxyx, DW_DTYPE *__restrict const m_sigmaxxx, + DW_DTYPE *__restrict const dvydbuoyancy, DW_DTYPE *__restrict const dvxdbuoyancy, + DW_DTYPE const *__restrict const ay, + DW_DTYPE const *__restrict const ayh, DW_DTYPE const *__restrict const ax, DW_DTYPE const *__restrict const axh, + DW_DTYPE const *__restrict const by, DW_DTYPE const *__restrict const byh, DW_DTYPE const *__restrict const bx, + DW_DTYPE const *__restrict const bxh, + DW_DTYPE const dt, int64_t const ny, int64_t const nx, + bool const buoyancy_requires_grad, int64_t const pml_y0, + int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1) { + int64_t y, x, y_begin_y, y_end_y, x_begin_y, x_end_y, y_begin_x, y_end_x, x_begin_x, x_end_x; + if (buoyancy_requires_grad) { + FORWARD_KERNEL_V(0, 0, 1) + FORWARD_KERNEL_V(0, 1, 1) + FORWARD_KERNEL_V(0, 2, 1) + FORWARD_KERNEL_V(1, 0, 1) + FORWARD_KERNEL_V(1, 1, 1) + FORWARD_KERNEL_V(1, 2, 1) + FORWARD_KERNEL_V(2, 0, 1) + FORWARD_KERNEL_V(2, 1, 1) + FORWARD_KERNEL_V(2, 2, 1) + } else { + FORWARD_KERNEL_V(0, 0, 0) + FORWARD_KERNEL_V(0, 1, 0) + FORWARD_KERNEL_V(0, 2, 0) + FORWARD_KERNEL_V(1, 0, 0) + FORWARD_KERNEL_V(1, 1, 0) + FORWARD_KERNEL_V(1, 2, 0) + FORWARD_KERNEL_V(2, 0, 0) + FORWARD_KERNEL_V(2, 
1, 0) + FORWARD_KERNEL_V(2, 2, 0) + } +} + +#ifdef _WIN32 +__declspec(noinline) +#else +__attribute__ ((noinline)) +#endif +static void forward_shot_sigma( + DW_DTYPE const *__restrict const lamb, + DW_DTYPE const *__restrict const mu, DW_DTYPE const *__restrict const vy, DW_DTYPE const *__restrict const vx, DW_DTYPE *__restrict const sigmayy, + DW_DTYPE *__restrict const sigmaxy, DW_DTYPE *__restrict const sigmaxx, DW_DTYPE *__restrict const m_vyy, + DW_DTYPE *__restrict const m_vyx, DW_DTYPE *__restrict const m_vxy, DW_DTYPE *__restrict const m_vxx, + DW_DTYPE *__restrict const dvydy_store, DW_DTYPE *__restrict const dvxdx_store, + DW_DTYPE *__restrict const dvydxdvxdy_store, DW_DTYPE const *__restrict const ay, DW_DTYPE const *__restrict const ayh, + DW_DTYPE const *__restrict const ax, DW_DTYPE const *__restrict const axh, DW_DTYPE const *__restrict const by, + DW_DTYPE const *__restrict const byh, DW_DTYPE const *__restrict const bx, DW_DTYPE const *__restrict const bxh, + DW_DTYPE const dt, int64_t const ny, int64_t const nx, + bool const lamb_requires_grad, bool const mu_requires_grad, int64_t const pml_y0, + int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1) { + int64_t y, x, y_begin_ii, y_end_ii, x_begin_ii, x_end_ii, y_begin_xy, y_end_xy, x_begin_xy, x_end_xy; + if (lamb_requires_grad && mu_requires_grad) { + FORWARD_KERNEL_SIGMA(0, 0, 1, 1) + FORWARD_KERNEL_SIGMA(0, 1, 1, 1) + FORWARD_KERNEL_SIGMA(0, 2, 1, 1) + FORWARD_KERNEL_SIGMA(1, 0, 1, 1) + FORWARD_KERNEL_SIGMA(1, 1, 1, 1) + FORWARD_KERNEL_SIGMA(1, 2, 1, 1) + FORWARD_KERNEL_SIGMA(2, 0, 1, 1) + FORWARD_KERNEL_SIGMA(2, 1, 1, 1) + FORWARD_KERNEL_SIGMA(2, 2, 1, 1) + } else if (lamb_requires_grad) { + FORWARD_KERNEL_SIGMA(0, 0, 1, 0) + FORWARD_KERNEL_SIGMA(0, 1, 1, 0) + FORWARD_KERNEL_SIGMA(0, 2, 1, 0) + FORWARD_KERNEL_SIGMA(1, 0, 1, 0) + FORWARD_KERNEL_SIGMA(1, 1, 1, 0) + FORWARD_KERNEL_SIGMA(1, 2, 1, 0) + FORWARD_KERNEL_SIGMA(2, 0, 1, 0) + FORWARD_KERNEL_SIGMA(2, 1, 1, 0) + 
FORWARD_KERNEL_SIGMA(2, 2, 1, 0) + } else if (mu_requires_grad) { + FORWARD_KERNEL_SIGMA(0, 0, 0, 1) + FORWARD_KERNEL_SIGMA(0, 1, 0, 1) + FORWARD_KERNEL_SIGMA(0, 2, 0, 1) + FORWARD_KERNEL_SIGMA(1, 0, 0, 1) + FORWARD_KERNEL_SIGMA(1, 1, 0, 1) + FORWARD_KERNEL_SIGMA(1, 2, 0, 1) + FORWARD_KERNEL_SIGMA(2, 0, 0, 1) + FORWARD_KERNEL_SIGMA(2, 1, 0, 1) + FORWARD_KERNEL_SIGMA(2, 2, 0, 1) + } else { + FORWARD_KERNEL_SIGMA(0, 0, 0, 0) + FORWARD_KERNEL_SIGMA(0, 1, 0, 0) + FORWARD_KERNEL_SIGMA(0, 2, 0, 0) + FORWARD_KERNEL_SIGMA(1, 0, 0, 0) + FORWARD_KERNEL_SIGMA(1, 1, 0, 0) + FORWARD_KERNEL_SIGMA(1, 2, 0, 0) + FORWARD_KERNEL_SIGMA(2, 0, 0, 0) + FORWARD_KERNEL_SIGMA(2, 1, 0, 0) + FORWARD_KERNEL_SIGMA(2, 2, 0, 0) + } +} + + static void backward_shot(DW_DTYPE const *__restrict const lamb, DW_DTYPE const *__restrict const mu, DW_DTYPE const *__restrict const buoyancy, DW_DTYPE const *__restrict const grad_r_y, DW_DTYPE const *__restrict const grad_r_x, DW_DTYPE const *__restrict const grad_r_p, @@ -1562,8 +1665,6 @@ static void set_fd_coeffs_x(DW_DTYPE const dx) { } } - - #ifdef _WIN32 __declspec(dllexport) #endif @@ -1597,9 +1698,9 @@ __declspec(dllexport) int64_t shot; set_fd_coeffs_y(dy); set_fd_coeffs_x(dx); -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #pragma omp parallel for num_threads(n_threads) -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ for (shot = 0; shot < n_shots; ++shot) { int64_t const si = shot * ny * nx; int64_t const siy = shot * n_sources_y_per_shot; @@ -1607,7 +1708,7 @@ __declspec(dllexport) int64_t const riy = shot * n_receivers_y_per_shot; int64_t const rix = shot * n_receivers_x_per_shot; int64_t const rip = shot * n_receivers_p_per_shot; - int64_t t, y, x, y_begin_y, y_end_y, x_begin_y, x_end_y, y_begin_x, y_end_x, x_begin_x, x_end_x, y_begin_ii, y_end_ii, x_begin_ii, x_end_ii, y_begin_xy, y_end_xy, x_begin_xy, x_end_xy; + int64_t t; for (t = 0; t < nt; ++t) { int64_t store_i = shot * (nt / step_ratio) * ny * nx + (t / step_ratio) * ny * nx; @@ -1623,27 
+1724,18 @@ __declspec(dllexport) record_pressure_receivers(r_p + rip * nt + t * n_receivers_p_per_shot, sigmayy + si, sigmaxx + si, receivers_p_i + rip, n_receivers_p_per_shot); } - if (buoyancy_requires_grad && ((t % step_ratio) == 0)) { - FORWARD_KERNEL_V(0, 0, 1) - FORWARD_KERNEL_V(0, 1, 1) - FORWARD_KERNEL_V(0, 2, 1) - FORWARD_KERNEL_V(1, 0, 1) - FORWARD_KERNEL_V(1, 1, 1) - FORWARD_KERNEL_V(1, 2, 1) - FORWARD_KERNEL_V(2, 0, 1) - FORWARD_KERNEL_V(2, 1, 1) - FORWARD_KERNEL_V(2, 2, 1) - } else { - FORWARD_KERNEL_V(0, 0, 0) - FORWARD_KERNEL_V(0, 1, 0) - FORWARD_KERNEL_V(0, 2, 0) - FORWARD_KERNEL_V(1, 0, 0) - FORWARD_KERNEL_V(1, 1, 0) - FORWARD_KERNEL_V(1, 2, 0) - FORWARD_KERNEL_V(2, 0, 0) - FORWARD_KERNEL_V(2, 1, 0) - FORWARD_KERNEL_V(2, 2, 0) - } + forward_shot_v(buoyancy, vy + si, vx + si, sigmayy + si, + sigmaxy + si, sigmaxx + si, + m_sigmayyy + si, m_sigmaxyy + si, + m_sigmaxyx + si, m_sigmaxxx + si, + dvydbuoyancy + store_i, dvxdbuoyancy + store_i, + ay, + ayh, ax, axh, + by, byh, bx, + bxh, + dt, ny, nx, + buoyancy_requires_grad && ((t % step_ratio) == 0), pml_y0, + pml_y1, pml_x0, pml_x1); if (n_sources_y_per_shot > 0) { add_sources(vy + si, f_y + siy * nt + t * n_sources_y_per_shot, sources_y_i + siy, n_sources_y_per_shot); @@ -1652,47 +1744,18 @@ __declspec(dllexport) add_sources(vx + si, f_x + six * nt + t * n_sources_x_per_shot, sources_x_i + six, n_sources_x_per_shot); } - if (lamb_requires_grad && mu_requires_grad && ((t % step_ratio) == 0)) { - FORWARD_KERNEL_SIGMA(0, 0, 1, 1) - FORWARD_KERNEL_SIGMA(0, 1, 1, 1) - FORWARD_KERNEL_SIGMA(0, 2, 1, 1) - FORWARD_KERNEL_SIGMA(1, 0, 1, 1) - FORWARD_KERNEL_SIGMA(1, 1, 1, 1) - FORWARD_KERNEL_SIGMA(1, 2, 1, 1) - FORWARD_KERNEL_SIGMA(2, 0, 1, 1) - FORWARD_KERNEL_SIGMA(2, 1, 1, 1) - FORWARD_KERNEL_SIGMA(2, 2, 1, 1) - } else if (lamb_requires_grad && ((t % step_ratio) == 0)) { - FORWARD_KERNEL_SIGMA(0, 0, 1, 0) - FORWARD_KERNEL_SIGMA(0, 1, 1, 0) - FORWARD_KERNEL_SIGMA(0, 2, 1, 0) - FORWARD_KERNEL_SIGMA(1, 0, 1, 0) 
- FORWARD_KERNEL_SIGMA(1, 1, 1, 0) - FORWARD_KERNEL_SIGMA(1, 2, 1, 0) - FORWARD_KERNEL_SIGMA(2, 0, 1, 0) - FORWARD_KERNEL_SIGMA(2, 1, 1, 0) - FORWARD_KERNEL_SIGMA(2, 2, 1, 0) - } else if (mu_requires_grad && ((t % step_ratio) == 0)) { - FORWARD_KERNEL_SIGMA(0, 0, 0, 1) - FORWARD_KERNEL_SIGMA(0, 1, 0, 1) - FORWARD_KERNEL_SIGMA(0, 2, 0, 1) - FORWARD_KERNEL_SIGMA(1, 0, 0, 1) - FORWARD_KERNEL_SIGMA(1, 1, 0, 1) - FORWARD_KERNEL_SIGMA(1, 2, 0, 1) - FORWARD_KERNEL_SIGMA(2, 0, 0, 1) - FORWARD_KERNEL_SIGMA(2, 1, 0, 1) - FORWARD_KERNEL_SIGMA(2, 2, 0, 1) - } else { - FORWARD_KERNEL_SIGMA(0, 0, 0, 0) - FORWARD_KERNEL_SIGMA(0, 1, 0, 0) - FORWARD_KERNEL_SIGMA(0, 2, 0, 0) - FORWARD_KERNEL_SIGMA(1, 0, 0, 0) - FORWARD_KERNEL_SIGMA(1, 1, 0, 0) - FORWARD_KERNEL_SIGMA(1, 2, 0, 0) - FORWARD_KERNEL_SIGMA(2, 0, 0, 0) - FORWARD_KERNEL_SIGMA(2, 1, 0, 0) - FORWARD_KERNEL_SIGMA(2, 2, 0, 0) - } +forward_shot_sigma( + lamb, + mu, vy + si, vx + si, sigmayy + si, + sigmaxy + si, sigmaxx + si, m_vyy + si, + m_vyx + si, m_vxy + si, m_vxx + si, + dvydy_store + store_i, dvxdx_store + store_i, + dvydxdvxdy_store + store_i, ay, ayh, + ax, axh, by, + byh, bx, bxh, + dt, ny, nx, + lamb_requires_grad && ((t % step_ratio) == 0), mu_requires_grad && ((t % step_ratio) == 0), pml_y0, + pml_y1, pml_x0, pml_x1); } if (n_receivers_y_per_shot > 0) { record_receivers(r_y + riy * (nt + 1) + t * n_receivers_y_per_shot, vy + si, receivers_y_i + riy, @@ -1739,9 +1802,9 @@ __declspec(dllexport) int64_t shot; set_fd_coeffs_y(dy); set_fd_coeffs_x(dx); -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #pragma omp parallel for num_threads(n_threads) -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ for (shot = 0; shot < n_shots; ++shot) { int64_t const si = shot * ny * nx; int64_t const siy = shot * n_sources_y_per_shot; @@ -1749,11 +1812,11 @@ __declspec(dllexport) int64_t const riy = shot * n_receivers_y_per_shot; int64_t const rix = shot * n_receivers_x_per_shot; int64_t const rip = shot * n_receivers_p_per_shot; -#ifdef 
DW_USE_OPENMP +#ifdef _OPENMP int64_t const threadi = omp_get_thread_num() * ny * nx; #else int64_t const threadi = 0; -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ int64_t t; if (n_receivers_y_per_shot > 0 && nt > 0) { add_sources(vy + si, grad_r_y + riy * (nt+1) + nt * n_receivers_y_per_shot, receivers_y_i + riy, @@ -1826,7 +1889,7 @@ __declspec(dllexport) } } } -#ifdef DW_USE_OPENMP +#ifdef _OPENMP if (lamb_requires_grad && n_threads > 1) { combine_grad_elastic(grad_lamb, grad_lamb_thread, n_threads, ny, nx); } @@ -1836,5 +1899,5 @@ __declspec(dllexport) if (buoyancy_requires_grad && n_threads > 1) { combine_grad_elastic(grad_buoyancy, grad_buoyancy_thread, n_threads, ny, nx); } -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ } diff --git a/src/deepwave/scalar.c b/src/deepwave/scalar.c index 887e4f6..f975de0 100644 --- a/src/deepwave/scalar.c +++ b/src/deepwave/scalar.c @@ -17,9 +17,9 @@ * backward. */ -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #include <omp.h> -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ #include <stdbool.h> #include <stdint.h> #include "common.h" @@ -241,9 +241,9 @@ __declspec(dllexport) bool const v_requires_grad, int64_t const pml_y0, int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1, int64_t const n_threads) { int64_t shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #pragma omp parallel for num_threads(n_threads) -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ for (shot = 0; shot < n_shots; ++shot) { int64_t const i = shot * ny * nx; int64_t const si = shot * n_sources_per_shot; @@ -308,18 +308,18 @@ __declspec(dllexport) int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1, int64_t const n_threads) { int64_t shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #pragma omp parallel for num_threads(n_threads) -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ for (shot = 0; shot < n_shots; ++shot) { int64_t const i = shot * ny * nx; int64_t const si = shot * n_sources_per_shot; int64_t const ri = shot * n_receivers_per_shot; -#ifdef DW_USE_OPENMP 
+#ifdef _OPENMP int64_t const threadi = omp_get_thread_num() * ny * nx; #else int64_t const threadi = 0; -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ int64_t t; for (t = nt - 1; t >= 0; --t) { if ((nt - 1 - t) & 1) { @@ -351,9 +351,9 @@ __declspec(dllexport) } } } -#ifdef DW_USE_OPENMP +#ifdef _OPENMP if (v_requires_grad && n_threads > 1) { combine_grad(grad_v, grad_v_thread, n_threads, ny, nx); } -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ } diff --git a/src/deepwave/scalar_born.c b/src/deepwave/scalar_born.c index 16320f0..eefd0da 100644 --- a/src/deepwave/scalar_born.c +++ b/src/deepwave/scalar_born.c @@ -17,9 +17,9 @@ * backward. */ -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #include <omp.h> -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ #include <stdbool.h> #include <stdint.h> #include "common.h" @@ -477,9 +477,9 @@ __declspec(dllexport) bool const v_requires_grad, bool const scatter_requires_grad, int64_t const pml_y0, int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1, int64_t const n_threads) { int64_t shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #pragma omp parallel for num_threads(n_threads) -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ for (shot = 0; shot < n_shots; ++shot) { int64_t const i = shot * ny * nx; int64_t const si = shot * n_sources_per_shot; @@ -569,20 +569,20 @@ __declspec(dllexport) int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1, int64_t const n_threads) { int64_t shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #pragma omp parallel for num_threads(n_threads) -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ for (shot = 0; shot < n_shots; ++shot) { int64_t const i = shot * ny * nx; int64_t const si = shot * n_sources_per_shot; int64_t const sisc = shot * n_sourcessc_per_shot; int64_t const ri = shot * n_receivers_per_shot; int64_t const risc = shot * n_receiverssc_per_shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP int64_t const threadi = omp_get_thread_num() * ny * nx; #else int64_t const threadi = 0; -#endif /* DW_USE_OPENMP */ 
+#endif /* _OPENMP */ int64_t t; for (t = nt - 1; t >= 0; --t) { if ((nt - 1 - t) & 1) { @@ -631,14 +631,14 @@ __declspec(dllexport) } } } -#ifdef DW_USE_OPENMP +#ifdef _OPENMP if (v_requires_grad && n_threads > 1) { combine_grad(grad_v, grad_v_thread, n_threads, ny, nx); } if (scatter_requires_grad && n_threads > 1) { combine_grad(grad_scatter, grad_scatter_thread, n_threads, ny, nx); } -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ } #ifdef _WIN32 @@ -670,18 +670,18 @@ __declspec(dllexport) int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1, int64_t const n_threads) { int64_t shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP #pragma omp parallel for num_threads(n_threads) -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ for (shot = 0; shot < n_shots; ++shot) { int64_t const i = shot * ny * nx; int64_t const sisc = shot * n_sourcessc_per_shot; int64_t const risc = shot * n_receiverssc_per_shot; -#ifdef DW_USE_OPENMP +#ifdef _OPENMP int64_t const threadi = omp_get_thread_num() * ny * nx; #else int64_t const threadi = 0; -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ int64_t t; for (t = nt - 1; t >= 0; --t) { if ((nt - 1 - t) & 1) { @@ -715,9 +715,9 @@ __declspec(dllexport) } } } -#ifdef DW_USE_OPENMP +#ifdef _OPENMP if (scatter_requires_grad && n_threads > 1) { combine_grad(grad_scatter, grad_scatter_thread, n_threads, ny, nx); } -#endif /* DW_USE_OPENMP */ +#endif /* _OPENMP */ }