From e5c54c765a548662cb5a3c06caa08d496b4fbb51 Mon Sep 17 00:00:00 2001
From: Alan Richardson <arichar@tcd.ie>
Date: Fri, 21 Jul 2023 17:39:48 +0100
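Subject: [PATCH] move forward kernels in elastic.c into separate functions so
 that they are vectorized even when using OpenMP

The FORWARD_KERNEL_V and FORWARD_KERNEL_SIGMA expansions move out of the
OpenMP-parallel shot loop in elastic.c into two new noinline helpers,
forward_shot_v() and forward_shot_sigma(). The caller now passes pointers
that are already offset to the current shot (e.g. vy + si) and to the
current storage slot (e.g. dvydbuoyancy + store_i), so the kernel macros
index with i = y * nx + x and no longer add shot * ny * nx or store_i
themselves.

Also test the standard _OPENMP macro in place of the project-specific
DW_USE_OPENMP in elastic.c, scalar.c and scalar_born.c, drop
-DDW_USE_OPENMP from build_linux.sh and build_macos.sh, and remove the
commented-out dw_use_openmp block from common_cpu.h.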

---
 src/deepwave/build_linux.sh |   2 +-
 src/deepwave/build_macos.sh |   2 +-
 src/deepwave/common_cpu.h   |   6 -
 src/deepwave/elastic.c      | 245 ++++++++++++++++++++++--------------
 src/deepwave/scalar.c       |  20 +--
 src/deepwave/scalar_born.c  |  32 ++---
 6 files changed, 182 insertions(+), 125 deletions(-)

diff --git a/src/deepwave/build_linux.sh b/src/deepwave/build_linux.sh
index 6e5efcf..c0f35c2 100755
--- a/src/deepwave/build_linux.sh
+++ b/src/deepwave/build_linux.sh
@@ -3,7 +3,7 @@
 set -e
 
 DW_OMP_NAME=libgomp.so.1
-CFLAGS="-Wall -Wextra -pedantic -DDW_USE_OPENMP -fPIC -fopenmp -Ofast -mavx2"
+CFLAGS="-Wall -Wextra -pedantic -fPIC -fopenmp -Ofast -mavx2"
 CUDAFLAGS="--restrict --use_fast_math -O3 -gencode=arch=compute_52,code=sm_52, -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80  --compiler-options -fPIC"
 gcc $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o
 gcc $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o
diff --git a/src/deepwave/build_macos.sh b/src/deepwave/build_macos.sh
index db3f959..083a391 100755
--- a/src/deepwave/build_macos.sh
+++ b/src/deepwave/build_macos.sh
@@ -3,7 +3,7 @@
 set -e
 
 DW_OMP_NAME=iomp5
-CFLAGS="-Wall -Wextra -pedantic -DDW_USE_OPENMP -fPIC -Ofast -Xpreprocessor -fopenmp -I`brew --prefix libomp`/include"
+CFLAGS="-Wall -Wextra -pedantic -fPIC -Ofast -Xpreprocessor -fopenmp -I`brew --prefix libomp`/include"
 clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o
 clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o
 clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_6_float.o
diff --git a/src/deepwave/common_cpu.h b/src/deepwave/common_cpu.h
index 07f794b..30478c9 100644
--- a/src/deepwave/common_cpu.h
+++ b/src/deepwave/common_cpu.h
@@ -1,12 +1,6 @@
 #ifndef DW_COMMON_CPU_H
 #define DW_COMMON_CPU_H
 
-//#ifdef DW_USE_OPENMP
-//int dw_use_openmp = 1;
-//#else
-//int dw_use_openmp = 0;
-//#endif /* DW_USE_OPENMP */
-
 static void add_sources(DW_DTYPE *__restrict const wf,
                         DW_DTYPE const *__restrict const f,
                         int64_t const *__restrict const sources_i,
diff --git a/src/deepwave/elastic.c b/src/deepwave/elastic.c
index bb75b77..43dde83 100644
--- a/src/deepwave/elastic.c
+++ b/src/deepwave/elastic.c
@@ -9,9 +9,9 @@ VX  SII | VX  SII | VX  sii
 SXY-VY--|-SXY-VY--|-SXY vy
 */
 
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
 #include <omp.h>
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
 #include <stdbool.h>
 #include <stdint.h>
 #include "common.h"
@@ -66,7 +66,7 @@ SXY-VY--|-SXY-VY--|-SXY vy
   for (y = y_begin_y; y < y_end_y; ++y) {\
     int64_t yi = y * nx;\
     for (x = x_begin_y; x < x_end_y; ++x) {\
-      int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, k;\
+      int64_t i = yi + x, j, k;\
       DW_DTYPE dsigmayydy = 0;\
       DW_DTYPE dsigmaxydx = 0;\
 \
@@ -118,15 +118,15 @@ SXY-VY--|-SXY-VY--|-SXY vy
       {\
       DW_DTYPE buoyancyyhxh;\
       if (pml_y == 2 && y == ny - 1) {\
-        buoyancyyhxh = (buoyancy[i_noshot] + buoyancy[i_noshot + 1]) / 2;\
+        buoyancyyhxh = (buoyancy[i] + buoyancy[i + 1]) / 2;\
       } else {\
-        buoyancyyhxh = (buoyancy[i_noshot] + buoyancy[i_noshot + 1] + buoyancy[i_noshot + nx] +\
-                        buoyancy[i_noshot + nx + 1]) /\
+        buoyancyyhxh = (buoyancy[i] + buoyancy[i + 1] + buoyancy[i + nx] +\
+                        buoyancy[i + nx + 1]) /\
                        4;\
       }\
       vy[i] += buoyancyyhxh * dt * (dsigmayydy + dsigmaxydx);\
       if (buoyancy_requires_grad) {\
-        dvydbuoyancy[store_i + i_noshot] = dt * (dsigmayydy + dsigmaxydx);\
+        dvydbuoyancy[i] = dt * (dsigmayydy + dsigmaxydx);\
       }\
       }\
     }\
@@ -135,7 +135,7 @@ SXY-VY--|-SXY-VY--|-SXY vy
   for (y = y_begin_x; y < y_end_x; ++y) {\
     int64_t yi = y * nx;\
     for (x = x_begin_x; x < x_end_x; ++x) {\
-      int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, k;\
+      int64_t i = yi + x, j, k;\
       DW_DTYPE dsigmaxydy = 0;\
       DW_DTYPE dsigmaxxdx = 0;\
 \
@@ -184,9 +184,9 @@ SXY-VY--|-SXY-VY--|-SXY vy
         m_sigmaxxx[i] = ax[x] * m_sigmaxxx[i] + bx[x] * dsigmaxxdx;\
         dsigmaxxdx += m_sigmaxxx[i];\
       }\
-      vx[i] += buoyancy[i_noshot] * dt * (dsigmaxxdx + dsigmaxydy);\
+      vx[i] += buoyancy[i] * dt * (dsigmaxxdx + dsigmaxydy);\
       if (buoyancy_requires_grad) {\
-        dvxdbuoyancy[store_i + i_noshot] = dt * (dsigmaxxdx + dsigmaxydy);\
+        dvxdbuoyancy[i] = dt * (dsigmaxxdx + dsigmaxydy);\
       }\
     }\
 \
@@ -231,7 +231,7 @@ SXY-VY--|-SXY-VY--|-SXY vy
   for (y = y_begin_ii; y < y_end_ii; ++y) {\
     int64_t yi = y * nx;\
     for (x = x_begin_ii; x < x_end_ii; ++x) {\
-      int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, k;\
+      int64_t i = yi + x, j, k;\
       DW_DTYPE dvydy = 0;\
       DW_DTYPE dvxdx = 0;\
 \
@@ -280,13 +280,13 @@ SXY-VY--|-SXY-VY--|-SXY vy
         dvxdx += m_vxx[i];\
       }\
       {\
-      DW_DTYPE lambyxh = (lamb[i_noshot] + lamb[i_noshot + 1]) / 2;\
-      DW_DTYPE muyxh = (mu[i_noshot] + mu[i_noshot + 1]) / 2;\
+      DW_DTYPE lambyxh = (lamb[i] + lamb[i + 1]) / 2;\
+      DW_DTYPE muyxh = (mu[i] + mu[i + 1]) / 2;\
       sigmayy[i] += dt * ((lambyxh + 2 * muyxh) * dvydy + lambyxh * dvxdx);\
       sigmaxx[i] += dt * ((lambyxh + 2 * muyxh) * dvxdx + lambyxh * dvydy);\
       if (lamb_requires_grad || mu_requires_grad) {\
-        dvydy_store[store_i + i_noshot] = dt * dvydy;\
-        dvxdx_store[store_i + i_noshot] = dt * dvxdx;\
+        dvydy_store[i] = dt * dvydy;\
+        dvxdx_store[i] = dt * dvxdx;\
       }\
       }\
     }\
@@ -295,7 +295,7 @@ SXY-VY--|-SXY-VY--|-SXY vy
   for (y = y_begin_xy; y < y_end_xy; ++y) {\
     int64_t yi = y * nx;\
     for (x = x_begin_xy; x < x_end_xy; ++x) {\
-      int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, jp, k;\
+      int64_t i = yi + x, j, jp, k;\
       DW_DTYPE dvydx = 0;\
       DW_DTYPE dvxdy = 0;\
 \
@@ -421,10 +421,10 @@ SXY-VY--|-SXY-VY--|-SXY vy
         dvydx += m_vyx[i];\
       }\
       {\
-      DW_DTYPE muyhx = (mu[i_noshot] + mu[i_noshot + nx]) / 2;\
+      DW_DTYPE muyhx = (mu[i] + mu[i + nx]) / 2;\
       sigmaxy[i] += dt * muyhx * (dvydx + dvxdy);\
       if (mu_requires_grad) {\
-        dvydxdvxdy_store[store_i + i_noshot] = dt * (dvydx + dvxdy);\
+        dvydxdvxdy_store[i] = dt * (dvydx + dvxdy);\
       }\
       }\
     }\
@@ -1385,6 +1385,109 @@ static void combine_grad_elastic(DW_DTYPE *__restrict const grad,
   }
 }
 
+#ifdef _WIN32 /* choose the compiler-specific spelling of "never inline" */
+__declspec(noinline)
+#else
+__attribute__ ((noinline)) /* keep this a real, separate function rather than letting it be inlined into the caller */
+#endif /* forward_shot_v: expands the FORWARD_KERNEL_V velocity kernels (vy[i] += ..., vx[i] += ...) on the arrays it is handed */
+static void forward_shot_v(DW_DTYPE const *__restrict const buoyancy, DW_DTYPE *__restrict const vy, DW_DTYPE *__restrict const vx, DW_DTYPE const *__restrict const sigmayy, /* buoyancy is read at i = y * nx + x; vy and vx are updated in place */
+    DW_DTYPE const *__restrict const sigmaxy, DW_DTYPE const *__restrict const sigmaxx, /* the three stress fields are const here: read-only inputs */
+    DW_DTYPE *__restrict const m_sigmayyy, DW_DTYPE *__restrict const m_sigmaxyy,
+    DW_DTYPE *__restrict const m_sigmaxyx, DW_DTYPE *__restrict const m_sigmaxxx, /* m_sigma* are non-const; the kernel rewrites them, e.g. m_sigmaxxx[i] = ax[x] * m_sigmaxxx[i] + bx[x] * dsigmaxxdx */
+    DW_DTYPE *__restrict const dvydbuoyancy, DW_DTYPE *__restrict const dvxdbuoyancy, /* outputs of the kernel's `if (buoyancy_requires_grad)` stores: dt * (sum of the two stress derivatives) */
+    DW_DTYPE const *__restrict const ay,
+    DW_DTYPE const *__restrict const ayh, DW_DTYPE const *__restrict const ax, DW_DTYPE const *__restrict const axh,
+    DW_DTYPE const *__restrict const by, DW_DTYPE const *__restrict const byh, DW_DTYPE const *__restrict const bx,
+    DW_DTYPE const *__restrict const bxh, /* ax / bx are indexed by coordinate (ax[x], bx[x]); the other profiles are presumably analogous - TODO confirm */
+    DW_DTYPE const dt, int64_t const ny, int64_t const nx, /* dt scales every update; nx is the row stride (i = y * nx + x) */
+    bool const buoyancy_requires_grad, int64_t const pml_y0,
+      int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1) { /* NOTE(review): pml_* are not referenced in the visible kernel lines; presumably they set the loop bounds inside the macro - confirm */
+      int64_t y, x, y_begin_y, y_end_y, x_begin_y, x_end_y, y_begin_x, y_end_x, x_begin_x, x_end_x; /* loop indices and loop bounds that the FORWARD_KERNEL_V expansion refers to by name */
+        if (buoyancy_requires_grad) { /* last macro argument 1: variants used when dvydbuoyancy / dvxdbuoyancy must be stored */
+          FORWARD_KERNEL_V(0, 0, 1) /* first two arguments run over all nine (0..2, 0..2) pairs; the macro tests `pml_y == 2`, so they look like region selectors - TODO confirm */
+          FORWARD_KERNEL_V(0, 1, 1)
+          FORWARD_KERNEL_V(0, 2, 1)
+          FORWARD_KERNEL_V(1, 0, 1)
+          FORWARD_KERNEL_V(1, 1, 1)
+          FORWARD_KERNEL_V(1, 2, 1)
+          FORWARD_KERNEL_V(2, 0, 1)
+          FORWARD_KERNEL_V(2, 1, 1)
+          FORWARD_KERNEL_V(2, 2, 1)
+        } else { /* last macro argument 0: the same nine kernels for the case where no gradient terms are requested */
+          FORWARD_KERNEL_V(0, 0, 0)
+          FORWARD_KERNEL_V(0, 1, 0)
+          FORWARD_KERNEL_V(0, 2, 0)
+          FORWARD_KERNEL_V(1, 0, 0)
+          FORWARD_KERNEL_V(1, 1, 0)
+          FORWARD_KERNEL_V(1, 2, 0)
+          FORWARD_KERNEL_V(2, 0, 0)
+          FORWARD_KERNEL_V(2, 1, 0)
+          FORWARD_KERNEL_V(2, 2, 0)
+        }
+}
+
+#ifdef _WIN32 /* choose the compiler-specific spelling of "never inline" */
+__declspec(noinline)
+#else
+__attribute__ ((noinline)) /* keep this a real, separate function rather than letting it be inlined into the caller */
+#endif /* forward_shot_sigma: expands the FORWARD_KERNEL_SIGMA stress kernels (sigmayy[i] += ..., sigmaxx[i] += ..., sigmaxy[i] += ...) on the arrays it is handed */
+static void forward_shot_sigma(
+    DW_DTYPE const *__restrict const lamb, /* read at i = y * nx + x and averaged with the i + 1 neighbour */
+    DW_DTYPE const *__restrict const mu, DW_DTYPE const *__restrict const vy, DW_DTYPE const *__restrict const vx, DW_DTYPE *__restrict const sigmayy, /* mu is averaged with the i + 1 or i + nx neighbour; vy / vx are const here: read-only inputs */
+    DW_DTYPE *__restrict const sigmaxy, DW_DTYPE *__restrict const sigmaxx, DW_DTYPE *__restrict const m_vyy, /* the three stress fields are updated in place */
+    DW_DTYPE *__restrict const m_vyx, DW_DTYPE *__restrict const m_vxy, DW_DTYPE *__restrict const m_vxx, /* m_v* are non-const; the kernel adds them into the derivatives (e.g. dvxdx += m_vxx[i]) */
+    DW_DTYPE *__restrict const dvydy_store, DW_DTYPE *__restrict const dvxdx_store, /* stored under the kernel's `lamb_requires_grad || mu_requires_grad` test: dt * dvydy and dt * dvxdx */
+    DW_DTYPE *__restrict const dvydxdvxdy_store, DW_DTYPE const *__restrict const ay, DW_DTYPE const *__restrict const ayh, /* dvydxdvxdy_store is stored under the kernel's `mu_requires_grad` test: dt * (dvydx + dvxdy) */
+    DW_DTYPE const *__restrict const ax, DW_DTYPE const *__restrict const axh, DW_DTYPE const *__restrict const by,
+    DW_DTYPE const *__restrict const byh, DW_DTYPE const *__restrict const bx, DW_DTYPE const *__restrict const bxh, /* NOTE(review): use of the a* / b* profiles is not visible in this hunk's kernel lines - confirm against the macro */
+    DW_DTYPE const dt, int64_t const ny, int64_t const nx, /* dt scales every update; nx is the row stride (i = y * nx + x) */
+    bool const lamb_requires_grad, bool const mu_requires_grad, int64_t const pml_y0,
+      int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1) { /* NOTE(review): pml_* are not referenced in the visible kernel lines; presumably they set the loop bounds inside the macro - confirm */
+      int64_t y, x, y_begin_ii, y_end_ii, x_begin_ii, x_end_ii, y_begin_xy, y_end_xy, x_begin_xy, x_end_xy; /* loop indices and loop bounds that the FORWARD_KERNEL_SIGMA expansion refers to by name */
+        if (lamb_requires_grad && mu_requires_grad) { /* last two macro arguments (1, 1): both gradient flags set */
+            FORWARD_KERNEL_SIGMA(0, 0, 1, 1) /* first two arguments run over all nine (0..2, 0..2) pairs; they look like region selectors - TODO confirm */
+            FORWARD_KERNEL_SIGMA(0, 1, 1, 1)
+            FORWARD_KERNEL_SIGMA(0, 2, 1, 1)
+            FORWARD_KERNEL_SIGMA(1, 0, 1, 1)
+            FORWARD_KERNEL_SIGMA(1, 1, 1, 1)
+            FORWARD_KERNEL_SIGMA(1, 2, 1, 1)
+            FORWARD_KERNEL_SIGMA(2, 0, 1, 1)
+            FORWARD_KERNEL_SIGMA(2, 1, 1, 1)
+            FORWARD_KERNEL_SIGMA(2, 2, 1, 1)
+          } else if (lamb_requires_grad) { /* (1, 0): only the lamb flag set */
+            FORWARD_KERNEL_SIGMA(0, 0, 1, 0)
+            FORWARD_KERNEL_SIGMA(0, 1, 1, 0)
+            FORWARD_KERNEL_SIGMA(0, 2, 1, 0)
+            FORWARD_KERNEL_SIGMA(1, 0, 1, 0)
+            FORWARD_KERNEL_SIGMA(1, 1, 1, 0)
+            FORWARD_KERNEL_SIGMA(1, 2, 1, 0)
+            FORWARD_KERNEL_SIGMA(2, 0, 1, 0)
+            FORWARD_KERNEL_SIGMA(2, 1, 1, 0)
+            FORWARD_KERNEL_SIGMA(2, 2, 1, 0)
+          } else if (mu_requires_grad) { /* (0, 1): only the mu flag set */
+            FORWARD_KERNEL_SIGMA(0, 0, 0, 1)
+            FORWARD_KERNEL_SIGMA(0, 1, 0, 1)
+            FORWARD_KERNEL_SIGMA(0, 2, 0, 1)
+            FORWARD_KERNEL_SIGMA(1, 0, 0, 1)
+            FORWARD_KERNEL_SIGMA(1, 1, 0, 1)
+            FORWARD_KERNEL_SIGMA(1, 2, 0, 1)
+            FORWARD_KERNEL_SIGMA(2, 0, 0, 1)
+            FORWARD_KERNEL_SIGMA(2, 1, 0, 1)
+            FORWARD_KERNEL_SIGMA(2, 2, 0, 1)
+        } else { /* (0, 0): neither flag set, so this is the plain stress update */
+          FORWARD_KERNEL_SIGMA(0, 0, 0, 0)
+          FORWARD_KERNEL_SIGMA(0, 1, 0, 0)
+          FORWARD_KERNEL_SIGMA(0, 2, 0, 0)
+          FORWARD_KERNEL_SIGMA(1, 0, 0, 0)
+          FORWARD_KERNEL_SIGMA(1, 1, 0, 0)
+          FORWARD_KERNEL_SIGMA(1, 2, 0, 0)
+          FORWARD_KERNEL_SIGMA(2, 0, 0, 0)
+          FORWARD_KERNEL_SIGMA(2, 1, 0, 0)
+          FORWARD_KERNEL_SIGMA(2, 2, 0, 0)
+        }
+}
+
+
 static void backward_shot(DW_DTYPE const *__restrict const lamb,
       DW_DTYPE const *__restrict const mu, DW_DTYPE const *__restrict const buoyancy,
       DW_DTYPE const *__restrict const grad_r_y, DW_DTYPE const *__restrict const grad_r_x, DW_DTYPE const *__restrict const grad_r_p,
@@ -1562,8 +1665,6 @@ static void set_fd_coeffs_x(DW_DTYPE const dx) {
   }
 }
 
-
-
 #ifdef _WIN32
 __declspec(dllexport)
 #endif
@@ -1597,9 +1698,9 @@ __declspec(dllexport)
     int64_t shot;
     set_fd_coeffs_y(dy);
     set_fd_coeffs_x(dx);
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   #pragma omp parallel for num_threads(n_threads)
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
     for (shot = 0; shot < n_shots; ++shot) {
       int64_t const si = shot * ny * nx;
       int64_t const siy = shot * n_sources_y_per_shot;
@@ -1607,7 +1708,7 @@ __declspec(dllexport)
       int64_t const riy = shot * n_receivers_y_per_shot;
       int64_t const rix = shot * n_receivers_x_per_shot;
       int64_t const rip = shot * n_receivers_p_per_shot;
-      int64_t t, y, x, y_begin_y, y_end_y, x_begin_y, x_end_y, y_begin_x, y_end_x, x_begin_x, x_end_x, y_begin_ii, y_end_ii, x_begin_ii, x_end_ii, y_begin_xy, y_end_xy, x_begin_xy, x_end_xy;
+      int64_t t;
 
       for (t = 0; t < nt; ++t) {
         int64_t store_i = shot * (nt / step_ratio) * ny * nx + (t / step_ratio) * ny * nx;
@@ -1623,27 +1724,18 @@ __declspec(dllexport)
           record_pressure_receivers(r_p + rip * nt + t * n_receivers_p_per_shot, sigmayy + si,
                                     sigmaxx + si, receivers_p_i + rip, n_receivers_p_per_shot);
         }
-        if (buoyancy_requires_grad && ((t % step_ratio) == 0)) {
-          FORWARD_KERNEL_V(0, 0, 1)
-          FORWARD_KERNEL_V(0, 1, 1)
-          FORWARD_KERNEL_V(0, 2, 1)
-          FORWARD_KERNEL_V(1, 0, 1)
-          FORWARD_KERNEL_V(1, 1, 1)
-          FORWARD_KERNEL_V(1, 2, 1)
-          FORWARD_KERNEL_V(2, 0, 1)
-          FORWARD_KERNEL_V(2, 1, 1)
-          FORWARD_KERNEL_V(2, 2, 1)
-        } else {
-          FORWARD_KERNEL_V(0, 0, 0)
-          FORWARD_KERNEL_V(0, 1, 0)
-          FORWARD_KERNEL_V(0, 2, 0)
-          FORWARD_KERNEL_V(1, 0, 0)
-          FORWARD_KERNEL_V(1, 1, 0)
-          FORWARD_KERNEL_V(1, 2, 0)
-          FORWARD_KERNEL_V(2, 0, 0)
-          FORWARD_KERNEL_V(2, 1, 0)
-          FORWARD_KERNEL_V(2, 2, 0)
-        }
+        forward_shot_v(buoyancy, vy + si, vx + si, sigmayy + si,
+    sigmaxy + si, sigmaxx + si,
+    m_sigmayyy + si, m_sigmaxyy + si,
+    m_sigmaxyx + si, m_sigmaxxx + si,
+    dvydbuoyancy + store_i, dvxdbuoyancy + store_i,
+    ay,
+    ayh, ax, axh,
+    by, byh, bx,
+    bxh,
+    dt, ny, nx,
+    buoyancy_requires_grad && ((t % step_ratio) == 0), pml_y0,
+      pml_y1, pml_x0, pml_x1);
         if (n_sources_y_per_shot > 0) {
           add_sources(vy + si, f_y + siy * nt + t * n_sources_y_per_shot, sources_y_i + siy,
                       n_sources_y_per_shot);
@@ -1652,47 +1744,18 @@ __declspec(dllexport)
           add_sources(vx + si, f_x + six * nt + t * n_sources_x_per_shot, sources_x_i + six,
                       n_sources_x_per_shot);
         }
-        if (lamb_requires_grad && mu_requires_grad && ((t % step_ratio) == 0)) {
-            FORWARD_KERNEL_SIGMA(0, 0, 1, 1)
-            FORWARD_KERNEL_SIGMA(0, 1, 1, 1)
-            FORWARD_KERNEL_SIGMA(0, 2, 1, 1)
-            FORWARD_KERNEL_SIGMA(1, 0, 1, 1)
-            FORWARD_KERNEL_SIGMA(1, 1, 1, 1)
-            FORWARD_KERNEL_SIGMA(1, 2, 1, 1)
-            FORWARD_KERNEL_SIGMA(2, 0, 1, 1)
-            FORWARD_KERNEL_SIGMA(2, 1, 1, 1)
-            FORWARD_KERNEL_SIGMA(2, 2, 1, 1)
-          } else if (lamb_requires_grad && ((t % step_ratio) == 0)) {
-            FORWARD_KERNEL_SIGMA(0, 0, 1, 0)
-            FORWARD_KERNEL_SIGMA(0, 1, 1, 0)
-            FORWARD_KERNEL_SIGMA(0, 2, 1, 0)
-            FORWARD_KERNEL_SIGMA(1, 0, 1, 0)
-            FORWARD_KERNEL_SIGMA(1, 1, 1, 0)
-            FORWARD_KERNEL_SIGMA(1, 2, 1, 0)
-            FORWARD_KERNEL_SIGMA(2, 0, 1, 0)
-            FORWARD_KERNEL_SIGMA(2, 1, 1, 0)
-            FORWARD_KERNEL_SIGMA(2, 2, 1, 0)
-          } else if (mu_requires_grad && ((t % step_ratio) == 0)) {
-            FORWARD_KERNEL_SIGMA(0, 0, 0, 1)
-            FORWARD_KERNEL_SIGMA(0, 1, 0, 1)
-            FORWARD_KERNEL_SIGMA(0, 2, 0, 1)
-            FORWARD_KERNEL_SIGMA(1, 0, 0, 1)
-            FORWARD_KERNEL_SIGMA(1, 1, 0, 1)
-            FORWARD_KERNEL_SIGMA(1, 2, 0, 1)
-            FORWARD_KERNEL_SIGMA(2, 0, 0, 1)
-            FORWARD_KERNEL_SIGMA(2, 1, 0, 1)
-            FORWARD_KERNEL_SIGMA(2, 2, 0, 1)
-        } else {
-          FORWARD_KERNEL_SIGMA(0, 0, 0, 0)
-          FORWARD_KERNEL_SIGMA(0, 1, 0, 0)
-          FORWARD_KERNEL_SIGMA(0, 2, 0, 0)
-          FORWARD_KERNEL_SIGMA(1, 0, 0, 0)
-          FORWARD_KERNEL_SIGMA(1, 1, 0, 0)
-          FORWARD_KERNEL_SIGMA(1, 2, 0, 0)
-          FORWARD_KERNEL_SIGMA(2, 0, 0, 0)
-          FORWARD_KERNEL_SIGMA(2, 1, 0, 0)
-          FORWARD_KERNEL_SIGMA(2, 2, 0, 0)
-        }
+forward_shot_sigma(
+    lamb,
+    mu, vy + si, vx + si, sigmayy + si,
+    sigmaxy + si, sigmaxx + si, m_vyy + si,
+    m_vyx + si, m_vxy + si, m_vxx + si,
+    dvydy_store + store_i, dvxdx_store + store_i,
+    dvydxdvxdy_store + store_i, ay, ayh,
+    ax, axh, by,
+    byh, bx, bxh,
+    dt, ny, nx,
+    lamb_requires_grad && ((t % step_ratio) == 0), mu_requires_grad && ((t % step_ratio) == 0), pml_y0,
+      pml_y1, pml_x0, pml_x1);
       }
         if (n_receivers_y_per_shot > 0) {
           record_receivers(r_y + riy * (nt + 1) + t * n_receivers_y_per_shot, vy + si, receivers_y_i + riy,
@@ -1739,9 +1802,9 @@ __declspec(dllexport)
     int64_t shot;
     set_fd_coeffs_y(dy);
     set_fd_coeffs_x(dx);
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   #pragma omp parallel for num_threads(n_threads)
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
     for (shot = 0; shot < n_shots; ++shot) {
       int64_t const si = shot * ny * nx;
       int64_t const siy = shot * n_sources_y_per_shot;
@@ -1749,11 +1812,11 @@ __declspec(dllexport)
       int64_t const riy = shot * n_receivers_y_per_shot;
       int64_t const rix = shot * n_receivers_x_per_shot;
       int64_t const rip = shot * n_receivers_p_per_shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
     int64_t const threadi = omp_get_thread_num() * ny * nx;
 #else
     int64_t const threadi = 0;
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
       int64_t t;
       if (n_receivers_y_per_shot > 0 && nt > 0) {
         add_sources(vy + si, grad_r_y + riy * (nt+1) + nt * n_receivers_y_per_shot, receivers_y_i + riy,
@@ -1826,7 +1889,7 @@ __declspec(dllexport)
     }
 }
     }
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   if (lamb_requires_grad && n_threads > 1) {
     combine_grad_elastic(grad_lamb, grad_lamb_thread, n_threads, ny, nx);
   }
@@ -1836,5 +1899,5 @@ __declspec(dllexport)
   if (buoyancy_requires_grad && n_threads > 1) {
     combine_grad_elastic(grad_buoyancy, grad_buoyancy_thread, n_threads, ny, nx);
   }
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
     }
diff --git a/src/deepwave/scalar.c b/src/deepwave/scalar.c
index 887e4f6..f975de0 100644
--- a/src/deepwave/scalar.c
+++ b/src/deepwave/scalar.c
@@ -17,9 +17,9 @@
  *    backward.
  */
 
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
 #include <omp.h>
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
 #include <stdbool.h>
 #include <stdint.h>
 #include "common.h"
@@ -241,9 +241,9 @@ __declspec(dllexport)
         bool const v_requires_grad, int64_t const pml_y0, int64_t const pml_y1,
         int64_t const pml_x0, int64_t const pml_x1, int64_t const n_threads) {
   int64_t shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   #pragma omp parallel for num_threads(n_threads)
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
   for (shot = 0; shot < n_shots; ++shot) {
     int64_t const i = shot * ny * nx;
     int64_t const si = shot * n_sources_per_shot;
@@ -308,18 +308,18 @@ __declspec(dllexport)
         int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1,
         int64_t const n_threads) {
   int64_t shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   #pragma omp parallel for num_threads(n_threads)
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
   for (shot = 0; shot < n_shots; ++shot) {
     int64_t const i = shot * ny * nx;
     int64_t const si = shot * n_sources_per_shot;
     int64_t const ri = shot * n_receivers_per_shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
     int64_t const threadi = omp_get_thread_num() * ny * nx;
 #else
     int64_t const threadi = 0;
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
     int64_t t;
     for (t = nt - 1; t >= 0; --t) {
       if ((nt - 1 - t) & 1) {
@@ -351,9 +351,9 @@ __declspec(dllexport)
       }
     }
   }
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   if (v_requires_grad && n_threads > 1) {
     combine_grad(grad_v, grad_v_thread, n_threads, ny, nx);
   }
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
 }
diff --git a/src/deepwave/scalar_born.c b/src/deepwave/scalar_born.c
index 16320f0..eefd0da 100644
--- a/src/deepwave/scalar_born.c
+++ b/src/deepwave/scalar_born.c
@@ -17,9 +17,9 @@
  *    backward.
  */
 
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
 #include <omp.h>
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
 #include <stdbool.h>
 #include <stdint.h>
 #include "common.h"
@@ -477,9 +477,9 @@ __declspec(dllexport)
         bool const v_requires_grad, bool const scatter_requires_grad, int64_t const pml_y0, int64_t const pml_y1,
         int64_t const pml_x0, int64_t const pml_x1, int64_t const n_threads) {
   int64_t shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   #pragma omp parallel for num_threads(n_threads)
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
   for (shot = 0; shot < n_shots; ++shot) {
     int64_t const i = shot * ny * nx;
     int64_t const si = shot * n_sources_per_shot;
@@ -569,20 +569,20 @@ __declspec(dllexport)
         int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1,
         int64_t const n_threads) {
   int64_t shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   #pragma omp parallel for num_threads(n_threads)
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
   for (shot = 0; shot < n_shots; ++shot) {
     int64_t const i = shot * ny * nx;
     int64_t const si = shot * n_sources_per_shot;
     int64_t const sisc = shot * n_sourcessc_per_shot;
     int64_t const ri = shot * n_receivers_per_shot;
     int64_t const risc = shot * n_receiverssc_per_shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
     int64_t const threadi = omp_get_thread_num() * ny * nx;
 #else
     int64_t const threadi = 0;
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
     int64_t t;
     for (t = nt - 1; t >= 0; --t) {
       if ((nt - 1 - t) & 1) {
@@ -631,14 +631,14 @@ __declspec(dllexport)
       }
     }
   }
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   if (v_requires_grad && n_threads > 1) {
     combine_grad(grad_v, grad_v_thread, n_threads, ny, nx);
   }
   if (scatter_requires_grad && n_threads > 1) {
     combine_grad(grad_scatter, grad_scatter_thread, n_threads, ny, nx);
   }
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
 }
 
 #ifdef _WIN32
@@ -670,18 +670,18 @@ __declspec(dllexport)
         int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1,
         int64_t const n_threads) {
   int64_t shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   #pragma omp parallel for num_threads(n_threads)
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
   for (shot = 0; shot < n_shots; ++shot) {
     int64_t const i = shot * ny * nx;
     int64_t const sisc = shot * n_sourcessc_per_shot;
     int64_t const risc = shot * n_receiverssc_per_shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
     int64_t const threadi = omp_get_thread_num() * ny * nx;
 #else
     int64_t const threadi = 0;
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
     int64_t t;
     for (t = nt - 1; t >= 0; --t) {
       if ((nt - 1 - t) & 1) {
@@ -715,9 +715,9 @@ __declspec(dllexport)
       }
     }
   }
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   if (scatter_requires_grad && n_threads > 1) {
     combine_grad(grad_scatter, grad_scatter_thread, n_threads, ny, nx);
   }
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
 }