diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7b59686..a472b3b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,32 +1,32 @@
 name: Build and test
 on: push
 jobs:
-    Linux-build:
-        runs-on: ubuntu-latest
-        container: quay.io/pypa/manylinux2014_x86_64
-        steps:
-            - name: Checkout
-              uses: actions/checkout@v3
-            - name: Install NVCC
-              run: |
-                yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
-                yum install -y cuda-nvcc-11-1-11.1.105-1 cuda-cudart-devel-11-1-11.1.74-1
-            - name: Compile
-              run: |
-                PATH=$PATH:/usr/local/cuda-11.1/bin
-                CUDA_HOME=/usr/local/cuda-11.1
-                CUDA_ROOT=/usr/local/cuda-11.1
-                CUDA_PATH=/usr/local/cuda-11.1
-                CUDADIR=/usr/local/cuda-11.1
-                LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.1/lib64
-                cd src/deepwave
-                cp /lib64/libgomp.so.1 .
-                ./build_linux.sh
-            - name: Archive built libraries
-              uses: actions/upload-artifact@v3
-              with:
-                name: linux_libraries
-                path: src/deepwave/*.so*
+#    Linux-build:
+#        runs-on: ubuntu-latest
+#        container: quay.io/pypa/manylinux2014_x86_64
+#        steps:
+#            - name: Checkout
+#              uses: actions/checkout@v3
+#            - name: Install NVCC
+#              run: |
+#                yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
+#                yum install -y cuda-nvcc-11-1-11.1.105-1 cuda-cudart-devel-11-1-11.1.74-1
+#            - name: Compile
+#              run: |
+#                PATH=$PATH:/usr/local/cuda-11.1/bin
+#                CUDA_HOME=/usr/local/cuda-11.1
+#                CUDA_ROOT=/usr/local/cuda-11.1
+#                CUDA_PATH=/usr/local/cuda-11.1
+#                CUDADIR=/usr/local/cuda-11.1
+#                LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.1/lib64
+#                cd src/deepwave
+#                cp /lib64/libgomp.so.1 .
+#                ./build_linux.sh
+#            - name: Archive built libraries
+#              uses: actions/upload-artifact@v3
+#              with:
+#                name: linux_libraries
+#                path: src/deepwave/*.so*
     MacOS-build:
         runs-on: macos-11
         steps:
@@ -36,79 +36,94 @@ jobs:
               uses: actions/setup-python@v3
             - name: Install dependencies
               run: |
-                python -m pip install --upgrade pip
-                python -m pip install torch
+                nuget install intelopenmp.devel.osx -DirectDownload -NonInteractive
+                #ls -R
+                #python -m pip install torch
             - name: Compile
               run: |
                 cd src/deepwave
-                cp `python -c "import torch; print(torch.__path__[0])"`/lib/libiomp5.dylib .
+                #cp `python -c "import torch; print(torch.__path__[0])"`/lib/libiomp5.dylib .
+                mv ../../intelopenmp.devel.osx*/lib/native/osx-x64/libiomp5.dylib .  # nuget unpacked the package in the workspace root during the previous step
                 brew install libomp
                 ./build_macos.sh
-            - name: Archive built libraries
-              uses: actions/upload-artifact@v3
-              with:
-                name: macos_libraries
-                path: src/deepwave/*.dylib
-    Windows-build:
-        runs-on: windows-2019
-        defaults:
-            run:
-                shell: bash
-        steps:
-            - name: Checkout
-              uses: actions/checkout@v3
-            - name: Set up Python
-              uses: actions/setup-python@v3
-            - name: Install NVCC
-              run: |
-                curl https://developer.download.nvidia.com/compute/cuda/11.1.1/network_installers/cuda_11.1.1_win10_network.exe -o cuda_11.1.1_win10_network.exe
-                chmod +x ./cuda_11.1.1_win10_network.exe
-                ./cuda_11.1.1_win10_network.exe -s nvcc_11.1 cudart_11.1
-                echo "CUDA_PATH=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1" >> $GITHUB_ENV
-                echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1\\bin" >> $GITHUB_PATH
-            - name: Setup MSVC
-              uses: ilammy/msvc-dev-cmd@v1
-            - name: Compile
-              run: |
-                cd src/deepwave
-                ./build_windows.sh
-            - name: Archive built libraries
-              uses: actions/upload-artifact@v3
-              with:
-                name: windows_libraries
-                path: src/deepwave/*.dll
-    Test:
-        strategy:
-            matrix:
-                os: [ubuntu-latest, macos-latest, windows-latest]
-            fail-fast: false
-        runs-on: ${{ matrix.os }}
-        needs: [Linux-build, MacOS-build, Windows-build]
-        steps:
-            - name: Checkout
-              uses: actions/checkout@v3
-            - name: Download built Linux libraries
-              uses: actions/download-artifact@v3
-              with:
-                name: linux_libraries
-                path: src/deepwave/
-            - name: Download built MacOS libraries
-              uses: actions/download-artifact@v3
-              with:
-                name: macos_libraries
-                path: src/deepwave/
-            - name: Download built Windows libraries
-              uses: actions/download-artifact@v3
-              with:
-                name: windows_libraries
-                path: src/deepwave/
-            - name: Set up Python
-              uses: actions/setup-python@v3
-            - name: Install dependencies
-              run: |
+                cd ../../
                 python -m pip install --upgrade pip
                 python -m pip install pytest scipy
                 python -m pip install .
             - name: Test with pytest
               run: |
-                pytest
+                #pytest -s
+                cd tests
+                PYTHONVERBOSE=3 python -c "import test_elastic; test_elastic.test_wavefield_decays()"
+#            - name: Archive built libraries
+#              uses: actions/upload-artifact@v3
+#              with:
+#                name: macos_libraries
+#                path: src/deepwave/*.dylib
+#    Windows-build:
+#        runs-on: windows-2019
+#        defaults:
+#            run:
+#                shell: bash
+#        steps:
+#            - name: Checkout
+#              uses: actions/checkout@v3
+#            - name: Set up Python
+#              uses: actions/setup-python@v3
+#            - name: Install NVCC
+#              run: |
+#                curl https://developer.download.nvidia.com/compute/cuda/11.1.1/network_installers/cuda_11.1.1_win10_network.exe -o cuda_11.1.1_win10_network.exe
+#                chmod +x ./cuda_11.1.1_win10_network.exe
+#                ./cuda_11.1.1_win10_network.exe -s nvcc_11.1 cudart_11.1
+#                echo "CUDA_PATH=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1" >> $GITHUB_ENV
+#                echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1\\bin" >> $GITHUB_PATH
+#            - name: Setup MSVC
+#              uses: ilammy/msvc-dev-cmd@v1
+#            - name: Compile
+#              run: |
+#                cd src/deepwave
+#                nuget install intelopenmp.devel.win -DirectDownload -NonInteractive
+#                nuget install intelopenmp.redist.win -DirectDownload -NonInteractive
+#                mv intelopenmp.devel.win*/lib/native/win-x64/libiomp5md.lib .
+#                mv intelopenmp.redist.win*/runtimes/win-x64/native/libiomp5md.dll .
+#                ./build_windows.sh
+#            - name: Archive built libraries
+#              uses: actions/upload-artifact@v3
+#              with:
+#                name: windows_libraries
+#                path: src/deepwave/*.dll
+#    Test:
+#        strategy:
+#            matrix:
+#                os: [ubuntu-latest, macos-latest, windows-latest]
+#            fail-fast: false
+#        runs-on: ${{ matrix.os }}
+#        needs: [Linux-build, MacOS-build, Windows-build]
+#        steps:
+#            - name: Checkout
+#              uses: actions/checkout@v3
+#            - name: Download built Linux libraries
+#              uses: actions/download-artifact@v3
+#              with:
+#                name: linux_libraries
+#                path: src/deepwave/
+#            - name: Download built MacOS libraries
+#              uses: actions/download-artifact@v3
+#              with:
+#                name: macos_libraries
+#                path: src/deepwave/
+#            - name: Download built Windows libraries
+#              uses: actions/download-artifact@v3
+#              with:
+#                name: windows_libraries
+#                path: src/deepwave/
+#            - name: Set up Python
+#              uses: actions/setup-python@v3
+#            - name: Install dependencies
+#              run: |
+#                python -m pip install --upgrade pip
+#                python -m pip install pytest scipy
+#                python -m pip install .
+#            - name: Test with pytest
+#              run: |
+#                PYTHONVERBOSE=3 pytest
diff --git a/src/deepwave/__init__.py b/src/deepwave/__init__.py
index 87373db..ceb02eb 100644
--- a/src/deepwave/__init__.py
+++ b/src/deepwave/__init__.py
@@ -56,8 +56,11 @@
 try:
     dll_cpu.omp_get_num_threads
     use_openmp = True
+    import torch
+    print('USING OPENMP', torch.get_num_threads())
 except AttributeError:
     use_openmp = False
+    print('NOT USING OPENMP')
 dll_cpu.scalar_iso_2_float_forward.restype = None
 dll_cpu.scalar_iso_4_float_forward.restype = None
 dll_cpu.scalar_iso_6_float_forward.restype = None
diff --git a/src/deepwave/build_linux.sh b/src/deepwave/build_linux.sh
index 6e5efcf..c0f35c2 100755
--- a/src/deepwave/build_linux.sh
+++ b/src/deepwave/build_linux.sh
@@ -3,7 +3,7 @@
 set -e
 
 DW_OMP_NAME=libgomp.so.1
-CFLAGS="-Wall -Wextra -pedantic -DDW_USE_OPENMP -fPIC -fopenmp -Ofast -mavx2"
+CFLAGS="-Wall -Wextra -pedantic -fPIC -fopenmp -Ofast -mavx2"
 CUDAFLAGS="--restrict --use_fast_math -O3 -gencode=arch=compute_52,code=sm_52, -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80  --compiler-options -fPIC"
 gcc $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o
 gcc $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o
diff --git a/src/deepwave/build_macos.sh b/src/deepwave/build_macos.sh
index db3f959..5a86f16 100755
--- a/src/deepwave/build_macos.sh
+++ b/src/deepwave/build_macos.sh
@@ -3,7 +3,7 @@
 set -e
 
 DW_OMP_NAME=iomp5
-CFLAGS="-Wall -Wextra -pedantic -DDW_USE_OPENMP -fPIC -Ofast -Xpreprocessor -fopenmp -I`brew --prefix libomp`/include"
+CFLAGS="-Wall -Wextra -pedantic -fPIC -Ofast -Xpreprocessor -fopenmp -I`brew --prefix libomp`/include"
 clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o
 clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o
 clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_6_float.o
@@ -26,26 +26,26 @@ clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_
 clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_4_double.o
 clang $CFLAGS -dynamiclib scalar_born_cpu_iso_2_float.o scalar_born_cpu_iso_4_float.o scalar_born_cpu_iso_6_float.o scalar_born_cpu_iso_8_float.o scalar_born_cpu_iso_2_double.o scalar_born_cpu_iso_4_double.o scalar_born_cpu_iso_6_double.o scalar_born_cpu_iso_8_double.o scalar_cpu_iso_2_float.o scalar_cpu_iso_4_float.o scalar_cpu_iso_6_float.o scalar_cpu_iso_8_float.o scalar_cpu_iso_2_double.o scalar_cpu_iso_4_double.o scalar_cpu_iso_6_double.o scalar_cpu_iso_8_double.o elastic_cpu_iso_2_float.o elastic_cpu_iso_4_float.o elastic_cpu_iso_2_double.o elastic_cpu_iso_4_double.o -L. -l$DW_OMP_NAME -rpath @loader_path/ -o libdeepwave_cpu_macos_x86_64.dylib
 rm *.o
-CFLAGS="-Wall -Wextra -pedantic -fPIC -Ofast -arch arm64"
-clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o
-clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o
-clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_6_float.o
-clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_8_float.o
-clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_2_double.o
-clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_4_double.o
-clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_6_double.o
-clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_8_double.o
-clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_2_float.o
-clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_4_float.o
-clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_6_float.o
-clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_8_float.o
-clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_2_double.o
-clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_4_double.o
-clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_6_double.o
-clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_8_double.o
-clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c elastic.c -o elastic_cpu_iso_2_float.o
-clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c elastic.c -o elastic_cpu_iso_4_float.o
-clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_2_double.o
-clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_4_double.o
-clang $CFLAGS -shared scalar_born_cpu_iso_2_float.o scalar_born_cpu_iso_4_float.o scalar_born_cpu_iso_6_float.o scalar_born_cpu_iso_8_float.o scalar_born_cpu_iso_2_double.o scalar_born_cpu_iso_4_double.o scalar_born_cpu_iso_6_double.o scalar_born_cpu_iso_8_double.o scalar_cpu_iso_2_float.o scalar_cpu_iso_4_float.o scalar_cpu_iso_6_float.o scalar_cpu_iso_8_float.o scalar_cpu_iso_2_double.o scalar_cpu_iso_4_double.o scalar_cpu_iso_6_double.o scalar_cpu_iso_8_double.o elastic_cpu_iso_2_float.o elastic_cpu_iso_4_float.o elastic_cpu_iso_2_double.o elastic_cpu_iso_4_double.o -o libdeepwave_cpu_macos_arm64.dylib
-rm *.o
+#CFLAGS="-Wall -Wextra -pedantic -fPIC -Ofast -arch arm64"
+#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_2_float.o
+#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_4_float.o
+#clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_6_float.o
+#clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=float -c scalar.c -o scalar_cpu_iso_8_float.o
+#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_2_double.o
+#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_4_double.o
+#clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_6_double.o
+#clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=double -c scalar.c -o scalar_cpu_iso_8_double.o
+#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_2_float.o
+#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_4_float.o
+#clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_6_float.o
+#clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=float -c scalar_born.c -o scalar_born_cpu_iso_8_float.o
+#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_2_double.o
+#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_4_double.o
+#clang $CFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_6_double.o
+#clang $CFLAGS -DDW_ACCURACY=8 -DDW_DTYPE=double -c scalar_born.c -o scalar_born_cpu_iso_8_double.o
+#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c elastic.c -o elastic_cpu_iso_2_float.o
+#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c elastic.c -o elastic_cpu_iso_4_float.o
+#clang $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_2_double.o
+#clang $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c elastic.c -o elastic_cpu_iso_4_double.o
+#clang $CFLAGS -shared scalar_born_cpu_iso_2_float.o scalar_born_cpu_iso_4_float.o scalar_born_cpu_iso_6_float.o scalar_born_cpu_iso_8_float.o scalar_born_cpu_iso_2_double.o scalar_born_cpu_iso_4_double.o scalar_born_cpu_iso_6_double.o scalar_born_cpu_iso_8_double.o scalar_cpu_iso_2_float.o scalar_cpu_iso_4_float.o scalar_cpu_iso_6_float.o scalar_cpu_iso_8_float.o scalar_cpu_iso_2_double.o scalar_cpu_iso_4_double.o scalar_cpu_iso_6_double.o scalar_cpu_iso_8_double.o elastic_cpu_iso_2_float.o elastic_cpu_iso_4_float.o elastic_cpu_iso_2_double.o elastic_cpu_iso_4_double.o -o libdeepwave_cpu_macos_arm64.dylib
+#rm *.o
diff --git a/src/deepwave/build_windows.sh b/src/deepwave/build_windows.sh
index fa64b3e..6028256 100755
--- a/src/deepwave/build_windows.sh
+++ b/src/deepwave/build_windows.sh
@@ -2,7 +2,7 @@
 
 set -e
 
-CFLAGS="-Wall -O2 -fp:fast -arch:AVX2"
+CFLAGS="-Wall -O2 -fp:fast -arch:AVX2 -openmp"
 CUDAFLAGS="--restrict --use_fast_math -O3 -gencode=arch=compute_52,code=sm_52, -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80"
 cl $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.c -Foscalar_cpu_iso_2_float.obj
 cl $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.c -Foscalar_cpu_iso_4_float.obj
@@ -24,7 +24,7 @@ cl $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c elastic.c -Foelastic_cpu_iso_2_fl
 cl $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c elastic.c -Foelastic_cpu_iso_4_float.obj
 cl $CFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=double -c elastic.c -Foelastic_cpu_iso_2_double.obj
 cl $CFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=double -c elastic.c -Foelastic_cpu_iso_4_double.obj
-cl $CFLAGS -LD scalar_born_cpu_iso_2_float.obj scalar_born_cpu_iso_4_float.obj scalar_born_cpu_iso_6_float.obj scalar_born_cpu_iso_8_float.obj scalar_born_cpu_iso_2_double.obj scalar_born_cpu_iso_4_double.obj scalar_born_cpu_iso_6_double.obj scalar_born_cpu_iso_8_double.obj scalar_cpu_iso_2_float.obj scalar_cpu_iso_4_float.obj scalar_cpu_iso_6_float.obj scalar_cpu_iso_8_float.obj scalar_cpu_iso_2_double.obj scalar_cpu_iso_4_double.obj scalar_cpu_iso_6_double.obj scalar_cpu_iso_8_double.obj elastic_cpu_iso_2_float.obj elastic_cpu_iso_4_float.obj elastic_cpu_iso_2_double.obj elastic_cpu_iso_4_double.obj -Felibdeepwave_cpu_windows_x86_64.dll
+cl $CFLAGS -LD scalar_born_cpu_iso_2_float.obj scalar_born_cpu_iso_4_float.obj scalar_born_cpu_iso_6_float.obj scalar_born_cpu_iso_8_float.obj scalar_born_cpu_iso_2_double.obj scalar_born_cpu_iso_4_double.obj scalar_born_cpu_iso_6_double.obj scalar_born_cpu_iso_8_double.obj scalar_cpu_iso_2_float.obj scalar_cpu_iso_4_float.obj scalar_cpu_iso_6_float.obj scalar_cpu_iso_8_float.obj scalar_cpu_iso_2_double.obj scalar_cpu_iso_4_double.obj scalar_cpu_iso_6_double.obj scalar_cpu_iso_8_double.obj elastic_cpu_iso_2_float.obj elastic_cpu_iso_4_float.obj elastic_cpu_iso_2_double.obj elastic_cpu_iso_4_double.obj libiomp5md.lib -Felibdeepwave_cpu_windows_x86_64.dll -link -nodefaultlib:vcomp  # -nodefaultlib is a linker option, so it must follow -link
 nvcc $CUDAFLAGS -DDW_ACCURACY=2 -DDW_DTYPE=float -c scalar.cu -o scalar_cuda_iso_2_float.obj
 nvcc $CUDAFLAGS -DDW_ACCURACY=4 -DDW_DTYPE=float -c scalar.cu -o scalar_cuda_iso_4_float.obj
 nvcc $CUDAFLAGS -DDW_ACCURACY=6 -DDW_DTYPE=float -c scalar.cu -o scalar_cuda_iso_6_float.obj
diff --git a/src/deepwave/common_cpu.h b/src/deepwave/common_cpu.h
index 07f794b..30478c9 100644
--- a/src/deepwave/common_cpu.h
+++ b/src/deepwave/common_cpu.h
@@ -1,12 +1,6 @@
 #ifndef DW_COMMON_CPU_H
 #define DW_COMMON_CPU_H
 
-//#ifdef DW_USE_OPENMP
-//int dw_use_openmp = 1;
-//#else
-//int dw_use_openmp = 0;
-//#endif /* DW_USE_OPENMP */
-
 static void add_sources(DW_DTYPE *__restrict const wf,
                         DW_DTYPE const *__restrict const f,
                         int64_t const *__restrict const sources_i,
diff --git a/src/deepwave/elastic.c b/src/deepwave/elastic.c
index bb75b77..43dde83 100644
--- a/src/deepwave/elastic.c
+++ b/src/deepwave/elastic.c
@@ -9,9 +9,9 @@ VX  SII | VX  SII | VX  sii
 SXY-VY--|-SXY-VY--|-SXY vy
 */
 
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
 #include <omp.h>
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
 #include <stdbool.h>
 #include <stdint.h>
 #include "common.h"
@@ -66,7 +66,7 @@ SXY-VY--|-SXY-VY--|-SXY vy
   for (y = y_begin_y; y < y_end_y; ++y) {\
     int64_t yi = y * nx;\
     for (x = x_begin_y; x < x_end_y; ++x) {\
-      int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, k;\
+      int64_t i = yi + x, j, k;\
       DW_DTYPE dsigmayydy = 0;\
       DW_DTYPE dsigmaxydx = 0;\
 \
@@ -118,15 +118,15 @@ SXY-VY--|-SXY-VY--|-SXY vy
       {\
       DW_DTYPE buoyancyyhxh;\
       if (pml_y == 2 && y == ny - 1) {\
-        buoyancyyhxh = (buoyancy[i_noshot] + buoyancy[i_noshot + 1]) / 2;\
+        buoyancyyhxh = (buoyancy[i] + buoyancy[i + 1]) / 2;\
       } else {\
-        buoyancyyhxh = (buoyancy[i_noshot] + buoyancy[i_noshot + 1] + buoyancy[i_noshot + nx] +\
-                        buoyancy[i_noshot + nx + 1]) /\
+        buoyancyyhxh = (buoyancy[i] + buoyancy[i + 1] + buoyancy[i + nx] +\
+                        buoyancy[i + nx + 1]) /\
                        4;\
       }\
       vy[i] += buoyancyyhxh * dt * (dsigmayydy + dsigmaxydx);\
       if (buoyancy_requires_grad) {\
-        dvydbuoyancy[store_i + i_noshot] = dt * (dsigmayydy + dsigmaxydx);\
+        dvydbuoyancy[i] = dt * (dsigmayydy + dsigmaxydx);\
       }\
       }\
     }\
@@ -135,7 +135,7 @@ SXY-VY--|-SXY-VY--|-SXY vy
   for (y = y_begin_x; y < y_end_x; ++y) {\
     int64_t yi = y * nx;\
     for (x = x_begin_x; x < x_end_x; ++x) {\
-      int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, k;\
+      int64_t i = yi + x, j, k;\
       DW_DTYPE dsigmaxydy = 0;\
       DW_DTYPE dsigmaxxdx = 0;\
 \
@@ -184,9 +184,9 @@ SXY-VY--|-SXY-VY--|-SXY vy
         m_sigmaxxx[i] = ax[x] * m_sigmaxxx[i] + bx[x] * dsigmaxxdx;\
         dsigmaxxdx += m_sigmaxxx[i];\
       }\
-      vx[i] += buoyancy[i_noshot] * dt * (dsigmaxxdx + dsigmaxydy);\
+      vx[i] += buoyancy[i] * dt * (dsigmaxxdx + dsigmaxydy);\
       if (buoyancy_requires_grad) {\
-        dvxdbuoyancy[store_i + i_noshot] = dt * (dsigmaxxdx + dsigmaxydy);\
+        dvxdbuoyancy[i] = dt * (dsigmaxxdx + dsigmaxydy);\
       }\
     }\
 \
@@ -231,7 +231,7 @@ SXY-VY--|-SXY-VY--|-SXY vy
   for (y = y_begin_ii; y < y_end_ii; ++y) {\
     int64_t yi = y * nx;\
     for (x = x_begin_ii; x < x_end_ii; ++x) {\
-      int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, k;\
+      int64_t i = yi + x, j, k;\
       DW_DTYPE dvydy = 0;\
       DW_DTYPE dvxdx = 0;\
 \
@@ -280,13 +280,13 @@ SXY-VY--|-SXY-VY--|-SXY vy
         dvxdx += m_vxx[i];\
       }\
       {\
-      DW_DTYPE lambyxh = (lamb[i_noshot] + lamb[i_noshot + 1]) / 2;\
-      DW_DTYPE muyxh = (mu[i_noshot] + mu[i_noshot + 1]) / 2;\
+      DW_DTYPE lambyxh = (lamb[i] + lamb[i + 1]) / 2;\
+      DW_DTYPE muyxh = (mu[i] + mu[i + 1]) / 2;\
       sigmayy[i] += dt * ((lambyxh + 2 * muyxh) * dvydy + lambyxh * dvxdx);\
       sigmaxx[i] += dt * ((lambyxh + 2 * muyxh) * dvxdx + lambyxh * dvydy);\
       if (lamb_requires_grad || mu_requires_grad) {\
-        dvydy_store[store_i + i_noshot] = dt * dvydy;\
-        dvxdx_store[store_i + i_noshot] = dt * dvxdx;\
+        dvydy_store[i] = dt * dvydy;\
+        dvxdx_store[i] = dt * dvxdx;\
       }\
       }\
     }\
@@ -295,7 +295,7 @@ SXY-VY--|-SXY-VY--|-SXY vy
   for (y = y_begin_xy; y < y_end_xy; ++y) {\
     int64_t yi = y * nx;\
     for (x = x_begin_xy; x < x_end_xy; ++x) {\
-      int64_t i_noshot = yi + x, i = shot * ny * nx + i_noshot, j, jp, k;\
+      int64_t i = yi + x, j, jp, k;\
       DW_DTYPE dvydx = 0;\
       DW_DTYPE dvxdy = 0;\
 \
@@ -421,10 +421,10 @@ SXY-VY--|-SXY-VY--|-SXY vy
         dvydx += m_vyx[i];\
       }\
       {\
-      DW_DTYPE muyhx = (mu[i_noshot] + mu[i_noshot + nx]) / 2;\
+      DW_DTYPE muyhx = (mu[i] + mu[i + nx]) / 2;\
       sigmaxy[i] += dt * muyhx * (dvydx + dvxdy);\
       if (mu_requires_grad) {\
-        dvydxdvxdy_store[store_i + i_noshot] = dt * (dvydx + dvxdy);\
+        dvydxdvxdy_store[i] = dt * (dvydx + dvxdy);\
       }\
       }\
     }\
@@ -1385,6 +1385,109 @@ static void combine_grad_elastic(DW_DTYPE *__restrict const grad,
   }
 }
 
+#ifdef _WIN32 /* forward_shot_v is marked non-inlinable; the keyword spelling depends on the platform */
+__declspec(noinline) /* spelling used when _WIN32 is defined */
+#else
+__attribute__ ((noinline)) /* spelling used otherwise */
+#endif
+static void forward_shot_v(DW_DTYPE const *__restrict const buoyancy, DW_DTYPE *__restrict const vy, DW_DTYPE *__restrict const vx, DW_DTYPE const *__restrict const sigmayy, /* forward_shot_v: expands the FORWARD_KERNEL_V variants that update vy and vx; the caller in this diff invokes it once per shot per time step */
+    DW_DTYPE const *__restrict const sigmaxy, DW_DTYPE const *__restrict const sigmaxx, /* stress fields are read-only here (const-qualified) */
+    DW_DTYPE *__restrict const m_sigmayyy, DW_DTYPE *__restrict const m_sigmaxyy,
+    DW_DTYPE *__restrict const m_sigmaxyx, DW_DTYPE *__restrict const m_sigmaxxx,
+    DW_DTYPE *__restrict const dvydbuoyancy, DW_DTYPE *__restrict const dvxdbuoyancy, /* the caller in this diff passes these already offset by store_i */
+    DW_DTYPE const *__restrict const ay,
+    DW_DTYPE const *__restrict const ayh, DW_DTYPE const *__restrict const ax, DW_DTYPE const *__restrict const axh,
+    DW_DTYPE const *__restrict const by, DW_DTYPE const *__restrict const byh, DW_DTYPE const *__restrict const bx,
+    DW_DTYPE const *__restrict const bxh,
+    DW_DTYPE const dt, int64_t const ny, int64_t const nx,
+    bool const buoyancy_requires_grad, int64_t const pml_y0, /* the caller in this diff passes buoyancy_requires_grad && (t % step_ratio) == 0 */
+      int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1) {
+      int64_t y, x, y_begin_y, y_end_y, x_begin_y, x_end_y, y_begin_x, y_end_x, x_begin_x, x_end_x; /* not referenced directly in this body; presumably consumed by the FORWARD_KERNEL_V expansion -- confirm against the macro */
+        if (buoyancy_requires_grad) { /* this branch passes 1 as the last macro argument, the else branch passes 0 */
+          FORWARD_KERNEL_V(0, 0, 1) /* first two arguments each take 0, 1, 2: all nine combinations are listed; NOTE(review): presumably the pml_y / pml_x region selectors -- confirm */
+          FORWARD_KERNEL_V(0, 1, 1)
+          FORWARD_KERNEL_V(0, 2, 1)
+          FORWARD_KERNEL_V(1, 0, 1)
+          FORWARD_KERNEL_V(1, 1, 1)
+          FORWARD_KERNEL_V(1, 2, 1)
+          FORWARD_KERNEL_V(2, 0, 1)
+          FORWARD_KERNEL_V(2, 1, 1)
+          FORWARD_KERNEL_V(2, 2, 1)
+        } else {
+          FORWARD_KERNEL_V(0, 0, 0) /* same nine combinations with the last argument 0 */
+          FORWARD_KERNEL_V(0, 1, 0)
+          FORWARD_KERNEL_V(0, 2, 0)
+          FORWARD_KERNEL_V(1, 0, 0)
+          FORWARD_KERNEL_V(1, 1, 0)
+          FORWARD_KERNEL_V(1, 2, 0)
+          FORWARD_KERNEL_V(2, 0, 0)
+          FORWARD_KERNEL_V(2, 1, 0)
+          FORWARD_KERNEL_V(2, 2, 0)
+        }
+}
+
+#ifdef _WIN32 /* forward_shot_sigma is marked non-inlinable; the keyword spelling depends on the platform */
+__declspec(noinline) /* spelling used when _WIN32 is defined */
+#else
+__attribute__ ((noinline)) /* spelling used otherwise */
+#endif
+static void forward_shot_sigma( /* forward_shot_sigma: picks one of four groups of FORWARD_KERNEL_SIGMA variants from the two requires_grad flags */
+    DW_DTYPE const *__restrict const lamb,
+    DW_DTYPE const *__restrict const mu, DW_DTYPE const *__restrict const vy, DW_DTYPE const *__restrict const vx, DW_DTYPE *__restrict const sigmayy, /* vy and vx are read-only here (const-qualified); the stress pointers are writable */
+    DW_DTYPE *__restrict const sigmaxy, DW_DTYPE *__restrict const sigmaxx, DW_DTYPE *__restrict const m_vyy,
+    DW_DTYPE *__restrict const m_vyx, DW_DTYPE *__restrict const m_vxy, DW_DTYPE *__restrict const m_vxx,
+    DW_DTYPE *__restrict const dvydy_store, DW_DTYPE *__restrict const dvxdx_store,
+    DW_DTYPE *__restrict const dvydxdvxdy_store, DW_DTYPE const *__restrict const ay, DW_DTYPE const *__restrict const ayh,
+    DW_DTYPE const *__restrict const ax, DW_DTYPE const *__restrict const axh, DW_DTYPE const *__restrict const by,
+    DW_DTYPE const *__restrict const byh, DW_DTYPE const *__restrict const bx, DW_DTYPE const *__restrict const bxh,
+    DW_DTYPE const dt, int64_t const ny, int64_t const nx,
+    bool const lamb_requires_grad, bool const mu_requires_grad, int64_t const pml_y0,
+      int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1) {
+      int64_t y, x, y_begin_ii, y_end_ii, x_begin_ii, x_end_ii, y_begin_xy, y_end_xy, x_begin_xy, x_end_xy; /* not referenced directly in this body; presumably consumed by the FORWARD_KERNEL_SIGMA expansion -- confirm against the macro */
+        if (lamb_requires_grad && mu_requires_grad) { /* both flags set: third and fourth macro arguments are 1, 1 */
+            FORWARD_KERNEL_SIGMA(0, 0, 1, 1) /* first two arguments each take 0, 1, 2: all nine combinations are listed; NOTE(review): presumably the pml_y / pml_x region selectors -- confirm */
+            FORWARD_KERNEL_SIGMA(0, 1, 1, 1)
+            FORWARD_KERNEL_SIGMA(0, 2, 1, 1)
+            FORWARD_KERNEL_SIGMA(1, 0, 1, 1)
+            FORWARD_KERNEL_SIGMA(1, 1, 1, 1)
+            FORWARD_KERNEL_SIGMA(1, 2, 1, 1)
+            FORWARD_KERNEL_SIGMA(2, 0, 1, 1)
+            FORWARD_KERNEL_SIGMA(2, 1, 1, 1)
+            FORWARD_KERNEL_SIGMA(2, 2, 1, 1)
+          } else if (lamb_requires_grad) { /* only lamb_requires_grad set: arguments 1, 0 */
+            FORWARD_KERNEL_SIGMA(0, 0, 1, 0)
+            FORWARD_KERNEL_SIGMA(0, 1, 1, 0)
+            FORWARD_KERNEL_SIGMA(0, 2, 1, 0)
+            FORWARD_KERNEL_SIGMA(1, 0, 1, 0)
+            FORWARD_KERNEL_SIGMA(1, 1, 1, 0)
+            FORWARD_KERNEL_SIGMA(1, 2, 1, 0)
+            FORWARD_KERNEL_SIGMA(2, 0, 1, 0)
+            FORWARD_KERNEL_SIGMA(2, 1, 1, 0)
+            FORWARD_KERNEL_SIGMA(2, 2, 1, 0)
+          } else if (mu_requires_grad) { /* only mu_requires_grad set: arguments 0, 1 */
+            FORWARD_KERNEL_SIGMA(0, 0, 0, 1)
+            FORWARD_KERNEL_SIGMA(0, 1, 0, 1)
+            FORWARD_KERNEL_SIGMA(0, 2, 0, 1)
+            FORWARD_KERNEL_SIGMA(1, 0, 0, 1)
+            FORWARD_KERNEL_SIGMA(1, 1, 0, 1)
+            FORWARD_KERNEL_SIGMA(1, 2, 0, 1)
+            FORWARD_KERNEL_SIGMA(2, 0, 0, 1)
+            FORWARD_KERNEL_SIGMA(2, 1, 0, 1)
+            FORWARD_KERNEL_SIGMA(2, 2, 0, 1)
+        } else { /* neither flag set: arguments 0, 0 */
+          FORWARD_KERNEL_SIGMA(0, 0, 0, 0)
+          FORWARD_KERNEL_SIGMA(0, 1, 0, 0)
+          FORWARD_KERNEL_SIGMA(0, 2, 0, 0)
+          FORWARD_KERNEL_SIGMA(1, 0, 0, 0)
+          FORWARD_KERNEL_SIGMA(1, 1, 0, 0)
+          FORWARD_KERNEL_SIGMA(1, 2, 0, 0)
+          FORWARD_KERNEL_SIGMA(2, 0, 0, 0)
+          FORWARD_KERNEL_SIGMA(2, 1, 0, 0)
+          FORWARD_KERNEL_SIGMA(2, 2, 0, 0)
+        }
+}
+
+
 static void backward_shot(DW_DTYPE const *__restrict const lamb,
       DW_DTYPE const *__restrict const mu, DW_DTYPE const *__restrict const buoyancy,
       DW_DTYPE const *__restrict const grad_r_y, DW_DTYPE const *__restrict const grad_r_x, DW_DTYPE const *__restrict const grad_r_p,
@@ -1562,8 +1665,6 @@ static void set_fd_coeffs_x(DW_DTYPE const dx) {
   }
 }
 
-
-
 #ifdef _WIN32
 __declspec(dllexport)
 #endif
@@ -1597,9 +1698,9 @@ __declspec(dllexport)
     int64_t shot;
     set_fd_coeffs_y(dy);
     set_fd_coeffs_x(dx);
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   #pragma omp parallel for num_threads(n_threads)
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
     for (shot = 0; shot < n_shots; ++shot) {
       int64_t const si = shot * ny * nx;
       int64_t const siy = shot * n_sources_y_per_shot;
@@ -1607,7 +1708,7 @@ __declspec(dllexport)
       int64_t const riy = shot * n_receivers_y_per_shot;
       int64_t const rix = shot * n_receivers_x_per_shot;
       int64_t const rip = shot * n_receivers_p_per_shot;
-      int64_t t, y, x, y_begin_y, y_end_y, x_begin_y, x_end_y, y_begin_x, y_end_x, x_begin_x, x_end_x, y_begin_ii, y_end_ii, x_begin_ii, x_end_ii, y_begin_xy, y_end_xy, x_begin_xy, x_end_xy;
+      int64_t t;
 
       for (t = 0; t < nt; ++t) {
         int64_t store_i = shot * (nt / step_ratio) * ny * nx + (t / step_ratio) * ny * nx;
@@ -1623,27 +1724,18 @@ __declspec(dllexport)
           record_pressure_receivers(r_p + rip * nt + t * n_receivers_p_per_shot, sigmayy + si,
                                     sigmaxx + si, receivers_p_i + rip, n_receivers_p_per_shot);
         }
-        if (buoyancy_requires_grad && ((t % step_ratio) == 0)) {
-          FORWARD_KERNEL_V(0, 0, 1)
-          FORWARD_KERNEL_V(0, 1, 1)
-          FORWARD_KERNEL_V(0, 2, 1)
-          FORWARD_KERNEL_V(1, 0, 1)
-          FORWARD_KERNEL_V(1, 1, 1)
-          FORWARD_KERNEL_V(1, 2, 1)
-          FORWARD_KERNEL_V(2, 0, 1)
-          FORWARD_KERNEL_V(2, 1, 1)
-          FORWARD_KERNEL_V(2, 2, 1)
-        } else {
-          FORWARD_KERNEL_V(0, 0, 0)
-          FORWARD_KERNEL_V(0, 1, 0)
-          FORWARD_KERNEL_V(0, 2, 0)
-          FORWARD_KERNEL_V(1, 0, 0)
-          FORWARD_KERNEL_V(1, 1, 0)
-          FORWARD_KERNEL_V(1, 2, 0)
-          FORWARD_KERNEL_V(2, 0, 0)
-          FORWARD_KERNEL_V(2, 1, 0)
-          FORWARD_KERNEL_V(2, 2, 0)
-        }
+        forward_shot_v(
+            buoyancy, vy + si, vx + si,
+            sigmayy + si, sigmaxy + si, sigmaxx + si,
+            m_sigmayyy + si, m_sigmaxyy + si,
+            m_sigmaxyx + si, m_sigmaxxx + si,
+            dvydbuoyancy + store_i, dvxdbuoyancy + store_i,
+            ay, ayh, ax, axh,
+            by, byh, bx, bxh,
+            dt, ny, nx,
+            /* True only on every step_ratio-th step when buoyancy needs a gradient. */
+            buoyancy_requires_grad && ((t % step_ratio) == 0),
+            pml_y0, pml_y1, pml_x0, pml_x1);
         if (n_sources_y_per_shot > 0) {
           add_sources(vy + si, f_y + siy * nt + t * n_sources_y_per_shot, sources_y_i + siy,
                       n_sources_y_per_shot);
@@ -1652,47 +1744,18 @@ __declspec(dllexport)
           add_sources(vx + si, f_x + six * nt + t * n_sources_x_per_shot, sources_x_i + six,
                       n_sources_x_per_shot);
         }
-        if (lamb_requires_grad && mu_requires_grad && ((t % step_ratio) == 0)) {
-            FORWARD_KERNEL_SIGMA(0, 0, 1, 1)
-            FORWARD_KERNEL_SIGMA(0, 1, 1, 1)
-            FORWARD_KERNEL_SIGMA(0, 2, 1, 1)
-            FORWARD_KERNEL_SIGMA(1, 0, 1, 1)
-            FORWARD_KERNEL_SIGMA(1, 1, 1, 1)
-            FORWARD_KERNEL_SIGMA(1, 2, 1, 1)
-            FORWARD_KERNEL_SIGMA(2, 0, 1, 1)
-            FORWARD_KERNEL_SIGMA(2, 1, 1, 1)
-            FORWARD_KERNEL_SIGMA(2, 2, 1, 1)
-          } else if (lamb_requires_grad && ((t % step_ratio) == 0)) {
-            FORWARD_KERNEL_SIGMA(0, 0, 1, 0)
-            FORWARD_KERNEL_SIGMA(0, 1, 1, 0)
-            FORWARD_KERNEL_SIGMA(0, 2, 1, 0)
-            FORWARD_KERNEL_SIGMA(1, 0, 1, 0)
-            FORWARD_KERNEL_SIGMA(1, 1, 1, 0)
-            FORWARD_KERNEL_SIGMA(1, 2, 1, 0)
-            FORWARD_KERNEL_SIGMA(2, 0, 1, 0)
-            FORWARD_KERNEL_SIGMA(2, 1, 1, 0)
-            FORWARD_KERNEL_SIGMA(2, 2, 1, 0)
-          } else if (mu_requires_grad && ((t % step_ratio) == 0)) {
-            FORWARD_KERNEL_SIGMA(0, 0, 0, 1)
-            FORWARD_KERNEL_SIGMA(0, 1, 0, 1)
-            FORWARD_KERNEL_SIGMA(0, 2, 0, 1)
-            FORWARD_KERNEL_SIGMA(1, 0, 0, 1)
-            FORWARD_KERNEL_SIGMA(1, 1, 0, 1)
-            FORWARD_KERNEL_SIGMA(1, 2, 0, 1)
-            FORWARD_KERNEL_SIGMA(2, 0, 0, 1)
-            FORWARD_KERNEL_SIGMA(2, 1, 0, 1)
-            FORWARD_KERNEL_SIGMA(2, 2, 0, 1)
-        } else {
-          FORWARD_KERNEL_SIGMA(0, 0, 0, 0)
-          FORWARD_KERNEL_SIGMA(0, 1, 0, 0)
-          FORWARD_KERNEL_SIGMA(0, 2, 0, 0)
-          FORWARD_KERNEL_SIGMA(1, 0, 0, 0)
-          FORWARD_KERNEL_SIGMA(1, 1, 0, 0)
-          FORWARD_KERNEL_SIGMA(1, 2, 0, 0)
-          FORWARD_KERNEL_SIGMA(2, 0, 0, 0)
-          FORWARD_KERNEL_SIGMA(2, 1, 0, 0)
-          FORWARD_KERNEL_SIGMA(2, 2, 0, 0)
-        }
+        forward_shot_sigma(
+            lamb, mu, vy + si, vx + si,
+            sigmayy + si, sigmaxy + si, sigmaxx + si,
+            m_vyy + si, m_vyx + si, m_vxy + si, m_vxx + si,
+            dvydy_store + store_i, dvxdx_store + store_i,
+            dvydxdvxdy_store + store_i,
+            ay, ayh, ax, axh, by, byh, bx, bxh,
+            dt, ny, nx,
+            /* Each gradient flag is true only on every step_ratio-th time step. */
+            lamb_requires_grad && ((t % step_ratio) == 0),
+            mu_requires_grad && ((t % step_ratio) == 0),
+            pml_y0, pml_y1, pml_x0, pml_x1);
       }
         if (n_receivers_y_per_shot > 0) {
           record_receivers(r_y + riy * (nt + 1) + t * n_receivers_y_per_shot, vy + si, receivers_y_i + riy,
@@ -1739,9 +1802,9 @@ __declspec(dllexport)
     int64_t shot;
     set_fd_coeffs_y(dy);
     set_fd_coeffs_x(dx);
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   #pragma omp parallel for num_threads(n_threads)
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
     for (shot = 0; shot < n_shots; ++shot) {
       int64_t const si = shot * ny * nx;
       int64_t const siy = shot * n_sources_y_per_shot;
@@ -1749,11 +1812,11 @@ __declspec(dllexport)
       int64_t const riy = shot * n_receivers_y_per_shot;
       int64_t const rix = shot * n_receivers_x_per_shot;
       int64_t const rip = shot * n_receivers_p_per_shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
     int64_t const threadi = omp_get_thread_num() * ny * nx;
 #else
     int64_t const threadi = 0;
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
       int64_t t;
       if (n_receivers_y_per_shot > 0 && nt > 0) {
         add_sources(vy + si, grad_r_y + riy * (nt+1) + nt * n_receivers_y_per_shot, receivers_y_i + riy,
@@ -1826,7 +1889,7 @@ __declspec(dllexport)
     }
 }
     }
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   if (lamb_requires_grad && n_threads > 1) {
     combine_grad_elastic(grad_lamb, grad_lamb_thread, n_threads, ny, nx);
   }
@@ -1836,5 +1899,5 @@ __declspec(dllexport)
   if (buoyancy_requires_grad && n_threads > 1) {
     combine_grad_elastic(grad_buoyancy, grad_buoyancy_thread, n_threads, ny, nx);
   }
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
     }
diff --git a/src/deepwave/scalar.c b/src/deepwave/scalar.c
index 887e4f6..f975de0 100644
--- a/src/deepwave/scalar.c
+++ b/src/deepwave/scalar.c
@@ -17,9 +17,9 @@
  *    backward.
  */
 
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
 #include <omp.h>
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
 #include <stdbool.h>
 #include <stdint.h>
 #include "common.h"
@@ -241,9 +241,9 @@ __declspec(dllexport)
         bool const v_requires_grad, int64_t const pml_y0, int64_t const pml_y1,
         int64_t const pml_x0, int64_t const pml_x1, int64_t const n_threads) {
   int64_t shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   #pragma omp parallel for num_threads(n_threads)
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
   for (shot = 0; shot < n_shots; ++shot) {
     int64_t const i = shot * ny * nx;
     int64_t const si = shot * n_sources_per_shot;
@@ -308,18 +308,18 @@ __declspec(dllexport)
         int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1,
         int64_t const n_threads) {
   int64_t shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   #pragma omp parallel for num_threads(n_threads)
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
   for (shot = 0; shot < n_shots; ++shot) {
     int64_t const i = shot * ny * nx;
     int64_t const si = shot * n_sources_per_shot;
     int64_t const ri = shot * n_receivers_per_shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
     int64_t const threadi = omp_get_thread_num() * ny * nx;
 #else
     int64_t const threadi = 0;
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
     int64_t t;
     for (t = nt - 1; t >= 0; --t) {
       if ((nt - 1 - t) & 1) {
@@ -351,9 +351,9 @@ __declspec(dllexport)
       }
     }
   }
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   if (v_requires_grad && n_threads > 1) {
     combine_grad(grad_v, grad_v_thread, n_threads, ny, nx);
   }
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
 }
diff --git a/src/deepwave/scalar_born.c b/src/deepwave/scalar_born.c
index 16320f0..eefd0da 100644
--- a/src/deepwave/scalar_born.c
+++ b/src/deepwave/scalar_born.c
@@ -17,9 +17,9 @@
  *    backward.
  */
 
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
 #include <omp.h>
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
 #include <stdbool.h>
 #include <stdint.h>
 #include "common.h"
@@ -477,9 +477,9 @@ __declspec(dllexport)
         bool const v_requires_grad, bool const scatter_requires_grad, int64_t const pml_y0, int64_t const pml_y1,
         int64_t const pml_x0, int64_t const pml_x1, int64_t const n_threads) {
   int64_t shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   #pragma omp parallel for num_threads(n_threads)
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
   for (shot = 0; shot < n_shots; ++shot) {
     int64_t const i = shot * ny * nx;
     int64_t const si = shot * n_sources_per_shot;
@@ -569,20 +569,20 @@ __declspec(dllexport)
         int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1,
         int64_t const n_threads) {
   int64_t shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   #pragma omp parallel for num_threads(n_threads)
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
   for (shot = 0; shot < n_shots; ++shot) {
     int64_t const i = shot * ny * nx;
     int64_t const si = shot * n_sources_per_shot;
     int64_t const sisc = shot * n_sourcessc_per_shot;
     int64_t const ri = shot * n_receivers_per_shot;
     int64_t const risc = shot * n_receiverssc_per_shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
     int64_t const threadi = omp_get_thread_num() * ny * nx;
 #else
     int64_t const threadi = 0;
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
     int64_t t;
     for (t = nt - 1; t >= 0; --t) {
       if ((nt - 1 - t) & 1) {
@@ -631,14 +631,14 @@ __declspec(dllexport)
       }
     }
   }
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   if (v_requires_grad && n_threads > 1) {
     combine_grad(grad_v, grad_v_thread, n_threads, ny, nx);
   }
   if (scatter_requires_grad && n_threads > 1) {
     combine_grad(grad_scatter, grad_scatter_thread, n_threads, ny, nx);
   }
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
 }
 
 #ifdef _WIN32
@@ -670,18 +670,18 @@ __declspec(dllexport)
         int64_t const pml_y1, int64_t const pml_x0, int64_t const pml_x1,
         int64_t const n_threads) {
   int64_t shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   #pragma omp parallel for num_threads(n_threads)
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
   for (shot = 0; shot < n_shots; ++shot) {
     int64_t const i = shot * ny * nx;
     int64_t const sisc = shot * n_sourcessc_per_shot;
     int64_t const risc = shot * n_receiverssc_per_shot;
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
     int64_t const threadi = omp_get_thread_num() * ny * nx;
 #else
     int64_t const threadi = 0;
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
     int64_t t;
     for (t = nt - 1; t >= 0; --t) {
       if ((nt - 1 - t) & 1) {
@@ -715,9 +715,9 @@ __declspec(dllexport)
       }
     }
   }
-#ifdef DW_USE_OPENMP
+#ifdef _OPENMP
   if (scatter_requires_grad && n_threads > 1) {
     combine_grad(grad_scatter, grad_scatter_thread, n_threads, ny, nx);
   }
-#endif /* DW_USE_OPENMP */
+#endif /* _OPENMP */
 }