Skip to content

Commit

Permalink
Merge pull request #2868 from boutproject/dump-openmp
Browse files Browse the repository at this point in the history
Add number of OpenMP threads to dump file
  • Loading branch information
bendudson committed Feb 23, 2024
2 parents 42c6805 + 2a1185d commit 5d0a1ad
Show file tree
Hide file tree
Showing 47 changed files with 351 additions and 333 deletions.
2 changes: 2 additions & 0 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ SpacesInParentheses: false
SpacesInSquareBrackets: false
StatementMacros:
- BOUT_OMP
- BOUT_OMP_PERF
- BOUT_OMP_SAFE
Standard: c++14
TabWidth: 8
UseTab: Never
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -774,7 +774,7 @@ set(BOUT_HAS_PNETCDF OFF)
# while for static builds we need the dependencies too
if (BUILD_SHARED_LIBS)
# Include rpath linker flag so user doesn't need to set LD_LIBRARY_PATH
set(CONFIG_LDFLAGS "${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG}\$BOUT_LIB_PATH -L\$BOUT_LIB_PATH -lbout++ -lfmt")
set(CONFIG_LDFLAGS "${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG}\$BOUT_LIB_PATH -L\$BOUT_LIB_PATH -lbout++ -lfmt ${CONFIG_LDFLAGS_SHARED}")
else()
set(CONFIG_LDFLAGS "${CONFIG_LDFLAGS}")
endif()
Expand Down
1 change: 1 addition & 0 deletions bin/bout-config.in
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ all()
echo " --has-slepc -> $has_slepc"
echo " --has-arkode -> $has_arkode"
echo " --has-nls -> $has_nls"
echo " --has-openmp -> $has_openmp"
echo
echo " --petsc-has-sundials -> $petsc_has_sundials"
echo
Expand Down
3 changes: 3 additions & 0 deletions cmake/SetupBOUTThirdParty.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ endif ()
# determined in SetupCompilers.cmake
if (BOUT_USE_OPENMP)
target_link_libraries(bout++ PUBLIC OpenMP::OpenMP_CXX)
set(CONFIG_LDFLAGS "${CONFIG_LDFLAGS} -fopenmp")
set(CONFIG_LDFLAGS_SHARED "${CONFIG_LDFLAGS_SHARED} -fopenmp")
set(CONFIG_CFLAGS "${CONFIG_CFLAGS} -fopenmp")
endif()

# determined in SetupCompilers.cmake
Expand Down
6 changes: 3 additions & 3 deletions examples/performance/iterator-offsets/iterator-offsets.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ int main(int argc, char** argv) {
#if BOUT_USE_OPENMP
ITERATOR_TEST_BLOCK(
"Nested loop (omp)",
BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for(int i=0;i<mesh->LocalNx;++i) {
for (int j = mesh->ystart; j < mesh->yend; ++j) {
for (int k = 0; k < mesh->LocalNz; ++k) {
Expand All @@ -98,7 +98,7 @@ int main(int argc, char** argv) {
deriv(a, result, "RGN_NOY"););

ITERATOR_TEST_BLOCK(
"Region with stencil", BOUT_OMP(parallel) {
"Region with stencil", BOUT_OMP_PERF(parallel) {
stencil s;
BOUT_FOR_INNER(i, mesh->getRegion3D("RGN_NOY")) {
s.m = a[i.ym()];
Expand All @@ -110,7 +110,7 @@ int main(int argc, char** argv) {
});

ITERATOR_TEST_BLOCK(
"Region with stencil and function pointer", BOUT_OMP(parallel) {
"Region with stencil and function pointer", BOUT_OMP_PERF(parallel) {
stencil s;
BOUT_FOR_INNER(i, mesh->getRegion3D("RGN_NOY")) {
s.m = a[i.ym()];
Expand Down
4 changes: 2 additions & 2 deletions examples/performance/iterator/iterator.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ int main(int argc, char** argv) {
"C loop", for (int j = 0; j < len; ++j) { rd[j] = ad[j] + bd[j]; };);
#if BOUT_USE_OPENMP
ITERATOR_TEST_BLOCK("C loop (omp)",
BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for(int j=0;j<len;++j) {
rd[j] = ad[j] + bd[j];
};
Expand All @@ -85,7 +85,7 @@ int main(int argc, char** argv) {

#if BOUT_USE_OPENMP
ITERATOR_TEST_BLOCK("Nested loop (omp)",
BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for(int i=0;i<mesh->LocalNx;++i) {
for (int j = 0; j < mesh->LocalNy; ++j) {
for (int k = 0; k < mesh->LocalNz; ++k) {
Expand Down
12 changes: 2 additions & 10 deletions include/bout/array.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
#include <memory>
#include <vector>

#ifdef _OPENMP
#if BOUT_USE_OPENMP
#include <omp.h>
#endif

Expand Down Expand Up @@ -375,22 +375,14 @@ private:
* @param[in] cleanup If set to true, deletes all dataBlock and clears the store
*/
static storeType& store(bool cleanup = false) {
#ifdef _OPENMP
static arenaType arena(omp_get_max_threads());
#else
static arenaType arena(1);
#endif
if (!cleanup) {
#ifdef _OPENMP
return arena[omp_get_thread_num()];
#else
return arena[0];
#endif
}

// Clean by deleting all data -- possible that just stores.clear() is
// sufficient rather than looping over each entry.
BOUT_OMP(single)
BOUT_OMP_SAFE(single)
{
for (auto& stores : arena) {
for (auto& p : stores) {
Expand Down
26 changes: 13 additions & 13 deletions include/bout/cyclic_reduction.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ public:
Matrix<T> bMatrix(1, N);
Matrix<T> cMatrix(1, N);

BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for (int i = 0; i < N; ++i) {
aMatrix(0, i) = a[i];
bMatrix(0, i) = b[i];
Expand All @@ -126,7 +126,7 @@ public:
allocMemory(nprocs, nsys, N);

// Fill coefficient array
BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for (int j = 0; j < Nsys; j++) {
for (int i = 0; i < N; i++) {
coefs(j, 4 * i) = a(j, i);
Expand All @@ -149,7 +149,7 @@ public:
Matrix<T> xMatrix(1, N);

// Copy input data into matrix
BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for (int i = 0; i < N; ++i) {
rhsMatrix(0, i) = rhs[i];
}
Expand All @@ -158,7 +158,7 @@ public:
solve(rhsMatrix, xMatrix);

// Copy result back into argument
BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for (int i = 0; i < N; ++i) {
x[i] = xMatrix(0, i);
}
Expand All @@ -184,7 +184,7 @@ public:

// Insert RHS into coefs array. Ordered to allow efficient partitioning
// for MPI send/receives
BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for (int j = 0; j < Nsys; j++) {
for (int i = 0; i < N; i++) {
coefs(j, 4 * i + 3) = rhs(j, i);
Expand Down Expand Up @@ -230,7 +230,7 @@ public:

if (p == myproc) {
// Just copy the data
BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for (int i = 0; i < myns; i++) {
for (int j = 0; j < 8; j++) {
ifcs(i, 8 * p + j) = myif(sys0 + i, j);
Expand Down Expand Up @@ -285,7 +285,7 @@ public:
#ifdef DIAGNOSE
output << "Copying received data from " << p << endl;
#endif
BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for (int i = 0; i < myns; i++) {
for (int j = 0; j < 8; j++) {
#ifdef DIAGNOSE
Expand Down Expand Up @@ -317,7 +317,7 @@ public:
x1.ensureUnique();
xn.ensureUnique();

BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for (int i = 0; i < myns; ++i) {
// (a b) (x1) = (b1)
// (c d) (xn) (bn)
Expand Down Expand Up @@ -364,7 +364,7 @@ public:

if (p == myproc) {
// Just copy the data
BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for (int i = 0; i < myns; i++) {
x1[sys0 + i] = ifx(i, 2 * p);
xn[sys0 + i] = ifx(i, 2 * p + 1);
Expand All @@ -389,7 +389,7 @@ public:
// Send data
for (int p = 0; p < nprocs; p++) { // Loop over processor
if (p != myproc) {
BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for (int i = 0; i < myns; i++) {
ifp[2 * i] = ifx(i, 2 * p);
ifp[2 * i + 1] = ifx(i, 2 * p + 1);
Expand Down Expand Up @@ -427,7 +427,7 @@ public:
nsp++;
}

BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for (int i = 0; i < nsp; i++) {
x1[s0 + i] = recvbuffer(fromproc, 2 * i);
xn[s0 + i] = recvbuffer(fromproc, 2 * i + 1);
Expand Down Expand Up @@ -540,7 +540,7 @@ private:
}
#endif

BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for (int j = 0; j < ns; j++) {
// Calculate upper interface equation

Expand Down Expand Up @@ -619,7 +619,7 @@ private:
// Tridiagonal system, solve using serial Thomas algorithm
// xa -- Result for each system
// co -- Coefficients & rhs for each system
BOUT_OMP(parallel for)
BOUT_OMP_PERF(parallel for)
for (int i = 0; i < ns; i++) { // Loop over systems
Array<T> gam(nloc); // Thread-local array
T bet = 1.0;
Expand Down
4 changes: 2 additions & 2 deletions include/bout/hypre_interface.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ public:
weights.begin(), weights.end(), std::back_inserter(values),
[&value_](BoutReal weight) -> HYPRE_Complex { return weight * value_; });
const HYPRE_BigInt ncolumns = static_cast<HYPRE_BigInt>(positions.size());
// BOUT_OMP(critical)
// BOUT_OMP_SAFE(critical)
for (HYPRE_BigInt i = 0; i < ncolumns; ++i) {
matrix->setVal(row, positions[i], values[i]);
}
Expand All @@ -495,7 +495,7 @@ public:
weights.begin(), weights.end(), std::back_inserter(values),
[&value_](BoutReal weight) -> HYPRE_Complex { return weight * value_; });
const HYPRE_BigInt ncolumns = static_cast<HYPRE_BigInt>(positions.size());
// BOUT_OMP(critical)
// BOUT_OMP_SAFE(critical)
for (HYPRE_BigInt i = 0; i < ncolumns; ++i) {
matrix->addVal(row, positions[i], values[i]);
}
Expand Down
27 changes: 26 additions & 1 deletion include/bout/openmpwrap.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@
#ifndef BOUT_OPENMPWRAP_H
#define BOUT_OPENMPWRAP_H

#include "bout/build_defines.hxx"

#if BOUT_USE_OPENMP || defined(_OPENMP)
#include "omp.h"
#endif

#ifdef _OPENMP
//Some helpers for indirection -- required so that the _Pragma gets "omp <x>"
//where <x> is any number of valid omp options/environments (e.g. atomic, critical etc.)
#define INDIRECT0(a) #a
Expand All @@ -35,12 +42,30 @@

//Define a macro wrapper to the use of `#pragma omp` to avoid unknown pragma
//warnings when compiling without openmp support.
#if BOUT_USE_OPENMP
#define BOUT_OMP_SAFE(...) _Pragma(INDIRECT2(__VA_ARGS__))
#define BOUT_OMP(...) _Pragma(INDIRECT2(__VA_ARGS__))
#else
#define BOUT_OMP_SAFE(...)
#define BOUT_OMP(...)
#endif

// BOUT_OMP_PERF wraps an OpenMP pragma behind the BOUT_USE_OPENMP build
// switch: when the switch is true the arguments are passed through INDIRECT2
// into _Pragma; otherwise the macro expands to nothing.
#if BOUT_USE_OPENMP

// This block relies on INDIRECT2 having been defined earlier; stop the
// compilation with an explicit message if it has not been.
#ifndef INDIRECT2
#error expected macro INDIRECT2 to be available
#endif

#define BOUT_OMP_PERF(...) _Pragma(INDIRECT2(__VA_ARGS__))
#else
// Empty expansion: the annotated statement is compiled without any pragma.
#define BOUT_OMP_PERF(...)
#endif

#ifndef _OPENMP
// Serial stand-ins for the OpenMP runtime queries, compiled only when the
// compiler does not define _OPENMP. They describe a single thread whose
// index is 0, so callers can use these names without their own guards.
constexpr int omp_get_max_threads() { return 1; }
constexpr int omp_get_num_threads() { return 1; }
constexpr int omp_get_thread_num() { return 0; }
#endif

//Perhaps want to cleanup local helpers with below, but DON'T!
//This would cause uses of BOUT_OMP to break
// #undef INDIRECT0
Expand Down
8 changes: 4 additions & 4 deletions include/bout/petsc_interface.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ public:
#endif
BoutReal value = BoutNaN;
int status = 0;
BOUT_OMP(critical)
BOUT_OMP_SAFE(critical)
status = VecGetValues(*get(), 1, &global, &value);
if (status != 0) {
throw BoutException("Error when getting element of a PETSc vector.");
Expand Down Expand Up @@ -355,7 +355,7 @@ public:
PetscBool assembled = PETSC_FALSE;
MatAssembled(*petscMatrix, &assembled);
if (assembled == PETSC_TRUE) {
BOUT_OMP(critical)
BOUT_OMP_SAFE(critical)
MatGetValues(*petscMatrix, 1, &petscRow, 1, &petscCol, &value);
} else {
value = 0.;
Expand Down Expand Up @@ -400,7 +400,7 @@ public:
[&val](BoutReal weight) -> PetscScalar { return weight * val; });

int status = 0;
BOUT_OMP(critical)
BOUT_OMP_SAFE(critical)
status = MatSetValues(*petscMatrix, 1, &petscRow, positions.size(),
positions.data(), values.data(), mode);
if (status != 0) {
Expand Down Expand Up @@ -467,7 +467,7 @@ public:
#endif
BoutReal value = BoutNaN;
int status = 0;
BOUT_OMP(critical)
BOUT_OMP_SAFE(critical)
status = MatGetValues(*get(), 1, &global1, 1, &global2, &value);
if (status != 0) {
throw BoutException("Error when getting elements of a PETSc matrix.");
Expand Down
2 changes: 1 addition & 1 deletion include/bout/region.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ class BoutMask;

#if BOUT_USE_OPENMP
#define BOUT_FOR_OMP(index, region, omp_pragmas) \
BOUT_OMP(omp_pragmas) \
BOUT_OMP_PERF(omp_pragmas) \
for (auto block = region.getBlocks().cbegin(); block < region.getBlocks().cend(); \
++block) \
for (auto index = block->first; index < block->second; ++index)
Expand Down
4 changes: 2 additions & 2 deletions manual/sphinx/developer_docs/data_types.rst
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ version of the macro::
For loops inside parallel regions, there is ``BOUT_FOR_INNER``::

Field3D f(0.0);
BOUT_OMP(parallel) {
BOUT_OMP_PERF(parallel) {
BOUT_FOR_INNER(i, f.getMesh()->getRegion3D("RGN_ALL")) {
f[i] = a[i] + b[i];
}
Expand Down Expand Up @@ -357,7 +357,7 @@ Tuning BOUT_FOR loops
The ``BOUT_FOR`` macros use two nested loops: The outer loop is OpenMP
parallelised, and iterates over contiguous blocks::

BOUT_OMP(parallel for schedule(guided))
BOUT_OMP_PERF(parallel for schedule(guided))
for (auto block = region.getBlocks().cbegin();
block < region.getBlocks().cend();
++block)
Expand Down
8 changes: 3 additions & 5 deletions src/bout++.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -582,11 +582,8 @@ void printCompileTimeOptions() {
output_info.write(_("\tSUNDIALS support {}\n"), is_enabled(has_sundials));
output_info.write(_("\tBacktrace in exceptions {}\n"), is_enabled(use_backtrace));
output_info.write(_("\tColour in logs {}\n"), is_enabled(use_color));
output_info.write(_("\tOpenMP parallelisation {}"), is_enabled(use_openmp));
#ifdef _OPENMP
output_info.write(_(", using {} threads"), omp_get_max_threads());
#endif
output_info.write("\n");
output_info.write(_("\tOpenMP parallelisation {}, using {} threads\n"),
is_enabled(use_openmp), omp_get_max_threads());
output_info.write(_("\tExtra debug output {}\n"), is_enabled(use_output_debug));
output_info.write(_("\tFloating-point exceptions {}\n"), is_enabled(use_sigfpe));
output_info.write(_("\tSignal handling support {}\n"), is_enabled(use_signal));
Expand Down Expand Up @@ -715,6 +712,7 @@ void addBuildFlagsToOptions(Options& options) {
options["use_backtrace"].force(bout::build::use_backtrace);
options["use_color"].force(bout::build::use_color);
options["use_openmp"].force(bout::build::use_openmp);
options["openmp_threads"].force(omp_get_max_threads());
options["use_output_debug"].force(bout::build::use_output_debug);
options["use_sigfpe"].force(bout::build::use_sigfpe);
options["use_signal"].force(bout::build::use_signal);
Expand Down
4 changes: 2 additions & 2 deletions src/field/field3d.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,7 @@ Field3D filter(const Field3D& var, int N0, const std::string& rgn) {

const Region<Ind2D>& region = var.getRegion2D(region_str);

BOUT_OMP(parallel)
BOUT_OMP_PERF(parallel)
{
Array<dcomplex> f(ncz / 2 + 1);

Expand Down Expand Up @@ -668,7 +668,7 @@ Field3D lowPass(const Field3D& var, int zmax, bool keep_zonal, const std::string

const Region<Ind2D>& region = var.getRegion2D(region_str);

BOUT_OMP(parallel)
BOUT_OMP_PERF(parallel)
{
Array<dcomplex> f(ncz / 2 + 1);

Expand Down

0 comments on commit 5d0a1ad

Please sign in to comment.