From 0cade0ca0ebc1d7fb1cbfd4f81709ad8340fae1c Mon Sep 17 00:00:00 2001
From: Tom Eccles
Date: Fri, 12 Jul 2024 10:28:11 +0100
Subject: [PATCH] [mlir][OpenMP] Lower REDUCTION clause for SECTIONS construct
 (#97859)

This shares code with WsloopOp (the changes to Wsloop should be NFC).
OpenMPIRBuilder essentially implements SECTIONS as a wsloop over a switch
statement, with each SECTION becoming a case selected by a particular value
of the loop induction variable.

Unfortunately, it proved very difficult to share this code with ParallelOp
as well. ParallelOp does quite a few things differently (doing more work
inside of the body-generation callback and laying out blocks differently).
Aligning the reduction implementations for wsloop and parallel will probably
involve functional changes to both, so I won't attempt that in this commit.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 510 ++++++++++--------
 mlir/test/Target/LLVMIR/openmp-llvm.mlir      |   2 +
 .../openmp-reduction-array-sections.mlir      | 214 ++++++++
 .../LLVMIR/openmp-reduction-sections.mlir     | 152 ++++++
 4 files changed, 665 insertions(+), 213 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir
 create mode 100644 mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 0c9c699a1f390b..a9ffe89252b462 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -555,6 +555,239 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder,
   return bodyGenStatus;
 }
 
+/// Allocate space for privatized reduction variables.
+template <typename T>
+static void allocByValReductionVars(
+    T loop, ArrayRef<BlockArgument> reductionArgs, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation,
+    llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+    SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+    SmallVectorImpl<llvm::Value *> &privateReductionVariables,
+    DenseMap<Value, llvm::Value *> &reductionVariableMap,
+    llvm::ArrayRef<bool> isByRefs) {
+  llvm::IRBuilderBase::InsertPointGuard guard(builder);
+  builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
+
+  for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) {
+    if (isByRefs[i])
+      continue;
+    llvm::Value *var = builder.CreateAlloca(
+        moduleTranslation.convertType(reductionDecls[i].getType()));
+    moduleTranslation.mapValue(reductionArgs[i], var);
+    privateReductionVariables[i] = var;
+    reductionVariableMap.try_emplace(loop.getReductionVars()[i], var);
+  }
+}
+
+/// Map input argument to all reduction initialization regions
+template <typename T>
+static void
+mapInitializationArg(T loop, LLVM::ModuleTranslation &moduleTranslation,
+                     SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+                     unsigned i) {
+  // map input argument to the initialization region
+  mlir::omp::DeclareReductionOp &reduction = reductionDecls[i];
+  Region &initializerRegion = reduction.getInitializerRegion();
+  Block &entry = initializerRegion.front();
+  assert(entry.getNumArguments() == 1 &&
+         "the initialization region has one argument");
+
+  mlir::Value mlirSource = loop.getReductionVars()[i];
+  llvm::Value *llvmSource = moduleTranslation.lookupValue(mlirSource);
+  assert(llvmSource && "lookup reduction var");
+  moduleTranslation.mapValue(entry.getArgument(0), llvmSource);
+}
+
+/// Collect reduction info
+template <typename T>
+static void collectReductionInfo(
+    T loop, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation,
+    SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+    SmallVectorImpl<OwningReductionGen> &owningReductionGens,
+    SmallVectorImpl<OwningAtomicReductionGen> &owningAtomicReductionGens,
+    const ArrayRef<llvm::Value *> privateReductionVariables,
+    SmallVectorImpl<llvm::OpenMPIRBuilder::ReductionInfo> &reductionInfos) {
+  unsigned numReductions = loop.getNumReductionVars();
+
+  for (unsigned i = 0; i < numReductions; ++i) {
+    owningReductionGens.push_back(
+        makeReductionGen(reductionDecls[i], builder, moduleTranslation));
+    owningAtomicReductionGens.push_back(
+        makeAtomicReductionGen(reductionDecls[i], builder, moduleTranslation));
+  }
+
+  // Collect the reduction information.
+  reductionInfos.reserve(numReductions);
+  for (unsigned i = 0; i < numReductions; ++i) {
+    llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy atomicGen = nullptr;
+    if (owningAtomicReductionGens[i])
+      atomicGen = owningAtomicReductionGens[i];
+    llvm::Value *variable =
+        moduleTranslation.lookupValue(loop.getReductionVars()[i]);
+    reductionInfos.push_back(
+        {moduleTranslation.convertType(reductionDecls[i].getType()), variable,
+         privateReductionVariables[i],
+         /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvalKind::Scalar,
+         owningReductionGens[i],
+         /*ReductionGenClang=*/nullptr, atomicGen});
+  }
+}
+
+/// handling of DeclareReductionOp's cleanup region
+static LogicalResult
+inlineOmpRegionCleanup(llvm::SmallVectorImpl<Region *> &cleanupRegions,
+                       llvm::ArrayRef<llvm::Value *> privateVariables,
+                       LLVM::ModuleTranslation &moduleTranslation,
+                       llvm::IRBuilderBase &builder, StringRef regionName,
+                       bool shouldLoadCleanupRegionArg = true) {
+  for (auto [i, cleanupRegion] : llvm::enumerate(cleanupRegions)) {
+    if (cleanupRegion->empty())
+      continue;
+
+    // map the argument to the cleanup region
+    Block &entry = cleanupRegion->front();
+
+    llvm::Instruction *potentialTerminator =
+        builder.GetInsertBlock()->empty() ? nullptr
+                                          : &builder.GetInsertBlock()->back();
+    if (potentialTerminator && potentialTerminator->isTerminator())
+      builder.SetInsertPoint(potentialTerminator);
+    llvm::Value *privateVarValue =
+        shouldLoadCleanupRegionArg
+            ? builder.CreateLoad(
+                  moduleTranslation.convertType(entry.getArgument(0).getType()),
+                  privateVariables[i])
+            : privateVariables[i];
+
+    moduleTranslation.mapValue(entry.getArgument(0), privateVarValue);
+
+    if (failed(inlineConvertOmpRegions(*cleanupRegion, regionName, builder,
+                                       moduleTranslation)))
+      return failure();
+
+    // clear block argument mapping in case it needs to be re-created with a
+    // different source for another use of the same reduction decl
+    moduleTranslation.forgetMapping(*cleanupRegion);
+  }
+  return success();
+}
+
+// TODO: not used by ParallelOp
+template <typename OP>
+static LogicalResult createReductionsAndCleanup(
+    OP op, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation,
+    llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+    SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+    ArrayRef<llvm::Value *> privateReductionVariables, ArrayRef<bool> isByRef) {
+  // Process the reductions if required.
+  if (op.getNumReductionVars() == 0)
+    return success();
+
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+
+  // Create the reduction generators. We need to own them here because
+  // ReductionInfo only accepts references to the generators.
+  SmallVector<OwningReductionGen> owningReductionGens;
+  SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;
+  SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
+  collectReductionInfo(op, builder, moduleTranslation, reductionDecls,
+                       owningReductionGens, owningAtomicReductionGens,
+                       privateReductionVariables, reductionInfos);
+
+  // The call to createReductions below expects the block to have a
+  // terminator. Create an unreachable instruction to serve as terminator
+  // and remove it later.
+  llvm::UnreachableInst *tempTerminator = builder.CreateUnreachable();
+  builder.SetInsertPoint(tempTerminator);
+  llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint =
+      ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos,
+                                   isByRef, op.getNowait());
+  if (!contInsertPoint.getBlock())
+    return op->emitOpError() << "failed to convert reductions";
+  auto nextInsertionPoint =
+      ompBuilder->createBarrier(contInsertPoint, llvm::omp::OMPD_for);
+  tempTerminator->eraseFromParent();
+  builder.restoreIP(nextInsertionPoint);
+
+  // after the construct, deallocate private reduction variables
+  SmallVector<Region *> reductionRegions;
+  llvm::transform(reductionDecls, std::back_inserter(reductionRegions),
+                  [](omp::DeclareReductionOp reductionDecl) {
+                    return &reductionDecl.getCleanupRegion();
+                  });
+  return inlineOmpRegionCleanup(reductionRegions, privateReductionVariables,
+                                moduleTranslation, builder,
+                                "omp.reduction.cleanup");
+  return success();
+}
+
+static ArrayRef<bool> getIsByRef(std::optional<ArrayRef<bool>> attr) {
+  if (!attr)
+    return {};
+  return *attr;
+}
+
+// TODO: not used by omp.parallel
+template <typename OP>
+static LogicalResult allocAndInitializeReductionVars(
+    OP op, ArrayRef<BlockArgument> reductionArgs, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation,
+    llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+    SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+    SmallVectorImpl<llvm::Value *> &privateReductionVariables,
+    DenseMap<Value, llvm::Value *> &reductionVariableMap,
+    llvm::ArrayRef<bool> isByRef) {
+  if (op.getNumReductionVars() == 0)
+    return success();
+
+  allocByValReductionVars(op, reductionArgs, builder, moduleTranslation,
+                          allocaIP, reductionDecls, privateReductionVariables,
+                          reductionVariableMap, isByRef);
+
+  // Before the loop, store the initial values of reductions into reduction
+  // variables. Although this could be done after allocas, we don't want to mess
+  // up with the alloca insertion point.
+  for (unsigned i = 0; i < op.getNumReductionVars(); ++i) {
+    SmallVector<llvm::Value *> phis;
+
+    // map block argument to initializer region
+    mapInitializationArg(op, moduleTranslation, reductionDecls, i);
+
+    if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
+                                       "omp.reduction.neutral", builder,
+                                       moduleTranslation, &phis)))
+      return failure();
+    assert(phis.size() == 1 && "expected one value to be yielded from the "
+                               "reduction neutral element declaration region");
+    if (isByRef[i]) {
+      // Allocate reduction variable (which is a pointer to the real reduction
+      // variable allocated in the inlined region)
+      llvm::Value *var = builder.CreateAlloca(
+          moduleTranslation.convertType(reductionDecls[i].getType()));
+      // Store the result of the inlined region to the allocated reduction var
+      // ptr
+      builder.CreateStore(phis[0], var);
+
+      privateReductionVariables[i] = var;
+      moduleTranslation.mapValue(reductionArgs[i], phis[0]);
+      reductionVariableMap.try_emplace(op.getReductionVars()[i], phis[0]);
+    } else {
+      // for by-ref case the store is inside of the reduction region
+      builder.CreateStore(phis[0], privateReductionVariables[i]);
+      // the rest was handled in allocByValReductionVars
+    }
+
+    // forget the mapping for the initializer region because we might need a
+    // different mapping if this reduction declaration is re-used for a
+    // different variable
+    moduleTranslation.forgetMapping(reductionDecls[i].getInitializerRegion());
+  }
+
+  return success();
+}
+
 static LogicalResult
 convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation) {
@@ -565,13 +798,38 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder,
   auto sectionsOp = cast<omp::SectionsOp>(opInst);
 
   // TODO: Support the following clauses: private, firstprivate, lastprivate,
-  // reduction, allocate
-  if (!sectionsOp.getReductionVars().empty() || sectionsOp.getReductions() ||
-      !sectionsOp.getAllocateVars().empty() ||
+  // allocate
+  if (!sectionsOp.getAllocateVars().empty() ||
       !sectionsOp.getAllocatorsVars().empty())
     return emitError(sectionsOp.getLoc())
-           << "reduction and allocate clauses are not supported for sections "
-              "construct";
+           << "allocate clause is not supported for sections construct";
+
+  llvm::ArrayRef<bool> isByRef = getIsByRef(sectionsOp.getReductionVarsByref());
+  assert(isByRef.size() == sectionsOp.getNumReductionVars());
+
+  SmallVector<omp::DeclareReductionOp> reductionDecls;
+  collectReductionDecls(sectionsOp, reductionDecls);
+  llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
+      findAllocaInsertPoint(builder, moduleTranslation);
+
+  SmallVector<llvm::Value *> privateReductionVariables(
+      sectionsOp.getNumReductionVars());
+  DenseMap<Value, llvm::Value *> reductionVariableMap;
+
+  MutableArrayRef<BlockArgument> reductionArgs =
+      sectionsOp.getRegion().getArguments();
+
+  if (failed(allocAndInitializeReductionVars(
+          sectionsOp, reductionArgs, builder, moduleTranslation, allocaIP,
+          reductionDecls, privateReductionVariables, reductionVariableMap,
+          isByRef)))
+    return failure();
+
+  // Store the mapping between reduction variables and their private copies on
+  // ModuleTranslation stack. It can be then recovered when translating
+  // omp.reduce operations in a separate call.
+  LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
+      moduleTranslation, reductionVariableMap);
 
   LogicalResult bodyGenStatus = success();
   SmallVector<StorableBodyGenCallbackTy> sectionCBs;
 
@@ -582,9 +840,24 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder,
       continue;
 
     Region &region = sectionOp.getRegion();
-    auto sectionCB = [&region, &builder, &moduleTranslation, &bodyGenStatus](
-                         InsertPointTy allocaIP, InsertPointTy codeGenIP) {
+    auto sectionCB = [&sectionsOp, &region, &builder, &moduleTranslation,
+                      &bodyGenStatus](InsertPointTy allocaIP,
+                                      InsertPointTy codeGenIP) {
       builder.restoreIP(codeGenIP);
+
+      // map the omp.section reduction block argument to the omp.sections block
+      // arguments
+      // TODO: this assumes that the only block arguments are reduction
+      // variables
+      assert(region.getNumArguments() ==
+             sectionsOp.getRegion().getNumArguments());
+      for (auto [sectionsArg, sectionArg] : llvm::zip_equal(
+               sectionsOp.getRegion().getArguments(), region.getArguments())) {
+        llvm::Value *llvmVal = moduleTranslation.lookupValue(sectionsArg);
+        assert(llvmVal);
+        moduleTranslation.mapValue(sectionArg, llvmVal);
+      }
+
       convertOmpOpRegions(region, "omp.section.region", builder,
                           moduleTranslation, bodyGenStatus);
     };
@@ -613,13 +886,19 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder,
   // called for variables which have destructors/finalizers.
   auto finiCB = [&](InsertPointTy codeGenIP) {};
 
-  llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
-      findAllocaInsertPoint(builder, moduleTranslation);
+  allocaIP = findAllocaInsertPoint(builder, moduleTranslation);
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
   builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createSections(
       ompLoc, allocaIP, sectionCBs, privCB, finiCB, false,
       sectionsOp.getNowait()));
-  return bodyGenStatus;
+
+  if (failed(bodyGenStatus))
+    return bodyGenStatus;
+
+  // Process the reductions if required.
+  return createReductionsAndCleanup(sectionsOp, builder, moduleTranslation,
+                                    allocaIP, reductionDecls,
+                                    privateReductionVariables, isByRef);
 }
 
 /// Converts an OpenMP single construct into LLVM IR using OpenMPIRBuilder.
@@ -769,131 +1048,6 @@ convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder,
       ompLoc, allocaIP, bodyCB));
   return bodyGenStatus;
 }
-
-/// Allocate space for privatized reduction variables.
-template -static void allocByValReductionVars( - T loop, ArrayRef reductionArgs, llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation, - llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, - SmallVectorImpl &reductionDecls, - SmallVectorImpl &privateReductionVariables, - DenseMap &reductionVariableMap, - llvm::ArrayRef isByRefs) { - llvm::IRBuilderBase::InsertPointGuard guard(builder); - builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); - - for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) { - if (isByRefs[i]) - continue; - llvm::Value *var = builder.CreateAlloca( - moduleTranslation.convertType(reductionDecls[i].getType())); - moduleTranslation.mapValue(reductionArgs[i], var); - privateReductionVariables[i] = var; - reductionVariableMap.try_emplace(loop.getReductionVars()[i], var); - } -} - -/// Map input argument to all reduction initialization regions -template -static void -mapInitializationArg(T loop, LLVM::ModuleTranslation &moduleTranslation, - SmallVectorImpl &reductionDecls, - unsigned i) { - // map input argument to the initialization region - mlir::omp::DeclareReductionOp &reduction = reductionDecls[i]; - Region &initializerRegion = reduction.getInitializerRegion(); - Block &entry = initializerRegion.front(); - assert(entry.getNumArguments() == 1 && - "the initialization region has one argument"); - - mlir::Value mlirSource = loop.getReductionVars()[i]; - llvm::Value *llvmSource = moduleTranslation.lookupValue(mlirSource); - assert(llvmSource && "lookup reduction var"); - moduleTranslation.mapValue(entry.getArgument(0), llvmSource); -} - -/// Collect reduction info -template -static void collectReductionInfo( - T loop, llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation, - SmallVector &reductionDecls, - SmallVector &owningReductionGens, - SmallVector &owningAtomicReductionGens, - const SmallVector &privateReductionVariables, - SmallVector &reductionInfos) { - unsigned numReductions = loop.getNumReductionVars(); - - for (unsigned i = 0; i < numReductions; ++i) { - owningReductionGens.push_back( - makeReductionGen(reductionDecls[i], builder, moduleTranslation)); - owningAtomicReductionGens.push_back( - makeAtomicReductionGen(reductionDecls[i], builder, moduleTranslation)); - } - - // Collect the reduction information. - reductionInfos.reserve(numReductions); - for (unsigned i = 0; i < numReductions; ++i) { - llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy atomicGen = nullptr; - if (owningAtomicReductionGens[i]) - atomicGen = owningAtomicReductionGens[i]; - llvm::Value *variable = - moduleTranslation.lookupValue(loop.getReductionVars()[i]); - reductionInfos.push_back( - {moduleTranslation.convertType(reductionDecls[i].getType()), variable, - privateReductionVariables[i], - /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvalKind::Scalar, - owningReductionGens[i], - /*ReductionGenClang=*/nullptr, atomicGen}); - } -} - -/// handling of DeclareReductionOp's cleanup region -static LogicalResult -inlineOmpRegionCleanup(llvm::SmallVectorImpl &cleanupRegions, - llvm::ArrayRef privateVariables, - LLVM::ModuleTranslation &moduleTranslation, - llvm::IRBuilderBase &builder, StringRef regionName, - bool shouldLoadCleanupRegionArg = true) { - for (auto [i, cleanupRegion] : llvm::enumerate(cleanupRegions)) { - if (cleanupRegion->empty()) - continue; - - // map the argument to the cleanup region - Block &entry = cleanupRegion->front(); - - llvm::Instruction *potentialTerminator = - builder.GetInsertBlock()->empty() ? 
nullptr - : &builder.GetInsertBlock()->back(); - if (potentialTerminator && potentialTerminator->isTerminator()) - builder.SetInsertPoint(potentialTerminator); - llvm::Value *prviateVarValue = - shouldLoadCleanupRegionArg - ? builder.CreateLoad( - moduleTranslation.convertType(entry.getArgument(0).getType()), - privateVariables[i]) - : privateVariables[i]; - - moduleTranslation.mapValue(entry.getArgument(0), prviateVarValue); - - if (failed(inlineConvertOmpRegions(*cleanupRegion, regionName, builder, - moduleTranslation))) - return failure(); - - // clear block argument mapping in case it needs to be re-created with a - // different source for another use of the same reduction decl - moduleTranslation.forgetMapping(*cleanupRegion); - } - return success(); -} - -static ArrayRef getIsByRef(std::optional> attr) { - if (!attr) - return {}; - return *attr; -} - /// Converts an OpenMP workshare loop into LLVM IR using OpenMPIRBuilder. static LogicalResult convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, @@ -933,48 +1087,11 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, MutableArrayRef reductionArgs = wsloopOp.getRegion().getArguments(); - allocByValReductionVars(wsloopOp, reductionArgs, builder, moduleTranslation, - allocaIP, reductionDecls, privateReductionVariables, - reductionVariableMap, isByRef); - - // Before the loop, store the initial values of reductions into reduction - // variables. Although this could be done after allocas, we don't want to mess - // up with the alloca insertion point. - for (unsigned i = 0; i < wsloopOp.getNumReductionVars(); ++i) { - SmallVector phis; - - // map block argument to initializer region - mapInitializationArg(wsloopOp, moduleTranslation, reductionDecls, i); - - if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(), - "omp.reduction.neutral", builder, - moduleTranslation, &phis))) - return failure(); - assert(phis.size() == 1 && "expected one value to be yielded from the " - "reduction neutral element declaration region"); - if (isByRef[i]) { - // Allocate reduction variable (which is a pointer to the real reduction - // variable allocated in the inlined region) - llvm::Value *var = builder.CreateAlloca( - moduleTranslation.convertType(reductionDecls[i].getType())); - // Store the result of the inlined region to the allocated reduction var - // ptr - builder.CreateStore(phis[0], var); - - privateReductionVariables[i] = var; - moduleTranslation.mapValue(reductionArgs[i], phis[0]); - reductionVariableMap.try_emplace(wsloopOp.getReductionVars()[i], phis[0]); - } else { - // for by-ref case the store is inside of the reduction region - builder.CreateStore(phis[0], privateReductionVariables[i]); - // the rest was handled in allocByValReductionVars - } - - // forget the mapping for the initializer region because we might need a - // different mapping if this reduction declaration is re-used for a - // different variable - moduleTranslation.forgetMapping(reductionDecls[i].getInitializerRegion()); - } + if (failed(allocAndInitializeReductionVars( + wsloopOp, reductionArgs, builder, moduleTranslation, allocaIP, + reductionDecls, privateReductionVariables, reductionVariableMap, + isByRef))) + return failure(); // Store the mapping between reduction variables and their private copies on // ModuleTranslation stack. 
It can be then recovered when translating @@ -1067,42 +1184,9 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, builder.restoreIP(afterIP); // Process the reductions if required. - if (wsloopOp.getNumReductionVars() == 0) - return success(); - - // Create the reduction generators. We need to own them here because - // ReductionInfo only accepts references to the generators. - SmallVector owningReductionGens; - SmallVector owningAtomicReductionGens; - SmallVector reductionInfos; - collectReductionInfo(wsloopOp, builder, moduleTranslation, reductionDecls, - owningReductionGens, owningAtomicReductionGens, - privateReductionVariables, reductionInfos); - - // The call to createReductions below expects the block to have a - // terminator. Create an unreachable instruction to serve as terminator - // and remove it later. - llvm::UnreachableInst *tempTerminator = builder.CreateUnreachable(); - builder.SetInsertPoint(tempTerminator); - llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint = - ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos, - isByRef, wsloopOp.getNowait()); - if (!contInsertPoint.getBlock()) - return wsloopOp->emitOpError() << "failed to convert reductions"; - auto nextInsertionPoint = - ompBuilder->createBarrier(contInsertPoint, llvm::omp::OMPD_for); - tempTerminator->eraseFromParent(); - builder.restoreIP(nextInsertionPoint); - - // after the workshare loop, deallocate private reduction variables - SmallVector reductionRegions; - llvm::transform(reductionDecls, std::back_inserter(reductionRegions), - [](omp::DeclareReductionOp reductionDecl) { - return &reductionDecl.getCleanupRegion(); - }); - return inlineOmpRegionCleanup(reductionRegions, privateReductionVariables, - moduleTranslation, builder, - "omp.reduction.cleanup"); + return createReductionsAndCleanup(wsloopOp, builder, moduleTranslation, + allocaIP, reductionDecls, + privateReductionVariables, isByRef); } /// A RAII class that on construction replaces the region arguments of the diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index dfeaf4be33adb8..7f860268db11d4 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -2099,6 +2099,8 @@ llvm.func @omp_sections_empty() -> () { omp.sections { omp.terminator } + // CHECK-NEXT: br label %entry + // CHECK: entry: // CHECK-NEXT: ret void llvm.return } diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir new file mode 100644 index 00000000000000..5682e7e96ab186 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir @@ -0,0 +1,214 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// nonesense minimised code simulating the control flow graph generated by flang +// for array reductions. 
The important thing here is that we are testing a byref +// reduction with a cleanup region, and the various regions contain multiple +// blocks +omp.declare_reduction @add_reduction_byref_box_Uxf32 : !llvm.ptr init { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr + omp.yield(%1 : !llvm.ptr) +} combiner { +^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(0 : i64) : i64 + %1 = llvm.mlir.constant(0 : index) : i64 + %2 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb1(%0 : i64) +^bb1(%3: i64): // 2 preds: ^bb0, ^bb2 + %4 = llvm.icmp "sgt" %3, %1 : i64 + llvm.cond_br %4, ^bb2, ^bb3 +^bb2: // pred: ^bb1 + %5 = llvm.sub %3, %2 : i64 + llvm.br ^bb1(%5 : i64) +^bb3: // pred: ^bb1 + omp.yield(%arg0 : !llvm.ptr) +} cleanup { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(0 : i64) : i64 + %1 = llvm.ptrtoint %arg0 : !llvm.ptr to i64 + %2 = llvm.icmp "ne" %1, %0 : i64 + llvm.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + llvm.br ^bb2 +^bb2: // 2 preds: ^bb0, ^bb1 + omp.yield +} +llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attributes {fir.internal_name = "_QPsectionsreduction"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.mlir.constant(0 : index) : i64 + %2 = llvm.mlir.constant(1 : index) : i64 + omp.parallel { + %3 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr + omp.sections reduction(byref @add_reduction_byref_box_Uxf32 -> %3 : !llvm.ptr) { + ^bb0(%arg1: !llvm.ptr): + omp.section { + ^bb0(%arg2: !llvm.ptr): + llvm.br ^bb1(%0 : i64) + ^bb1(%4: i64): // 2 preds: ^bb0, ^bb2 + %5 = llvm.icmp "sgt" %4, %1 : i64 + llvm.cond_br %5, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %6 = llvm.sub %4, %2 : i64 + llvm.br ^bb1(%6 : i64) + ^bb3: // pred: ^bb1 + omp.terminator + } + omp.section { + ^bb0(%arg2: !llvm.ptr): + llvm.br ^bb1(%0 : i64) + ^bb1(%4: i64): // 2 preds: ^bb0, ^bb2 + %5 = llvm.icmp "sgt" %4, %1 : i64 + llvm.cond_br %5, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %6 = llvm.sub %4, %2 : i64 + llvm.br ^bb1(%6 : i64) + ^bb3: // pred: ^bb1 + omp.terminator + } + omp.terminator + } + omp.terminator + } + llvm.return +} + +// CHECK-LABEL: define internal void @sectionsreduction_..omp_par +// CHECK: omp.par.entry: +// CHECK: %[[VAL_6:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_7:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_8:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_9:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_10:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_11:.*]] = load i32, ptr %[[VAL_12:.*]], align 4 +// CHECK: store i32 %[[VAL_11]], ptr %[[VAL_10]], align 4 +// CHECK: %[[VAL_13:.*]] = load i32, ptr %[[VAL_10]], align 4 +// CHECK: %[[VAL_14:.*]] = alloca [1 x ptr], align 8 +// CHECK: br label %[[VAL_15:.*]] +// CHECK: omp.reduction.init: ; preds = %[[VAL_16:.*]] +// CHECK: br label %[[VAL_17:.*]] +// CHECK: omp.par.region: ; preds = %[[VAL_15]] +// CHECK: br label %[[VAL_18:.*]] +// CHECK: omp.par.region1: ; preds = %[[VAL_17]] +// CHECK: %[[VAL_19:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 +// CHECK: %[[VAL_20:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 +// CHECK: %[[VAL_21:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_21]], align 8 +// CHECK: br label %[[VAL_22:.*]] +// CHECK: omp_section_loop.preheader: ; preds = %[[VAL_18]] +// CHECK: store i32 0, ptr 
%[[VAL_7]], align 4 +// CHECK: store i32 1, ptr %[[VAL_8]], align 4 +// CHECK: store i32 1, ptr %[[VAL_9]], align 4 +// CHECK: %[[VAL_23:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %[[VAL_23]], i32 34, ptr %[[VAL_6]], ptr %[[VAL_7]], ptr %[[VAL_8]], ptr %[[VAL_9]], i32 1, i32 0) +// CHECK: %[[VAL_24:.*]] = load i32, ptr %[[VAL_7]], align 4 +// CHECK: %[[VAL_25:.*]] = load i32, ptr %[[VAL_8]], align 4 +// CHECK: %[[VAL_26:.*]] = sub i32 %[[VAL_25]], %[[VAL_24]] +// CHECK: %[[VAL_27:.*]] = add i32 %[[VAL_26]], 1 +// CHECK: br label %[[VAL_28:.*]] +// CHECK: omp_section_loop.header: ; preds = %[[VAL_29:.*]], %[[VAL_22]] +// CHECK: %[[VAL_30:.*]] = phi i32 [ 0, %[[VAL_22]] ], [ %[[VAL_31:.*]], %[[VAL_29]] ] +// CHECK: br label %[[VAL_32:.*]] +// CHECK: omp_section_loop.cond: ; preds = %[[VAL_28]] +// CHECK: %[[VAL_33:.*]] = icmp ult i32 %[[VAL_30]], %[[VAL_27]] +// CHECK: br i1 %[[VAL_33]], label %[[VAL_34:.*]], label %[[VAL_35:.*]] +// CHECK: omp_section_loop.exit: ; preds = %[[VAL_32]] +// CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_23]]) +// CHECK: %[[VAL_36:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_36]]) +// CHECK: br label %[[VAL_37:.*]] +// CHECK: omp_section_loop.after: ; preds = %[[VAL_35]] +// CHECK: br label %[[VAL_38:.*]] +// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_37]] +// CHECK: %[[VAL_39:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_14]], i64 0, i64 0 +// CHECK: store ptr %[[VAL_21]], ptr %[[VAL_39]], align 8 +// CHECK: %[[VAL_40:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[VAL_41:.*]] = call i32 @__kmpc_reduce(ptr @1, i32 %[[VAL_40]], i32 1, i64 8, ptr %[[VAL_14]], ptr @.omp.reduction.func, ptr @.gomp_critical_user_.reduction.var) +// CHECK: switch i32 %[[VAL_41]], label %[[VAL_42:.*]] [ +// CHECK: i32 1, label %[[VAL_43:.*]] +// CHECK: i32 2, label %[[VAL_44:.*]] +// CHECK: ] +// CHECK: reduce.switch.atomic: ; preds = %[[VAL_38]] +// CHECK: unreachable +// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_38]] +// CHECK: %[[VAL_45:.*]] = load ptr, ptr %[[VAL_21]], align 8 +// CHECK: br label %[[VAL_46:.*]] +// CHECK: omp.reduction.nonatomic.body: ; preds = %[[VAL_43]] +// CHECK: br label %[[VAL_47:.*]] +// CHECK: omp.reduction.nonatomic.body16: ; preds = %[[VAL_48:.*]], %[[VAL_46]] +// CHECK: %[[VAL_49:.*]] = phi i64 [ %[[VAL_50:.*]], %[[VAL_48]] ], [ 0, %[[VAL_46]] ] +// CHECK: %[[VAL_51:.*]] = icmp sgt i64 %[[VAL_49]], 0 +// CHECK: br i1 %[[VAL_51]], label %[[VAL_48]], label %[[VAL_52:.*]] +// CHECK: omp.reduction.nonatomic.body18: ; preds = %[[VAL_47]] +// CHECK: br label %[[VAL_53:.*]] +// CHECK: omp.region.cont15: ; preds = %[[VAL_52]] +// CHECK: %[[VAL_54:.*]] = phi ptr [ %[[VAL_19]], %[[VAL_52]] ] +// CHECK: call void @__kmpc_end_reduce(ptr @1, i32 %[[VAL_40]], ptr @.gomp_critical_user_.reduction.var) +// CHECK: br label %[[VAL_42]] +// CHECK: omp.reduction.nonatomic.body17: ; preds = %[[VAL_47]] +// CHECK: %[[VAL_50]] = sub i64 %[[VAL_49]], 1 +// CHECK: br label %[[VAL_47]] +// CHECK: reduce.finalize: ; preds = %[[VAL_53]], %[[VAL_38]] +// CHECK: %[[VAL_55:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_55]]) +// CHECK: %[[VAL_56:.*]] = load ptr, ptr %[[VAL_21]], align 8 +// CHECK: br label %[[VAL_57:.*]] +// CHECK: omp.reduction.cleanup: ; preds = %[[VAL_42]] +// CHECK: %[[VAL_58:.*]] = ptrtoint ptr %[[VAL_56]] to i64 +// 
CHECK: %[[VAL_59:.*]] = icmp ne i64 %[[VAL_58]], 0 +// CHECK: br i1 %[[VAL_59]], label %[[VAL_60:.*]], label %[[VAL_61:.*]] +// CHECK: omp.reduction.cleanup22: ; preds = %[[VAL_60]], %[[VAL_57]] +// CHECK: br label %[[VAL_62:.*]] +// CHECK: omp.region.cont20: ; preds = %[[VAL_61]] +// CHECK: br label %[[VAL_63:.*]] +// CHECK: omp.region.cont: ; preds = %[[VAL_62]] +// CHECK: br label %[[VAL_64:.*]] +// CHECK: omp.par.pre_finalize: ; preds = %[[VAL_63]] +// CHECK: br label %[[VAL_65:.*]] +// CHECK: omp.reduction.cleanup21: ; preds = %[[VAL_57]] +// CHECK: br label %[[VAL_61]] +// CHECK: omp_section_loop.body: ; preds = %[[VAL_32]] +// CHECK: %[[VAL_66:.*]] = add i32 %[[VAL_30]], %[[VAL_24]] +// CHECK: %[[VAL_67:.*]] = mul i32 %[[VAL_66]], 1 +// CHECK: %[[VAL_68:.*]] = add i32 %[[VAL_67]], 0 +// CHECK: switch i32 %[[VAL_68]], label %[[VAL_69:.*]] [ +// CHECK: i32 0, label %[[VAL_70:.*]] +// CHECK: i32 1, label %[[VAL_71:.*]] +// CHECK: ] +// CHECK: omp_section_loop.body.case6: ; preds = %[[VAL_34]] +// CHECK: br label %[[VAL_72:.*]] +// CHECK: omp.section.region8: ; preds = %[[VAL_71]] +// CHECK: br label %[[VAL_73:.*]] +// CHECK: omp.section.region9: ; preds = %[[VAL_74:.*]], %[[VAL_72]] +// CHECK: %[[VAL_75:.*]] = phi i64 [ %[[VAL_76:.*]], %[[VAL_74]] ], [ 1, %[[VAL_72]] ] +// CHECK: %[[VAL_77:.*]] = icmp sgt i64 %[[VAL_75]], 0 +// CHECK: br i1 %[[VAL_77]], label %[[VAL_74]], label %[[VAL_78:.*]] +// CHECK: omp.section.region11: ; preds = %[[VAL_73]] +// CHECK: br label %[[VAL_79:.*]] +// CHECK: omp.region.cont7: ; preds = %[[VAL_78]] +// CHECK: br label %[[VAL_69]] +// CHECK: omp.section.region10: ; preds = %[[VAL_73]] +// CHECK: %[[VAL_76]] = sub i64 %[[VAL_75]], 1 +// CHECK: br label %[[VAL_73]] +// CHECK: omp_section_loop.body.case: ; preds = %[[VAL_34]] +// CHECK: br label %[[VAL_80:.*]] +// CHECK: omp.section.region: ; preds = %[[VAL_70]] +// CHECK: br label %[[VAL_81:.*]] +// CHECK: omp.section.region3: ; preds = %[[VAL_82:.*]], %[[VAL_80]] +// CHECK: %[[VAL_83:.*]] = phi i64 [ %[[VAL_84:.*]], %[[VAL_82]] ], [ 1, %[[VAL_80]] ] +// CHECK: %[[VAL_85:.*]] = icmp sgt i64 %[[VAL_83]], 0 +// CHECK: br i1 %[[VAL_85]], label %[[VAL_82]], label %[[VAL_86:.*]] +// CHECK: omp.section.region5: ; preds = %[[VAL_81]] +// CHECK: br label %[[VAL_87:.*]] +// CHECK: omp.region.cont2: ; preds = %[[VAL_86]] +// CHECK: br label %[[VAL_69]] +// CHECK: omp.section.region4: ; preds = %[[VAL_81]] +// CHECK: %[[VAL_84]] = sub i64 %[[VAL_83]], 1 +// CHECK: br label %[[VAL_81]] +// CHECK: omp_section_loop.body.sections.after: ; preds = %[[VAL_79]], %[[VAL_87]], %[[VAL_34]] +// CHECK: br label %[[VAL_29]] +// CHECK: omp_section_loop.inc: ; preds = %[[VAL_69]] +// CHECK: %[[VAL_31]] = add nuw i32 %[[VAL_30]], 1 +// CHECK: br label %[[VAL_28]] +// CHECK: omp.par.outlined.exit.exitStub: ; preds = %[[VAL_64]] +// CHECK: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir new file mode 100644 index 00000000000000..694180a5ced373 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir @@ -0,0 +1,152 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +omp.declare_reduction @add_reduction_f32 : f32 init { +^bb0(%arg0: f32): + %0 = llvm.mlir.constant(0.000000e+00 : f32) : f32 + omp.yield(%0 : f32) +} combiner { +^bb0(%arg0: f32, %arg1: f32): + %0 = llvm.fadd %arg0, %arg1 {fastmathFlags = #llvm.fastmath} : f32 + omp.yield(%0 : f32) +} +llvm.func @sections_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attributes 
{fir.internal_name = "_QPsections"} { + %0 = llvm.mlir.constant(2.000000e+00 : f32) : f32 + %1 = llvm.mlir.constant(1.000000e+00 : f32) : f32 + omp.parallel { + omp.sections reduction(@add_reduction_f32 -> %arg0 : !llvm.ptr) { + ^bb0(%arg1: !llvm.ptr): + omp.section { + ^bb0(%arg2: !llvm.ptr): + %2 = llvm.load %arg2 : !llvm.ptr -> f32 + %3 = llvm.fadd %2, %1 {fastmathFlags = #llvm.fastmath} : f32 + llvm.store %3, %arg2 : f32, !llvm.ptr + omp.terminator + } + omp.section { + ^bb0(%arg2: !llvm.ptr): + %2 = llvm.load %arg2 : !llvm.ptr -> f32 + %3 = llvm.fadd %2, %0 {fastmathFlags = #llvm.fastmath} : f32 + llvm.store %3, %arg2 : f32, !llvm.ptr + omp.terminator + } + omp.terminator + } + omp.terminator + } + llvm.return +} + +// CHECK-LABEL: define internal void @sections_..omp_par +// CHECK: omp.par.entry: +// CHECK: %[[VAL_9:.*]] = getelementptr { ptr }, ptr %[[VAL_10:.*]], i32 0, i32 0 +// CHECK: %[[VAL_11:.*]] = load ptr, ptr %[[VAL_9]], align 8 +// CHECK: %[[VAL_12:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_13:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_14:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_15:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_16:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_17:.*]] = load i32, ptr %[[VAL_18:.*]], align 4 +// CHECK: store i32 %[[VAL_17]], ptr %[[VAL_16]], align 4 +// CHECK: %[[VAL_19:.*]] = load i32, ptr %[[VAL_16]], align 4 +// CHECK: %[[VAL_20:.*]] = alloca float, align 4 +// CHECK: %[[VAL_21:.*]] = alloca [1 x ptr], align 8 +// CHECK: br label %[[VAL_22:.*]] +// CHECK: omp.reduction.init: ; preds = %[[VAL_23:.*]] +// CHECK: br label %[[VAL_24:.*]] +// CHECK: omp.par.region: ; preds = %[[VAL_22]] +// CHECK: br label %[[VAL_25:.*]] +// CHECK: omp.par.region1: ; preds = %[[VAL_24]] +// CHECK: store float 0.000000e+00, ptr %[[VAL_20]], align 4 +// CHECK: br label %[[VAL_26:.*]] +// CHECK: omp_section_loop.preheader: ; preds = %[[VAL_25]] +// CHECK: store i32 0, ptr %[[VAL_13]], align 4 +// CHECK: store i32 1, ptr %[[VAL_14]], align 4 +// CHECK: store i32 1, ptr %[[VAL_15]], align 4 +// CHECK: %[[VAL_27:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %[[VAL_27]], i32 34, ptr %[[VAL_12]], ptr %[[VAL_13]], ptr %[[VAL_14]], ptr %[[VAL_15]], i32 1, i32 0) +// CHECK: %[[VAL_28:.*]] = load i32, ptr %[[VAL_13]], align 4 +// CHECK: %[[VAL_29:.*]] = load i32, ptr %[[VAL_14]], align 4 +// CHECK: %[[VAL_30:.*]] = sub i32 %[[VAL_29]], %[[VAL_28]] +// CHECK: %[[VAL_31:.*]] = add i32 %[[VAL_30]], 1 +// CHECK: br label %[[VAL_32:.*]] +// CHECK: omp_section_loop.header: ; preds = %[[VAL_33:.*]], %[[VAL_26]] +// CHECK: %[[VAL_34:.*]] = phi i32 [ 0, %[[VAL_26]] ], [ %[[VAL_35:.*]], %[[VAL_33]] ] +// CHECK: br label %[[VAL_36:.*]] +// CHECK: omp_section_loop.cond: ; preds = %[[VAL_32]] +// CHECK: %[[VAL_37:.*]] = icmp ult i32 %[[VAL_34]], %[[VAL_31]] +// CHECK: br i1 %[[VAL_37]], label %[[VAL_38:.*]], label %[[VAL_39:.*]] +// CHECK: omp_section_loop.exit: ; preds = %[[VAL_36]] +// CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_27]]) +// CHECK: %[[VAL_40:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_40]]) +// CHECK: br label %[[VAL_41:.*]] +// CHECK: omp_section_loop.after: ; preds = %[[VAL_39]] +// CHECK: br label %[[VAL_42:.*]] +// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_41]] +// CHECK: %[[VAL_43:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_21]], i64 0, i64 0 +// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_43]], 
align 8 +// CHECK: %[[VAL_44:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[VAL_45:.*]] = call i32 @__kmpc_reduce(ptr @1, i32 %[[VAL_44]], i32 1, i64 8, ptr %[[VAL_21]], ptr @.omp.reduction.func, ptr @.gomp_critical_user_.reduction.var) +// CHECK: switch i32 %[[VAL_45]], label %[[VAL_46:.*]] [ +// CHECK: i32 1, label %[[VAL_47:.*]] +// CHECK: i32 2, label %[[VAL_48:.*]] +// CHECK: ] +// CHECK: reduce.switch.atomic: ; preds = %[[VAL_42]] +// CHECK: unreachable +// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_42]] +// CHECK: %[[VAL_49:.*]] = load float, ptr %[[VAL_11]], align 4 +// CHECK: %[[VAL_50:.*]] = load float, ptr %[[VAL_20]], align 4 +// CHECK: %[[VAL_51:.*]] = fadd contract float %[[VAL_49]], %[[VAL_50]] +// CHECK: store float %[[VAL_51]], ptr %[[VAL_11]], align 4 +// CHECK: call void @__kmpc_end_reduce(ptr @1, i32 %[[VAL_44]], ptr @.gomp_critical_user_.reduction.var) +// CHECK: br label %[[VAL_46]] +// CHECK: reduce.finalize: ; preds = %[[VAL_47]], %[[VAL_42]] +// CHECK: %[[VAL_52:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_52]]) +// CHECK: br label %[[VAL_53:.*]] +// CHECK: omp.region.cont: ; preds = %[[VAL_46]] +// CHECK: br label %[[VAL_54:.*]] +// CHECK: omp.par.pre_finalize: ; preds = %[[VAL_53]] +// CHECK: br label %[[VAL_55:.*]] +// CHECK: omp_section_loop.body: ; preds = %[[VAL_36]] +// CHECK: %[[VAL_56:.*]] = add i32 %[[VAL_34]], %[[VAL_28]] +// CHECK: %[[VAL_57:.*]] = mul i32 %[[VAL_56]], 1 +// CHECK: %[[VAL_58:.*]] = add i32 %[[VAL_57]], 0 +// CHECK: switch i32 %[[VAL_58]], label %[[VAL_59:.*]] [ +// CHECK: i32 0, label %[[VAL_60:.*]] +// CHECK: i32 1, label %[[VAL_61:.*]] +// CHECK: ] +// CHECK: omp_section_loop.body.case3: ; preds = %[[VAL_38]] +// CHECK: br label %[[VAL_62:.*]] +// CHECK: omp.section.region5: ; preds = %[[VAL_61]] +// CHECK: %[[VAL_63:.*]] = load float, ptr %[[VAL_20]], align 4 +// CHECK: %[[VAL_64:.*]] = fadd contract float %[[VAL_63]], 2.000000e+00 +// CHECK: store float %[[VAL_64]], ptr %[[VAL_20]], align 4 +// CHECK: br label %[[VAL_65:.*]] +// CHECK: omp.region.cont4: ; preds = %[[VAL_62]] +// CHECK: br label %[[VAL_59]] +// CHECK: omp_section_loop.body.case: ; preds = %[[VAL_38]] +// CHECK: br label %[[VAL_66:.*]] +// CHECK: omp.section.region: ; preds = %[[VAL_60]] +// CHECK: %[[VAL_67:.*]] = load float, ptr %[[VAL_20]], align 4 +// CHECK: %[[VAL_68:.*]] = fadd contract float %[[VAL_67]], 1.000000e+00 +// CHECK: store float %[[VAL_68]], ptr %[[VAL_20]], align 4 +// CHECK: br label %[[VAL_69:.*]] +// CHECK: omp.region.cont2: ; preds = %[[VAL_66]] +// CHECK: br label %[[VAL_59]] +// CHECK: omp_section_loop.body.sections.after: ; preds = %[[VAL_65]], %[[VAL_69]], %[[VAL_38]] +// CHECK: br label %[[VAL_33]] +// CHECK: omp_section_loop.inc: ; preds = %[[VAL_59]] +// CHECK: %[[VAL_35]] = add nuw i32 %[[VAL_34]], 1 +// CHECK: br label %[[VAL_32]] +// CHECK: omp.par.outlined.exit.exitStub: ; preds = %[[VAL_54]] +// CHECK: ret void +// CHECK: %[[VAL_70:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_71:.*]], i64 0, i64 0 +// CHECK: %[[VAL_72:.*]] = load ptr, ptr %[[VAL_70]], align 8 +// CHECK: %[[VAL_73:.*]] = load float, ptr %[[VAL_72]], align 4 +// CHECK: %[[VAL_74:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_75:.*]], i64 0, i64 0 +// CHECK: %[[VAL_76:.*]] = load ptr, ptr %[[VAL_74]], align 8 +// CHECK: %[[VAL_77:.*]] = load float, ptr %[[VAL_76]], align 4 +// CHECK: %[[VAL_78:.*]] = fadd contract float %[[VAL_73]], %[[VAL_77]] +// CHECK: store float 
%[[VAL_78]], ptr %[[VAL_72]], align 4 +// CHECK: ret void
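
The two new tests above check the generated IR in full; as a conceptual summary, the SECTIONS-with-reduction lowering they verify behaves roughly like the following C sketch (an illustration only, with placeholder names run_sections, lb, and ub; it is not code emitted by OpenMPIRBuilder nor part of this patch):

    /* Each thread receives a sub-range [lb, ub] of the section indices from
       the runtime's static worksharing-loop init call; a switch on the loop
       induction variable selects which omp.section body runs on an iteration. */
    static float run_sections(int lb, int ub) {
      float red = 0.0f;             /* thread-private copy, seeded from the
                                       reduction's init region */
      for (int iv = lb; iv <= ub; ++iv) {
        switch (iv) {
        case 0: red += 1.0f; break; /* body of the first omp.section */
        case 1: red += 2.0f; break; /* body of the second omp.section */
        }
      }
      /* Per-thread copies are then combined into the original variable via
         __kmpc_reduce and the declared combiner region, followed by a barrier
         unless nowait is given. */
      return red;
    }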