From 0cade0ca0ebc1d7fb1cbfd4f81709ad8340fae1c Mon Sep 17 00:00:00 2001
From: Tom Eccles
Date: Fri, 12 Jul 2024 10:28:11 +0100
Subject: [PATCH] [mlir][OpenMP] Lower REDUCTION clause for SECTIONS construct
 (#97859)

This shares code with WsloopOp (the changes to Wsloop should be NFC).
OpenMPIRBuilder essentially implements SECTIONS as a wsloop over a switch
statement, with each SECTION becoming a case selected by a particular value
of the loop induction variable.

Unfortunately, it proved very difficult to share this code with ParallelOp
as well. ParallelOp does quite a few things differently (doing more work
inside of the body-generation callback and laying out blocks differently).
Aligning the reduction implementations for wsloop and parallel will probably
involve functional changes to both, so I won't attempt that in this commit.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 510 ++++++++++--------
 mlir/test/Target/LLVMIR/openmp-llvm.mlir      |   2 +
 .../openmp-reduction-array-sections.mlir      | 214 ++++++++
 .../LLVMIR/openmp-reduction-sections.mlir     | 152 ++++++
 4 files changed, 665 insertions(+), 213 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir
 create mode 100644 mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 0c9c699a1f390b..a9ffe89252b462 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -555,6 +555,239 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder,
   return bodyGenStatus;
 }
 
+/// Allocate space for privatized reduction variables.
+template <typename T>
+static void allocByValReductionVars(
+    T loop, ArrayRef<BlockArgument> reductionArgs, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation,
+    llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+    SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+    SmallVectorImpl<llvm::Value *> &privateReductionVariables,
+    DenseMap<Value, llvm::Value *> &reductionVariableMap,
+    llvm::ArrayRef<bool> isByRefs) {
+  llvm::IRBuilderBase::InsertPointGuard guard(builder);
+  builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
+
+  for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) {
+    if (isByRefs[i])
+      continue;
+    llvm::Value *var = builder.CreateAlloca(
+        moduleTranslation.convertType(reductionDecls[i].getType()));
+    moduleTranslation.mapValue(reductionArgs[i], var);
+    privateReductionVariables[i] = var;
+    reductionVariableMap.try_emplace(loop.getReductionVars()[i], var);
+  }
+}
+
+/// Map input argument to all reduction initialization regions
+template <typename T>
+static void
+mapInitializationArg(T loop, LLVM::ModuleTranslation &moduleTranslation,
+                     SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+                     unsigned i) {
+  // map input argument to the initialization region
+  mlir::omp::DeclareReductionOp &reduction = reductionDecls[i];
+  Region &initializerRegion = reduction.getInitializerRegion();
+  Block &entry = initializerRegion.front();
+  assert(entry.getNumArguments() == 1 &&
+         "the initialization region has one argument");
+
+  mlir::Value mlirSource = loop.getReductionVars()[i];
+  llvm::Value *llvmSource = moduleTranslation.lookupValue(mlirSource);
+  assert(llvmSource && "lookup reduction var");
+  moduleTranslation.mapValue(entry.getArgument(0), llvmSource);
+}
+
+/// Collect reduction info
+template <typename T>
+static void collectReductionInfo(
+    T loop, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation,
+    SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+    SmallVectorImpl<OwningReductionGen> &owningReductionGens,
+    SmallVectorImpl<OwningAtomicReductionGen> &owningAtomicReductionGens,
+    const ArrayRef<llvm::Value *> privateReductionVariables,
+    SmallVectorImpl<llvm::OpenMPIRBuilder::ReductionInfo> &reductionInfos) {
+  unsigned numReductions = loop.getNumReductionVars();
+
+  for (unsigned i = 0; i < numReductions; ++i) {
+    owningReductionGens.push_back(
+        makeReductionGen(reductionDecls[i], builder, moduleTranslation));
+    owningAtomicReductionGens.push_back(
+        makeAtomicReductionGen(reductionDecls[i], builder, moduleTranslation));
+  }
+
+  // Collect the reduction information.
+  reductionInfos.reserve(numReductions);
+  for (unsigned i = 0; i < numReductions; ++i) {
+    llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy atomicGen = nullptr;
+    if (owningAtomicReductionGens[i])
+      atomicGen = owningAtomicReductionGens[i];
+    llvm::Value *variable =
+        moduleTranslation.lookupValue(loop.getReductionVars()[i]);
+    reductionInfos.push_back(
+        {moduleTranslation.convertType(reductionDecls[i].getType()), variable,
+         privateReductionVariables[i],
+         /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvalKind::Scalar,
+         owningReductionGens[i],
+         /*ReductionGenClang=*/nullptr, atomicGen});
+  }
+}
+
+/// handling of DeclareReductionOp's cleanup region
+static LogicalResult
+inlineOmpRegionCleanup(llvm::SmallVectorImpl<Region *> &cleanupRegions,
+                       llvm::ArrayRef<llvm::Value *> privateVariables,
+                       LLVM::ModuleTranslation &moduleTranslation,
+                       llvm::IRBuilderBase &builder, StringRef regionName,
+                       bool shouldLoadCleanupRegionArg = true) {
+  for (auto [i, cleanupRegion] : llvm::enumerate(cleanupRegions)) {
+    if (cleanupRegion->empty())
+      continue;
+
+    // map the argument to the cleanup region
+    Block &entry = cleanupRegion->front();
+
+    llvm::Instruction *potentialTerminator =
+        builder.GetInsertBlock()->empty() ? nullptr
+                                          : &builder.GetInsertBlock()->back();
+    if (potentialTerminator && potentialTerminator->isTerminator())
+      builder.SetInsertPoint(potentialTerminator);
+    llvm::Value *privateVarValue =
+        shouldLoadCleanupRegionArg
+            ? builder.CreateLoad(
+                  moduleTranslation.convertType(entry.getArgument(0).getType()),
+                  privateVariables[i])
+            : privateVariables[i];
+
+    moduleTranslation.mapValue(entry.getArgument(0), privateVarValue);
+
+    if (failed(inlineConvertOmpRegions(*cleanupRegion, regionName, builder,
+                                       moduleTranslation)))
+      return failure();
+
+    // clear block argument mapping in case it needs to be re-created with a
+    // different source for another use of the same reduction decl
+    moduleTranslation.forgetMapping(*cleanupRegion);
+  }
+  return success();
+}
+
+// TODO: not used by ParallelOp
+template <typename OP>
+static LogicalResult createReductionsAndCleanup(
+    OP op, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation,
+    llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+    SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+    ArrayRef<llvm::Value *> privateReductionVariables, ArrayRef<bool> isByRef) {
+  // Process the reductions if required.
+  if (op.getNumReductionVars() == 0)
+    return success();
+
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+
+  // Create the reduction generators. We need to own them here because
+  // ReductionInfo only accepts references to the generators.
+  SmallVector<OwningReductionGen> owningReductionGens;
+  SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;
+  SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
+  collectReductionInfo(op, builder, moduleTranslation, reductionDecls,
+                       owningReductionGens, owningAtomicReductionGens,
+                       privateReductionVariables, reductionInfos);
+
+  // The call to createReductions below expects the block to have a
+  // terminator. Create an unreachable instruction to serve as terminator
+  // and remove it later.
+  llvm::UnreachableInst *tempTerminator = builder.CreateUnreachable();
+  builder.SetInsertPoint(tempTerminator);
+  llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint =
+      ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos,
+                                   isByRef, op.getNowait());
+  if (!contInsertPoint.getBlock())
+    return op->emitOpError() << "failed to convert reductions";
+  auto nextInsertionPoint =
+      ompBuilder->createBarrier(contInsertPoint, llvm::omp::OMPD_for);
+  tempTerminator->eraseFromParent();
+  builder.restoreIP(nextInsertionPoint);
+
+  // after the construct, deallocate private reduction variables
+  SmallVector<Region *> reductionRegions;
+  llvm::transform(reductionDecls, std::back_inserter(reductionRegions),
+                  [](omp::DeclareReductionOp reductionDecl) {
+                    return &reductionDecl.getCleanupRegion();
+                  });
+  return inlineOmpRegionCleanup(reductionRegions, privateReductionVariables,
+                                moduleTranslation, builder,
+                                "omp.reduction.cleanup");
+  return success();
+}
+
+static ArrayRef<bool> getIsByRef(std::optional<ArrayRef<bool>> attr) {
+  if (!attr)
+    return {};
+  return *attr;
+}
+
+// TODO: not used by omp.parallel
+template <typename OP>
+static LogicalResult allocAndInitializeReductionVars(
+    OP op, ArrayRef<BlockArgument> reductionArgs, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation,
+    llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+    SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+    SmallVectorImpl<llvm::Value *> &privateReductionVariables,
+    DenseMap<Value, llvm::Value *> &reductionVariableMap,
+    llvm::ArrayRef<bool> isByRef) {
+  if (op.getNumReductionVars() == 0)
+    return success();
+
+  allocByValReductionVars(op, reductionArgs, builder, moduleTranslation,
+                          allocaIP, reductionDecls, privateReductionVariables,
+                          reductionVariableMap, isByRef);
+
+  // Before the loop, store the initial values of reductions into reduction
+  // variables. Although this could be done after allocas, we don't want to mess
+  // up with the alloca insertion point.
+  for (unsigned i = 0; i < op.getNumReductionVars(); ++i) {
+    SmallVector<llvm::Value *> phis;
+
+    // map block argument to initializer region
+    mapInitializationArg(op, moduleTranslation, reductionDecls, i);
+
+    if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
+                                       "omp.reduction.neutral", builder,
+                                       moduleTranslation, &phis)))
+      return failure();
+    assert(phis.size() == 1 && "expected one value to be yielded from the "
+                               "reduction neutral element declaration region");
+    if (isByRef[i]) {
+      // Allocate reduction variable (which is a pointer to the real reduction
+      // variable allocated in the inlined region)
+      llvm::Value *var = builder.CreateAlloca(
+          moduleTranslation.convertType(reductionDecls[i].getType()));
+      // Store the result of the inlined region to the allocated reduction var
+      // ptr
+      builder.CreateStore(phis[0], var);
+
+      privateReductionVariables[i] = var;
+      moduleTranslation.mapValue(reductionArgs[i], phis[0]);
+      reductionVariableMap.try_emplace(op.getReductionVars()[i], phis[0]);
+    } else {
+      // for by-ref case the store is inside of the reduction region
+      builder.CreateStore(phis[0], privateReductionVariables[i]);
+      // the rest was handled in allocByValReductionVars
+    }
+
+    // forget the mapping for the initializer region because we might need a
+    // different mapping if this reduction declaration is re-used for a
+    // different variable
+    moduleTranslation.forgetMapping(reductionDecls[i].getInitializerRegion());
+  }
+
+  return success();
+}
+
 static LogicalResult
 convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation) {
@@ -565,13 +798,38 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder,
   auto sectionsOp = cast<omp::SectionsOp>(opInst);
 
   // TODO: Support the following clauses: private, firstprivate, lastprivate,
-  // reduction, allocate
-  if (!sectionsOp.getReductionVars().empty() || sectionsOp.getReductions() ||
-      !sectionsOp.getAllocateVars().empty() ||
+  // allocate
+  if (!sectionsOp.getAllocateVars().empty() ||
       !sectionsOp.getAllocatorsVars().empty())
     return emitError(sectionsOp.getLoc())
-           << "reduction and allocate clauses are not supported for sections "
-              "construct";
+           << "allocate clause is not supported for sections construct";
+
+  llvm::ArrayRef<bool> isByRef = getIsByRef(sectionsOp.getReductionVarsByref());
+  assert(isByRef.size() == sectionsOp.getNumReductionVars());
+
+  SmallVector<omp::DeclareReductionOp> reductionDecls;
+  collectReductionDecls(sectionsOp, reductionDecls);
+  llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
+      findAllocaInsertPoint(builder, moduleTranslation);
+
+  SmallVector<llvm::Value *> privateReductionVariables(
+      sectionsOp.getNumReductionVars());
+  DenseMap<Value, llvm::Value *> reductionVariableMap;
+
+  MutableArrayRef<BlockArgument> reductionArgs =
+      sectionsOp.getRegion().getArguments();
+
+  if (failed(allocAndInitializeReductionVars(
+          sectionsOp, reductionArgs, builder, moduleTranslation, allocaIP,
+          reductionDecls, privateReductionVariables, reductionVariableMap,
+          isByRef)))
+    return failure();
+
+  // Store the mapping between reduction variables and their private copies on
+  // ModuleTranslation stack. It can be then recovered when translating
+  // omp.reduce operations in a separate call.
+  LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
+      moduleTranslation, reductionVariableMap);
 
   LogicalResult bodyGenStatus = success();
   SmallVector<StorableBodyGenCallbackTy> sectionCBs;
 
@@ -582,9 +840,24 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder,
       continue;
 
     Region &region = sectionOp.getRegion();
-    auto sectionCB = [&region, &builder, &moduleTranslation, &bodyGenStatus](
-                         InsertPointTy allocaIP, InsertPointTy codeGenIP) {
+    auto sectionCB = [&sectionsOp, &region, &builder, &moduleTranslation,
+                      &bodyGenStatus](InsertPointTy allocaIP,
+                                      InsertPointTy codeGenIP) {
       builder.restoreIP(codeGenIP);
+
+      // map the omp.section reduction block argument to the omp.sections block
+      // arguments
+      // TODO: this assumes that the only block arguments are reduction
+      // variables
+      assert(region.getNumArguments() ==
+             sectionsOp.getRegion().getNumArguments());
+      for (auto [sectionsArg, sectionArg] : llvm::zip_equal(
+               sectionsOp.getRegion().getArguments(), region.getArguments())) {
+        llvm::Value *llvmVal = moduleTranslation.lookupValue(sectionsArg);
+        assert(llvmVal);
+        moduleTranslation.mapValue(sectionArg, llvmVal);
+      }
+
       convertOmpOpRegions(region, "omp.section.region", builder,
                           moduleTranslation, bodyGenStatus);
     };
@@ -613,13 +886,19 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder,
   // called for variables which have destructors/finalizers.
   auto finiCB = [&](InsertPointTy codeGenIP) {};
 
-  llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
-      findAllocaInsertPoint(builder, moduleTranslation);
+  allocaIP = findAllocaInsertPoint(builder, moduleTranslation);
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
   builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createSections(
       ompLoc, allocaIP, sectionCBs, privCB, finiCB, false,
       sectionsOp.getNowait()));
-  return bodyGenStatus;
+
+  if (failed(bodyGenStatus))
+    return bodyGenStatus;
+
+  // Process the reductions if required.
+  return createReductionsAndCleanup(sectionsOp, builder, moduleTranslation,
+                                    allocaIP, reductionDecls,
+                                    privateReductionVariables, isByRef);
 }
 
 /// Converts an OpenMP single construct into LLVM IR using OpenMPIRBuilder.
@@ -769,131 +1048,6 @@ convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder,
       ompLoc, allocaIP, bodyCB));
   return bodyGenStatus;
 }
-
-/// Allocate space for privatized reduction variables.
-template -static void allocByValReductionVars( - T loop, ArrayRef reductionArgs, llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation, - llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, - SmallVectorImpl &reductionDecls, - SmallVectorImpl &privateReductionVariables, - DenseMap &reductionVariableMap, - llvm::ArrayRef isByRefs) { - llvm::IRBuilderBase::InsertPointGuard guard(builder); - builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); - - for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) { - if (isByRefs[i]) - continue; - llvm::Value *var = builder.CreateAlloca( - moduleTranslation.convertType(reductionDecls[i].getType())); - moduleTranslation.mapValue(reductionArgs[i], var); - privateReductionVariables[i] = var; - reductionVariableMap.try_emplace(loop.getReductionVars()[i], var); - } -} - -/// Map input argument to all reduction initialization regions -template -static void -mapInitializationArg(T loop, LLVM::ModuleTranslation &moduleTranslation, - SmallVectorImpl &reductionDecls, - unsigned i) { - // map input argument to the initialization region - mlir::omp::DeclareReductionOp &reduction = reductionDecls[i]; - Region &initializerRegion = reduction.getInitializerRegion(); - Block &entry = initializerRegion.front(); - assert(entry.getNumArguments() == 1 && - "the initialization region has one argument"); - - mlir::Value mlirSource = loop.getReductionVars()[i]; - llvm::Value *llvmSource = moduleTranslation.lookupValue(mlirSource); - assert(llvmSource && "lookup reduction var"); - moduleTranslation.mapValue(entry.getArgument(0), llvmSource); -} - -/// Collect reduction info -template -static void collectReductionInfo( - T loop, llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation, - SmallVector &reductionDecls, - SmallVector &owningReductionGens, - SmallVector &owningAtomicReductionGens, - const SmallVector &privateReductionVariables, - SmallVector &reductionInfos) { - unsigned numReductions = loop.getNumReductionVars(); - - for (unsigned i = 0; i < numReductions; ++i) { - owningReductionGens.push_back( - makeReductionGen(reductionDecls[i], builder, moduleTranslation)); - owningAtomicReductionGens.push_back( - makeAtomicReductionGen(reductionDecls[i], builder, moduleTranslation)); - } - - // Collect the reduction information. - reductionInfos.reserve(numReductions); - for (unsigned i = 0; i < numReductions; ++i) { - llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy atomicGen = nullptr; - if (owningAtomicReductionGens[i]) - atomicGen = owningAtomicReductionGens[i]; - llvm::Value *variable = - moduleTranslation.lookupValue(loop.getReductionVars()[i]); - reductionInfos.push_back( - {moduleTranslation.convertType(reductionDecls[i].getType()), variable, - privateReductionVariables[i], - /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvalKind::Scalar, - owningReductionGens[i], - /*ReductionGenClang=*/nullptr, atomicGen}); - } -} - -/// handling of DeclareReductionOp's cleanup region -static LogicalResult -inlineOmpRegionCleanup(llvm::SmallVectorImpl &cleanupRegions, - llvm::ArrayRef privateVariables, - LLVM::ModuleTranslation &moduleTranslation, - llvm::IRBuilderBase &builder, StringRef regionName, - bool shouldLoadCleanupRegionArg = true) { - for (auto [i, cleanupRegion] : llvm::enumerate(cleanupRegions)) { - if (cleanupRegion->empty()) - continue; - - // map the argument to the cleanup region - Block &entry = cleanupRegion->front(); - - llvm::Instruction *potentialTerminator = - builder.GetInsertBlock()->empty() ? 
nullptr - : &builder.GetInsertBlock()->back(); - if (potentialTerminator && potentialTerminator->isTerminator()) - builder.SetInsertPoint(potentialTerminator); - llvm::Value *prviateVarValue = - shouldLoadCleanupRegionArg - ? builder.CreateLoad( - moduleTranslation.convertType(entry.getArgument(0).getType()), - privateVariables[i]) - : privateVariables[i]; - - moduleTranslation.mapValue(entry.getArgument(0), prviateVarValue); - - if (failed(inlineConvertOmpRegions(*cleanupRegion, regionName, builder, - moduleTranslation))) - return failure(); - - // clear block argument mapping in case it needs to be re-created with a - // different source for another use of the same reduction decl - moduleTranslation.forgetMapping(*cleanupRegion); - } - return success(); -} - -static ArrayRef getIsByRef(std::optional> attr) { - if (!attr) - return {}; - return *attr; -} - /// Converts an OpenMP workshare loop into LLVM IR using OpenMPIRBuilder. static LogicalResult convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, @@ -933,48 +1087,11 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, MutableArrayRef reductionArgs = wsloopOp.getRegion().getArguments(); - allocByValReductionVars(wsloopOp, reductionArgs, builder, moduleTranslation, - allocaIP, reductionDecls, privateReductionVariables, - reductionVariableMap, isByRef); - - // Before the loop, store the initial values of reductions into reduction - // variables. Although this could be done after allocas, we don't want to mess - // up with the alloca insertion point. - for (unsigned i = 0; i < wsloopOp.getNumReductionVars(); ++i) { - SmallVector phis; - - // map block argument to initializer region - mapInitializationArg(wsloopOp, moduleTranslation, reductionDecls, i); - - if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(), - "omp.reduction.neutral", builder, - moduleTranslation, &phis))) - return failure(); - assert(phis.size() == 1 && "expected one value to be yielded from the " - "reduction neutral element declaration region"); - if (isByRef[i]) { - // Allocate reduction variable (which is a pointer to the real reduction - // variable allocated in the inlined region) - llvm::Value *var = builder.CreateAlloca( - moduleTranslation.convertType(reductionDecls[i].getType())); - // Store the result of the inlined region to the allocated reduction var - // ptr - builder.CreateStore(phis[0], var); - - privateReductionVariables[i] = var; - moduleTranslation.mapValue(reductionArgs[i], phis[0]); - reductionVariableMap.try_emplace(wsloopOp.getReductionVars()[i], phis[0]); - } else { - // for by-ref case the store is inside of the reduction region - builder.CreateStore(phis[0], privateReductionVariables[i]); - // the rest was handled in allocByValReductionVars - } - - // forget the mapping for the initializer region because we might need a - // different mapping if this reduction declaration is re-used for a - // different variable - moduleTranslation.forgetMapping(reductionDecls[i].getInitializerRegion()); - } + if (failed(allocAndInitializeReductionVars( + wsloopOp, reductionArgs, builder, moduleTranslation, allocaIP, + reductionDecls, privateReductionVariables, reductionVariableMap, + isByRef))) + return failure(); // Store the mapping between reduction variables and their private copies on // ModuleTranslation stack. 
It can be then recovered when translating @@ -1067,42 +1184,9 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, builder.restoreIP(afterIP); // Process the reductions if required. - if (wsloopOp.getNumReductionVars() == 0) - return success(); - - // Create the reduction generators. We need to own them here because - // ReductionInfo only accepts references to the generators. - SmallVector owningReductionGens; - SmallVector owningAtomicReductionGens; - SmallVector reductionInfos; - collectReductionInfo(wsloopOp, builder, moduleTranslation, reductionDecls, - owningReductionGens, owningAtomicReductionGens, - privateReductionVariables, reductionInfos); - - // The call to createReductions below expects the block to have a - // terminator. Create an unreachable instruction to serve as terminator - // and remove it later. - llvm::UnreachableInst *tempTerminator = builder.CreateUnreachable(); - builder.SetInsertPoint(tempTerminator); - llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint = - ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos, - isByRef, wsloopOp.getNowait()); - if (!contInsertPoint.getBlock()) - return wsloopOp->emitOpError() << "failed to convert reductions"; - auto nextInsertionPoint = - ompBuilder->createBarrier(contInsertPoint, llvm::omp::OMPD_for); - tempTerminator->eraseFromParent(); - builder.restoreIP(nextInsertionPoint); - - // after the workshare loop, deallocate private reduction variables - SmallVector reductionRegions; - llvm::transform(reductionDecls, std::back_inserter(reductionRegions), - [](omp::DeclareReductionOp reductionDecl) { - return &reductionDecl.getCleanupRegion(); - }); - return inlineOmpRegionCleanup(reductionRegions, privateReductionVariables, - moduleTranslation, builder, - "omp.reduction.cleanup"); + return createReductionsAndCleanup(wsloopOp, builder, moduleTranslation, + allocaIP, reductionDecls, + privateReductionVariables, isByRef); } /// A RAII class that on construction replaces the region arguments of the diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index dfeaf4be33adb8..7f860268db11d4 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -2099,6 +2099,8 @@ llvm.func @omp_sections_empty() -> () { omp.sections { omp.terminator } + // CHECK-NEXT: br label %entry + // CHECK: entry: // CHECK-NEXT: ret void llvm.return } diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir new file mode 100644 index 00000000000000..5682e7e96ab186 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir @@ -0,0 +1,214 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// nonesense minimised code simulating the control flow graph generated by flang +// for array reductions. 
The important thing here is that we are testing a byref +// reduction with a cleanup region, and the various regions contain multiple +// blocks +omp.declare_reduction @add_reduction_byref_box_Uxf32 : !llvm.ptr init { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr + omp.yield(%1 : !llvm.ptr) +} combiner { +^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(0 : i64) : i64 + %1 = llvm.mlir.constant(0 : index) : i64 + %2 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb1(%0 : i64) +^bb1(%3: i64): // 2 preds: ^bb0, ^bb2 + %4 = llvm.icmp "sgt" %3, %1 : i64 + llvm.cond_br %4, ^bb2, ^bb3 +^bb2: // pred: ^bb1 + %5 = llvm.sub %3, %2 : i64 + llvm.br ^bb1(%5 : i64) +^bb3: // pred: ^bb1 + omp.yield(%arg0 : !llvm.ptr) +} cleanup { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(0 : i64) : i64 + %1 = llvm.ptrtoint %arg0 : !llvm.ptr to i64 + %2 = llvm.icmp "ne" %1, %0 : i64 + llvm.cond_br %2, ^bb1, ^bb2 +^bb1: // pred: ^bb0 + llvm.br ^bb2 +^bb2: // 2 preds: ^bb0, ^bb1 + omp.yield +} +llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attributes {fir.internal_name = "_QPsectionsreduction"} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.mlir.constant(0 : index) : i64 + %2 = llvm.mlir.constant(1 : index) : i64 + omp.parallel { + %3 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr + omp.sections reduction(byref @add_reduction_byref_box_Uxf32 -> %3 : !llvm.ptr) { + ^bb0(%arg1: !llvm.ptr): + omp.section { + ^bb0(%arg2: !llvm.ptr): + llvm.br ^bb1(%0 : i64) + ^bb1(%4: i64): // 2 preds: ^bb0, ^bb2 + %5 = llvm.icmp "sgt" %4, %1 : i64 + llvm.cond_br %5, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %6 = llvm.sub %4, %2 : i64 + llvm.br ^bb1(%6 : i64) + ^bb3: // pred: ^bb1 + omp.terminator + } + omp.section { + ^bb0(%arg2: !llvm.ptr): + llvm.br ^bb1(%0 : i64) + ^bb1(%4: i64): // 2 preds: ^bb0, ^bb2 + %5 = llvm.icmp "sgt" %4, %1 : i64 + llvm.cond_br %5, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %6 = llvm.sub %4, %2 : i64 + llvm.br ^bb1(%6 : i64) + ^bb3: // pred: ^bb1 + omp.terminator + } + omp.terminator + } + omp.terminator + } + llvm.return +} + +// CHECK-LABEL: define internal void @sectionsreduction_..omp_par +// CHECK: omp.par.entry: +// CHECK: %[[VAL_6:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_7:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_8:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_9:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_10:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_11:.*]] = load i32, ptr %[[VAL_12:.*]], align 4 +// CHECK: store i32 %[[VAL_11]], ptr %[[VAL_10]], align 4 +// CHECK: %[[VAL_13:.*]] = load i32, ptr %[[VAL_10]], align 4 +// CHECK: %[[VAL_14:.*]] = alloca [1 x ptr], align 8 +// CHECK: br label %[[VAL_15:.*]] +// CHECK: omp.reduction.init: ; preds = %[[VAL_16:.*]] +// CHECK: br label %[[VAL_17:.*]] +// CHECK: omp.par.region: ; preds = %[[VAL_15]] +// CHECK: br label %[[VAL_18:.*]] +// CHECK: omp.par.region1: ; preds = %[[VAL_17]] +// CHECK: %[[VAL_19:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 +// CHECK: %[[VAL_20:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 +// CHECK: %[[VAL_21:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_21]], align 8 +// CHECK: br label %[[VAL_22:.*]] +// CHECK: omp_section_loop.preheader: ; preds = %[[VAL_18]] +// CHECK: store i32 0, ptr 
%[[VAL_7]], align 4 +// CHECK: store i32 1, ptr %[[VAL_8]], align 4 +// CHECK: store i32 1, ptr %[[VAL_9]], align 4 +// CHECK: %[[VAL_23:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %[[VAL_23]], i32 34, ptr %[[VAL_6]], ptr %[[VAL_7]], ptr %[[VAL_8]], ptr %[[VAL_9]], i32 1, i32 0) +// CHECK: %[[VAL_24:.*]] = load i32, ptr %[[VAL_7]], align 4 +// CHECK: %[[VAL_25:.*]] = load i32, ptr %[[VAL_8]], align 4 +// CHECK: %[[VAL_26:.*]] = sub i32 %[[VAL_25]], %[[VAL_24]] +// CHECK: %[[VAL_27:.*]] = add i32 %[[VAL_26]], 1 +// CHECK: br label %[[VAL_28:.*]] +// CHECK: omp_section_loop.header: ; preds = %[[VAL_29:.*]], %[[VAL_22]] +// CHECK: %[[VAL_30:.*]] = phi i32 [ 0, %[[VAL_22]] ], [ %[[VAL_31:.*]], %[[VAL_29]] ] +// CHECK: br label %[[VAL_32:.*]] +// CHECK: omp_section_loop.cond: ; preds = %[[VAL_28]] +// CHECK: %[[VAL_33:.*]] = icmp ult i32 %[[VAL_30]], %[[VAL_27]] +// CHECK: br i1 %[[VAL_33]], label %[[VAL_34:.*]], label %[[VAL_35:.*]] +// CHECK: omp_section_loop.exit: ; preds = %[[VAL_32]] +// CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_23]]) +// CHECK: %[[VAL_36:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_36]]) +// CHECK: br label %[[VAL_37:.*]] +// CHECK: omp_section_loop.after: ; preds = %[[VAL_35]] +// CHECK: br label %[[VAL_38:.*]] +// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_37]] +// CHECK: %[[VAL_39:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_14]], i64 0, i64 0 +// CHECK: store ptr %[[VAL_21]], ptr %[[VAL_39]], align 8 +// CHECK: %[[VAL_40:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[VAL_41:.*]] = call i32 @__kmpc_reduce(ptr @1, i32 %[[VAL_40]], i32 1, i64 8, ptr %[[VAL_14]], ptr @.omp.reduction.func, ptr @.gomp_critical_user_.reduction.var) +// CHECK: switch i32 %[[VAL_41]], label %[[VAL_42:.*]] [ +// CHECK: i32 1, label %[[VAL_43:.*]] +// CHECK: i32 2, label %[[VAL_44:.*]] +// CHECK: ] +// CHECK: reduce.switch.atomic: ; preds = %[[VAL_38]] +// CHECK: unreachable +// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_38]] +// CHECK: %[[VAL_45:.*]] = load ptr, ptr %[[VAL_21]], align 8 +// CHECK: br label %[[VAL_46:.*]] +// CHECK: omp.reduction.nonatomic.body: ; preds = %[[VAL_43]] +// CHECK: br label %[[VAL_47:.*]] +// CHECK: omp.reduction.nonatomic.body16: ; preds = %[[VAL_48:.*]], %[[VAL_46]] +// CHECK: %[[VAL_49:.*]] = phi i64 [ %[[VAL_50:.*]], %[[VAL_48]] ], [ 0, %[[VAL_46]] ] +// CHECK: %[[VAL_51:.*]] = icmp sgt i64 %[[VAL_49]], 0 +// CHECK: br i1 %[[VAL_51]], label %[[VAL_48]], label %[[VAL_52:.*]] +// CHECK: omp.reduction.nonatomic.body18: ; preds = %[[VAL_47]] +// CHECK: br label %[[VAL_53:.*]] +// CHECK: omp.region.cont15: ; preds = %[[VAL_52]] +// CHECK: %[[VAL_54:.*]] = phi ptr [ %[[VAL_19]], %[[VAL_52]] ] +// CHECK: call void @__kmpc_end_reduce(ptr @1, i32 %[[VAL_40]], ptr @.gomp_critical_user_.reduction.var) +// CHECK: br label %[[VAL_42]] +// CHECK: omp.reduction.nonatomic.body17: ; preds = %[[VAL_47]] +// CHECK: %[[VAL_50]] = sub i64 %[[VAL_49]], 1 +// CHECK: br label %[[VAL_47]] +// CHECK: reduce.finalize: ; preds = %[[VAL_53]], %[[VAL_38]] +// CHECK: %[[VAL_55:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_55]]) +// CHECK: %[[VAL_56:.*]] = load ptr, ptr %[[VAL_21]], align 8 +// CHECK: br label %[[VAL_57:.*]] +// CHECK: omp.reduction.cleanup: ; preds = %[[VAL_42]] +// CHECK: %[[VAL_58:.*]] = ptrtoint ptr %[[VAL_56]] to i64 +// 
CHECK: %[[VAL_59:.*]] = icmp ne i64 %[[VAL_58]], 0 +// CHECK: br i1 %[[VAL_59]], label %[[VAL_60:.*]], label %[[VAL_61:.*]] +// CHECK: omp.reduction.cleanup22: ; preds = %[[VAL_60]], %[[VAL_57]] +// CHECK: br label %[[VAL_62:.*]] +// CHECK: omp.region.cont20: ; preds = %[[VAL_61]] +// CHECK: br label %[[VAL_63:.*]] +// CHECK: omp.region.cont: ; preds = %[[VAL_62]] +// CHECK: br label %[[VAL_64:.*]] +// CHECK: omp.par.pre_finalize: ; preds = %[[VAL_63]] +// CHECK: br label %[[VAL_65:.*]] +// CHECK: omp.reduction.cleanup21: ; preds = %[[VAL_57]] +// CHECK: br label %[[VAL_61]] +// CHECK: omp_section_loop.body: ; preds = %[[VAL_32]] +// CHECK: %[[VAL_66:.*]] = add i32 %[[VAL_30]], %[[VAL_24]] +// CHECK: %[[VAL_67:.*]] = mul i32 %[[VAL_66]], 1 +// CHECK: %[[VAL_68:.*]] = add i32 %[[VAL_67]], 0 +// CHECK: switch i32 %[[VAL_68]], label %[[VAL_69:.*]] [ +// CHECK: i32 0, label %[[VAL_70:.*]] +// CHECK: i32 1, label %[[VAL_71:.*]] +// CHECK: ] +// CHECK: omp_section_loop.body.case6: ; preds = %[[VAL_34]] +// CHECK: br label %[[VAL_72:.*]] +// CHECK: omp.section.region8: ; preds = %[[VAL_71]] +// CHECK: br label %[[VAL_73:.*]] +// CHECK: omp.section.region9: ; preds = %[[VAL_74:.*]], %[[VAL_72]] +// CHECK: %[[VAL_75:.*]] = phi i64 [ %[[VAL_76:.*]], %[[VAL_74]] ], [ 1, %[[VAL_72]] ] +// CHECK: %[[VAL_77:.*]] = icmp sgt i64 %[[VAL_75]], 0 +// CHECK: br i1 %[[VAL_77]], label %[[VAL_74]], label %[[VAL_78:.*]] +// CHECK: omp.section.region11: ; preds = %[[VAL_73]] +// CHECK: br label %[[VAL_79:.*]] +// CHECK: omp.region.cont7: ; preds = %[[VAL_78]] +// CHECK: br label %[[VAL_69]] +// CHECK: omp.section.region10: ; preds = %[[VAL_73]] +// CHECK: %[[VAL_76]] = sub i64 %[[VAL_75]], 1 +// CHECK: br label %[[VAL_73]] +// CHECK: omp_section_loop.body.case: ; preds = %[[VAL_34]] +// CHECK: br label %[[VAL_80:.*]] +// CHECK: omp.section.region: ; preds = %[[VAL_70]] +// CHECK: br label %[[VAL_81:.*]] +// CHECK: omp.section.region3: ; preds = %[[VAL_82:.*]], %[[VAL_80]] +// CHECK: %[[VAL_83:.*]] = phi i64 [ %[[VAL_84:.*]], %[[VAL_82]] ], [ 1, %[[VAL_80]] ] +// CHECK: %[[VAL_85:.*]] = icmp sgt i64 %[[VAL_83]], 0 +// CHECK: br i1 %[[VAL_85]], label %[[VAL_82]], label %[[VAL_86:.*]] +// CHECK: omp.section.region5: ; preds = %[[VAL_81]] +// CHECK: br label %[[VAL_87:.*]] +// CHECK: omp.region.cont2: ; preds = %[[VAL_86]] +// CHECK: br label %[[VAL_69]] +// CHECK: omp.section.region4: ; preds = %[[VAL_81]] +// CHECK: %[[VAL_84]] = sub i64 %[[VAL_83]], 1 +// CHECK: br label %[[VAL_81]] +// CHECK: omp_section_loop.body.sections.after: ; preds = %[[VAL_79]], %[[VAL_87]], %[[VAL_34]] +// CHECK: br label %[[VAL_29]] +// CHECK: omp_section_loop.inc: ; preds = %[[VAL_69]] +// CHECK: %[[VAL_31]] = add nuw i32 %[[VAL_30]], 1 +// CHECK: br label %[[VAL_28]] +// CHECK: omp.par.outlined.exit.exitStub: ; preds = %[[VAL_64]] +// CHECK: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir new file mode 100644 index 00000000000000..694180a5ced373 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-reduction-sections.mlir @@ -0,0 +1,152 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +omp.declare_reduction @add_reduction_f32 : f32 init { +^bb0(%arg0: f32): + %0 = llvm.mlir.constant(0.000000e+00 : f32) : f32 + omp.yield(%0 : f32) +} combiner { +^bb0(%arg0: f32, %arg1: f32): + %0 = llvm.fadd %arg0, %arg1 {fastmathFlags = #llvm.fastmath} : f32 + omp.yield(%0 : f32) +} +llvm.func @sections_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attributes 
{fir.internal_name = "_QPsections"} { + %0 = llvm.mlir.constant(2.000000e+00 : f32) : f32 + %1 = llvm.mlir.constant(1.000000e+00 : f32) : f32 + omp.parallel { + omp.sections reduction(@add_reduction_f32 -> %arg0 : !llvm.ptr) { + ^bb0(%arg1: !llvm.ptr): + omp.section { + ^bb0(%arg2: !llvm.ptr): + %2 = llvm.load %arg2 : !llvm.ptr -> f32 + %3 = llvm.fadd %2, %1 {fastmathFlags = #llvm.fastmath} : f32 + llvm.store %3, %arg2 : f32, !llvm.ptr + omp.terminator + } + omp.section { + ^bb0(%arg2: !llvm.ptr): + %2 = llvm.load %arg2 : !llvm.ptr -> f32 + %3 = llvm.fadd %2, %0 {fastmathFlags = #llvm.fastmath} : f32 + llvm.store %3, %arg2 : f32, !llvm.ptr + omp.terminator + } + omp.terminator + } + omp.terminator + } + llvm.return +} + +// CHECK-LABEL: define internal void @sections_..omp_par +// CHECK: omp.par.entry: +// CHECK: %[[VAL_9:.*]] = getelementptr { ptr }, ptr %[[VAL_10:.*]], i32 0, i32 0 +// CHECK: %[[VAL_11:.*]] = load ptr, ptr %[[VAL_9]], align 8 +// CHECK: %[[VAL_12:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_13:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_14:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_15:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_16:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_17:.*]] = load i32, ptr %[[VAL_18:.*]], align 4 +// CHECK: store i32 %[[VAL_17]], ptr %[[VAL_16]], align 4 +// CHECK: %[[VAL_19:.*]] = load i32, ptr %[[VAL_16]], align 4 +// CHECK: %[[VAL_20:.*]] = alloca float, align 4 +// CHECK: %[[VAL_21:.*]] = alloca [1 x ptr], align 8 +// CHECK: br label %[[VAL_22:.*]] +// CHECK: omp.reduction.init: ; preds = %[[VAL_23:.*]] +// CHECK: br label %[[VAL_24:.*]] +// CHECK: omp.par.region: ; preds = %[[VAL_22]] +// CHECK: br label %[[VAL_25:.*]] +// CHECK: omp.par.region1: ; preds = %[[VAL_24]] +// CHECK: store float 0.000000e+00, ptr %[[VAL_20]], align 4 +// CHECK: br label %[[VAL_26:.*]] +// CHECK: omp_section_loop.preheader: ; preds = %[[VAL_25]] +// CHECK: store i32 0, ptr %[[VAL_13]], align 4 +// CHECK: store i32 1, ptr %[[VAL_14]], align 4 +// CHECK: store i32 1, ptr %[[VAL_15]], align 4 +// CHECK: %[[VAL_27:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %[[VAL_27]], i32 34, ptr %[[VAL_12]], ptr %[[VAL_13]], ptr %[[VAL_14]], ptr %[[VAL_15]], i32 1, i32 0) +// CHECK: %[[VAL_28:.*]] = load i32, ptr %[[VAL_13]], align 4 +// CHECK: %[[VAL_29:.*]] = load i32, ptr %[[VAL_14]], align 4 +// CHECK: %[[VAL_30:.*]] = sub i32 %[[VAL_29]], %[[VAL_28]] +// CHECK: %[[VAL_31:.*]] = add i32 %[[VAL_30]], 1 +// CHECK: br label %[[VAL_32:.*]] +// CHECK: omp_section_loop.header: ; preds = %[[VAL_33:.*]], %[[VAL_26]] +// CHECK: %[[VAL_34:.*]] = phi i32 [ 0, %[[VAL_26]] ], [ %[[VAL_35:.*]], %[[VAL_33]] ] +// CHECK: br label %[[VAL_36:.*]] +// CHECK: omp_section_loop.cond: ; preds = %[[VAL_32]] +// CHECK: %[[VAL_37:.*]] = icmp ult i32 %[[VAL_34]], %[[VAL_31]] +// CHECK: br i1 %[[VAL_37]], label %[[VAL_38:.*]], label %[[VAL_39:.*]] +// CHECK: omp_section_loop.exit: ; preds = %[[VAL_36]] +// CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_27]]) +// CHECK: %[[VAL_40:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_40]]) +// CHECK: br label %[[VAL_41:.*]] +// CHECK: omp_section_loop.after: ; preds = %[[VAL_39]] +// CHECK: br label %[[VAL_42:.*]] +// CHECK: omp_section_loop.aftersections.fini: ; preds = %[[VAL_41]] +// CHECK: %[[VAL_43:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_21]], i64 0, i64 0 +// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_43]], 
align 8 +// CHECK: %[[VAL_44:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[VAL_45:.*]] = call i32 @__kmpc_reduce(ptr @1, i32 %[[VAL_44]], i32 1, i64 8, ptr %[[VAL_21]], ptr @.omp.reduction.func, ptr @.gomp_critical_user_.reduction.var) +// CHECK: switch i32 %[[VAL_45]], label %[[VAL_46:.*]] [ +// CHECK: i32 1, label %[[VAL_47:.*]] +// CHECK: i32 2, label %[[VAL_48:.*]] +// CHECK: ] +// CHECK: reduce.switch.atomic: ; preds = %[[VAL_42]] +// CHECK: unreachable +// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_42]] +// CHECK: %[[VAL_49:.*]] = load float, ptr %[[VAL_11]], align 4 +// CHECK: %[[VAL_50:.*]] = load float, ptr %[[VAL_20]], align 4 +// CHECK: %[[VAL_51:.*]] = fadd contract float %[[VAL_49]], %[[VAL_50]] +// CHECK: store float %[[VAL_51]], ptr %[[VAL_11]], align 4 +// CHECK: call void @__kmpc_end_reduce(ptr @1, i32 %[[VAL_44]], ptr @.gomp_critical_user_.reduction.var) +// CHECK: br label %[[VAL_46]] +// CHECK: reduce.finalize: ; preds = %[[VAL_47]], %[[VAL_42]] +// CHECK: %[[VAL_52:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_52]]) +// CHECK: br label %[[VAL_53:.*]] +// CHECK: omp.region.cont: ; preds = %[[VAL_46]] +// CHECK: br label %[[VAL_54:.*]] +// CHECK: omp.par.pre_finalize: ; preds = %[[VAL_53]] +// CHECK: br label %[[VAL_55:.*]] +// CHECK: omp_section_loop.body: ; preds = %[[VAL_36]] +// CHECK: %[[VAL_56:.*]] = add i32 %[[VAL_34]], %[[VAL_28]] +// CHECK: %[[VAL_57:.*]] = mul i32 %[[VAL_56]], 1 +// CHECK: %[[VAL_58:.*]] = add i32 %[[VAL_57]], 0 +// CHECK: switch i32 %[[VAL_58]], label %[[VAL_59:.*]] [ +// CHECK: i32 0, label %[[VAL_60:.*]] +// CHECK: i32 1, label %[[VAL_61:.*]] +// CHECK: ] +// CHECK: omp_section_loop.body.case3: ; preds = %[[VAL_38]] +// CHECK: br label %[[VAL_62:.*]] +// CHECK: omp.section.region5: ; preds = %[[VAL_61]] +// CHECK: %[[VAL_63:.*]] = load float, ptr %[[VAL_20]], align 4 +// CHECK: %[[VAL_64:.*]] = fadd contract float %[[VAL_63]], 2.000000e+00 +// CHECK: store float %[[VAL_64]], ptr %[[VAL_20]], align 4 +// CHECK: br label %[[VAL_65:.*]] +// CHECK: omp.region.cont4: ; preds = %[[VAL_62]] +// CHECK: br label %[[VAL_59]] +// CHECK: omp_section_loop.body.case: ; preds = %[[VAL_38]] +// CHECK: br label %[[VAL_66:.*]] +// CHECK: omp.section.region: ; preds = %[[VAL_60]] +// CHECK: %[[VAL_67:.*]] = load float, ptr %[[VAL_20]], align 4 +// CHECK: %[[VAL_68:.*]] = fadd contract float %[[VAL_67]], 1.000000e+00 +// CHECK: store float %[[VAL_68]], ptr %[[VAL_20]], align 4 +// CHECK: br label %[[VAL_69:.*]] +// CHECK: omp.region.cont2: ; preds = %[[VAL_66]] +// CHECK: br label %[[VAL_59]] +// CHECK: omp_section_loop.body.sections.after: ; preds = %[[VAL_65]], %[[VAL_69]], %[[VAL_38]] +// CHECK: br label %[[VAL_33]] +// CHECK: omp_section_loop.inc: ; preds = %[[VAL_59]] +// CHECK: %[[VAL_35]] = add nuw i32 %[[VAL_34]], 1 +// CHECK: br label %[[VAL_32]] +// CHECK: omp.par.outlined.exit.exitStub: ; preds = %[[VAL_54]] +// CHECK: ret void +// CHECK: %[[VAL_70:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_71:.*]], i64 0, i64 0 +// CHECK: %[[VAL_72:.*]] = load ptr, ptr %[[VAL_70]], align 8 +// CHECK: %[[VAL_73:.*]] = load float, ptr %[[VAL_72]], align 4 +// CHECK: %[[VAL_74:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_75:.*]], i64 0, i64 0 +// CHECK: %[[VAL_76:.*]] = load ptr, ptr %[[VAL_74]], align 8 +// CHECK: %[[VAL_77:.*]] = load float, ptr %[[VAL_76]], align 4 +// CHECK: %[[VAL_78:.*]] = fadd contract float %[[VAL_73]], %[[VAL_77]] +// CHECK: store float 
%[[VAL_78]], ptr %[[VAL_72]], align 4 +// CHECK: ret void
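
The two new tests above check the generated IR in full; as a conceptual summary, the SECTIONS-with-reduction lowering they verify behaves roughly like the following C sketch (an illustration only, with placeholder names run_sections, lb, and ub; it is not code emitted by OpenMPIRBuilder nor part of this patch):

    /* Each thread receives a sub-range [lb, ub] of the section indices from
       the runtime's static worksharing-loop init call; a switch on the loop
       induction variable selects which omp.section body runs on an iteration. */
    static float run_sections(int lb, int ub) {
      float red = 0.0f;             /* thread-private copy, seeded from the
                                       reduction's init region */
      for (int iv = lb; iv <= ub; ++iv) {
        switch (iv) {
        case 0: red += 1.0f; break; /* body of the first omp.section */
        case 1: red += 2.0f; break; /* body of the second omp.section */
        }
      }
      /* Per-thread copies are then combined into the original variable via
         __kmpc_reduce and the declared combiner region, followed by a barrier
         unless nowait is given. */
      return red;
    }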