apple · bgogul · Oct 12, 2018 · Oct 8, 2018 · Oct 9, 2018 · Oct 9, 2018
diff --git a/lib/SILOptimizer/Mandatory/TFCanonicalizeCFG.cpp b/lib/SILOptimizer/Mandatory/TFCanonicalizeCFG.cpp
@@ -335,8 +335,8 @@ class BasicBlockCloner : public SILClonerWithScopes<BasicBlockCloner> {
 
   bool hasCloned() const { return cloned; }
 
-  /// Return a cloned block.
-  SILBasicBlock *cloneBlock(SILBasicBlock *bb) {
+  /// Create a block and clone everything except the instructions.
+  SILBasicBlock *initBlock(SILBasicBlock *bb) {
     auto bbIt = BBMap.find(bb);
     if (bbIt != BBMap.end())
       return bbIt->second;
@@ -354,13 +354,26 @@ class BasicBlockCloner : public SILClonerWithScopes<BasicBlockCloner> {
       ValueMap[arg] = newBB->createPhiArgument(
           arg->getType(), arg->getOwnershipKind(), arg->getDecl());
     }
-    // Clone all the instructions.
+    return newBB;
+  }
+
+  /// Clone all the instructions of bb into its clone and return the cloned
+  /// block. bb must already have an entry in BBMap (presumably created by
+  /// initBlock()); this is checked by the assertion below.
+  SILBasicBlock *cloneBlock(SILBasicBlock *bb) {
+    auto bbIt = BBMap.find(bb);
+    assert(bbIt != BBMap.end() && "Block is not initialized before cloning.");
+    SILBasicBlock *newBB = bbIt->second;
+    // Emit the cloned instructions into the new block.
+    getBuilder().setInsertionPoint(newBB);
     for (auto &inst : *bb) {
       visit(&inst);
     }
     return newBB;
   }
 
+  /// Convenience entry point: run initBlock() on bb, then clone its
+  /// instructions with cloneBlock() and return the resulting block.
+  SILBasicBlock *initAndCloneBlock(SILBasicBlock *bb) {
+    initBlock(bb);
+    SILBasicBlock *clonedBB = cloneBlock(bb);
+    return clonedBB;
+  }
+
   /// Handle references to basic blocks when cloning.
   SILBasicBlock *remapBasicBlock(SILBasicBlock *bb) {
     // If the block was not cloned by this cloner, directly reference it.
@@ -370,6 +383,18 @@ class BasicBlockCloner : public SILClonerWithScopes<BasicBlockCloner> {
       return bbIt->second;
     return bb;
   }
+
+  /// Look up Value in ValueMap and return what it is mapped to. A value
+  /// without an entry is returned unchanged.
+  SILValue remapValue(SILValue Value) {
+    const auto mapped = ValueMap.find(Value);
+    return mapped == ValueMap.end() ? Value : mapped->second;
+  }
+
+  /// Record the mapping oldValue -> newValue in ValueMap. Asserts that
+  /// oldValue has not been given a mapping before.
+  void updateValueMap(SILValue oldValue, SILValue newValue) {
+    bool inserted = ValueMap.try_emplace(oldValue, newValue).second;
+    assert(inserted &&
+           "Remapping value multiple times during SESE cloning.");
+    // `inserted` is only read by the assert; keep builds without assertions
+    // free of an unused-variable warning.
+    (void)inserted;
+  }
 };
 
 }  // namespace
@@ -382,7 +407,8 @@ class SingleExitLoopTransformer {
                             PostDominanceInfo *PDI)
       : deviceInfo(deviceInfo), DI(DI), PDI(PDI), LI(LI), loop(loop),
         header(loop->getHeader()), preheader(loop->getLoopPreheader()),
-        latch(loop->getLoopLatch()), currentFn(header->getParent()) {
+        latch(loop->getLoopLatch()), currentFn(header->getParent()),
+        oldHeaderNumArgs(header->getNumArguments()), hasUndefsAtPreheader(false) {
     assert(preheader && "Canonicalization should have given us one preheader");
     assert(latch && "Canonicalization should have given us one latch block");
     initialize();
@@ -421,6 +447,9 @@ class SingleExitLoopTransformer {
   /// we will get a single exit block.
   void ensureSingleExitBlock();
 
+  /// Unroll the body of the loop once.
+  void unrollLoopBody();
+
   /// Compute escaping values and what values to use as arguments at preheader.
   llvm::DenseMap<SILValue, SILValue> computeEscapingValuesSubstMap() const;
 
@@ -471,6 +500,10 @@ class SingleExitLoopTransformer {
   SILBasicBlock *preheader;
   SILBasicBlock *latch;
   SILFunction *currentFn;
+  unsigned oldHeaderNumArgs;
+  /// Flag to track if we have undefs at preheader corresponding to escaping
+  /// values and exit args.
+  bool hasUndefsAtPreheader;
   /// Equivalence classes induced by argument passing.
   llvm::EquivalenceClasses<SILValue> equivalentValues;
   /// exit blocks of the loop.
@@ -606,7 +639,7 @@ void SingleExitLoopTransformer::ensureSingleExitBlock() {
         if (DI->properlyDominates(succ, header)) continue;
 
         // Clone the block and rewire the edge.
-        SILBasicBlock *clonedSucc = cloner.cloneBlock(succ);
+        SILBasicBlock *clonedSucc = cloner.initAndCloneBlock(succ);
         changeBranchTarget(current->getTerminator(), edgeIdx, clonedSucc,
                            /*preserveArgs*/ true);
         worklist.insert(clonedSucc);
@@ -839,10 +872,12 @@ void SingleExitLoopTransformer::patchPreheader(SILBasicBlock *newHeader) {
   // Simply pass in an undef. This will never be accessed at runtime.
   SmallVector<SILValue, 8> newArgs;
   for (const auto &kv : escapingValueSubstMap) {
+    hasUndefsAtPreheader |= isa<SILUndef>(kv.second);
     newArgs.push_back(kv.second);
   }
   if (TFNoUndefsInSESE) {
     for (const auto &kv : exitArgSubstMap) {
+      hasUndefsAtPreheader |= isa<SILUndef>(kv.second);
       newArgs.push_back(kv.second);
     }
   }
@@ -862,11 +897,6 @@ SingleExitLoopTransformer::patchEdges(SILBasicBlock *newHeader,
 
   llvm::DenseMap<SILBasicBlock *, intmax_t> exitIndices;
 
-  unsigned oldHeaderNumArgs =
-      newHeader->getNumArguments() -
-      (escapingValueSubstMap.size() + exitArgSubstMap.size() +
-       /* exitIndex, stayInLoop*/ 2);
-
   // Identify the exit from the header (if any) and assign '0' as its index.
   SILBasicBlock *headerExit = nullptr;
   for (SILBasicBlock *succ : header->getSuccessorBlocks()) {
@@ -1103,6 +1133,15 @@ bool SingleExitLoopTransformer::transform() {
 
   // Update the loop header to newHeader.
   loop->moveToHeader(newHeader);
+
+  if (TFNoUndefsInSESE) {
+    // If we still have undefs at preheader, simply clone the loop body once
+    // before the actual loop.
+    if (hasUndefsAtPreheader) {
+      unrollLoopBody();
+    }
+  }
+
   return true;
 }
 
@@ -1150,6 +1189,128 @@ void SESERegionBuilder::ensureSingleExitFromLoops() {
   }
 }
 
+/// Unroll the body of the loop once before the actual loop: clone the loop
+/// blocks reachable from the original header (the new header is not cloned),
+/// redirect the preheader to the clone of the original header, and patch
+/// the edges into the cloned latch that still use arguments of the new header.
+void SingleExitLoopTransformer::unrollLoopBody() {
+  BasicBlockCloner cloner(*currentFn);
+  // Setup cloner so that newHeader's arguments are replaced with values in
+  // preheader. Only the first oldHeaderNumArgs arguments are mapped here.
+  SILBasicBlock *newHeader = loop->getHeader();
+  auto preheaderTermInst = dyn_cast<BranchInst>(preheader->getTerminator());
+  assert(preheaderTermInst &&
+         "Preheader of a loop has a non-branch terminator");
+  for (unsigned argIndex = 0; argIndex < oldHeaderNumArgs; ++argIndex) {
+    auto preHeaderArg = preheaderTermInst->getArg(argIndex);
+    auto newHeaderArg = newHeader->getArgument(argIndex);
+    cloner.updateValueMap(newHeaderArg, preHeaderArg);
+  }
+  // Clone everything except the new header, starting at the original header.
+  // All blocks are initialized first, in the order they are visited, and
+  // their instructions are cloned afterwards in that same order.
+  // NOTE(review): this comment used to ask for a depth first traversal so
+  // that values are cloned before they are used, but `*worklist.begin()` on a
+  // SmallPtrSet does not promise any particular order; confirm that the visit
+  // order is sufficient.
+  SmallPtrSet<SILBasicBlock *, 32> worklist;
+  SmallVector<SILBasicBlock *, 32> initializedBlocks;
+  worklist.insert(header);
+  while (!worklist.empty()) {
+    SILBasicBlock *current = *worklist.begin();
+    worklist.erase(current);
+    cloner.initBlock(current);
+    initializedBlocks.push_back(current);
+    for (SILBasicBlock *succ : current->getSuccessorBlocks()) {
+      // Skip if succ is not a part of the loop, already has a clone, or
+      // is the new header.
+      if (!loop->contains(succ) || cloner.remapBasicBlock(succ) != succ ||
+          succ == newHeader) {
+        continue;
+      }
+      worklist.insert(succ);
+    }
+  }
+  for (SILBasicBlock *bb : initializedBlocks) {
+    cloner.cloneBlock(bb);
+  }
+
+  // Point the preheader at the clone of the original header instead of the
+  // new header.
+  SILBasicBlock *clonedHeader = cloner.remapBasicBlock(header);
+  replaceBranchTarget(preheader->getTerminator(), newHeader, clonedHeader,
+                      /*preserveArgs*/ false);
+
+  // Along a path in the loop body where an escaping value or an exit argument
+  // is not defined, the SESE loop canonicalization would have propagated the
+  // corresponding loop carried state that was added to the new header. However,
+  // these are not remapped when the loop body is unrolled (as we won't know
+  // what value to use in the unrolled body as it is undefined along that path).
+  // The following code patches these arguments by picking a value that
+  // dominates `pred` and is equivalent to the corresponding argument in the
+  // cloned block. e.g.,
+  //
+  //   do {
+  //     if (...) break;
+  //     i += 1
+  //   } while(...)
+  //   return i
+  //
+  // --CFG--
+  //   preheader: i0 = 0; br header(i0)
+  //
+  //   header(i0): cond ??, break, body
+  //
+  //   break: br exit(i0)
+  //
+  //   body: i1 = i0 + 1; cond ??, header(i1), exit(i1)
+  //
+  //   exit(i2): return i2
+  //
+  // --Canonicalized CFG (not everything is shown)--
+  //   preheader: i0 = 0; br newHeader(i0, undef)
+  //
+  //   newHeader(i0, i3): cond stayInLoop, header, exit(i3)
+  //
+  //   header: cond ??, break, body
+  //
+  //   break: br newLatch(i0, i3)
+  //
+  //   body: i1 = i0 + 1; cond ??, newHeader(i1, i1), newLatch(i1, i1)
+  //
+  //   newLatch(i4, i5): br newHeader(i4, i5)
+  //
+  //   exit(i2): return i2
+  //
+  // In the unrolled body of the loop, break will be cloned as follows:
+  // (prime refers to the cloned version):
+  //    break': br newLatch'(i0', i3)
+  //
+  // Note that i3 is not cloned, which is patched here as follows:
+  //    break': br newLatch'(i0', i1')
+  // `i1` is equivalent to `i3` as they both flow into the argument `i5` of
+  // `newLatch`.
+  SILBasicBlock *newLatch = loop->getLoopLatch();
+  SILBasicBlock *clonedNewLatch = cloner.remapBasicBlock(newLatch);
+  for (SILBasicBlock *pred : newLatch->getPredecessorBlocks()) {
+    auto predTermInst = dyn_cast<BranchInst>(pred->getTerminator());
+    assert(predTermInst &&
+           "Predecessor of the loop latch has a non-branch terminator");
+    for (unsigned argIndex = 0; argIndex < predTermInst->getNumArgs();
+         ++argIndex) {
+      auto arg = predTermInst->getArg(argIndex);
+      // Skip if this is not an uncloned argument as illustrated above, i.e.
+      // the edge value is not an argument of the new header.
+      auto *headerArg = dyn_cast<SILArgument>(arg);
+      if (!headerArg || headerArg->getParent() != newHeader) {
+        continue;
+      }
+      // Iterate over the incoming values of the corresponding argument in the
+      // latch block and pick one that is suitable to be used here.
+      auto destBBArg = newLatch->getArgument(argIndex);
+      SmallVector<SILValue, 8> incomingValues;
+      destBBArg->getIncomingPhiValues(incomingValues);
+      for (auto value : incomingValues) {
+        if (value != arg && DI->properlyDominates(value, predTermInst)) {
+          // A suitable value is found. Update the edge value in the unrolled
+          // loop with the corresponding cloned value.
+          SILBasicBlock *clonedPred = cloner.remapBasicBlock(pred);
+          changeEdgeValue(clonedPred->getTerminator(), clonedNewLatch, argIndex,
+                          cloner.remapValue(value));
+          break;
+        }
+      }
+    }
+  }
+}
+
 /// Process the specified loop, collapsing it into a SESE region node.  This
 /// forms a WhileLoopSESERegion node and puts it into the loopPreheaders data
 /// structure, allowing the outer level's acyclic region handling to pick it

diff --git a/test/TensorFlow/sese_loop_canonicalization.sil b/test/TensorFlow/sese_loop_canonicalization.sil
@@ -231,6 +231,11 @@ public func nestedLoopWithBreak(breakCount:Int32) -> Int32 {
 
 // CHECK-LABEL: --- XLA CFG Canonicalize: $doWhileLoop
 // CHECK: [sequence
+// CHECK:   {condition Header: {{bb[0-9]+}}
+// CHECK:     {condition Header: {{bb[0-9]+}}
+// CHECK:       block {{bb[0-9]+}}
+// CHECK:       block {{bb[0-9]+}}}
+// CHECK:     block {{bb[0-9]+}}}
 // CHECK:   <while Preheader: [[PHDR:bb[0-9]+]], Header: [[HDR:bb[0-9]+]], exit: [[EXIT:bb[0-9]+]]
 // CHECK:     [sequence
 // CHECK:       {condition Header: {{bb[0-9]+}}
@@ -243,8 +248,8 @@ public func nestedLoopWithBreak(breakCount:Int32) -> Int32 {
 
 // Make sure undef is still left in this case for now.
 // CHECK: sil @$doWhileLoop : {{.*}} (Builtin.Int32) -> Builtin.Int32 {
-// CHECK: [[PHDR]]({{.*}} : $Builtin.Int32):
-// CHECK: br [[HDR]]({{.*}} : $Builtin.Int32, undef : $Builtin.Int32, {{.*}} : $TensorHandle<Builtin.Int32>, {{.*}} : $TensorHandle<Builtin.Int1>)
+// CHECK: [[PHDR]]({{.*}} : $TensorHandle<Builtin.Int1>):
+// CHECK: br [[HDR]]({{.*}} : $Builtin.Int32, {{.*}} : $Builtin.Int32, {{.*}} : $TensorHandle<Builtin.Int32>, {{.*}} : $TensorHandle<Builtin.Int1>)
 
 sil @$doWhileLoop : $@convention(thin) (Builtin.Int32) -> Builtin.Int32 {
 bb0(%0 : $Builtin.Int32):
@@ -299,19 +304,19 @@ bb3 (%9 : $Builtin.Int32):
 //
 // CHECK-LABEL: --- XLA CFG Canonicalize: $loopThatRequiresNodeCloning
 // CHECK: [sequence
-// CHECK:   {condition Header: bb0
-// CHECK:     block bb1
+// CHECK:   {condition Header: {{bb[0-9]+}}
+// CHECK:     block {{bb[0-9]+}}
 // CHECK:     [sequence
-// CHECK:       <while Preheader: bb2, Header: bb9, exit: bb11
+// CHECK:       <while Preheader: {{bb[0-9]+}}, Header: {{bb[0-9]+}}, exit: {{bb[0-9]+}}
 // CHECK:         [sequence
-// CHECK:           {condition Header: bb3
-// CHECK:             block bb4
-// CHECK:             {condition Header: bb5
-// CHECK:               block bb7
-// CHECK:               block bb6}}
-// CHECK:           block bb10]>
-// CHECK:       block bb11]}
-// CHECK:   block bb8]
+// CHECK:           {condition Header: {{bb[0-9]+}}
+// CHECK:             block {{bb[0-9]+}}
+// CHECK:             {condition Header: {{bb[0-9]+}}
+// CHECK:               block {{bb[0-9]+}}
+// CHECK:               block {{bb[0-9]+}}}}
+// CHECK:           block {{bb[0-9]+}}]>
+// CHECK:       block {{bb[0-9]+}}]}
+// CHECK:   block {{bb[0-9]+}}]
 // CHECK: --- XLA CFG Canonicalize end
 sil @$loopThatRequiresNodeCloning : $@convention(thin) (Builtin.Int32, Builtin.Int32) -> Builtin.Int32 {
 bb0(%0 : $Builtin.Int32, %1 : $Builtin.Int32):

diff --git a/test/TensorFlowRuntime/sese_loop_canonicalization.swift b/test/TensorFlowRuntime/sese_loop_canonicalization.swift
@@ -89,6 +89,30 @@ ControlFlowTests.testAllBackends("sumOfProductsWithBound") {
   // Effectively no bound as natSum(3) * natSum(3) is 36.
   expectNearlyEqualWithScalarTensor(36, sumOfProductsWithBound(3, 3, 100))
 }
+
+
+/// Accumulates i = 1, 2, ... into a tensor with a `repeat`-`while` loop that
+/// runs while `i <= 100`, breaking out early once `i` equals `breakIndex`.
+/// The value of `i` at which the break happens is still added to the sum.
+func doWhileLoopWithBreak(_ breakIndex: Int32) -> Tensor<Int32> {
+  var i: Int32 = 1
+  var sum = Tensor<Int32>(0)
+  let maxCount: Int32 = 100
+  repeat {
+    sum += i
+    if i == breakIndex {
+      break
+    }
+    i += 1
+  } while i <= maxCount
+  return sum
+}
+
+ControlFlowTests.testAllBackends("doWhileLoopWithBreak") {
+  // Break taken inside the loop body: 1 + 2 and 1 + ... + 10.
+  expectEqualWithScalarTensor(3, doWhileLoopWithBreak(2))
+  expectEqualWithScalarTensor(55, doWhileLoopWithBreak(10))
+  // Break index never reached, or reached on the last iteration: the loop
+  // condition bounds the sum to 1 + ... + 100.
+  expectEqualWithScalarTensor(5050, doWhileLoopWithBreak(-300))
+  expectEqualWithScalarTensor(5050, doWhileLoopWithBreak(100))
+  expectEqualWithScalarTensor(5050, doWhileLoopWithBreak(200))
+}
+
 #endif // CUDA
 
 runAllTests()