ad-freiburg · joka921 · Apr 25, 2021 · Sep 23, 2020 · Apr 10, 2021 · Apr 10, 2021
diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp
@@ -174,19 +174,27 @@ struct resizeIfVec<vector<C>, C> {
  *                        its already allocated storage.
  */
 template <int IN_WIDTH, int OUT_WIDTH>
-void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
-                  size_t blockEnd, const IdTableView<IN_WIDTH>& input,
-                  const vector<ResultTable::ResultType>& inputTypes,
-                  IdTableStatic<OUT_WIDTH>* result, size_t resultRow,
-                  const ResultTable* inTable, ResultTable* outTable,
-                  const Index& index,
-                  ad_utility::HashSet<size_t>& distinctHashSet) {
+
+void GroupBy::processGroup(const GroupBy::Aggregate& a, size_t blockStart,
+                           size_t blockEnd, const IdTableView<IN_WIDTH>& input,
+
+                           const vector<ResultTable::ResultType>& inputTypes,
+                           IdTableStatic<OUT_WIDTH>* result, size_t resultRow,
+                           const ResultTable* inTable, ResultTable* outTable,
+                           const Index& index,
+                           ad_utility::HashSet<size_t>& distinctHashSet) const {
+  auto check = [this](size_t i) {
+    if (i % 32768 == 0) {
+      checkTimeout();
+    }
+  };
   switch (a._type) {
     case ParsedQuery::AggregateType::AVG: {
       float res = 0;
       if (inputTypes[a._inCol] == ResultTable::ResultType::VERBATIM) {
         if (a._distinct) {
           for (size_t i = blockStart; i <= blockEnd; i++) {
+            check(i);
             const auto it = distinctHashSet.find(input(i, a._inCol));
             if (it == distinctHashSet.end()) {
               distinctHashSet.insert(input(i, a._inCol));
@@ -196,6 +204,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
           distinctHashSet.clear();
         } else {
           for (size_t i = blockStart; i <= blockEnd; i++) {
+            check(i);
             res += input(i, a._inCol);
           }
         }
@@ -204,6 +213,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
         float tmpF;
         if (a._distinct) {
           for (size_t i = blockStart; i <= blockEnd; i++) {
+            check(i);
             const auto it = distinctHashSet.find(input(i, a._inCol));
             if (it == distinctHashSet.end()) {
               distinctHashSet.insert(input(i, a._inCol));
@@ -214,6 +224,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
           distinctHashSet.clear();
         } else {
           for (size_t i = blockStart; i <= blockEnd; i++) {
+            check(i);
             std::memcpy(&tmpF, &input(i, a._inCol), sizeof(float));
             res += tmpF;
           }
@@ -224,6 +235,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
       } else {
         if (a._distinct) {
           for (size_t i = blockStart; i <= blockEnd; i++) {
+            check(i);
             const auto it = distinctHashSet.find(input(i, a._inCol));
             if (it == distinctHashSet.end()) {
               distinctHashSet.insert(input(i, a._inCol));
@@ -242,6 +254,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
           distinctHashSet.clear();
         } else {
           for (size_t i = blockStart; i <= blockEnd; i++) {
+            check(i);
             // load the string, parse it as an xsd::int or float
             // TODO(schnelle): What's the correct way to handle OPTIONAL here
             std::string entity =
@@ -265,6 +278,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
       if (a._distinct) {
         size_t count = 0;
         for (size_t i = blockStart; i <= blockEnd; i++) {
+          check(i);
           const auto it = distinctHashSet.find(input(i, a._inCol));
           if (it == distinctHashSet.end()) {
             count++;
@@ -283,6 +297,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
       if (inputTypes[a._inCol] == ResultTable::ResultType::VERBATIM) {
         if (a._distinct) {
           for (size_t i = blockStart; i + 1 <= blockEnd; i++) {
+            check(i);
             const auto it = distinctHashSet.find(input(i, a._inCol));
             if (it == distinctHashSet.end()) {
               distinctHashSet.insert(input(i, a._inCol));
@@ -296,6 +311,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
           distinctHashSet.clear();
         } else {
           for (size_t i = blockStart; i + 1 <= blockEnd; i++) {
+            check(i);
             out << input(i, a._inCol) << *delim;
           }
           out << input(blockEnd, a._inCol);
@@ -304,6 +320,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
         float f;
         if (a._distinct) {
           for (size_t i = blockStart; i + 1 <= blockEnd; i++) {
+            check(i);
             const auto it = distinctHashSet.find(input(i, a._inCol));
             if (it == distinctHashSet.end()) {
               distinctHashSet.insert(input(i, a._inCol));
@@ -319,6 +336,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
           distinctHashSet.clear();
         } else {
           for (size_t i = blockStart; i + 1 <= blockEnd; i++) {
+            check(i);
             std::memcpy(&f, &input(i, a._inCol), sizeof(float));
             out << f << *delim;
           }
@@ -328,6 +346,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
       } else if (inputTypes[a._inCol] == ResultTable::ResultType::TEXT) {
         if (a._distinct) {
           for (size_t i = blockStart; i + 1 <= blockEnd; i++) {
+            check(i);
             const auto it = distinctHashSet.find(input(i, a._inCol));
             if (it == distinctHashSet.end()) {
               distinctHashSet.insert(input(i, a._inCol));
@@ -341,13 +360,15 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
           distinctHashSet.clear();
         } else {
           for (size_t i = blockStart; i + 1 <= blockEnd; i++) {
+            check(i);
             out << index.getTextExcerpt(input(i, a._inCol)) << *delim;
           }
           out << index.getTextExcerpt(input(blockEnd, a._inCol));
         }
       } else if (inputTypes[a._inCol] == ResultTable::ResultType::LOCAL_VOCAB) {
         if (a._distinct) {
           for (size_t i = blockStart; i + 1 <= blockEnd; i++) {
+            check(i);
             const auto it = distinctHashSet.find(input(i, a._inCol));
             if (it == distinctHashSet.end()) {
               distinctHashSet.insert(input(i, a._inCol));
@@ -366,6 +387,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
           distinctHashSet.clear();
         } else {
           for (size_t i = blockStart; i + 1 <= blockEnd; i++) {
+            check(i);
             // TODO(schnelle): What's the correct way to handle OPTIONAL here
             out << inTable->idToOptionalString(input(i, a._inCol)).value_or("")
                 << *delim;
@@ -376,6 +398,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
       } else {
         if (a._distinct) {
           for (size_t i = blockStart; i + 1 <= blockEnd; i++) {
+            check(i);
             const auto it = distinctHashSet.find(input(i, a._inCol));
             if (it == distinctHashSet.end()) {
               distinctHashSet.insert(input(i, a._inCol));
@@ -405,6 +428,7 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
           distinctHashSet.clear();
         } else {
           for (size_t i = blockStart; i + 1 <= blockEnd; i++) {
+            check(i);
             // TODO(schnelle): What's the correct way to handle OPTIONAL here
             std::string entity =
                 index.idToOptionalString(input(i, a._inCol)).value_or("");
@@ -583,12 +607,12 @@ void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
 }
 
 template <int IN_WIDTH, int OUT_WIDTH>
-void doGroupBy(const IdTable& dynInput,
-               const vector<ResultTable::ResultType>& inputTypes,
-               const vector<size_t>& groupByCols,
-               const vector<GroupBy::Aggregate>& aggregates, IdTable* dynResult,
-               const ResultTable* inTable, ResultTable* outTable,
-               const Index& index) {
+void GroupBy::doGroupBy(const IdTable& dynInput,
+                        const vector<ResultTable::ResultType>& inputTypes,
+                        const vector<size_t>& groupByCols,
+                        const vector<GroupBy::Aggregate>& aggregates,
+                        IdTable* dynResult, const ResultTable* inTable,
+                        ResultTable* outTable, const Index& index) const {
   LOG(DEBUG) << "Group by input size " << dynInput.size() << std::endl;
   if (dynInput.size() == 0) {
     return;
@@ -621,6 +645,9 @@ void doGroupBy(const IdTable& dynInput,
   size_t blockStart = 0;
   size_t blockEnd = 0;
   for (size_t pos = 1; pos < input.size(); pos++) {
+    if (pos % 32768 == 0) {
+      checkTimeout();
+    }
     bool rowMatchesCurrentBlock = true;
     for (size_t i = 0; i < currentGroupBlock.size(); i++) {
       if (input(pos, currentGroupBlock[i].first) !=
@@ -764,16 +791,23 @@ void GroupBy::computeResult(ResultTable* result) {
 
   int inWidth = subresult->_data.cols();
   int outWidth = result->_data.cols();
-  CALL_FIXED_SIZE_2(inWidth, outWidth, doGroupBy, subresult->_data,
-                    inputResultTypes, groupByCols, aggregates, &result->_data,
-                    subresult.get(), result, getIndex());
-
-  // Free the user data used by GROUP_CONCAT aggregates.
-  for (Aggregate& a : aggregates) {
-    if (a._type == ParsedQuery::AggregateType::GROUP_CONCAT) {
-      delete static_cast<std::string*>(a._userData);
+
+  auto cleanup = [&]() {
+    // Free the user data used by GROUP_CONCAT aggregates.
+    for (Aggregate& a : aggregates) {
+      if (a._type == ParsedQuery::AggregateType::GROUP_CONCAT) {
+        delete static_cast<std::string*>(a._userData);
+      }
     }
+  };
+  try {
+    CALL_FIXED_SIZE_2(inWidth, outWidth, doGroupBy, subresult->_data,
+                      inputResultTypes, groupByCols, aggregates, &result->_data,
+                      subresult.get(), result, getIndex());
+  } catch (...) {
+    cleanup();
+    throw;
   }
-
+  cleanup();
   LOG(DEBUG) << "GroupBy result computation done." << std::endl;
 }
diff --git a/src/engine/GroupBy.h b/src/engine/GroupBy.h
@@ -98,13 +98,23 @@ class GroupBy : public Operation {
   ad_utility::HashMap<string, size_t> _varColMap;
 
   virtual void computeResult(ResultTable* result) override;
-};
 
-// This method is declared here solely for unit testing purposes
-template <int IN_WIDTH, int OUT_WIDTH>
-void doGroupBy(const IdTable& dynInput,
-               const vector<ResultTable::ResultType>& inputTypes,
-               const vector<size_t>& groupByCols,
-               const vector<GroupBy::Aggregate>& aggregates, IdTable* dynResult,
-               const ResultTable* inTable, ResultTable* outTable,
-               const Index& index);
+  template <int IN_WIDTH, int OUT_WIDTH>
+  void processGroup(const GroupBy::Aggregate& a, size_t blockStart,
+                    size_t blockEnd, const IdTableView<IN_WIDTH>& input,
+                    const vector<ResultTable::ResultType>& inputTypes,
+                    IdTableStatic<OUT_WIDTH>* result, size_t resultRow,
+                    const ResultTable* inTable, ResultTable* outTable,
+                    const Index& index,
+                    ad_utility::HashSet<size_t>& distinctHashSet) const;
+
+  template <int IN_WIDTH, int OUT_WIDTH>
+  void doGroupBy(const IdTable& dynInput,
+                 const vector<ResultTable::ResultType>& inputTypes,
+                 const vector<size_t>& groupByCols,
+                 const vector<GroupBy::Aggregate>& aggregates,
+                 IdTable* dynResult, const ResultTable* inTable,
+                 ResultTable* outTable, const Index& index) const;
+
+  FRIEND_TEST(GroupByTest, doGroupBy);
+};
diff --git a/src/engine/IndexScan.cpp b/src/engine/IndexScan.cpp
@@ -280,7 +280,7 @@ void IndexScan::computePSOfreeS(ResultTable* result) const {
   result->_resultTypes.push_back(ResultTable::ResultType::KB);
   result->_sortedBy = {0, 1};
   const auto& idx = _executionContext->getIndex();
-  idx.scan(_predicate, &result->_data, idx._PSO);
+  idx.scan(_predicate, &result->_data, idx._PSO, _timeoutTimer);
 }
 
 // _____________________________________________________________________________
@@ -299,7 +299,7 @@ void IndexScan::computePOSfreeO(ResultTable* result) const {
   result->_resultTypes.push_back(ResultTable::ResultType::KB);
   result->_sortedBy = {0, 1};
   const auto& idx = _executionContext->getIndex();
-  idx.scan(_predicate, &result->_data, idx._POS);
+  idx.scan(_predicate, &result->_data, idx._POS, _timeoutTimer);
 }
 
 // _____________________________________________________________________________
@@ -338,7 +338,7 @@ void IndexScan::computeSPOfreeP(ResultTable* result) const {
   result->_resultTypes.push_back(ResultTable::ResultType::KB);
   result->_sortedBy = {0, 1};
   const auto& idx = _executionContext->getIndex();
-  idx.scan(_subject, &result->_data, idx._SPO);
+  idx.scan(_subject, &result->_data, idx._SPO, _timeoutTimer);
 }
 
 // _____________________________________________________________________________
@@ -357,7 +357,7 @@ void IndexScan::computeSOPfreeO(ResultTable* result) const {
   result->_resultTypes.push_back(ResultTable::ResultType::KB);
   result->_sortedBy = {0, 1};
   const auto& idx = _executionContext->getIndex();
-  idx.scan(_subject, &result->_data, idx._SOP);
+  idx.scan(_subject, &result->_data, idx._SOP, _timeoutTimer);
 }
 
 // _____________________________________________________________________________
@@ -377,7 +377,7 @@ void IndexScan::computeOSPfreeS(ResultTable* result) const {
   result->_resultTypes.push_back(ResultTable::ResultType::KB);
   result->_sortedBy = {0, 1};
   const auto& idx = _executionContext->getIndex();
-  idx.scan(_object, &result->_data, idx._OSP);
+  idx.scan(_object, &result->_data, idx._OSP, _timeoutTimer);
 }
 
 // _____________________________________________________________________________

diff --git a/src/engine/Join.cpp b/src/engine/Join.cpp
@@ -18,7 +18,7 @@ Join::Join(QueryExecutionContext* qec, std::shared_ptr<QueryExecutionTree> t1,
            size_t t2JoinCol, bool keepJoinColumn)
     : Operation(qec) {
   // Make sure subtrees are ordered so that identical queries can be identified.
-  if (t1.get()->asString() < t2.get()->asString()) {
+  if (t1 && t2 && t1.get()->asString() < t2.get()->asString()) {
     _left = t1;
     _leftJoinCol = t1JoinCol;
     _right = t2;
@@ -336,6 +336,8 @@ void Join::doComputeJoinWithFullScanDummyLeft(const IdTable& ndr,
       // Do a scan.
       LOG(TRACE) << "Inner scan with ID: " << currentJoinId << endl;
       IdTable jr(2, _executionContext->getAllocator());
+      checkTimeout();  // the scan is a disk operation, so we can check the
+                       // timeout frequently
       scan(currentJoinId, &jr);
       LOG(TRACE) << "Got #items: " << jr.size() << endl;
       // Build the cross product.
@@ -376,6 +378,8 @@ void Join::doComputeJoinWithFullScanDummyRight(const IdTable& ndr,
     } else {
       // Do a scan.
       LOG(TRACE) << "Inner scan with ID: " << currentJoinId << endl;
+      checkTimeout();  // the scan is a disk operation, so we can check the
+                       // timeout frequently
       IdTable jr(2, _executionContext->getAllocator());
       scan(currentJoinId, &jr);
       LOG(TRACE) << "Got #items: " << jr.size() << endl;
@@ -525,13 +529,19 @@ void Join::join(const IdTable& dynA, size_t jc1, const IdTable& dynB,
     while (i < a.size() && j < b.size()) {
       while (a(i, jc1) < b(j, jc2)) {
         ++i;
+        if (i % (1024 * 16) == 0) {
+          checkTimeout();
+        }
         if (i >= a.size()) {
           goto finish;
         }
       }
 
       while (b(j, jc2) < a(i, jc1)) {
         ++j;
+        if (j % (1024 * 16) == 0) {
+          checkTimeout();
+        }
         if (j >= b.size()) {
           goto finish;
         }
@@ -559,12 +569,18 @@ void Join::join(const IdTable& dynA, size_t jc1, const IdTable& dynB,
           }
 
           ++j;
+          if (j % (1024 * 4) == 0) {
+            checkTimeout();
+          }
           if (j >= b.size()) {
             // The next i might still match
             break;
           }
         }
         ++i;
+        if (i % (1024 * 4) == 0) {
+          checkTimeout();
+        }
         if (i >= a.size()) {
           goto finish;
         }

diff --git a/src/engine/Join.h b/src/engine/Join.h
@@ -63,8 +63,8 @@ class Join : public Operation {
    * the result in dynRes. Creates a cross product for matching rows
    **/
   template <int L_WIDTH, int R_WIDTH, int OUT_WIDTH>
-  static void join(const IdTable& dynA, size_t jc1, const IdTable& dynB,
-                   size_t jc2, IdTable* dynRes);
+  void join(const IdTable& dynA, size_t jc1, const IdTable& dynB, size_t jc2,
+            IdTable* dynRes);
 
   class RightLargerTag {};
   class LeftLargerTag {};