-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
SetOfIntervals as a base for efficient SparqlRange expressions
- Loading branch information
Showing
5 changed files
with
371 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
// Copyright 2021, University of Freiburg, | ||
// Chair of Algorithms and Data Structures. | ||
// Author: Johannes Kalmbach<joka921> (johannes.kalmbach@gmail.com) | ||
|
||
#include "SetOfIntervals.h" | ||
|
||
namespace ad_utility { | ||
// ___________________________________________________________________________ | ||
// Sort the intervals of `input` in ascending order of their left end and
// return the sorted set. Throws (via `AD_CHECK`) if any interval is empty,
// i.e. `second <= first`, or if two intervals overlap. Intervals that merely
// touch (the right end of one equals the left end of the next) are accepted.
SetOfIntervals SortAndCheckDisjointAndNonempty(SetOfIntervals input) {
  auto& vec = input._intervals;
  std::sort(vec.begin(), vec.end(),
            [](const auto& a, const auto& b) { return a.first < b.first; });

  // Every interval [first, second) must contain at least one element.
  for (const auto& interval : vec) {
    AD_CHECK(interval.second > interval.first);
  }

  // After sorting by the left end, the set is disjoint if and only if each
  // interval starts at or after the right end of its predecessor.
  for (size_t i = 1; i < vec.size(); ++i) {
    AD_CHECK(vec[i].first >= vec[i - 1].second);
  }

  return input;
}
|
||
// ___________________________________________________________________________ | ||
// Compute the intersection of `A` and `B`. Both inputs are sorted and
// validated first, so they may be passed in any order. The result is sorted,
// disjoint, and touching intervals are merged.
SetOfIntervals Intersection::operator()(SetOfIntervals A,
                                        SetOfIntervals B) const {
  A = SortAndCheckDisjointAndNonempty(std::move(A));
  B = SortAndCheckDisjointAndNonempty(std::move(B));

  SetOfIntervals result;
  auto itA = A._intervals.begin();
  auto itB = B._intervals.begin();
  const auto endA = A._intervals.end();
  const auto endB = B._intervals.end();

  // Everything below `lowerBound` has been decided already: it is either part
  // of the result or can never become part of it. Clipping new intervals at
  // this bound keeps the result intervals disjoint.
  size_t lowerBound = 0;

  // Zipper-style merge: always look at the current interval of A and of B.
  while (itA < endA && itB < endB) {
    // `earlier` is the iterator whose interval starts first (on ties this is
    // `itB`), `later` is the other one. Both are references, so advancing
    // them advances `itA` / `itB`.
    const bool aStartsFirst = itA->first < itB->first;
    auto& earlier = aStartsFirst ? itA : itB;
    auto& later = aStartsFirst ? itB : itA;

    // The overlap of the two intervals starts where the later interval starts
    // and ends at the smaller of the two right ends. It is additionally
    // clipped at `lowerBound` (the value from before this iteration).
    const size_t overlapEnd = std::min(later->second, earlier->second);
    const size_t overlapBegin = std::max(later->first, lowerBound);
    lowerBound = std::max(lowerBound, overlapEnd);

    if (overlapBegin < overlapEnd) {
      result._intervals.emplace_back(overlapBegin, overlapEnd);
    }

    // `lowerBound` is now >= the smaller right end, so at least one of the
    // two iterators is advanced and the loop makes progress.
    if (lowerBound >= earlier->second) {
      ++earlier;
    }
    if (lowerBound >= later->second) {
      ++later;
    }
  }

  return CheckSortedAndDisjointAndSimplify(
      SortAndCheckDisjointAndNonempty(std::move(result)));
}
|
||
// __________________________________________________________________________ | ||
// Compute the union of `A` and `B`. Both inputs are sorted and validated
// first, so they may be passed in any order. The result is sorted, disjoint,
// and touching intervals are merged.
SetOfIntervals Union::operator()(SetOfIntervals A, SetOfIntervals B) const {
  A = SortAndCheckDisjointAndNonempty(std::move(A));
  B = SortAndCheckDisjointAndNonempty(std::move(B));

  SetOfIntervals result;

  // Everything below `lowerBound` has been decided already: it is either part
  // of the result or can never become part of it. Clipping new intervals at
  // this bound keeps the result intervals disjoint.
  size_t lowerBound = 0;

  // Clip `interval` so that it starts at or after `lowerBound`, raise
  // `lowerBound` to the interval's right end, and append the clipped interval
  // unless the clipping made it empty.
  auto appendClipped = [&](std::pair<size_t, size_t> interval) {
    const size_t clippedBegin = std::max(lowerBound, interval.first);
    lowerBound = std::max(lowerBound, interval.second);
    if (clippedBegin < interval.second) {
      result._intervals.emplace_back(clippedBegin, interval.second);
    }
  };

  auto itA = A._intervals.begin();
  auto itB = B._intervals.begin();
  const auto endA = A._intervals.end();
  const auto endB = B._intervals.end();

  // Zipper-style merge: always look at the current interval of A and of B.
  while (itA < endA && itB < endB) {
    // `earlier` is the iterator whose interval starts first (on ties this is
    // `itB`), `later` is the other one. Both are references, so advancing
    // them advances `itA` / `itB`.
    const bool aStartsFirst = itA->first < itB->first;
    auto& earlier = aStartsFirst ? itA : itB;
    auto& later = aStartsFirst ? itB : itA;

    if (later->first < earlier->second) {
      // The two intervals overlap: emit their combined range and consume
      // both of them.
      appendClipped(std::pair<size_t, size_t>{
          earlier->first, std::max(later->second, earlier->second)});
      ++earlier;
      ++later;
    } else {
      // No overlap: emit the earlier interval (clipped at `lowerBound`, which
      // may make it empty) and keep the later one for the next round.
      appendClipped(*earlier);
      ++earlier;
    }
  }

  // At most one of the inputs still has intervals left; append them.
  for (; itA != endA; ++itA) {
    appendClipped(*itA);
  }
  for (; itB != endB; ++itB) {
    appendClipped(*itB);
  }

  return CheckSortedAndDisjointAndSimplify(
      SortAndCheckDisjointAndNonempty(std::move(result)));
}
|
||
// ___________________________________________________________________________ | ||
// Verify (via `AD_CHECK`) that each interval of `inputSet` starts at or after
// the right end of the intervals preceding it, and return an equivalent set in
// which directly adjacent intervals ([a, b) followed by [b, c)) are merged
// into a single interval [a, c).
SetOfIntervals CheckSortedAndDisjointAndSimplify(
    const SetOfIntervals& inputSet) {
  const auto& intervals = inputSet._intervals;
  SetOfIntervals simplified;
  if (intervals.empty()) {
    return simplified;
  }

  // `pending` is the (possibly already extended) interval that has not been
  // written to the output yet.
  auto pending = intervals.front();
  for (auto it = intervals.begin() + 1; it != intervals.end(); ++it) {
    AD_CHECK(it->first >= pending.second);
    if (it->first == pending.second) {
      // Directly adjacent: extend the pending interval to the right.
      pending.second = it->second;
    } else {
      // There is a gap: the pending interval is final.
      simplified._intervals.push_back(pending);
      pending = *it;
    }
  }
  simplified._intervals.push_back(pending);
  return simplified;
}
|
||
} // namespace ad_utility |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
// Copyright 2021, University of Freiburg, | ||
// Chair of Algorithms and Data Structures. | ||
// Author: Johannes Kalmbach<joka921> (johannes.kalmbach@gmail.com) | ||
|
||
#pragma once | ||
|
||
#include <algorithm>
#include <utility>
#include <vector>
|
||
#include "../util/Exception.h" | ||
#include "../util/TypeTraits.h" | ||
|
||
namespace ad_utility { | ||
|
||
/// A set of `size_t` values, stored as a vector of pairs <size_t, size_t>.
/// Each pair denotes the half-open interval [first, second), and the set is
/// the union of all these intervals. The intervals have to be pairwise
/// disjoint and nonempty, and they have to be sorted in ascending order.
/// The struct itself is a plain aggregate and does not enforce these
/// invariants; see `SortAndCheckDisjointAndNonempty` and
/// `CheckSortedAndDisjointAndSimplify`.
struct SetOfIntervals { | ||
// The container type that holds the [first, second) pairs.
using Vec = std::vector<std::pair<size_t, size_t>>; | ||
// The intervals of this set.
Vec _intervals; | ||
// Memberwise equality: compares the stored vectors of pairs, so two sets
// that cover the same values with differently split intervals are unequal.
// _________________________________________________________________________
bool operator==(const SetOfIntervals&) const = default; | ||
}; | ||
|
||
/// Sort the intervals in ascending order and assert that they are indeed | ||
/// disjoint and nonempty. | ||
SetOfIntervals SortAndCheckDisjointAndNonempty(SetOfIntervals input); | ||
|
||
/// Assert that the set is sorted, and simplify it by merging adjacent | ||
/// intervals. | ||
SetOfIntervals CheckSortedAndDisjointAndSimplify(const SetOfIntervals& input); | ||
|
||
/// Function object that computes the intersection of two sets of intervals.
/// The inputs are sorted and validated by the implementation, the result is
/// sorted with adjacent intervals merged.
struct Intersection { | ||
SetOfIntervals operator()(SetOfIntervals A, SetOfIntervals B) const; | ||
}; | ||
|
||
/// Function object that computes the union of two sets of intervals.
/// The inputs are sorted and validated by the implementation, the result is
/// sorted with adjacent intervals merged.
struct Union { | ||
SetOfIntervals operator()(SetOfIntervals A, SetOfIntervals B) const; | ||
}; | ||
|
||
// Write `targetSize` many bools to the iterator. The i-th bool is true if | ||
// and only if `i` is contained in the set of intervals. `targetSize` has to | ||
// be >= the right end (not included) of the rightmost interval. | ||
template <typename OutputIterator> | ||
void toBitContainer(const SetOfIntervals& s, size_t targetSize, | ||
OutputIterator it) { | ||
size_t previousEnd = 0; | ||
for (const auto& [begin, end] : s._intervals) { | ||
AD_CHECK(end <= targetSize); | ||
auto spaceUntilInterval = begin - previousEnd; | ||
std::fill(it, it + spaceUntilInterval, false); | ||
it += spaceUntilInterval; | ||
|
||
auto sizeOfInterval = end - begin; | ||
std::fill(it, it + sizeOfInterval, true); | ||
it += sizeOfInterval; | ||
|
||
previousEnd = end; | ||
} | ||
} | ||
|
||
// Expand the set of intervals `a` into a std::vector<bool> with `targetSize`
// elements. The element at index i is true if and only if i is contained in
// `a`. `targetSize` has to be >= the right end (not included) of the rightmost
// interval.
// __________________________________________________________________________
inline std::vector<bool> toBitVector(const SetOfIntervals& a,
                                     size_t targetSize) {
  std::vector<bool> bits(targetSize, false);
  toBitContainer(a, targetSize, bits.begin());
  return bits;
}
|
||
} // namespace ad_utility |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
// Copyright 2021, University of Freiburg, | ||
// Chair of Algorithms and Data Structures. | ||
// Author: Johannes Kalmbach <johannes.kalmbach@gmail.com> | ||
|
||
#include <gtest/gtest.h> | ||
|
||
#include <unordered_set> | ||
|
||
#include "../src/parser/SetOfIntervals.h" | ||
#include "../src/util/Random.h" | ||
|
||
using namespace ad_utility; | ||
|
||
// Sorting and validation of sets of intervals.
TEST(SetOfIntervals, SortAndCheckDisjointAndNonempty) {
  // A valid set that is already sorted is returned unchanged.
  SetOfIntervals sorted{{{0, 2}, {2, 5}, {6, 12}}};
  auto sortedAgain = SortAndCheckDisjointAndNonempty(sorted);
  ASSERT_EQ(sorted, sortedAgain);

  // The same intervals in a different order are brought into sorted order.
  SetOfIntervals shuffled{{{6, 12}, {0, 2}, {2, 5}}};
  ASSERT_EQ(sorted, SortAndCheckDisjointAndNonempty(shuffled));

  // A set without any intervals is valid.
  SetOfIntervals noIntervals{};
  ASSERT_EQ(noIntervals, SortAndCheckDisjointAndNonempty(noIntervals));

  // The interval [2, 2) is empty, which is not allowed.
  SetOfIntervals withEmptyInterval{{{4, 5}, {2, 2}}};
  ASSERT_THROW(SortAndCheckDisjointAndNonempty(withEmptyInterval),
               ad_semsearch::Exception);

  // The intervals [2, 5) and [4, 6) overlap, which is not allowed.
  SetOfIntervals withOverlap{{{4, 6}, {2, 5}}};
  ASSERT_THROW(SortAndCheckDisjointAndNonempty(withOverlap),
               ad_semsearch::Exception);
}
|
||
// Merging of directly adjacent intervals and detection of unsorted input.
TEST(SetOfIntervals, CheckSortedAndDisjointAndSimplify) {
  // Intervals that are separated by gaps are left untouched.
  SetOfIntervals withGaps{{{0, 2}, {3, 5}, {6, 8}}};
  ASSERT_EQ(withGaps, CheckSortedAndDisjointAndSimplify(withGaps));

  // A chain of directly adjacent intervals collapses into a single interval.
  SetOfIntervals allAdjacent{{{0, 2}, {2, 5}, {5, 8}}};
  SetOfIntervals fullyMerged{{{0, 8}}};
  ASSERT_EQ(fullyMerged, CheckSortedAndDisjointAndSimplify(allAdjacent));

  // Only the intervals that actually touch are merged.
  SetOfIntervals someAdjacent{{{0, 2}, {3, 5}, {5, 7}}};
  SetOfIntervals partlyMerged{{{0, 2}, {3, 7}}};
  ASSERT_EQ(partlyMerged, CheckSortedAndDisjointAndSimplify(someAdjacent));

  // Input that is not sorted is rejected.
  SetOfIntervals outOfOrder{{{3, 5}, {0, 2}}};
  ASSERT_THROW(CheckSortedAndDisjointAndSimplify(outOfOrder),
               ad_semsearch::Exception);
}
|
||
// Union of two sets of intervals.
TEST(SetOfIntervals, Union) {
  SetOfIntervals base{{{4, 6}, {0, 2}, {10, 380}}};
  SetOfIntervals noIntervals{};
  // The union with the empty set is the (sorted) input itself.
  ASSERT_EQ(Union{}(base, noIntervals), SortAndCheckDisjointAndNonempty(base));
  ASSERT_EQ(Union{}(noIntervals, base), SortAndCheckDisjointAndNonempty(base));

  // No interval of the second set overlaps an interval of `base`, but some of
  // them touch and are therefore merged in the result.
  SetOfIntervals disjointFromBase{{{2, 3}, {7, 10}, {400, 401}}};
  SetOfIntervals expectedUnion{{{0, 3}, {4, 6}, {7, 380}, {400, 401}}};
  ASSERT_EQ(Union{}(base, disjointFromBase), expectedUnion);
  ASSERT_EQ(Union{}(disjointFromBase, base), expectedUnion);

  // One interval of the second set completely encloses two intervals of the
  // first set.
  SetOfIntervals enclosedTwo{{{2, 3}, {4, 5}, {7, 9}}};
  SetOfIntervals enclosingTwo{{{0, 6}, {8, 9}}};
  SetOfIntervals unionOfTwo{{{0, 6}, {7, 9}}};
  ASSERT_EQ(Union{}(enclosedTwo, enclosingTwo), unionOfTwo);

  // A single interval completely encloses all three intervals of the other
  // set.
  SetOfIntervals enclosedThree{{{2, 3}, {4, 5}, {7, 8}}};
  SetOfIntervals enclosingThree{{{0, 9}}};
  ASSERT_EQ(Union{}(enclosedThree, enclosingThree), enclosingThree);

  // The intervals overlap only partially.
  SetOfIntervals partialA{{{2, 3}, {4, 6}, {7, 10}}};
  SetOfIntervals partialB{{{0, 5}, {8, 11}}};
  SetOfIntervals partialUnion{{{0, 6}, {7, 11}}};
  ASSERT_EQ(Union{}(partialA, partialB), partialUnion);
}
|
||
// Intersection of two sets of intervals.
TEST(SetOfIntervals, Intersection) {
  SetOfIntervals base{{{4, 6}, {0, 2}, {10, 380}}};
  SetOfIntervals noIntervals{};
  // The intersection with the empty set is always empty.
  ASSERT_EQ(Intersection{}(base, noIntervals), noIntervals);
  ASSERT_EQ(Intersection{}(noIntervals, base), noIntervals);

  // Sets without a common element have an empty intersection, even if some of
  // their intervals touch.
  SetOfIntervals disjointFromBase{{{2, 3}, {7, 10}, {400, 401}}};
  ASSERT_EQ(Intersection{}(base, disjointFromBase), noIntervals);
  ASSERT_EQ(Intersection{}(disjointFromBase, base), noIntervals);

  // One interval of the second set completely encloses two intervals of the
  // first set.
  SetOfIntervals enclosedTwo{{{2, 3}, {4, 5}, {7, 9}}};
  SetOfIntervals enclosingTwo{{{0, 6}, {8, 10}}};
  SetOfIntervals intersectionOfTwo{{{2, 3}, {4, 5}, {8, 9}}};
  ASSERT_EQ(Intersection{}(enclosedTwo, enclosingTwo), intersectionOfTwo);

  // A single interval completely encloses all three intervals of the other
  // set.
  SetOfIntervals enclosedThree{{{2, 3}, {4, 5}, {7, 8}}};
  SetOfIntervals enclosingThree{{{0, 9}}};
  ASSERT_EQ(Intersection{}(enclosedThree, enclosingThree), enclosedThree);

  // The intervals overlap only partially.
  SetOfIntervals partialA{{{2, 3}, {4, 6}, {7, 10}}};
  SetOfIntervals partialB{{{0, 5}, {8, 11}}};
  SetOfIntervals partialIntersection{{{2, 3}, {4, 5}, {8, 10}}};
  ASSERT_EQ(Intersection{}(partialA, partialB), partialIntersection);
}
|
||
// Expansion of a set of intervals into a vector of bools.
TEST(SetOfIntervals, toBitContainer) {
  SetOfIntervals intervals{{{2, 3}, {4, 6}, {7, 10}}};
  // All the indices that are contained in `intervals`.
  std::unordered_set<size_t> members{2, 4, 5, 7, 8, 9};
  auto bits = toBitVector(intervals, 200);
  ASSERT_EQ(200ul, bits.size());
  // Exactly the members of the set must be marked as `true`.
  for (size_t idx = 0; idx < bits.size(); ++idx) {
    ASSERT_EQ(members.contains(idx), bits[idx]);
  }
}