Skip to content

Commit

Permalink
SetOfIntervals as a base for efficient SparqlRange expressions
Browse files Browse the repository at this point in the history
  • Loading branch information
joka921 committed Sep 25, 2021
1 parent eb48b9a commit fca6bad
Show file tree
Hide file tree
Showing 5 changed files with 371 additions and 1 deletion.
3 changes: 2 additions & 1 deletion src/parser/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ add_library(parser
ContextFileParser.cpp ContextFileParser.h
ParallelParseBuffer.h
PropertyPathParser.h PropertyPathParser.cpp
SparqlLexer.h SparqlLexer.cpp TokenizerCtre.h TurtleTokenId.h ParallelBuffer.cpp)
SparqlLexer.h SparqlLexer.cpp TokenizerCtre.h TurtleTokenId.h ParallelBuffer.cpp
SetOfIntervals.h SetOfIntervals.cpp)
target_link_libraries(parser rdfEscaping re2 absl::flat_hash_map)

add_subdirectory(sparqlParser)
155 changes: 155 additions & 0 deletions src/parser/SetOfIntervals.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
// Copyright 2021, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Johannes Kalmbach<joka921> (johannes.kalmbach@gmail.com)

#include "SetOfIntervals.h"

namespace ad_utility {
// ___________________________________________________________________________
// Sort the intervals of `input` in ascending order of their left end and
// verify that they form a valid `SetOfIntervals`: every interval
// [first, second) must be nonempty, and no two intervals may overlap
// (directly adjacent intervals like [0, 2) and [2, 5) are allowed).
// Returns the sorted set. If one of the conditions is violated, the
// corresponding `AD_CHECK` fails.
SetOfIntervals SortAndCheckDisjointAndNonempty(SetOfIntervals input) {
  auto& intervals = input._intervals;
  std::sort(intervals.begin(), intervals.end(),
            [](const auto& a, const auto& b) { return a.first < b.first; });

  // Each interval has to contain at least one element.
  for (const auto& interval : intervals) {
    AD_CHECK(interval.second > interval.first);
  }

  // Because the intervals are now sorted by their left end, it suffices to
  // compare each interval with its direct predecessor to detect overlaps.
  for (size_t i = 1; i < intervals.size(); ++i) {
    AD_CHECK(intervals[i].first >= intervals[i - 1].second);
  }

  return input;
}

// ___________________________________________________________________________
// Compute the intersection of `A` and `B`. Both inputs are first sorted and
// validated; the result is sorted, disjoint and has adjacent intervals merged.
SetOfIntervals Intersection::operator()(SetOfIntervals A,
                                        SetOfIntervals B) const {
  // Normalize both operands (sorted by left end, disjoint, nonempty).
  A = SortAndCheckDisjointAndNonempty(std::move(A));
  B = SortAndCheckDisjointAndNonempty(std::move(B));

  SetOfIntervals result;
  auto itA = A._intervals.begin();
  auto itB = B._intervals.begin();
  const auto endA = A._intervals.end();
  const auto endB = B._intervals.end();

  // Everything strictly below `frontier` has been decided already: it is
  // either part of the result or can never become part of it. Clipping new
  // intervals at `frontier` keeps the result intervals disjoint.
  size_t frontier = 0;

  // "Zipper" (merge-style) traversal of the two sorted interval lists.
  // Invariant: all intervals before `itA` and `itB` are completely handled.
  while (itA != endA && itB != endB) {
    const bool aStartsFirst = itA->first < itB->first;
    auto& earlier = aStartsFirst ? itA : itB;
    auto& later = aStartsFirst ? itB : itA;

    // The candidate overlap starts where the later interval starts and ends
    // where the first of the two intervals ends. It is additionally clipped
    // to start no earlier than `frontier`. If the two intervals do not
    // overlap, the candidate is empty (begin >= end) and is not emitted.
    const size_t overlapEnd = std::min(later->second, earlier->second);
    const size_t overlapBegin = std::max(later->first, frontier);
    frontier = std::max(frontier, overlapEnd);

    if (overlapBegin < overlapEnd) {
      result._intervals.emplace_back(overlapBegin, overlapEnd);
    }

    // `frontier` is now >= the smaller of the two right ends, so at least one
    // of the iterators is advanced, which guarantees progress.
    if (frontier >= earlier->second) {
      ++earlier;
    }
    if (frontier >= later->second) {
      ++later;
    }
  }

  return CheckSortedAndDisjointAndSimplify(
      SortAndCheckDisjointAndNonempty(std::move(result)));
}

// __________________________________________________________________________
// Compute the union of `A` and `B`. Both inputs are first sorted and
// validated; the result is sorted, disjoint and has adjacent intervals merged.
SetOfIntervals Union::operator()(SetOfIntervals A, SetOfIntervals B) const {
  // Normalize both operands (sorted by left end, disjoint, nonempty).
  A = SortAndCheckDisjointAndNonempty(std::move(A));
  B = SortAndCheckDisjointAndNonempty(std::move(B));

  SetOfIntervals result;

  // Everything strictly below `frontier` has been decided already: it is
  // either part of the result or can never become part of it. Clipping new
  // intervals at `frontier` keeps the result intervals disjoint.
  size_t frontier = 0;

  // Clip `interval` so that it starts no earlier than `frontier`, move
  // `frontier` to the interval's right end (if that is further right), and
  // append the clipped interval unless the clipping made it empty.
  auto emitClipped = [&frontier, &result](std::pair<size_t, size_t> interval) {
    const size_t clippedBegin = std::max(frontier, interval.first);
    frontier = std::max(frontier, interval.second);
    if (clippedBegin < interval.second) {
      result._intervals.emplace_back(clippedBegin, interval.second);
    }
  };

  auto itA = A._intervals.begin();
  auto itB = B._intervals.begin();
  const auto endA = A._intervals.end();
  const auto endB = B._intervals.end();

  // "Zipper" (merge-style) traversal of the two sorted interval lists.
  while (itA != endA && itB != endB) {
    const bool aStartsFirst = itA->first < itB->first;
    auto& earlier = aStartsFirst ? itA : itB;
    auto& later = aStartsFirst ? itB : itA;

    if (earlier->second > later->first) {
      // The two intervals overlap: emit their combined extent and consume
      // both of them.
      emitClipped(
          {earlier->first, std::max(later->second, earlier->second)});
      ++earlier;
      ++later;
    } else {
      // No overlap: only the interval that starts first can be emitted now
      // (it is dropped by the clipping if it lies completely below
      // `frontier`).
      emitClipped(*earlier);
      ++earlier;
    }
  }

  // At most one of the inputs still has unprocessed intervals; append them.
  for (; itA != endA; ++itA) {
    emitClipped(*itA);
  }
  for (; itB != endB; ++itB) {
    emitClipped(*itB);
  }

  return CheckSortedAndDisjointAndSimplify(
      SortAndCheckDisjointAndNonempty(std::move(result)));
}

// ___________________________________________________________________________
// Verify that the intervals of `inputSet` are sorted and do not overlap, and
// return a copy in which directly adjacent intervals (the right end of one
// equals the left end of the next) are merged into a single interval.
SetOfIntervals CheckSortedAndDisjointAndSimplify(
    const SetOfIntervals& inputSet) {
  SetOfIntervals merged;
  auto& out = merged._intervals;
  for (const auto& interval : inputSet._intervals) {
    // The very first interval has no predecessor to compare against.
    if (out.empty()) {
      out.push_back(interval);
      continue;
    }
    // `out.back()` is the (possibly already merged) predecessor of `interval`.
    auto& last = out.back();
    AD_CHECK(interval.first >= last.second);
    if (interval.first == last.second) {
      // Adjacent: extend the predecessor instead of starting a new interval.
      last.second = interval.second;
    } else {
      out.push_back(interval);
    }
  }
  return merged;
}

} // namespace ad_utility
77 changes: 77 additions & 0 deletions src/parser/SetOfIntervals.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// Copyright 2021, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Johannes Kalmbach<joka921> (johannes.kalmbach@gmail.com)

#pragma once

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

#include "../util/Exception.h"
#include "../util/TypeTraits.h"

namespace ad_utility {

/// A vector of pairs of <size_t, size_t> with the following semantics: It
/// represents the union of the half-open intervals [first, second) of the
/// individual pairs. The intervals have to be pairwise disjoint and nonempty.
/// They also have to be sorted in ascending order. The struct itself does not
/// enforce these invariants.
struct SetOfIntervals {
// The type of the underlying container: one (begin, end) pair per interval.
using Vec = std::vector<std::pair<size_t, size_t>>;
// The intervals themselves; publicly accessible.
Vec _intervals;
// _________________________________________________________________________
// Memberwise comparison of the interval vectors. Note that two sets which
// contain the same elements but split them into intervals differently (e.g.
// {[0, 2), [2, 5)} and {[0, 5)}) compare unequal.
bool operator==(const SetOfIntervals&) const = default;
};

/// Sort the intervals in ascending order of their left end and assert that
/// they are indeed disjoint and nonempty. Directly adjacent intervals (e.g.
/// [0, 2) and [2, 5)) count as disjoint. Returns the sorted set.
SetOfIntervals SortAndCheckDisjointAndNonempty(SetOfIntervals input);

/// Assert that the set is sorted and disjoint, and simplify it by merging
/// directly adjacent intervals (e.g. [0, 2) and [2, 5) become [0, 5)). The
/// input is not sorted by this function. Returns the simplified set.
SetOfIntervals CheckSortedAndDisjointAndSimplify(const SetOfIntervals& input);

/// Function object that computes the intersection of two sets of intervals.
/// The inputs are taken by value and do not have to be sorted.
struct Intersection {
SetOfIntervals operator()(SetOfIntervals A, SetOfIntervals B) const;
};

/// Function object that computes the union of two sets of intervals.
/// The inputs are taken by value and do not have to be sorted.
struct Union {
SetOfIntervals operator()(SetOfIntervals A, SetOfIntervals B) const;
};

// Write exactly `targetSize` many bools to the output iterator `it`. The i-th
// bool is true if and only if `i` is contained in the set of intervals.
// This includes the `false` values after the rightmost interval, so the
// destination must have room for `targetSize` elements (or grow on write,
// like a `std::back_inserter`).
//
// The intervals of `s` have to be sorted and disjoint, and `targetSize` has to
// be >= the right end (not included) of the rightmost interval. Violations
// are detected via `AD_CHECK`.
template <typename OutputIterator>
void toBitContainer(const SetOfIntervals& s, size_t targetSize,
                    OutputIterator it) {
  size_t previousEnd = 0;
  for (const auto& [begin, end] : s._intervals) {
    // Guard the unsigned subtractions below against wrapping around on
    // unsorted, overlapping or inverted intervals.
    AD_CHECK(previousEnd <= begin && begin <= end);
    AD_CHECK(end <= targetSize);

    // The gap between the previous interval and this one is not in the set.
    it = std::fill_n(it, begin - previousEnd, false);
    // The interval itself is in the set.
    it = std::fill_n(it, end - begin, true);

    previousEnd = end;
  }
  // Everything after the rightmost interval is not in the set. The checks
  // above guarantee `previousEnd <= targetSize`.
  std::fill_n(it, targetSize - previousEnd, false);
}

// Expand a SetOfIntervals into a std::vector<bool> with exactly `targetSize`
// entries, where entry i is true if and only if i is contained in the set.
// `targetSize` has to be >= the right end (not included) of the rightmost
// interval.
// __________________________________________________________________________
inline std::vector<bool> toBitVector(const SetOfIntervals& a,
                                     size_t targetSize) {
  // Start with "nothing is contained" and let `toBitContainer` write the bits.
  std::vector<bool> bits(targetSize, false);
  toBitContainer(a, targetSize, bits.begin());
  return bits;
}

} // namespace ad_utility
4 changes: 4 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,7 @@ add_test(TaskQueueTest TaskQueueTest)
add_executable(TypeTraitsTest TypeTraitsTest.cpp)
add_test(TypeTraitsTest TypeTraitsTest)
target_link_libraries(TypeTraitsTest gtest_main ${CMAKE_THREAD_LIBS_INIT})

# Unit tests for src/parser/SetOfIntervals.{h,cpp}. The test links against the
# `parser` library because SetOfIntervals.cpp is compiled as part of it.
add_executable(SetOfIntervalsTest SetOfIntervalsTest.cpp)
target_link_libraries(SetOfIntervalsTest parser gtest_main ${CMAKE_THREAD_LIBS_INIT})
add_test(SetOfIntervalsTest SetOfIntervalsTest)
133 changes: 133 additions & 0 deletions test/SetOfIntervalsTest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// Copyright 2021, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Johannes Kalmbach <johannes.kalmbach@gmail.com>

#include <gtest/gtest.h>

#include <unordered_set>

#include "../src/parser/SetOfIntervals.h"
#include "../src/util/Random.h"

using namespace ad_utility;

// `SortAndCheckDisjointAndNonempty` must sort valid input by the left end of
// the intervals and throw for sets with empty or overlapping intervals.
TEST(SetOfIntervals, SortAndCheckDisjointAndNonempty) {
// Sorted and valid set. Adjacent intervals ({0, 2} and {2, 5}) are allowed
// and are not merged by this function.
SetOfIntervals s{{{0, 2}, {2, 5}, {6, 12}}};
auto t = SortAndCheckDisjointAndNonempty(s);
ASSERT_EQ(s, t);
// Unsorted and valid set: the result must be the sorted set `s`.
SetOfIntervals u{{{6, 12}, {0, 2}, {2, 5}}};
ASSERT_EQ(s, SortAndCheckDisjointAndNonempty(u));

// The empty set is valid.
SetOfIntervals empty{};
ASSERT_EQ(empty, SortAndCheckDisjointAndNonempty(empty));

// Invalid set with empty interval ({2, 2} contains no element).
SetOfIntervals emptyInterval{{{4, 5}, {2, 2}}};
ASSERT_THROW(SortAndCheckDisjointAndNonempty(emptyInterval),
ad_semsearch::Exception);

// Invalid set with overlapping intervals ({2, 5} and {4, 6} share 4).
SetOfIntervals overlapping{{{4, 6}, {2, 5}}};
ASSERT_THROW(SortAndCheckDisjointAndNonempty(overlapping),
ad_semsearch::Exception);
}

// `CheckSortedAndDisjointAndSimplify` must merge directly adjacent intervals,
// leave intervals with a gap between them untouched, and throw for unsorted
// input.
TEST(SetOfIntervals, CheckSortedAndDisjointAndSimplify) {
// Intervals separated by gaps: nothing to merge.
SetOfIntervals nonOverlapping{{{0, 2}, {3, 5}, {6, 8}}};
ASSERT_EQ(nonOverlapping, CheckSortedAndDisjointAndSimplify(nonOverlapping));
// Despite its name, this set is disjoint; its intervals are directly
// adjacent and therefore collapse into the single interval {0, 8}.
SetOfIntervals overlapping{{{0, 2}, {2, 5}, {5, 8}}};
SetOfIntervals expected{{{0, 8}}};
ASSERT_EQ(expected, CheckSortedAndDisjointAndSimplify(overlapping));

{
// Only the last two intervals are adjacent and get merged.
SetOfIntervals partiallyOverlapping{{{0, 2}, {3, 5}, {5, 7}}};
SetOfIntervals expected2{{{0, 2}, {3, 7}}};
ASSERT_EQ(expected2,
CheckSortedAndDisjointAndSimplify(partiallyOverlapping));
}

// Unsorted input is rejected (this function does not sort).
SetOfIntervals unsorted{{{3, 5}, {0, 2}}};
ASSERT_THROW(CheckSortedAndDisjointAndSimplify(unsorted),
ad_semsearch::Exception);
}

// Tests for `Union`: neutral element, adjacent intervals, enclosed intervals
// and partially overlapping intervals.
TEST(SetOfIntervals, Union) {
// Deliberately unsorted; `Union` sorts its inputs itself.
SetOfIntervals s{{{4, 6}, {0, 2}, {10, 380}}};
SetOfIntervals empty{};
// Union with empty set leaves input unchanged (apart from the sorting).
ASSERT_EQ(Union{}(s, empty), SortAndCheckDisjointAndNonempty(s));
ASSERT_EQ(Union{}(empty, s), SortAndCheckDisjointAndNonempty(s));

// No interval of this set overlaps with `s`, but some are directly adjacent
// to intervals of `s` and are merged with them in the result.
SetOfIntervals nonOverlapping{{{2, 3}, {7, 10}, {400, 401}}};
SetOfIntervals expected{{{0, 3}, {4, 6}, {7, 380}, {400, 401}}};
ASSERT_EQ(Union{}(s, nonOverlapping), expected);
ASSERT_EQ(Union{}(nonOverlapping, s), expected);

{
// Complete enclosing of two intervals.
SetOfIntervals a{{{2, 3}, {4, 5}, {7, 9}}};
SetOfIntervals b{{{0, 6}, {8, 9}}};
SetOfIntervals c{{{0, 6}, {7, 9}}};
ASSERT_EQ(Union{}(a, b), c);
}
{
// Complete enclosing of three
SetOfIntervals a{{{2, 3}, {4, 5}, {7, 8}}};
SetOfIntervals b{{{0, 9}}};
ASSERT_EQ(Union{}(a, b), b);
}

{
// Partial overlap
SetOfIntervals a{{{2, 3}, {4, 6}, {7, 10}}};
SetOfIntervals b{{{0, 5}, {8, 11}}};
SetOfIntervals c{{{0, 6}, {7, 11}}};
ASSERT_EQ(Union{}(a, b), c);
}
}

// Tests for `Intersection`: empty operand, disjoint operands, enclosed
// intervals and partially overlapping intervals.
TEST(SetOfIntervals, Intersection) {
// Deliberately unsorted; `Intersection` sorts its inputs itself.
SetOfIntervals s{{{4, 6}, {0, 2}, {10, 380}}};
SetOfIntervals empty{};
// Intersection with the empty set is always empty.
ASSERT_EQ(Intersection{}(s, empty), empty);
ASSERT_EQ(Intersection{}(empty, s), empty);

// Sets that only touch at interval boundaries share no element.
SetOfIntervals noOverlap{{{2, 3}, {7, 10}, {400, 401}}};
ASSERT_EQ(Intersection{}(s, noOverlap), empty);
ASSERT_EQ(Intersection{}(noOverlap, s), empty);
{
// Complete enclosing of two
SetOfIntervals a{{{2, 3}, {4, 5}, {7, 9}}};
SetOfIntervals b{{{0, 6}, {8, 10}}};
SetOfIntervals c{{{2, 3}, {4, 5}, {8, 9}}};
ASSERT_EQ(Intersection{}(a, b), c);
}
{
// Complete enclosing of three
SetOfIntervals a{{{2, 3}, {4, 5}, {7, 8}}};
SetOfIntervals b{{{0, 9}}};
ASSERT_EQ(Intersection{}(a, b), a);
}

{
// Partial overlap
SetOfIntervals a{{{2, 3}, {4, 6}, {7, 10}}};
SetOfIntervals b{{{0, 5}, {8, 11}}};
SetOfIntervals c{{{2, 3}, {4, 5}, {8, 10}}};
ASSERT_EQ(Intersection{}(a, b), c);
}
}

// `toBitVector` (and thereby `toBitContainer`) must set exactly the bits whose
// indices are contained in the set, and return a vector of the requested size.
TEST(SetOfIntervals, toBitContainer) {
SetOfIntervals a{{{2, 3}, {4, 6}, {7, 10}}};
// The individual elements of `a`, listed explicitly.
std::unordered_set<size_t> elements{2, 4, 5, 7, 8, 9};
// The target size is much larger than the rightmost interval end (10), so
// the tail of the vector must consist of `false` entries only.
auto expanded = toBitVector(a, 200);
ASSERT_EQ(200ul, expanded.size());
for (size_t i = 0; i < expanded.size(); ++i) {
ASSERT_EQ(elements.contains(i), expanded[i]);
}
}

0 comments on commit fca6bad

Please sign in to comment.