-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Parallel parsing of clean turtle Files. (#466)
* Implementation of a parallel Turtle parser that assumes the following additional constraints on the .ttl format: ** All `PREFIX` and `BASE` declarations appear at the beginning of the .ttl file, before any triples. ** Whenever the regex `. *\n` matches, we have reached the end of a triple (in the standard .ttl format this could also match in the middle of a multiline literal). * This parallel parser is active in the relaxed parsing mode (`ascii-prefixes-only`).
- Loading branch information
Showing
18 changed files
with
424 additions
and
59 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
// Copyright 2021, University of Freiburg, Chair of Algorithms and Data | ||
// Structures. Author: Johannes Kalmbach <kalmbacj@cs.uni-freiburg.de> | ||
|
||
#include "./ParallelBuffer.h" | ||
|
||
// _________________________________________________________________________ | ||
void ParallelFileBuffer::open(const string& filename) { | ||
_file.open(filename, "r"); | ||
_eof = false; | ||
_buf.resize(_blocksize); | ||
auto task = [&file = this->_file, bs = this->_blocksize, | ||
&buf = this->_buf]() { return file.read(buf.data(), bs); }; | ||
_fut = std::async(task); | ||
} | ||
|
||
// ___________________________________________________________________________ | ||
std::optional<std::vector<char>> ParallelFileBuffer::getNextBlock() { | ||
if (_eof) { | ||
return std::nullopt; | ||
} | ||
|
||
AD_CHECK(_file.isOpen() && _fut.valid()); | ||
auto numBytesRead = _fut.get(); | ||
if (numBytesRead == 0) { | ||
_eof = true; | ||
return std::nullopt; | ||
} | ||
_buf.resize(numBytesRead); | ||
std::optional<std::vector<char>> ret = std::move(_buf); | ||
|
||
_buf.resize(_blocksize); | ||
auto getNextBlock = [&file = this->_file, bs = this->_blocksize, | ||
&buf = this->_buf]() { | ||
return file.read(buf.data(), bs); | ||
}; | ||
_fut = std::async(getNextBlock); | ||
|
||
return ret; | ||
} | ||
|
||
// ____________________________________________________________________________ | ||
std::optional<size_t> ParallelBufferWithEndRegex::findRegexNearEnd( | ||
const std::vector<char>& vec, const re2::RE2& regex) { | ||
size_t chunkSize = 1000; | ||
size_t inputSize = vec.size(); | ||
re2::StringPiece regexResult; | ||
bool match = false; | ||
while (true) { | ||
if (chunkSize >= inputSize) { | ||
break; | ||
} | ||
|
||
auto startIdx = inputSize - chunkSize; | ||
auto regexInput = re2::StringPiece{vec.data() + startIdx, chunkSize}; | ||
|
||
match = RE2::PartialMatch(regexInput, regex, ®exResult); | ||
if (match) { | ||
break; | ||
} | ||
|
||
if (chunkSize == inputSize - 1) { | ||
break; | ||
} | ||
chunkSize = std::min(chunkSize * 2, inputSize - 1); | ||
} | ||
if (!match) { | ||
return std::nullopt; | ||
} | ||
|
||
// regexResult.data() is a pointer to the beginning of the match, vec.data() | ||
// is a pointer to the beginning of the total input. | ||
return regexResult.data() + regexResult.size() - vec.data(); | ||
} | ||
|
||
// _____________________________________________________________________________ | ||
std::optional<std::vector<char>> ParallelBufferWithEndRegex::getNextBlock() { | ||
auto rawInput = _rawBuffer.getNextBlock(); | ||
if (!rawInput || _exhausted) { | ||
_exhausted = true; | ||
if (_remainder.empty()) { | ||
return std::nullopt; | ||
} | ||
auto copy = std::move(_remainder); | ||
// The C++ standard does not require that _remainder is empty after the | ||
// move, but we need it to be empty to make the logic above work. | ||
_remainder.clear(); | ||
return copy; | ||
} | ||
|
||
auto endPosition = findRegexNearEnd(rawInput.value(), _endRegex); | ||
if (!endPosition) { | ||
if (_rawBuffer.getNextBlock()) { | ||
throw std::runtime_error( | ||
"The regex which marks the end of a statement was not found at " | ||
"all within a single batch that was not the last one. Please " | ||
"increase the FILE_BUFFER_SIZE " | ||
"or choose a different parser"); | ||
} | ||
// This was the last (possibly incomplete) block, simply concatenate | ||
endPosition = rawInput->size(); | ||
_exhausted = true; | ||
} | ||
std::vector<char> result; | ||
result.reserve(_remainder.size() + *endPosition); | ||
result.insert(result.end(), _remainder.begin(), _remainder.end()); | ||
result.insert(result.end(), rawInput->begin(), | ||
rawInput->begin() + *endPosition); | ||
_remainder.clear(); | ||
_remainder.insert(_remainder.end(), rawInput->begin() + *endPosition, | ||
rawInput->end()); | ||
return result; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.