-
Notifications
You must be signed in to change notification settings - Fork 37
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Faster Index Build- Phase 1 #227
Changes from all commits
3905d6a
689bb7a
ae83a85
fad4425
06af9ab
19f06a9
6efd6b9
2d45553
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
// Copyright 2019, University of Freiburg, | ||
// Chair of Algorithms and Data Structures. | ||
// Author: Johannes Kalmbach(joka921) <johannes.kalmbach@gmail.com> | ||
// | ||
// Only performs the "mergeVocabulary" step of the IndexBuilder pipeline | ||
// Can be used e.g. for benchmarking this step to develop faster IndexBuilders. | ||
|
||
#include "index/Vocabulary.h" | ||
#include "index/VocabularyGenerator.h" | ||
|
||
// ____________________________________________________________________________________________________ | ||
// Entry point of the stand-alone "mergeVocabulary" tool.
//
// Command line:
//   argv[1]  basename of the index whose partial vocabularies are merged
//   argv[2]  number of partial vocabulary files to merge (must be > 0)
//
// Returns 0 after VocabularyMerger::mergeVocabulary has been called and 1
// when the command line is malformed.
//
// NOTE(review): an inline review remark attached to this signature (truncated
// in this capture) mentioned an "unused" warning -- confirm whether it still
// applies now that both argc and argv are read.
int main(int argc, char** argv) {
  if (argc != 3) {
    // argv[0] is not guaranteed to be set when argc == 0, so fall back to a
    // placeholder instead of streaming a null pointer.
    const char* programName =
        (argc > 0 && argv[0] != nullptr) ? argv[0] : "<program>";
    std::cerr << "Usage: " << programName
              << " <basename of index> <number of partial vocabulary files to "
                 "merge>\n";
    // Without this early exit the code below would read argv[1] / argv[2]
    // although they were not supplied.
    return 1;
  }

  const std::string basename = argv[1];

  // atoi yields 0 for non-numeric input and may yield a negative value; both
  // would otherwise be converted silently into a bogus size_t file count.
  const int parsedNumFiles = atoi(argv[2]);
  if (parsedNumFiles <= 0) {
    std::cerr << "The number of partial vocabulary files must be a positive "
                 "integer, but was \""
              << argv[2] << "\"\n";
    return 1;
  }
  const size_t numFiles = static_cast<size_t>(parsedNumFiles);

  VocabularyMerger m;
  m.mergeVocabulary(basename, numFiles, StringSortComparator());
  return 0;
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,7 +55,7 @@ void Distinct::computeResult(ResultTable* result) { | |
subRes->_resultTypes.begin(), | ||
subRes->_resultTypes.end()); | ||
result->_localVocab = subRes->_localVocab; | ||
int width = subRes->_data.size(); | ||
int width = subRes->_data.cols(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Uhoh, good find!
niklas88 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
CALL_FIXED_SIZE_1(width, getEngine().distinct, subRes->_data, _keepIndices, | ||
&result->_data); | ||
LOG(DEBUG) << "Distinct result computation done." << endl; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) | ||
|
||
#include "Filter.h" | ||
#include <algorithm> | ||
#include <optional> | ||
#include <regex> | ||
#include <sstream> | ||
|
@@ -415,10 +416,25 @@ void Filter::computeFilterFixedValue( | |
// remove the leading '^' symbol | ||
std::string rhs = _rhs.substr(1); | ||
std::string upperBoundStr = rhs; | ||
upperBoundStr[upperBoundStr.size() - 1]++; | ||
if (getIndex().getVocab().isCaseInsensitiveOrdering()) { | ||
upperBoundStr = ad_utility::getUppercaseUtf8(upperBoundStr); | ||
upperBoundStr[upperBoundStr.size() - 1]++; | ||
upperBoundStr = | ||
StringSortComparator::rdfLiteralToValueForLT(upperBoundStr); | ||
// less than and greater equal require the same value | ||
rhs = StringSortComparator::rdfLiteralToValueForLT(rhs); | ||
|
||
LOG(INFO) << "upperBound was converted to " << upperBoundStr << '\n'; | ||
LOG(INFO) << "lowerBound was converted to " << rhs << '\n'; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. More comments + lower log level |
||
} else { | ||
upperBoundStr[upperBoundStr.size() - 1]++; | ||
} | ||
|
||
size_t upperBound = | ||
getIndex().getVocab().getValueIdForLT(upperBoundStr); | ||
size_t lowerBound = getIndex().getVocab().getValueIdForGE(rhs); | ||
LOG(DEBUG) << "upper and lower bound are " << upperBound << ' ' | ||
<< lowerBound << std::endl; | ||
if (lhs_is_sorted) { | ||
// The input data is sorted, use binary search to locate the first | ||
// and last element that match rhs and copy the range. | ||
|
@@ -504,6 +520,28 @@ void Filter::computeResultFixedValue( | |
rhs_string = ad_utility::convertValueLiteralToIndexWord(rhs_string); | ||
} else if (ad_utility::isNumeric(_rhs)) { | ||
rhs_string = ad_utility::convertNumericToIndexWord(rhs_string); | ||
} else { | ||
if (getIndex().getVocab().isCaseInsensitiveOrdering()) { | ||
// We have to move to the correct end of the | ||
// "same letters but different case" - range | ||
// to make the filters work | ||
switch (_type) { | ||
case SparqlFilter::GE: | ||
case SparqlFilter::LT: { | ||
rhs_string = | ||
StringSortComparator::rdfLiteralToValueForLT(rhs_string); | ||
} | ||
|
||
break; | ||
case SparqlFilter::GT: | ||
case SparqlFilter::LE: { | ||
rhs_string = | ||
StringSortComparator::rdfLiteralToValueForGT(rhs_string); | ||
} break; | ||
default: | ||
break; | ||
} | ||
} | ||
} | ||
if (_type == SparqlFilter::EQ || _type == SparqlFilter::NE) { | ||
if (!getIndex().getVocab().getId(_rhs, &rhs)) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you add a small comment explaining what this is used for? I assume it's for manually merging vocabularies if there was an error? Is this generally useful?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I will add the comment. The reason is more "benchmarking the vocabulary merging without having to wait 9 hours for the TurtleParser".