Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Pattern Trick Stats #188

Merged
merged 2 commits into from
Feb 5, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 0 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,6 @@ message(STATUS ---)
###############################################################################
##### Actual project configuration #####
###############################################################################

option(DISABLE_PATTERN_TRICK_STATISTICS "This disables the computation of statistics when using the pattern trick." OFF)
if (DISABLE_PATTERN_TRICK_STATISTICS)
add_definitions(-DDISABLE_PATTERN_TRICK_STATISTICS)
endif (DISABLE_PATTERN_TRICK_STATISTICS)

set(LOG_LEVEL_FATAL 0)
set(LOG_LEVEL_ERROR 1)
set(LOG_LEVEL_WARN 2)
Expand Down
111 changes: 51 additions & 60 deletions src/engine/CountAvailablePredicates.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,46 +237,35 @@ void CountAvailablePredicates::computePatternTrick(
<< subjectColumn << std::endl;
ad_utility::HashMap<Id, size_t> predicateCounts;
ad_utility::HashMap<size_t, size_t> patternCounts;
size_t posInput = 0;
#ifndef DISABLE_PATTERN_TRICK_STATISTICS
size_t inputIdx = 0;
// These variables are used to gather additional statistics
size_t num_entities_with_pattern = 0;
// the number of predicates counted with patterns
size_t predicates_from_patterns = 0;
size_t numEntitiesWithPatterns = 0;
// the number of distinct predicates in patterns
size_t numPatternPredicates = 0;
// the number of predicates counted without patterns
size_t predicates_from_lists = 0;
#endif
size_t numListPredicates = 0;
Id lastSubject = ID_NO_VALUE;
while (posInput < input->size()) {
while (inputIdx < input->size()) {
// Skip over elements with the same subject (don't count them twice)
if ((*input)[posInput][subjectColumn] == lastSubject) {
posInput++;
Id subject = (*input)[inputIdx][subjectColumn];
if (subject == lastSubject) {
inputIdx++;
continue;
}
size_t subject = (*input)[posInput][subjectColumn];
lastSubject = subject;
if (subject < hasPattern.size() && hasPattern[subject] != NO_PATTERN) {
// The subject matches a pattern
patternCounts[hasPattern[subject]]++;
#ifndef DISABLE_PATTERN_TRICK_STATISTICS
num_entities_with_pattern++;
#endif
numEntitiesWithPatterns++;
} else if (subject < hasPredicate.size()) {
// The subject does not match a pattern
size_t numPredicates;
Id* predicateData;
std::tie(predicateData, numPredicates) = hasPredicate[subject];
#ifndef DISABLE_PATTERN_TRICK_STATISTICS
predicates_from_lists += numPredicates;
#endif
numListPredicates += numPredicates;
if (numPredicates > 0) {
for (size_t i = 0; i < numPredicates; i++) {
auto it = predicateCounts.find(predicateData[i]);
if (it == predicateCounts.end()) {
predicateCounts[predicateData[i]] = 1;
} else {
it->second++;
}
predicateCounts[predicateData[i]]++;
}
} else {
LOG(TRACE) << "No pattern or has-relation entry found for entity "
Expand All @@ -288,16 +277,19 @@ void CountAvailablePredicates::computePatternTrick(
"(its id is to high)."
<< std::endl;
}
posInput++;
inputIdx++;
}
LOG(DEBUG) << "Using " << patternCounts.size()
<< " patterns for computing the result." << std::endl;
// the number of predicates counted with patterns
size_t numPredicatesSubsumedInPatterns = 0;
// resolve the patterns to predicate counts
for (const auto& it : patternCounts) {
std::pair<Id*, size_t> pattern = patterns[it.first];
predicates_from_patterns += it.second;
numPatternPredicates += pattern.second;
for (size_t i = 0; i < pattern.second; i++) {
predicateCounts[pattern.first[i]] += it.second;
numPredicatesSubsumedInPatterns += it.second;
}
}
// write the predicate counts to the result
Expand All @@ -306,50 +298,49 @@ void CountAvailablePredicates::computePatternTrick(
result->push_back(array<Id, 2>{it.first, static_cast<Id>(it.second)});
}

#ifndef DISABLE_PATTERN_TRICK_STATISTICS
// Print interesting statistics about the pattern trick
double ratio_has_pattern =
static_cast<double>(num_entities_with_pattern) / input->size();
size_t num_predicates_total =
predicates_from_lists + predicates_from_patterns;
double ratio_counted_with_pattern =
static_cast<double>(predicates_from_patterns) / num_predicates_total;
double ratioHasPatterns =
static_cast<double>(numEntitiesWithPatterns) / input->size();
size_t numPredicatesWithRepetitions =
numPredicatesSubsumedInPatterns + numListPredicates;
double ratioCountedWithPatterns =
static_cast<double>(numPredicatesSubsumedInPatterns) /
numPredicatesWithRepetitions;

size_t cost_with_patterns =
input->size() + predicates_from_lists + patternCounts.size();
size_t cost_without_patterns = input->size() + num_predicates_total;
double cost_ratio =
static_cast<double>(cost_with_patterns) / cost_without_patterns;
size_t costWithPatterns =
input->size() + numListPredicates + numPatternPredicates;
size_t costWithoutPatterns = input->size() + numPredicatesWithRepetitions;
double costRatio =
static_cast<double>(costWithPatterns) / costWithoutPatterns;
// Print the ratio of entities that used a pattern
LOG(DEBUG) << num_entities_with_pattern << " of " << input->size()
LOG(DEBUG) << numEntitiesWithPatterns << " of " << input->size()
<< " entities had a pattern. That equals "
<< (ratio_has_pattern * 100) << "%" << std::endl;
<< (ratioHasPatterns * 100) << " %" << std::endl;
// Print info about how many predicates were counted with patterns
LOG(DEBUG) << "Of the " << num_predicates_total << " predicates "
<< predicates_from_patterns
<< " were counted using patterns while " << predicates_from_lists
<< " were counted without patterns. That equals "
<< (ratio_counted_with_pattern * 100) << "%" << std::endl;
LOG(DEBUG) << "Of the " << numPredicatesWithRepetitions << "predicates "
<< numPredicatesSubsumedInPatterns
<< " were counted with patterns, " << numListPredicates
<< " were counted without.";
LOG(DEBUG) << "The ratio is " << (ratioCountedWithPatterns * 100) << "%"
<< std::endl;
// Print information about how efficient the pattern trick is
LOG(DEBUG) << "The conceptual cost of the operation with patterns was "
<< cost_with_patterns
<< " while without patterns it would have been "
<< cost_without_patterns << std::endl;
LOG(DEBUG) << "The conceptual cost with patterns was " << costWithPatterns
<< " vs " << costWithoutPatterns << " without patterns"
<< std::endl;
// Print the cost improvement that using the pattern trick gave us
LOG(DEBUG) << "This equals a ratio of cost with to cost without patterns of "
<< cost_ratio << std::endl;
LOG(DEBUG) << "This gives a ratio with to without of " << costRatio
<< std::endl;

// Add these values to the runtime info
runtimeInfo->addDetail("numEntities", std::to_string(input->size()));
runtimeInfo->addDetail("numPredicatesWithRepetitions",
std::to_string(numPredicatesWithRepetitions));
runtimeInfo->addDetail("percentEntitesWithPatterns",
std::to_string(ratio_has_pattern * 100) + "%");
runtimeInfo->addDetail(
"percentPredicatesThroughPatterns",
std::to_string(ratio_counted_with_pattern * 100) + "%");
std::to_string(ratioHasPatterns * 100) + "%");
runtimeInfo->addDetail("percentPredicatesFromPatterns",
std::to_string(ratioCountedWithPatterns * 100) + "%");
runtimeInfo->addDetail("costWithoutPatterns",
std::to_string(cost_without_patterns));
runtimeInfo->addDetail("costWithPatterns",
std::to_string(cost_with_patterns));
runtimeInfo->addDetail("costImprovement",
std::to_string(cost_ratio * 100) + "%");
#endif
std::to_string(costWithoutPatterns));
runtimeInfo->addDetail("costWithPatterns", std::to_string(costWithPatterns));
runtimeInfo->addDetail("costRatio", std::to_string(costRatio * 100) + "%");
}