Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Patterns exclude lang predicates #186

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
139 changes: 63 additions & 76 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ std::unique_ptr<Index::StxxlVec> Index::createIdTriplesAndVocab(
// first save the total number of words, this is needed to initialize the
// dense IndexMetaData variants
_totalVocabularySize = linesAndWords.nofWords;
// Save the lower and upper bound of language tagged predicates
// TODO(schnelle): These should either also be available when reading the
// Index from disk or replaced with local variables only available when
// building the index.
_langPredLowerBound = linesAndWords.langPredLowerBound;
_langPredUpperBound = linesAndWords.langPredUpperBound;
LOG(INFO) << "total size of vocabulary (internal and external) is "
<< _totalVocabularySize << std::endl;

Expand All @@ -51,17 +57,19 @@ std::unique_ptr<Index::StxxlVec> Index::createIdTriplesAndVocab(
// clear vocabulary to save ram (only information from partial binary files
// used from now on). This will preserve information about externalized
// Prefixes etc.
// TODO(schnelle): Since we don't use the Vocabulary anywhere until now
// this seems pointless
_vocab.clear();
convertPartialToGlobalIds<Parser>(*(linesAndWords.idTriples),
linesAndWords.actualPartialSizes,
NUM_TRIPLES_PER_PARTIAL_VOCAB);
convertPartialToGlobalIds(*linesAndWords.idTriples,
linesAndWords.actualPartialSizes,
NUM_TRIPLES_PER_PARTIAL_VOCAB);

if (!_keepTempFiles) {
// remove temporary files only used during index creation
LOG(INFO) << "Removing temporary files (partial vocabulary and external "
"text file...\n";

// TODO: using system and rm is not really elegant nor portable.
// TODO(all): using system and rm is not really elegant nor portable.
// use std::filesystem as soon as QLever is ported to C++17
string removeCommand1 =
"rm -- " + _onDiskBase + EXTERNAL_LITS_TEXT_FILE_NAME;
Expand Down Expand Up @@ -92,7 +100,6 @@ void Index::createFromFile(const string& filename, bool allPermutations) {
StxxlVec& idTriples = *idTriplesPtr;

// also perform unique for first permutation

createPermutationPair<IndexMetaDataHmap>(&idTriples, Permutation::Pso,
Permutation::Pos, true);
if (allPermutations) {
Expand All @@ -102,7 +109,7 @@ void Index::createFromFile(const string& filename, bool allPermutations) {
createPermutationPair<IndexMetaDataMmap>(&idTriples, Permutation::Osp,
Permutation::Ops);
} else if (_usePatterns) {
// vector is not yet sorted
// Not constructed with Spo, Sop, needs extra sort
createPatterns(false, &idTriples);
}
// move compression to end
Expand Down Expand Up @@ -132,7 +139,6 @@ void Index::createFromFile(const string& filename, bool allPermutations) {
<< ". Terminating...\n";
AD_CHECK(false);
}
// also perform unique for first permutation
writeConfiguration();
}

Expand Down Expand Up @@ -244,21 +250,19 @@ VocabularyData Index::passFileForVocabulary(const string& filename,

LOG(INFO) << "Merging vocabulary\n";
VocabularyData res;
res.nofWords = mergeVocabulary(_onDiskBase, numFiles);
res.nofWords = mergeVocabulary(_onDiskBase, numFiles, &res.langPredLowerBound,
&res.langPredUpperBound);
res.idTriples = std::move(idTriples);
res.actualPartialSizes = std::move(actualPartialSizes);
LOG(INFO) << "Pass done.\n";
res.idTriples->size();
return res;
}

// _____________________________________________________________________________
template <class Parser>
void Index::convertPartialToGlobalIds(
StxxlVec& data, const vector<size_t>& actualLinesPerPartial,
size_t linesPerPartial) {
LOG(INFO) << "Updating Ids in stxxl vector to global Ids.\n";
array<string, 3> spo;

size_t i = 0;
// iterate over all partial vocabularies
Expand Down Expand Up @@ -318,7 +322,7 @@ std::optional<MetaData> Index::createPermutationImpl(const string& fileName,
LOG(WARN) << "Attempt to write an empty index!" << std::endl;
return std::nullopt;
}
ad_utility::File out(fileName.c_str(), "w");
ad_utility::File out(fileName, "w");
LOG(INFO) << "Creating an on-disk index permutation of " << vec.size()
<< " elements / facts." << std::endl;
// Iterate over the vector and identify relation boundaries
Expand Down Expand Up @@ -478,52 +482,46 @@ void Index::createPatternsImpl(const string& fileName, const StxxlVec& vec,
Pattern pattern;

size_t patternIndex = 0;
Id currentRel;
currentRel = vec[0][0];
bool isValidPattern = true;
size_t numInvalidPatterns = 0;
Id currentSubj;
currentSubj = vec[0][0];
size_t numValidPatterns = 0;

for (StxxlVec::bufreader_type reader(vec); !reader.empty(); ++reader) {
if ((*reader)[0] != currentRel) {
currentRel = (*reader)[0];
if (isValidPattern) {
numValidPatterns++;
auto it = patternCounts.find(pattern);
if (it == patternCounts.end()) {
patternCounts.insert(std::pair<Pattern, size_t>(pattern, size_t(1)));
} else {
(*it).second++;
}
if ((*reader)[0] != currentSubj) {
currentSubj = (*reader)[0];
numValidPatterns++;
auto it = patternCounts.find(pattern);
if (it == patternCounts.end()) {
patternCounts.insert(std::pair<Pattern, size_t>(pattern, size_t(1)));
} else {
numInvalidPatterns++;
(*it).second++;
}
isValidPattern = true;
pattern.clear();
patternIndex = 0;
}
Id currentPred = (*reader)[1];
// Ignore @lang@<predicate> language tagged predicates
if (currentPred >= _langPredLowerBound &&
currentPred < _langPredUpperBound) {
continue;
}

// don't list predicates twice
if (patternIndex == 0 || pattern[patternIndex - 1] != ((*reader)[1])) {
pattern.push_back((*reader)[1]);
if (patternIndex == 0 || pattern[patternIndex - 1] != currentPred) {
pattern.push_back(currentPred);
patternIndex++;
}
}
// process the last entry
if (isValidPattern) {
auto it = patternCounts.find(pattern);
if (it == patternCounts.end()) {
patternCounts.insert(std::pair<Pattern, size_t>(pattern, size_t(1)));
} else {
(*it).second++;
}
auto it = patternCounts.find(pattern);
if (it == patternCounts.end()) {
patternCounts.insert(std::pair<Pattern, size_t>(pattern, size_t(1)));
} else {
(*it).second++;
}
LOG(INFO) << "Counted patterns and found " << patternCounts.size()
<< " distinct patterns." << std::endl;
LOG(INFO) << "Patterns where found for " << numValidPatterns << " entities."
<< std::endl;
LOG(INFO) << "Discarded the patterns of " << numInvalidPatterns
<< " entities"
" because they were too large."
LOG(INFO) << "Patterns were found for " << numValidPatterns << " entities."
<< std::endl;

// stores patterns sorted by their number of occurrences
Expand Down Expand Up @@ -614,20 +612,14 @@ void Index::createPatternsImpl(const string& fileName, const StxxlVec& vec,
ad_utility::HashSet<Id> predicateHashSet;

pattern.clear();
currentRel = vec[0][0];
currentSubj = vec[0][0];
patternIndex = 0;
// Create the has-relation and has-pattern predicates
// Create the has-predicate and has-pattern predicates
for (StxxlVec::bufreader_type reader2(vec); !reader2.empty(); ++reader2) {
if ((*reader2)[0] != currentRel) {
if ((*reader2)[0] != currentSubj) {
// we have arrived at a new entity;
fullHasPredicateEntitiesDistinctSize++;
std::unordered_map<Pattern, Id>::iterator it;
if (isValidPattern) {
it = patternSet.find(pattern);
} else {
it = patternSet.end();
numInvalidEntities++;
}
auto it = patternSet.find(pattern);
// increase the haspredicate size here as every predicate is only
// listed once per entity (otherwise it would always be the same as
// vec.size()
Expand All @@ -641,12 +633,12 @@ void Index::createPatternsImpl(const string& fileName, const StxxlVec& vec,
fullHasPredicatePredicatesDistinctSize++;
}
entityHasPredicate.push_back(
std::array<Id, 2>{currentRel, pattern[i]});
std::array<Id, 2>{currentSubj, pattern[i]});
}
} else {
numEntitiesWithPatterns++;
// The pattern does exist, add an entry to the has-pattern predicate
entityHasPattern.push_back(std::array<Id, 2>{currentRel, it->second});
entityHasPattern.push_back(std::array<Id, 2>{currentSubj, it->second});
if (!haveCountedPattern[it->second]) {
haveCountedPattern[it->second] = true;
// iterate over the pattern once to
Expand All @@ -659,30 +651,25 @@ void Index::createPatternsImpl(const string& fileName, const StxxlVec& vec,
}
}
pattern.clear();
currentRel = (*reader2)[0];
currentSubj = (*reader2)[0];
patternIndex = 0;
isValidPattern = true;
}
// don't list predicates twice
if (patternIndex == 0 || pattern[patternIndex - 1] != ((*reader2)[1])) {
pattern.push_back((*reader2)[1]);
Id currentPred = (*reader2)[1];
if (patternIndex == 0 || pattern[patternIndex - 1] != currentPred) {
pattern.push_back(currentPred);
patternIndex++;
}
}
// process the last element
fullHasPredicateSize += pattern.size();
fullHasPredicateEntitiesDistinctSize++;
std::unordered_map<Pattern, Id>::iterator it;
if (isValidPattern) {
it = patternSet.find(pattern);
} else {
it = patternSet.end();
}
if (it == patternSet.end()) {
auto last = patternSet.find(pattern);
if (last == patternSet.end()) {
numEntitiesWithoutPatterns++;
// The pattern does not exist, use the has-predicate predicate instead
for (size_t i = 0; i < patternIndex; i++) {
entityHasPredicate.push_back(std::array<Id, 2>{currentRel, pattern[i]});
entityHasPredicate.push_back(std::array<Id, 2>{currentSubj, pattern[i]});
if (predicateHashSet.find(pattern[i]) == predicateHashSet.end()) {
predicateHashSet.insert(pattern[i]);
fullHasPredicatePredicatesDistinctSize++;
Expand All @@ -691,7 +678,7 @@ void Index::createPatternsImpl(const string& fileName, const StxxlVec& vec,
} else {
numEntitiesWithPatterns++;
// The pattern does exist, add an entry to the has-pattern predicate
entityHasPattern.push_back(std::array<Id, 2>{currentRel, it->second});
entityHasPattern.push_back(std::array<Id, 2>{currentSubj, last->second});
for (size_t i = 0; i < patternIndex; i++) {
if (predicateHashSet.find(pattern[i]) == predicateHashSet.end()) {
predicateHashSet.insert(pattern[i]);
Expand Down Expand Up @@ -734,7 +721,7 @@ void Index::createPatternsImpl(const string& fileName, const StxxlVec& vec,
<< fullHasPredicateMultiplicityPredicates << std::endl;

// Store all data in the file
ad_utility::File file(fileName.c_str(), "w");
ad_utility::File file(fileName, "w");

// Write a byte of ones to make it less likely that an unversioned file is
// read as a versioned one (unversioned files begin with the id of the lowest
Expand Down Expand Up @@ -981,7 +968,7 @@ void Index::createFromOnDiskIndex(const string& onDiskBase,
// Read the pattern info from the patterns file
std::string patternsFilePath = _onDiskBase + ".index.patterns";
ad_utility::File patternsFile;
patternsFile.open(patternsFilePath.c_str(), "r");
patternsFile.open(patternsFilePath, "r");
AD_CHECK(patternsFile.isOpen());
off_t off = 0;
unsigned char firstByte;
Expand Down Expand Up @@ -1073,19 +1060,19 @@ bool Index::ready() const { return _psoFile.isOpen() && _posFile.isOpen(); }
// _____________________________________________________________________________
void Index::openFileHandles() {
AD_CHECK(_onDiskBase.size() > 0);
_psoFile.open((_onDiskBase + ".index.pso").c_str(), "r");
_posFile.open((_onDiskBase + ".index.pos").c_str(), "r");
_psoFile.open((_onDiskBase + ".index.pso"), "r");
_posFile.open((_onDiskBase + ".index.pos"), "r");
if (ad_utility::File::exists(_onDiskBase + ".index.spo")) {
_spoFile.open((_onDiskBase + ".index.spo").c_str(), "r");
_spoFile.open((_onDiskBase + ".index.spo"), "r");
}
if (ad_utility::File::exists(_onDiskBase + ".index.sop")) {
_sopFile.open((_onDiskBase + ".index.sop").c_str(), "r");
_sopFile.open((_onDiskBase + ".index.sop"), "r");
}
if (ad_utility::File::exists(_onDiskBase + ".index.osp")) {
_ospFile.open((_onDiskBase + ".index.osp").c_str(), "r");
_ospFile.open((_onDiskBase + ".index.osp"), "r");
}
if (ad_utility::File::exists(_onDiskBase + ".index.ops")) {
_opsFile.open((_onDiskBase + ".index.ops").c_str(), "r");
_opsFile.open((_onDiskBase + ".index.ops"), "r");
}
AD_CHECK(_psoFile.isOpen());
AD_CHECK(_posFile.isOpen());
Expand Down Expand Up @@ -1548,7 +1535,7 @@ size_t Index::sizeEstimate(const string& sub, const string& pred,
// _____________________________________________________________________________
template <class T>
void Index::writeAsciiListFile(const string& filename, const T& ids) const {
std::ofstream f(filename.c_str());
std::ofstream f(filename);

for (size_t i = 0; i < ids.size(); ++i) {
f << ids[i] << ' ';
Expand Down
21 changes: 12 additions & 9 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ struct VocabularyData {
using StxxlVec = stxxl::vector<array<Id, 3>>;
// The total number of distinct words in the complete Vocabulary
size_t nofWords;
// Id lower and upper bound of @lang@<predicate> predicates
Id langPredLowerBound;
Id langPredUpperBound;
// The number of triples in the idTriples vec that each partial vocabulary is
// responsible for (depends on the number of additional language filter
// triples)
Expand Down Expand Up @@ -319,6 +322,8 @@ class Index {
bool hasAllPermutations() const { return _spoFile.isOpen(); }

private:
Id _langPredLowerBound;
Id _langPredUpperBound;
string _onDiskBase;
string _settingsFileName;
bool _onDiskLiterals = false;
Expand Down Expand Up @@ -380,7 +385,6 @@ class Index {
VocabularyData passFileForVocabulary(const string& ntFile,
size_t linesPerPartial = 100000000);

template <class Parser>
void convertPartialToGlobalIds(StxxlVec& data,
const vector<size_t>& actualLinesPerPartial,
size_t linesPerPartial);
Expand Down Expand Up @@ -444,14 +448,13 @@ class Index {
* @param fileName The name of the file in which the data should be stored
* @param vec The vectors of triples in spo order.
*/
static void createPatternsImpl(const string& fileName, const StxxlVec& vec,
CompactStringVector<Id, Id>& hasPredicate,
std::vector<PatternID>& hasPattern,
CompactStringVector<size_t, Id>& patterns,
double& fullHasPredicateMultiplicityEntities,
double& fullHasPredicateMultiplicityPredicates,
size_t& fullHasPredicateSize,
size_t maxNumPatterns);
void createPatternsImpl(const string& fileName, const StxxlVec& vec,
CompactStringVector<Id, Id>& hasPredicate,
std::vector<PatternID>& hasPattern,
CompactStringVector<size_t, Id>& patterns,
double& fullHasPredicateMultiplicityEntities,
double& fullHasPredicateMultiplicityPredicates,
size_t& fullHasPredicateSize, size_t maxNumPatterns);

// wrap the static function using the internal member variables
// the bool indicates whether the StxxlVec has to be sorted before the pattern
Expand Down
16 changes: 15 additions & 1 deletion src/index/VocabularyGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ class QueueCompare {
};

// ___________________________________________________________________
size_t mergeVocabulary(const std::string& basename, size_t numFiles) {
size_t mergeVocabulary(const std::string& basename, size_t numFiles,
Id* langPredLowerBound, Id* langPredUpperBound) {
std::vector<std::fstream> infiles;

// we will store pairs of <partialId, globalId>
Expand Down Expand Up @@ -79,6 +80,9 @@ size_t mergeVocabulary(const std::string& basename, size_t numFiles) {
// the number of words we have written. This also is the global Id of the next
// word we see, unless it is equal to the previous word
size_t totalWritten = 0;
bool firstLangPredSeen = false;
*langPredLowerBound = 0;
*langPredUpperBound = 0;

// start k-way merge
while (!queue.empty()) {
Expand All @@ -100,6 +104,16 @@ size_t mergeVocabulary(const std::string& basename, size_t numFiles) {
// write id to corresponding vec
idVecs[top._partialFileId].push_back(
std::make_pair(top._partialWordId, totalWritten));

if (top._value.size() > 0 && top._value[0] == '@') {
// exclusive
*langPredUpperBound = totalWritten + 1;
if (!firstLangPredSeen) {
// inclusive
*langPredLowerBound = totalWritten;
firstLangPredSeen = true;
}
}
totalWritten++;
} else {
// this is a duplicate which already occured in another partial vocabulary
Expand Down