Ranges for the four datatypes the vocabulary is holding (#443)

Get the ID range for each datatype (IRI, literal, numeric value, date). Add a function that prints the first and last element of each range as well as the neighboring elements just outside of the range.
ad-freiburg · Jul 23, 2021 · e4e0a21 · e4e0a21
1 parent 55b7b90
commit e4e0a21
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 0 deletions.
diff --git a/src/index/Vocabulary.cpp b/src/index/Vocabulary.cpp
@@ -474,6 +474,44 @@ const std::optional<string> Vocabulary<S, C>::idToOptionalString(Id id) const {
     return _externalLiterals[id];
   }
 }
+
+// ___________________________________________________________________________
+template <typename S, typename C>
+ad_utility::HashMap<typename Vocabulary<S, C>::Datatypes, std::pair<Id, Id>>
+Vocabulary<S, C>::getRangesForDatatypes() const {
+  // Maps every `Datatypes` value to the ID pair `prefix_range` yields for it.
+  ad_utility::HashMap<Datatypes, std::pair<Id, Id>> result;
+  result[Datatypes::Float] = prefix_range(VALUE_FLOAT_PREFIX);
+  result[Datatypes::Date] = prefix_range(VALUE_DATE_PREFIX);
+  result[Datatypes::Literal] = prefix_range("\"");
+  result[Datatypes::Iri] = prefix_range("<");
+  return result;
+}
+
+// Logs, per datatype, the ID range bounds and the words at the range borders.
+template <typename S, typename C>
+template <typename, typename>
+void Vocabulary<S, C>::printRangesForDatatypes() {
+  const auto datatypeRanges = getRangesForDatatypes();
+  for (const auto& entry : datatypeRanges) {
+    const auto& [firstId, endId] = entry.second;
+    LOG(INFO) << firstId << " " << endId << '\n';
+    if (endId > firstId) {
+      // Non-empty range: its first word, then the word at `endId - 1`.
+      LOG(INFO) << idToOptionalString(firstId).value() << '\n';
+      LOG(INFO) << idToOptionalString(endId - 1).value() << '\n';
+    }
+    if (endId < _words.size()) {
+      // The word at `endId`, i.e. right behind the printed range.
+      LOG(INFO) << idToOptionalString(endId).value() << '\n';
+    }
+    if (firstId > 0) {
+      // The word right in front of the range.
+      LOG(INFO) << idToOptionalString(firstId - 1).value() << '\n';
+    }
+  }
+}
+
 template const std::optional<string>
 RdfsVocabulary::idToOptionalString<CompressedString, void>(Id id) const;
 
@@ -492,6 +530,8 @@ template void RdfsVocabulary::prefixCompressFile<CompressedString, void>(
     const string& infile, const string& outfile,
     const vector<string>& prefixes);
 
+template void RdfsVocabulary::printRangesForDatatypes<CompressedString, void>();
+
 template void TextVocabulary::createFromSet<std::string, void>(
     const ad_utility::HashSet<std::string>& set);
 template void TextVocabulary::writeToFile<std::string, void>(

diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h
@@ -74,6 +74,9 @@ struct Prefix {
 //! CompressedString -> prefix compression is applied
 template <class StringType, class ComparatorType>
 class Vocabulary {
+  // Kinds of data stored in the vocabulary; keys of getRangesForDatatypes().
+  enum class Datatypes { Literal, Iri, Float, Date };
+
   template <typename T, typename R = void>
   using enable_if_compressed =
       std::enable_if_t<std::is_same_v<T, CompressedString>>;
@@ -177,6 +180,12 @@ class Vocabulary {
   // consider using the prefixRange function.
   bool getIdRangeForFullTextPrefix(const string& word, IdRange* range) const;
 
+  ad_utility::HashMap<Datatypes, std::pair<Id, Id>> getRangesForDatatypes()
+      const;  // Maps each datatype to the ID range of its vocabulary entries.
+
+  template <typename U = StringType, typename = enable_if_compressed<U>>
+  void printRangesForDatatypes();  // Logs datatype ID ranges and border words.
+
   // only used during Index building, not needed for compressed vocabulary
   template <typename U = StringType, typename = enable_if_uncompressed<U>>
   void createFromSet(const ad_utility::HashSet<StringType>& set);