Skip to content

Commit

Permalink
Merge pull request #105 from joka921/f.prefixCompressionNew
Browse files Browse the repository at this point in the history
Prefix Compression and faster startup time
  • Loading branch information
joka921 committed Sep 1, 2018
2 parents 23a8bad + f4bf41a commit 240daaa
Show file tree
Hide file tree
Showing 37 changed files with 1,855 additions and 455 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest.git
[submodule "third_party/json"]
path = third_party/json
url = https://github.com/nlohmann/json.git
10 changes: 10 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,13 @@ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DLOGLEVEL=${LOG_LEVEL_DEBUG}")
################################
add_subdirectory(third_party/googletest/googletest)
include_directories(third_party/googletest/googletest/include)

################################
# NLOHMANN-JSON
################################
# Header only, nothing to build or link
include_directories(third_party/json/include/)

################################
# STXXL
################################
Expand Down Expand Up @@ -106,6 +113,9 @@ target_link_libraries (WriteIndexListsMain engine ${CMAKE_THREAD_LIBS_INIT})
add_executable(MetaDataConverterMain src/MetaDataConverterMain.cpp)
target_link_libraries (MetaDataConverterMain metaConverter ${CMAKE_THREAD_LIBS_INIT})

add_executable(PrefixHeuristicEvaluatorMain src/PrefixHeuristicEvaluatorMain.cpp)
target_link_libraries (PrefixHeuristicEvaluatorMain index ${CMAKE_THREAD_LIBS_INIT})

#add_executable(TextFilterComparison src/experiments/TextFilterComparison.cpp)
#target_link_libraries (TextFilterComparison experiments)

Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ VOLUME ["/input", "/index"]
ENV INDEX_PREFIX index
# Need the shell to get the INDEX_PREFIX environment variable
ENTRYPOINT ["/bin/sh", "-c", "exec ServerMain -i \"/index/${INDEX_PREFIX}\" -p 7001 \"$@\"", "--"]
CMD ["-t", "-a", "-l", "-P"]
CMD ["-t", "-a", "-P"]

# docker build -t qlever-<name> .
# # When running with user namespaces you may need to make the index folder accessible
Expand Down
13 changes: 7 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ the name chosen during `docker build`.
docker run -it -p 7001:7001 -v "$(pwd)/index:/index" -e INDEX_PREFIX=<prefix> --name qlever-<name> qlever-<name> <ServerMain args>

where `<ServerMain args>` are arguments (except for port and index prefix)
which are always included. If none are supplied `-t -a -l` is used. If you want
which are always included. If none are supplied `-t -a` is used. If you want
the container to run in the background and restart automatically replace `-it`
with `-d --restart=unless-stopped`

Expand Down Expand Up @@ -214,7 +214,7 @@ To generate a patterns file and include support for ql:has-predicates:

./IndexBuilderMain -i /path/to/myindex -n /path/to/input.nt --patterns

If you want some literals to be written to an on disk vocabulary (by default this concerns literals longer than 50 chars and literals in less frequent lagnuages), add an topional parameter -l. This is useful for large knowledge bases that included texts (descriptions etc) as literals and thus consume lots of memory on startup without this option.
If you want some literals to be written to an on disk vocabulary (by default this concerns literals longer than 50 chars and literals in less frequent languages), add an optional parameter -l. This is useful for large knowledge bases that include texts (descriptions etc) as literals and thus consume lots of memory on startup without this option.

./IndexBuilderMain -i /path/to/myindex -n /path/to/input.nt -l

Expand All @@ -239,9 +239,10 @@ b) With text collection:
Depending on whether you built the index with the -a option, either six (with -a) or two (without -a) index permutations will be registered.
For some data this can be a significant difference in memory consumption.

If you built an index using the -l and/or -a options, make sure to include it at startup
If you built an index using the -a option, make sure to include it at startup
(otherwise only 2 of the 6 permutations will be registered).

./ServerMain -i /path/to/myindex -p <PORT> -t -a -l
./ServerMain -i /path/to/myindex -p <PORT> -t -a

## 4. Running queries:

Expand Down Expand Up @@ -553,8 +554,8 @@ memory usage. Larger KBs are much more problematic.
There are two things that can contribute to high RAM usage (and large startup
times) during runtime:

1) The size of the KB vocabulary. Using the -l flag while building the index and
starting the server causes long and rarely used strings to be externalized to
1) The size of the KB vocabulary. Using the -l flag while building the index
causes long and rarely used strings to be externalized to
disk. This saves a significant amount of memory at little to no time cost for
typical queries. The strategy can be modified to be more aggressive (currently
by editing directly in the code during index construction)
Expand Down
2 changes: 1 addition & 1 deletion e2e/e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ fi
# Launch the Server using the freshly baked index. Can't simply use a subshell here because
# then we can't easily get the SERVER_PID out of that subshell
pushd "./build"
./ServerMain -i "../$INDEX" -p 9099 -t -a -l --patterns &> server_log.txt &
./ServerMain -i "../$INDEX" -p 9099 -t -a --patterns &> server_log.txt &
SERVER_PID=$!
popd

Expand Down
21 changes: 15 additions & 6 deletions src/MetaDataConverterMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,22 @@
#include "./util/File.h"

// _________________________________________________________
// Opens an index from disk. Determines whether this index was built by an older
// QLever version and has to be updated in order to use it (efficiently or at
// all) with the current QLever version. Will NOT overwrite existing files but
// create new files with a .converted suffix which has to be manually removed
// to make the index work. It is highly recommended to backup the original index
// before overwriting it like this.
//
// This converter prints detailed information about which files were created and
// which files have to be renamed in order to complete the index update.
int main(int argc, char** argv) {
if (argc != 2) {
std::cerr << "Usage: ./MetaDataConverterMain <indexPrefix>\n";
exit(1);
}
std::string in = argv[1];
std::array<std::string, 4> sparseNames{".pso", ".pos", ".spo", ".sop"};
std::array<std::string, 2> sparseNames{".pso", ".pos"};
for (const auto& n : sparseNames) {
std::string permutName = in + ".index" + n;
if (!ad_utility::File::exists(permutName)) {
Expand All @@ -24,11 +33,10 @@ int main(int argc, char** argv) {
"this index. Skipping\n";
continue;
}
addMagicNumberToSparseMetaDataPermutation(permutName,
permutName + ".converted");
convertPermutationToHmap(permutName, permutName + ".converted");
}

std::array<std::string, 2> denseNames{".osp", ".ops"};
std::array<std::string, 4> denseNames{".spo", ".sop", ".osp", ".ops"};
for (const auto& n : denseNames) {
std::string permutName = in + ".index" + n;
if (!ad_utility::File::exists(permutName)) {
Expand All @@ -37,7 +45,8 @@ int main(int argc, char** argv) {
"this index. Skipping\n";
continue;
}
convertHmapBasedPermutatationToMmap(permutName, permutName + ".converted",
permutName + MMAP_FILE_SUFFIX);
convertPermutationToMmap(permutName, permutName + ".converted",
permutName + MMAP_FILE_SUFFIX);
}
CompressVocabAndCreateConfigurationFile(in);
}
43 changes: 43 additions & 0 deletions src/PrefixHeuristicEvaluatorMain.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Copyright 2018, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Johannes Kalmbach<joka921> (johannes.kalmbach@gmail.com)

#include <iostream>
#include "./global/Constants.h"
#include "./index/PrefixHeuristic.h"

// Reads a vocabulary of words from file, calculates the prefixes with which the
// greedy heuristic would compress this vocabulary and prints them on the
// screen (mostly for testing and evaluation purposes of the greedy algorithm)
//
// It is assumed, that there are 127 prefixes which are encoded by 1 byte each.
// Also prints some statistics about the compression (e.g. compression ratio)
//
// The vocabulary in the input file at argv[1] must be one word per line and
// alphabetically sorted
// _______________________________________________________________
int main(int argc, char** argv) {
  // Exactly one positional argument (the vocabulary file) is expected. With
  // any other argument count the usage text goes to stderr and the program
  // terminates with exit status 1.
  const bool hasExactlyOneArgument = (argc == 2);
  if (!hasExactlyOneArgument) {
    const char* const usageLine =
        "Usage: ./PrefixHeuristicEvaluatorMain <filename>\n";
    const char* const description =
        "Reads a vocabulary of words from file, calculates the "
        "prefixes with which the greedy heuristic would compress this "
        "vocabulary and prints them on the"
        " screen (mostly for testing and evaluation purposes of the "
        "greedy algorithm).\n"

        " It is assumed, that there are 127 prefixes which are "
        "encoded by 1 byte each."
        " Also prints some statistics about the compression (e.g. "
        "compression ratio)\n"

        " The vocabulary in the input file at argv[1] must be one "
        "word per line and alphabetically sorted";
    std::cerr << usageLine << description;
    return 1;
  }

  // NOTE(review): the literal 127 and NUM_COMPRESSION_PREFIXES are passed as
  // the second and third argument. The signature of calculatePrefixes is not
  // visible from this file, so please confirm that "number of prefixes" and
  // "bytes per prefix code" are passed in the intended positions.
  auto&& prefixes =
      calculatePrefixes(argv[1], 127, NUM_COMPRESSION_PREFIXES, true);

  // Print one computed prefix per line on stdout.
  for (const auto& prefix : prefixes) {
    std::cout << prefix << '\n';
  }
}

19 changes: 8 additions & 11 deletions src/ServerMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,6 @@ void printUsage(char* execName) {
<< "Show this help and exit." << endl;
cout << " " << std::setw(20) << "i, index" << std::setw(1) << " "
<< "The location of the index files." << endl;
cout << " " << std::setw(20) << "l, on-disk-literals" << std::setw(1)
<< " "
<< "Indicates that the literals can be found on disk with the index."
<< endl;
cout << " " << std::setw(20) << "p, port" << std::setw(1) << " "
<< "The port on which to run the web interface." << endl;
cout << " " << std::setw(20) << "P, patterns" << std::setw(1) << " "
Expand Down Expand Up @@ -81,7 +77,6 @@ int main(int argc, char** argv) {
// filled / set depending on the options.
string index = "";
bool text = false;
bool onDiskLiterals = false;
bool allPermutations = false;
bool optimizeOptionals = true;
int port = -1;
Expand All @@ -91,7 +86,7 @@ int main(int argc, char** argv) {
optind = 1;
// Process command line arguments.
while (true) {
int c = getopt_long(argc, argv, "i:p:j:tlauhPm", options, NULL);
int c = getopt_long(argc, argv, "i:p:j:tauhPml", options, NULL);
if (c == -1) break;
switch (c) {
case 'i':
Expand All @@ -106,9 +101,6 @@ int main(int argc, char** argv) {
case 't':
text = true;
break;
case 'l':
onDiskLiterals = true;
break;
case 'a':
allPermutations = true;
break;
Expand All @@ -122,6 +114,11 @@ int main(int argc, char** argv) {
printUsage(argv[0]);
exit(0);
break;
case 'l':
std::cout << "Warning: the -l flag (onDiskLiterals) is deprecated and "
"will be ignored for ServerMain. The correct setting for "
"this flag is read directly from the index\n";
break;
default:
cout << endl
<< "! ERROR in processing options (getopt returned '" << c
Expand Down Expand Up @@ -152,8 +149,8 @@ int main(int argc, char** argv) {

try {
Server server(port, numThreads);
server.initialize(index, text, allPermutations, onDiskLiterals,
optimizeOptionals, usePatterns);
server.initialize(index, text, allPermutations, optimizeOptionals,
usePatterns);
server.run();
} catch (const ad_semsearch::Exception& e) {
LOG(ERROR) << e.getFullErrorMessage() << '\n';
Expand Down
5 changes: 2 additions & 3 deletions src/engine/Server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,15 @@

// _____________________________________________________________________________
void Server::initialize(const string& ontologyBaseName, bool useText,
bool allPermutations, bool onDiskLiterals,
bool optimizeOptionals, bool usePatterns) {
bool allPermutations, bool optimizeOptionals,
bool usePatterns) {
LOG(INFO) << "Initializing server..." << std::endl;

_optimizeOptionals = optimizeOptionals;

_index.setUsePatterns(usePatterns);

// Init the index.
_index.setOnDiskLiterals(onDiskLiterals);
_index.createFromOnDiskIndex(ontologyBaseName, allPermutations);
if (useText) {
_index.addTextFromOnDiskIndex();
Expand Down
4 changes: 2 additions & 2 deletions src/engine/Server.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ class Server {

// Initialize the server.
void initialize(const string& ontologyBaseName, bool useText,
bool allPermutations = false, bool onDiskLiterals = false,
bool optimizeOptionals = true, bool usePatterns = false);
bool allPermutations = false, bool optimizeOptionals = true,
bool usePatterns = false);

//! Loop, wait for requests and trigger processing.
void run();
Expand Down
16 changes: 16 additions & 0 deletions src/global/Constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,19 @@ static const int DEFAULT_NOF_VALUE_MANTISSA_DIGITS = 30;
static const int DEFAULT_NOF_DATE_YEAR_DIGITS = 19;

static const std::string MMAP_FILE_SUFFIX = ".meta-mmap";
static const std::string CONFIGURATION_FILE = ".meta-data.json";

// Constants for the range of valid compression prefix codes.
// The codes occupy the byte values
//   [MIN_COMPRESSION_PREFIX, MIN_COMPRESSION_PREFIX + NUM_COMPRESSION_PREFIXES)
// so all printable ASCII characters are left out. When adding more special
// characters to the vocabulary make sure to leave out '\n', since the
// vocabulary is stored in a text file line by line.
// All prefix codes have a most significant bit of 1. According to the original
// design note this means a prefix code is never valid UTF-8, so it can always
// be determined whether a vocabulary was compressed or not.
// NOTE(review): byte values 0xC2-0xF4 are legal UTF-8 lead bytes, so this
// argument presumably relies on the bytes that follow the code -- confirm.
static constexpr uint8_t MIN_COMPRESSION_PREFIX = 128;
static constexpr uint8_t NUM_COMPRESSION_PREFIXES = 127;

// Enforce the documented invariants at compile time, so that changing one of
// the constants above cannot silently break the encoding.
static_assert(MIN_COMPRESSION_PREFIX >= 128,
              "compression prefix codes must have their most significant bit "
              "set (and must therefore never collide with ASCII or '\\n')");
static_assert(MIN_COMPRESSION_PREFIX + NUM_COMPRESSION_PREFIXES <= 0xFF,
              "NO_PREFIX_CHAR must fit into a single byte");

// If this is the first character of a compressed string, no compression has
// been applied to that word. It is the first byte value after the range of
// prefix codes.
static constexpr uint8_t NO_PREFIX_CHAR =
    MIN_COMPRESSION_PREFIX + NUM_COMPRESSION_PREFIXES;

5 changes: 3 additions & 2 deletions src/index/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

add_library(index
Index.h Index.cpp Index.Text.cpp
Vocabulary.h Vocabulary.cpp
Vocabulary.h VocabularyImpl.h
VocabularyGenerator.h VocabularyGenerator.cpp
ConstantsIndexCreation.h
ExternalVocabulary.h ExternalVocabulary.cpp
Expand All @@ -11,7 +11,8 @@ add_library(index
StxxlSortFunctors.h
TextMetaData.cpp TextMetaData.h
DocsDB.cpp DocsDB.h
FTSAlgorithms.cpp FTSAlgorithms.h)
FTSAlgorithms.cpp FTSAlgorithms.h
PrefixHeuristic.cpp PrefixHeuristic.h)

target_link_libraries(index parser ${STXXL_LIBRARIES})

Expand Down
60 changes: 60 additions & 0 deletions src/index/CompressedString.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Copyright 2018, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Johannes Kalmbach<joka921> (johannes.kalmbach@gmail.com)

#pragma once

#include <functional>
#include <string>

using std::string;

// Holds a string that has been compressed (e.g. an entry of the vocabulary).
// The std::string base class is private, so there is no automatic conversion
// between compressed and "ordinary" strings; this is meant to avoid bugs that
// arise from mixing them up. Every conversion has to be requested explicitly
// via fromString / toString / toStringView. Only the parts of the std::string
// interface that are actually used are made available.
// TODO<niklas> is there a better way to do this?
class CompressedString : private string {
 public:
  // Create an empty compressed string.
  CompressedString() {}

  // Explicit conversion from an ordinary string (copies the content).
  static CompressedString fromString(const string& other) {
    return CompressedString(other);
  }

  // Explicit conversion from an ordinary string (takes over the content).
  static CompressedString fromString(string&& other) {
    return CompressedString(std::move(other));
  }

  // Explicit conversion to an ordinary string (returns a copy).
  string toString() const { return string(asBase()); }

  // Explicit conversion to a non-owning view of the stored bytes.
  std::string_view toStringView() const { return std::string_view(asBase()); }

  // True iff no bytes are stored.
  bool empty() const { return asBase().empty(); }

  // Unchecked read access to the byte at position pos.
  const char& operator[](size_t pos) const { return asBase()[pos]; }

 private:
  // Access to the private std::string base of this object.
  const string& asBase() const { return static_cast<const string&>(*this); }

  // Private constructors and assignments that are used internally by the
  // explicit to and from string conversions.
  CompressedString(string&& other) : string(std::move(other)) {}

  CompressedString(const string& other) : string(other) {}

  CompressedString& operator=(string&& other) {
    string::operator=(std::move(other));
    return *this;
  }

  CompressedString& operator=(const string& other) {
    string::operator=(other);
    return *this;
  }
};
3 changes: 3 additions & 0 deletions src/index/ExternalVocabulary.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ class ExternalVocabulary {

void initFromFile(const string& file);

// Close the underlying file (_file.close()).
// NOTE(review): presumably the vocabulary is unusable after this call until it
// is initialized again (e.g. via initFromFile) -- confirm against the
// implementation in ExternalVocabulary.cpp.
void clear() { _file.close(); }

//! Get the word with the given id
//! (as non-reference, returning a const ref is not possible, because the
//! string does not necessarily already exist in memory - unlike for an
Expand Down

0 comments on commit 240daaa

Please sign in to comment.