Merge branch 'master' into f.efficientIndexCreation

ad-freiburg · Jul 5, 2018 · 8730c41 · 8730c41
2 parents cc275bf + 2cbf8d1
commit 8730c41
Show file tree

Hide file tree

Showing 15 changed files with 554 additions and 130 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
+# End-to-End data
+e2e_data/*
 # Compiled Object files
 *.slo
 *.lo

diff --git a/.travis.yml b/.travis.yml
@@ -5,12 +5,20 @@ dist: trusty
 addons:
   apt:
     sources:
+      - deadsnakes
       - ubuntu-toolchain-r-test
     packages:
       - gcc-5
       - g++-5
       - libsparsehash-dev
+      - python3.6
+      - python3-yaml
       - cmake
+      - netcat
+
+cache:
+  directories:
+    - e2e_data/scientist-collection/
 
 env:
   - CC=gcc-5 CXX=g++-5
@@ -21,9 +29,11 @@ before_script:
   - cd build
   - cmake ..
 
-script: 
-  - make -j 3 
+script:
+  - make -j 3
   - make test
+  - cd ..
+  - e2e/e2e.sh
 
 notifications:
   email:

diff --git a/README.md b/README.md
diff --git a/e2e/e2e.sh b/e2e/e2e.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+# Abort as soon as any command fails
+set -e
+# Directory above the one containing this script. The inner quotes keep a
+# checkout path that contains spaces from being split into several words.
+PROJECT_DIR="$(dirname "${BASH_SOURCE[0]}")/.."
+# Change to the project directory so we can use simple relative paths
+cd "$PROJECT_DIR"
+# Print the given message and abort the script with exit status 1.
+# The message goes to stderr so it is not mixed into regular output.
+function bail {
+	echo "$*" >&2
+	exit 1
+}
+
+# Dump the server log and stop the server process started by this script.
+# Installed as the EXIT trap further down, so it runs however the script ends.
+function cleanup_server {
+	echo "The Server Log follows:"
+	# `set -e` is active, so a failing command here would end the handler
+	# early; guard each step so the kill below is always attempted.
+	cat "build/server_log.txt" || true
+	# Signal only the server itself; it may already have exited, which is
+	# not an error during cleanup.
+	kill "$SERVER_PID" 2> /dev/null || true
+}
+
+# Travis CI uses old OS images, so to get a python that supports typing we
+# install python3.6 from the deadsnakes repository, which does not override
+# the system python. Prefer that interpreter when it is present.
+if [ -f "/usr/bin/python3.6" ]; then
+	PYTHON_BINARY="/usr/bin/python3.6"
+else
+	# Assign first and export afterwards: `export VAR=$(cmd)` would hide a
+	# failed lookup, leaving PYTHON_BINARY empty without any error.
+	PYTHON_BINARY="$(command -v python3)" || bail "No python3 interpreter found"
+fi
+export PYTHON_BINARY
+
+# Make sure the data directory exists before anything is written into it
+mkdir -p "e2e_data"
+# Probe for a file inside the collection rather than for the directory
+# itself: Travis' caching creates the directory even when it is empty.
+if ! [ -e "e2e_data/scientist-collection/scientists.nt" ]; then
+	# The collection ships as a ZIP archive, which cannot easily be
+	# decompressed from stdin, so it is stored on disk first.
+	wget --output-document="e2e_data/scientist-collection.zip" \
+		"http://filicudi.informatik.uni-freiburg.de/bjoern-data/scientist-collection.zip"
+	unzip "e2e_data/scientist-collection.zip" -d "e2e_data"
+fi
+
+# Common path prefix of all index files
+INDEX="e2e_data/scientists-index"
+
+# Delete and rebuild the index unless the first argument is "no-index"
+if [ "$1" != "no-index" ]; then
+	# The glob has to stay outside the quotes: a quoted "$INDEX.*" is taken
+	# literally and would leave every old index file in place.
+	rm -f "$INDEX".*
+	pushd "./build"
+	./IndexBuilderMain -a -l -i "../$INDEX" \
+		-n "../e2e_data/scientist-collection/scientists.nt" \
+		-w "../e2e_data/scientist-collection/scientists.wordsfile.tsv" \
+		-d "../e2e_data/scientist-collection/scientists.docsfile.tsv" \
+		--patterns || bail "Building Index failed"
+	popd
+fi
+
+# Launch the Server using the freshly baked index. Can't simply use a subshell here because
+# then we can't easily get the SERVER_PID out of that subshell
+pushd "./build"
+./ServerMain -i "../$INDEX" -p 9099 -t -a -l --patterns &> server_log.txt &
+SERVER_PID=$!
+popd
+
+# Setup the kill switch so it gets called whatever way we exit
+trap cleanup_server EXIT
+echo "Waiting for ServerMain to launch and open port"
+while ! curl --max-time 1 --output /dev/null --silent http://localhost:9099/; do
+	# Give up when the server process is gone instead of polling a port
+	# that will never open.
+	kill -0 "$SERVER_PID" 2> /dev/null || bail "ServerMain exited before opening its port"
+	sleep 1
+done
+# Run the query checks against the running server
+"$PYTHON_BINARY" e2e/queryit.py "e2e/scientists_queries.yaml" "http://localhost:9099" || bail "Querying Server failed"
diff --git a/e2e/queryit.py b/e2e/queryit.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python3
+"""
+QLever Query Tool for End2End Testing
+"""
+
+import json
+import sys
+import urllib.parse
+import urllib.request
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+import yaml
+
+class Color:
+    """
+    Plain namespace of ANSI escape sequences for terminal output.
+
+    The attributes are ordinary strings meant to be concatenated with the
+    text they should affect; ENDC switches all formatting off again.
+    """
+    # Foreground colours
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    # Reset everything set by the codes in this class
+    ENDC = '\033[0m'
+    # Text attributes
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+
+def eprint(*args, color=Color.FAIL, **kwargs):
+    """
+    print() variant that writes to stderr and colours its output.
+
+    The colour code is written first, the arguments are then printed the
+    way print() would, and a colour reset followed by a newline ends the
+    message. Extra keyword arguments are forwarded to print().
+    """
+    stream = sys.stderr
+    stream.write(color)
+    print(*args, file=stream, **kwargs)
+    print(Color.ENDC, file=stream)
+
+def exec_query(endpoint_url: str, sparql: str,
+               max_send: int = 4096) -> Optional[Dict[str, Any]]:
+    """
+    Execute a single SPARQL query against the given endpoint.
+
+    The query is sent as a GET request to endpoint_url with '/?' and the
+    URL-encoded parameters 'query' and 'send' appended.
+
+    Args:
+        endpoint_url: Base URL of the endpoint.
+        sparql: Text of the SPARQL query.
+        max_send: Value passed to the server as the 'send' parameter.
+
+    Returns:
+        The decoded JSON response, or None when the request failed, the
+        HTTP status was not 200 or the body was not valid JSON. An error
+        message is printed in each of those cases.
+    """
+    params = urllib.parse.urlencode({'query': sparql, 'send': max_send})
+    request = urllib.request.Request(endpoint_url + '/?' + params)
+    try:
+        # The with-block closes the connection once the body has been read
+        with urllib.request.urlopen(request) as conn:
+            if conn.status != 200:
+                eprint("Error executing SPARQL Query: ", sparql)
+                return None
+            return json.load(conn)
+    except (OSError, ValueError) as err:
+        # urlopen() raises URLError/HTTPError (both OSError subclasses) for
+        # connection problems and HTTP error statuses instead of returning
+        # a response; undecodable JSON surfaces as a ValueError.
+        eprint("Error executing SPARQL Query: ", sparql)
+        eprint(err)
+        return None
+
+def is_result_sane(result: Dict[str, Any]) -> bool:
+    """
+    Check that a QLever result contains all expected top-level fields.
+
+    The first missing field is reported through eprint() and makes the
+    function return False; a result with every field present yields True.
+    """
+    expected = ('query', 'status', 'resultsize', 'selected', 'res')
+    absent = next((name for name in expected if name not in result), None)
+    if absent is None:
+        return True
+    eprint('QLever Result is missing "%s" field' % absent)
+    return False
+
+def test_row(gold_row: List[Any],
+             actual_row: List[Any], epsilon: float = 0.1) -> bool:
+    """
+    Test if gold_row and actual_row match cell by cell.
+
+    Args:
+        gold_row: Expected cells. A None cell is a wildcard and is skipped.
+        actual_row: Cells returned by the server.
+        epsilon: Largest absolute difference accepted for float cells.
+
+    Returns:
+        True if every non-None gold cell matches the actual cell at the same
+        position. An int gold cell is compared after converting the actual
+        cell with int(), a float gold cell after converting it with float();
+        everything else is compared with ==. A missing actual cell or one
+        that cannot be converted counts as a mismatch.
+    """
+    for i, gold in enumerate(gold_row):
+        if gold is None:
+            continue
+        if i >= len(actual_row):
+            # The actual row is too short to contain this cell
+            return False
+        actual = actual_row[i]
+        try:
+            if isinstance(gold, int):
+                matches = int(actual) == gold
+            elif isinstance(gold, float):
+                matches = abs(gold - float(actual)) <= epsilon
+            else:
+                matches = gold == actual
+        except (TypeError, ValueError):
+            # A cell that is not numeric cannot equal a numeric gold value
+            matches = False
+
+        if not matches:
+            return False
+    return True
+
+def test_check(check_dict: Dict[str, Any], result: Dict[str, Any]) -> bool:
+    """
+    Test if the named result checks hold. Returns True if they all do.
+
+    Args:
+        check_dict: Maps a check name to its expected value. Handled names:
+            'num_rows'     - number of rows in result['res']
+            'num_cols'     - length of every row in result['res']
+            'selected'     - must equal result['selected']
+            'res'          - gold rows, compared with the leading result rows
+            'contains_row' - a gold row that must match some result row
+            Any other name is ignored.
+        result: QLever result; its 'res' field is always read.
+
+    Returns:
+        False after printing a message for the first failing check,
+        True otherwise.
+    """
+    res = result['res']
+    for check, value in check_dict.items():
+        if check == 'num_rows':
+            if len(res) != int(value):
+                eprint("num_rows check failed:\n" +
+                       "\texpected %r, got %r" %
+                       (value, len(res)))
+                return False
+        elif check == 'num_cols':
+            for row in res:
+                if len(row) != int(value):
+                    eprint("num_cols check failed:\n" +
+                           "\texpected %r, got %r, row: %s" %
+                           (value, len(row), json.dumps(row)))
+                    return False
+        elif check == 'selected':
+            if value != result['selected']:
+                eprint("selected check failed:\n" +
+                       "\texpected %r, got %r" %
+                       (value, result['selected']))
+                return False
+        elif check == 'res':
+            gold_res = value
+            if len(gold_res) > len(res):
+                # Fewer rows than gold rows is a failed check, not a crash
+                eprint("res check failed:\n" +
+                       "\texpected at least %r rows, got %r" %
+                       (len(gold_res), len(res)))
+                return False
+            for i, (gold_row, actual_row) in enumerate(zip(gold_res, res)):
+                if not test_row(gold_row, actual_row):
+                    eprint("res check failed:\n" +
+                           "\tat row %r" % i +
+                           "\texpected %r, got %r" %
+                           (gold_row, actual_row))
+                    return False
+        elif check == 'contains_row':
+            gold_row = value
+            if not any(test_row(gold_row, actual_row) for actual_row in res):
+                eprint("contains_row check failed:\n" +
+                       "\tdid not find %r" % gold_row)
+                return False
+
+    return True
+
+
+
+def solution_checks(solution: Dict[str, Any],
+                    result: Dict[str, Any]) -> bool:
+    """
+    Run every entry of solution['checks'] against the result.
+
+    A solution without a 'checks' key passes. All checks are executed even
+    after one has failed, so each failure gets reported; the return value
+    is True only if none of them failed.
+    """
+    if 'checks' not in solution:
+        return True
+    # Build the complete list first so no check is skipped by short-circuiting
+    outcomes = [test_check(check, result) for check in solution['checks']]
+    return all(outcomes)
+
+def print_qlever_result(result: Dict[str, Any]) -> None:
+    """
+    Dump a QLever result as JSON through eprint(), i.e. to stderr
+    """
+    serialized = json.dumps(result)
+    eprint(serialized)
+
+
+def _run_solution(endpoint_url: str, query_name: str,
+                  solution: Dict[str, Any]) -> bool:
+    """
+    Execute one solution of a query and return True if it passed all checks
+    """
+    solution_type = solution['type']
+    solution_sparql = solution['sparql']
+    print(Color.HEADER+'Trying: ', query_name,
+          '(%s)' % solution_type + Color.ENDC)
+    print('SPARQL:')
+    print(solution_sparql)
+    result = exec_query(endpoint_url, solution_sparql)
+    if not result:
+        # A print was already done in exec_query()
+        print_qlever_result(result)
+        return False
+
+    if not is_result_sane(result):
+        print_qlever_result(result)
+        return False
+
+    if result['status'] != 'OK':
+        eprint('QLever Result "status" is not "OK"')
+        print_qlever_result(result)
+        return False
+
+    return solution_checks(solution, result)
+
+
+def main() -> None:
+    """
+    Run QLever queries stored in a YAML file against a QLever instance.
+
+    Usage: <script> <yaml_in> <qlever_endpoint_url>, where '-' as yaml_in
+    reads the YAML from stdin. The YAML needs a top-level 'queries' list;
+    each entry has a 'query' name and a list of 'solutions', each with
+    'type', 'sparql' and optionally 'checks'.
+
+    Exits with status 1 on wrong usage and with status 2 if any solution
+    failed.
+    """
+    if len(sys.argv) != 3:
+        eprint("Usage: ", sys.argv[0], "<yaml_in> <qlever_endpoint_url>")
+        sys.exit(1)
+
+    inpath = sys.argv[1]
+    endpoint_url = sys.argv[2]
+    with open(inpath, 'rb') if inpath != '-' else sys.stdin as infile:
+        # safe_load only builds plain Python objects. yaml.load() without an
+        # explicit Loader can construct arbitrary objects from the input
+        # and is rejected by newer PyYAML releases.
+        yaml_tree = yaml.safe_load(infile)
+
+    error_detected = False
+    for query in yaml_tree['queries']:
+        query_name = query['query']
+        for solution in query['solutions']:
+            # Keep going after a failure so every problem gets reported
+            if not _run_solution(endpoint_url, query_name, solution):
+                error_detected = True
+
+    if error_detected:
+        print(Color.FAIL+'Query tool found errors!'+Color.ENDC)
+        sys.exit(2)
+
+    print(Color.OKGREEN+'Query tool did not find errors, search harder!'+Color.ENDC)
+
+
+
+# Run the tool only when this file is executed as a script, not on import
+if __name__ == '__main__':
+    main()