import ryml

# because ryml does not take ownership of the source buffer
# ryml cannot accept strings; only bytes or bytearrays
src = b"{HELLO: a, foo: b, bar: c, baz: d, seq: [0, 1, 2, 3]}"


def check(tree):
    """Exercise the index-based low-level API on the tree parsed from src.

    Node ids follow document order: 0 is the root map, 1..4 are the
    key-value pairs HELLO/foo/bar/baz, 5 is the `seq` sequence and 6..9
    are its four elements.
    """
    # for now, only the index-based low-level API is implemented
    assert tree.size() == 10
    assert tree.root_id() == 0
    assert tree.first_child(0) == 1
    assert tree.next_sibling(1) == 2
    assert tree.first_sibling(5) == 1
    assert tree.last_sibling(1) == 5
    # use bytes objects for queries
    assert tree.find_child(0, b"foo") == 2
    assert tree.key(2) == b"foo"
    assert tree.val(2) == b"b"
    assert tree.find_child(0, b"seq") == 5
    assert tree.is_seq(5)
    # to loop over children:
    for i, ch in enumerate(ryml.children(tree, 5)):
        assert tree.val(ch) == [b"0", b"1", b"2", b"3"][i]
    # to loop over siblings:
    for i, sib in enumerate(ryml.siblings(tree, 5)):
        assert tree.key(sib) == [b"HELLO", b"foo", b"bar", b"baz", b"seq"][i]
    # to walk over all elements
    visited = [False] * tree.size()
    for n, indentation_level in ryml.walk(tree):
        # just a dumb emitter
        left = " " * indentation_level
        if tree.is_keyval(n):
            print("{}{}: {}".format(left, tree.key(n), tree.val(n)))
        elif tree.is_val(n):
            print("{}- {}".format(left, tree.val(n)))
        elif tree.is_keyseq(n):
            print("{}{}:".format(left, tree.key(n)))
        visited[n] = True
    assert False not in visited
    # NOTE about encoding!
    k = tree.get_key(5)
    print(k)  # prints a memoryview repr, not 'seq'
    assert k == b"seq"               # compares equal to bytes, as expected
    assert k != "seq"                # never equal to a str - NOTE THIS!
    assert str(k) != "seq"           # str() without an encoding does not decode
    assert str(k, "utf8") == "seq"   # decoding explicitly gives the text


# parse immutable buffer
tree = ryml.parse(src)
check(tree)  # OK

# also works, but requires bytearrays or
# objects offering writeable memory
mutable = bytearray(src)
tree = ryml.parse_in_situ(mutable)
check(tree)  # OK
+ + ------ ## YAML standard conformance @@ -698,12 +800,11 @@ appear some cases which YAML fails to parse. So we welcome your [bug reports or pull requests!](https://github.com/biojppm/rapidyaml/issues). -Integration of the ~300 cases in -the [YAML test suite](https://github.com/yaml/yaml-test-suite) is ongoing +Integration of the ~300 cases in the +[YAML test suite](https://github.com/yaml/yaml-test-suite) is ongoing work. - ------ ## Alternative libraries diff --git a/api/CMakeLists.txt b/api/CMakeLists.txt index d3863b8cc..9aeb986b3 100644 --- a/api/CMakeLists.txt +++ b/api/CMakeLists.txt @@ -116,6 +116,14 @@ if(RYML_BUILD_API_PYTHON3) endfunction() add_python_test(parse.py) + #if(RYML_BUILD_BENCHMARKS) + # c4_add_benchmark_cmd(ryml ryml-python3-api-bm-travis-ryml + # COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/../bm/cases/travis.yml ryml) + # c4_add_benchmark_cmd(ryml ryml-python3-api-bm-appveyor-ryml + # COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/../bm/cases/appveyor.yml ryml) + # c4_add_benchmark_cmd(ryml ryml-python3-api-bm-compile_commands-ryml + # COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/../bm/cases/compile_commands.json ryml) + #endif() endif() diff --git a/api/python/parse.py b/api/python/parse.py index 60093a8b6..d6f3a3916 100644 --- a/api/python/parse.py +++ b/api/python/parse.py @@ -7,8 +7,18 @@ class SimpleHardcoded: yaml = "{HELLO: a, foo: b, bar: c, baz: d, seq: [0, 1, 2, 3]}" def check(self, ut, t): + + for i, sib in enumerate(ryml.siblings(t, 5)): + s = t.key(sib) + r = [b"HELLO", b"foo", b"bar", b"baz", b"seq"][i] + print("'{}' vs '{}': {}, {}".format(s, r, s == r, s is not r)) + s = str(t.key(sib), "utf8") + r = ["HELLO", "foo", "bar", "baz", "seq"][i] + print("'{}' vs '{}': {}, {}".format(s, r, s == r, s is not r)) + # some convenient shorthands eq = ut.assertEqual + ne = ut.assertNotEqual fs = ut.assertFalse tr = ut.assertTrue # @@ -95,24 +105,45 @@ def check(self, ut, t): eq(num, 5) eq(num, 
class RunResults:
    """Timing summary for one benchmarked approach.

    Args:
        name: label of the run, "<benchmark>:<approach class name>".
        time: total elapsed time, in milliseconds.
        count: number of times the benchmarked method was called.
        MB: total amount of input consumed over all calls. Despite the
            name, the caller passes a byte count.
        timeit: raw value returned by ``timeit.Timer.autorange()``.
    """

    __slots__ = ('name', 'count', 'time', 'avg', 'MBps', 'timeit')

    def __init__(self, name, time, count, MB, timeit):
        self.name = name
        self.time = time
        self.count = count
        self.avg = time / count
        # bytes / ms / 1000 == (bytes / 1e6) / (ms / 1e3) == MB/s
        self.MBps = MB / self.time / 1000.0
        self.timeit = timeit

    def __str__(self):
        fmt = "{}: count={} time={:.3f}ms avg={:.3f}ms MB/s={:.3f}"
        return fmt.format(self.name, self.count, self.time, self.avg, self.MBps)


class BmCase:
    """A benchmark input file, held as str, bytes and bytearray."""

    def __init__(self, filename):
        # decode explicitly: the text is re-encoded as utf8 below, so the
        # locale's default encoding must not leak into the measurement
        with open(filename, "r", encoding="utf8") as f:
            src = f.read()
        self.src_as_str = src
        self.src_as_bytes = bytes(src, "utf8")
        self.src_as_bytearray = bytearray(src, "utf8")

    def run(self, bm_name, cls):
        """Time method `bm_name` of a fresh `cls()` instance on this case.

        Returns a RunResults holding the elapsed wall time, the number of
        calls made by timeit and the number of bytes consumed.
        """
        obj = cls()
        method = getattr(obj, bm_name)
        self.count = 0
        self.MB = 0
        # throughput is measured in bytes, not in characters
        nbytes = len(self.src_as_bytes)

        def fn():
            method(self)
            self.count += 1
            self.MB += nbytes

        t = timeit.Timer(fn)
        # perf_counter is monotonic; time.time() may jump with clock changes
        start = time.perf_counter()
        result = t.autorange()
        delta = 1000. * (time.perf_counter() - start)
        name = bm_name + ":" + cls.__name__
        return RunResults(name, delta, self.count, self.MB, result)


class RymlRo:
    """ryml parsing a read-only buffer into a new tree on every call."""

    def parse(self, case):
        ryml.parse(case.src_as_bytes)


class RymlRoReuse:
    """ryml parsing a read-only buffer, reusing a single tree."""

    def __init__(self):
        self.tree = ryml.Tree()

    def parse(self, case):
        # reset and reuse the same tree; creating a new Tree here would
        # make this approach identical to RymlRo
        self.tree.clear()
        self.tree.clear_arena()
        ryml.parse(case.src_as_bytes, tree=self.tree)


class RymlInSitu:
    """ryml parsing a writeable buffer into a new tree on every call."""

    # NOTE(review): every call parses the same bytearray again; if
    # parse_in_situ rewrites the buffer, later iterations see the
    # already-processed text -- confirm this is acceptable.
    def parse(self, case):
        ryml.parse_in_situ(case.src_as_bytearray)


class RymlInSituReuse:
    """ryml parsing a writeable buffer, reusing a single tree."""

    def __init__(self):
        self.tree = ryml.Tree()

    def parse(self, case):
        self.tree.clear()
        self.tree.clear_arena()
        ryml.parse_in_situ(case.src_as_bytearray, tree=self.tree)


class RuamelYaml:
    """ruamel.yaml baseline."""

    def parse(self, case):
        # NOTE: the full Loader can construct arbitrary Python objects;
        # only run this benchmark on trusted input files.
        ruamel.yaml.load(case.src_as_str, Loader=ruamel.yaml.Loader)


class PyYaml:
    """PyYAML baseline."""

    def parse(self, case):
        yaml.safe_load(case.src_as_str)


def run(filename):
    """Benchmark every approach on `filename` and print a results table."""
    case = BmCase(filename)
    approaches = (RuamelYaml,
                  PyYaml,
                  RymlRo,
                  RymlRoReuse,
                  RymlInSitu,
                  RymlInSituReuse)
    benchmarks = ('parse', )

    def f(v):
        return "{:.3f}".format(v)

    for bm in benchmarks:
        results = odict()
        for cls in approaches:
            r = case.run(bm, cls)
            results[r.name] = r
            print(r)
        table = prettytable.PrettyTable()
        table.field_names = ["case", "count", "time(ms)", "avg(ms)", "avg_read(MB/s)"]
        table.align["case"] = "l"
        for v in results.values():
            table.add_row([v.name, v.count, f(v.time), f(v.avg), f(v.MBps)])
        print(table)


if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        # tell the user what is missing instead of raising an empty error
        raise SystemExit("usage: {} <yaml-or-json-file>".format(sys.argv[0]))
    run(sys.argv[1])
def walk(tree, node=None, indentation_level=0):
    """Yield (node id, depth) pairs for the subtree under `node`, pre-order.

    `node` defaults to the tree's root. `indentation_level` is the depth
    reported for `node` itself; each generation of children reports one
    more than its parent. Children are visited in sibling order.
    """
    assert tree is not None
    if node is None:
        node = tree.root_id()
    # explicit stack instead of recursion; entries are (node id, depth)
    pending = [(node, indentation_level)]
    while pending:
        current, depth = pending.pop()
        yield current, depth
        # gather the children in sibling order...
        kids = []
        child = tree.first_child(current)
        while child != NONE:
            kids.append(child)
            child = tree.next_sibling(child)
        # ...and push them reversed, so the first child is popped first
        for child in reversed(kids):
            pending.append((child, depth + 1))