Skip to content

Commit

Permalink
python API: unit tests, benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
biojppm committed Mar 19, 2019
1 parent 4cd4883 commit 3ad4484
Show file tree
Hide file tree
Showing 7 changed files with 284 additions and 13 deletions.
107 changes: 104 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ include analysing ryml with:
* undefined behavior
* thread

ryml is also partially available in Python, with more languages to follow (see
below).


------

Expand Down Expand Up @@ -677,6 +680,105 @@ do need this, then you will need to declare and use an allocator from a ryml
memory resource that outlives the tree and/or parser.
------
## Other languages
One of the aims of ryml is to provide an efficient YAML API for other
languages. There's already a cursory implementation for Python (using only
the low-level API). After ironing out the general approach, other languages
are likely to follow: probably (in order) JavaScript, C#, Java, Ruby, PHP,
Octave and R (all of this is possible because we're
using [SWIG](http://www.swig.org/), which makes it easy to do so).

### Python

(Note that this is a work in progress. Additions will be made and things will
be changed.) With that said, here's an example of the Python API:

```python
import ryml

# because ryml does not take ownership of the source buffer
# ryml cannot accept strings; only bytes or bytearrays
src = b"{HELLO: a, foo: b, bar: c, baz: d, seq: [0, 1, 2, 3]}"

def check(tree):
    """Exercise the index-based low-level API on the tree parsed from ``src``.

    Node ids for ``src``: 0 is the root map, 1-4 are the HELLO/foo/bar/baz
    key-value pairs, 5 is the ``seq`` sequence and 6-9 are its elements.
    """
    # for now, only the index-based low-level API is implemented
    assert tree.size() == 10
    assert tree.root_id() == 0
    assert tree.first_child(0) == 1
    assert tree.next_sibling(1) == 2
    assert tree.first_sibling(5) == 1
    assert tree.last_sibling(1) == 5
    # use bytes objects for queries
    assert tree.find_child(0, b"foo") == 2
    assert tree.key(2) == b"foo"
    assert tree.val(2) == b"b"
    assert tree.find_child(0, b"seq") == 5
    assert tree.is_seq(5)
    # to loop over children:
    for i, ch in enumerate(ryml.children(tree, 5)):
        assert tree.val(ch) == [b"0", b"1", b"2", b"3"][i]
    # to loop over siblings:
    for i, sib in enumerate(ryml.siblings(tree, 5)):
        assert tree.key(sib) == [b"HELLO", b"foo", b"bar", b"baz", b"seq"][i]
    # to walk over all elements
    visited = [False] * tree.size()
    for n, indentation_level in ryml.walk(tree):
        # just a dumb emitter
        left = " " * indentation_level
        if tree.is_keyval(n):
            print("{}{}: {}".format(left, tree.key(n), tree.val(n)))
        elif tree.is_val(n):
            print("{}- {}".format(left, tree.val(n)))
        elif tree.is_keyseq(n):
            print("{}{}:".format(left, tree.key(n)))
        visited[n] = True
    assert all(visited)
    # NOTE about encoding!
    k = tree.get_key(5)
    print(k)  # '<memory at 0x7f80d5b93f48>'
    # the key compares equal to bytes...
    assert k == b"seq"
    # ...but never to a str - NOTE THIS!
    assert k != "seq"
    # str(k) without an encoding does not yield the text either
    assert str(k) != "seq"
    # decode explicitly to get a str
    assert str(k, "utf8") == "seq"

# parse a read-only buffer: src is an immutable bytes object
tree = ryml.parse(src)
check(tree) # OK

# parsing in place also works, but requires a bytearray or
# another object offering writeable memory
mutable = bytearray(src)
tree = ryml.parse_in_situ(mutable)
check(tree) # OK
```

As expected, the performance results so far are encouraging. In
a [timeit benchmark](api/python/parse_bm.py) compared
against [PyYaml](https://pyyaml.org/)
and [ruamel.yaml](https://yaml.readthedocs.io/en/latest/), ryml parses
faster by a factor of 30 to 50:

```
+-----------------------+-------+----------+---------+----------------+
| case | iters | time(ms) | avg(ms) | avg_read(MB/s) |
+-----------------------+-------+----------+---------+----------------+
| parse:RuamelYaml | 88 | 800.483 | 9.096 | 0.234 |
| parse:PyYaml | 88 | 541.370 | 6.152 | 0.346 |
| parse:RymlRo | 3888 | 776.020 | 0.200 | 10.667 |
| parse:RymlRoReuse | 1888 | 381.558 | 0.202 | 10.535 |
| parse:RymlInSitu | 3888 | 775.121 | 0.199 | 10.679 |
| parse:RymlInSituReuse | 3888 | 774.534 | 0.199 | 10.687 |
+-----------------------+-------+----------+---------+----------------+
```

(Note that the results above are biased in favour of ryml, because ryml does
not perform any type conversions: returned values are merely `memoryview`s
into the source buffer.)


------

## YAML standard conformance
Expand All @@ -698,12 +800,11 @@ appear some cases which YAML fails to parse. So we welcome
your
[bug reports or pull requests!](https://github.com/biojppm/rapidyaml/issues).

Integration of the ~300 cases in
the [YAML test suite](https://github.com/yaml/yaml-test-suite) is ongoing
Integration of the ~300 cases in the
[YAML test suite](https://github.com/yaml/yaml-test-suite) is ongoing
work.



------

## Alternative libraries
Expand Down
8 changes: 8 additions & 0 deletions api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,14 @@ if(RYML_BUILD_API_PYTHON3)
endfunction()

add_python_test(parse.py)
#if(RYML_BUILD_BENCHMARKS)
# c4_add_benchmark_cmd(ryml ryml-python3-api-bm-travis-ryml
# COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/../bm/cases/travis.yml ryml)
# c4_add_benchmark_cmd(ryml ryml-python3-api-bm-appveyor-ryml
# COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/../bm/cases/appveyor.yml ryml)
# c4_add_benchmark_cmd(ryml ryml-python3-api-bm-compile_commands-ryml
# COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/../bm/cases/compile_commands.json ryml)
#endif()
endif()


Expand Down
33 changes: 32 additions & 1 deletion api/python/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,18 @@ class SimpleHardcoded:
yaml = "{HELLO: a, foo: b, bar: c, baz: d, seq: [0, 1, 2, 3]}"

def check(self, ut, t):

for i, sib in enumerate(ryml.siblings(t, 5)):
s = t.key(sib)
r = [b"HELLO", b"foo", b"bar", b"baz", b"seq"][i]
print("'{}' vs '{}': {}, {}".format(s, r, s == r, s is not r))
s = str(t.key(sib), "utf8")
r = ["HELLO", "foo", "bar", "baz", "seq"][i]
print("'{}' vs '{}': {}, {}".format(s, r, s == r, s is not r))

# some convenient shorthands
eq = ut.assertEqual
ne = ut.assertNotEqual
fs = ut.assertFalse
tr = ut.assertTrue
#
Expand Down Expand Up @@ -95,24 +105,45 @@ def check(self, ut, t):
eq(num, 5)
eq(num, t.num_siblings(t.first_child(t.root_id())))
#

for i, ch in enumerate(ryml.children(t, 5)):
eq(t.val(ch), [b"0", b"1", b"2", b"3"][i])

sibs = [b"HELLO", b"foo", b"bar", b"baz", b"seq"]
sibs_s = ["HELLO", "foo", "bar", "baz", "seq"]
for i, sib in enumerate(ryml.siblings(t, 5)):
k = t.key(sib)
k_s = str(k, "utf8")
eq(k, sibs[i])
eq(k_s, sibs_s[i])
ne(k, sibs_s[i])
ne(k_s, sibs[i])
k_s = str(k)
ne(k_s, sibs_s[i])
ne(k_s, sibs[i])

num = 0
for id in ryml.siblings(t, 0):
num += 1
eq(num, 1)
#
num = 0
for id in ryml.walk(t):
for id, level in ryml.walk(t):
num += 1
if t.is_root(id):
eq(id, 0)
eq(level, 0)
if t.is_map(id):
eq(id, 0)
eq(level, 0)
if t.is_seq(id):
eq(id, 5)
eq(level, 1)
if t.is_keyval(id):
tr(id > 0 and id < 5)
if t.is_val(id):
tr(id > 5)
eq(level, 2)
eq(num, t.size())
#
num = 0
Expand Down
128 changes: 128 additions & 0 deletions api/python/parse_bm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import ryml
import ruamel.yaml
import yaml
import timeit
import time
import prettytable
from collections import OrderedDict as odict


class RunResults:
    """Summary of one benchmark run.

    ``time`` is the total elapsed time in milliseconds and ``count`` the
    number of iterations; ``avg`` is their quotient. ``MBps`` is computed as
    ``MB / time / 1000``, which is MB/s when ``MB`` is a size in bytes.
    ``timeit`` stores whatever the caller passes, unmodified.
    """

    __slots__ = ('name', 'count', 'time', 'avg', 'MBps', 'timeit')

    def __init__(self, name, time, count, MB, timeit):
        self.name, self.time, self.count = name, time, count
        # derived figures
        self.avg = time / count
        self.MBps = MB / time / 1000.0
        self.timeit = timeit

    def __str__(self):
        template = "{}: count={} time={:.3f}ms avg={:.3f}ms MB/s={:.3f}"
        return template.format(
            self.name, self.count, self.time, self.avg, self.MBps)


class BmCase:
    """A benchmark input: one file's contents held as str, bytes and bytearray.

    Attributes set by ``__init__``:
      src_as_str        the decoded text
      src_as_bytes      the text encoded as UTF-8 (immutable)
      src_as_bytearray  the text encoded as UTF-8 (mutable)
      src_num_bytes     size of the UTF-8 encoding, used for throughput
    """

    def __init__(self, filename):
        # Decode explicitly as UTF-8 so that the str agrees with the
        # bytes/bytearray forms below, whatever the platform default is.
        with open(filename, "r", encoding="utf8") as f:
            src = f.read()
        self.src_as_str = src
        self.src_as_bytes = bytes(src, "utf8")
        self.src_as_bytearray = bytearray(src, "utf8")
        self.src_num_bytes = len(self.src_as_bytes)

    def run(self, bm_name, cls):
        """Time method ``bm_name`` of a fresh ``cls()`` instance on this case.

        The method is called as ``method(self)`` repeatedly under
        ``timeit.Timer.autorange()``. Returns a ``RunResults`` named
        ``"<bm_name>:<cls.__name__>"`` carrying the elapsed time in
        milliseconds, the number of calls, the number of bytes processed and
        the raw ``autorange()`` result.
        """
        obj = cls()
        method = getattr(obj, bm_name)
        self.count = 0
        self.MB = 0

        def fn():
            method(self)
            self.count += 1
            # count bytes, not characters, so MB/s is right for non-ASCII input
            self.MB += self.src_num_bytes

        t = timeit.Timer(fn)
        # perf_counter is monotonic and high-resolution, unlike time.time()
        start = time.perf_counter()
        result = t.autorange()
        delta = 1000. * (time.perf_counter() - start)
        name = bm_name + ":" + cls.__name__
        return RunResults(name, delta, self.count, self.MB, result)


class RymlRo:
    """Benchmark approach: ``ryml.parse`` without passing a tree."""

    def parse(self, case):
        # the parsed result is discarded; only the parse itself is timed
        ryml.parse(case.src_as_bytearray)


class RymlRoReuse:
    """Benchmark approach: ``ryml.parse`` into one tree reused across calls."""

    def __init__(self):
        self.tree = ryml.Tree()

    def parse(self, case):
        # Reuse the tree created in __init__ instead of allocating a new
        # ryml.Tree() per call; otherwise this measures the same thing as
        # RymlRo. Reset it first, as RymlInSituReuse does.
        self.tree.clear()
        self.tree.clear_arena()
        ryml.parse(case.src_as_bytearray, tree=self.tree)



class RymlInSitu:
    """Benchmark approach: ``ryml.parse_in_situ`` without passing a tree."""

    def parse(self, case):
        # the parsed result is discarded; only the parse itself is timed
        ryml.parse_in_situ(case.src_as_bytearray)


class RymlInSituReuse:
    """Benchmark approach: ``ryml.parse_in_situ`` into one reused tree."""

    def __init__(self):
        self.tree = ryml.Tree()

    def parse(self, case):
        tree = self.tree
        # reset the tree before parsing into it again
        tree.clear()
        tree.clear_arena()
        ryml.parse_in_situ(case.src_as_bytearray, tree=tree)


class RuamelYaml:
    """Benchmark approach: ``ruamel.yaml.load`` on the source as ``str``."""

    def parse(self, case):
        # NOTE(review): ruamel.yaml.Loader is not the safe loader; this is
        # presumably fine because inputs are local benchmark files - confirm.
        ruamel.yaml.load(case.src_as_str, Loader=ruamel.yaml.Loader)


class PyYaml:
    """Benchmark approach: PyYAML's ``yaml.safe_load`` on the source as ``str``."""

    def parse(self, case):
        # the loaded document is discarded; only the load itself is timed
        yaml.safe_load(case.src_as_str)


def run(filename):
    """Benchmark every approach on ``filename`` and print the results.

    Each result is printed as soon as it is available; once all approaches
    have run for a benchmark, a summary table is printed.
    """
    case = BmCase(filename)
    approaches = (
        RuamelYaml,
        PyYaml,
        RymlRo,
        RymlRoReuse,
        RymlInSitu,
        RymlInSituReuse,
    )
    benchmarks = ('parse', )
    for bm in benchmarks:
        # keyed by result name, in the order the approaches were run
        results = odict()
        for cls in approaches:
            outcome = case.run(bm, cls)
            results[outcome.name] = outcome
            print(outcome)
        table = prettytable.PrettyTable()
        table.field_names = ["case", "count", "time(ms)", "avg(ms)", "avg_read(MB/s)"]
        table.align["case"] = "l"
        for res in results.values():
            figures = ["{:.3f}".format(x) for x in (res.time, res.avg, res.MBps)]
            table.add_row([res.name, res.count] + figures)
        print(table)


if __name__ == "__main__":
    import sys
    # Exit with a usage message rather than an exception with an empty
    # message when the input file is missing from the command line.
    if len(sys.argv) < 2:
        sys.exit("usage: {} <yaml-file>".format(sys.argv[0]))
    filename = sys.argv[1]
    run(filename)
3 changes: 3 additions & 0 deletions api/python/requirements_dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ruamel.yaml
pyyaml
prettytable
16 changes: 8 additions & 8 deletions api/ryml.i
Original file line number Diff line number Diff line change
Expand Up @@ -100,16 +100,16 @@ using csubstr = c4::csubstr;

void parse_csubstr(c4::csubstr s, c4::yml::Tree *t)
{
printf("PARSE READONLY: s=%.*s\n", (int)s.len, s.str);
//printf("PARSE READONLY: s=%.*s\n", (int)s.len, s.str);
c4::yml::parse(s, t);
printf("PARSE READONLY OK: tree size=%zu\n", t->size());
//printf("PARSE READONLY OK: tree size=%zu\n", t->size());
}

void parse_substr(c4::substr s, c4::yml::Tree *t)
{
printf("PARSE INPLACE: s=%.*s\n", (int)s.len, s.str);
//printf("PARSE INPLACE: s=%.*s\n", (int)s.len, s.str);
c4::yml::parse(s, t);
printf("PARSE INPLACE OK: tree size=%zu\n", t->size());
//printf("PARSE INPLACE OK: tree size=%zu\n", t->size());
}

%}
Expand Down Expand Up @@ -137,14 +137,14 @@ def siblings(tree, node):
ch = tree.next_sibling(ch)


def walk(tree, node=None):
def walk(tree, node=None, indentation_level=0):
assert tree is not None
if node is None: node = tree.root_id()
yield node
yield node, indentation_level
ch = tree.first_child(node)
while ch != NONE:
for gc in walk(tree, ch):
yield gc
for gc, il in walk(tree, ch, indentation_level + 1):
yield gc, il
ch = tree.next_sibling(ch)


Expand Down
2 changes: 1 addition & 1 deletion extern/c4core
Submodule c4core updated 1 files
+1 −1 cmake

0 comments on commit 3ad4484

Please sign in to comment.