From c52bbd6b32f332f79d22d615baf07e8162c32830 Mon Sep 17 00:00:00 2001 From: Erik Forsberg Date: Mon, 24 Oct 2016 14:10:36 +0200 Subject: [PATCH 1/3] AVRO-1938: Py2 support for generating canonical form of schemas --- lang/py/src/avro/schema.py | 109 ++++++++++++++++++++++++++++++++++-- lang/py/test/test_schema.py | 14 +++-- 2 files changed, 115 insertions(+), 8 deletions(-) diff --git a/lang/py/src/avro/schema.py b/lang/py/src/avro/schema.py index 6a7fbbb8c8b..6be2c8537c5 100644 --- a/lang/py/src/avro/schema.py +++ b/lang/py/src/avro/schema.py @@ -33,10 +33,8 @@ A boolean; or Null. """ -try: - import json -except ImportError: - import simplejson as json +# Require simplejson as it supports item_sort_key, which we use to sort properties when emitting canonical form of schema +import simplejson as json # # Constants @@ -94,6 +92,16 @@ 'ignore', ) +CANONICAL_FIELD_ORDER = ( + 'name', + 'type', + 'fields', + 'symbols', + 'items', + 'values', + 'size' +) + # # Exceptions # @@ -110,6 +118,7 @@ class SchemaParseException(AvroException): class Schema(object): """Base class for all Schema classes.""" + def __init__(self, type, other_props=None): # Ensure valid ctor args if not isinstance(type, basestring): @@ -153,6 +162,34 @@ def to_json(self, names): """ raise Exception("Must be implemented by subclasses.") + def to_canonical_json(self, names): + """ + Converts the schema object into its Canonical Form + + http://avro.apache.org/docs/current/spec.html#Parsing+Canonical+Form+for+Schemas + """ + raise NotImplementedError("to_canonical_json must be implemented by subclasses") + + @staticmethod + def _keep_canonical_properties(props): + to_dump = props.copy() + for key in props.keys(): + if key not in CANONICAL_FIELD_ORDER: + del to_dump[key] + return to_dump + + @staticmethod + def _sort_properties(item): + key = item[0] + if key in CANONICAL_FIELD_ORDER: + return CANONICAL_FIELD_ORDER.index(key) + else: + raise RuntimeError("Unsortable item %r" % (item,)) + + def 
canonical_form(self): + return json.dumps(self.to_canonical_json(None), item_sort_key=Schema._sort_properties, + separators=(',', ':')) + class Name(object): """Class to describe Avro name.""" @@ -376,6 +413,13 @@ def to_json(self, names=None): to_dump['type'] = self.type.to_json(names) return to_dump + def to_canonical_json(self, names=None): + if names is None: + names = Names() + to_dump = Schema._keep_canonical_properties(self.props) + to_dump["type"] = self.type.to_canonical_json(names) + return to_dump + def __eq__(self, that): to_cmp = json.loads(str(self)) return to_cmp == json.loads(str(that)) @@ -401,6 +445,12 @@ def to_json(self, names=None): else: return self.props + def to_canonical_json(self, names=None): + if len(self.props) == 1: + return self.fullname + else: + return self._keep_canonical_properties(self.props) + def __eq__(self, that): return self.props == that.props @@ -433,6 +483,12 @@ def to_json(self, names=None): names.names[self.fullname] = self return names.prune_namespace(self.props) + def to_canonical_json(self, names): + to_dump = self._keep_canonical_properties(self.to_json()) + to_dump["name"] = self.fullname + + return to_dump + def __eq__(self, that): return self.props == that.props @@ -469,6 +525,12 @@ def to_json(self, names=None): names.names[self.fullname] = self return names.prune_namespace(self.props) + def to_canonical_json(self, names=None): + to_dump = self._keep_canonical_properties(self.to_json(names)) + to_dump["name"] = self.fullname + + return to_dump + def __eq__(self, that): return self.props == that.props @@ -504,6 +566,14 @@ def to_json(self, names=None): to_dump['items'] = item_schema.to_json(names) return to_dump + def to_canonical_json(self, names=None): + if names is None: + names = Names() + to_dump = self._keep_canonical_properties(self.props) + item_schema = self.get_prop("items") + to_dump["items"] = item_schema.to_canonical_json(names) + return to_dump + def __eq__(self, that): to_cmp = 
json.loads(str(self)) return to_cmp == json.loads(str(that)) @@ -535,6 +605,13 @@ def to_json(self, names=None): to_dump['values'] = self.get_prop('values').to_json(names) return to_dump + def to_canonical_json(self, names): + if names is None: + names = Names() + to_dump = self._keep_canonical_properties(self.props) + to_dump["values"] = self.get_prop("values").to_canonical_json(names) + return to_dump + def __eq__(self, that): to_cmp = json.loads(str(self)) return to_cmp == json.loads(str(that)) @@ -583,6 +660,11 @@ def to_json(self, names=None): to_dump.append(schema.to_json(names)) return to_dump + def to_canonical_json(self, names): + if names is None: + names = Names() + return [schema.to_canonical_json(names) for schema in self.schemas] + def __eq__(self, that): to_cmp = json.loads(str(self)) return to_cmp == json.loads(str(that)) @@ -692,6 +774,25 @@ def to_json(self, names=None): to_dump['fields'] = [ f.to_json(names) for f in self.fields ] return to_dump + def to_canonical_json(self, names): + if names is None: + names = Names() + + if self.type == 'request': + raise NotImplementedError("Canonical form (probably) does not make sense on type request") + + to_dump = self._keep_canonical_properties(self.props) + to_dump["name"] = self.fullname + + if self.fullname in names.names: + return self.name_ref(names) + else: + names.names[self.fullname] = self + + to_dump["fields"] = [f.to_canonical_json(names) for f in self.fields] + + return to_dump + def __eq__(self, that): to_cmp = json.loads(str(self)) return to_cmp == json.loads(str(that)) diff --git a/lang/py/test/test_schema.py b/lang/py/test/test_schema.py index 00e2a05de81..d36606294b3 100644 --- a/lang/py/test/test_schema.py +++ b/lang/py/test/test_schema.py @@ -295,7 +295,8 @@ def make_primitive_examples(): """, True) ] -EXAMPLES = PRIMITIVE_EXAMPLES +EXAMPLES = [] +EXAMPLES += PRIMITIVE_EXAMPLES EXAMPLES += FIXED_EXAMPLES EXAMPLES += ENUM_EXAMPLES EXAMPLES += ARRAY_EXAMPLES @@ -484,12 +485,17 @@ def 
test_exception_is_not_swallowed_on_parse_error(self): schema.parse('/not/a/real/file') caught_exception = False except schema.SchemaParseException, e: - expected_message = 'Error parsing JSON: /not/a/real/file, error = ' \ - 'No JSON object could be decoded' - self.assertEqual(expected_message, e.args[0]) + expected_messages = ['Error parsing JSON: /not/a/real/file, error = No JSON object could be decoded', + 'Error parsing JSON: /not/a/real/file, error = Expecting value: line 1 column 1 (char 0)'] + assert e.args[0] in expected_messages caught_exception = True self.assertTrue(caught_exception, 'Exception was not caught') + def test_canonical_form(self): + for example in VALID_EXAMPLES: + # Quick test: We can make canonical schemas of all valid schemas + schema.parse(example.schema_string).canonical_form() + if __name__ == '__main__': unittest.main() From 8521011a5d12c02b16440710a909759b762b65ac Mon Sep 17 00:00:00 2001 From: Erik Forsberg Date: Mon, 24 Oct 2016 14:41:02 +0200 Subject: [PATCH 2/3] AVRO-1938: Argh.. 
Actually force simplejson from setup.py --- lang/py/setup.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/lang/py/setup.py b/lang/py/setup.py index 0d3c9b9e7a0..f544f481c2f 100755 --- a/lang/py/setup.py +++ b/lang/py/setup.py @@ -19,11 +19,6 @@ from setuptools import setup except ImportError: from distutils.core import setup -from sys import version_info - -install_requires = [] -if version_info[:2] <= (2, 5): - install_requires.append('simplejson >= 2.0.9') setup( name = 'avro', @@ -37,7 +32,7 @@ # Project uses simplejson, so ensure that it gets installed or upgraded # on the target machine - install_requires = install_requires, + install_requires = ['simplejson>=2.0.9'], # metadata for upload to PyPI author = 'Apache Avro', From 19f76341d7a219b148653caa8436f5e2234b8476 Mon Sep 17 00:00:00 2001 From: Erik Forsberg Date: Tue, 7 Feb 2017 10:05:54 +0100 Subject: [PATCH 3/3] AVRO-1938: Support canonical form of named types in unions --- lang/py/src/avro/schema.py | 8 ++++++-- lang/py/test/test_schema.py | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/lang/py/src/avro/schema.py b/lang/py/src/avro/schema.py index 6be2c8537c5..6ec4a14156f 100644 --- a/lang/py/src/avro/schema.py +++ b/lang/py/src/avro/schema.py @@ -526,8 +526,12 @@ def to_json(self, names=None): return names.prune_namespace(self.props) def to_canonical_json(self, names=None): - to_dump = self._keep_canonical_properties(self.to_json(names)) - to_dump["name"] = self.fullname + names_as_json = self.to_json(names) + if isinstance(names_as_json, basestring): + to_dump = self.fullname + else: + to_dump = self._keep_canonical_properties(names_as_json) + to_dump["name"] = self.fullname return to_dump diff --git a/lang/py/test/test_schema.py b/lang/py/test/test_schema.py index d36606294b3..34715fb3c96 100644 --- a/lang/py/test/test_schema.py +++ b/lang/py/test/test_schema.py @@ -131,6 +131,20 @@ def make_primitive_examples(): """, False), ] 
+NAMED_IN_UNION_EXAMPLES = [ + ExampleSchema("""{"namespace": "org.apache.avro.test", + "type": "record", + "name": "Test", + "fields": [{"type": {"symbols": ["one", "two"], + "type": "enum", + "name": "NamedEnum"}, + "name": "thenamedenum"}, + {"type": ["null", "NamedEnum"], + "name": "unionwithreftoenum"} + ] + }""", True) +] + RECORD_EXAMPLES = [ ExampleSchema("""\ {"type": "record", @@ -302,6 +316,7 @@ def make_primitive_examples(): EXAMPLES += ARRAY_EXAMPLES EXAMPLES += MAP_EXAMPLES EXAMPLES += UNION_EXAMPLES +EXAMPLES += NAMED_IN_UNION_EXAMPLES EXAMPLES += RECORD_EXAMPLES EXAMPLES += DOC_EXAMPLES