From e0096f2f9d0036e02f16eb0b111fd68470758351 Mon Sep 17 00:00:00 2001 From: Sean Busbey Date: Mon, 14 Sep 2015 00:03:51 +0000 Subject: [PATCH] AVRO-1694. Ruby: Schema normalization and fingerprints. Contributed by Daniel Schierbeck. * Avro::SchemaNormalization.to_parsing_form converts a schema to Parsing Canonical Form * support for MD5 and SHA256 fingerprints This closes #40 git-svn-id: https://svn.apache.org/repos/asf/avro/trunk@1702839 13f79535-47bb-0310-9956-ffa450edef68 --- lang/ruby/lib/avro.rb | 1 + lang/ruby/lib/avro/schema.rb | 12 ++ lang/ruby/lib/avro/schema_normalization.rb | 83 ++++++++++ lang/ruby/test/case_finder.rb | 67 ++++++++ lang/ruby/test/test_fingerprints.rb | 37 +++++ lang/ruby/test/test_schema_normalization.rb | 171 ++++++++++++++++++++ 6 files changed, 371 insertions(+) create mode 100644 lang/ruby/lib/avro/schema_normalization.rb create mode 100644 lang/ruby/test/case_finder.rb create mode 100644 lang/ruby/test/test_fingerprints.rb create mode 100644 lang/ruby/test/test_schema_normalization.rb diff --git a/lang/ruby/lib/avro.rb b/lang/ruby/lib/avro.rb index 902dcd88033..c419ab1e6b3 100644 --- a/lang/ruby/lib/avro.rb +++ b/lang/ruby/lib/avro.rb @@ -39,3 +39,4 @@ def initialize(schm=nil, datum=nil, msg=nil) require 'avro/data_file' require 'avro/protocol' require 'avro/ipc' +require 'avro/schema_normalization' diff --git a/lang/ruby/lib/avro/schema.rb b/lang/ruby/lib/avro/schema.rb index 142157775ee..87f6fa4804e 100644 --- a/lang/ruby/lib/avro/schema.rb +++ b/lang/ruby/lib/avro/schema.rb @@ -137,6 +137,18 @@ def initialize(type) # Deprecated in favor of {#type_sym}. def type; @type_sym.to_s; end + # Returns the MD5 fingerprint of the schema as an Integer. + def md5_fingerprint + parsing_form = SchemaNormalization.to_parsing_form(self) + Digest::MD5.hexdigest(parsing_form).to_i(16) + end + + # Returns the SHA-256 fingerprint of the schema as an Integer. 
+ def sha256_fingerprint + parsing_form = SchemaNormalization.to_parsing_form(self) + Digest::SHA256.hexdigest(parsing_form).to_i(16) + end + def ==(other, seen=nil) other.is_a?(Schema) && type_sym == other.type_sym end diff --git a/lang/ruby/lib/avro/schema_normalization.rb b/lang/ruby/lib/avro/schema_normalization.rb new file mode 100644 index 00000000000..0a5bee5d0cc --- /dev/null +++ b/lang/ruby/lib/avro/schema_normalization.rb @@ -0,0 +1,83 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +module Avro + class SchemaNormalization + def self.to_parsing_form(schema) + new.to_parsing_form(schema) + end + + def initialize + @processed_names = [] + end + + def to_parsing_form(schema) + JSON.dump(normalize_schema(schema)) + end + + private + + def normalize_schema(schema) + type = schema.type_sym.to_s + + if Schema::NAMED_TYPES.include?(type) + if @processed_names.include?(schema.name) + return schema.name + else + @processed_names << schema.name + end + end + + case type + when *Schema::PRIMITIVE_TYPES + type + when "record" + fields = schema.fields.map {|field| normalize_field(field) } + + normalize_named_type(schema, fields: fields) + when "enum" + normalize_named_type(schema, symbols: schema.symbols) + when "fixed" + normalize_named_type(schema, size: schema.size) + when "array" + { type: type, items: normalize_schema(schema.items) } + when "map" + { type: type, values: normalize_schema(schema.values) } + when "union" + if schema.schemas.nil? + [] + else + schema.schemas.map {|s| normalize_schema(s) } + end + else + raise "unknown type #{type}" + end + end + + def normalize_field(field) + { + name: field.name, + type: normalize_schema(field.type) + } + end + + def normalize_named_type(schema, attributes = {}) + name = Name.make_fullname(schema.name, schema.namespace) + + { name: name, type: schema.type_sym.to_s }.merge(attributes) + end + end +end diff --git a/lang/ruby/test/case_finder.rb b/lang/ruby/test/case_finder.rb new file mode 100644 index 00000000000..eb5424a09f4 --- /dev/null +++ b/lang/ruby/test/case_finder.rb @@ -0,0 +1,67 @@ +class CaseFinder + PATH = File.expand_path("../../../../share/test/data/schema-tests.txt", __FILE__) + + Case = Struct.new(:id, :input, :canonical, :fingerprint) + + def self.cases + new.cases + end + + def initialize + @scanner = StringScanner.new(File.read(PATH)) + @cases = [] + end + + def cases + until @scanner.eos? 
+ test_case = scan_case + @cases << test_case if test_case + end + + @cases + end + + private + + def scan_case + if id = @scanner.scan(/\/\/ \d+\n/) + while @scanner.skip(/\/\/ .*\n/); end + + input = scan_input + canonical = scan_canonical + fingerprint = scan_fingerprint + + Case.new(id, input, canonical, fingerprint) + else + @scanner.skip(/.*\n/) + nil + end + end + + def scan_item(name) + if @scanner.scan(/<<#{name}\n/) + lines = [] + while line = @scanner.scan(/.+\n/) + break if line.chomp == name + lines << line + end + lines.join + elsif @scanner.scan(/<<#{name} /) + input = @scanner.scan(/.+$/) + @scanner.skip(/\n/) + input + end + end + + def scan_input + scan_item("INPUT") + end + + def scan_canonical + scan_item("canonical") + end + + def scan_fingerprint + scan_item("fingerprint") + end +end diff --git a/lang/ruby/test/test_fingerprints.rb b/lang/ruby/test/test_fingerprints.rb new file mode 100644 index 00000000000..d43aad86ed9 --- /dev/null +++ b/lang/ruby/test/test_fingerprints.rb @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +require 'test_help' + +class TestFingerprints < Test::Unit::TestCase + def test_md5_fingerprint + schema = Avro::Schema.parse <<-SCHEMA + { "type": "int" } + SCHEMA + + assert_equal 318112854175969537208795771590915775282, + schema.md5_fingerprint + end + + def test_sha256_fingerprint + schema = Avro::Schema.parse <<-SCHEMA + { "type": "int" } + SCHEMA + + assert_equal 28572620203319713300323544804233350633246234624932075150020181448463213378117, + schema.sha256_fingerprint + end +end diff --git a/lang/ruby/test/test_schema_normalization.rb b/lang/ruby/test/test_schema_normalization.rb new file mode 100644 index 00000000000..f61dd2f0e93 --- /dev/null +++ b/lang/ruby/test/test_schema_normalization.rb @@ -0,0 +1,171 @@ +# -*- coding: utf-8 -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +require 'test_help' +require 'case_finder' + +class TestSchemaNormalization < Test::Unit::TestCase + def test_primitives + %w[null boolean string bytes int long float double].each do |type| + schema = Avro::Schema.parse(<<-JSON) + { "type": "#{type}" } + JSON + + canonical_form = Avro::SchemaNormalization.to_parsing_form(schema) + + assert_equal %("#{type}"), canonical_form + end + end + + def test_records + schema = Avro::Schema.parse(<<-JSON) + { + "type": "record", + "name": "test", + "namespace": "random", + "doc": "some record", + "fields": [ + { "name": "height", "type": "int", "doc": "the height" } + ] + } + JSON + + expected_type = <<-JSON.strip + {"name":"random.test","type":"record","fields":[{"name":"height","type":"int"}]} + JSON + + canonical_form = Avro::SchemaNormalization.to_parsing_form(schema) + + assert_equal expected_type, canonical_form + end + + def test_recursive_records + schema = Avro::Schema.parse(<<-JSON) + { + "type": "record", + "name": "item", + "fields": [ + { "name": "next", "type": "item" } + ] + } + JSON + + expected_type = <<-JSON.strip + {"name":"item","type":"record","fields":[{"name":"next","type":"item"}]} + JSON + + canonical_form = Avro::SchemaNormalization.to_parsing_form(schema) + + assert_equal expected_type, canonical_form + end + + def test_enums + schema = Avro::Schema.parse(<<-JSON) + { + "type": "enum", + "name": "suit", + "namespace": "cards", + "doc": "the different suits of cards", + "symbols": ["club", "hearts", "diamond", "spades"] + } + JSON + + expected_type = <<-JSON.strip + {"name":"cards.suit","type":"enum","symbols":["club","hearts","diamond","spades"]} + JSON + + canonical_form = Avro::SchemaNormalization.to_parsing_form(schema) + + assert_equal expected_type, canonical_form + end + + def test_fixed + schema = Avro::Schema.parse(<<-JSON) + { + "type": "fixed", + "name": "id", + "namespace": "db", + "size": 64 + } + JSON + + expected_type = <<-JSON.strip + {"name":"db.id","type":"fixed","size":64} + 
JSON + + canonical_form = Avro::SchemaNormalization.to_parsing_form(schema) + + assert_equal expected_type, canonical_form + end + + def test_arrays + schema = Avro::Schema.parse(<<-JSON) + { + "type": "array", + "doc": "the items", + "items": "int" + } + JSON + + expected_type = <<-JSON.strip + {"type":"array","items":"int"} + JSON + + canonical_form = Avro::SchemaNormalization.to_parsing_form(schema) + + assert_equal expected_type, canonical_form + end + + def test_maps + schema = Avro::Schema.parse(<<-JSON) + { + "type": "map", + "doc": "the items", + "values": "int" + } + JSON + + expected_type = <<-JSON.strip + {"type":"map","values":"int"} + JSON + + canonical_form = Avro::SchemaNormalization.to_parsing_form(schema) + + assert_equal expected_type, canonical_form + end + + def test_unions + schema = Avro::Schema.parse(<<-JSON) + ["int", "string"] + JSON + + expected_type = <<-JSON.strip + ["int","string"] + JSON + + canonical_form = Avro::SchemaNormalization.to_parsing_form(schema) + + assert_equal expected_type, canonical_form + end + + def test_shared_dataset + CaseFinder.cases.each do |test_case| + schema = Avro::Schema.parse(test_case.input) + assert_equal test_case.canonical, Avro::SchemaNormalization.to_parsing_form(schema) + end + end +end