Skip to content

Commit

Permalink
AVRO-1694. Ruby: Schema normalization and fingerprints. Contributed by…
Browse files Browse the repository at this point in the history
… Daniel Schierbeck.

* Avro::SchemaNormalization.to_parsing_form converts a schema to Parsing
Canonical Form
* support for MD5 and SHA256 fingerprints

This closes #40



git-svn-id: https://svn.apache.org/repos/asf/avro/trunk@1702839 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
busbey committed Sep 14, 2015
1 parent 4686716 commit e0096f2
Show file tree
Hide file tree
Showing 6 changed files with 371 additions and 0 deletions.
1 change: 1 addition & 0 deletions lang/ruby/lib/avro.rb
Expand Up @@ -39,3 +39,4 @@ def initialize(schm=nil, datum=nil, msg=nil)
require 'avro/data_file'
require 'avro/protocol'
require 'avro/ipc'
require 'avro/schema_normalization'
12 changes: 12 additions & 0 deletions lang/ruby/lib/avro/schema.rb
Expand Up @@ -137,6 +137,18 @@ def initialize(type)
# Returns the schema's type symbol rendered as a String.
#
# Deprecated in favor of {#type_sym}.
def type
@type_sym.to_s
end

# Computes the MD5 digest of this schema's parsing form (as produced by
# SchemaNormalization.to_parsing_form) and returns it as an Integer.
def md5_fingerprint
hex_digest = Digest::MD5.hexdigest(SchemaNormalization.to_parsing_form(self))
hex_digest.hex
end

# Computes the SHA-256 digest of this schema's parsing form (as produced by
# SchemaNormalization.to_parsing_form) and returns it as an Integer.
def sha256_fingerprint
hex_digest = Digest::SHA256.hexdigest(SchemaNormalization.to_parsing_form(self))
hex_digest.hex
end

# Two schemas compare equal here when the other object is a Schema and
# both report the same type symbol. The +seen+ argument is accepted but
# not used by this implementation.
def ==(other, seen=nil)
return false unless other.is_a?(Schema)

type_sym == other.type_sym
end
Expand Down
83 changes: 83 additions & 0 deletions lang/ruby/lib/avro/schema_normalization.rb
@@ -0,0 +1,83 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

module Avro
  # Converts a schema object into its Parsing Canonical Form, serialized as
  # a compact JSON string. Only the attributes relevant to parsing are kept
  # (name, type, fields, symbols, items, values, size); named types are
  # written with their fullname.
  class SchemaNormalization
    # Normalizes +schema+ with a fresh instance, so the set of already
    # emitted names never leaks between independent calls.
    #
    # @param schema [Avro::Schema] the schema to normalize
    # @return [String] the canonical form as JSON text
    def self.to_parsing_form(schema)
      new.to_parsing_form(schema)
    end

    def initialize
      # Fullnames of the named types that have already been written out in
      # full; later occurrences are replaced by a reference to the name.
      @processed_names = []
    end

    # @param schema [Avro::Schema] the schema to normalize
    # @return [String] the canonical form as JSON text
    # @raise [RuntimeError] if a schema of an unsupported type is met
    def to_parsing_form(schema)
      JSON.dump(normalize_schema(schema))
    end

    private

    # Recursively builds the JSON-ready structure (String, Array or Hash)
    # for +schema+.
    def normalize_schema(schema)
      type = schema.type_sym.to_s

      if Schema::NAMED_TYPES.include?(type)
        # Track and reference named types by fullname: two types sharing a
        # short name in different namespaces are distinct, and a reference
        # must stay unambiguous once the namespace attribute is dropped.
        fullname = Name.make_fullname(schema.name, schema.namespace)
        return fullname if @processed_names.include?(fullname)

        @processed_names << fullname
      end

      case type
      when *Schema::PRIMITIVE_TYPES
        type
      when "record"
        fields = schema.fields.map { |field| normalize_field(field) }

        normalize_named_type(schema, fields: fields)
      when "enum"
        normalize_named_type(schema, symbols: schema.symbols)
      when "fixed"
        normalize_named_type(schema, size: schema.size)
      when "array"
        { type: type, items: normalize_schema(schema.items) }
      when "map"
        { type: type, values: normalize_schema(schema.values) }
      when "union"
        if schema.schemas.nil?
          []
        else
          schema.schemas.map { |s| normalize_schema(s) }
        end
      else
        raise "unknown type #{type}"
      end
    end

    # Keeps only the name and the normalized type of a record field.
    def normalize_field(field)
      {
        name: field.name,
        type: normalize_schema(field.type)
      }
    end

    # Builds the hash shared by all named types: fullname first, then the
    # type, followed by the type-specific +attributes+.
    def normalize_named_type(schema, attributes = {})
      name = Name.make_fullname(schema.name, schema.namespace)

      { name: name, type: schema.type_sym.to_s }.merge(attributes)
    end
  end
end
67 changes: 67 additions & 0 deletions lang/ruby/test/case_finder.rb
@@ -0,0 +1,67 @@
require 'strscan'

# Reads the shared schema test data file and splits it into test cases.
#
# A case starts with a "// <digits>" line, optionally followed by further
# "// ..." comment lines, and then up to three items: INPUT, canonical and
# fingerprint. Each item is written either inline ("<<NAME value") or as a
# heredoc ("<<NAME", content lines, then a line holding only NAME).
class CaseFinder
  PATH = File.expand_path("../../../../share/test/data/schema-tests.txt", __FILE__)

  Case = Struct.new(:id, :input, :canonical, :fingerprint)

  # @return [Array<Case>] every case found in the file at PATH
  def self.cases
    new.cases
  end

  def initialize
    @scanner = StringScanner.new(File.read(PATH))
    @cases = []
  end

  # Scans the remaining text and collects every case found.
  #
  # @return [Array<Case>]
  def cases
    until @scanner.eos?
      test_case = scan_case
      @cases << test_case if test_case
    end

    @cases
  end

  private

  # Scans one case at the current position. When the position does not
  # start a case, the rest of the line is discarded and nil is returned.
  def scan_case
    id = @scanner.scan(/\/\/ \d+\n/)

    unless id
      # The newline is optional so that a final line without a terminating
      # newline is still consumed; otherwise the scanner would never reach
      # the end of the text and #cases would loop forever.
      @scanner.skip(/.*\n?/)
      return nil
    end

    # Drop any descriptive comment lines that follow the id line.
    while @scanner.skip(/\/\/ .*\n/); end

    input = scan_input
    canonical = scan_canonical
    fingerprint = scan_fingerprint

    Case.new(id, input, canonical, fingerprint)
  end

  # Scans the item called +name+ in either form. Returns its text without
  # a trailing newline (so both forms yield comparable values), or nil when
  # the item is not present at the current position.
  def scan_item(name)
    if @scanner.scan(/<<#{name}\n/)
      lines = []
      while (line = @scanner.scan(/.+\n/))
        line = line.chomp
        break if line == name
        lines << line
      end
      lines.join("\n")
    elsif @scanner.scan(/<<#{name} /)
      value = @scanner.scan(/.+$/)
      @scanner.skip(/\n/)
      value
    end
  end

  def scan_input
    scan_item("INPUT")
  end

  def scan_canonical
    scan_item("canonical")
  end

  def scan_fingerprint
    scan_item("fingerprint")
  end
end
37 changes: 37 additions & 0 deletions lang/ruby/test/test_fingerprints.rb
@@ -0,0 +1,37 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

require 'test_help'

# Checks the Integer fingerprints exposed by parsed schemas
# (md5_fingerprint and sha256_fingerprint) against fixed expected values
# for the primitive "int" schema.
class TestFingerprints < Test::Unit::TestCase
# The MD5 fingerprint of the "int" schema must equal the expected Integer.
def test_md5_fingerprint
schema = Avro::Schema.parse <<-SCHEMA
{ "type": "int" }
SCHEMA

assert_equal 318112854175969537208795771590915775282,
schema.md5_fingerprint
end

# The SHA-256 fingerprint of the "int" schema must equal the expected Integer.
def test_sha256_fingerprint
schema = Avro::Schema.parse <<-SCHEMA
{ "type": "int" }
SCHEMA

assert_equal 28572620203319713300323544804233350633246234624932075150020181448463213378117,
schema.sha256_fingerprint
end
end
171 changes: 171 additions & 0 deletions lang/ruby/test/test_schema_normalization.rb
@@ -0,0 +1,171 @@
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

require 'test_help'
require 'case_finder'

# Exercises Avro::SchemaNormalization.to_parsing_form for each kind of
# schema, comparing the produced string with a hand-written expected
# canonical form, and finally against the cases supplied by CaseFinder.
class TestSchemaNormalization < Test::Unit::TestCase
# Each primitive type written as {"type": "<name>"} must normalize to the
# bare quoted type name.
def test_primitives
%w[null boolean string bytes int long float double].each do |type|
schema = Avro::Schema.parse(<<-JSON)
{ "type": "#{type}" }
JSON

canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)

assert_equal %("#{type}"), canonical_form
end
end

# A record is emitted with its namespace folded into the name and without
# the "doc" attributes of the record and of its fields.
def test_records
schema = Avro::Schema.parse(<<-JSON)
{
"type": "record",
"name": "test",
"namespace": "random",
"doc": "some record",
"fields": [
{ "name": "height", "type": "int", "doc": "the height" }
]
}
JSON

expected_type = <<-JSON.strip
{"name":"random.test","type":"record","fields":[{"name":"height","type":"int"}]}
JSON

canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)

assert_equal expected_type, canonical_form
end

# A record whose field refers back to the record itself is emitted once in
# full; the inner occurrence is written as the record's name only.
def test_recursive_records
schema = Avro::Schema.parse(<<-JSON)
{
"type": "record",
"name": "item",
"fields": [
{ "name": "next", "type": "item" }
]
}
JSON

expected_type = <<-JSON.strip
{"name":"item","type":"record","fields":[{"name":"next","type":"item"}]}
JSON

canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)

assert_equal expected_type, canonical_form
end

# An enum keeps its symbols in their declared order, gets its namespace
# folded into the name and loses its "doc" attribute.
def test_enums
schema = Avro::Schema.parse(<<-JSON)
{
"type": "enum",
"name": "suit",
"namespace": "cards",
"doc": "the different suits of cards",
"symbols": ["club", "hearts", "diamond", "spades"]
}
JSON

expected_type = <<-JSON.strip
{"name":"cards.suit","type":"enum","symbols":["club","hearts","diamond","spades"]}
JSON

canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)

assert_equal expected_type, canonical_form
end

# A fixed type keeps its size and gets its namespace folded into the name.
def test_fixed
schema = Avro::Schema.parse(<<-JSON)
{
"type": "fixed",
"name": "id",
"namespace": "db",
"size": 64
}
JSON

expected_type = <<-JSON.strip
{"name":"db.id","type":"fixed","size":64}
JSON

canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)

assert_equal expected_type, canonical_form
end

# An array keeps only "type" and "items"; its "doc" attribute is dropped.
def test_arrays
schema = Avro::Schema.parse(<<-JSON)
{
"type": "array",
"doc": "the items",
"items": "int"
}
JSON

expected_type = <<-JSON.strip
{"type":"array","items":"int"}
JSON

canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)

assert_equal expected_type, canonical_form
end

# A map keeps only "type" and "values"; its "doc" attribute is dropped.
def test_maps
schema = Avro::Schema.parse(<<-JSON)
{
"type": "map",
"doc": "the items",
"values": "int"
}
JSON

expected_type = <<-JSON.strip
{"type":"map","values":"int"}
JSON

canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)

assert_equal expected_type, canonical_form
end

# A union is emitted as a JSON array of its branches without whitespace.
def test_unions
schema = Avro::Schema.parse(<<-JSON)
["int", "string"]
JSON

expected_type = <<-JSON.strip
["int","string"]
JSON

canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)

assert_equal expected_type, canonical_form
end

# Parses the input of every case returned by CaseFinder.cases and compares
# its parsing form with the canonical form recorded for that case.
def test_shared_dataset
CaseFinder.cases.each do |test_case|
schema = Avro::Schema.parse(test_case.input)
assert_equal test_case.canonical, Avro::SchemaNormalization.to_parsing_form(schema)
end
end
end

0 comments on commit e0096f2

Please sign in to comment.