Skip to content
Browse files

Perform auto-detection of input to select appropriate Reader class if…

… one cannot be determined from file characteristics. Implemented using RDF::Format.detect, with the input sample passed to RDF::Format.for either through the :sample option or as the return value of a block. RDF::Reader.open uses this to help select the appropriate Reader sub-class.
  • Loading branch information...
1 parent 29dca26 commit 08b329c38fad1e4396ffaf5c6c1d1ad4129c791b @gkellogg gkellogg committed Sep 7, 2011
Showing with 184 additions and 41 deletions.
  1. +2 −0 README.md
  2. +41 −2 lib/rdf/format.rb
  3. +7 −0 lib/rdf/nquads.rb
  4. +2 −0 lib/rdf/ntriples/format.rb
  5. +15 −3 lib/rdf/reader.rb
  6. +2 −0 spec/format_spec.rb
  7. +58 −18 spec/nquads_spec.rb
  8. +57 −18 spec/ntriples_spec.rb
View
2 README.md
@@ -27,6 +27,8 @@ Features
use of any one part of the library without needing to load up the rest.
* Compatible with Ruby 1.8.7+, Ruby 1.9.x, and JRuby 1.4/1.5.
* Compatible with older Ruby versions with the help of the [Backports][] gem.
+* Performs auto-detection of input to select appropriate Reader class if one
+ cannot be determined from file characteristics.
Tutorials
---------
View
43 lib/rdf/format.rb
@@ -78,11 +78,17 @@ def self.each(&block)
# @option options [Symbol, #to_sym] :file_extension (nil)
# @option options [String, #to_s] :content_type (nil)
# Note that content_type will be taken from a URL opened using {RDF::Util::File.open_file}.
+ # @option options [String] :sample (nil)
+ # A sample of input used for performing format detection.
+ # If we find no formats, or we find more than one, and we have a sample, we can
+ # perform format detection to find a specific format to use, in which case
+ # we pick the first one we find
# @return [Class]
+ # @yieldreturn [String] another way to provide a sample, allowing the sample to be retrieved lazily.
#
# @return [Class]
def self.for(options = {})
- case options
+ format = case options
when String
# Find a format based on the file name
self.for(:file_name => options)
@@ -95,7 +101,7 @@ def self.for(options = {})
# @see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7
mime_type = mime_type.to_s
mime_type = mime_type.split(';').first if mime_type.include?(?;) # remove any media type parameters
- content_types.has_key?(mime_type) ? content_types[mime_type].first : nil
+ content_types[mime_type]
# Find a format based on the file name:
when file_name = options[:file_name]
self.for(:file_extension => File.extname(file_name.to_s)[1..-1])
@@ -122,6 +128,21 @@ def self.for(options = {})
nil # not found
end
end
+
+ if format.is_a?(Array)
+ return format.first if format.length == 1
+ elsif !format.nil?
+ return format
+ end
+
+ # If we have a sample, use that for format detection
+ if sample = (options[:sample] if options.is_a?(Hash)) || (yield if block_given?)
+ # Given a sample, perform format detection across the appropriate formats, choosing
+ # the first that matches
+ format ||= @@subclasses
+
+ format.detect {|f| f.detect(sample)}
+ end
end
##
@@ -236,6 +257,24 @@ def self.writer(klass = nil, &block)
end
end
+
+ ##
+ # Use a text sample to detect the format of an input file. Sub-classes implement
+ # a matcher sufficient to detect probable format matches, including disambiguating
+ # between other similar formats.
+ #
+ # Used to determine format class from loaded formats by {RDF::Format.for} when a
+ # match cannot be unambiguously found otherwise.
+ #
+ # @example
+ # RDF::NQuads::Format.detect("<a> <b> <c> .") => true
+ #
+ # @param [String] sample Beginning several bytes (~ 1K) of input.
+ # @return [Boolean]
+ def self.detect(sample)
+ false
+ end
+
class << self
alias_method :reader_class, :reader
alias_method :writer_class, :writer
View
7 lib/rdf/nquads.rb
@@ -24,6 +24,13 @@ class Format < RDF::Format
reader { RDF::NQuads::Reader }
writer { RDF::NQuads::Writer }
+
+ ##
+ # Sample detection to see if it matches N-Quads (or N-Triples)
+ def self.detect(sample)
+ sample.match(%r(^\s*<[^>]*>.*\.\s*$)) &&
+ !sample.match(%r(@(base|prefix))) # Not Turtle/N3
+ end
end
class Reader < NTriples::Reader
View
2 lib/rdf/ntriples/format.rb
@@ -22,5 +22,7 @@ class Format < RDF::Format
reader { RDF::NTriples::Reader }
writer { RDF::NTriples::Writer }
+
+ # No format detection, as N-Triples can be parsed by N-Quads
end
end
View
18 lib/rdf/reader.rb
@@ -77,10 +77,17 @@ def self.each(&block)
# @option options [Symbol, #to_sym] :file_extension (nil)
# @option options [String, #to_s] :content_type (nil)
# @return [Class]
+ # @option options [String] :sample (nil)
+ # A sample of input used for performing format detection.
+ # If we find no formats, or we find more than one, and we have a sample, we can
+ # perform format detection to find a specific format to use, in which case
+ # we pick the first one we find
+ # @return [Class]
+ # @yieldreturn [String] another way to provide a sample, allowing the sample to be retrieved lazily.
#
# @return [Class]
- def self.for(options = {})
- if format = self.format || Format.for(options)
+ def self.for(options = {}, &block)
+ if format = self.format || Format.for(options, &block)
format.reader
end
end
@@ -120,7 +127,12 @@ def self.open(filename, options = {}, &block)
format_options = options.dup
format_options[:content_type] ||= file.content_type if file.respond_to?(:content_type)
format_options[:file_name] ||= filename
- reader = self.for(format_options[:format] || format_options)
+ reader = self.for(format_options[:format] || format_options) do
+ # Return a sample from the input file
+ sample = file.read(1000)
+ file.rewind
+ sample
+ end
if reader
reader.new(file, options, &block)
else
View
2 spec/format_spec.rb
@@ -1,5 +1,7 @@
require File.join(File.dirname(__FILE__), 'spec_helper')
require 'rdf/spec/format'
+require 'rdf/ntriples'
+require 'rdf/nquads'
describe RDF::Format do
before(:each) do
View
76 spec/nquads_spec.rb
@@ -25,11 +25,45 @@
RDF::Format.for(arg).should == @format_class
end
end
+
+ {
+ :ntriples => "<a> <b> <c> .",
+ :nquads => "<a> <b> <c> <d> . ",
+ :literal => '<a> <b> "literal" .',
+ :multi_line => '<a>\n <b>\n "literal"\n .',
+ }.each do |sym, str|
+ it "detects #{sym}" do
+ @format_class.for {str}.should == @format_class
+ end
+ end
end
describe "#to_sym" do
specify {@format_class.to_sym.should == :nquads}
end
+
+ describe ".detect" do
+ {
+ :ntriples => "<a> <b> <c> .",
+ :nquads => "<a> <b> <c> <d> . ",
+ :literal => '<a> <b> "literal" .',
+ :multi_line => '<a>\n <b>\n "literal"\n .',
+ }.each do |sym, str|
+ it "detects #{sym}" do
+ @format_class.detect(str).should be_true
+ end
+ end
+
+ {
+ :turtle => "@prefix foo: <bar> .\n foo:a foo:b <c> .",
+ :rdfxml => '<rdf:RDF about="foo"></rdf:RDF>',
+ :n3 => '@prefix foo: <bar> .\nfoo:bar = {<a> <b> <c>} .',
+ }.each do |sym, str|
+ it "does not detect #{sym}" do
+ @format_class.detect(str).should be_false
+ end
+ end
+ end
end
describe RDF::NQuads::Reader do
@@ -41,15 +75,18 @@
# @see lib/rdf/spec/reader.rb in rdf-spec
it_should_behave_like RDF_Reader
- it "should be discoverable" do
- readers = [
- RDF::Reader.for(:nquads),
- RDF::Reader.for('etc/doap.nq'),
- RDF::Reader.for(:file_name => 'etc/doap.nq'),
- RDF::Reader.for(:file_extension => 'nq'),
- RDF::Reader.for(:content_type => 'text/x-nquads'),
- ]
- readers.each { |reader| reader.should == RDF::NQuads::Reader }
+ describe ".for" do
+ formats = [
+ :nquads,
+ 'etc/doap.nq',
+ {:file_name => 'etc/doap.nq'},
+ {:file_extension => 'nq'},
+ {:content_type => 'text/x-nquads'},
+ ].each do |arg|
+ it "discovers with #{arg.inspect}" do
+ RDF::Reader.for(arg).should == RDF::NQuads::Reader
+ end
+ end
end
context "#initialize" do
@@ -106,15 +143,18 @@
@writer = RDF::NQuads::Writer.new
end
- it "should be discoverable" do
- writers = [
- RDF::Writer.for(:nquads),
- RDF::Writer.for('tmp/test.nq'),
- RDF::Writer.for(:file_name => 'tmp/test.nq'),
- RDF::Writer.for(:file_extension => 'nq'),
- RDF::Writer.for(:content_type => 'text/x-nquads'),
- ]
- writers.each { |writer| writer.should == RDF::NQuads::Writer }
+ describe ".for" do
+ formats = [
+ :nquads,
+ 'etc/doap.nq',
+ {:file_name => 'etc/doap.nq'},
+ {:file_extension => 'nq'},
+ {:content_type => 'text/x-nquads'},
+ ].each do |arg|
+ it "discovers with #{arg.inspect}" do
+ RDF::Writer.for(arg).should == RDF::NQuads::Writer
+ end
+ end
end
# @see lib/rdf/spec/writer.rb in rdf-spec
View
75 spec/ntriples_spec.rb
@@ -26,11 +26,42 @@
RDF::Format.for(arg).should == @format_class
end
end
+
+ {
+ :ntriples => "<a> <b> <c> .",
+ :nquads => "<a> <b> <c> <d> . ",
+ :literal => '<a> <b> "literal" .',
+ :multi_line => '<a>\n <b>\n "literal"\n .',
+ }.each do |sym, str|
+ it "detects #{sym}" do
+ @format_class.for {str}.should_not == @format_class
+ end
+ end
end
describe "#to_sym" do
specify {@format_class.to_sym.should == :ntriples}
end
+
+ describe ".detect" do
+ {
+ :ntriples => "<a> <b> <c> .",
+ :nquads => "<a> <b> <c> <d> . ",
+ :literal => '<a> <b> "literal" .',
+ :multi_line => '<a>\n <b>\n "literal"\n .',
+ :turtle => "@prefix foo: <bar> .\n foo:a foo:b <c> .",
+ :rdfxml => '<rdf:RDF about="foo"></rdf:RDF>',
+ :n3 => '@prefix foo: <bar> .\nfoo:bar = {<a> <b> <c>} .',
+ }.each do |sym, str|
+ it "does not detect #{sym}" do
+ @format_class.detect(str).should be_false
+ end
+ end
+
+ it "always returns false" do
+ @format_class.detect("<a> <b> <c> .").should be_false
+ end
+ end
end
describe RDF::NTriples::Reader do
@@ -41,15 +72,19 @@
# @see lib/rdf/spec/reader.rb in rdf-spec
it_should_behave_like RDF_Reader
- it "should be discoverable" do
- readers = [
- RDF::Reader.for(:ntriples),
- RDF::Reader.for('etc/doap.nt'),
- RDF::Reader.for(:file_name => 'etc/doap.nt'),
- RDF::Reader.for(:file_extension => 'nt'),
- RDF::Reader.for(:content_type => 'text/plain'),
- ]
- readers.each { |reader| reader.should == RDF::NTriples::Reader }
+ describe ".for" do
+ formats = [
+ :ntriples,
+ 'etc/doap.nt',
+ {:file_name => 'etc/doap.nt'},
+ {:file_extension => 'nt'},
+ {:content_type => 'text/plain'},
+ {:content_type => 'text/ntriples+turtle'},
+ ].each do |arg|
+ it "discovers with #{arg.inspect}" do
+ RDF::Reader.for(arg).should == RDF::NTriples::Reader
+ end
+ end
end
it "should return :ntriples for to_sym" do
@@ -64,15 +99,19 @@
@writer = RDF::NTriples::Writer.new
end
- it "should be discoverable" do
- writers = [
- RDF::Writer.for(:ntriples),
- RDF::Writer.for('tmp/test.nt'),
- RDF::Writer.for(:file_name => 'tmp/test.nt'),
- RDF::Writer.for(:file_extension => 'nt'),
- RDF::Writer.for(:content_type => 'text/plain'),
- ]
- writers.each { |writer| writer.should == RDF::NTriples::Writer }
+ describe ".for" do
+ formats = [
+ :ntriples,
+ 'etc/doap.nt',
+ {:file_name => 'etc/doap.nt'},
+ {:file_extension => 'nt'},
+ {:content_type => 'text/plain'},
+ {:content_type => 'text/ntriples+turtle'},
+ ].each do |arg|
+ it "discovers with #{arg.inspect}" do
+ RDF::Writer.for(arg).should == RDF::NTriples::Writer
+ end
+ end
end
# @see lib/rdf/spec/writer.rb in rdf-spec

0 comments on commit 08b329c

Please sign in to comment.
Something went wrong with that request. Please try again.