diff --git a/lib/feedzirra/atom.rb b/lib/feedzirra/atom.rb index 6e3b219e..60869a3d 100644 --- a/lib/feedzirra/atom.rb +++ b/lib/feedzirra/atom.rb @@ -11,7 +11,7 @@ class Atom elements :entry, :as => :entries, :class => AtomEntry def self.able_to_parse?(xml) - xml =~ /Atom/ + xml =~ /(Atom)|(#{Regexp.escape("http://purl.org/atom")})/ end end end \ No newline at end of file diff --git a/lib/feedzirra/atom_entry.rb b/lib/feedzirra/atom_entry.rb index 7782618f..cb60030b 100644 --- a/lib/feedzirra/atom_entry.rb +++ b/lib/feedzirra/atom_entry.rb @@ -11,5 +11,6 @@ class AtomEntry element :content element :summary element :published + element :created, :as => :published end end \ No newline at end of file diff --git a/lib/feedzirra/feed.rb b/lib/feedzirra/feed.rb index 69a2a08f..5b615bc9 100644 --- a/lib/feedzirra/feed.rb +++ b/lib/feedzirra/feed.rb @@ -62,12 +62,18 @@ def self.fetch_and_parse(urls, options = {}) curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match) curl.follow_location = true curl.on_success do |c| - feed = Feed.parse(c.body_str) - feed.feed_url ||= c.last_effective_url - feed.etag = etag_from_header(c.header_str) - feed.last_modified = last_modified_from_header(c.header_str) - responses[url] = feed - options[:on_success].call(url, feed) if options.has_key?(:on_success) + xml = c.body_str + klass = determine_feed_parser_for_xml(xml) + if klass + feed = klass.parse(xml) + feed.feed_url ||= c.last_effective_url + feed.etag = etag_from_header(c.header_str) + feed.last_modified = last_modified_from_header(c.header_str) + responses[url] = feed + options[:on_success].call(url, feed) if options.has_key?(:on_success) + else + puts "Error determining parser for #{url} - #{c.last_effective_url}" + end end curl.on_failure do |c| responses[url] = c.response_code diff --git a/lib/feedzirra/rdf.rb b/lib/feedzirra/rdf.rb index 5fc75901..e2717008 100644 --- a/lib/feedzirra/rdf.rb +++ b/lib/feedzirra/rdf.rb @@ -12,7 +12,7 @@ class RDF attr_accessor :feed_url def self.able_to_parse?(xml) - xml =~ /rdf\:RDF/ || false + xml =~ /(rdf\:RDF)|(#{Regexp.escape("http://purl.org/rss/1.0")})|(rss version\=\"0\.9.?\")/ || false end end end \ No newline at end of file diff --git a/lib/feedzirra/rss.rb b/lib/feedzirra/rss.rb index 4dc2640a..9be7e27f 100644 --- a/lib/feedzirra/rss.rb +++ b/lib/feedzirra/rss.rb @@ -12,7 +12,7 @@ class RSS attr_accessor :feed_url def self.able_to_parse?(xml) - xml =~ /rss version\=\"2\.0\"/ + xml =~ /rss.*version\=\"2\.0\"/ end end end \ No newline at end of file diff --git a/spec/benchmarks/feedzirra_benchmarks.rb b/spec/benchmarks/feedzirra_benchmarks.rb new file mode 100644 index 00000000..c400465d --- /dev/null +++ b/spec/benchmarks/feedzirra_benchmarks.rb @@ -0,0 +1,37 @@ +require File.dirname(__FILE__) + '/../../lib/feedzirra.rb' +require 'rfeedparser' +require 'feed-normalizer' +require 'open-uri' + +require 'benchmark' +include Benchmark + +iterations = 10 +urls = File.readlines(File.dirname(__FILE__) + "/../sample_feeds/successful_feed_urls.txt") +puts "benchmarks on #{urls.size} feeds" +puts "************************************" +benchmark do |t| + t.report("feedzirra") do + iterations.times do + Feedzirra::Feed.fetch_and_parse(urls, :on_success => lambda { |url, feed| $stdout.print '.'; $stdout.flush }) + end + end + + t.report("rfeedparser") do + iterations.times do + urls.each do |url| + feed = FeedParser.parse(url) + $stdout.print '.' + $stdout.flush + end + end + end + + t.report("feed-normalizer") do + urls.each do |url| + feed = FeedNormalizer::FeedNormalizer.parse(open(url), :force_parser => FeedNormalizer::SimpleRssParser) + $stdout.print '.' + $stdout.flush + end + end +end diff --git a/spec/benchmarks/parsing_benchmark.rb b/spec/benchmarks/parsing_benchmark.rb new file mode 100644 index 00000000..f76fa7d1 --- /dev/null +++ b/spec/benchmarks/parsing_benchmark.rb @@ -0,0 +1,46 @@ +require File.dirname(__FILE__) + '/../../lib/feedzirra.rb' +require 'rfeedparser' +require 'feed-normalizer' + +require 'benchmark' +include Benchmark + +iterations = 50 +xml = File.read(File.dirname(__FILE__) + '/../sample_feeds/PaulDixExplainsNothing.xml') + +benchmark do |t| + t.report("feedzirra") do + iterations.times do + f = Feedzirra::Feed.parse(xml) + title = f.title + first_title = f.entries.first.title + first_author = f.entries.first.author + first_url = f.entries.first.url + end + end + + t.report("rfeedparser") do + iterations.times do + f = FeedParser.parse(xml) + title = f.title + first_title = f.entries.first.title + first_author = f.entries.first.author + first_url = f.entries.first.url + end + end + + t.report("feed-normalizer") do + iterations.times do + # have to use the :force option to make feed-normalizer parse an atom feed + f = FeedNormalizer::FeedNormalizer.parse(xml, :force_parser => FeedNormalizer::SimpleRssParser) + # title = f.title + # first_title = f.entries.first.title + # first_author = f.entries.first.author + # first_url = f.entries.first.url + # puts title + # puts first_title + # puts first_author + # puts first_url + end + end +end diff --git a/spec/feedzirra/feed_spec.rb b/spec/feedzirra/feed_spec.rb index 71320e22..51bb3454 100644 --- a/spec/feedzirra/feed_spec.rb +++ b/spec/feedzirra/feed_spec.rb @@ -29,6 +29,13 @@ feed.title.should == "Paul Dix Explains Nothing" feed.entries.size.should == 5 end + + it "should parse an feedburner rss feed" do + feed = Feedzirra::Feed.parse(sample_rss_feed_burner_feed) + feed.class.should == Feedzirra::RDF + feed.title.should == "Sam Harris: Author, Philosopher, Essayist, Atheist" + feed.entries.size.should == 10 + end end describe "#determine_feed_parser_for_xml" do @@ -44,6 +51,10 @@ Feedzirra::Feed.determine_feed_parser_for_xml(sample_rdf_feed).should == Feedzirra::RDF end + it "should return the Feedzirra::RDF class for an rss feedburner feed" do + Feedzirra::Feed.determine_feed_parser_for_xml(sample_rss_feed_burner_feed).should == Feedzirra::RDF + end + it "should return the Feedzirra::RSS object for an rss 2.0 feed" do Feedzirra::Feed.determine_feed_parser_for_xml(sample_rss_feed).should == Feedzirra::RSS end diff --git a/spec/sample_feeds/get_sample_feeds.rb b/spec/sample_feeds/get_sample_feeds.rb index 5ad43b22..65e8e6e8 100644 --- a/spec/sample_feeds/get_sample_feeds.rb +++ b/spec/sample_feeds/get_sample_feeds.rb @@ -13,6 +13,7 @@ :url => opml_entry.attributes["htmlUrl"].to_s) end +urls = [] multi = Curl::Multi.new feeds.each do |feed| on_failure = lambda do |ex| @@ -23,11 +24,11 @@ on_success = lambda do |body| puts "got #{feed.title} - #{feed.feed_url}" - File.open("#{feed.title.gsub(/\W/, "")}.xml", "w") do |f| - f.write(body) - end + urls << feed.feed_url end multi.get(feed.feed_url, on_success, on_failure) end -multi.select([], []) while multi.size > 0 \ No newline at end of file +multi.select([], []) while multi.size > 0 + +File.open("successful_feed_urls.txt", "w") {|f| f.write(urls.join("\n"))} \ No newline at end of file diff --git a/spec/sample_feeds/successful_feed_urls.txt b/spec/sample_feeds/successful_feed_urls.txt new file mode 100644 index 00000000..cfb72fe3 --- /dev/null +++ b/spec/sample_feeds/successful_feed_urls.txt @@ -0,0 +1,180 @@ +http://feeds.feedburner.com/CoryForsyth +http://feeds.feedburner.com/dcmanges +http://www.allthingsdistributed.com/index.xml +http://feeds.feedburner.com/pmarca +http://aws.typepad.com/aws/atom.xml +http://blog.caboo.se/feed/atom.xml +http://www.avibryant.com/index.rdf +http://joyeur.com/atom/ +http://hunch.net/?feed=rss2 +http://feeds.feedburner.com/Nanorails +http://feeds.feedburner.com/PaulDixExplainsNothing +http://feeds.feedburner.com/SamHarris +http://rubyforge.org/export/rss_sfnewreleases.php +http://mike.bailey.net.au/blog/?feed=rss2 +http://rubyforge.org/export/rss_sfnews.php +http://feeds.feedburner.com/TomWhite +http://pauldowman.com/feed/ +http://railsontherun.com/feed/atom.xml +http://feeds.feedburner.com/devver/blog +http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=%22paul+dix%22&ie=utf-8&output=atom +http://feeds.feedburner.com/HacketyOrg +http://jobs.joelonsoftware.com/default.asp?pg=pgFeed&feed=9095128 +http://gslounge.com/blog/feed +http://www.oreillynet.com/pub/feed/89 +http://feeds.feedburner.com/activereload +http://jobs.37signals.com/jobs.rss +http://feeds.feedburner.com/al3x +http://www.mysqlperformanceblog.com/feed/ +http://soylentfoo.jnewland.com/xml/rss20/feed.xml +http://feeds.feedburner.com/blogspot/aefO +http://tweetscan.com/rss.php?s=pauldix +http://adam.blogs.bitscribe.net/feed/ +http://feeds.feedburner.com/newbamboo +http://www.postal-code.com/binarycode/feed/ +http://brainspl.at/xml/rss20/feed.xml +http://feeds.feedburner.com/brynary +http://feeds.feedburner.com/CoryFoy +http://feeds.feedburner.com/Chadfowlercom +http://feeds.feedburner.com/rubypal/KoEa +http://irthoughts.wordpress.com/feed/ +http://cfis.savagexi.com/articles.atom +http://www.danwebb.net/feed/atom.xml +http://feeds.feedburner.com/encytemedia +http://feeds.feedburner.com/errtheblog +http://www.eribium.org/blog/?feed=rss2 +http://feeds.feedburner.com/FingerprintsOfCasperFabricius +http://blog.rapleaf.com/dev/?feed=rss2 +http://codemode.blogspot.com/feeds/posts/default +http://feeds.feedburner.com/GiantRobotsSmashingIntoOtherGiantRobots +http://feeds.feedburner.com/hasmanythrough +http://blog.imperialdune.com/feed/atom.xml +http://blog.craigambrose.com/xml/rss20/feed.xml +http://www.infoq.com/rss/rss.action?token=M7lRPBznVOdzQgBkfJsR2LMOUm72X9hp +http://feeds.feedburner.com/JamesBritt-Home +http://fhwang.net/syndicate/ruby.atom +http://www.urbanhonking.com/ideasfordozens/atom.xml +http://www.jrmiii.com/feed/atom.xml +http://www.jonsthoughtsoneverything.com/feed/ +http://feeds.feedburner.com/mongoo/CTIN +http://everburning.com/feed/ +http://feeds.feedburner.com/LoudThinking +http://rubylearning.com/blog/feed/ +http://feeds.feedburner.com/MartinFowlersBliki +http://jicksta.com/feed +http://www.bofh.org.uk/articles.atom +http://mike.daless.io/aintablog/articles.rss +http://feeds.feedburner.com/NickSieger +http://dev.massivebraingames.com/rss +http://mikepence.wordpress.com/feed/ +http://rubyphilia.wordpress.com/feed/ +http://www.nimblecode.com/xml/rss/feed.xml +http://feeds.feedburner.com/pluron +http://lifecoding.com/blog/?feed=rss2 +http://sam.aaron.name/feed/atom.xml +http://feeds.feedburner.com/objo +http://feeds.feedburner.com/ozmmorg +http://www.notsostupid.com/feed/ +http://on-ruby.blogspot.com/atom.xml +http://oneless.blogspot.com/feeds/posts/default +http://onrails.org/xml/rss20/feed.xml +http://blog.pastie.org/index.rdf +http://ola-bini.blogspot.com/atom.xml +http://tomcopeland.blogs.com/juniordeveloper/atom.xml +http://blog.fallingsnow.net/feed/ +http://pitsula.blogspot.com/feeds/posts/default +http://lylejohnson.name/blog/feed/ +http://feeds.feedburner.com/prototype-blog +http://planetruby.0x42.net/rss20.xml +http://blog.zenspider.com/atom.xml +http://pragdave.pragprog.com/pragdave/atom.xml +http://feeds.feedburner.com/rails-envy +http://feeds.feedburner.com/railsjitsu +http://feeds.feedburner.com/riab +http://feeds.feedburner.com/RobertREvans +http://feeds.feedburner.com/reinh +http://blog.methodmissing.com/feed/atom.xml +http://redhanded.hobix.com/index.xml +http://feeds.feedburner.com/RubyOnRailsSecurity +http://feeds.feedburner.com/RidingRails +http://feeds.feedburner.com/ruby_is_awesome +http://feeds.feedburner.com/rubypond/JXRc +http://www.rubycorner.com/feeds/updated/rss20 +http://rubyonwindows.blogspot.com/feeds/posts/default +http://rubyquiz.com/index.rss +http://ruby.tie-rack.org/feed/ +http://www.rubyhead.com/feed/ +http://feeds.feedburner.com/37signals/beMH +http://feeds.feedburner.com/SimplisticComplexity +http://feeds.feedburner.com/slash7/rss +http://smartic.us/feed/atom.xml +http://cuttingtheredtape.blogspot.com/feeds/posts/default +http://feeds.feedburner.com/sneer/blog +http://www.spacebabies.nl/feed/ +http://feeds.feedburner.com/cleanair +http://richkilmer.blogs.com/ether/atom.xml +http://tenderlovemaking.com/feed/ +http://feeds.feedburner.com/terralien-ships-log +http://feeds.feedburner.com/ZenAndTheArtOfRubyProgramming +http://feeds.feedburner.com/rufytech +http://feeds.feedburner.com/WorkingWithRails +http://www.yup.com/xml/atom10/feed.xml +http://feeds.feedburner.com/StakeVentures +http://metaclass.org/feed/atom.xml +http://feeds.feedburner.com/therailsist +http://www.onestepback.org/index.cgi/index.rss +http://merbist.com/feed/ +http://feeds.feedburner.com/kevingc +http://feeds.feedburner.com/nuttnet/qWLn +http://brontemedia.com/feed/ +http://feeds.feedburner.com/brynary +http://feeds.feedburner.com/AmitGuptasWeblog +http://brighter.net/rss +http://feeds.feedburner.com/gilesbowkett +http://feeds.feedburner.com/hasmanythrough +http://probablycorey.wordpress.com/feed/ +http://feeds.feedburner.com/innonate +http://feeds.feedburner.com/Kungpowthinking +http://nikocunningham.blogspot.com/feeds/posts/default +http://www.notsostupid.com/feed/ +http://feeds.feedburner.com/slash7/rss +http://feeds.feedburner.com/ThirdRail +http://feeds.feedburner.com/trottercashion +http://weblogs.java.net/blog/arungupta/index.rdf +http://fabiokung.com/feed/ +http://feeds.feedburner.com/nicksieger +http://metaclass.org/feed/atom.xml +http://fhwang.net/syndicate/ruby.atom +http://ola-bini.blogspot.com/atom.xml +http://irthoughts.wordpress.com/feed/ +http://www.zedshaw.com/feed.atom +http://feeds.feedburner.com/devthatweb +http://ross.typepad.com/blog/atom.xml +http://www.pbs.org/cringely/pulpit/rss2.xml +http://rss.slashdot.org/slashdot/eqWf +http://www.zedshaw.com/feed.atom +http://www.scripting.com/rss.xml +http://www.techmeme.com/index.xml +http://feeds.feedburner.com/LinkBuildingBlog +http://feeds.feedburner.com/AmitGuptasWeblog +http://codesnipers.com/?q=node/feed +http://feeds.feedburner.com/hermanshead76 +http://www.joelonsoftware.com/rss.xml +http://feeds.feedburner.com/InformationArbitrage +http://paulgraham.infogami.com/blog/atom.xml +http://feeds.feedburner.com/startupping +http://feeds.feedburner.com/NewYorkSmallBusinessLaw +http://www.userscape.com/blog/index.php/site/rss_2.0//rss2/ +http://www.alistapart.com/feed/rss.xml +http://lsvp.wordpress.com/feed/ +http://www.microisv.com/feed/ +http://feeds.b5media.com/b5media/StartupSpark +http://pragmatictheory.blogspot.com/feeds/posts/default +http://fiveyearstoolate.wordpress.com/feed/ +http://feeds.feedburner.com/blogniscient +http://feeds.feedburner.com/Feedblog +http://feeds.feedburner.com/Tailrank +http://www.regator.com/blog/?feed=rss2 +http://feeds.feedburner.com/Spinn3r +http://blog.spotback.com/feed/ +http://irthoughts.wordpress.com/feed/ \ No newline at end of file diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index acf70af5..6d9eefac 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -25,6 +25,10 @@ def sample_rdf_entry_content File.read("#{File.dirname(__FILE__)}/sample_feeds/HREFConsideredHarmfulFirstEntry.xml") end +def sample_rss_feed_burner_feed + File.read("#{File.dirname(__FILE__)}/sample_feeds/SamHarrisAuthorPhilosopherEssayistAtheist.xml") +end + def sample_rss_feed File.read("#{File.dirname(__FILE__)}/sample_feeds/TenderLovemaking.xml") end