Skip to content

Commit

Permalink
Wrote a benchmark to run against 180 sample feeds. Fixed a bunch of b…
Browse files Browse the repository at this point in the history
…ugs in determining which parser to use based on those 180 feeds. Need to change the logic for what happens when it doesn't know how to parse something.
  • Loading branch information
pauldix committed Jan 31, 2009
1 parent ed90bf6 commit 65b63d0
Show file tree
Hide file tree
Showing 11 changed files with 299 additions and 13 deletions.
2 changes: 1 addition & 1 deletion lib/feedzirra/atom.rb
Expand Up @@ -11,7 +11,7 @@ class Atom
elements :entry, :as => :entries, :class => AtomEntry

def self.able_to_parse?(xml)
xml =~ /Atom/
xml =~ /(Atom)|(#{Regexp.escape("http://purl.org/atom")})/
end
end
end
1 change: 1 addition & 0 deletions lib/feedzirra/atom_entry.rb
Expand Up @@ -11,5 +11,6 @@ class AtomEntry
element :content
element :summary
element :published
element :created, :as => :published
end
end
18 changes: 12 additions & 6 deletions lib/feedzirra/feed.rb
Expand Up @@ -62,12 +62,18 @@ def self.fetch_and_parse(urls, options = {})
curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
curl.follow_location = true
curl.on_success do |c|
feed = Feed.parse(c.body_str)
feed.feed_url ||= c.last_effective_url
feed.etag = etag_from_header(c.header_str)
feed.last_modified = last_modified_from_header(c.header_str)
responses[url] = feed
options[:on_success].call(url, feed) if options.has_key?(:on_success)
xml = c.body_str
klass = determine_feed_parser_for_xml(xml)
if klass
feed = klass.parse(xml)
feed.feed_url ||= c.last_effective_url
feed.etag = etag_from_header(c.header_str)
feed.last_modified = last_modified_from_header(c.header_str)
responses[url] = feed
options[:on_success].call(url, feed) if options.has_key?(:on_success)
else
puts "Error determining parser for #{url} - #{c.last_effective_url}"
end
end
curl.on_failure do |c|
responses[url] = c.response_code
Expand Down
2 changes: 1 addition & 1 deletion lib/feedzirra/rdf.rb
Expand Up @@ -12,7 +12,7 @@ class RDF
attr_accessor :feed_url

def self.able_to_parse?(xml)
xml =~ /rdf\:RDF/ || false
xml =~ /(rdf\:RDF)|(#{Regexp.escape("http://purl.org/rss/1.0")})|(rss version\=\"0\.9.?\")/ || false
end
end
end
2 changes: 1 addition & 1 deletion lib/feedzirra/rss.rb
Expand Up @@ -12,7 +12,7 @@ class RSS
attr_accessor :feed_url

def self.able_to_parse?(xml)
xml =~ /rss version\=\"2\.0\"/
xml =~ /rss.*version\=\"2\.0\"/
end
end
end
37 changes: 37 additions & 0 deletions spec/benchmarks/feedzirra_benchmarks.rb
@@ -0,0 +1,37 @@
require File.dirname(__FILE__) + '/../../lib/feedzirra.rb'
require 'rfeedparser'
require 'feed-normalizer'
require 'open-uri'

require 'benchmark'
include Benchmark

# Benchmarks fetching and parsing every sample feed URL with feedzirra,
# rfeedparser and feed-normalizer. Every library performs `iterations` full
# passes over the URL list so the reported times can be compared directly.
iterations = 10

# File.readlines keeps the trailing newline on each line, so strip every URL
# and drop blank lines before handing the list to the fetchers.
urls_path = File.dirname(__FILE__) + "/../sample_feeds/successful_feed_urls.txt"
urls = File.readlines(urls_path).map { |line| line.strip }.reject { |line| line.empty? }

# Prints one progress dot per processed feed and flushes it immediately.
print_progress = lambda do
  $stdout.print '.'
  $stdout.flush
end

puts "benchmarks on #{urls.size} feeds"
puts "************************************"
benchmark do |t|
  t.report("feedzirra") do
    iterations.times do
      Feedzirra::Feed.fetch_and_parse(urls, :on_success => lambda { |url, feed| print_progress.call })
    end
  end

  t.report("rfeedparser") do
    iterations.times do
      urls.each do |url|
        FeedParser.parse(url)
        print_progress.call
      end
    end
  end

  t.report("feed-normalizer") do
    # Run the same number of passes as the other two reports; a single pass
    # here would make this timing incomparable with theirs.
    iterations.times do
      urls.each do |url|
        # The block form of open closes the fetched stream once parsing is done.
        open(url) do |io|
          FeedNormalizer::FeedNormalizer.parse(io, :force_parser => FeedNormalizer::SimpleRssParser)
        end
        print_progress.call
      end
    end
  end
end
46 changes: 46 additions & 0 deletions spec/benchmarks/parsing_benchmark.rb
@@ -0,0 +1,46 @@
require File.dirname(__FILE__) + '/../../lib/feedzirra.rb'
require 'rfeedparser'
require 'feed-normalizer'

require 'benchmark'
include Benchmark

# Times how long each library takes to parse one local sample feed,
# repeating the parse `iterations` times per library.
iterations = 50
sample_path = File.dirname(__FILE__) + '/../sample_feeds/PaulDixExplainsNothing.xml'
xml = File.read(sample_path)

# Reads the feed title and the first entry's title, author and url so the
# timed section covers field access as well as the parse call itself.
read_fields = lambda do |parsed|
  parsed.title
  parsed.entries.first.title
  parsed.entries.first.author
  parsed.entries.first.url
end

benchmark do |bm|
  bm.report("feedzirra") do
    1.upto(iterations) do
      read_fields.call(Feedzirra::Feed.parse(xml))
    end
  end

  bm.report("rfeedparser") do
    1.upto(iterations) do
      read_fields.call(FeedParser.parse(xml))
    end
  end

  bm.report("feed-normalizer") do
    1.upto(iterations) do
      # feed-normalizer only parses an atom feed when the parser is forced
      # through the :force_parser option.
      FeedNormalizer::FeedNormalizer.parse(xml, :force_parser => FeedNormalizer::SimpleRssParser)
      # Field reads are left disabled for this parser:
      # title = f.title
      # first_title = f.entries.first.title
      # first_author = f.entries.first.author
      # first_url = f.entries.first.url
      # puts title
      # puts first_title
      # puts first_author
      # puts first_url
    end
  end
end
11 changes: 11 additions & 0 deletions spec/feedzirra/feed_spec.rb
Expand Up @@ -29,6 +29,13 @@
feed.title.should == "Paul Dix Explains Nothing"
feed.entries.size.should == 5
end

it "should parse an feedburner rss feed" do
feed = Feedzirra::Feed.parse(sample_rss_feed_burner_feed)
feed.class.should == Feedzirra::RDF
feed.title.should == "Sam Harris: Author, Philosopher, Essayist, Atheist"
feed.entries.size.should == 10
end
end

describe "#determine_feed_parser_for_xml" do
Expand All @@ -44,6 +51,10 @@
Feedzirra::Feed.determine_feed_parser_for_xml(sample_rdf_feed).should == Feedzirra::RDF
end

it "should return the Feedzirra::RDF class for an rss feedburner feed" do
Feedzirra::Feed.determine_feed_parser_for_xml(sample_rss_feed_burner_feed).should == Feedzirra::RDF
end

it "should return the Feedzirra::RSS object for an rss 2.0 feed" do
Feedzirra::Feed.determine_feed_parser_for_xml(sample_rss_feed).should == Feedzirra::RSS
end
Expand Down
9 changes: 5 additions & 4 deletions spec/sample_feeds/get_sample_feeds.rb
Expand Up @@ -13,6 +13,7 @@
:url => opml_entry.attributes["htmlUrl"].to_s)
end

urls = []
multi = Curl::Multi.new
feeds.each do |feed|
on_failure = lambda do |ex|
Expand All @@ -23,11 +24,11 @@

on_success = lambda do |body|
puts "got #{feed.title} - #{feed.feed_url}"
File.open("#{feed.title.gsub(/\W/, "")}.xml", "w") do |f|
f.write(body)
end
urls << feed.feed_url
end
multi.get(feed.feed_url, on_success, on_failure)
end

multi.select([], []) while multi.size > 0
multi.select([], []) while multi.size > 0

File.open("successful_feed_urls.txt", "w") {|f| f.write(urls.join("\n"))}
180 changes: 180 additions & 0 deletions spec/sample_feeds/successful_feed_urls.txt
@@ -0,0 +1,180 @@
http://feeds.feedburner.com/CoryForsyth
http://feeds.feedburner.com/dcmanges
http://www.allthingsdistributed.com/index.xml
http://feeds.feedburner.com/pmarca
http://aws.typepad.com/aws/atom.xml
http://blog.caboo.se/feed/atom.xml
http://www.avibryant.com/index.rdf
http://joyeur.com/atom/
http://hunch.net/?feed=rss2
http://feeds.feedburner.com/Nanorails
http://feeds.feedburner.com/PaulDixExplainsNothing
http://feeds.feedburner.com/SamHarris
http://rubyforge.org/export/rss_sfnewreleases.php
http://mike.bailey.net.au/blog/?feed=rss2
http://rubyforge.org/export/rss_sfnews.php
http://feeds.feedburner.com/TomWhite
http://pauldowman.com/feed/
http://railsontherun.com/feed/atom.xml
http://feeds.feedburner.com/devver/blog
http://blogsearch.google.com/blogsearch_feeds?hl=en&amp;scoring=d&amp;q=%22paul+dix%22&amp;ie=utf-8&amp;output=atom
http://feeds.feedburner.com/HacketyOrg
http://jobs.joelonsoftware.com/default.asp?pg=pgFeed&amp;feed=9095128
http://gslounge.com/blog/feed
http://www.oreillynet.com/pub/feed/89
http://feeds.feedburner.com/activereload
http://jobs.37signals.com/jobs.rss
http://feeds.feedburner.com/al3x
http://www.mysqlperformanceblog.com/feed/
http://soylentfoo.jnewland.com/xml/rss20/feed.xml
http://feeds.feedburner.com/blogspot/aefO
http://tweetscan.com/rss.php?s=pauldix
http://adam.blogs.bitscribe.net/feed/
http://feeds.feedburner.com/newbamboo
http://www.postal-code.com/binarycode/feed/
http://brainspl.at/xml/rss20/feed.xml
http://feeds.feedburner.com/brynary
http://feeds.feedburner.com/CoryFoy
http://feeds.feedburner.com/Chadfowlercom
http://feeds.feedburner.com/rubypal/KoEa
http://irthoughts.wordpress.com/feed/
http://cfis.savagexi.com/articles.atom
http://www.danwebb.net/feed/atom.xml
http://feeds.feedburner.com/encytemedia
http://feeds.feedburner.com/errtheblog
http://www.eribium.org/blog/?feed=rss2
http://feeds.feedburner.com/FingerprintsOfCasperFabricius
http://blog.rapleaf.com/dev/?feed=rss2
http://codemode.blogspot.com/feeds/posts/default
http://feeds.feedburner.com/GiantRobotsSmashingIntoOtherGiantRobots
http://feeds.feedburner.com/hasmanythrough
http://blog.imperialdune.com/feed/atom.xml
http://blog.craigambrose.com/xml/rss20/feed.xml
http://www.infoq.com/rss/rss.action?token=M7lRPBznVOdzQgBkfJsR2LMOUm72X9hp
http://feeds.feedburner.com/JamesBritt-Home
http://fhwang.net/syndicate/ruby.atom
http://www.urbanhonking.com/ideasfordozens/atom.xml
http://www.jrmiii.com/feed/atom.xml
http://www.jonsthoughtsoneverything.com/feed/
http://feeds.feedburner.com/mongoo/CTIN
http://everburning.com/feed/
http://feeds.feedburner.com/LoudThinking
http://rubylearning.com/blog/feed/
http://feeds.feedburner.com/MartinFowlersBliki
http://jicksta.com/feed
http://www.bofh.org.uk/articles.atom
http://mike.daless.io/aintablog/articles.rss
http://feeds.feedburner.com/NickSieger
http://dev.massivebraingames.com/rss
http://mikepence.wordpress.com/feed/
http://rubyphilia.wordpress.com/feed/
http://www.nimblecode.com/xml/rss/feed.xml
http://feeds.feedburner.com/pluron
http://lifecoding.com/blog/?feed=rss2
http://sam.aaron.name/feed/atom.xml
http://feeds.feedburner.com/objo
http://feeds.feedburner.com/ozmmorg
http://www.notsostupid.com/feed/
http://on-ruby.blogspot.com/atom.xml
http://oneless.blogspot.com/feeds/posts/default
http://onrails.org/xml/rss20/feed.xml
http://blog.pastie.org/index.rdf
http://ola-bini.blogspot.com/atom.xml
http://tomcopeland.blogs.com/juniordeveloper/atom.xml
http://blog.fallingsnow.net/feed/
http://pitsula.blogspot.com/feeds/posts/default
http://lylejohnson.name/blog/feed/
http://feeds.feedburner.com/prototype-blog
http://planetruby.0x42.net/rss20.xml
http://blog.zenspider.com/atom.xml
http://pragdave.pragprog.com/pragdave/atom.xml
http://feeds.feedburner.com/rails-envy
http://feeds.feedburner.com/railsjitsu
http://feeds.feedburner.com/riab
http://feeds.feedburner.com/RobertREvans
http://feeds.feedburner.com/reinh
http://blog.methodmissing.com/feed/atom.xml
http://redhanded.hobix.com/index.xml
http://feeds.feedburner.com/RubyOnRailsSecurity
http://feeds.feedburner.com/RidingRails
http://feeds.feedburner.com/ruby_is_awesome
http://feeds.feedburner.com/rubypond/JXRc
http://www.rubycorner.com/feeds/updated/rss20
http://rubyonwindows.blogspot.com/feeds/posts/default
http://rubyquiz.com/index.rss
http://ruby.tie-rack.org/feed/
http://www.rubyhead.com/feed/
http://feeds.feedburner.com/37signals/beMH
http://feeds.feedburner.com/SimplisticComplexity
http://feeds.feedburner.com/slash7/rss
http://smartic.us/feed/atom.xml
http://cuttingtheredtape.blogspot.com/feeds/posts/default
http://feeds.feedburner.com/sneer/blog
http://www.spacebabies.nl/feed/
http://feeds.feedburner.com/cleanair
http://richkilmer.blogs.com/ether/atom.xml
http://tenderlovemaking.com/feed/
http://feeds.feedburner.com/terralien-ships-log
http://feeds.feedburner.com/ZenAndTheArtOfRubyProgramming
http://feeds.feedburner.com/rufytech
http://feeds.feedburner.com/WorkingWithRails
http://www.yup.com/xml/atom10/feed.xml
http://feeds.feedburner.com/StakeVentures
http://metaclass.org/feed/atom.xml
http://feeds.feedburner.com/therailsist
http://www.onestepback.org/index.cgi/index.rss
http://merbist.com/feed/
http://feeds.feedburner.com/kevingc
http://feeds.feedburner.com/nuttnet/qWLn
http://brontemedia.com/feed/
http://feeds.feedburner.com/brynary
http://feeds.feedburner.com/AmitGuptasWeblog
http://brighter.net/rss
http://feeds.feedburner.com/gilesbowkett
http://feeds.feedburner.com/hasmanythrough
http://probablycorey.wordpress.com/feed/
http://feeds.feedburner.com/innonate
http://feeds.feedburner.com/Kungpowthinking
http://nikocunningham.blogspot.com/feeds/posts/default
http://www.notsostupid.com/feed/
http://feeds.feedburner.com/slash7/rss
http://feeds.feedburner.com/ThirdRail
http://feeds.feedburner.com/trottercashion
http://weblogs.java.net/blog/arungupta/index.rdf
http://fabiokung.com/feed/
http://feeds.feedburner.com/nicksieger
http://metaclass.org/feed/atom.xml
http://fhwang.net/syndicate/ruby.atom
http://ola-bini.blogspot.com/atom.xml
http://irthoughts.wordpress.com/feed/
http://www.zedshaw.com/feed.atom
http://feeds.feedburner.com/devthatweb
http://ross.typepad.com/blog/atom.xml
http://www.pbs.org/cringely/pulpit/rss2.xml
http://rss.slashdot.org/slashdot/eqWf
http://www.zedshaw.com/feed.atom
http://www.scripting.com/rss.xml
http://www.techmeme.com/index.xml
http://feeds.feedburner.com/LinkBuildingBlog
http://feeds.feedburner.com/AmitGuptasWeblog
http://codesnipers.com/?q=node/feed
http://feeds.feedburner.com/hermanshead76
http://www.joelonsoftware.com/rss.xml
http://feeds.feedburner.com/InformationArbitrage
http://paulgraham.infogami.com/blog/atom.xml
http://feeds.feedburner.com/startupping
http://feeds.feedburner.com/NewYorkSmallBusinessLaw
http://www.userscape.com/blog/index.php/site/rss_2.0//rss2/
http://www.alistapart.com/feed/rss.xml
http://lsvp.wordpress.com/feed/
http://www.microisv.com/feed/
http://feeds.b5media.com/b5media/StartupSpark
http://pragmatictheory.blogspot.com/feeds/posts/default
http://fiveyearstoolate.wordpress.com/feed/
http://feeds.feedburner.com/blogniscient
http://feeds.feedburner.com/Feedblog
http://feeds.feedburner.com/Tailrank
http://www.regator.com/blog/?feed=rss2
http://feeds.feedburner.com/Spinn3r
http://blog.spotback.com/feed/
http://irthoughts.wordpress.com/feed/
4 changes: 4 additions & 0 deletions spec/spec_helper.rb
Expand Up @@ -25,6 +25,10 @@ def sample_rdf_entry_content
File.read("#{File.dirname(__FILE__)}/sample_feeds/HREFConsideredHarmfulFirstEntry.xml")
end

# Returns the contents of the SamHarrisAuthorPhilosopherEssayistAtheist.xml
# fixture stored in the sample_feeds directory next to this file.
def sample_rss_feed_burner_feed
  fixture_path = "#{File.dirname(__FILE__)}/sample_feeds/SamHarrisAuthorPhilosopherEssayistAtheist.xml"
  File.read(fixture_path)
end

# Returns the contents of the TenderLovemaking.xml fixture stored in the
# sample_feeds directory next to this file.
def sample_rss_feed
  fixture_path = "#{File.dirname(__FILE__)}/sample_feeds/TenderLovemaking.xml"
  File.read(fixture_path)
end
Expand Down

0 comments on commit 65b63d0

Please sign in to comment.