Permalink
Browse files

Ported scrAPI to Ruby 1.9.3 (unfortunately 1.9.2 will not work because of a bug in Ruby itself).
  • Loading branch information...
1 parent becbe6e commit 22d49012a2020b8fdb0dd4723bf40526db066c72 Christoph Lupprich committed Nov 10, 2010
Showing with 43 additions and 39 deletions.
  1. +10 −5 README.rdoc
  2. +0 −1 Rakefile
  3. +4 −4 lib/scraper/base.rb
  4. +9 −9 lib/scraper/reader.rb
  5. +3 −3 scrapi.gemspec
  6. +1 −1 test/node_ext_test.rb
  7. +8 −8 test/reader_test.rb
  8. +7 −7 test/scraper_test.rb
  9. +1 −1 test/selector_test.rb
View
@@ -40,13 +40,16 @@ To get the latest source code with regular updates:
svn co http://labnotes.org/svn/public/ruby/scrapi
+== Version of Ruby
+
+Currently ScrAPI does not run with Ruby 1.9.2, but with the dev versions of Ruby 1.9.3. This is due to a bug in Ruby's visibility context handling (see changeset #29578 and bug #3406 on the official Ruby issue tracker). Using the most recent dev version of Ruby is easy with RVM (http://rvm.beginrescueend.com/).
== Using TIDY
-By default scrAPI uses Tidy to cleanup the HTML.
+By default scrAPI uses Tidy (actually Tidy-FFI) to cleanup the HTML.
You need to install the Tidy Gem for Ruby:
- gem install tidy
+ gem install tidy_ffi
And the Tidy binary libraries, available here:
@@ -56,15 +59,15 @@ By default scrAPI looks for the Tidy DLL (Windows) or shared library (Linux) in
Alternatively, just point Tidy to the library with:
- Tidy.path = "...."
+ TidyFFI.library_path = "...."
On Linux this would probably be:
- Tidy.path = "/usr/local/lib/libtidy.so"
+ TidyFFI.library_path = "/usr/local/lib/libtidy.so"
On OS/X this would probably be:
- Tidy.path = “/usr/lib/libtidy.dylib”
+ TidyFFI.library_path = "/usr/lib/libtidy.dylib"
For testing purposes, you can also use the built in HTML parser. It's useful for testing and getting to grips with scrAPI, but it doesn't deal well with broken HTML. So for testing only:
@@ -86,3 +89,5 @@ HTML DOM extracted from Rails, Copyright (c) 2004 David Heinemeier Hansson. Unde
HTML parser by Takahiro Maebashi and Katsuyuki Komatsu, Ruby license.
http://www.jin.gr.jp/~nahi/Ruby/html-parser/README.html
+
+Porting to Ruby 1.9.x by Christoph Lupprich, http://lupprich.info
View
@@ -1,6 +1,5 @@
require "benchmark"
require "rubygems"
-Gem::manage_gems
require "rake"
require "rake/testtask"
require "rake/rdoctask"
View
@@ -906,10 +906,10 @@ def request(url, options)
# end
def skip(elements = nil)
case elements
- when Array: @skip.concat elements
- when HTML::Node: @skip << elements
- when nil: @skip << true
- when true, false: @skip << elements
+ when Array then @skip.concat elements
+ when HTML::Node then @skip << elements
+ when nil then @skip << true
+ when true, false then @skip << elements
end
# Calling skip(element) as the last statement is
# redundant by design.
View
@@ -10,7 +10,7 @@
require "net/https"
begin
require "rubygems"
- require "tidy"
+ require "tidy_ffi"
rescue LoadError
end
@@ -95,6 +95,7 @@ def to_s
# * :redirect_limit -- Number of redirects allowed (default is 3).
# * :user_agent -- The User-Agent header to send.
# * :timeout -- HTTP open connection/read timeouts (in second).
+ # * :ssl_verify_mode -- SSL verification mode, defaults to OpenSSL::SSL::VERIFY_NONE
#
# It returns a hash with the following information:
# * :url -- The URL of the requested page (may change by permanent redirect)
@@ -123,6 +124,7 @@ def read_page(url, options = nil)
begin
http = Net::HTTP.new(uri.host, uri.port)
http.use_ssl = (uri.scheme == "https")
+ http.verify_mode = options[:ssl_verify_mode] || OpenSSL::SSL::VERIFY_NONE
http.close_on_empty_response = true
http.open_timeout = http.read_timeout = options[:http_timeout] || DEFAULT_TIMEOUT
path = uri.path.dup # required so we don't modify path
@@ -202,10 +204,8 @@ def parse_page(content, encoding = nil, options = nil, parser = :tidy)
find_tidy
options = (options || {}).update(TIDY_OPTIONS)
options[:input_encoding] = encoding.gsub("-", "").downcase
- document = Tidy.open(options) do |tidy|
- html = tidy.clean(content)
- HTML::Document.new(html).find(:tag=>"html")
- end
+ html = TidyFFI::Tidy.with_options(options).clean(content)
+ document = HTML::Document.new(html).find(:tag=>"html")
when :html_parser
document = HTML::HTMLParser.parse(content).root
else
@@ -223,14 +223,14 @@ def parse_page(content, encoding = nil, options = nil, parser = :tidy)
module_function
def find_tidy()
- return if Tidy.path
+ return if TidyFFI.library_path
begin
- Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.so")
+ TidyFFI.library_path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.so")
rescue LoadError
begin
- Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dll")
+ TidyFFI.library_path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dll")
rescue LoadError
- Tidy.path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dylib")
+ TidyFFI.library_path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dylib")
end
end
end
View
@@ -1,6 +1,6 @@
Gem::Specification.new do |spec|
spec.name = 'scrapi'
- spec.version = '1.2.1'
+ spec.version = '1.2.2'
spec.summary = "scrAPI toolkit for Ruby. Uses CSS selectors to write easy, maintainable HTML scraping rules."
spec.description = <<-EOF
scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
@@ -13,10 +13,10 @@ EOF
spec.files = Dir['{test,lib}/**/*', 'README.rdoc', 'CHANGELOG', 'Rakefile', 'MIT-LICENSE']
spec.require_path = 'lib'
spec.autorequire = 'scrapi.rb'
- spec.requirements << 'Tidy'
+ spec.requirements << 'Tidy_ffi'
spec.has_rdoc = true
spec.rdoc_options << '--main' << 'README.rdoc' << '--title' << "scrAPI toolkit for Ruby" << '--line-numbers'
spec.extra_rdoc_files = ['README.rdoc']
- spec.add_dependency 'tidy', '>=1.1.0'
+ spec.add_dependency 'tidy_ffi', '>=0.1.2'
end
View
@@ -7,7 +7,7 @@
require "rubygems"
require "test/unit"
-require File.join(File.dirname(__FILE__), "../lib", "scrapi")
+require "./lib/scrapi"
class NodeExtTest < Test::Unit::TestCase
View
@@ -12,8 +12,8 @@
require "webrick/https"
require "logger"
require "stringio"
-require File.join(File.dirname(__FILE__), "mock_net_http")
-require File.join(File.dirname(__FILE__), "../lib", "scrapi")
+require "./test/mock_net_http"
+require "./lib/scrapi"
class ReaderTest < Test::Unit::TestCase
@@ -239,38 +239,38 @@ def test_should_handle_encoding_correctly
# Test content encoding returned from HTTP server.
with_webrick do |server, params|
server.mount_proc "/test.html" do |req,resp|
- resp["Content-Type"] = "text/html; charset=my-encoding"
+ resp["Content-Type"] = "text/html; charset=ASCII"
resp.body = "Content comes here"
end
page = Reader.read_page(WEBRICK_TEST_URL)
page = Reader.parse_page(page.content, page.encoding)
- assert_equal "my-encoding", page.encoding
+ assert_equal "ASCII", page.encoding
end
# Test content encoding in HTML http-equiv header
# that overrides content encoding returned in HTTP.
with_webrick do |server, params|
server.mount_proc "/test.html" do |req,resp|
- resp["Content-Type"] = "text/html; charset=my-encoding"
+ resp["Content-Type"] = "text/html; charset=ASCII"
resp.body = %Q{
<html>
<head>
-<meta http-equiv="content-type" value="text/html; charset=other-encoding">
+<meta http-equiv="content-type" value="text/html; charset=UTF-8">
</head>
<body></body>
</html>
}
end
page = Reader.read_page(WEBRICK_TEST_URL)
page = Reader.parse_page(page.content, page.encoding)
- assert_equal "other-encoding", page.encoding
+ assert_equal "UTF-8", page.encoding
end
end
def test_should_support_https
begin
options = WEBRICK_OPTIONS.dup.update(
:SSLEnable=>true,
- :SSLVerifyClient => ::OpenSSL::SSL::VERIFY_NONE,
+ :SSLVerifyClient => OpenSSL::SSL::VERIFY_NONE,
:SSLCertName => [ ["C","JP"], ["O","WEBrick.Org"], ["CN", "WWW"] ]
)
server = WEBrick::HTTPServer.new(options)
View
@@ -8,8 +8,8 @@
require "rubygems"
require "time"
require "test/unit"
-require File.join(File.dirname(__FILE__), "mock_net_http")
-require File.join(File.dirname(__FILE__), "../lib", "scrapi")
+require "./test/mock_net_http"
+require "./lib/scrapi"
class ScraperTest < Test::Unit::TestCase
@@ -301,8 +301,8 @@ def test_skip_from_extractor
assert_equal "this", scraper.this2
scraper = new_scraper(html) do
- process "#1", :this1=>:text, :skip=>true do
- false
+ process "#1", :this1=>:text, :skip=>true do |element|
+ element
end
process "#1", :this2=>:text
end
@@ -351,7 +351,7 @@ def test_accessors
[response, <<-EOF
<html>
<head>
- <meta http-equiv="content-type" value="text/html; charset=other-encoding">
+ <meta http-equiv="content-type" value="text/html; charset=ASCII">
</head>
<body>
<div id="x"/>
@@ -371,7 +371,7 @@ def test_accessors
assert_equal "http://localhost/redirect", scraper.page_info.url.to_s
assert_equal time, scraper.page_info.last_modified
assert_equal "etag", scraper.page_info.etag
- assert_equal "other-encoding", scraper.page_info.encoding
+ assert_equal "ASCII", scraper.page_info.encoding
end
@@ -721,7 +721,7 @@ def test_prepare_and_result
# Extracting the attribute skips the second match.
scraper = new_scraper(DIVS123) do
process("div") { |element| @count +=1 }
- define_method(:prepare) { @count = 1 }
+ define_method(:prepare) { |element| @count = 1 }
define_method(:result) { @count }
end
result = scraper.scrape
View
@@ -4,7 +4,7 @@
# Developed for http://co.mments.com
# Code and documention: http://labnotes.org
-require File.join(File.dirname(__FILE__), "../lib", "scrapi")
+require "./lib/scrapi"
class SelectorTest < Test::Unit::TestCase

0 comments on commit 22d4901

Please sign in to comment.