Skip to content

Commit

Permalink
Initial commit of scraping beauty
Browse files Browse the repository at this point in the history
  • Loading branch information
adrianpike committed Oct 26, 2011
1 parent b433883 commit 76b4f69
Show file tree
Hide file tree
Showing 9 changed files with 259 additions and 25 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.rvmrc

# rcov generated
coverage

Expand Down
9 changes: 5 additions & 4 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
source "http://rubygems.org"
# Add dependencies required to use your gem here.
# Example:
# gem "activesupport", ">= 2.3.5"

gem "mechanize"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
gem "shoulda", ">= 0"
gem "sham_rack", ">= 0"
gem "sinatra" # For sham_rack magic!
gem "rspec", ">= 0"
gem "bundler", "~> 1.0.0"
gem "jeweler", "~> 1.6.4"
gem "rcov", ">= 0"
Expand Down
51 changes: 51 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
GEM
remote: http://rubygems.org/
specs:
diff-lcs (1.1.3)
git (1.2.5)
jeweler (1.6.4)
bundler (~> 1.0)
git (>= 1.2.5)
rake
mechanize (2.0.1)
net-http-digest_auth (~> 1.1, >= 1.1.1)
net-http-persistent (~> 1.8)
nokogiri (~> 1.4)
webrobots (~> 0.0, >= 0.0.9)
net-http-digest_auth (1.1.1)
net-http-persistent (1.9)
nokogiri (1.5.0)
rack (1.3.5)
rack-protection (1.1.4)
rack
rake (0.9.2.2)
rcov (0.9.11)
rspec (2.7.0)
rspec-core (~> 2.7.0)
rspec-expectations (~> 2.7.0)
rspec-mocks (~> 2.7.0)
rspec-core (2.7.1)
rspec-expectations (2.7.0)
diff-lcs (~> 1.1.2)
rspec-mocks (2.7.0)
sham_rack (1.3.3)
rack
sinatra (1.3.1)
rack (~> 1.3, >= 1.3.4)
rack-protection (~> 1.1, >= 1.1.2)
tilt (~> 1.3, >= 1.3.3)
tilt (1.3.3)
webrobots (0.0.12)
nokogiri (>= 1.4.4)

PLATFORMS
ruby

DEPENDENCIES
bundler (~> 1.0.0)
jeweler (~> 1.6.4)
mechanize
rcov
rspec
sham_rack
sinatra
38 changes: 37 additions & 1 deletion README.rdoc
Original file line number Diff line number Diff line change
@@ -1,6 +1,42 @@
= scrapie

Description goes here.
Hey, it's Scrapie! It's 2011, we should be able to scrape sites for their juicy data in a delicious fashion instead of having to hack something together every time.

It's basically a tool that allows you to really simply and quickly fab up a class that translates CSS selectors into attributes, and lets you specify your own translations on query params.

== Example

class Airplane < Scrapie
url 'http://registry.faa.gov/aircraftinquiry/NNum_Results.aspx'
params({
:n_number => 'NNumbertxt'
})
attributes({
'serial_number' => 'div#serial_number',
'classname' => '.class_name'
})
before_fetch do |agent|
# Do stuff with my agent, like log in or hax the gibson
end
after_fetch do |agent|
# Do more neatu stuff with my agent
end


# Other possibilities
http_method :get
agent_options({ :options_to_send_to_my_new_mechanize_agent => 'BE COOL MAN' })
end

a = Airplane.find(:n_number => '12345') # => Fetches http://registry.faa.gov/aircraftinquiry/NNum_Results.aspx?NNumbertxt=12345
a.serial_number # => 'a cool serial number'

== Todo

* Set up the callbacks
* Sanitize
* Refactor
* Make it cooler!

== Contributing to scrapie

Expand Down
15 changes: 8 additions & 7 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,20 @@ Jeweler::Tasks.new do |gem|
gem.name = "scrapie"
gem.homepage = "http://github.com/adrianpike/scrapie"
gem.license = "MIT"
gem.summary = %Q{TODO: one-line summary of your gem}
gem.description = %Q{TODO: longer description of your gem}
gem.summary = %Q{Scrapie scrapes things for great justice.}
gem.description = %Q{Scrapie is a tool that allows you to really simply and quickly fab up a class that translates CSS selectors into attributes, and lets you specify your own translations on query params. }
gem.email = "adrian@pikeapps.com"
gem.authors = ["Adrian Pike"]
# dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
test.libs << 'lib' << 'test'
test.pattern = 'test/**/test_*.rb'
test.verbose = true

require 'rspec'
require 'rspec/core/rake_task'
# Defines the `spec` rake task; it picks up every *_spec.rb file that sits
# directly inside the spec/ directory.
RSpec::Core::RakeTask.new('spec') do |spec_task|
  spec_task.pattern = 'spec/*_spec.rb'
  # spec_task.rspec_opts = ["--backtrace"]
end

require 'rcov/rcovtask'
Expand Down
55 changes: 55 additions & 0 deletions lib/scrapie.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
require 'mechanize'

# Scrapie is a small class-level DSL for turning a fetched page into a plain
# Ruby object. A subclass declares a URL, an optional mapping from friendly
# option names to the site's real query parameter names, and a mapping from
# attribute names to CSS selectors. `find` then fetches the page with a
# Mechanize agent and fills one accessor per declared attribute.
#
# All configuration is stored in class-level instance variables, so every
# subclass carries its own settings.
class Scrapie
  # Base class for errors raised by Scrapie itself. It derives from
  # StandardError (rather than Exception) so that a plain `rescue` catches it
  # and it is never confused with signals or interpreter exits.
  class ScrapieException < StandardError; end

  # Raised by `find` when the class never declared any attributes.
  class NoAttributesException < ScrapieException; end

  # Declares the URL that `find` will request.
  def self.url(url)
    @url = url
  end

  # Declares the option-name => remote-parameter-name mapping used by `find`.
  def self.params(params)
    @params = params
  end

  # Declares which agent method performs the request (e.g. :get or :post).
  # When never called, `find` falls back to :get.
  def self.http_method(method)
    @http_method = method
  end

  # Declares the attribute-name => CSS-selector mapping and defines a
  # reader/writer pair on the class for every attribute name.
  def self.attributes(attributes)
    @attributes = attributes
    attributes.each { |name, _page_selector| attr_accessor name }
  end

  # Fetches the configured URL and returns a new instance whose attributes
  # hold the inner HTML of the nodes matched by their selectors.
  #
  #   find()
  #   find(:foo => bar)
  #   find(:foo => bar, :baz => bizzle)
  #
  # opts - Hash keyed by the friendly names declared via `params`. Keys with
  #        no declared mapping are ignored instead of being sent.
  #
  # Raises NoAttributesException when no attributes were declared. Errors
  # raised by the agent while fetching are not rescued here.
  def self.find(opts = {})
    raise NoAttributesException if @attributes.nil? || @attributes.empty?

    agent = Mechanize.new
    @before_fetch.call(agent) if @before_fetch
    page = agent.send(@http_method || :get, @url, build_request_params(opts))
    @after_fetch.call(agent) if @after_fetch

    record = new
    @attributes.each do |name, page_selector|
      # Interpolation (not String#+) so Symbol attribute names work as well.
      record.send("#{name}=", page.search(page_selector).inner_html)
    end
    record
  end

  # Callbacks

  # Registers a block that `find` calls with the agent just before the
  # request is made. Called without a block it returns the current callback
  # (nil when none was registered).
  def self.before_fetch(&block)
    @before_fetch = block if block
    @before_fetch
  end

  # Registers a block that `find` calls with the agent right after the
  # request returned. Called without a block it returns the current callback
  # (nil when none was registered).
  def self.after_fetch(&block)
    @after_fetch = block if block
    @after_fetch
  end

  # Translates caller-supplied options into the remote parameter names
  # declared via `params`, dropping every option that has no mapping. The
  # hash is built pair by pair so an unmapped option can never produce a nil
  # entry.
  def self.build_request_params(opts)
    mapping = @params || {}
    opts.inject({}) do |query, (key, value)|
      remote_name = mapping[key]
      query[remote_name] = value if remote_name
      query
    end
  end
  private_class_method :build_request_params
end
9 changes: 3 additions & 6 deletions test/helper.rb → spec/helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,9 @@
$stderr.puts "Run `bundle install` to install missing gems"
exit e.status_code
end
require 'test/unit'
require 'shoulda'
require 'rspec'
require 'sham_rack'

$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'scrapie'

class Test::Unit::TestCase
end
require 'scrapie'
98 changes: 98 additions & 0 deletions spec/scrapie_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
require 'helper'

# Fixture web app used by the specs below, registered through ShamRack under
# the host name "scrapietest" (presumably so requests to that host never
# leave the process; confirm against the sham_rack docs).
ShamRack.at("scrapietest").sinatra do
  # Page without any markup worth scraping.
  get("/test1") { "No attributes here chief" }

  # Echoes the query parameter back, once verbatim and once upcased.
  get "/test_with_params" do
    echoed = params[:test_param_for_getting]
    "<div id='param'>#{echoed}</div><div id='param_upcased'>#{echoed.upcase}</div>"
  end

  # Minimal page with a single element for the basic fetch spec.
  get("/test") { "<div class='foo'>example</div>" }

  # DERP is not defined anywhere in this file, so evaluating it raises and
  # the route fails; the '500' spec relies on that failure.
  get('/500') { DERP }

  # Echoes the posted form field back inside a div.
  post('/post') { "<div id='post_param'>#{params[:le_post]}</div>" }
end

# Scraper that declares a URL but no attributes at all.
class NoAttributeScrapie < Scrapie
  url('http://scrapietest/test1')
end

# Simplest useful scraper: one attribute, no query parameters.
class BasicScrapie < Scrapie
  url 'http://scrapietest/test'
  attributes 'foo' => '.foo'
end

# Scraper that renames the :test_param option to the query parameter the
# fixture page reads, and scrapes both echoed values.
class ParamsScrapie < Scrapie
  url 'http://scrapietest/test_with_params'
  params :test_param => 'test_param_for_getting'
  attributes 'param' => 'div#param',
             'param_upcased' => 'div#param_upcased'
end

# Scraper aimed at a path the fixture app does not define.
class FourOhFourScrapie < Scrapie
  url 'http://scrapietest/ends_of_the_earth'
  attributes 'results' => 'div#post_param'
end

# Scraper aimed at the fixture route whose handler raises.
class FiveHundredScrapie < Scrapie
  url 'http://scrapietest/500'
  attributes 'results' => 'div#post_param'
end

# Scraper that submits its parameters with POST instead of the default GET.
class PostScrapie < Scrapie
  url 'http://scrapietest/post'
  http_method :post

  params :search => 'le_post'
  attributes 'results' => 'div#post_param'
end

# Behaviour specs for Scrapie, run against the fixture scrapers and the
# "scrapietest" fixture app defined earlier in this file.
describe Scrapie do

  it 'whines if you don\'t specify any attributes' do
    expect {
      NoAttributeScrapie.find(:har => 'heh')
    }.to raise_error(Scrapie::NoAttributesException)
  end

  it 'does a basic fetch sans params' do
    BasicScrapie.find.foo.should eq('example')
  end

  it 'handles params' do
    sent_value = 'sdkfjhdsafjkladhfklzxcv123' # todo: random string

    scraped = ParamsScrapie.find(:test_param => sent_value)

    scraped.param.should eq(sent_value)
    scraped.param_upcased.should eq(sent_value.upcase)
  end

  # Failed responses are expected to surface as Mechanize's own error class.
  it 'handles 404s' do
    expect {
      FourOhFourScrapie.find(:har => 'heh')
    }.to raise_error(Mechanize::ResponseCodeError)
  end

  it 'handles 500s' do
    expect {
      FiveHundredScrapie.find(:har => 'heh')
    }.to raise_error(Mechanize::ResponseCodeError)
  end

  it 'uses different HTTP methods' do
    PostScrapie.find(:search => 'le_search').results.should eq('le_search')
  end

  # Examples without a body: placeholders for behaviour not specified yet.
  it 'uses a before_fetch'
  it 'uses an after_fetch'
  it 'sets agent options'

end
7 changes: 0 additions & 7 deletions test/test_scrapie.rb

This file was deleted.

0 comments on commit 76b4f69

Please sign in to comment.