Skip to content

Commit

Permalink
First pass
Browse files Browse the repository at this point in the history
  • Loading branch information
auxesis committed Jan 22, 2017
0 parents commit db516b2
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
# Ignore output of scraper
data.sqlite
8 changes: 8 additions & 0 deletions Gemfile
@@ -0,0 +1,8 @@
# Ruby version this scraper is pinned to (matches Gemfile.lock's RUBY VERSION).
ruby '2.3.1'
source 'https://rubygems.org'

# OpenAustralia fork of scraperwiki on the morph_defaults branch —
# presumably carries morph.io-friendly defaults for the sqlite output; confirm upstream.
gem 'scraperwiki', git: 'https://github.com/openaustralia/scraperwiki-ruby.git', branch: 'morph_defaults'
gem 'pry'                # required by scraper.rb (debugging REPL)
gem 'activesupport'      # NOTE(review): not required by scraper.rb yet
gem 'reverse_markdown'   # NOTE(review): not required by scraper.rb yet
gem 'mechanize'          # HTTP client + HTML parsing used by scraper.rb
79 changes: 79 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,79 @@
GIT
remote: https://github.com/openaustralia/scraperwiki-ruby.git
revision: fc50176812505e463077d5c673d504a6a234aa78
branch: morph_defaults
specs:
scraperwiki (3.0.1)
httpclient
sqlite_magic

GEM
remote: https://rubygems.org/
specs:
activesupport (4.2.6)
i18n (~> 0.7)
json (~> 1.7, >= 1.7.7)
minitest (~> 5.1)
thread_safe (~> 0.3, >= 0.3.4)
tzinfo (~> 1.1)
coderay (1.1.0)
domain_name (0.5.20161021)
unf (>= 0.0.5, < 1.0.0)
http-cookie (1.0.3)
domain_name (~> 0.5)
httpclient (2.6.0.1)
i18n (0.7.0)
json (1.8.3)
mechanize (2.7.5)
domain_name (~> 0.5, >= 0.5.1)
http-cookie (~> 1.0)
mime-types (>= 1.17.2)
net-http-digest_auth (~> 1.1, >= 1.1.1)
net-http-persistent (~> 2.5, >= 2.5.2)
nokogiri (~> 1.6)
ntlm-http (~> 0.1, >= 0.1.1)
webrobots (>= 0.0.9, < 0.2)
method_source (0.8.2)
mime-types (3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2016.0521)
mini_portile2 (2.1.0)
minitest (5.8.4)
net-http-digest_auth (1.4)
net-http-persistent (2.9.4)
nokogiri (1.7.0.1)
mini_portile2 (~> 2.1.0)
ntlm-http (0.1.1)
pry (0.10.3)
coderay (~> 1.1.0)
method_source (~> 0.8.1)
slop (~> 3.4)
reverse_markdown (1.0.3)
nokogiri
slop (3.6.0)
sqlite3 (1.3.10)
sqlite_magic (0.0.3)
sqlite3
thread_safe (0.3.5)
tzinfo (1.2.2)
thread_safe (~> 0.1)
unf (0.1.4)
unf_ext
unf_ext (0.0.7.2)
webrobots (0.1.2)

PLATFORMS
ruby

DEPENDENCIES
activesupport
mechanize
pry
reverse_markdown
scraperwiki!

RUBY VERSION
ruby 2.3.1p112

BUNDLED WITH
1.13.6
44 changes: 44 additions & 0 deletions scraper.rb
@@ -0,0 +1,44 @@
require 'scraperwiki'
require 'mechanize'
require 'pry'

# Fetches +url+ and returns the resulting Mechanize page.
# A single Mechanize agent is created lazily and memoised in @agent,
# so every call in this process reuses the same agent instance.
def get(url)
  @agent = Mechanize.new if @agent.nil?
  @agent.get(url)
end

# Turns a PetRescue search-results page into an array of record hashes,
# one per dog listing. Keys are strings so they map directly onto the
# SQLite columns written by ScraperWiki.save_sqlite.
def extract_listings(page)
  page.search('li.dog-listing.listing').map do |listing|
    # Relative link on the listing's heading anchor, e.g. "/listings/123".
    href = listing.search('h4 a').first['href']
    record = {}
    record['name'] = listing.search('h4').text
    record['description'] = listing.search('div.personality').text
    record['gender'] = listing.search('dd.gender').first.text.downcase
    record['breed'] = listing.search('dd.breed').first.text
    record['link'] = 'https://www.petrescue.com.au' + href
    record
  end
end

# Retrieves one listings page at +url+ and returns the animal records
# extracted from it.
def fetch_animals(url)
  extract_listings(get(url))
end

# Scrapes every page of PetRescue's dog search results and saves all
# records to the local SQLite store via ScraperWiki, keyed on 'link'
# so re-runs upsert rather than duplicate rows.
def main
  page = get(listings_url(1))
  # The pagination info element's text ends with the total page count
  # (its last whitespace-separated word) — assumes "... of N" wording;
  # TODO confirm against the live markup.
  max = page.search('#main > article > div.pagination.footer-pagination > nav > div.info').first.text.split.last.to_i

  animals = (1..max).flat_map do |i|
    puts "### Fetching page #{i} of #{max}"
    fetch_animals(listings_url(i))
  end

  puts "### Saving #{animals.size} records"

  ScraperWiki.save_sqlite(%w(link), animals)
end

# Builds the PetRescue dog-search URL for the given results +page+ number.
# Kept in one place so the long query string isn't duplicated.
def listings_url(page)
  "https://www.petrescue.com.au/listings/dogs?age=either&commit=Search&gender=either&page=#{page}&postcode=&postcode_distance=50&size%5B%5D=all&species=dog&states%5B%5D=1&utf8=%E2%9C%93"
end

# Run only when executed as a script, so the file can be required cleanly.
main if $PROGRAM_NAME == __FILE__

0 comments on commit db516b2

Please sign in to comment.