Commit

First pass at scraper
auxesis committed Nov 5, 2016
1 parent bd60b88 commit c091a8e
Showing 4 changed files with 178 additions and 59 deletions.
1 change: 1 addition & 0 deletions .ruby-version
@@ -0,0 +1 @@
+2.3.1
17 changes: 8 additions & 9 deletions Gemfile
@@ -1,10 +1,9 @@
# It's easy to add more libraries or choose different versions. Any libraries
# specified here will be installed and made available to your morph.io scraper.
# Find out more: https://morph.io/documentation/ruby
+ruby '2.3.1'
+source 'https://rubygems.org'

-source "https://rubygems.org"
-
-ruby "2.0.0"
-
-gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
-gem "mechanize"
+gem 'scraperwiki', git: 'https://github.com/openaustralia/scraperwiki-ruby.git', branch: 'morph_defaults'
+gem 'pry'
+gem 'geokit'
+gem 'activesupport'
+gem 'reverse_markdown'
+gem 'spreadsheet'
63 changes: 38 additions & 25 deletions Gemfile.lock
@@ -10,38 +10,51 @@ GIT
GEM
  remote: https://rubygems.org/
  specs:
-    domain_name (0.5.24)
-      unf (>= 0.0.5, < 1.0.0)
-    http-cookie (1.0.2)
-      domain_name (~> 0.5)
+    activesupport (5.0.0.1)
+      concurrent-ruby (~> 1.0, >= 1.0.2)
+      i18n (~> 0.7)
+      minitest (~> 5.1)
+      tzinfo (~> 1.1)
+    coderay (1.1.1)
+    concurrent-ruby (1.0.2)
+    geokit (1.10.0)
-    httpclient (2.6.0.1)
-    mechanize (2.7.3)
-      domain_name (~> 0.5, >= 0.5.1)
-      http-cookie (~> 1.0)
-      mime-types (~> 2.0)
-      net-http-digest_auth (~> 1.1, >= 1.1.1)
-      net-http-persistent (~> 2.5, >= 2.5.2)
-      nokogiri (~> 1.4)
-      ntlm-http (~> 0.1, >= 0.1.1)
-      webrobots (>= 0.0.9, < 0.2)
-    mime-types (2.5)
-    mini_portile (0.6.2)
-    net-http-digest_auth (1.4)
-    net-http-persistent (2.9.4)
-    nokogiri (1.6.6.2)
-      mini_portile (~> 0.6.0)
-    ntlm-http (0.1.1)
+    i18n (0.7.0)
+    method_source (0.8.2)
+    mini_portile2 (2.1.0)
+    minitest (5.9.1)
+    nokogiri (1.6.8.1)
+      mini_portile2 (~> 2.1.0)
+    pry (0.10.4)
+      coderay (~> 1.1.0)
+      method_source (~> 0.8.1)
+      slop (~> 3.4)
+    reverse_markdown (1.0.3)
+      nokogiri
+    ruby-ole (1.2.12)
+    slop (3.6.0)
+    spreadsheet (1.1.3)
+      ruby-ole (>= 1.0)
    sqlite3 (1.3.10)
    sqlite_magic (0.0.3)
      sqlite3
-    unf (0.1.4)
-      unf_ext
-    unf_ext (0.0.7.1)
-    webrobots (0.1.1)
+    thread_safe (0.3.5)
+    tzinfo (1.2.2)
+      thread_safe (~> 0.1)

PLATFORMS
ruby

DEPENDENCIES
-  mechanize
+  activesupport
+  geokit
+  pry
+  reverse_markdown
   scraperwiki!
+  spreadsheet

+RUBY VERSION
+   ruby 2.3.1p112
+
+BUNDLED WITH
+   1.13.6
156 changes: 131 additions & 25 deletions scraper.rb
@@ -1,25 +1,131 @@
# This is a template for a Ruby scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# require 'scraperwiki'
# require 'mechanize'
#
# agent = Mechanize.new
#
# # Read in a page
# page = agent.get("http://foo.com")
#
# # Find something on the page using CSS selectors
# p page.at('div.content')
#
# # Write out to the SQLite database using the scraperwiki library
# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
#
# # An arbitrary query against the database
# ScraperWiki.select("* from data where name = 'peter'")

# You don't have to do things with the Mechanize or ScraperWiki libraries.
# You can use whatever gems you want: https://morph.io/documentation/ruby
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory, and that it has at
# least a table called "data".
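#
# For instance, a minimal sketch of that contract using the sqlite3 gem
# directly (hypothetical; this scraper uses the scraperwiki gem instead):
#
#   require 'sqlite3'
#   db = SQLite3::Database.new('data.sqlite')
#   db.execute('CREATE TABLE IF NOT EXISTS data (name TEXT)')
#   db.execute("INSERT INTO data (name) VALUES ('susan')")
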
require 'scraperwiki'
require 'geokit'
require 'spreadsheet'
require 'pry'
require 'open-uri'   # provides Kernel#open for URLs, used in fetch_prosecutions
require 'digest/md5' # used by String#to_md5
require 'date'       # Date and DateTime, used in scrub_date

# Set an API key if provided. morph.io exposes secrets as environment
# variables prefixed with MORPH_.
Geokit::Geocoders::GoogleGeocoder.api_key = ENV['MORPH_GOOGLE_API_KEY'] if ENV['MORPH_GOOGLE_API_KEY']

# Map of spreadsheet column index (zero-based) to output field name.
# Column 0 has no mapping, so build_prosecution skips it.
@mappings = {
  1  => 'food_business_operator',
  2  => 'trading_name',
  3  => 'defendant',
  4  => 'address',
  5  => 'postown',
  6  => 'county',
  7  => 'postcode',
  8  => 'offence_category',
  9  => 'offence_provision',
  10 => 'contravention_in_eu_regulations',
  11 => 'nature_of_offence',
  12 => 'date_of_conviction',
  13 => 'conviction_or_guilty_plea',
  14 => 'court_name',
  15 => 'region',
  16 => 'sentence',
  17 => 'costs_awarded',
  18 => 'prosecution_authority'
}

# Convenience helper for fingerprinting strings.
class String
  def to_md5
    Digest::MD5.hexdigest(self)
  end
end
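
# e.g. 'abc'.to_md5 #=> "900150983cd24fb0d6963f7d28e17f72"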

# Normalise the spreadsheet's assorted date representations to a Date.
# DateTime must be tested before Date, because DateTime is a subclass.
def scrub_date(value)
  case value
  when DateTime then value.to_date
  when Date     then value
  when String   then Date.parse(value)
  else
    puts "[debug] Unhandled date: #{value.inspect}"
    nil
  end
end
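
# e.g. scrub_date('5 June 2016') #=> #<Date: 2016-06-05>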

# Build a deterministic id by hashing all of a record's values.
def generate_id(details)
  details.values.map(&:to_s).join(' ').to_md5
end
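
# Because the id is a pure function of the row's values, an unchanged row
# hashes to the same id on every run, which is what lets main skip records
# that have already been saved.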

# Convert a raw spreadsheet row into a prosecution hash, keyed per @mappings.
def build_prosecution(row)
  details = {}
  row.each_with_index do |value, index|
    key = @mappings[index]
    case key
    when nil
      next
    when 'date_of_conviction'
      value = scrub_date(value)
    else
      # Remove all leading and trailing whitespace
      value = value.strip if value.respond_to?(:strip)
    end
    details[key] = value
  end
  details['id'] = generate_id(details)
  details
end
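
# A built prosecution looks roughly like this (values illustrative):
#   { 'food_business_operator' => 'Acme Foods Ltd',
#     'trading_name'           => 'Acme',
#     ...
#     'date_of_conviction'     => #<Date: 2016-06-05>,
#     'id'                     => '0fe4cb27...' }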

# Geocode a prosecution's address, memoising results so repeated
# addresses only hit the geocoder once per run.
def geocode(prosecution)
  @addresses ||= {}

  address = [
    prosecution['address'],
    prosecution['county'],
    prosecution['postcode'],
  ].join(', ')

  if @addresses[address]
    puts "Geocoding [cache hit] #{address}"
    location = @addresses[address]
  else
    puts "Geocoding #{address}"
    a = Geokit::Geocoders::GoogleGeocoder.geocode(address)
    location = {
      'lat' => a.lat,
      'lng' => a.lng,
    }

    @addresses[address] = location
  end

  prosecution.merge!(location)
end
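
# If Google returns no match, Geokit's result has nil lat/lng, which is
# stored as-is; a stricter pass might check the result's success flag first.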

# Ids of records already in the database, cached for the life of the run.
def existing_record_ids
  return @cached if @cached
  @cached = ScraperWiki.select('id from data').map { |r| r['id'] }
rescue SqliteMagic::NoSuchTable
  # First run: the data table doesn't exist yet.
  @cached = []
end

# Download the FSA prosecution outcomes spreadsheet and return its data rows.
def fetch_prosecutions
  url = 'https://www.food.gov.uk/sites/default/files/prosecution-outcomes.xls'
  xls = open(url)

  sheet = Spreadsheet.open(xls).worksheet(0)
  # Rows 0-5 are preamble and column headings; the data starts at row 6.
  sheet.rows[6..-1]
end
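
# The sheet layout (six preamble rows, then columns in @mappings order) is
# an observation of the current FSA export and may change without notice.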

def main
  prosecutions = fetch_prosecutions
  prosecutions.map! { |p| build_prosecution(p) }

  puts "### Found #{prosecutions.size} prosecutions"
  new_prosecutions = prosecutions.reject { |p| existing_record_ids.include?(p['id']) }
  puts "### There are #{new_prosecutions.size} new prosecutions"
  new_prosecutions.map! { |p| geocode(p) }

  # Serialise
  ScraperWiki.save_sqlite(['id'], new_prosecutions)

  puts 'Done'
end

main
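
# Once run, the saved data can be inspected with the same scraperwiki
# helper, for example:
#
#   ScraperWiki.select('* from data limit 5')
#
# or with the sqlite3 command-line tool against data.sqlite.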
