# Usage: bundle exec ruby scraper.rb
#
# Environment variables:
#
# - MORPH_GOOGLE_API_KEY: Google Maps API key
# - MORPH_PROXY: proxy to make requests through, in the format of 'HOST:PORT'
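#
# Example invocation (the proxy host here is hypothetical):
#
#   MORPH_PROXY=proxy.example.com:8123 bundle exec ruby scraper.rb
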
require 'scraperwiki'
require 'nokogiri'
require 'mechanize'
require 'geokit'
require 'pry'
require 'active_support'
require 'active_support/core_ext'
require 'reverse_markdown'

# Set an API key if provided
Geokit::Geocoders::GoogleGeocoder.api_key = ENV['MORPH_GOOGLE_API_KEY'] if ENV['MORPH_GOOGLE_API_KEY']
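
# Normalise whitespace: convert all unicode whitespace (newlines, non-breaking
# spaces, etc.) to plain spaces, then trim. Illustrative example:
#
#   scrub("Court:\u00A0Adelaide ") #=> "Court: Adelaide"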
def scrub(text)
text.gsub!(/[[:space:]]/, ' ') # convert all utf whitespace to simple space
text.strip
end
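
# Fetch a URL with Mechanize, optionally via the MORPH_PROXY proxy.
# Failed requests are retried up to 9 times before the script exits.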
def get(url)
@agent ||= Mechanize.new
@agent.user_agent_alias = 'Windows Firefox'
if ENV['MORPH_PROXY']
host, port = ENV['MORPH_PROXY'].split(':')
@agent.set_proxy host, port
puts "Using proxy for request to #{url}"
end
@agent.open_timeout = 60
@agent.read_timeout = 60
retry_count = 0
begin
page = @agent.get(url)
rescue => e
puts "Error when fetching #{url}: #{e}"
if (retry_count += 1) < 10
puts "Retrying"
retry
else
puts "Failed too many times. Exiting."
exit 1
end
end
page
end

# This attempts to solve a complicated problem where the information is spread
# across multiple elements. It collects all the elements up to the next
# "header" (a strong element), then converts them all to text.
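#
# For example, given markup shaped roughly like this (illustrative only, not
# taken from the real page):
#
#   <strong>Date of offence:</strong> 1 January 2015<br/>
#   and 2 February 2015
#
# the method collects both lines into a single "date of offence" value.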
def extract_multiline(name, page, opts={})
options = { :scrub => false, :markdown => false }.merge(opts)
start_el = page.find {|e| e.text =~ /#{name}/i}
els = [start_el]
current = start_el.next
until current.children.find {|c| c.name == 'strong'} do
els << current
current = current.next
end
if options[:markdown]
html = els[1..-1].map(&:to_s).join
ReverseMarkdown.convert(html)
else
text = els.map(&:text).join
standalone = text[/#{name}\**:[[:space:]](.*)/im, 1]
options[:scrub] ? scrub(standalone) : standalone.strip
end
end
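
# Pull the individual fields for a single prosecution out of its run of
# page elements. Returns nil if no address can be found.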
def extract_attrs(page)
attrs = {}
# Address of business
attrs['address'] = extract_multiline('address of business', page)
return nil if attrs['address'].blank?
# Trading name
text = page.find {|e| e.text =~ /trading name/i}.text
attrs['trading_name'] = text[/^trading name\*:.(.*)/i, 1]
# Name of convicted
attrs['name_of_convicted'] = extract_multiline('name of convicted', page)
# Date of offence
attrs['date_of_offence'] = extract_multiline('date of offence', page)
# Nature and circumstances of offence
attrs['offence_nature'] = extract_multiline('nature and circumstances of offence', page, :markdown => true)
# Court decision date
text = page.find {|e| e.text =~ /court decision date/i}.text
attrs['court_decision_date'] = text[/^court decision date:.(.*)/i, 1]
# Court
text = page.find {|e| e.text =~ /court:/i}.text
text = scrub(text).strip
attrs['court'] = text[/^court:.(.*)/i, 1]
# Prosecution brought by
text = page.find {|e| e.text =~ /prosecution brought by/i}.text
attrs['prosecution_brought_by'] = text[/^prosecution brought by:.(.*)/i, 1]
# Fine
text = page.find {|e| e.text =~ /fine:/i}.text
attrs['fine'] = text[/^(\d*\..)*fine:.(.*)/i, 2]
# Prosecution Costs
# Optional. Not all prosecutions have these.
if el = page.find {|e| e.text =~ /prosecution costs:/i}
text = el.text
attrs['prosecution_costs'] = text[/^prosecution costs:.(.*)/i, 1]
end
# Victim of Crime Levy
# 'victim' appears in both singular and pluralised forms, so match on both
text = page.find {|e| e.text =~ /victims* of crime( levy)*:*/i}.text
attrs['victims_of_crime_levy'] = text[/^victims* of crime( levy)*:*.(.*)/i, 2]
# Total Penalty
text = page.find {|e| e.text =~ /total( penalty)*:/i}.text
attrs['total_penalty'] = text[/total([[:space:]]penalty)*:[[:space:]]*(.*)/i, 2]
# Comments
# Optional. Not all prosecutions have these.
if el = page.find {|e| e.text =~ /comments:/i}
text = el.text
attrs['comments'] = text[/^comments:.(.*)/i, 1]
end
attrs
end
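
# Collect the anchor id and permalink for each prosecution listed on the
# register page. Illustrative shape of the return value (ids are assumptions):
#
#   [{ 'id' => 'some-anchor-id', 'link' => base + '#some-anchor-id' }, ...]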
def extract_ids(page)
page.search('h3 a').map {|a|
{
'id' => a['id'],
'link' => base + '#' + a['id'],
}
}
end
def build_prosecution(attrs, page)
doc = Nokogiri::HTML(page.body) {|c| c.noblanks}
elements = doc.search('div.wysiwyg').first.children
header = elements.search("//a[@id='#{attrs['id']}']").first.parent
els = [header]
current = header.next
until current.nil? || current.name == 'h3' do
els << current
current = current.next
end
if more_attrs = extract_attrs(els)
puts "Extracting #{more_attrs['address']}"
attrs.merge(more_attrs)
else
nil
end
end
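
# Geocode a notice's address with the Google geocoder, caching lookups so
# each distinct address is only geocoded once. Merges the location into the
# notice in place, e.g. (coordinates illustrative):
#
#   notice.merge!('lat' => -34.9285, 'lng' => 138.6007)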
def geocode(notice)
@addresses ||= {}
address = notice['address']
if @addresses[address]
puts "Geocoding [cache hit] #{address}"
location = @addresses[address]
else
puts "Geocoding #{address}"
a = Geokit::Geocoders::GoogleGeocoder.geocode(address)
location = {
'lat' => a.lat,
'lng' => a.lng,
}
@addresses[address] = location
end
notice.merge!(location)
end
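
# The SA Health food prosecution register, the page this scraper works from.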
def base
'http://www.sahealth.sa.gov.au/wps/wcm/connect/public+content/sa+health+internet/about+us/legislation/food+legislation/food+prosecution+register'
end
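
# Links of records already saved in the local database, so only new
# prosecutions are scraped. Returns [] when the table doesn't exist yet
# (i.e. on the very first run).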
def existing_record_ids
return @cached if @cached
@cached = ScraperWiki.select('link from data').map {|r| r['link']}
rescue SqliteMagic::NoSuchTable
[]
end
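
# Fetch the register, work out which prosecutions are new, extract and
# geocode each one, then save them to the local SQLite store.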
def main
page = get(base)
prosecutions = extract_ids(page)
puts "### Found #{prosecutions.size} prosecutions"
new_prosecutions = prosecutions.select {|r| !existing_record_ids.include?(r['link']) }
puts "### There are #{new_prosecutions.size} new prosecutions"
new_prosecutions.map! {|p| build_prosecution(p, page) }.compact!
new_prosecutions.map! {|p| geocode(p) }
# Serialise
ScraperWiki.save_sqlite(['link'], new_prosecutions)
puts "Done"
end

main()