scraper.rb

require 'scraperwiki'
require 'geokit'
require 'spreadsheet'
require 'open-uri'
require 'date'
require 'digest'
require 'pry'

# Set a Google geocoding API key if provided
Geokit::Geocoders::GoogleGeocoder.api_key = ENV['MORPH_GOOGLE_API_KEY'] if ENV['MORPH_GOOGLE_API_KEY']
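
# Maps 0-based spreadsheet column indices to field names. Column index 0 has
# no mapping, so build_prosecution skips it.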
@mappings = {
  1  => 'food_business_operator',
  2  => 'trading_name',
  3  => 'defendant',
  4  => 'address',
  5  => 'postown',
  6  => 'county',
  7  => 'postcode',
  8  => 'offence_category',
  9  => 'offence_provision',
  10 => 'contravention_in_eu_regulations',
  11 => 'nature_of_offence',
  12 => 'date_of_conviction',
  13 => 'conviction_or_guilty_plea',
  14 => 'court_name',
  15 => 'region',
  16 => 'sentence',
  17 => 'costs_awarded',
  18 => 'prosecution_authority'
}
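
# Convenience helper: hash any string into an MD5 hex digest.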
class String
  def to_md5
    Digest::MD5.new.hexdigest(self)
  end
end
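
# Normalise the conviction date, which may come out of the spreadsheet as a
# DateTime, Date or String depending on how the cell was stored.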
def scrub_date(value)
  case value
  when DateTime
    value.to_date
  when String
    Date.parse(value)
  when Date
    value
  else
    puts "[debug] Unhandled date: #{value.inspect}"
    nil
  end
end
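
# Build a deterministic id for a record by hashing all of its values, so the
# same prosecution is never saved twice.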
def generate_id(details)
  details.values.map(&:to_s).join(' ').to_md5
end
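
# Convert a spreadsheet row into a prosecution hash keyed by @mappings,
# cleaning up values along the way.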
def build_prosecution(row)
  details = {}
  row.each_with_index do |value, index|
    key = @mappings[index]
    case key
    when nil
      next
    when 'date_of_conviction'
      value = scrub_date(value)
    else
      # Remove all leading and trailing whitespace
      value = value.strip if value.is_a? String
    end
    details[key] = value
  end
  details['id'] = generate_id(details)
  details
end
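
# Geocode a prosecution's address, caching lookups so repeated addresses only
# hit the geocoder once. Falls back to the postcode alone if the full address
# can't be resolved.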
def geocode(prosecution)
  @addresses ||= {}
  address = [
    prosecution['address'],
    prosecution['county'],
    prosecution['postcode'],
  ].join(', ')

  if @addresses[address]
    puts "Geocoding [cache hit] #{address}"
    location = @addresses[address]
  else
    puts "Geocoding #{address}"
    a = Geokit::Geocoders::GoogleGeocoder.geocode(address)
    # Retry with just the postcode when the full address doesn't resolve
    if !a.lat && !a.lng
      a = Geokit::Geocoders::GoogleGeocoder.geocode(prosecution['postcode'])
    end
    location = {
      'lat' => a.lat,
      'lng' => a.lng,
    }
    @addresses[address] = location
  end

  prosecution.merge!(location)
end
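
# Ids already stored by ScraperWiki, memoised after the first query. Returns
# an empty array when the data table doesn't exist yet (i.e. on the first run).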
def existing_record_ids
  return @cached if @cached
  @cached = ScraperWiki.select('id from data').map { |r| r['id'] }
rescue SqliteMagic::NoSuchTable
  []
end
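
# Download the FSA prosecution outcomes spreadsheet and return its data rows.
# The headers sit at row index 5, so the data starts at index 6.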
def fetch_prosecutions
  url = "https://www.food.gov.uk/sites/default/files/prosecution-outcomes.xls"
  xls = open(url)
  sheet = Spreadsheet.open(xls).worksheet(0)
  sheet.rows[6..-1]
end
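
# Fetch the spreadsheet, skip records we've already saved, geocode what's new
# and write it to the local SQLite store.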
def main
  prosecutions = fetch_prosecutions
  prosecutions.map! { |p| build_prosecution(p) }
  puts "### Found #{prosecutions.size} prosecutions"

  new_prosecutions = prosecutions.reject { |r| existing_record_ids.include?(r['id']) }
  puts "### There are #{new_prosecutions.size} new prosecutions"
  new_prosecutions.map! { |p| geocode(p) }

  # Serialise
  ScraperWiki.save_sqlite(['id'], new_prosecutions)
  puts "Done"
end

main