Skip to content

Commit

Permalink
Tool to scrape the FCO site for embassy data
Browse files Browse the repository at this point in the history
  • Loading branch information
alext committed Nov 13, 2012
1 parent f7295f1 commit 0c19b15
Showing 1 changed file with 82 additions and 0 deletions.
82 changes: 82 additions & 0 deletions lib/data/fco_embassy_scraper.rb
@@ -0,0 +1,82 @@
# encoding: utf-8
require 'open-uri'
require 'yaml'

require 'nokogiri'

# Scrapes the FCO "find an embassy" pages and collects the details found on
# each embassy page into a hash keyed by country slug.
#
# Usage:
#   embassies = FCOEmbassyScraper.scrape
#   # => { "<country-slug>" => [ { "<row label>" => "<row text>", ... }, ... ] }
#
# Country slugs come from countries.yml (next to this file), which is expected
# to load as a list of hashes with :name and :slug keys.
class FCOEmbassyScraper
  INDEX_URL = "http://www.fco.gov.uk/en/travel-and-living-abroad/find-an-embassy/"

  # Country names, exactly as they appear in embassy page titles, that are
  # mapped straight to a slug instead of being looked up in the countries list.
  SLUG_OVERRIDES = {
    "Côte d'Ivoire" => "cote-d_ivoire-(ivory-coast)",
    "Dominica" => "dominica,-commonwealth-of",
    "Equatorial Guinea - BHC Yaoundé" => "equatorial-guinea",
    "Kyrgystan" => "kyrgyzstan",
    "Niger - British High Commission" => "niger",
    "Pitcairn Henderson Ducie & Oeno Islands" => "pitcairn"
  }.freeze

  # Convenience entry point: build a scraper, run it, return the result.
  #
  # @return [Hash{String => Array<Hash>}] embassies grouped by country slug
  def self.scrape
    new.run
  end

  def initialize
    @urls = []
    @embassies = {}
    @countries = YAML.load_file(File.expand_path('../countries.yml', __FILE__))
    @index_uri = URI.parse(INDEX_URL)
  end

  # Result of the most recent #run (empty before the first run).
  attr_reader :embassies

  # Fetches the index, then every embassy page it links to, and files each
  # embassy under its country's slug.
  #
  # A failure on one embassy page is reported on stdout and does not stop the
  # remaining pages from being processed.
  #
  # @return [Hash{String => Array<Hash>}] embassies grouped by country slug
  def run
    # Start from a clean slate so that calling #run again does not append a
    # second copy of every URL and embassy to the previous results.
    @urls = []
    @embassies = {}

    process_index
    @urls.each do |url|
      begin
        record_embassy(process_embassy_page(url), url)
      rescue => ex
        puts "Error #{ex.class}: #{ex.message} processing #{url}"
      end
    end
    @embassies
  end

  # Collects the absolute URL of every country link on the index page
  # into @urls.
  def process_index
    page = fetch_document(@index_uri)
    page.css('#newA2ZCountryLink').each do |link|
      @urls << URI.join(@index_uri, link["href"])
    end
  end

  # Extracts the details table of a single embassy page.
  #
  # @param uri [URI] address of the embassy page
  # @return [Hash{String => String}] "country" (the part of the page title
  #   before the first comma) plus one entry per labelled table row, keyed by
  #   the downcased, underscored row label
  # @raise [RuntimeError] if the page has no <h1> or is an "Access denied" page
  def process_embassy_page(uri)
    page = fetch_document(uri)
    heading = page.at_css('h1')
    raise "No page title (h1) found" if heading.nil?

    country = heading.text.strip.split(',').first.to_s.strip
    raise "Strange country name: #{country}" if country == "Access denied"

    embassy = {"country" => country}
    page.css('table.Embassy tr').each do |row|
      cells = row.css('td')
      # Rows without any <td> (e.g. header rows) carry no label/value pair.
      next if cells.empty?

      label = cells.first.text.strip
      # Skip unlabelled rows, including labels made only of Unicode whitespace
      # such as a lone non-breaking space.
      next unless label =~ /[^[:space:]]/

      key = label.downcase.chomp(':').gsub(/\s/, '_')
      embassy[key] = cell_text(cells.last)
    end
    embassy
  end

  private

  # Files one scraped embassy under its country's slug, or reports on stdout
  # that no slug could be found for it.
  def record_embassy(embassy, url)
    country_name = embassy.delete("country")
    slug = slug_for(country_name)
    if slug
      (@embassies[slug] ||= []) << embassy
    else
      puts "Couldn't resolv slug for country #{country_name}, url: #{url}"
    end
  end

  # Returns the slug for a scraped country name: an explicit override when one
  # exists, otherwise a case-insensitive match on the countries list, or nil.
  def slug_for(country_name)
    return SLUG_OVERRIDES[country_name] if SLUG_OVERRIDES.key?(country_name)

    wanted = country_name.to_s.downcase
    match = @countries.find { |c| c[:name].to_s.downcase == wanted }
    match && match[:slug]
  end

  # Downloads and parses a page. The block form of open-uri's #open closes the
  # underlying IO once parsing has finished.
  def fetch_document(uri)
    uri.open { |io| Nokogiri::HTML(io) }
  end

  # Plain text of a table cell: <br> becomes a newline, non-breaking spaces
  # become ordinary spaces and en dashes become hyphens.
  def cell_text(cell)
    html = cell.inner_html.gsub('<br>', "\n")
    text = Nokogiri::HTML::DocumentFragment.parse(html).text
    text.gsub("\u00A0", ' ').gsub("\u2013", '-').strip
  end
end

0 comments on commit 0c19b15

Please sign in to comment.