
Commit

Fork of code from ScraperWiki at https://classic.scraperwiki.com/scra…
busterandcharlie committed Jun 12, 2016
0 parents commit c72a506
Showing 2 changed files with 64 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
# Ignore output of scraper
data.sqlite
62 changes: 62 additions & 0 deletions scraper.py
@@ -0,0 +1,62 @@
# London Gazette scraper (started from ScraperWiki's "Blank Python" template)

import scraperwiki
import urlparse
import lxml.html

# For each page, collects all the <a> tags inside <div class="Result"> into a list,
# then loops through them and saves each link's 'about' attribute into a table.
# Finally, finds the 'next' page link and repeats for the page it points to.

#http://www.london-gazette.co.uk/issues/recent/10/personal-insolvency/bankruptcy/start=1
#HTML:
#<div id="divResult" class="Result">
#<dl class="Details">
# <dt class="PubDate">Date:</dt>
# <dd class="PubDate">
#<p class="summary">
# <strong class="highlight">60112</strong>
#<ul class="Links">
# <li class="lteIE6_first-child" about="http://www.london-gazette.co.uk/id/issues/60112" typeof="g:Issue" property="g:hasPublicationDate" content="2012-04-10" rel="g:hasNotice"><a href="/issues/60112/notices/1568613/recent=10;category=personal-insolvency;subcategory=bankruptcy" about="http://www.london-gazette.co.uk/id/issues/60112/notices/1568613" typeof="g:Notice"><img alt="See full notice" title="See full notice" src="/styles/styleimages/button_seeFullNotice.gif" /></a></li>
# <li about="http://www.london-gazette.co.uk/id/issues/60112/notices/1568613" typeof="g:Notice" rel="foaf:page"><a target="_blank" href="/issues/60112/pages/6994" typeof="foaf:Document"><img alt="See PDF" title="See PDF" src="/styles/styleimages/button_seePDF.gif" /></a></li><li><a target='_blank' href='https://www.tsoshop.co.uk/bookstore.asp?FO=1159966&amp;Action=AddItem&amp;ExternalRef=60112'>
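# A minimal sanity check of the extraction idea above, runnable on its own
# (assumes only lxml with CSS-selector support; the HTML fragment is made up,
# not taken from the Gazette):
#
#   import lxml.html
#   frag = lxml.html.fromstring(
#       '<div class="Result"><a about="http://example.org/id/1" href="/x">n</a></div>')
#   print [a.attrib.get('about') for a in frag.cssselect('div.Result a')]
#   # -> ['http://example.org/id/1']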



# scrape_links function: gets passed the parsed tree of an individual page to scrape
def scrape_links(root):
    # Select all <a> links inside <div class="Result">
    links = root.cssselect("div.Result a")
    for link in links:
        # Build a data record from the link's 'about' attribute, if present
        record = {}
        aboutattr = link.attrib.get('about')
        if aboutattr:
            record['link'] = aboutattr
            print record, '------------'
            # Upsert into the datastore, using 'link' as the unique key
            scraperwiki.sqlite.save(["link"], record)

# scrape_and_look_for_next_link function: calls the scrape_links
# function, then hunts for a 'next' link; if one is found, calls itself again
def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    scrape_links(root)
    next_link = root.cssselect("a.Next")
    if next_link:
        next_url = urlparse.urljoin(base_url, next_link[0].attrib.get('href'))
        scrape_and_look_for_next_link(next_url)
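
# Note: the recursion above is bounded by Python's default recursion limit
# (about 1000 frames), so a very long run of result pages could raise
# RuntimeError. An equivalent iterative version, as a sketch under the same
# assumptions (not part of the original scraper, and never called by it):
def scrape_all_pages(url):
    while url:
        html = scraperwiki.scrape(url)
        root = lxml.html.fromstring(html)
        scrape_links(root)
        next_link = root.cssselect("a.Next")
        url = urlparse.urljoin(base_url, next_link[0].attrib.get('href')) if next_link else None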

# START HERE: define your starting URLs, then loop over each one.
# (NoticeCode%3a2441 is the URL-encoded form of NoticeCode:2441, a notice-type filter.)
urls = ['/issues/2013-01-15;2013-01-15/all=NoticeCode%3a2441/start=1', '/issues/2013-01-15;2013-01-15/all=NoticeCode%3a2453/start=1',
'/issues/2013-01-15;2013-01-15/all=NoticeCode%3a2462/start=1', '/issues/2012-02-10;2013-02-20/all=NoticeCode%3a2441/start=1']

base_url = 'http://www.london-gazette.co.uk'
for u in urls:
    starting_url = urlparse.urljoin(base_url, u)
    scrape_and_look_for_next_link(starting_url)
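
The ScraperWiki classic runtime that scraper.py targets (Python 2, the scraperwiki module, an implicit data.sqlite datastore) has long been retired. For anyone wanting to run the same logic today, below is a rough standalone sketch in Python 3 using requests, lxml (with the cssselect package) and the standard sqlite3 module. The table name data, the save_link helper and the single starting path are assumptions standing in for scraperwiki.sqlite.save and the urls list, and the Gazette's markup has changed since 2013, so the selectors are historical.

    import sqlite3
    import urllib.parse

    import lxml.html
    import requests

    BASE_URL = 'http://www.london-gazette.co.uk'

    def save_link(conn, link):
        # Mirrors scraperwiki.sqlite.save(["link"], record): upsert keyed on 'link'
        conn.execute('CREATE TABLE IF NOT EXISTS data (link TEXT PRIMARY KEY)')
        conn.execute('INSERT OR REPLACE INTO data (link) VALUES (?)', (link,))
        conn.commit()

    def scrape_all_pages(conn, url):
        # Walk the paginated results, saving each result link's 'about' attribute
        while url:
            root = lxml.html.fromstring(requests.get(url).text)
            for a in root.cssselect('div.Result a'):
                about = a.attrib.get('about')
                if about:
                    save_link(conn, about)
            nxt = root.cssselect('a.Next')
            url = urllib.parse.urljoin(BASE_URL, nxt[0].attrib['href']) if nxt else None

    if __name__ == '__main__':
        conn = sqlite3.connect('data.sqlite')
        start = urllib.parse.urljoin(
            BASE_URL, '/issues/recent/10/personal-insolvency/bankruptcy/start=1')
        scrape_all_pages(conn, start)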

