Skip to content

Commit

Permalink
Update scraper.py
Browse files Browse the repository at this point in the history
  • Loading branch information
bethanjhn committed Jul 30, 2019
1 parent 7b9d7e2 commit 2b569a8
Showing 1 changed file with 13 additions and 10 deletions.
23 changes: 13 additions & 10 deletions scraper.py
Expand Up @@ -5,7 +5,6 @@
###############################################################################

import scraperwiki
import urlparse
import lxml.html

# scrape_table function: gets passed an individual page to scrape
Expand All @@ -15,7 +14,10 @@ def scrape_table(root):
# Set up our data record - we'll need it later
record = {}
table_cells = row.cssselect("td")
if table_cells:
if table_cells:
table_cellsurls = table_cells[1].cssselect("a")
record['HospitalURL'] = table_cellsurls[1].attrib.get('href')

record['Date'] = table_cells[0].text
record['Hospital'] = table_cells[1].text
record['Region'] = table_cells[2].text
Expand All @@ -30,17 +32,18 @@ def scrape_table(root):

# # scrape_and_look_for_next_link function: calls the scrape_table
# # function, then hunts for a 'next' link: if one is found, calls itself again
# # scrape_and_look_for_next_link function: calls the scrape_table
# # function, then hunts for a 'next' link: if one is found, calls itself again
def scrape_and_look_for_next_link(url=None):
    """Fetch one listing page and parse it into an lxml document tree.

    url -- page to fetch; when omitted, falls back to the module-level
           ``starting_url`` so the existing zero-argument call keeps working.

    NOTE(review): the previous revision dropped the ``url`` parameter but the
    body still read a bare ``url`` name, which raises NameError unless some
    global ``url`` is assigned elsewhere in the file — TODO confirm against
    the part of the file below this view. The lazy default here fixes that
    without changing how callers invoke the function.
    """
    if url is None:
        # Resolved at call time, so starting_url (defined further down the
        # file) exists by the time the function first runs.
        url = starting_url
    html = scraperwiki.scrape(url)
    print(html)  # debug: dump the raw page that was fetched
    root = lxml.html.fromstring(html)
    # NOTE(review): ``root`` is currently unused — the table scrape and the
    # pagination below were deliberately disabled in this revision.
    # Below would find a next button on the page and select it, then loop through that page etc
    # scrape_table(root)
    # next_link = root.cssselect("a.next")
    # print next_link
    # if next_link:
    #     next_url = urlparse.urljoin(base_url, next_link[0].attrib.get('href'))
    #     print next_url
    #     scrape_and_look_for_next_link(next_url)

# START HERE: define your starting URL - then call a function to scrape it
starting_url='http://inmo.ie/6022'
Expand Down

0 comments on commit 2b569a8

Please sign in to comment.