Skip to content

Commit

Permalink
Update scraper.py
Browse files Browse the repository at this point in the history
  • Loading branch information
bethanjhn committed Jul 30, 2019
1 parent 7b9d7e2 commit 2b569a8
Showing 1 changed file with 13 additions and 10 deletions.
23 changes: 13 additions & 10 deletions scraper.py
Expand Up @@ -5,7 +5,6 @@
###############################################################################

import scraperwiki
import urlparse
import lxml.html

# scrape_table function: gets passed an individual page to scrape
Expand All @@ -15,7 +14,10 @@ def scrape_table(root):
# Set up our data record - we'll need it later
record = {}
table_cells = row.cssselect("td")
if table_cells:
if table_cells:
table_cellsurls = table_cells[1].cssselect("a")
record['HospitalURL'] = table_cellsurls[1].attrib.get('href')

record['Date'] = table_cells[0].text
record['Hospital'] = table_cells[1].text
record['Region'] = table_cells[2].text
Expand All @@ -30,17 +32,18 @@ def scrape_table(root):

# # scrape_and_look_for_next_link function: calls the scrape_table
# # function, then hunts for a 'next' link: if one is found, calls itself again
# # scrape_and_look_for_next_link function: calls the scrape_table
# # function, then hunts for a 'next' link: if one is found, calls itself again
def scrape_and_look_for_next_link(url=None):
    """Fetch one listing page and parse it into an lxml document tree.

    url -- page to fetch; when omitted, falls back to the module-level
           ``starting_url`` so the existing zero-argument call keeps working.

    NOTE(review): the previous revision dropped the ``url`` parameter but the
    body still read a bare ``url`` name, which raises NameError unless some
    global ``url`` is assigned elsewhere in the file — TODO confirm against
    the part of the file below this view. The lazy default here fixes that
    without changing how callers invoke the function.
    """
    if url is None:
        # Resolved at call time, so starting_url (defined further down the
        # file) exists by the time the function first runs.
        url = starting_url
    html = scraperwiki.scrape(url)
    print(html)  # debug: dump the raw page that was fetched
    root = lxml.html.fromstring(html)
    # NOTE(review): ``root`` is currently unused — the table scrape and the
    # pagination below were deliberately disabled in this revision.
    # Below would find a next button on the page and select it, then loop through that page etc
    # scrape_table(root)
    # next_link = root.cssselect("a.next")
    # print next_link
    # if next_link:
    #     next_url = urlparse.urljoin(base_url, next_link[0].attrib.get('href'))
    #     print next_url
    #     scrape_and_look_for_next_link(next_url)

# START HERE: define your starting URL - then call a function to scrape it
starting_url='http://inmo.ie/6022'
Expand Down

0 comments on commit 2b569a8

Please sign in to comment.