
Commit

Update scraper.py
blablupcom committed Jun 7, 2017
1 parent de3d6a0 commit 8e2aa8a
Showing 1 changed file with 29 additions and 13 deletions.
42 changes: 29 additions & 13 deletions scraper.py
@@ -9,9 +9,7 @@
 from datetime import datetime
 from bs4 import BeautifulSoup
 
-
-#### FUNCTIONS 1.1
-import requests #import requests for validating urls
+#### FUNCTIONS 1.0
 
 def validateFilename(filename):
     filenameregex = '^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9QY][0-9]$'
@@ -39,19 +37,19 @@ def validateFilename(filename):
 
 def validateURL(url):
     try:
-        r = requests.get(url)
+        r = urllib2.urlopen(url)
         count = 1
-        while r.status_code == 500 and count < 4:
+        while r.getcode() == 500 and count < 4:
             print ("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code))
             count += 1
-            r = requests.get(url)
+            r = urllib2.urlopen(url)
         sourceFilename = r.headers.get('Content-Disposition')
 
         if sourceFilename:
             ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
         else:
             ext = os.path.splitext(url)[1]
-        validURL = r.status_code == 200
+        validURL = r.getcode() == 200
         validFiletype = ext.lower() in ['.csv', '.xls', '.zip', '.xlsx', '.pdf']
        return validURL, validFiletype
     except:
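A note on the retry loop above: urllib2.urlopen raises urllib2.HTTPError on a 500 response instead of returning it, so the getcode() == 500 branch (and the unchanged print line, which still references the requests-only r.status_code attribute) is unlikely to ever run. A minimal exception-driven sketch with the same retry intent; the retry_urlopen helper is hypothetical, not part of this commit:

import urllib2

def retry_urlopen(url, attempts=3):
    # Hypothetical helper, not in this commit: urllib2 signals a 500 by
    # raising HTTPError, so the retry has to live in an except block.
    for attempt in range(1, attempts + 1):
        try:
            return urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            if e.code != 500 or attempt == attempts:
                raise
            print ("Attempt {0} - Status code: {1}. Retrying.".format(attempt, e.code))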
@@ -86,8 +84,8 @@ def convert_mth_strings ( mth_string ):
 
 #### VARIABLES 1.0
 
-entity_id = "NHTRTVFT_5BPNFT_gov"
-url = "http://www.5boroughspartnership.nhs.uk/financial-transparency-reports/"
+entity_id = "NFTRYW_BCHNFT_gov"
+url = "http://www.bhamcommunity.nhs.uk/about-us/publications/public-funds/previous-years/"
 errors = 0
 data = []
 
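For reference, the new entity_id still fits the three-segment convention that validateFilename's regex enforces (segment_segment_segment_year_period). A quick sketch, assuming validateFilename applies the pattern with re.match, which the truncated diff context doesn't show; the sample filename is illustrative only:

import re

# Illustrative filename, not one produced by this commit.
filenameregex = '^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9QY][0-9]$'
print (bool(re.match(filenameregex, 'NFTRYW_BCHNFT_gov_2017_Q1')))  # True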
@@ -100,13 +98,31 @@ def convert_mth_strings ( mth_string ):
 #### SCRAPE DATA
 
 
-blocks = soup.find('div', 'related_docs').find_all('a')
+
+blocks = soup.find_all('a', 'result-link')
 for block in blocks:
-    url = 'http://www.5boroughspartnership.nhs.uk' + block['href']
-    csvMth = block.text.split()[0][:3]
-    csvYr = block.text.split()[-1]
+    link = 'http://www.bhamcommunity.nhs.uk/about-us/publications/public-funds/previous-years/' + block['href']
+    html_page = urllib2.urlopen(link)
+    soup_page = BeautifulSoup(html_page, 'lxml')
+    csvMth = block.text.split()[-3][:3]
+    csvYr = block.text.split()[-2]
+    doc_link = soup_page.find('a', attrs={'id':'downloadAsset'})['href']
+    url = 'http://www.bhamcommunity.nhs.uk' + doc_link
     csvMth = convert_mth_strings(csvMth.upper())
     data.append([csvYr, csvMth, url])
+link_page2 = "http://www.bhamcommunity.nhs.uk/about-us/publications/public-funds/"
+html_page2 = urllib2.urlopen(link_page2)
+soup_page2 = BeautifulSoup(html_page2, 'lxml')
+doc_link_a = soup_page2.find('a', 'result-link')
+doc_link_current = 'http://www.bhamcommunity.nhs.uk/about-us/publications/public-funds/' + doc_link_a['href']
+html_current = urllib2.urlopen(doc_link_current)
+soup_current = BeautifulSoup(html_current, 'lxml')
+csvMth = doc_link_a.text.split()[-3][:3]
+csvYr = doc_link_a.text.split()[-2]
+url_current = soup_current.find('a', attrs={'id':'downloadAsset'})['href']
+url = 'http://www.bhamcommunity.nhs.uk' + url_current
+csvMth = convert_mth_strings(csvMth.upper())
+data.append([csvYr, csvMth, url])
 
 
 #### STORE DATA 1.0
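Both the previous-years loop and the current-year branch above parse the period from link text with split()[-3][:3] and split()[-2], which assumes the text ends with a month name, a year, and exactly one trailing word. A small sketch on a made-up title (not taken from the live site):

# 'Payments over 25k April 2017 report' is an invented example title.
text = 'Payments over 25k April 2017 report'
csvMth = text.split()[-3][:3]  # 'Apr' - month word, third token from the end
csvYr = text.split()[-2]       # '2017' - second token from the end
print (csvMth, csvYr)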
