
Commit

Update scraper.py
blablupcom committed Jun 7, 2017
1 parent de3d6a0 commit 8e2aa8a
Showing 1 changed file with 29 additions and 13 deletions.
42 changes: 29 additions & 13 deletions scraper.py
@@ -9,9 +9,7 @@
 from datetime import datetime
 from bs4 import BeautifulSoup
 
-
-#### FUNCTIONS 1.1
-import requests #import requests for validating urls
+#### FUNCTIONS 1.0
 
 def validateFilename(filename):
     filenameregex = '^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9QY][0-9]$'
@@ -39,19 +37,19 @@ def validateFilename(filename):
 
 def validateURL(url):
     try:
-        r = requests.get(url)
+        r = urllib2.urlopen(url)
         count = 1
-        while r.status_code == 500 and count < 4:
+        while r.getcode() == 500 and count < 4:
             print ("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code))
             count += 1
-            r = requests.get(url)
+            r = urllib2.urlopen(url)
         sourceFilename = r.headers.get('Content-Disposition')
 
         if sourceFilename:
             ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
         else:
             ext = os.path.splitext(url)[1]
-        validURL = r.status_code == 200
+        validURL = r.getcode() == 200
         validFiletype = ext.lower() in ['.csv', '.xls', '.zip', '.xlsx', '.pdf']
        return validURL, validFiletype
     except:
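A note on the retry loop above: urllib2.urlopen raises urllib2.HTTPError on a 500 response instead of returning it, so the getcode() == 500 branch (and the unchanged print line, which still references the requests-only r.status_code attribute) is unlikely to ever run. A minimal exception-driven sketch with the same retry intent; the retry_urlopen helper is hypothetical, not part of this commit:

import urllib2

def retry_urlopen(url, attempts=3):
    # Hypothetical helper, not in this commit: urllib2 signals a 500 by
    # raising HTTPError, so the retry has to live in an except block.
    for attempt in range(1, attempts + 1):
        try:
            return urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            if e.code != 500 or attempt == attempts:
                raise
            print ("Attempt {0} - Status code: {1}. Retrying.".format(attempt, e.code))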
@@ -86,8 +84,8 @@ def convert_mth_strings ( mth_string ):
 
 #### VARIABLES 1.0
 
-entity_id = "NHTRTVFT_5BPNFT_gov"
-url = "http://www.5boroughspartnership.nhs.uk/financial-transparency-reports/"
+entity_id = "NFTRYW_BCHNFT_gov"
+url = "http://www.bhamcommunity.nhs.uk/about-us/publications/public-funds/previous-years/"
 errors = 0
 data = []
 
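For reference, the new entity_id still fits the three-segment convention that validateFilename's regex enforces (segment_segment_segment_year_period). A quick sketch, assuming validateFilename applies the pattern with re.match, which the truncated diff context doesn't show; the sample filename is illustrative only:

import re

# Illustrative filename, not one produced by this commit.
filenameregex = '^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9QY][0-9]$'
print (bool(re.match(filenameregex, 'NFTRYW_BCHNFT_gov_2017_Q1')))  # True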
@@ -100,13 +98,31 @@ def convert_mth_strings ( mth_string ):
 #### SCRAPE DATA
 
 
-blocks = soup.find('div', 'related_docs').find_all('a')
+
+blocks = soup.find_all('a', 'result-link')
 for block in blocks:
-    url = 'http://www.5boroughspartnership.nhs.uk' + block['href']
-    csvMth = block.text.split()[0][:3]
-    csvYr = block.text.split()[-1]
+    link = 'http://www.bhamcommunity.nhs.uk/about-us/publications/public-funds/previous-years/' + block['href']
+    html_page = urllib2.urlopen(link)
+    soup_page = BeautifulSoup(html_page, 'lxml')
+    csvMth = block.text.split()[-3][:3]
+    csvYr = block.text.split()[-2]
+    doc_link = soup_page.find('a', attrs={'id':'downloadAsset'})['href']
+    url = 'http://www.bhamcommunity.nhs.uk' + doc_link
     csvMth = convert_mth_strings(csvMth.upper())
     data.append([csvYr, csvMth, url])
+link_page2 = "http://www.bhamcommunity.nhs.uk/about-us/publications/public-funds/"
+html_page2 = urllib2.urlopen(link_page2)
+soup_page2 = BeautifulSoup(html_page2, 'lxml')
+doc_link_a = soup_page2.find('a', 'result-link')
+doc_link_current = 'http://www.bhamcommunity.nhs.uk/about-us/publications/public-funds/' + doc_link_a['href']
+html_current = urllib2.urlopen(doc_link_current)
+soup_current = BeautifulSoup(html_current, 'lxml')
+csvMth = doc_link_a.text.split()[-3][:3]
+csvYr = doc_link_a.text.split()[-2]
+url_current = soup_current.find('a', attrs={'id':'downloadAsset'})['href']
+url = 'http://www.bhamcommunity.nhs.uk' + url_current
+csvMth = convert_mth_strings(csvMth.upper())
+data.append([csvYr, csvMth, url])
 
 
 #### STORE DATA 1.0
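Both the previous-years loop and the current-year branch above parse the period from link text with split()[-3][:3] and split()[-2], which assumes the text ends with a month name, a year, and exactly one trailing word. A small sketch on a made-up title (not taken from the live site):

# 'Payments over 25k April 2017 report' is an invented example title.
text = 'Payments over 25k April 2017 report'
csvMth = text.split()[-3][:3]  # 'Apr' - month word, third token from the end
csvYr = text.split()[-2]       # '2017' - second token from the end
print (csvMth, csvYr)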
