Commit

Update scraper.py
blablupcom committed Apr 2, 2018
1 parent caa3aab commit 76cd762
Showing 1 changed file with 15 additions and 12 deletions.
27 changes: 15 additions & 12 deletions scraper.py
@@ -9,6 +9,7 @@
 from datetime import datetime
 from bs4 import BeautifulSoup
 
+
 #### FUNCTIONS 1.0
 
 def validateFilename(filename):
@@ -50,14 +51,13 @@ def validateURL(url):
         else:
             ext = os.path.splitext(url)[1]
         validURL = r.getcode() == 200
-        validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx']
+        validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx', '.pdf']
         return validURL, validFiletype
     except:
         print ("Error validating URL.")
         return False, False
 
 
-
 def validate(filename, file_url):
     validFilename = validateFilename(filename)
     validURL, validFiletype = validateURL(file_url)
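
The functional change in this hunk: '.pdf' now counts as a valid filetype, alongside the spreadsheet formats. A minimal standalone sketch of the check as it reads after this commit (the sample URL is hypothetical):

import os

def is_valid_filetype(url):
    # Extension check as in validateURL after this commit;
    # '.pdf' is the newly accepted extension.
    ext = os.path.splitext(url)[1]
    return ext.lower() in ['.csv', '.xls', '.xlsx', '.pdf']

print(is_valid_filetype("https://example.org/payments-nov-2017.PDF"))  # True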
@@ -82,32 +82,35 @@ def convert_mth_strings ( mth_string ):
         mth_string = mth_string.replace(k, v)
     return mth_string
 
 
 #### VARIABLES 1.0
 
-entity_id = "NFTRL4_TRWNT_gov"
-url = "https://data.gov.uk/dataset/financial-transactions-data-royal-wolverhampton-hospitals-nhs-trust"
+entity_id = "CCG00D_NDDEASCCG_gov"
+url = "https://www.durhamdaleseasingtonsedgefieldccg.nhs.uk/documents/payments-over-25000/"
 errors = 0
 data = []
 
 
 #### READ HTML 1.0
 
 html = urllib2.urlopen(url)
-soup = BeautifulSoup(html, 'lxml')
+soup = BeautifulSoup(html, "lxml")
 
 
 #### SCRAPE DATA
 
-blocks = soup.find_all('div', 'dataset-resource-text')
-for block in blocks:
-    title = block.find('span', 'inner-cell').text.strip().split()
-    url = block.find('div', 'inner-cell').find_all('span')[1].find('a')['href']
-    csvMth = title[2][:3]
-    csvYr = title[1]
+title_divs = soup.find_all('p', 'attachment')
+for title_div in title_divs:
+    block = title_div.find('a')
+    url = block['href']
+    title = block.text.strip()
+    csvMth = title.split()[-1][:3]
+    csvYr = title.strip()[:4]
+    if '201' in csvMth or '2018' in csvYr:
+        csvMth = title.split()[-2][:3]
     csvMth = convert_mth_strings(csvMth.upper())
     data.append([csvYr, csvMth, url])
 
 
 #### STORE DATA 1.0
 
 for row in data:
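
For context, a minimal runnable sketch of what the rewritten title parsing does. The sample titles are hypothetical; the real strings are the link texts inside the page's <p class="attachment"> blocks.

def parse_title(title):
    # Month: first three letters of the last word of the link text.
    csvMth = title.split()[-1][:3]
    # Year: first four characters of the link text.
    csvYr = title.strip()[:4]
    # If the last word is itself a year ("... January 2018"), or the
    # entry is a 2018 one, the month is the second-to-last word instead.
    if '201' in csvMth or '2018' in csvYr:
        csvMth = title.split()[-2][:3]
    return csvYr, csvMth.upper()

print(parse_title("2017 Payments over 25k November"))       # ('2017', 'NOV')
print(parse_title("2018 Payments over 25k January 2018"))   # ('2018', 'JAN')

The three-letter abbreviation is then normalised by convert_mth_strings before the row is appended to data.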
