diff --git a/scraper.py b/scraper.py index 7e7b895..9ca1337 100644 --- a/scraper.py +++ b/scraper.py @@ -9,6 +9,7 @@ from datetime import datetime from bs4 import BeautifulSoup + #### FUNCTIONS 1.0 def validateFilename(filename): @@ -50,14 +51,13 @@ def validateURL(url): else: ext = os.path.splitext(url)[1] validURL = r.getcode() == 200 - validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx'] + validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx', '.pdf'] return validURL, validFiletype except: print ("Error validating URL.") return False, False - def validate(filename, file_url): validFilename = validateFilename(filename) validURL, validFiletype = validateURL(file_url) @@ -82,10 +82,11 @@ def convert_mth_strings ( mth_string ): mth_string = mth_string.replace(k, v) return mth_string + #### VARIABLES 1.0 -entity_id = "NFTRL4_TRWNT_gov" -url = "https://data.gov.uk/dataset/financial-transactions-data-royal-wolverhampton-hospitals-nhs-trust" +entity_id = "CCG00D_NDDEASCCG_gov" +url = "https://www.durhamdaleseasingtonsedgefieldccg.nhs.uk/documents/payments-over-25000/" errors = 0 data = [] @@ -93,21 +94,23 @@ def convert_mth_strings ( mth_string ): #### READ HTML 1.0 html = urllib2.urlopen(url) -soup = BeautifulSoup(html, 'lxml') +soup = BeautifulSoup(html, "lxml") #### SCRAPE DATA -blocks = soup.find_all('div', 'dataset-resource-text') -for block in blocks: - title = block.find('span', 'inner-cell').text.strip().split() - url = block.find('div', 'inner-cell').find_all('span')[1].find('a')['href'] - csvMth = title[2][:3] - csvYr = title[1] +title_divs = soup.find_all('p', 'attachment') +for title_div in title_divs: + block = title_div.find('a') + url = block['href'] + title = block.text.strip() + csvMth = title.split()[-1][:3] + csvYr = title.strip()[:4] + if '201' in csvMth or '2018' in csvYr: + csvMth = title.split()[-2][:3] csvMth = convert_mth_strings(csvMth.upper()) data.append([csvYr, csvMth, url]) - #### STORE DATA 1.0 for row in data: