diff --git a/scraper.py b/scraper.py
index 6eb2453..8fc4906 100644
--- a/scraper.py
+++ b/scraper.py
@@ -9,10 +9,8 @@
 from datetime import datetime
 from bs4 import BeautifulSoup
-
-
 #### FUNCTIONS 1.2
-import requests # import requests to validate url
+import requests # import requests for validating urls
 
 def validateFilename(filename):
     filenameregex = '^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9QY][0-9]$'
@@ -40,25 +38,27 @@ def validateFilename(filename):
 def validateURL(url):
     try:
-        r = requests.get(url, allow_redirects=True, timeout=20)
+        r = requests.get(url)
         count = 1
         while r.status_code == 500 and count < 4:
             print ("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code))
             count += 1
-            r = requests.get(url, allow_redirects=True, timeout=20)
+            r = requests.get(url)
         sourceFilename = r.headers.get('Content-Disposition')
+
         if sourceFilename:
             ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
         else:
             ext = os.path.splitext(url)[1]
         validURL = r.status_code == 200
-        validFiletype = ext in ['.csv', '.xls', '.xlsx']
+        validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx', '.pdf']
         return validURL, validFiletype
     except:
         print ("Error validating URL.")
         return False, False
+
 def validate(filename, file_url):
     validFilename = validateFilename(filename)
     validURL, validFiletype = validateURL(file_url)
@@ -83,46 +83,31 @@ def convert_mth_strings ( mth_string ):
         mth_string = mth_string.replace(k, v)
     return mth_string
-
 #### VARIABLES 1.0
-entity_id = "E4210_WMBC_gov"
-url = "https://www.wigan.gov.uk/Council/DataProtection-FOI-Stats/Spending-and-Finance-data.aspx"
+entity_id = "NFTRRP_BEAHMHNT_gov"
+url = "http://www.beh-mht.nhs.uk/proxy2.aspx?requesturl=http://www.beh-mht.nhs.uk/ws/AssociatedInformation.asmx/FilefolderList?sitekit_SiteID=3053&FolderName=Downloads/About%20Us/Finance/Expenditure&orderBy=DisplayName&NumRetrned=100&sitekit_LoggedInUser=0&sitekit_GroupId=0"
 errors = 0
 data = []
-
 #### READ HTML 1.0
 html = urllib2.urlopen(url)
 soup = BeautifulSoup(html, 'lxml')
+
 #### SCRAPE DATA
-pat = re.compile('\d{4}')
-block = soup.find('div', attrs = {'id':'L3_MainContentPlaceholder'}).find_all_next('ul')
-for b in block:
-    links = b.find_all('a')
-    for link in links:
-        if 'Spend' in link.text:
-            if '.csv' in link['href']:
-                url = 'https://www.wigan.gov.uk' + link['href']
-                csvMth = link.text.strip().split('-')[-1].strip().split('(')[0].strip()[:3]
-                csvYr = link.text.strip().split('-')[-1].strip().split('(')[0].strip()[-4:]
-                csvMth = convert_mth_strings(csvMth.upper())
-                todays_date = str(datetime.now())
-                if len(link.text.split('-')) > 2:
-                    tys = pat.findall(link.text)
-                    if len(tys) > 1:
-                        if int(tys[0]) < int(tys[1]):
-                            csvMth = 'Y1'
-                        if int(tys[0]) == int(tys[1]):
-                            csvMth = 'Q0'
-                    if len(tys) == 1:
-                        csvMth = 'Q0'
-                else:
-                    csvMth = 'Q0'
-                data.append([csvYr, csvMth, url])
+links = soup.find_all('fileurl')
+for link in links:
+    url = 'http://www.beh-mht.nhs.uk/Downloads/About Us/Finance/Expenditure/' + link.text
+    title = link.text.strip().split()[-1].split('-')
+    csvYr = title[0][:4]
+    if len(title) == 1:
+        csvMth = 'Y1'
+    else:
+        csvMth = title[1][:2]
+    data.append([csvYr, csvMth, url])
 #### STORE DATA 1.0
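
Note on the validateURL change: dropping allow_redirects=True and timeout=20 means a bare requests.get(url) can block indefinitely on an unresponsive host, and the retry loop re-fetches immediately with no pause between attempts. Below is a minimal sketch of a retry helper that keeps the old timeout and adds a short backoff; get_with_retry and its defaults are illustrative names, not part of this scraper.

    import time
    import requests

    def get_with_retry(url, attempts=4, timeout=20):
        # Fetch the URL, retrying on HTTP 500 up to `attempts` times.
        # The 20s timeout mirrors the value removed in this diff.
        r = requests.get(url, allow_redirects=True, timeout=timeout)
        count = 1
        while r.status_code == 500 and count < attempts:
            print("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code))
            time.sleep(2 ** count)  # brief exponential backoff between retries
            count += 1
            r = requests.get(url, allow_redirects=True, timeout=timeout)
        return r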
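
For reference, the extension check in validateURL runs os.path.splitext over the raw Content-Disposition header value rather than a parsed filename, then strips quotes, semicolons and spaces from the result. A worked example of what that chain of replace() calls yields; the header value below is an assumed sample, not one captured from the target site.

    import os

    # Assumed sample header value:
    source_filename = 'attachment; filename="Expenditure-2015-04.csv"'
    ext = os.path.splitext(source_filename)[1].replace('"', '').replace(';', '').replace(' ', '')
    print(ext)                                               # .csv
    print(ext.lower() in ['.csv', '.xls', '.xlsx', '.pdf'])  # True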
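
On the new scrape block: the 'lxml' parser treats the ASMX response as HTML and lowercases tag names, so find_all('fileurl') matches the file-listing elements regardless of their original casing. The year/period parsing can be summarised as below; parse_period is a hypothetical helper and the sample names are assumptions about what the Expenditure folder returns, not actual files.

    def parse_period(display_name):
        # Mirrors the new scrape logic: take the last whitespace-separated
        # token of the link text and split it on '-'.
        title = display_name.strip().split()[-1].split('-')
        csv_yr = title[0][:4]
        csv_mth = 'Y1' if len(title) == 1 else title[1][:2]
        return csv_yr, csv_mth

    # Assumed sample names:
    print(parse_period('Expenditure 2015-04.csv'))  # ('2015', '04')
    print(parse_period('Expenditure 2014.csv'))     # ('2014', 'Y1')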