From 7461f950843953cca26ffb7b091f2afb7dadc003 Mon Sep 17 00:00:00 2001 From: blablupcom Date: Tue, 24 Jul 2018 14:17:34 +0300 Subject: [PATCH] Update scraper.py --- scraper.py | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/scraper.py b/scraper.py index 006f241..f5c21e3 100644 --- a/scraper.py +++ b/scraper.py @@ -44,8 +44,7 @@ def validateURL(url): print ("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code)) count += 1 r = urllib2.urlopen(url) - sourceFilename = r.headers.get('Content-Disposition') - + sourceFilename = r.geturl() if sourceFilename: ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '') else: @@ -85,8 +84,8 @@ def convert_mth_strings ( mth_string ): #### VARIABLES 1.0 -entity_id = "E1232_CBC_gov" -url = "https://www.dorsetforyou.gov.uk/your-council/about-your-council/budgets-and-spending/open-data-and-transparency/payments-to-suppliers-christchurch-borough-council.aspx" +entity_id = "E1732_EHDC_gov" +url = "https://www.easthants.gov.uk/transparency" errors = 0 data = [] @@ -98,26 +97,23 @@ def convert_mth_strings ( mth_string ): #### SCRAPE DATA -links = soup.find('main', id='main').find_all('li') -for link in links: - if 'http' not in link.find('a')['href']: - url = 'https://www.dorsetforyou.gov.uk/' + link.find('a')['href'][1:] - else: - url = link.find('a')['href'][1:] - if '.xlsx' in url or '.xls' in url or '.csv' in url: - file_name = link.text.strip() - csvYr = link.text.strip()[-4:] - if 'Q4' in file_name: - csvMth = 'Q1' - if 'Q3' in file_name: - csvMth = 'Q4' - if 'Q2' in file_name: - csvMth = 'Q3' - if 'Q1' in file_name: - csvMth = 'Q2' - csvMth = convert_mth_strings(csvMth.upper()) - data.append([csvYr, csvMth, url]) - +ul_blocks = soup.find('div', attrs = {'property': 'content:encoded'}).find_all('ul') +for ul_block in ul_blocks: + li_blocks = ul_block.find_all('li') + for li_block in li_blocks: + link = li_block.find('a', href=True) + if link: + url = link['href'] + if '-csv-' in url: + file_name = link.text + if 'http' not in url: + url = 'https://www.easthants.gov.uk'+link['href'] + else: + url = link['href'] + csvMth = file_name[:3] + csvYr = file_name.split()[1] + csvMth = convert_mth_strings(csvMth.upper()) + data.append([csvYr, csvMth, url]) #### STORE DATA 1.0