Skip to content

Commit

Permalink
Update scraper.py
Browse files Browse the repository at this point in the history
  • Loading branch information
blablupcom committed Jul 24, 2018
1 parent ec4e42b commit 7461f95
Showing 1 changed file with 20 additions and 24 deletions.
44 changes: 20 additions & 24 deletions scraper.py
Expand Up @@ -44,8 +44,7 @@ def validateURL(url):
print ("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code))
count += 1
r = urllib2.urlopen(url)
sourceFilename = r.headers.get('Content-Disposition')

sourceFilename = r.geturl()
if sourceFilename:
ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
else:
Expand Down Expand Up @@ -85,8 +84,8 @@ def convert_mth_strings ( mth_string ):

#### VARIABLES 1.0

# NOTE(review): two entity_id/url pairs appear back to back. Read as plain
# assignments, the second pair simply overwrites the first. This looks like the
# before/after of a diff hunk (Christchurch/Dorset replaced by East Hants) -
# confirm which pair is meant to remain.
entity_id = "E1232_CBC_gov"
url = "https://www.dorsetforyou.gov.uk/your-council/about-your-council/budgets-and-spending/open-data-and-transparency/payments-to-suppliers-christchurch-borough-council.aspx"
entity_id = "E1732_EHDC_gov"
url = "https://www.easthants.gov.uk/transparency"
# Starts at zero; presumably incremented when a problem is detected later in
# the file - verify against the code that reads it.
errors = 0
# Rows collected by the scrape loop; each row is appended as [csvYr, csvMth, url].
data = []

Expand All @@ -98,26 +97,23 @@ def convert_mth_strings ( mth_string ):

#### SCRAPE DATA

# Collect one [year, period, url] row per spreadsheet link in the page's <main>
# element. `soup`, `data` and `convert_mth_strings` come from earlier in the file.
#
# Link-text quarter label -> quarter passed on to convert_mth_strings.
# NOTE(review): the one-quarter shift (Q4 -> Q1, Q1 -> Q2, ...) is kept exactly
# as found; presumably it maps financial-year quarters onto calendar quarters -
# confirm. The tuple order reproduces the original if-chain, where a later
# check overrode an earlier one (Q1 won over Q2, Q2 over Q3, Q3 over Q4).
quarter_map = (('Q1', 'Q2'), ('Q2', 'Q3'), ('Q3', 'Q4'), ('Q4', 'Q1'))

links = soup.find('main', id='main').find_all('li')
for link in links:
    anchor = link.find('a', href=True)
    if anchor is None:
        # List item without a usable link; previously this raised on ['href'].
        continue
    href = anchor['href']
    if 'http' not in href:
        # Site-relative href: drop its leading '/' and prefix the host.
        url = 'https://www.dorsetforyou.gov.uk/' + href[1:]
    else:
        # Absolute href: keep it intact. Slicing off the first character here
        # used to turn 'https://...' into 'ttps://...'.
        url = href
    # '.xlsx' also contains '.xls', so two checks cover all three extensions.
    if '.xls' not in url and '.csv' not in url:
        continue
    file_name = link.text.strip()
    csvYr = file_name[-4:]
    csvMth = None
    for label, mapped in quarter_map:
        if label in file_name:
            csvMth = mapped
            break
    if csvMth is None:
        # No quarter label: skip rather than reuse the previous link's period
        # (or hit a NameError on the first link).
        print ("Skipping link without a quarter label: {0!r}".format(file_name))
        continue
    csvMth = convert_mth_strings(csvMth.upper())
    data.append([csvYr, csvMth, url])

# Collect one [year, month, url] row per CSV link listed on the transparency
# page. `soup`, `data` and `convert_mth_strings` come from earlier in the file.
# NOTE(review): if the content <div> is missing, find() presumably returns None
# (BeautifulSoup behaviour) and the chained find_all() raises AttributeError -
# confirm that failing loudly on a page redesign is the desired outcome.
ul_blocks = soup.find('div', attrs = {'property': 'content:encoded'}).find_all('ul')
for ul_block in ul_blocks:
    for li_block in ul_block.find_all('li'):
        link = li_block.find('a', href=True)
        if not link:
            continue
        url = link['href']
        # Only links whose href contains '-csv-' are treated as data files.
        if '-csv-' not in url:
            continue
        if 'http' not in url:
            # Site-relative href: prefix the host.
            url = 'https://www.easthants.gov.uk' + url
        # Strip surrounding whitespace so the first three characters really
        # are the month abbreviation.
        file_name = link.text.strip()
        name_parts = file_name.split()
        if len(name_parts) < 2:
            # The year is taken from the second word of the link text; skip
            # entries without one instead of raising IndexError.
            print ("Skipping link with unexpected text: {0!r}".format(file_name))
            continue
        csvMth = convert_mth_strings(file_name[:3].upper())
        csvYr = name_parts[1]
        data.append([csvYr, csvMth, url])

#### STORE DATA 1.0

Expand Down

0 comments on commit 7461f95

Please sign in to comment.