Update scraper.py
blablupcom committed May 31, 2018
1 parent cefe65c · commit a9fcc24
Showing 1 changed file with 23 additions and 11 deletions.
scraper.py: 34 changes (23 additions & 11 deletions)
@@ -85,28 +85,40 @@ def convert_mth_strings ( mth_string ):
 
 #### VARIABLES 1.0
 
-entity_id = "E0931_ABC_gov"
-url = "https://www.allerdale.gov.uk/en/about-council/budget-and-spending/spending-council/"
+entity_id = "E2632_BDC_gov"
+url = "https://www.broadland.gov.uk/info/200197/spending_and_transparency/339/council_spending_over_250"
 errors = 0
 data = []
 
 
 #### READ HTML 1.0
 
 html = urllib2.urlopen(url)
-soup = BeautifulSoup(html, "lxml")
+soup = BeautifulSoup(html, 'lxml')
 
 #### SCRAPE DATA
 
-links = soup.find_all('a')
+links = soup.find('div', 'editor').find_all('a', href=True)
 for link in links:
-    file_name = link.text
-    if 'Spending' in file_name and '.csv' in link['href']:
-        url = link['href']
-        csvYr = file_name.replace('Spending ', '').strip()[-4:]
-        csvMth = file_name.replace('Spending ', '').strip()[:3]
-        csvMth = convert_mth_strings(csvMth.upper())
-        data.append([csvYr, csvMth, url])
+    if 'http' not in link['href']:
+        year_url = 'https://www.broadland.gov.uk' + link['href']
+    else:
+        year_url = link['href']
+    year_html = urllib2.urlopen(year_url)
+    year_soup = BeautifulSoup(year_html, 'lxml')
+    blocks = year_soup.find_all('span', 'download-listing__file-tag download-listing__file-tag--type')
+    for block in blocks:
+        if 'CSV' in block.text:
+            url = block.find_next('a')['href']
+            if 'http' not in url:
+                url = 'https://www.broadland.gov.uk' + url
+            else:
+                url = url
+            file_name = block.find_next('a')['aria-label']
+            csvMth = file_name.split()[-2][:3]
+            csvYr = file_name.split()[-1]
+            csvMth = convert_mth_strings(csvMth.upper())
+            data.append([csvYr, csvMth, url])
 
 
 #### STORE DATA 1.0
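In summary, the rewritten scraper no longer matches CSV links on the landing page by their 'Spending' link text; it now walks every link inside the page's 'editor' div, loads each linked year page, and keeps the download entries whose file-type tag says CSV, taking the month and year from the download link's aria-label. Below is a minimal sketch of the URL and aria-label handling; the BASE, href, and aria_label values are made up for illustration, the real ones come from the Broadland pages.

# Minimal sketch of the new link handling, with hypothetical inputs.
# In the real scraper, href and aria_label come from BeautifulSoup tags
# on the Broadland download pages.
BASE = 'https://www.broadland.gov.uk'

href = '/download/123/spending_over_250_september_2017.csv'  # hypothetical relative href
aria_label = 'Council spending over 250 September 2017'      # hypothetical aria-label text

# Relative links get the site root prepended, as in the commit above.
url = href if 'http' in href else BASE + href

# Month and year are the last two words of the aria-label; the month is
# cut to three letters and upper-cased before convert_mth_strings sees it.
csvMth = aria_label.split()[-2][:3].upper()   # 'SEP'
csvYr = aria_label.split()[-1]                # '2017'

print([csvYr, csvMth, url])
# ['2017', 'SEP', 'https://www.broadland.gov.uk/download/123/spending_over_250_september_2017.csv']

The three-letter abbreviation produced this way is what the scraper passes to convert_mth_strings before appending [csvYr, csvMth, url] to data.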
