
Commit

Update scraper.py
blablupcom committed May 30, 2018
1 parent bca30d4 commit 251fd6b
Showing 1 changed file with 26 additions and 33 deletions.
scraper.py: 59 changes (26 additions & 33 deletions)
@@ -9,6 +9,7 @@
 from datetime import datetime
 from bs4 import BeautifulSoup
 
+
 #### FUNCTIONS 1.0
 
 def validateFilename(filename):
@@ -81,52 +82,44 @@ def convert_mth_strings ( mth_string ):
         mth_string = mth_string.replace(k, v)
     return mth_string
 
+
 #### VARIABLES 1.0
 
-entity_id = "E1821_WCC_gov"
-url = "http://www.worcestershire.gov.uk/info/20024/council_finance/331/payments_to_commercial_suppliers_over_500_and_government_procurement_card_transactions"
+entity_id = "E2231_ABC_gov"
+url = "https://www.ashford.gov.uk/transparency/expenditure/supplier-invoices/"
 errors = 0
 data = []
 
+
 #### READ HTML 1.0
 
 html = urllib2.urlopen(url)
 soup = BeautifulSoup(html, 'lxml')
 
+
 #### SCRAPE DATA
 
-block = soup.find('div',{'id':'content'})
-links = block.findAll('a', text=re.compile('View Reports'))
-
+links = soup.find('section', attrs = {'class': 'main-article-area-7'}).find_all('a', href=True)
 for link in links:
-    if 'http' not in link['href']:
-        suburl = 'http://www.worcestershire.gov.uk' + link['href']
-    else:
-        suburl = link['href']
-    if 'payments_to_commercial_suppliers_over' in suburl:
-        html2 = urllib2.urlopen(suburl)
-        soup2 = BeautifulSoup(html2, 'lxml')
-        block = soup2.find('ul', {'class':'item-list item-list__rich'})
-        sublinks = block.findAll('a', href=True)
-        for sublink in sublinks:
-            filePageUrl = sublink['href']
-            title = sublink.encode_contents(formatter='html').replace('&nbsp;',' ')
-            title = title.upper().strip()
-            html3 = urllib2.urlopen(filePageUrl)
-            soup3 = BeautifulSoup(html3, 'lxml')
-            block = soup3.find('main',{'class':'main-content'})
-            filelinks = block.findAll('a', href=True)
-            for filelink in filelinks:
-                fileurl = filelink['href']
-                if 'Download' in filelink.text:
-                    csvYr = title.split(' ')[-1]
-                    csvMth = title.split(' ')[-2][:3]
-                    if ' - ' not in title:
-                        csvYr = title.split()[1]
-                        csvMth = title.split()[0][:3]
-                    csvMth = convert_mth_strings(csvMth)
-                    data.append([csvYr, csvMth, fileurl])
+    if ('.xlsx' in link['href'] or '.csv' in link['href'] or '.xlx' in link['href']) and ('Supplier Spend' in link.text or 'Supplier spend' in link.text):
+        if 'http' not in link['href']:
+            url = 'https://www.ashford.gov.uk'+link['href']
+        else:
+            url = link['href']
+        file_name = link.text.strip()
+        if 'April - June' in file_name:
+            csvMth = 'Q2'
+        if 'July - Sept' in file_name:
+            csvMth = 'Q3'
+        if 'October - December' in file_name or 'Oct - Dec' in file_name:
+            csvMth = 'Q4'
+        if 'Jan - Mar' in file_name:
+            csvMth = 'Q1'
+        if ' - ' not in file_name:
+            csvMth = 'Y1'
+        csvYr = file_name.split('/')[0][-4:]
+        csvMth = convert_mth_strings(csvMth.upper())
+        data.append([csvYr, csvMth, url])
 
 
 #### STORE DATA 1.0
 
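The new Ashford loop derives a quarter code from the date range in the link text and a year from the text before the "/". A short trace with a hypothetical file name (the real link text on the Ashford page may differ):

    # Hypothetical link text; actual titles on the Ashford page may vary.
    file_name = 'Supplier spend 2017/18 April - June'

    file_name.split('/')[0][-4:]   # -> '2017', used as csvYr
    'April - June' in file_name    # -> True, so csvMth is set to 'Q2'
    # Row appended to data: ['2017', 'Q2', url]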

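Only the tail of convert_mth_strings is visible in this diff; the replace(k, v) call suggests it loops over a month-abbreviation mapping. A minimal sketch of such a helper, assuming a dict of three-letter abbreviations to two-digit month numbers (the actual body is not shown in the hunk):

    def convert_mth_strings(mth_string):
        # Hypothetical reconstruction: swap three-letter month abbreviations
        # for two-digit month numbers; codes like 'Q2' or 'Y1' pass through unchanged.
        months = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04',
                  'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08',
                  'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}
        for k, v in months.items():
            mth_string = mth_string.replace(k, v)
        return mth_string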
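The script imports urllib2, so it targets Python 2. If it were ported to Python 3, the READ HTML step would use urllib.request instead; a minimal sketch under that assumption:

    # Python 3 equivalent of the READ HTML block (sketch only; the committed script is Python 2).
    from urllib.request import urlopen
    from bs4 import BeautifulSoup

    html = urlopen(url)                 # url as set in the VARIABLES section
    soup = BeautifulSoup(html, 'lxml')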