Skip to content

Commit

Permalink
Update scraper.py
Browse files Browse the repository at this point in the history
  • Loading branch information
blablupcom committed Apr 13, 2017
1 parent 1d6712c commit 9f2d773
Showing 1 changed file with 18 additions and 21 deletions.
39 changes: 18 additions & 21 deletions scraper.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -84,7 +84,7 @@ def convert_mth_strings ( mth_string ):
#### VARIABLES 1.0

entity_id = "DFT004_DFT_gov"
-url = "http://data.gov.uk/dataset/financial-transactions-data-dft"
+url = "https://data.gov.uk/dataset/financial-transactions-data-dft"
errors = 0
data = []

Expand All @@ -96,27 +96,24 @@ def convert_mth_strings ( mth_string ):

#### SCRAPE DATA

-blocks = soup.findAll('div', {'class':'dataset-resource'})
-
+blocks = soup.select('div.dataset-resource-text')
 for block in blocks:
-    a = ''
-    try:
-        a = block.findAll('a')[2]
-    except:
-        pass
-    if not a:
-        continue
-    link = a['href']
-    title = block.find('span','inner-cell').text.strip().split(' ')[0].strip()
-    csvYr = title.split('/')[-1]
-    csvMth = title.split('/')[0]
-    if 'to 11th May 2010' in block.find('span','inner-cell').text.strip():
-        csvMth = 'Q0'
-        csvYr = '2010'
-    if 'April 2009 to March 2010' in block.find('span','inner-cell').text.strip():
-        csvMth = 'Y1'
-        csvYr = '2009'
-    data.append([csvYr, csvMth, link])
+    link = block.find_next('a').find_next('a')['href']
+    if '.csv' in link or '.xlsx' in link or '.xls' in link:
+        link_text = block.select_one('span.inner-cell').text.replace(' return', '').strip()
+        csvYr = link_text.split()[-1]
+        csvMth = link_text.split()[-2][:3]
+        if 'to 11th May 2010' in block.find('span','inner-cell').text.strip():
+            csvMth = 'Q0'
+            csvYr = '2010'
+        if 'April 2009 to March 2010' in block.find('span','inner-cell').text.strip():
+            csvMth = 'Y1'
+            csvYr = '2009'
+        if '12th' in csvYr:
+            csvMth = 'May'
+            csvYr = '2010'
+        csvMth = convert_mth_strings(csvMth.upper())
+        data.append([csvYr, csvMth, link])


#### STORE DATA 1.0
Expand Down

0 comments on commit 9f2d773

Please sign in to comment.