Skip to content

Commit

Permalink
Update scraper.py
Browse files Browse the repository at this point in the history
  • Loading branch information
blablupcom committed Dec 11, 2015
1 parent 981e5cb commit 1b97496
Showing 1 changed file with 128 additions and 43 deletions.
171 changes: 128 additions & 43 deletions scraper.py
@@ -1,60 +1,145 @@
# -*- coding: utf-8 -*-

#### IMPORTS 1.0

import os
import re
import scraperwiki
import urllib2
from datetime import datetime
from bs4 import BeautifulSoup

# Set up variables
#### FUNCTIONS 1.0

def validateFilename(filename):
    """Return True when *filename* follows the expected naming scheme.

    The name must consist of three alphanumeric segments followed by a
    four-digit year and a two-character period, all joined by underscores
    (for example ``E1821_WCC_gov_2015_01``).  The period is one of:

    * ``Q0``-``Q4`` -- a quarter,
    * ``Y1``        -- a whole year,
    * two digits    -- a calendar month that is not in the future.

    The year must lie between 2000 and the current year inclusive.

    Always returns a real ``bool`` (never ``None``).
    """
    # The year and period are captured from the anchored match so that an
    # earlier segment which happens to look like a date (e.g. "1234_Q1")
    # cannot be mistaken for the date suffix.
    pattern = (r'^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+'
               r'_([0-9][0-9][0-9][0-9])_([0-9QY][0-9])$')
    match = re.search(pattern, filename)
    if match is None:
        return False

    year, period = match.groups()
    now = datetime.now()
    if not 2000 <= int(year) <= now.year:
        return False

    if period.startswith('Q'):
        return period in ('Q0', 'Q1', 'Q2', 'Q3', 'Q4')
    if period.startswith('Y'):
        return period == 'Y1'

    try:
        # strptime rejects impossible months such as "00" or "13".
        return datetime.strptime(year + '_' + period, "%Y_%m") < now
    except ValueError:
        return False


def validateURL(url):
    """Fetch *url* and report whether it is reachable and of a wanted type.

    Returns a ``(validURL, validFiletype)`` tuple of booleans:

    * ``validURL`` is True when the final response code is 200.
    * ``validFiletype`` is True when the file extension is ``.csv``,
      ``.xls`` or ``.xlsx``.  The extension is taken from the
      ``Content-Disposition`` header when present, otherwise from the URL.

    Any error raised while fetching or inspecting the response is reported
    on stdout and yields ``(False, False)``.
    """
    try:
        r = urllib2.urlopen(url)
        count = 1
        # Retry up to three more times while the server answers 500.
        # NOTE(review): urllib2's default handlers raise HTTPError for 5xx
        # responses, so this loop may never be entered -- confirm.
        while r.getcode() == 500 and count < 4:
            # urllib2 responses expose the status through getcode(); they
            # have no `status_code` attribute.
            print("Attempt {0} - Status code: {1}. Retrying.".format(count, r.getcode()))
            count += 1
            r = urllib2.urlopen(url)
        sourceFilename = r.headers.get('Content-Disposition')

        if sourceFilename:
            # Strip the quoting/punctuation that surrounds the filename in
            # the header value before comparing the extension.
            ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
        else:
            ext = os.path.splitext(url)[1]
        validURL = r.getcode() == 200
        validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx']
        return validURL, validFiletype
    except Exception:
        # Deliberately best-effort, but no longer swallows
        # KeyboardInterrupt / SystemExit as a bare `except:` would.
        print("Error validating URL.")
        return False, False


def validate(filename, file_url):
    """Run the filename, URL and file-type checks for one scraped file.

    Both validators are always invoked.  The first failing check (in the
    order filename, URL, file type) is reported on stdout together with
    the URL, and False is returned.  Returns True when every check passes.
    """
    validFilename = validateFilename(filename)
    validURL, validFiletype = validateURL(file_url)

    checks = (
        (validFilename, "*Error: Invalid filename*"),
        (validURL, "*Error: Invalid URL*"),
        (validFiletype, "*Error: Invalid filetype*"),
    )
    for passed, message in checks:
        if not passed:
            print("%s %s" % (filename, message))
            print(file_url)
            return False
    return True


def convert_mth_strings(mth_string):
    """Return *mth_string* with upper-case month abbreviations replaced.

    Every occurrence of ``JAN`` ... ``DEC`` is swapped for its two-digit
    month number (``01`` ... ``12``); all other text is left untouched.
    """
    month_numbers = {
        'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04',
        'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08',
        'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12',
    }
    converted = mth_string
    for abbreviation in month_numbers:
        converted = converted.replace(abbreviation, month_numbers[abbreviation])
    return converted

#### VARIABLES 1.0

# Prefix of every generated filename; the year and month are appended to it.
entity_id = "E1821_WCC_gov"
# Page the scrape starts from.
url = "http://www.worcestershire.gov.uk/info/20024/council_finance/331/payments_to_commercial_suppliers_over_500_and_government_procurement_card_transactions"
# Count of rows that fail validation; a non-zero total raises at the end of the run.
errors = 0
# Collected [year, month, file URL] rows, filled by the scrape loop and consumed when storing.
data = []

#### READ HTML 1.0

# Set up functions
def convert_mth_strings(mth_string):
    """Swap upper-case three-letter month names in *mth_string* for numbers.

    ``JAN`` becomes ``01``, ``FEB`` becomes ``02`` and so on up to ``DEC``
    becoming ``12``.  Text that is not one of those abbreviations is
    returned unchanged.
    """
    replacements = (
        ('JAN', '01'), ('FEB', '02'), ('MAR', '03'), ('APR', '04'),
        ('MAY', '05'), ('JUN', '06'), ('JUL', '07'), ('AUG', '08'),
        ('SEP', '09'), ('OCT', '10'), ('NOV', '11'), ('DEC', '12'),
    )
    # Apply each abbreviation -> number substitution in turn.
    for month_name, month_number in replacements:
        mth_string = mth_string.replace(month_name, month_number)
    return mth_string

# Pull down the start page and parse it exactly once.  The response object
# is a stream: parsing it a second time would read an already-consumed body,
# so only the explicit-parser call is kept.
html = urllib2.urlopen(url)
soup = BeautifulSoup(html, 'lxml')


#### SCRAPE DATA

# Links to follow are taken from the div with class 'editor'.
block = soup.find('div', {'class': 'editor'})
links = block.findAll('a', href=True)

# Walk start page -> listing page -> file page and collect one
# [year, month, file URL] row per download link.  Nothing is written to the
# datastore here; rows are validated and saved in the STORE DATA section.
for link in links:
    suburl = 'http://www.worcestershire.gov.uk' + link['href']
    if 'payments_to_commercial_suppliers_over' not in suburl:
        continue

    html2 = urllib2.urlopen(suburl)
    soup2 = BeautifulSoup(html2, 'lxml')
    block = soup2.find('ul', {'class': 'item-list item-list__rich'})
    sublinks = block.findAll('a', href=True)

    for sublink in sublinks:
        filePageUrl = sublink['href']
        # Replace &nbsp; entities so the title can be split on plain spaces.
        title = sublink.encode_contents(formatter='html').replace('&nbsp;', ' ')
        title = title.upper().strip()
        html3 = urllib2.urlopen(filePageUrl)
        soup3 = BeautifulSoup(html3, 'lxml')
        block = soup3.find('main', {'class': 'main-content'})
        filelinks = block.findAll('a', href=True)

        for filelink in filelinks:
            fileurl = filelink['href']
            if 'Download' in filelink.text:
                # The last word of the title is used as the year and the
                # first three letters of the word before it as the month.
                csvYr = title.split(' ')[-1]
                csvMth = title.split(' ')[-2][:3]
                csvMth = convert_mth_strings(csvMth)
                data.append([csvYr, csvMth, fileurl])

#### STORE DATA 1.0

# Validate every collected row; save the ones that pass and count the rest.
for csvYr, csvMth, url in data:
    filename = "_".join([entity_id, csvYr, csvMth])
    todays_date = str(datetime.now())
    file_url = url.strip()

    if validate(filename, file_url):
        record = {"l": file_url, "f": filename, "d": todays_date}
        scraperwiki.sqlite.save(unique_keys=['l'], data=record)
        print(filename)
    else:
        errors += 1

# Fail the run loudly when any row was rejected.
if errors > 0:
    raise Exception("%d errors occurred during scrape." % errors)


#### EOF

0 comments on commit 1b97496

Please sign in to comment.