From 39d9599cdd9a14fe624a84672f540657038cce51 Mon Sep 17 00:00:00 2001
From: blablupcom
Date: Mon, 1 Feb 2016 07:26:21 +0300
Subject: [PATCH] First commit

---
 requirements.txt   |   5 ++
 scraper.py         | 144 ++++++++++++++++++++++++++++++++++++++++++++++
 scraperwiki.sqlite | Bin 0 -> 3072 bytes
 3 files changed, 149 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 scraper.py
 create mode 100644 scraperwiki.sqlite

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..690a611
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+-e git+http://github.com/openaustralia/scraperwiki-python.git@morph_defaults#egg=scraperwiki
+lxml==3.4.4
+cssselect==0.9.1
+beautifulsoup4
+
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..9bb1b62
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+
+#### IMPORTS 1.0
+
+import os
+import re
+import scraperwiki
+import urllib2
+from datetime import datetime
+from bs4 import BeautifulSoup
+
+#### FUNCTIONS 1.0
+
+def validateFilename(filename):
+    filenameregex = '^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9QY][0-9]$'
+    dateregex = '[0-9][0-9][0-9][0-9]_[0-9QY][0-9]'
+    validName = (re.search(filenameregex, filename) is not None)
+    found = re.search(dateregex, filename)
+    if not found:
+        return False
+    date = found.group(0)
+    now = datetime.now()
+    year, month = date[:4], date[5:7]
+    validYear = (2000 <= int(year) <= now.year)
+    if 'Q' in date:
+        validMonth = (month in ['Q0', 'Q1', 'Q2', 'Q3', 'Q4'])
+    elif 'Y' in date:
+        validMonth = (month in ['Y1'])
+    else:
+        try:
+            validMonth = datetime.strptime(date, "%Y_%m") < now
+        except ValueError:
+            return False
+    # return False explicitly when any individual check fails
+    return all([validName, validYear, validMonth])
+
+
+def validateURL(url):
+    try:
+        # urllib2.urlopen raises HTTPError for 5xx responses, so retries
+        # must happen in an inner try/except rather than by inspecting r
+        r = None
+        count = 1
+        while r is None:
+            try:
+                r = urllib2.urlopen(url)
+            except urllib2.HTTPError as e:
+                if e.code == 500 and count < 4:
+                    print ("Attempt {0} - Status code: {1}. Retrying.".format(count, e.code))
+                    count += 1
+                else:
+                    raise
+        sourceFilename = r.headers.get('Content-Disposition')
+
+        if sourceFilename:
+            ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
+        else:
+            ext = os.path.splitext(url)[1]
+        validURL = r.getcode() == 200
+        validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx', '.pdf']
+        return validURL, validFiletype
+    except Exception:
+        print ("Error validating URL.")
+        return False, False
+
+
+
+def validate(filename, file_url):
+    validFilename = validateFilename(filename)
+    validURL, validFiletype = validateURL(file_url)
+    if not validFilename:
+        print filename, "*Error: Invalid filename*"
+        print file_url
+        return False
+    if not validURL:
+        print filename, "*Error: Invalid URL*"
+        print file_url
+        return False
+    if not validFiletype:
+        print filename, "*Error: Invalid filetype*"
+        print file_url
+        return False
+    return True
+
+
+def convert_mth_strings(mth_string):
+    month_numbers = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04', 'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08', 'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}
+    for k, v in month_numbers.items():
+        mth_string = mth_string.replace(k, v)
+    return mth_string
+
+#### VARIABLES 1.0
+
+entity_id = "FTRCDX_HDNHSFT_gov"
+url = "http://www.hdft.nhs.uk/about-us/freedomofinformation/publication-scheme/what-we-spend-and-how-we-spend-it/"
+errors = 0
+data = []
+
+#### READ HTML 1.0
+
+html = urllib2.urlopen(url)
+soup = BeautifulSoup(html, 'lxml')
+
+
+#### SCRAPE DATA
+
+
+links = soup.find('table', 'DataGrid oDataGrid').find('tbody').find_all('a')
+for link in links:
+    try:
+        if '=Attachment' in link['href']:
+            url = 'http://www.hdft.nhs.uk' + link['href']
+            title = link['title'].strip()
+            if '20' not in title:
+                continue
+            csvMth = 'Y1'
+            csvYr = title.split('-')[0][-4:]
+            csvMth = convert_mth_strings(csvMth.upper())
+            data.append([csvYr, csvMth, url])
+    except Exception:
+        break
+
+
+#### STORE DATA 1.0
+
+for row in data:
+    csvYr, csvMth, url = row
+    filename = entity_id + "_" + csvYr + "_" + csvMth
+    todays_date = str(datetime.now())
+    file_url = url.strip()
+
+    valid = validate(filename, file_url)
+
+    if valid:
+        scraperwiki.sqlite.save(unique_keys=['l'], data={"l": file_url, "f": filename, "d": todays_date})
+        print filename
+    else:
+        errors += 1
+
+if errors > 0:
+    raise Exception("%d errors occurred during scrape." % errors)
+
+
+#### EOF
diff --git a/scraperwiki.sqlite b/scraperwiki.sqlite
new file mode 100644
index 0000000000000000000000000000000000000000..562e8b55dc3009568fca6a3038571a7c7abb18f0
GIT binary patch
literal 3072
zcmeH{-EP`26vu5RAc3^&ieCAuYPAyVd_aO!ZCOcGv=&qvGSz>@Lb=+&06)M>EUXv;|K;Z;08HWNR4otc+CnD%BGG=UPK@8}SI0l&mV)+W2@lsl=rI-$ip(es32
zQY!0u*)%n~NU2@ya(-Jb6vpGR){Qz`>vt1v^r_&5$wMOa=mWbdTr)mo$zU{WGcCN2
z-ndD^c(rmwsblDS{Na`zbw+W#m$2bougzu)F6Uvpd&Bzt#BUr|TES_hc6xc@2iJqU
zK+u@MM>^R}ErrvmS#)$sIJzdBF@sZ=z!{otlnjST!jU!M=oBue2)OgV{8mFVEr%9M
z5^y%L)@9RYY-lMPp~Ng%5^R=bTWi*h%-G-(HeDmO=mtlE%`}Oz9vg84TXx$2@DZRc
zg78uKs#xfUZ2NEbZ=wPnby9~C7h^S@Om8C1%ZiuUM4FfNL1pUXT&8F%sgNR@WhFg9
Ok@%L3#06MM8GZv=l}}#)

literal 0
HcmV?d00001