From 047b8690072f1157036715cdbff10cfd854a8255 Mon Sep 17 00:00:00 2001
From: blablupcom
Date: Tue, 12 Apr 2016 11:08:06 +0400
Subject: [PATCH] Update scraper.py

---
 scraper.py | 164 +++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 140 insertions(+), 24 deletions(-)

diff --git a/scraper.py b/scraper.py
index e41caf1..b36e539 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1,24 +1,140 @@
-# This is a template for a Python scraper on morph.io (https://morph.io)
-# including some code snippets below that you should find helpful
-
-# import scraperwiki
-# import lxml.html
-#
-# # Read in a page
-# html = scraperwiki.scrape("http://foo.com")
-#
-# # Find something on the page using css selectors
-# root = lxml.html.fromstring(html)
-# root.cssselect("div[align='left']")
-#
-# # Write out to the sqlite database using scraperwiki library
-# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
-#
-# # An arbitrary query against the database
-# scraperwiki.sql.select("* from data where 'name'='peter'")
-print 'http://example.com'
-# You don't have to do things with the ScraperWiki and lxml libraries.
-# You can use whatever libraries you want: https://morph.io/documentation/python
-# All that matters is that your final data is written to an SQLite database
-# called "data.sqlite" in the current working directory which has at least a table
-# called "data".
+# -*- coding: utf-8 -*-
+
+#### IMPORTS 1.0
+
+import os
+import re
+import scraperwiki
+import urllib2
+from datetime import datetime
+from bs4 import BeautifulSoup
+
+
+#### FUNCTIONS 1.2
+
+import requests  # used for validating URLs
+
+def validateFilename(filename):
+    filenameregex = '^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9QY][0-9]$'
+    dateregex = '[0-9][0-9][0-9][0-9]_[0-9QY][0-9]'
+    validName = (re.search(filenameregex, filename) != None)
+    found = re.search(dateregex, filename)
+    if not found:
+        return False
+    date = found.group(0)
+    now = datetime.now()
+    year, month = date[:4], date[5:7]
+    validYear = (2000 <= int(year) <= now.year)
+    if 'Q' in date:
+        validMonth = (month in ['Q0', 'Q1', 'Q2', 'Q3', 'Q4'])
+    elif 'Y' in date:
+        validMonth = (month in ['Y1'])
+    else:
+        try:
+            validMonth = datetime.strptime(date, "%Y_%m") < now
+        except:
+            return False
+    if all([validName, validYear, validMonth]):
+        return True
+
+
+def validateURL(url):
+
+    try:
+        r = requests.get(url, allow_redirects=True, timeout=20)
+        count = 1
+        while r.status_code == 500 and count < 4:
+            print ("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code))
+            count += 1
+            r = requests.get(url, allow_redirects=True, timeout=20)
+        sourceFilename = r.headers.get('Content-Disposition')
+        if sourceFilename:
+            ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
+        else:
+            ext = os.path.splitext(url)[1]
+        validURL = r.status_code == 200
+        validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx']
+        return validURL, validFiletype
+    except:
+        print ("Error validating URL.")
+        return False, False
+
+def validate(filename, file_url):
+    validFilename = validateFilename(filename)
+    validURL, validFiletype = validateURL(file_url)
+    if not validFilename:
+        print filename, "*Error: Invalid filename*"
+        print file_url
+        return False
+    if not validURL:
+        print filename, "*Error: Invalid URL*"
+        print file_url
+        return False
+    if not validFiletype:
+        print filename, "*Error: Invalid filetype*"
+        print file_url
+        return False
+    return True
+
+
+def convert_mth_strings(mth_string):
+    month_numbers = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04', 'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08', 'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}
+    for k, v in month_numbers.items():
+        mth_string = mth_string.replace(k, v)
+    return mth_string
+
+
+#### VARIABLES 1.0
+
+entity_id = "E0104_NSC_gov"
+urls = "http://data.n-somerset.gov.uk/Download/finance/north-somerset-council-spend-over-250?page={}"
+url = 'http://example.com'
+errors = 0
+data = []
+
+
+#### READ HTML 1.0
+
+html = urllib2.urlopen(url)
+soup = BeautifulSoup(html, 'lxml')
+
+
+#### SCRAPE DATA
+import itertools
+
+for i in itertools.count():
+    html = urllib2.urlopen(urls.format(i))
+    soup = BeautifulSoup(html, 'lxml')
+    links = soup.find_all('a', 'download button green CSV')
+    next_page = soup.find('table', id='DataSetList').find('tfoot').find_all('a')[-1].text
+    for link in links:
+        url = 'http://data.n-somerset.gov.uk' + link['href'].split('?version')[0]
+        csvYr = link['href'].split('/CSV')[0].replace('-1', '')[-4:]
+        csvMth = link['href'].split('/CSV')[0].replace('-1', '').split('-')[-2][:3]
+        csvMth = convert_mth_strings(csvMth.upper())
+        data.append([csvYr, csvMth, url])
+    if '>' not in next_page:
+        break
+
+
+#### STORE DATA 1.0
+
+for row in data:
+    csvYr, csvMth, url = row
+    filename = entity_id + "_" + csvYr + "_" + csvMth
+    todays_date = str(datetime.now())
+    file_url = url.strip()
+
+    valid = validate(filename, file_url)
+
+    if valid == True:
+        scraperwiki.sqlite.save(unique_keys=['l'], data={"l": file_url, "f": filename, "d": todays_date})
+        print filename
+    else:
+        errors += 1
+
+if errors > 0:
+    raise Exception("%d errors occurred during scrape." % errors)
+
+
+#### EOF