Commit 047b869: Update scraper.py
blablupcom committed Apr 12, 2016 (1 parent: c46c200)
1 changed file: scraper.py (140 additions, 24 deletions)
# This is a template for a Python scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# import scraperwiki
# import lxml.html
#
# # Read in a page
# html = scraperwiki.scrape("http://foo.com")
#
# # Find something on the page using css selectors
# root = lxml.html.fromstring(html)
# root.cssselect("div[align='left']")
#
# # Write out to the sqlite database using scraperwiki library
# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
#
# # An arbitrary query against the database
# scraperwiki.sql.select("* from data where 'name'='peter'")
# You don't have to do things with the ScraperWiki and lxml libraries.
# You can use whatever libraries you want: https://morph.io/documentation/python
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
# -*- coding: utf-8 -*-

#### IMPORTS 1.0

import itertools
import os
import re
import requests
import scraperwiki
import urllib2
from datetime import datetime
from bs4 import BeautifulSoup


#### FUNCTIONS 1.2

def validateFilename(filename):
    # Filenames must look like <entity>_<body>_<type>_<YYYY>_<period>, where
    # <period> is a month (01-12), a quarter (Q0-Q4) or a full year (Y1).
    filenameregex = '^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9QY][0-9]$'
    dateregex = '[0-9][0-9][0-9][0-9]_[0-9QY][0-9]'
    validName = (re.search(filenameregex, filename) is not None)
    found = re.search(dateregex, filename)
    if not found:
        return False
    date = found.group(0)
    now = datetime.now()
    year, month = date[:4], date[5:7]
    validYear = (2000 <= int(year) <= now.year)
    if 'Q' in date:
        validMonth = (month in ['Q0', 'Q1', 'Q2', 'Q3', 'Q4'])
    elif 'Y' in date:
        validMonth = (month in ['Y1'])
    else:
        try:
            validMonth = datetime.strptime(date, "%Y_%m") < now
        except ValueError:
            return False
    return all([validName, validYear, validMonth])
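# Illustrative checks (examples only; not executed by the scraper):
# validateFilename('E0104_NSC_gov_2016_03')  # True: a past month
# validateFilename('E0104_NSC_gov_2016_Q1')  # True: a quarter
# validateFilename('E0104-NSC-gov-2016-03')  # False: wrong separators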


def validateURL(url):
    # Returns (validURL, validFiletype): the link must answer HTTP 200 and
    # point at a .csv, .xls or .xlsx file; HTTP 500s are retried up to three times.
    try:
        r = requests.get(url, allow_redirects=True, timeout=20)
        count = 1
        while r.status_code == 500 and count < 4:
            print ("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code))
            count += 1
            r = requests.get(url, allow_redirects=True, timeout=20)
        sourceFilename = r.headers.get('Content-Disposition')
        if sourceFilename:
            ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
        else:
            ext = os.path.splitext(url)[1]
        validURL = r.status_code == 200
        validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx']
        return validURL, validFiletype
    except requests.exceptions.RequestException:
        print ("Error validating URL.")
        return False, False
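# Illustrative usage (example only; needs network access, and the URL below
# is a placeholder, not one of the scraped links):
# validURL, validFiletype = validateURL('http://data.n-somerset.gov.uk/placeholder.csv')
# print validURL, validFiletype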

def validate(filename, file_url):
    validFilename = validateFilename(filename)
    validURL, validFiletype = validateURL(file_url)
    if not validFilename:
        print filename, "*Error: Invalid filename*"
        print file_url
        return False
    if not validURL:
        print filename, "*Error: Invalid URL*"
        print file_url
        return False
    if not validFiletype:
        print filename, "*Error: Invalid filetype*"
        print file_url
        return False
    return True


def convert_mth_strings(mth_string):
    month_numbers = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04', 'MAY': '05', 'JUN': '06',
                     'JUL': '07', 'AUG': '08', 'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}
    for k, v in month_numbers.items():
        mth_string = mth_string.replace(k, v)
    return mth_string
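# Illustrative usage (examples only; not executed by the scraper):
# convert_mth_strings('JAN')       # -> '01'
# convert_mth_strings('2016-SEP')  # -> '2016-09'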


#### VARIABLES 1.0

entity_id = "E0104_NSC_gov"
urls = "http://data.n-somerset.gov.uk/Download/finance/north-somerset-council-spend-over-250?page={}"  # {} takes the page number
errors = 0
data = []


#### SCRAPE DATA

for i in itertools.count():
    # Each listing page carries CSV download buttons; the footer of the
    # DataSetList table shows a '>' link while more pages remain.
    html = urllib2.urlopen(urls.format(i))
    soup = BeautifulSoup(html.read(), 'lxml')
    links = soup.find_all('a', 'download button green CSV')
    next_page = soup.find('table', id='DataSetList').find('tfoot').find_all('a')[-1].text
    for link in links:
        url = 'http://data.n-somerset.gov.uk' + link['href'].split('?version')[0]
        csvYr = link['href'].split('/CSV')[0].replace('-1', '')[-4:]
        csvMth = link['href'].split('/CSV')[0].replace('-1', '').split('-')[-2][:3]
        csvMth = convert_mth_strings(csvMth.upper())
        data.append([csvYr, csvMth, url])
    if '>' not in next_page:
        break
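# Each collected row is [csvYr, csvMth, url]; an illustrative example:
# ['2016', '03', 'http://data.n-somerset.gov.uk/Download/finance/...']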


#### STORE DATA 1.0

for row in data:
    csvYr, csvMth, url = row
    filename = entity_id + "_" + csvYr + "_" + csvMth
    todays_date = str(datetime.now())
    file_url = url.strip()

    valid = validate(filename, file_url)

    if valid:
        # Columns: l = link to the file, f = filename, d = date scraped.
        scraperwiki.sqlite.save(unique_keys=['l'], data={"l": file_url, "f": filename, "d": todays_date})
        print filename
    else:
        errors += 1

if errors > 0:
    raise Exception("%d errors occurred during scrape." % errors)


#### EOF
