Update scraper.py

blablupcom · Dec 11, 2015 · 1b97496 · 1b97496
1 parent 981e5cb
commit 1b97496
Showing 1 changed file with 128 additions and 43 deletions.
diff --git a/scraper.py b/scraper.py
@@ -1,60 +1,145 @@
 # -*- coding: utf-8 -*-
 
+#### IMPORTS 1.0
+
+import os
+import re
 import scraperwiki
 import urllib2
 from datetime import datetime
 from bs4 import BeautifulSoup
 
-# Set up variables
+#### FUNCTIONS 1.0
+
+def validateFilename(filename):
+    """Check that a generated filename follows the expected naming scheme.
+
+    The name must be three alphanumeric parts followed by a period stamp:
+    ``<part>_<part>_<part>_<YYYY>_<PP>``, where ``PP`` is a two-digit month,
+    one of ``Q0``-``Q4`` (presumably quarters) or ``Y1``.
+
+    Returns True only when the whole name matches, the year lies between 2000
+    and the current year, and the period is acceptable (a numeric month must
+    be a real calendar month whose first day is before now).  Returns False
+    in every other case.
+    """
+    filenameregex = r'^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9QY][0-9]$'
+    dateregex = r'[0-9][0-9][0-9][0-9]_[0-9QY][0-9]'
+    validName = re.search(filenameregex, filename) is not None
+    found = re.search(dateregex, filename)
+    if not found:
+        return False
+    date = found.group(0)
+    now = datetime.now()
+    year, period = date[:4], date[5:7]
+    validYear = 2000 <= int(year) <= now.year
+    if 'Q' in date:
+        validPeriod = period in ('Q0', 'Q1', 'Q2', 'Q3', 'Q4')
+    elif 'Y' in date:
+        validPeriod = period == 'Y1'
+    else:
+        try:
+            validPeriod = datetime.strptime(date, "%Y_%m") < now
+        except ValueError:
+            # strptime rejects month values such as "00" or "13".
+            return False
+    # Always hand back a real bool: previously the function fell off the end
+    # and returned None whenever one of the checks failed.
+    return validName and validYear and validPeriod
+
+
+def validateURL(url):
+    """Fetch url and report whether it responds and looks like a spreadsheet.
+
+    Returns a ``(validURL, validFiletype)`` tuple.  ``validURL`` is True when
+    the response status is 200.  ``validFiletype`` is True when the file
+    extension (taken from the Content-Disposition header if present,
+    otherwise from the URL itself) is .csv, .xls or .xlsx.  A request that
+    answers 500 is retried up to three times.  Any failure while fetching
+    yields ``(False, False)``.
+    """
+    try:
+        count = 1
+        while True:
+            try:
+                r = urllib2.urlopen(url)
+                break
+            except urllib2.HTTPError as e:
+                # urlopen raises HTTPError for a 500 instead of returning a
+                # response object, so the retry has to happen here.
+                if e.code != 500 or count >= 4:
+                    raise
+                print ("Attempt {0} - Status code: {1}. Retrying.".format(count, e.code))
+                count += 1
+        try:
+            sourceFilename = r.headers.get('Content-Disposition')
+            validURL = r.getcode() == 200
+        finally:
+            # Only the headers and status are needed; release the connection.
+            r.close()
+
+        if sourceFilename:
+            # Drop the quotes, semicolons and spaces that can trail the name
+            # inside the header value.
+            ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
+        else:
+            ext = os.path.splitext(url)[1]
+        validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx']
+        return validURL, validFiletype
+    except Exception:
+        # Best-effort check: any network or parsing problem counts as invalid.
+        # KeyboardInterrupt/SystemExit are no longer swallowed.
+        print ("Error validating URL.")
+        return False, False
+
+
+def validate(filename, file_url):
+    """Run the filename, URL and filetype checks and report the first failure.
+
+    validateFilename and validateURL are both always called.  Returns True
+    when all three checks pass; otherwise prints the filename with the
+    matching error message, then the URL, and returns False.
+    """
+    nameOk = validateFilename(filename)
+    urlOk, typeOk = validateURL(file_url)
+    # Evaluated in this order; only the first failing check is reported.
+    checks = (
+        (nameOk, "*Error: Invalid filename*"),
+        (urlOk, "*Error: Invalid URL*"),
+        (typeOk, "*Error: Invalid filetype*"),
+    )
+    for passed, message in checks:
+        if not passed:
+            print filename, message
+            print file_url
+            return False
+    return True
+
+
+def convert_mth_strings(mth_string):
+    """Return mth_string with each upper-case month abbreviation (JAN..DEC)
+    replaced by its two-digit month number; other text is left untouched."""
+    month_numbers = {'JAN': '01', 'FEB': '02', 'MAR':'03', 'APR':'04', 'MAY':'05', 'JUN':'06', 'JUL':'07', 'AUG':'08', 'SEP':'09','OCT':'10','NOV':'11','DEC':'12' }
+    converted = mth_string
+    for abbreviation in month_numbers:
+        converted = converted.replace(abbreviation, month_numbers[abbreviation])
+    return converted
+
+#### VARIABLES 1.0
+
 entity_id = "E1821_WCC_gov"
 url = "http://www.worcestershire.gov.uk/info/20024/council_finance/331/payments_to_commercial_suppliers_over_500_and_government_procurement_card_transactions"
+errors = 0
+data = []
+
+#### READ HTML 1.0
 
-# Set up functions
-def convert_mth_strings ( mth_string ):
-	month_numbers = {'JAN': '01', 'FEB': '02', 'MAR':'03', 'APR':'04', 'MAY':'05', 'JUN':'06', 'JUL':'07', 'AUG':'08', 'SEP':'09','OCT':'10','NOV':'11','DEC':'12' }
-	#loop through the months in our dictionary
-	for k, v in month_numbers.items():
-		#then replace the word with the number
-		mth_string = mth_string.replace(k, v)
-	return mth_string
-
-# pull down the content from the webpage
 html = urllib2.urlopen(url)
-soup = BeautifulSoup(html)
+soup = BeautifulSoup(html, 'lxml')
+
+
+#### SCRAPE DATA
 
-# find all entries with the required class
 block = soup.find('div',{'class':'editor'})
 links = block.findAll('a', href=True)
 
 for link in links:
-	suburl = 'http://www.worcestershire.gov.uk' + link['href']
-	if 'payments_to_commercial_suppliers_over' in suburl:
-		html2 = urllib2.urlopen(suburl)
-		soup2 = BeautifulSoup(html2)
-		block = soup2.find('ul', {'class':'item-list item-list__rich'})
-		sublinks = block.findAll('a', href=True)
-
-		for sublink in sublinks:
-			filePageUrl = sublink['href']
-			title = sublink.encode_contents(formatter='html').replace('&nbsp;',' ') #  gets rid of erroneous &nbsp; chars
-			title = title.upper().strip()
-			html3 = urllib2.urlopen(filePageUrl)
-			soup3 = BeautifulSoup(html3)
-
-			block = soup3.find('main',{'class':'main-content'})
-			filelinks = block.findAll('a', href=True)
-
-			for filelink in filelinks:
-		  		# create the right strings for the new filename
-		  		fileurl = filelink['href']
-		  		if 'Download' in filelink.text:
-			  		print filelink.text
-			  		print fileurl
-			  		csvYr = title.split(' ')[-1]
-			  		csvMth = title.split(' ')[-2][:3]
-			  		csvMth = convert_mth_strings(csvMth);
-			  		filename = entity_id + "_" + csvYr + "_" + csvMth
-			  		todays_date = str(datetime.now())
-			  		scraperwiki.sqlite.save(unique_keys=['l'], data={"l": fileurl, "f": filename, "d": todays_date })
-			  		print filename
+    suburl = 'http://www.worcestershire.gov.uk' + link['href']  # assumes hrefs in the 'editor' block are site-relative -- TODO confirm
+    if 'payments_to_commercial_suppliers_over' in suburl:  # only follow links whose URL contains this slug
+        html2 = urllib2.urlopen(suburl)
+        soup2 = BeautifulSoup(html2, 'lxml')
+        block = soup2.find('ul', {'class':'item-list item-list__rich'})  # rebinds module-level `block`; find() gives None if the list is missing
+        sublinks = block.findAll('a', href=True)
+        for sublink in sublinks:
+            filePageUrl = sublink['href']  # used as-is below, so presumably an absolute URL -- verify
+            title = sublink.encode_contents(formatter='html').replace('&nbsp;',' ')  # link's inner HTML with &nbsp; entities turned into spaces
+            title = title.upper().strip()  # upper-case so the month matches convert_mth_strings' upper-case keys
+            html3 = urllib2.urlopen(filePageUrl)
+            soup3 = BeautifulSoup(html3, 'lxml')
+            block = soup3.find('main',{'class':'main-content'})  # find() gives None if the page has no such <main>
+            filelinks = block.findAll('a', href=True)
+            for filelink in filelinks:
+                fileurl = filelink['href']
+                if 'Download' in filelink.text:  # keep only links whose text contains 'Download'
+                    csvYr = title.split(' ')[-1]  # last space-separated word of the title; presumably the year
+                    csvMth = title.split(' ')[-2][:3]  # first three characters of the second-to-last word
+                    csvMth = convert_mth_strings(csvMth)  # e.g. 'JAN' -> '01'
+                    data.append([csvYr, csvMth, fileurl])  # collected now, validated and saved after the scrape
+
+#### STORE DATA 1.0
+
+# Validate every scraped [year, month, link] row and save the ones that pass.
+# The link is unpacked into its own name so the module-level `url` (the index
+# page address) is no longer overwritten on each iteration.
+for csvYr, csvMth, scraped_url in data:
+    filename = entity_id + "_" + csvYr + "_" + csvMth
+    todays_date = str(datetime.now())
+    file_url = scraped_url.strip()
+
+    if validate(filename, file_url):
+        scraperwiki.sqlite.save(unique_keys=['l'], data={"l": file_url, "f": filename, "d": todays_date })
+        print filename
+    else:
+        errors += 1
+
+# Valid rows have already been saved; fail the run if any row was rejected.
+if errors > 0:
+    raise Exception("%d errors occurred during scrape." % errors)
+
 
+#### EOF