first commit

blablupcom · Dec 9, 2015 · d8acf88
1 parent c247efb
commit d8acf88
Showing 1 changed file with 84 additions and 47 deletions.
diff --git a/scraper.py b/scraper.py
@@ -1,97 +1,134 @@
 # -*- coding: utf-8 -*-
+
+#### IMPORTS 1.0
+
 import os
 import re
-import requests
 import scraperwiki
 import urllib2
 from datetime import datetime
 from bs4 import BeautifulSoup
-from dateutil.parser import parse
 
-# Set up variables
-entity_id = "E1301_DBC_gov"
-url = "http://www.darlington.gov.uk/your-council/council-information/financial-information/spending-data/"
-errors = 0
-# Set up functions
+#### FUNCTIONS 1.0
+
 def validateFilename(filename):
-    filenameregex = '^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9][0-9]$'
-    dateregex = '[0-9][0-9][0-9][0-9]_[0-9][0-9]'
+    filenameregex = '^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9QY][0-9]$'
+    dateregex = '[0-9][0-9][0-9][0-9]_[0-9QY][0-9]'
     validName = (re.search(filenameregex, filename) != None)
     found = re.search(dateregex, filename)
     if not found:
         return False
     date = found.group(0)
-    year, month = int(date[:4]), int(date[5:7])
     now = datetime.now()
-    validYear = (2000 <= year <= now.year)
-    validMonth = (1 <= month <= 12)
+    year, month = date[:4], date[5:7]
+    validYear = (2000 <= int(year) <= now.year)
+    if 'Q' in date:
+        validMonth = (month in ['Q0', 'Q1', 'Q2', 'Q3', 'Q4'])
+    elif 'Y' in date:
+        validMonth = (month in ['Y1'])
+    else:
+        try:
+            validMonth = datetime.strptime(date, "%Y_%m") < now
+        except:
+            return False
     if all([validName, validYear, validMonth]):
         return True
+
+
 def validateURL(url):
     try:
-        r = requests.get(url, allow_redirects=True, timeout=20)
+        r = urllib2.urlopen(url)
         count = 1
-        while r.status_code == 500 and count < 4:
+        while r.getcode() == 500 and count < 4:
             print ("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code))
             count += 1
-            r = requests.get(url, allow_redirects=True, timeout=20)
+            r = urllib2.urlopen(url)
         sourceFilename = r.headers.get('Content-Disposition')
 
         if sourceFilename:
             ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
         else:
             ext = os.path.splitext(url)[1]
-        validURL = r.status_code == 200
-        validFiletype = ext in ['.csv', '.xls', '.xlsx']
+        validURL = r.getcode() == 200
+        validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx']
         return validURL, validFiletype
     except:
-        raise
-def convert_mth_strings ( mth_string ):
+        print ("Error validating URL.")
+        return False, False
 
+
+def validate(filename, file_url):
+    validFilename = validateFilename(filename)
+    validURL, validFiletype = validateURL(file_url)
+    if not validFilename:
+        print filename, "*Error: Invalid filename*"
+        print file_url
+        return False
+    if not validURL:
+        print filename, "*Error: Invalid URL*"
+        print file_url
+        return False
+    if not validFiletype:
+        print filename, "*Error: Invalid filetype*"
+        print file_url
+        return False
+    return True
+
+
+def convert_mth_strings ( mth_string ):
     month_numbers = {'JAN': '01', 'FEB': '02', 'MAR':'03', 'APR':'04', 'MAY':'05', 'JUN':'06', 'JUL':'07', 'AUG':'08', 'SEP':'09','OCT':'10','NOV':'11','DEC':'12' }
-    #loop through the months in our dictionary
     for k, v in month_numbers.items():
-#then replace the word with the number
-
         mth_string = mth_string.replace(k, v)
     return mth_string
-# pull down the content from the webpage
+
+#### VARIABLES 1.0
+
+entity_id = "E1301_DBC_gov"
+url = "http://www.darlington.gov.uk/your-council/council-information/financial-information/spending-data/"
+errors = 0
+data = []
+
+#### READ HTML 1.0
+
 html = urllib2.urlopen(url)
 soup = BeautifulSoup(html, 'lxml')
-# find all entries with the required class
+
+
+#### SCRAPE DATA
+
 block = soup.find('section', attrs = {'id':'mainContent'})
 links = block.findAll('a', href=True)
 for link in links:
     csvFile = link.text.strip()
-    if 'CSV' in csvFile:
+    if '.csv' in link['href']:
         Mth = csvFile.split(' ')[0].strip()[:3]
         csvYr = csvFile.split(' ')
         if len(csvYr) == 4:
             csvYr = csvYr[-1].strip()[:5].strip()
         else:
             csvYr = csvYr[1].strip()[:5].strip()
+        url = 'http://www.darlington.gov.uk' + link['href']
         csvMth = convert_mth_strings(Mth.upper())
-        filename = entity_id + "_" + csvYr + "_" + csvMth
-        todays_date = str(datetime.now())
-        file_url = 'http://www.darlington.gov.uk' + link['href']
-        validFilename = validateFilename(filename)
-        validURL, validFiletype = validateURL(file_url)
-        if not validFilename:
-            print filename, "*Error: Invalid filename*"
-            print file_url
-            errors += 1
-            continue
-        if not validURL:
-            print filename, "*Error: Invalid URL*"
-            print file_url
-            errors += 1
-            continue
-        if not validFiletype:
-            print filename, "*Error: Invalid filetype*"
-            print file_url
-            errors += 1
-            continue
+        data.append([csvYr, csvMth, url])
+
+#### STORE DATA 1.0
+
+for row in data:
+    csvYr, csvMth, url = row
+    filename = entity_id + "_" + csvYr + "_" + csvMth
+    todays_date = str(datetime.now())
+    file_url = url.strip()
+
+    valid = validate(filename, file_url)
+
+    if valid == True:
         scraperwiki.sqlite.save(unique_keys=['l'], data={"l": file_url, "f": filename, "d": todays_date })
         print filename
-    if errors > 0:
-        raise Exception("%d errors occurred during scrape." % errors)
+    else:
+        errors += 1
+
+if errors > 0:
+    raise Exception("%d errors occurred during scrape." % errors)
+
+
+#### EOF