Commit 5871317

Update scraper.py

blablupcom committed Jun 20, 2018
1 parent a9fcc24 commit 5871317
Showing 1 changed file with 33 additions and 30 deletions.

63 changes: 33 additions & 30 deletions scraper.py
@@ -2,15 +2,16 @@
 
 #### IMPORTS 1.0
 
-import os
+import os, json
 import re
 import scraperwiki
-import urllib2
 from datetime import datetime
 from bs4 import BeautifulSoup
 
 
-#### FUNCTIONS 1.0
+#### FUNCTIONS 1.2
+import requests
+
 
 def validateFilename(filename):
     filenameregex = '^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9QY][0-9]$'
@@ -38,19 +39,19 @@ def validateFilename(filename):
 
 def validateURL(url):
     try:
-        r = urllib2.urlopen(url)
+        r = requests.get(url)
         count = 1
-        while r.getcode() == 500 and count < 4:
+        while r.status_code == 500 and count < 4:
             print ("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code))
             count += 1
-            r = urllib2.urlopen(url)
+            r = requests.get(url)
         sourceFilename = r.headers.get('Content-Disposition')
 
         if sourceFilename:
             ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
         else:
             ext = os.path.splitext(url)[1]
-        validURL = r.getcode() == 200
+        validURL = r.status_code == 200
         validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx']
         return validURL, validFiletype
     except:
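The urllib2-to-requests switch changes more than naming here: urllib2.urlopen raises urllib2.HTTPError as soon as the server answers 500, so the old r.getcode() == 500 test could never be reached on the happy path, while requests.get returns a Response object whatever the status code, which is what lets the retry loop above actually run. A minimal Python 2 sketch of the contrast (httpbin.org/status/500 is just an illustrative test endpoint, not part of the scraper):

# Sketch: how each library surfaces an HTTP 500.
import urllib2
import requests

try:
    urllib2.urlopen('http://httpbin.org/status/500')
except urllib2.HTTPError as e:
    print(e.code)             # 500, raised as an exception before any status check runs

r = requests.get('http://httpbin.org/status/500')
print(r.status_code)          # 500, returned normally, so a retry loop can inspect it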
@@ -85,38 +86,40 @@ def convert_mth_strings ( mth_string ):
 
 #### VARIABLES 1.0
 
-entity_id = "E2632_BDC_gov"
-url = "https://www.broadland.gov.uk/info/200197/spending_and_transparency/339/council_spending_over_250"
+entity_id = "E0933_CCC_gov"
+url = "https://www.carlisle.gov.uk/open-data/DesktopModules/DocumentViewer/API/ContentService/GetFolderDescendants?parentId=2114&sortOrder=&searchText="
 errors = 0
 data = []
 
+ua = {'requestverificationtoken':'i3_cHuEClJ7QYaW579uxiAdR0m-il3z1uPP0g-9NyhnITv5P2WkfpK7fMq07abe_x1G8xomRp2dZA_EuF_uZqzGlNcc2Qe0p3l5yLBdpUlDAM9fwsRHsaJUO6gA1',
+      'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
+      'x-requested-with':'XMLHttpRequest', 'moduleid':'8297', 'tabid':'1600'}
 
 #### READ HTML 1.0
 
-html = urllib2.urlopen(url)
-soup = BeautifulSoup(html, 'lxml')
+html = requests.get(url, headers=ua)
+json_soup = json.loads(html.text)
 
 #### SCRAPE DATA
 
-links = soup.find('div', 'editor').find_all('a', href=True)
-for link in links:
-    if 'http' not in link['href']:
-        year_url = 'https://www.broadland.gov.uk' + link['href']
-    else:
-        year_url = link['href']
-    year_html = urllib2.urlopen(year_url)
-    year_soup = BeautifulSoup(year_html, 'lxml')
-    blocks = year_soup.find_all('span', 'download-listing__file-tag download-listing__file-tag--type')
-    for block in blocks:
-        if 'CSV' in block.text:
-            url = block.find_next('a')['href']
-            if 'http' not in url:
-                url = 'https://www.broadland.gov.uk' + url
-            else:
-                url = url
-            file_name = block.find_next('a')['aria-label']
-            csvMth = file_name.split()[-2][:3]
-            csvYr = file_name.split()[-1]
+items = json_soup['Items']
+for item in items:
+    item_key = item['key']
+    year_url = 'https://www.carlisle.gov.uk/open-data/DesktopModules/DocumentViewer/API/ContentService/GetFolderContent?startIndex=0&numItems=100&sort=Name+asc&folderId={}'
+    year_html = requests.get(year_url.format(item_key), headers=ua)
+    year_json_soup = json.loads(year_html.text)
+    year_items = year_json_soup['Items']
+    for year_item in year_items:
+        url = 'https://www.carlisle.gov.uk'+year_item['Url']
+        file_name = year_item['Name']
+        # print file_name.split('_')
+        if '.csv' in file_name:
+            csvYr = file_name.split('_')[-1][:4]
+            csvMth = file_name.split('_')[-2][:3]
+            if '201' in csvMth:
+                csvMth = 'Apr'
+                csvYr = '2011'
+            if '20' not in csvYr:
+                csvYr = '20'+csvYr.split('.')[0]
             csvMth = convert_mth_strings(csvMth.upper())
             data.append([csvYr, csvMth, url])
 
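The two endpoints hit above belong to the DNN DocumentViewer module on carlisle.gov.uk; the response shapes the code relies on can only be read off its field accesses, not from any documentation. A hypothetical sketch of the assumed payloads (all values invented, only the keys are implied by the code):

# Assumed response shapes, inferred from json_soup['Items'], item['key'],
# year_item['Url'] and year_item['Name'] in the loop above. Values are invented.
folder_descendants = {
    'Items': [
        {'key': 2115}    # a sub-folder id, fed into the GetFolderContent URL
    ]
}
folder_content = {
    'Items': [
        {'Url': '/media/1234/payments_over_250_Apr_18.csv',    # hypothetical path
         'Name': 'payments_over_250_Apr_18.csv'}               # hypothetical name
    ]
}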

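A worked example of the year and month extraction, using a hypothetical filename in the underscore-separated pattern the parsing assumes; the '201' in csvMth branch appears to special-case one 2011 file whose name ends in a bare year rather than a month:

# Hypothetical filename; real names come from the 'Name' field of the API response.
file_name = 'payments_over_250_Apr_18.csv'
parts = file_name.split('_')          # ['payments', 'over', '250', 'Apr', '18.csv']
csvYr = parts[-1][:4]                 # '18.c'
csvMth = parts[-2][:3]                # 'Apr'
if '20' not in csvYr:                 # two-digit year: drop the extension, prefix the century
    csvYr = '20' + csvYr.split('.')[0]
print('{0} {1}'.format(csvYr, csvMth))    # 2018 Apr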