diff --git a/scraper.py b/scraper.py
index 51c3c04..b046431 100644
--- a/scraper.py
+++ b/scraper.py
@@ -2,7 +2,7 @@
 
 #### IMPORTS 1.0
 
-import os
+import os, json
 import re
 import scraperwiki
 import urllib2
@@ -10,7 +10,8 @@ from bs4 import BeautifulSoup
 
-#### FUNCTIONS 1.0
+#### FUNCTIONS 1.2
+import requests
 
 def validateFilename(filename):
     filenameregex = '^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9QY][0-9]$'
@@ -38,19 +39,19 @@ def validateFilename(filename):
 def validateURL(url):
     try:
-        r = urllib2.urlopen(url)
+        r = requests.get(url)
         count = 1
-        while r.getcode() == 500 and count < 4:
+        while r.status_code == 500 and count < 4:
             print ("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code))
             count += 1
-            r = urllib2.urlopen(url)
+            r = requests.get(url)
         sourceFilename = r.headers.get('Content-Disposition')
         if sourceFilename:
             ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
         else:
             ext = os.path.splitext(url)[1]
-        validURL = r.getcode() == 200
+        validURL = r.status_code == 200
         validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx']
         return validURL, validFiletype
     except:
@@ -85,38 +86,40 @@ def convert_mth_strings ( mth_string ):
 
 #### VARIABLES 1.0
 
-entity_id = "E2632_BDC_gov"
-url = "https://www.broadland.gov.uk/info/200197/spending_and_transparency/339/council_spending_over_250"
+entity_id = "E0933_CCC_gov"
+url = "https://www.carlisle.gov.uk/open-data/DesktopModules/DocumentViewer/API/ContentService/GetFolderDescendants?parentId=2114&sortOrder=&searchText="
 errors = 0
 data = []
-
+ua = {'requestverificationtoken':'i3_cHuEClJ7QYaW579uxiAdR0m-il3z1uPP0g-9NyhnITv5P2WkfpK7fMq07abe_x1G8xomRp2dZA_EuF_uZqzGlNcc2Qe0p3l5yLBdpUlDAM9fwsRHsaJUO6gA1',
+      'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
+      'x-requested-with':'XMLHttpRequest', 'moduleid':'8297', 'tabid':'1600'}
 
 #### READ HTML 1.0
 
-html = urllib2.urlopen(url)
-soup = BeautifulSoup(html, 'lxml')
+html = requests.get(url, headers=ua)
+json_soup = json.loads(html.text)
 
 #### SCRAPE DATA
-links = soup.find('div', 'editor').find_all('a', href=True)
-for link in links:
-    if 'http' not in link['href']:
-        year_url = 'https://www.broadland.gov.uk' + link['href']
-    else:
-        year_url = link['href']
-    year_html = urllib2.urlopen(year_url)
-    year_soup = BeautifulSoup(year_html, 'lxml')
-    blocks = year_soup.find_all('span', 'download-listing__file-tag download-listing__file-tag--type')
-    for block in blocks:
-        if 'CSV' in block.text:
-            url = block.find_next('a')['href']
-            if 'http' not in url:
-                url = 'https://www.broadland.gov.uk' + url
-            else:
-                url = url
-            file_name = block.find_next('a')['aria-label']
-            csvMth = file_name.split()[-2][:3]
-            csvYr = file_name.split()[-1]
+items = json_soup['Items']
+for item in items:
+    item_key = item['key']
+    year_url = 'https://www.carlisle.gov.uk/open-data/DesktopModules/DocumentViewer/API/ContentService/GetFolderContent?startIndex=0&numItems=100&sort=Name+asc&folderId={}'
+    year_html = requests.get(year_url.format(item_key), headers=ua)
+    year_json_soup = json.loads(year_html.text)
+    year_items = year_json_soup['Items']
+    for year_item in year_items:
+        url = 'https://www.carlisle.gov.uk'+year_item['Url']
+        file_name = year_item['Name']
+        # print file_name.split('_')
+        if '.csv' in file_name:
+            csvYr = file_name.split('_')[-1][:4]
+            csvMth = file_name.split('_')[-2][:3]
+            if '201' in csvMth:
+                csvMth = 'Apr'
+                csvYr = '2011'
+            if '20' not in csvYr:
+                csvYr = '20'+csvYr.split('.')[0]
+            csvMth = convert_mth_strings(csvMth.upper())
+            data.append([csvYr, csvMth, url])
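The period parsing in the new loop depends only on filenames ending in _<month>_<year>.csv. Below is a minimal standalone sketch of that logic; the sample filenames and the month-mapping helper are stand-ins (the scraper's own convert_mth_strings() is defined elsewhere in scraper.py), not taken from Carlisle's actual data.

    # Sketch of the csvYr/csvMth parsing from the new loop, with hypothetical
    # filenames. convert_mth_strings() below is a stand-in for the scraper's
    # helper, assumed to map a three-letter month abbreviation to its number.

    def convert_mth_strings(mth_string):
        months = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04',
                  'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08',
                  'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}
        return months.get(mth_string, mth_string)

    for file_name in ['spending_report_Apr_2015.csv', 'spending_Apr_15.csv']:
        csvYr = file_name.split('_')[-1][:4]   # '2015' or '15.c'
        csvMth = file_name.split('_')[-2][:3]  # 'Apr'
        if '20' not in csvYr:
            # Two-digit year: drop the extension remnant, prefix the century.
            csvYr = '20' + csvYr.split('.')[0]  # '15.c' -> '2015'
        print(csvYr, convert_mth_strings(csvMth.upper()))  # 2015 04, both times

The folder traversal is similarly narrow in what it assumes about the ContentService API: it reads only the Items list of GetFolderDescendants (taking each item's key) and the Items list of GetFolderContent (taking each item's Url and Name).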