Commit 5871317

Update scraper.py

blablupcom committed Jun 20, 2018
1 parent a9fcc24 commit 5871317
Showing 1 changed file with 33 additions and 30 deletions.

63 changes: 33 additions & 30 deletions scraper.py
@@ -2,15 +2,16 @@
 
 #### IMPORTS 1.0
 
-import os
+import os, json
 import re
 import scraperwiki
-import urllib2
 from datetime import datetime
 from bs4 import BeautifulSoup
 
 
-#### FUNCTIONS 1.0
+#### FUNCTIONS 1.2
+import requests
+
 
 def validateFilename(filename):
     filenameregex = '^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9QY][0-9]$'
@@ -38,19 +39,19 @@ def validateFilename(filename):
 
 def validateURL(url):
     try:
-        r = urllib2.urlopen(url)
+        r = requests.get(url)
         count = 1
-        while r.getcode() == 500 and count < 4:
+        while r.status_code == 500 and count < 4:
             print ("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code))
             count += 1
-            r = urllib2.urlopen(url)
+            r = requests.get(url)
         sourceFilename = r.headers.get('Content-Disposition')
 
         if sourceFilename:
             ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
         else:
             ext = os.path.splitext(url)[1]
-        validURL = r.getcode() == 200
+        validURL = r.status_code == 200
         validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx']
         return validURL, validFiletype
     except:
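The urllib2-to-requests switch changes more than naming here: urllib2.urlopen raises urllib2.HTTPError as soon as the server answers 500, so the old r.getcode() == 500 test could never be reached on the happy path, while requests.get returns a Response object whatever the status code, which is what lets the retry loop above actually run. A minimal Python 2 sketch of the contrast (httpbin.org/status/500 is just an illustrative test endpoint, not part of the scraper):

# Sketch: how each library surfaces an HTTP 500.
import urllib2
import requests

try:
    urllib2.urlopen('http://httpbin.org/status/500')
except urllib2.HTTPError as e:
    print(e.code)             # 500, raised as an exception before any status check runs

r = requests.get('http://httpbin.org/status/500')
print(r.status_code)          # 500, returned normally, so a retry loop can inspect it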
@@ -85,38 +86,40 @@ def convert_mth_strings ( mth_string ):
 
 #### VARIABLES 1.0
 
-entity_id = "E2632_BDC_gov"
-url = "https://www.broadland.gov.uk/info/200197/spending_and_transparency/339/council_spending_over_250"
+entity_id = "E0933_CCC_gov"
+url = "https://www.carlisle.gov.uk/open-data/DesktopModules/DocumentViewer/API/ContentService/GetFolderDescendants?parentId=2114&sortOrder=&searchText="
 errors = 0
 data = []
 
+ua = {'requestverificationtoken':'i3_cHuEClJ7QYaW579uxiAdR0m-il3z1uPP0g-9NyhnITv5P2WkfpK7fMq07abe_x1G8xomRp2dZA_EuF_uZqzGlNcc2Qe0p3l5yLBdpUlDAM9fwsRHsaJUO6gA1',
+      'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
+      'x-requested-with':'XMLHttpRequest', 'moduleid':'8297', 'tabid':'1600'}
 
 #### READ HTML 1.0
 
-html = urllib2.urlopen(url)
-soup = BeautifulSoup(html, 'lxml')
+html = requests.get(url, headers=ua)
+json_soup = json.loads(html.text)
 
 #### SCRAPE DATA
 
-links = soup.find('div', 'editor').find_all('a', href=True)
-for link in links:
-    if 'http' not in link['href']:
-        year_url = 'https://www.broadland.gov.uk' + link['href']
-    else:
-        year_url = link['href']
-    year_html = urllib2.urlopen(year_url)
-    year_soup = BeautifulSoup(year_html, 'lxml')
-    blocks = year_soup.find_all('span', 'download-listing__file-tag download-listing__file-tag--type')
-    for block in blocks:
-        if 'CSV' in block.text:
-            url = block.find_next('a')['href']
-            if 'http' not in url:
-                url = 'https://www.broadland.gov.uk' + url
-            else:
-                url = url
-            file_name = block.find_next('a')['aria-label']
-            csvMth = file_name.split()[-2][:3]
-            csvYr = file_name.split()[-1]
+items = json_soup['Items']
+for item in items:
+    item_key = item['key']
+    year_url = 'https://www.carlisle.gov.uk/open-data/DesktopModules/DocumentViewer/API/ContentService/GetFolderContent?startIndex=0&numItems=100&sort=Name+asc&folderId={}'
+    year_html = requests.get(year_url.format(item_key), headers=ua)
+    year_json_soup = json.loads(year_html.text)
+    year_items = year_json_soup['Items']
+    for year_item in year_items:
+        url = 'https://www.carlisle.gov.uk'+year_item['Url']
+        file_name = year_item['Name']
+        # print file_name.split('_')
+        if '.csv' in file_name:
+            csvYr = file_name.split('_')[-1][:4]
+            csvMth = file_name.split('_')[-2][:3]
+            if '201' in csvMth:
+                csvMth = 'Apr'
+                csvYr = '2011'
+            if '20' not in csvYr:
+                csvYr = '20'+csvYr.split('.')[0]
             csvMth = convert_mth_strings(csvMth.upper())
             data.append([csvYr, csvMth, url])
 
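The two endpoints hit above belong to the DNN DocumentViewer module on carlisle.gov.uk; the response shapes the code relies on can only be read off its field accesses, not from any documentation. A hypothetical sketch of the assumed payloads (all values invented, only the keys are implied by the code):

# Assumed response shapes, inferred from json_soup['Items'], item['key'],
# year_item['Url'] and year_item['Name'] in the loop above. Values are invented.
folder_descendants = {
    'Items': [
        {'key': 2115}    # a sub-folder id, fed into the GetFolderContent URL
    ]
}
folder_content = {
    'Items': [
        {'Url': '/media/1234/payments_over_250_Apr_18.csv',    # hypothetical path
         'Name': 'payments_over_250_Apr_18.csv'}               # hypothetical name
    ]
}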

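A worked example of the year and month extraction, using a hypothetical filename in the underscore-separated pattern the parsing assumes; the '201' in csvMth branch appears to special-case one 2011 file whose name ends in a bare year rather than a month:

# Hypothetical filename; real names come from the 'Name' field of the API response.
file_name = 'payments_over_250_Apr_18.csv'
parts = file_name.split('_')          # ['payments', 'over', '250', 'Apr', '18.csv']
csvYr = parts[-1][:4]                 # '18.c'
csvMth = parts[-2][:3]                # 'Apr'
if '20' not in csvYr:                 # two-digit year: drop the extension, prefix the century
    csvYr = '20' + csvYr.split('.')[0]
print('{0} {1}'.format(csvYr, csvMth))    # 2018 Apr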