Update scraper.py
blablupcom committed Nov 24, 2017
1 parent 34d96ed commit 912be4f
Showing 1 changed file with 19 additions and 32 deletions.
51 changes: 19 additions & 32 deletions scraper.py
@@ -8,10 +8,9 @@
 import urllib2
 from datetime import datetime
 from bs4 import BeautifulSoup
-import requests
 from dateutil.parser import parse
-
-#### FUNCTIONS 1.0
+#### FUNCTIONS 1.2
+import requests # import requests for validating urls
 
 def validateFilename(filename):
     filenameregex = '^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9QY][0-9]$'
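For reference, validateFilename checks names of the form entity_slug_suffix_YYYY_PP, where the final two characters are a quarter code (Q1), a year code (Y1), or a two-digit month. A minimal sketch of the check, with illustrative sample names (not taken from the repo):

import re

filenameregex = '^[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[0-9][0-9][0-9][0-9]_[0-9QY][0-9]$'

for name in ('t18_JNCC_gov_2017_10',    # two-digit month: matches
             't18_JNCC_gov_2017_Q1',    # quarter code: matches
             't18_JNCC_gov_2017_OCT'):  # unconverted month name: rejected
    print(name, bool(re.match(filenameregex, name)))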
@@ -39,25 +38,27 @@ def validateFilename(filename):

 def validateURL(url):
     try:
-        r = urllib2.urlopen(url)
+        r = requests.get(url)
         count = 1
-        while r.getcode() == 500 and count < 4:
+        while r.status_code == 500 and count < 4:
             print ("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code))
             count += 1
-            r = urllib2.urlopen(url)
+            r = requests.get(url)
         sourceFilename = r.headers.get('Content-Disposition')
 
         if sourceFilename:
             ext = os.path.splitext(sourceFilename)[1].replace('"', '').replace(';', '').replace(' ', '')
         else:
             ext = os.path.splitext(url)[1]
-        validURL = r.getcode() == 200
-        validFiletype = ext in ['.csv', '.xls', '.xlsx', '.docx']
+        validURL = r.status_code == 200
+        validFiletype = ext.lower() in ['.csv', '.xls', '.xlsx', '.pdf']
         return validURL, validFiletype
     except:
         print ("Error validating URL.")
         return False, False
 
+
+
 def validate(filename, file_url):
     validFilename = validateFilename(filename)
     validURL, validFiletype = validateURL(file_url)
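The urllib2-to-requests switch is what makes the retry loop meaningful: urllib2.urlopen raises HTTPError on a 500 before getcode() can be checked, while requests.get hands back the Response regardless of status. A standalone sketch of the same pattern (the URL in the usage comment is just the page this scraper targets):

import requests

def fetch_with_retries(url, max_attempts=4):
    # requests.get returns a Response even for 5xx statuses,
    # so the status code can be inspected instead of catching HTTPError.
    r = requests.get(url)
    count = 1
    while r.status_code == 500 and count < max_attempts:
        print("Attempt {0} - Status code: {1}. Retrying.".format(count, r.status_code))
        count += 1
        r = requests.get(url)
    return r

# e.g. r = fetch_with_retries("http://jncc.defra.gov.uk/page-5544")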
@@ -82,42 +83,28 @@ def convert_mth_strings ( mth_string ):
         mth_string = mth_string.replace(k, v)
     return mth_string
 
 
 #### VARIABLES 1.0
 
-entity_id = "E3201_TAWBO_gov"
-urls = ["http://www.telford.gov.uk/downloads/download/64/expenditure_over_100_-_2012","http://www.telford.gov.uk/downloads/download/14/expenditure_over_100_-_2013",
-"http://www.telford.gov.uk/downloads/download/65/expenditure_over_100_-_2014", "http://www.telford.gov.uk/downloads/download/457/expenditure_over_100_-_2015",
-"http://www.telford.gov.uk/downloads/download/1047/expenditure_over_100_-_2017"]
+entity_id = "t18_JNCC_gov"
+url = "http://jncc.defra.gov.uk/page-5544"
 errors = 0
 data = []
-url = 'http://example.com'
 
 #### READ HTML 1.0
 
 
 html = urllib2.urlopen(url)
 soup = BeautifulSoup(html, 'lxml')
 
 
 #### SCRAPE DATA
-for url in urls:
-    html = urllib2.urlopen(url)
-    soup = BeautifulSoup(html, 'lxml')
-    block = soup.find('ul', 'item-list')
-    links = block.find_all('a')
-    for link in links:
-        csvfile = link.text.strip().split('-')[-1].strip()
-        csvMth = csvfile[:3]
-        csvYr = csvfile[-4:]
-        csvMth = convert_mth_strings(csvMth.upper())
-        filename = entity_id + "_" + csvYr + "_" + csvMth
-        todays_date = str(datetime.now())
-        urls = link['href']
-        html_csv = urllib2.urlopen(urls)
-        soup_csv = BeautifulSoup(html_csv, 'lxml')
-        url = soup_csv.find('a', 'button button__primary')['href']
-        file_url = url
+links = soup.find('div', 'holder').find_all('p')
+for link in links:
+    if '.csv' in link.text:
+        url = 'http://jncc.defra.gov.uk'+link.find('a')['href']
+        title = link.text.strip().split()
+        csvYr = title[1][:4]
+        csvMth = title[0][:3]
+        csvMth = convert_mth_strings(csvMth.upper())
+        data.append([csvYr, csvMth, url])

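The rewritten scrape block walks every <p> inside the page's 'holder' div, keeps the ones whose text mentions .csv, and derives year and month from the first two words of the link text. A self-contained sketch of that parsing step, using made-up HTML that assumes the JNCC page's structure:

from bs4 import BeautifulSoup

# Illustrative markup mimicking the assumed layout of the JNCC page.
sample_html = '''
<div class="holder">
  <p><a href="/pdf/spend_oct_2017.csv">October 2017 spend over 25k (.csv)</a></p>
  <p><a href="/page-about">About this data</a></p>
</div>
'''

soup = BeautifulSoup(sample_html, 'lxml')
data = []
for link in soup.find('div', 'holder').find_all('p'):
    if '.csv' in link.text:
        url = 'http://jncc.defra.gov.uk' + link.find('a')['href']
        title = link.text.strip().split()   # e.g. ['October', '2017', 'spend', ...]
        csvYr = title[1][:4]                # '2017'
        csvMth = title[0][:3].upper()       # 'OCT'; convert_mth_strings maps this to '10'
        data.append([csvYr, csvMth, url])

print(data)  # [['2017', 'OCT', 'http://jncc.defra.gov.uk/pdf/spend_oct_2017.csv']]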

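convert_mth_strings is only partly visible here; its mapping dict sits above the hunk's context. A plausible reconstruction consistent with the visible tail (the replace loop and return), assuming the usual three-letter-month to two-digit mapping:

def convert_mth_strings(mth_string):
    # Assumed mapping; the real dict lives above the visible diff context.
    month_numbers = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04',
                     'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08',
                     'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}
    for k, v in month_numbers.items():
        mth_string = mth_string.replace(k, v)
    return mth_string

print(convert_mth_strings('OCT'))  # '10'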