Skip to content

Commit

Permalink
sublinks: switched from find to findAll and added a loop over all matching links
Browse files Browse the repository at this point in the history
  • Loading branch information
woodbine committed May 6, 2015
1 parent 28f21c7 commit 981e5cb
Showing 1 changed file with 24 additions and 23 deletions.
47 changes: 24 additions & 23 deletions scraper.py
Expand Up @@ -32,28 +32,29 @@ def convert_mth_strings ( mth_string ):
html2 = urllib2.urlopen(suburl)
soup2 = BeautifulSoup(html2)
block = soup2.find('ul', {'class':'item-list item-list__rich'})
sublink = block.find('a', href=True)
filePageUrl = sublink['href']
sublinks = block.findAll('a', href=True)

title = sublink.encode_contents(formatter='html').replace(' ',' ') # gets rid of erroneous   chars
title = title.upper().strip()
html3 = urllib2.urlopen(filePageUrl)
soup3 = BeautifulSoup(html3)

block = soup3.find('main',{'class':'main-content'})
filelinks = block.findAll('a', href=True)

for filelink in filelinks:
# create the right strings for the new filename
fileurl = filelink['href']
if 'Download' in filelink.text:
print filelink.text
print fileurl
csvYr = title.split(' ')[-1]
csvMth = title.split(' ')[-2][:3]
csvMth = convert_mth_strings(csvMth);
filename = entity_id + "_" + csvYr + "_" + csvMth
todays_date = str(datetime.now())
scraperwiki.sqlite.save(unique_keys=['l'], data={"l": fileurl, "f": filename, "d": todays_date })
print filename
for sublink in sublinks:
filePageUrl = sublink['href']
title = sublink.encode_contents(formatter='html').replace(' ',' ') # gets rid of erroneous   chars
title = title.upper().strip()
html3 = urllib2.urlopen(filePageUrl)
soup3 = BeautifulSoup(html3)

block = soup3.find('main',{'class':'main-content'})
filelinks = block.findAll('a', href=True)

for filelink in filelinks:
# create the right strings for the new filename
fileurl = filelink['href']
if 'Download' in filelink.text:
print filelink.text
print fileurl
csvYr = title.split(' ')[-1]
csvMth = title.split(' ')[-2][:3]
csvMth = convert_mth_strings(csvMth);
filename = entity_id + "_" + csvYr + "_" + csvMth
todays_date = str(datetime.now())
scraperwiki.sqlite.save(unique_keys=['l'], data={"l": fileurl, "f": filename, "d": todays_date })
print filename

0 comments on commit 981e5cb

Please sign in to comment.