
Merge branch 'master' of github.com:sunlightlabs/openstates

2 parents ac16dd3 + 54fa16a · commit bb30b890641a8a7d80ae0d47beee2b90cd4a63bc · @twneale committed Feb 23, 2012
Showing with 47 additions and 60 deletions.
  1. +47 −60 openstates/tn/bills.py
@@ -30,11 +30,21 @@
def categorize_action(action):
    for prefix, types in _categorizers:
-        #if action.startswith(prefix):
        if prefix in action:
            return types
    return 'other'
+def actions_from_table(bill, chamber, actions_table):
+    action_rows = actions_table.xpath("tr[position()>1]")
+    for ar in action_rows:
+        tds = ar.xpath('td')
+        action_taken = tds[0].text
+        action_date = datetime.datetime.strptime(tds[1].text.strip(),
+                                                 '%m/%d/%Y')
+        action_type = categorize_action(action_taken)
+        bill.add_action(chamber, action_taken, action_date, action_type)
+
+
class TNBillScraper(BillScraper):
    state = 'tn'
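The new actions_from_table helper replaces the two copies of action-table parsing removed from scrape_bill below. A rough standalone sketch of the same idea, using a hypothetical StubBill in place of billy's real Bill object and an invented HTML fragment shaped like the TN action-history table:

import datetime
import lxml.html

class StubBill(object):
    """Hypothetical stand-in for the real Bill object; it only records calls."""
    def __init__(self):
        self.actions = []
    def add_action(self, chamber, action, date, type):
        self.actions.append((chamber, action, date, type))

# Made-up markup; the real scraper pulls this table out of the bill page.
doc = lxml.html.fromstring("""
<html><body>
  <table id="gvBillActionHistory">
    <tr><th>Action</th><th>Date</th></tr>
    <tr><td>Filed for introduction.</td><td>02/21/2012</td></tr>
  </table>
</body></html>""")
table = doc.xpath("//table[@id='gvBillActionHistory']")[0]

bill = StubBill()
for row in table.xpath("tr[position()>1]"):
    tds = row.xpath('td')
    action_taken = tds[0].text
    action_date = datetime.datetime.strptime(tds[1].text.strip(), '%m/%d/%Y')
    bill.add_action('lower', action_taken, action_date, 'other')

print(bill.actions)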
@@ -73,34 +83,28 @@ def scrape_bill(self, term, bill_url, bill_type):
        with self.urlopen(bill_url) as page:
            page = lxml.html.fromstring(page)
-            matching_bill = False
-            chamber1 = page.xpath('//span[@id="lblBillSponsor"]/a[1]')[0].text
-
-            #Checking if there is a matching bill
-            if len(page.xpath('//span[@id="lblCoBillSponsor"]/a[1]')) > 0:
-                matching_bill = True
+            page.make_links_absolute(bill_url)
-                chamber2 = page.xpath('//span[@id="lblCoBillSponsor"]/a[1]')[0].text
+            bill_id = page.xpath('//span[@id="lblBillSponsor"]/a[1]')[0].text
+            secondary_bill_id = page.xpath('//span[@id="lblCoBillSponsor"]/a[1]')
-                if '*' in chamber1:
-                    bill_id = chamber1.replace(' ', '')[1:len(chamber1)]
-                    secondary_bill_id = chamber2.replace(' ', '')
-                else:
-                    bill_id = chamber2.replace(' ', '')[1:len(chamber2)]
-                    secondary_bill_id = chamber1.replace(' ', '')
+            # checking if there is a matching bill
+            if secondary_bill_id:
+                secondary_bill_id = secondary_bill_id[0].text
-                primary_chamber = 'lower' if 'H' in bill_id else 'upper'
+                # swap ids if * is in secondary_bill_id
+                if '*' in secondary_bill_id:
+                    bill_id, secondary_bill_id = secondary_bill_id, bill_id
+                secondary_bill_id = secondary_bill_id.strip()
-            else:
-                primary_chamber = 'lower' if 'H' in chamber1 else 'upper'
-                bill_id = chamber1.replace(' ', '')[1:len(chamber1)]
-                secondary_bill_id = None
+            bill_id = bill_id.replace('*', '').strip()
+            primary_chamber = 'lower' if 'H' in bill_id else 'upper'
            secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'
            title = page.xpath("//span[@id='lblAbstract']")[0].text
-            #Bill subject
+            # bill subject
            subject_pos = title.find('-')
            subjects = [s.strip() for s in title[:subject_pos-1].split(',')]
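The reworked ID handling keeps whichever number carries the '*' (which appears to mark the bill actually being scraped) as the primary bill_id. Roughly, with invented sample values:

# Illustrative only; the sample IDs are made up.
bill_id = 'SB 1001'
secondary_bill_id = '*HB 1002'

if '*' in secondary_bill_id:
    bill_id, secondary_bill_id = secondary_bill_id, bill_id
secondary_bill_id = secondary_bill_id.strip()

bill_id = bill_id.replace('*', '').strip()
primary_chamber = 'lower' if 'H' in bill_id else 'upper'
secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

print(bill_id, secondary_bill_id, primary_chamber, secondary_chamber)
# -> HB 1002 SB 1001 lower upper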
@@ -115,61 +119,41 @@ def scrape_bill(self, term, bill_url, bill_type):
            sponsor = sponsor.replace('*','').strip()
            bill.add_sponsor('primary',sponsor)
-            # Co-sponsors unavailable for scraping (loaded into page via AJAX)
-
            # bill text
            summary = page.xpath("//span[@id='lblBillSponsor']/a")[0]
            bill.add_version('Current Version', summary.get('href'))
-            #Primary Actions
-            tables = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")
-            actions_table = tables[0]
-            action_rows = actions_table.xpath("tr[position()>1]")
-            for ar in action_rows:
-                action_taken = ar.xpath("td")[0].text
-                action_date = datetime.datetime.strptime(ar.xpath("td")[1].text.strip(), '%m/%d/%Y')
-                action_type = categorize_action(action_taken)
-                bill.add_action(primary_chamber, action_taken, action_date, action_type)
-
-            #Primary Votes
-            votes_link = page.xpath("//span[@id='lblBillVotes']/a")
-            if(len(votes_link) > 0):
-                votes_link = votes_link[0].get('href')
-                bill = self.scrape_votes(bill, sponsor, 'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_link,))
-
-            #If there is a matching bill
-            if matching_bill == True:
-
-                #Secondary Sponsor
+            # actions
+            atable = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")[0]
+            actions_from_table(bill, primary_chamber, atable)
+
+            # if there is a matching bill
+            if secondary_bill_id:
+                # secondary sponsor
                secondary_sponsor = page.xpath("//span[@id='lblCoBillSponsor']")[0].text_content().split("by")[-1]
                secondary_sponsor = secondary_sponsor.replace('*','').replace(')', '').strip()
                bill.add_sponsor('secondary', secondary_sponsor)
-                #Secondary Actions
-                tables2 = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvCoActionHistory']")
-                actions2_table = tables2[0]
-                action2_rows = actions2_table.xpath("tr[position()>1]")
-                for ar2 in action2_rows:
-                    action2_taken = ar2.xpath("td")[0].text
-                    action2_date = datetime.datetime.strptime(ar2.xpath("td")[1].text.strip(), '%m/%d/%Y')
-                    action2_type = categorize_action(action2_taken)
-                    bill.add_action(secondary_chamber, action2_taken,
-                                    action2_date, action2_type)
-
-                #Secondary Votes
-                votes2_link = page.xpath("//span[@id='lblBillVotes']/a")
-                if(len(votes_link) > 0):
-                    votes2_link = votes2_link[0].get('href')
-                    bill = self.scrape_votes(bill, secondary_sponsor, 'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_link,))
+                # secondary actions
+                cotable = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvCoActionHistory']")[0]
+                actions_from_table(bill, secondary_chamber, cotable)
+
+            # votes
+            votes_link = page.xpath("//span[@id='lblBillVotes']/a/@href")
+            if len(votes_link) > 0:
+                bill = self.scrape_votes(bill, votes_link[0])
+            votes_link = page.xpath("//span[@id='lblCompVotes']/a/@href")
+            if len(votes_link) > 0:
+                bill = self.scrape_votes(bill, votes_link[0])
            self.save_bill(bill)
-    def scrape_votes(self, bill, sponsor, link):
+    def scrape_votes(self, bill, link):
        with self.urlopen(link) as page:
            page = lxml.html.fromstring(page)
            raw_vote_data = page.xpath("//span[@id='lblVoteData']")[0].text_content()
-            raw_vote_data = raw_vote_data.strip().split('%s by %s - ' % (bill['bill_id'], sponsor))[1:]
+            raw_vote_data = re.split('.+ by .+ -', raw_vote_data.strip())[1:]
            for raw_vote in raw_vote_data:
                raw_vote = raw_vote.split(u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0')
                motion = raw_vote[0]
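For reference, re.split takes the pattern first and the string to split second, and the run of ten \xa0 characters is what separates the motion from the tallies in lblVoteData. Against an invented fragment (not real TN data) it behaves roughly like this:

import re

raw = (u'HB 1002 by Smith - FLOOR VOTE: Third Reading 2/23/2012'
       u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Ayes...............95'
       u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Noes...............0')

for raw_vote in re.split('.+ by .+ -', raw)[1:]:
    raw_vote = raw_vote.split(u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0')
    print(raw_vote[0].strip())                # FLOOR VOTE: Third Reading 2/23/2012
    print([v.strip() for v in raw_vote[1:]])  # ['Ayes...............95', 'Noes...............0']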
@@ -189,6 +173,7 @@ def scrape_votes(self, bill, sponsor, link):
                nos = []
                for v in raw_vote[1:]:
+                    v = v.strip()
                    if v.startswith('Ayes...') and vote_regex.search(v):
                        yes_count = int(vote_regex.search(v).group())
                    elif v.startswith('Noes...') and vote_regex.search(v):
@@ -209,6 +194,8 @@ def scrape_votes(self, bill, sponsor, link):
                    vote.yes(a)
                for n in nos:
                    vote.no(n)
+
+                vote.validate()
                bill.add_vote(vote)
        return bill
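The validate() call is there so a vote whose recorded voter lists disagree with its tallied counts fails during the scrape instead of being saved quietly. A rough illustration of that kind of consistency check, using a hypothetical stub rather than billy's real Vote class:

class StubVote(object):
    """Hypothetical stand-in showing the sort of check validate() is meant to perform."""
    def __init__(self, motion, yes_count, no_count):
        self.motion = motion
        self.yes_count = yes_count
        self.no_count = no_count
        self.yes_votes = []
        self.no_votes = []
    def yes(self, name):
        self.yes_votes.append(name)
    def no(self, name):
        self.no_votes.append(name)
    def validate(self):
        if (len(self.yes_votes) != self.yes_count or
                len(self.no_votes) != self.no_count):
            raise ValueError('vote counts do not match voter lists: %s' % self.motion)

vote = StubVote('Third Reading', yes_count=2, no_count=0)
vote.yes('Smith')
vote.validate()   # raises ValueError: only one yes vote was recorded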
