Skip to content

Commit

Permalink
AK: scrape multiple vote motions on a single page (fixes openstates#182)
Browse files Browse the repository at this point in the history
  • Loading branch information
twneale committed Feb 22, 2012
1 parent e37c727 commit f9396e7
Showing 1 changed file with 51 additions and 45 deletions.
96 changes: 51 additions & 45 deletions openstates/ak/bills.py
Expand Up @@ -183,54 +183,60 @@ def scrape_bill(self, chamber, session, bill_id, bill_type, url):

self.save_bill(bill)

def parse_vote(self, bill, action, act_chamber, act_date, url,
               re_vote_text=re.compile(r'The question being:\s*"(.*?\?)"', re.S),
               re_header=re.compile(r'\d{2}-\d{2}-\d{4}\s{10,}\w{,20} Journal\s{10,}\d{,6}\s{,4}')):
    """Scrape every vote reported on a journal page and attach it to `bill`.

    A single page may report several motions.  The text of the second
    ``<pre>`` element is split on each ``The question being: "...?"``
    marker; every marker starts a new vote whose tallies and member
    roster are read from the text that follows it.

    :param bill: bill object; each vote found is passed to
        ``bill.add_vote``.
    :param action: action text.  Unused here (tallies are read from the
        page itself); kept so existing callers keep working.
    :param act_chamber: chamber passed through to ``Vote``.
    :param act_date: date passed through to ``Vote``.
    :param url: page to fetch; also recorded as the source of each vote.
    :param re_vote_text: compiled pattern whose single group captures the
        motion text.  Bound as a default so it is compiled only once.
    :param re_header: compiled pattern matching the journal page header,
        which can be repeated in the middle of a long report.
    :returns: ``None``.  Nothing is added when the page has no second
        ``<pre>`` element or no motion marker.
    """
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)

    # The report is read from the second <pre>; without it there is
    # nothing to parse, so bail out instead of raising IndexError.
    pre_blocks = doc.xpath('//pre')
    if len(pre_blocks) < 2:
        return

    # Splitting on a pattern with one capture group yields
    # [preamble, motion1, text1, motion2, text2, ...].
    votes_text = re_vote_text.split(pre_blocks[1].text_content())
    votes_data = zip(votes_text[1::2], votes_text[2::2])

    # Summary counts look like "YEAS:  15   NAYS:  3 ...".  Require at
    # least one digit so a label without a number is skipped rather than
    # handed to int() as an empty string.
    re_tally = re.compile(r'([YNEA])[A-Z]+:\s{,3}(\d+)')

    # Roster line prefix -> name of the Vote method that records a member.
    roster_prefixes = (
        ('Yeas: ', 'yes'),
        ('Nays: ', 'no'),
        ('Excused: ', 'other'),
        ('Absent: ', 'other'),
    )

    # Process each reported vote.
    for motion, text in votes_data:
        # The motion may wrap across lines; collapse the line breaks.
        motion = motion.replace('\r\n', ' ')

        yes = no = other = 0
        for vtype, vcount in re_tally.findall(text):
            vcount = int(vcount)
            if vtype == 'Y':
                yes = vcount
            elif vtype == 'N':
                no = vcount
            else:
                other += vcount

        vote = Vote(act_chamber, act_date, motion, yes > no, yes, no, other)

        # In lengthy documents, the "header" can be repeated in the middle
        # of content.  Strip it before reading the roster.
        roster_lines = re_header.sub('', text).splitlines()

        # `record` stays set across continuation lines so a roster that
        # wraps keeps feeding the same category until a blank line.
        record = None
        for line in roster_lines:
            for prefix, method_name in roster_prefixes:
                if line.startswith(prefix):
                    # Slice by the real prefix length; a hard-coded 9 for
                    # 'Absent: ' used to eat the first letter of a name.
                    line = line[len(prefix):]
                    record = getattr(vote, method_name)
                    break
            else:
                if line.strip() == '':
                    record = None

            if record is not None:
                for name in line.split(','):
                    name = name.strip()
                    if name:
                        record(name)

        vote.add_source(url)
        bill.add_vote(vote)

def clean_action(self, action):
# Clean up some acronyms
Expand Down

0 comments on commit f9396e7

Please sign in to comment.