## Tutorial 4: Logistic Regression 

### Preprocessing

We will generate the data file `2017_cleaned_votes.csv` using the following function that scrapes the votes for a given roll call in a given year.

In [1]:
import urllib, os.path, time
import xml.etree.ElementTree as ET
import numpy as np

In [7]:
def scrape_votes(year, roll, sleep=True):
    """
    Return the recorded votes of one House roll call as a structured array.

    The roll call XML is downloaded from clerk.house.gov the first time it
    is requested and cached as ``./vote/<year>_<roll>.xml``; later calls
    parse the cached copy without touching the network.

    Parameters
    ----------
    year : int
        Year of the roll call.
    roll : int
        Roll call number within that year.
    sleep : bool, optional
        If true (the default), pause for 2 or 3 seconds before a download
        so repeated calls do not hammer the server. Cached rolls never
        sleep.

    Returns
    -------
    numpy.ndarray
        One record per ``<recorded-vote>`` element, with the fields
        ``name``, ``vote<roll>``, ``party``, ``id`` and
        ``numeric_vote<roll>``. "Yea" and "Aye" are coded as 1, "Nay" and
        "No" as -1, "Not Voting" and "Present" as 0.

    Raises
    ------
    KeyError
        If a vote is not one of the six strings above (for example a roll
        call where members vote for a candidate by name).
    """
    # Imported here so the function works on both Python 2 and Python 3.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    numeric_codes = {'Yea': 1, 'Aye': 1, 'Nay': -1, 'No': -1,
                     'Not Voting': 0, 'Present': 0}

    local_path = "./vote/%s_%s.xml" % (year, roll)

    # download the data and make a local copy
    if not os.path.exists(local_path):
        if sleep:
            time.sleep(np.random.randint(2, 4))
        response = urlopen("http://clerk.house.gov/evs/%d/roll%03d.xml" % (year, roll))
        try:
            data = response.read()
        finally:
            response.close()

        cache_dir = os.path.dirname(local_path)
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        # binary mode: the payload is written exactly as received
        with open(local_path, 'wb') as f:
            f.write(data)

    # parse the XML tree
    # an investigation of one of the .xml files shows that
    # the information we want is in the <recorded-vote> tags
    with open(local_path, 'rb') as f:
        xml_data = ET.parse(f)

    values = []
    for vote in xml_data.find('vote-data').findall('recorded-vote'):
        legislator = vote.find('legislator')
        name = legislator.get('unaccented-name')
        ident = legislator.get('name-id')
        party = legislator.get('party')
        v = vote.find('vote').text
        values.append((name, v, party, ident, numeric_codes[v]))

    # builtin int replaces the np.int alias, which newer NumPy no longer has
    return np.array(values, np.dtype([('name', '|S50'),
                                      ('vote%d' % roll, '|S12'),
                                      ('party', '|S15'),
                                      ('id', '|S12'),
                                      ('numeric_vote%d' % roll, int)]))

Here’s a sample of what this function returns.

In [8]:
# Show the first 20 records of roll call 501 from 2017; the XML is downloaded
# on first use and read from the local ./vote/ copy afterwards.
scrape_votes(2017,501)[:20]

array([('Abraham', 'No', 'R', 'A000374', -1),
       ('Adams', 'Aye', 'D', 'A000370',  1),
       ('Aderholt', 'No', 'R', 'A000055', -1),
       ('Aguilar', 'Aye', 'D', 'A000371',  1),
       ('Allen', 'No', 'R', 'A000372', -1),
       ('Amash', 'No', 'R', 'A000367', -1),
       ('Amodei', 'No', 'R', 'A000369', -1),
       ('Arrington', 'No', 'R', 'A000375', -1),
       ('Babin', 'No', 'R', 'B001291', -1),
       ('Bacon', 'No', 'R', 'B001298', -1),
       ('Banks (IN)', 'No', 'R', 'B001299', -1),
       ('Barletta', 'No', 'R', 'B001269', -1),
       ('Barr', 'No', 'R', 'B001282', -1),
       ('Barragan', 'Aye', 'D', 'B001300',  1),
       ('Barton', 'Aye', 'R', 'B000213',  1),
       ('Bass', 'Aye', 'D', 'B001270',  1),
       ('Beatty', 'Aye', 'D', 'B001281',  1),
       ('Bera', 'Aye', 'D', 'B001287',  1),
       ('Bergman', 'No', 'R', 'B001301', -1),
       ('Beyer', 'Aye', 'D', 'B001292',  1)],
      dtype=[('name', 'S50'), ('vote501', 'S12'), ('party', 'S15'), ('id', 'S12'), ('numeric_vote501', '<i8')])

We will get all votes for 2017 using a simple loop (with a short sleep between requests so we don’t get blocked by the server).

In [9]:
import matplotlib.mlab as ML # for recarray utils

def get_all_votes():
    """
    Scrape every 2017 roll call and merge the votes into one table on disk.

    Roll call 1 provides the initial table (including 'name' and 'party').
    Roll calls 2 through 710 are then joined onto it one at a time using
    the member 'id'; each join adds that roll's vote columns. A roll that
    cannot be scraped or merged is skipped and the loop moves on.

    Two ';'-delimited files are written:

    * ./data/2017_votes.csv -- every column; also saved after every 100th
      roll so partial progress survives an interruption.
    * ./data/2017_cleaned_votes.csv -- only 'party' and the
      'numeric_vote<roll>' columns.

    Returns None.
    """
    # the CSV writes below fail if the output directory is missing
    if not os.path.isdir('./data'):
        os.makedirs('./data')

    data = scrape_votes(2017, 1)
    for i in range(2, 711):
        try:
            new_votes = scrape_votes(2017, i)
            # 'name' and 'party' are already in `data`; drop them so the
            # join only contributes this roll's vote columns
            newdata = ML.rec_join('id', data,
                                  ML.rec_drop_fields(new_votes, ['name', 'party']))
            if newdata.shape[0] < 10:
                # A merge seems to have failed if this happens...
                raise ValueError("merge for roll %d kept only %d rows"
                                 % (i, newdata.shape[0]))
        except Exception:
            # Best effort: skip rolls that cannot be used (e.g. roll 2 is
            # the election of the speaker... not a Y/N vote). Catching
            # Exception rather than everything keeps Ctrl-C working.
            pass
        else:
            data = newdata

        # every 100th vote, let's save what we have to disk
        if i % 100 == 0:
            ML.rec2csv(data, './data/2017_votes.csv', delimiter=';')

    # we don't need all columns to analyse the data
    cleaned = ML.rec_keep_fields(data, ['party'] + [n for n in data.dtype.names if 'numeric_vote' in n])
    ML.rec2csv(cleaned, './data/2017_cleaned_votes.csv', delimiter=';')
    ML.rec2csv(data, './data/2017_votes.csv', delimiter=';')

# Set `force` to True to rebuild the CSV files even when they already exist.
force = False
if force or not os.path.exists('./data/2017_cleaned_votes.csv'):
    # the cleaned file is missing (or a rebuild was requested): scrape everything
    get_all_votes()