In [1]:
from collections import OrderedDict
from pathlib import PurePath

from bs4 import BeautifulSoup
import json
import requests
import requests_cache
from tqdm.notebook import tqdm

In [2]:
requests_cache.install_cache()

In [3]:
#requests_cache.clear()

In [4]:
BASE_URL = 'https://malegislature.gov'

def select_string(soup, selector):
    try:
        return (soup.select_one(selector).string or '').strip()
    except AttributeError:
        return ''

def get_soup(url):
    # print(url)
    # HACK: Work around SSLError "unable to get local issuer certificate"
    response = requests.get(url, verify=False)
    response.raise_for_status()
    #print(f'Status: {response.status_code}, Cached: {response.from_cache}')
    return BeautifulSoup(response.text, 'lxml')

CLEAN_URLS = {
    'https://malegislature.gov/Legislators/District/4thHampden': {
        'chamber': 'House',
        'district': '4th Hampden'
    }
}

def parse_chamber(soup):
    for row_soup in tqdm(soup.select('#legislatorTable tbody tr')):
        # TODO: Rework this to be more flexible/informative about missing data (e.g. "Vacant")
        row = parse_leg_row(row_soup)

        profile_soup = get_soup(row['url'])
        
        try:
            profile = parse_leg_profile(profile_soup)
        except AttributeError as exc:
            print(f"Error parsing {row['url']}: {exc}")
            profile = OrderedDict()

        profile.update(row)

        clean_data = CLEAN_URLS.get(profile['url'], {})
        profile.update(clean_data)
        
        yield profile

def parse_leg_row(soup):
    return OrderedDict([
        ('first_name', select_string(soup, 'td:nth-of-type(3)')),
        ('last_name', select_string(soup, 'td:nth-of-type(4)')),
        ('party', select_string(soup, 'td:nth-of-type(6)')),
        ('photo', BASE_URL + soup.select_one('.thumb img')['src']),
        ('url', BASE_URL + soup.select_one('td:nth-of-type(3) a')['href']),
        ('email', select_string(soup, 'td:nth-of-type(9) a')),
        ('phone', select_string(soup, 'td:nth-of-type(8)')),
        ('room', select_string(soup, 'td:nth-of-type(7)')),
    ])

def parse_leg_profile(soup):
    leg_type = soup.select_one('h1 span')
    return OrderedDict([
        ('chamber', 'Senate' if leg_type.string == 'Senator' else 'House'),
        ('district', soup.select_one('.subTitle').string.split('-')[1].strip()),
        ('full_name', leg_type.next_sibling.string.strip()),
    ])

def get_chamber(chamber_name):
    chamber_url = BASE_URL + f'/Legislators/Members/{chamber_name.title()}'
    chamber_soup = get_soup(chamber_url)
    chamber = list(parse_chamber(chamber_soup))

    print(json.dumps([chamber[0], "...", chamber[-1]], indent=2))
    
    return chamber

def save_json(obj, json_path):
    with open(json_path, 'w') as json_file:
        json.dump(obj, json_file)

    !ls -lh $json_path
    !head -c 1024 $json_path

In [9]:
representatives = get_chamber('house')

HBox(children=(FloatProgress(value=0.0, max=160.0), HTML(value='')))

Error parsing https://malegislature.gov/Legislators/District/4thHampden: 'NoneType' object has no attribute 'string'

[
  {
    "chamber": "House",
    "district": "2nd Middlesex",
    "full_name": "James Arciero",
    "first_name": "James",
    "last_name": "Arciero",
    "party": "Democrat",
    "photo": "https://malegislature.gov/Legislators/Profile/70/J_A1.jpg",
    "url": "https://malegislature.gov/Legislators/Profile/J_A1",
    "email": "James.Arciero@mahouse.gov",
    "phone": "(617) 722-2012",
    "room": "277"
  },
  "...",
  {
    "first_name": "Vacant",
    "last_name": "4th Hampden",
    "party": "",
    "photo": "https://malegislature.gov/Legislators/Profile/NoPhotoAvailable.jpg?Name=4th%20Hampden&width=70",
    "url": "https://malegislature.gov/Legislators/District/4thHampden",
    "email": "",
    "phone": "(617) 722-2877",
    "room": "174",
    "chamber": "House",
    "district": "4th Hampden"
  }
]


In [10]:
senators = get_chamber('senate')

HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))


[
  {
    "chamber": "Senate",
    "district": "Third Middlesex",
    "full_name": "Michael J. Barrett",
    "first_name": "Michael",
    "last_name": "Barrett",
    "party": "Democrat",
    "photo": "https://malegislature.gov/Legislators/Profile/70/MJB0.jpg",
    "url": "https://malegislature.gov/Legislators/Profile/MJB0",
    "email": "Mike.Barrett@masenate.gov",
    "phone": "(617) 722-1572",
    "room": "109-D"
  },
  "...",
  {
    "chamber": "Senate",
    "district": "Hampden",
    "full_name": "James T. Welch",
    "first_name": "James",
    "last_name": "Welch",
    "party": "Democrat",
    "photo": "https://malegislature.gov/Legislators/Profile/70/JTW0.jpg",
    "url": "https://malegislature.gov/Legislators/Profile/JTW0",
    "email": "James.Welch@masenate.gov",
    "phone": "(617) 722-1660",
    "room": "413-B"
  }
]


In [11]:
 save_json(representatives, '../static/ma_representatives.json')

-rw-r--r--  1 brian  staff    57K Nov 28 06:24 ../static/ma_representatives.json
[{"chamber": "House", "district": "2nd Middlesex", "full_name": "James Arciero", "first_name": "James", "last_name": "Arciero", "party": "Democrat", "photo": "https://malegislature.gov/Legislators/Profile/70/J_A1.jpg", "url": "https://malegislature.gov/Legislators/Profile/J_A1", "email": "James.Arciero@mahouse.gov", "phone": "(617) 722-2012", "room": "277"}, {"chamber": "House", "district": "2nd Hampden", "full_name": "Brian M. Ashe", "first_name": "Brian", "last_name": "Ashe", "party": "Democrat", "photo": "https://malegislature.gov/Legislators/Profile/70/BMA1.jpg", "url": "https://malegislature.gov/Legislators/Profile/BMA1", "email": "Brian.Ashe@mahouse.gov", "phone": "(617) 722-2430", "room": "236"}, {"chamber": "House", "district": "1st Norfolk", "full_name": "Bruce J. Ayers", "first_name": "Bruce", "last_name": "Ayers", "party": "Democrat", "photo": "https://malegislature.gov/Legislators/Profile/70/BJ

In [12]:
save_json(senators, '../static/ma_senators.json')

-rw-r--r--  1 brian  staff    15K Nov 28 06:24 ../static/ma_senators.json
[{"chamber": "Senate", "district": "Third Middlesex", "full_name": "Michael J. Barrett", "first_name": "Michael", "last_name": "Barrett", "party": "Democrat", "photo": "https://malegislature.gov/Legislators/Profile/70/MJB0.jpg", "url": "https://malegislature.gov/Legislators/Profile/MJB0", "email": "Mike.Barrett@masenate.gov", "phone": "(617) 722-1572", "room": "109-D"}, {"chamber": "Senate", "district": "First Suffolk and Middlesex", "full_name": "Joseph A. Boncore", "first_name": "Joseph", "last_name": "Boncore", "party": "Democrat", "photo": "https://malegislature.gov/Legislators/Profile/70/JAB0.jpg", "url": "https://malegislature.gov/Legislators/Profile/JAB0", "email": "Joseph.Boncore@masenate.gov", "phone": "(617) 722-1634", "room": "112"}, {"chamber": "Senate", "district": "Second Plymouth and Bristol", "full_name": "Michael D. Brady", "first_name": "Michael", "last_name": "Brady", "party": "Democrat", "phot

In [13]:
save_json(representatives + senators, '../static/ma_legislators.json')

-rw-r--r--  1 brian  staff    71K Nov 28 06:24 ../static/ma_legislators.json
[{"chamber": "House", "district": "2nd Middlesex", "full_name": "James Arciero", "first_name": "James", "last_name": "Arciero", "party": "Democrat", "photo": "https://malegislature.gov/Legislators/Profile/70/J_A1.jpg", "url": "https://malegislature.gov/Legislators/Profile/J_A1", "email": "James.Arciero@mahouse.gov", "phone": "(617) 722-2012", "room": "277"}, {"chamber": "House", "district": "2nd Hampden", "full_name": "Brian M. Ashe", "first_name": "Brian", "last_name": "Ashe", "party": "Democrat", "photo": "https://malegislature.gov/Legislators/Profile/70/BMA1.jpg", "url": "https://malegislature.gov/Legislators/Profile/BMA1", "email": "Brian.Ashe@mahouse.gov", "phone": "(617) 722-2430", "room": "236"}, {"chamber": "House", "district": "1st Norfolk", "full_name": "Bruce J. Ayers", "first_name": "Bruce", "last_name": "Ayers", "party": "Democrat", "photo": "https://malegislature.gov/Legislators/Profile/70/BJA1.j