In [293]:
from bs4 import BeautifulSoup
import urllib2
import pandas as pd
import numpy as np

In [325]:
# County codes for counties whose code is not printed on the Wikipedia pages.
# Keys are the `title` attribute of the county link found in an exit table;
# values are the codes stored with each exit.
counties = {
    'Alameda County, California': u'ALA',
    'Contra Costa County, California': u'CC',
    'Los Angeles County, California': u'LA',
    'Napa County, California': u'NAP',
    'Nevada County, California': u'NEV',
    'Orange County, California': u'ORA',
    'Placer County, California': u'PLA',
    'Riverside County, California': u'RIV',
    'Sacramento County, California': u'SAC',
    'San Bernardino County, California': u'SBD',
    'San Diego County, California': u'SD',
    'Sierra County, California': u'SIE',
    'Solano County, California': u'SOL',
    'Yolo County, California': u'YOL',
}

# Roads whose Wikipedia exit table has no 'County' column: the county code is
# fixed per road here instead of being read from the table.
single_county_roads = {
    'I-40': u'SBD',
    'I-105': u'LA',
    'I-110': u'LA',
    'CA-55': u'ORA',
    'CA-85': u'SCL',
    'CA-110': u'LA',
    'CA-170': u'LA',
}

In [295]:
# Interstates ('I-<n>'), scraped from "Interstate_<n>_(California)".
i_roads = 'I-5 I-10 I-15 I-40 I-80 I-105 I-110 I-205 I-210 I-280 I-605 I-680'.split()

# U.S. routes ('US-<n>').
us_roads = 'US-50 US-101'.split()

# California state routes ('CA-<n>').
ca_roads = (
    'CA-1 CA-4 CA-14 CA-15 CA-22 CA-24 '
    'CA-41 CA-55 CA-57 CA-58 CA-60 CA-85 '
    'CA-91 CA-92 CA-99 CA-110 CA-118 '
    'CA-120 CA-126 CA-134 CA-170 CA-198 '
    'CA-215'
).split()

# Kept for reference only; these are deliberately left out of `roads` below.
bad_roads = 'I-8 I-580'.split()

# Every road that gets scraped (bad_roads excluded).
roads = i_roads + us_roads + ca_roads

In [336]:
class Highway:
    """Exit table of one California highway, scraped from its Wikipedia article.

    Construction downloads and parses the article immediately; a highway that
    cannot be fetched or parsed simply ends up with an empty ``exits`` dict.
    """

    def __init__(self, code):
        """Scrape the exits for ``code`` (e.g. 'I-5', 'US-101', 'CA-85').

        Prints one parsed exit as a spot check, or a notice when nothing was
        parsed (previously that case raised StopIteration).
        """
        # Maps exit number (text, zero-padded to two characters for short
        # codes) -> [county code, postmile, location text].
        self.exits = {}
        self.code = code

        self.get_html()

        # Any single entry is enough for an eyeball check of the scrape.
        sample = next(iter(self.exits.values()), None)
        if sample is None:
            print('  %6s: no exits found' % (self.code,))
        else:
            print('  %6s: %4s - %7s - %s' % (self.code, sample[0], sample[1], sample[2]))

    def _page_name(self):
        """Return the Wikipedia article name for this highway, or None when the
        code is not of the form 'CA-<n>', 'I-<n>' or 'US-<n>'."""
        templates = {
            'CA': 'California_State_Route_%s',
            'I': 'Interstate_%s_(California)',
            'US': 'U.S._Route_%s_in_California',
        }
        parts = self.code.split('-')
        if len(parts) < 2 or parts[0] not in templates:
            return None
        return templates[parts[0]] % (parts[1],)

    def get_html(self):
        """Download this highway's Wikipedia article and feed it to ``get_exits``.

        Failures are reported with an 'Unable to find' line instead of being
        raised: an unrecognised road code, a malformed URL (ValueError) and a
        failed request (urllib2.URLError, which includes HTTP errors such as
        404) all leave ``self.exits`` as it was.
        """
        page = self._page_name()
        if page is None:
            # The original left the page name unbound here and crashed.
            print('\tUnable to find: %s' % (self.code,))
            return

        url = 'https://en.wikipedia.org/wiki/' + page

        try:
            print('Checking  %s' % (url,))
            response = urllib2.urlopen(url)
            try:
                el_html = response.read()
            finally:
                # Release the connection even when the read fails.
                response.close()

            html = BeautifulSoup(el_html, 'html')

            self.get_exits(html)

        except (ValueError, urllib2.URLError):
            print('\tUnable to find: %s' % (self.code,))

    def get_exits(self, html):
        """Fill ``self.exits`` from the exit table of a parsed article.

        ``html`` is the parsed page; the table used is the first ``<table>``
        after the 'Exit_list' heading, falling back to 'Major_intersections'.
        If neither heading (or no table after it) exists, nothing is recorded.

        Each recorded exit maps its exit number to
        ``[county code, postmile, location]``.
        """
        heading = html.find('span', id='Exit_list')
        if heading is None:
            heading = html.find('span', id='Major_intersections')
        if heading is None:
            return

        el_table = heading.find_next('table')
        if el_table is None:
            return

        single_county_road = self.code in single_county_roads
        county = single_county_roads[self.code] if single_county_road else ''

        # NOTE(review): `km_highways` is not defined in any visible cell; it
        # must exist in the kernel (presumably a collection of road codes whose
        # tables list both mi and km) before this method runs -- confirm.
        skip_km_column = self.code in km_highways

        # The first row is skipped (presumably the column headers).
        for row in el_table.find_all('tr')[1:]:
            # The county cell spans several rows, so the last county seen is
            # carried forward to the rows that follow.
            if not single_county_road:
                for cell in row.find_all('td'):
                    # Some tables print the county code inside the cell...
                    small = cell.find('small')
                    if small is not None:
                        words = small.get_text().split()
                        if words:
                            county = words[0]

                    # ...others only link to the county; map the link title.
                    link = cell.find('a')
                    if (link is not None and link.has_attr('title')
                            and 'County, California' in link['title']
                            and link['title'] in counties):
                        county = counties[link['title']]

            # The <th> holds the postmile; the exit number and the location are
            # the <td> cells that follow it.
            for th in row.find_all('th'):
                td_exit = th.find_next_sibling('td')

                # Step over the extra distance column when both mi / km are listed.
                if skip_km_column and td_exit is not None:
                    td_exit = td_exit.find_next_sibling('td')

                # Rows for longer exits may have no further cells.
                td_location = td_exit.find_next_sibling('td') if td_exit is not None else None

                postmile = th.get_text()
                exit_code = td_exit.get_text() if td_exit is not None else ''
                location = td_location.get_text().strip() if td_location is not None else ''

                # Keep only the start of a postmile stretch (two decimals).
                if '.' in postmile:
                    postmile = postmile[:postmile.index('.') + 3]

                # Give up on this row when the postmile is too short or the
                # exit cell holds more than one word.
                if len(postmile) < 2 or len(exit_code.split()) > 1:
                    break

                if exit_code != '':
                    # Zero-pad '5' -> '05' and '1A' -> '01A' so short codes sort
                    # alongside two-digit ones.
                    if len(exit_code) == 1 or (len(exit_code) == 2 and exit_code[1].isalpha()):
                        exit_code = '0' + exit_code

                    self.exits[exit_code] = [county, postmile, location]


In [337]:
# Scrape every configured road once. Each Highway prints a sample exit while it
# is built, which serves as a quick validity check of the parse.
highways = {}
for road_code in roads:
    highways[road_code] = Highway(road_code)

Checking  https://en.wikipedia.org/wiki/Interstate_5_(California)
     I-5:  FRE -   29.96 - SR 33 north (Derrick Avenue) – Mendota
Checking  https://en.wikipedia.org/wiki/Interstate_10_(California)
    I-10:  RIV -  216.76 - Ford Dry Lake Road
Checking  https://en.wikipedia.org/wiki/Interstate_15_(California)
    I-15:  SBD -  212.76 - Field Road
Checking  https://en.wikipedia.org/wiki/Interstate_40_(California)
    I-40:  SBD -    2.35 - Marine Corps Logistics Base
Checking  https://en.wikipedia.org/wiki/Interstate_80_(California)
    I-80:  SOL -   29.27 - Magazine Street
Checking  https://en.wikipedia.org/wiki/Interstate_105_(California)
   I-105:   LA -   R3.05 - Prairie Avenue, Hawthorne Boulevard- The Forum
Checking  https://en.wikipedia.org/wiki/Interstate_110_(California)
   I-110:   LA -   29.50 - York Boulevard
Checking  https://en.wikipedia.org/wiki/Interstate_205_(California)
   I-205:   SJ -    1.38 - Mountain House Parkway
Checking  https://en.wikipedia.org/wiki/Intersta

In [None]:
def _exit_sort_key(exit_code):
    """Sort key that orders exit codes by number ('99' before '100'), then by
    any letter suffix; codes without a leading number sort last."""
    digits = 0
    while digits < len(exit_code) and exit_code[digits] in '0123456789':
        digits += 1
    if digits == 0:
        return (1, 0, exit_code)
    return (0, int(exit_code[:digits]), exit_code[digits:])


def _split_postmile(postmile):
    """Split a postmile such as 'R11.0' into (prefix letter, miles); the prefix
    is ' ' when the postmile starts with a digit."""
    if postmile[0].isalpha():
        return postmile[0], float(postmile[1:])
    return ' ', float(postmile)


def get_exit(highway, road):
    """Find the first exit of ``highway`` whose location text contains ``road``.

    highway -- road code used as key into the module-level ``highways`` table
    road    -- substring to look for in each exit's location

    Exits are scanned in numeric exit order. Returns the match formatted as
    'exit - county - postmile', or 0 when no location contains ``road``.
    Raises KeyError if ``highway`` was never scraped.
    """
    # The table built by the scraping cell is `highways`; the previous name
    # `h_list` is not defined anywhere in the notebook.
    exits = highways[highway].exits
    for code in sorted(exits, key=_exit_sort_key):
        county, postmile, location = exits[code][0], exits[code][1], exits[code][2]
        if road in location:
            return '%4s - %3s - %6s' % (code, county, postmile)

    return 0


def get_last_exit(highway, county, postmile):
    """Return the exit passed just before ``postmile`` on ``highway``.

    highway  -- road code used as key into the module-level ``highways`` table
    county   -- county code; exits in other counties are ignored
    postmile -- postmile text, optionally with a one-letter prefix ('R11.0')

    Walks the county's exits in numeric exit order and stops at the first one
    that has the same postmile prefix and lies beyond ``postmile``; the exit
    seen immediately before it (whatever its prefix) is returned. Returns 0
    when the postmile precedes the county's first exit or when no exit with
    the same prefix lies beyond it.
    Raises KeyError for an unknown highway and ValueError for a postmile whose
    numeric part cannot be parsed.
    """
    # Use the requested highway (the old code read the global `test_highway`).
    exits = highways[highway].exits

    target_prefix, target_miles = _split_postmile(postmile)

    previous = 0
    for code in sorted(exits, key=_exit_sort_key):
        entry = exits[code]
        if entry[0] != county:
            continue

        prefix, miles = _split_postmile(entry[1])
        if prefix == target_prefix and miles > target_miles:
            # Returning here keeps a leading 0 ("no earlier exit") from being
            # overwritten by a later exit that is already past the postmile.
            return previous

        previous = code

    return 0

In [None]:
# Sample trip used by the lookup checks: the highway code plus the entrance
# and exit road names, which get_exit matches against exit location text.
test_highway, test_enter, test_exit = 'I-280', 'De Anza Boulevard', 'Eastmoor Avenue'

# Tip: to inspect the raw parsed table, loop over
# sorted(highways[test_highway].exits) and print each key with its entry.

In [324]:
# County / postmile to look up (alternative sample: 'SCL' with 'R2.71').
test_county, test_postmile = 'SM', 'R11.0'

# Show the exits matched for the entrance and the exit road, then a blank line.
print(get_exit(test_highway, test_enter))
print(get_exit(test_highway, test_exit))
print('')

# Left as the cell's last expression so the notebook displays the result.
get_last_exit(test_highway, test_county, test_postmile)

  11 - SCL -   9.43
  48 -  SM - R25.78



u'33'