#### Import Libraries for Scraping

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import stop_words

### Road Suffix List

In [3]:
# Load the road-suffix reference table; later cells read its
# "Primary Street Name" and "Abbreviations" columns.
road_suffix_df = pd.read_csv("../Data/Road_Suffix_List.csv")

In [4]:
# Distinct lower-cased primary suffix names; set() removes repeats, so the
# resulting order is arbitrary.
suffix_list = list(set(road_suffix_df["Primary Street Name"].str.lower().tolist()))

In [5]:
# Map each primary street-suffix name to the lower-cased list of every
# abbreviation recorded for it. Grouping once avoids re-filtering the whole
# frame for every row; sort=False keeps names in first-appearance order.
suffix_dict = {
    name: abbreviations.str.lower().to_list()
    for name, abbreviations in road_suffix_df.groupby(
        "Primary Street Name", sort=False
    )["Abbreviations"]
}

In [6]:
# Register "ROUTE" with its two spellings (this replaces any entry already
# stored under that key).
suffix_dict["ROUTE"] = ["route", "rte"]

In [43]:
def check_road_suffix(text, suffix_dict, whole_word=False):
    """Return the road suffixes whose abbreviations occur in ``text``.

    Parameters
    ----------
    text : str
        Text to scan. Matching is case-sensitive.
    suffix_dict : dict
        Maps a suffix name to an iterable of its abbreviations.
    whole_word : bool, optional
        When False (the default) an abbreviation matches anywhere inside
        ``text``, so "rd" is found inside "asdasrd". When True the
        abbreviation must not be directly adjacent to another word
        character.

    Returns
    -------
    list
        Matching suffix names in ``suffix_dict`` order, each at most once.
    """
    import re  # local import: the file's import cell does not provide ``re``

    text_suffix = []
    for suffix, abbrevs in suffix_dict.items():
        for abbrev in abbrevs:
            # Blank or non-string entries (e.g. NaN from the CSV) would match
            # everything or raise TypeError, so they are ignored.
            if not isinstance(abbrev, str) or not abbrev:
                continue
            if whole_word:
                pattern = r"(?<!\w)" + re.escape(abbrev) + r"(?!\w)"
                found = re.search(pattern, text) is not None
            else:
                found = abbrev in text
            if found:
                # One hit is enough; stop so the suffix is reported once.
                text_suffix.append(suffix)
                break
    return text_suffix

In [46]:
# Ad-hoc check: the nonsense word "asdasrd" still yields ROAD in the recorded
# output below, presumably because it contains the abbreviation "rd".
check_road_suffix("asdasrd", suffix_dict)

['ROAD']

### Scrape Main Interstate Highways

In [8]:
# Load the interstate table; later cells use its "Interstate Highway" and
# "State" columns.
interstate_df = pd.read_csv("../Data/Interstate_List.csv")

In [9]:
# Distinct interstate names; the order is arbitrary because of set().
interstate_list = list(set(interstate_df["Interstate Highway"].tolist()))

In [10]:
# For every interstate record the states listed for it in interstate_df and
# reserve an empty "names" list for its spelling variants.
interstate_dict = {
    highway: {
        "states": interstate_df.loc[
            interstate_df["Interstate Highway"] == highway, "State"
        ].to_list(),
        "names": [],
    }
    for highway in interstate_list
}

In [11]:
# Fill in the "names" list of every interstate with its spelling variants:
# the lower-cased name, the name with "-" turned into a space, the name with
# "-" removed, and each of those three with "i" expanded to "interstate".
for roads in interstate_dict:
    lower = roads.lower()
    base_list = [lower, lower.replace("-", " "), lower.replace("-", "")]
    # count=1 expands only the first "i"; replacing every "i" would mangle
    # any name that contains the letter a second time.
    interstate_dict[roads]["names"] = base_list + [
        bases.replace("i", "interstate", 1) for bases in base_list
    ]

In [12]:
def get_state_interstates(state, i_dict):
    """Select the roads of ``i_dict`` that run through ``state``.

    ``i_dict`` maps a road name to a dict with a "states" collection and a
    "names" list. The result maps each road whose "states" contains
    ``state`` to that road's "names" list.
    """
    return {
        road: info["names"]
        for road, info in i_dict.items()
        if state in info["states"]
    }

In [13]:
# Main interstates (with their spelling variants) for each state of interest.
virginia_interstates = get_state_interstates(state="Virginia", i_dict=interstate_dict)
texas_interstates = get_state_interstates(state="Texas", i_dict=interstate_dict)
ncarolina_interstates = get_state_interstates(state="North Carolina", i_dict=interstate_dict)
scarolina_interstates = get_state_interstates(state="South Carolina", i_dict=interstate_dict)
florida_interstates = get_state_interstates(state="Florida", i_dict=interstate_dict)

## Auxiliary Interstate Roads

In [14]:
# Load the auxiliary-interstate table; later cells use its
# "Interstate Route" and "State" columns.
aux_df = pd.read_csv("../Data/Auxiliary_List.csv")

In [15]:
# Distinct auxiliary route names; the order is arbitrary because of set().
aux_list = list(set(aux_df["Interstate Route"].tolist()))

In [16]:
# Will map each auxiliary route to {"states": [...], "names": [...]};
# the following cells populate it.
aux_dict = {}

In [17]:
# Give every auxiliary route an entry with empty "states" and "names" lists.
aux_dict.update(
    {route: {"states": [], "names": []} for route in aux_list}
)

In [18]:
# Record, for each auxiliary route, the states listed for it in aux_df.
for highways in aux_dict:
    route_rows = aux_df["Interstate Route"] == highways
    aux_dict[highways]["states"] = aux_df.loc[route_rows, "State"].to_list()

In [19]:
# Fill in the "names" list of every auxiliary route with its spelling
# variants: the lower-cased name, the name with "-" turned into a space, the
# name with "-" removed, and each of those with "i" expanded to "interstate".
for roads in aux_dict:
    lower = roads.lower()
    base_list = [lower, lower.replace("-", " "), lower.replace("-", "")]
    # count=1 expands only the first "i"; replacing every "i" would mangle
    # any name that contains the letter a second time.
    aux_dict[roads]["names"] = base_list + [
        bases.replace("i", "interstate", 1) for bases in base_list
    ]

In [20]:
# Auxiliary interstates (with their spelling variants) for each state of interest.
virginia_aux = get_state_interstates(state="Virginia", i_dict=aux_dict)
ncarolina_aux = get_state_interstates(state="North Carolina", i_dict=aux_dict)
scarolina_aux = get_state_interstates(state="South Carolina", i_dict=aux_dict)
florida_aux = get_state_interstates(state="Florida", i_dict=aux_dict)
texas_aux = get_state_interstates(state="Texas", i_dict=aux_dict)

## US Routes

In [21]:
# Load the US-route table; later cells read its "US Routes" column.
us_routes_df = pd.read_csv("../Data/US_Routes.csv")

In [22]:
# Start every US route with an empty list of alternative spellings.
us_routes_dict = {route_name: [] for route_name in us_routes_df["US Routes"]}

In [23]:
# Build the spelling variants of every US route: the lower-cased name, the
# name with spaces turned into "-", the name with spaces removed, and each of
# those with the "us" prefix rewritten in the forms listed below.
for roads in us_routes_df.index:
    name = us_routes_df.loc[roads, "US Routes"]
    lower = name.lower()
    base_list = [lower, lower.replace(" ", "-"), lower.replace(" ", "")]
    lst = list(base_list)
    for prefix in (
        "u.s.",
        "us route",
        "u.s. route",
        "us rte",
        "u.s. rte",
        "us-route",
        "u.s.-route",
        "us-rte",
        "u.s.-rte",
    ):
        # count=1 rewrites only the first "us"; a later "us" inside another
        # word (e.g. "business") must stay untouched.
        lst.extend(bases.replace("us", prefix, 1) for bases in base_list)
    us_routes_dict[name] = lst

## State Routes

In [24]:
# Load the state-road table; later cells use its "State", "Road Type" and
# "Name" columns.
state_roads_df = pd.read_csv("../Data/State_Road_List.csv")

In [25]:
# One frame per state, keeping only the road type and the road name.
texas_roads_df = state_roads_df.loc[state_roads_df["State"] == "Texas", ["Road Type", "Name"]]
ncarolina_roads_df = state_roads_df.loc[state_roads_df["State"] == "North Carolina", ["Road Type", "Name"]]
scarolina_roads_df = state_roads_df.loc[state_roads_df["State"] == "South Carolina", ["Road Type", "Name"]]
florida_roads_df = state_roads_df.loc[state_roads_df["State"] == "Florida", ["Road Type", "Name"]]
virginia_roads_df = state_roads_df.loc[state_roads_df["State"] == "Virginia", ["Road Type", "Name"]]

### Create a dictionary for each state, keyed by state road name, whose values are empty lists to be populated with alternative spellings

In [26]:
# Start every Texas road name with an empty list of alternative spellings.
texas_roads_dict = {road_name: [] for road_name in texas_roads_df["Name"]}

In [27]:
# Start every North Carolina road name with an empty list of alternative spellings.
ncarolina_roads_dict = {road_name: [] for road_name in ncarolina_roads_df["Name"]}

In [28]:
# Start every South Carolina road name with an empty list of alternative spellings.
scarolina_roads_dict = {road_name: [] for road_name in scarolina_roads_df["Name"]}

In [29]:
# Start every Florida road name with an empty list of alternative spellings.
florida_roads_dict = {road_name: [] for road_name in florida_roads_df["Name"]}

In [30]:
# Start every Virginia road name with an empty list of alternative spellings.
virginia_roads_dict = {road_name: [] for road_name in virginia_roads_df["Name"]}

### Create list of alternative highway/road spellings for each state highway

In [31]:
# Build the spelling variants of every Texas road name.
# Each (old, new) pair below is applied to all three base spellings.
replacements = (
    ("loop", "sl"),
    ("fm", "farm to market"),
    ("fm", "farm-to-market"),
    ("fm", "farm to market road"),
    ("fm", "farm-to-market road"),
    ("fm", "farm to market roads"),
    ("fm", "farm-to-market roads"),
    ("pr", "park road"),
    ("recreational road", "re"),
    ("recreational road", "recreational roads"),
    ("ranch road", "rr"),
)
for roads in texas_roads_df.index:
    name = texas_roads_df.loc[roads, "Name"]
    road_type = texas_roads_df.loc[roads, "Road Type"]
    lower = name.lower()
    # Base spellings: as written, spaces as "-", spaces removed.
    base_list = [lower, lower.replace(" ", "-"), lower.replace(" ", "")]
    lst = list(base_list)
    if road_type == "State Highway":
        # Expand "sh" to "state <hwy>" / "st <hwy>" for every known highway
        # abbreviation, in singular and plural form.
        for abbrevs in suffix_dict["HIGHWAY"]:
            for plural in ("", "s"):
                for bases in base_list:
                    for prefix in ("state ", "st "):
                        lst.append(
                            bases.replace("sh", prefix + abbrevs.lower() + plural)
                        )
    for keys, values in replacements:
        lst.extend(bases.replace(keys, values) for bases in base_list)
    # A replacement whose key is absent returns the base unchanged, so drop
    # the resulting repeats while keeping first-seen order.
    texas_roads_dict[name] = list(dict.fromkeys(lst))

In [32]:
# Build the spelling variants of every North Carolina road name: the three
# base spellings plus each of them with "nc" swapped for every known highway
# abbreviation.
for roads in ncarolina_roads_df.index:
    name = ncarolina_roads_df.loc[roads, "Name"]
    lower = name.lower()
    # Base spellings: as written, spaces as "-", spaces removed.
    base_list = [lower, lower.replace(" ", "-"), lower.replace(" ", "")]
    lst = list(base_list)
    for abbrevs in suffix_dict["HIGHWAY"]:
        lst.extend(bases.replace("nc", abbrevs) for bases in base_list)
    # Names without "nc" come back unchanged from replace(); drop those
    # repeats while keeping first-seen order.
    ncarolina_roads_dict[name] = list(dict.fromkeys(lst))

In [33]:
# Build the spelling variants of every South Carolina road name: the three
# base spellings plus each of them with "sc" swapped for every known highway
# abbreviation.
for roads in scarolina_roads_df.index:
    name = scarolina_roads_df.loc[roads, "Name"]
    lower = name.lower()
    # Base spellings: as written, spaces as "-", spaces removed.
    base_list = [lower, lower.replace(" ", "-"), lower.replace(" ", "")]
    lst = list(base_list)
    for abbrevs in suffix_dict["HIGHWAY"]:
        lst.extend(bases.replace("sc", abbrevs) for bases in base_list)
    # Names without "sc" come back unchanged from replace(); drop those
    # repeats while keeping first-seen order.
    scarolina_roads_dict[name] = list(dict.fromkeys(lst))

In [34]:
# Build the spelling variants of every Virginia road name: the three base
# spellings plus each of them with "sr" expanded to the forms listed below.
for roads in virginia_roads_df.index:
    name = virginia_roads_df.loc[roads, "Name"]
    lower = name.lower()
    # Base spellings: as written, spaces as "-", spaces removed.
    base_list = [lower, lower.replace(" ", "-"), lower.replace(" ", "")]
    lst = list(base_list)
    for bases in base_list:
        # "st rte" previously carried a trailing space, which produced
        # "st rte  5" (double space) and "st rte -5".
        for expansion in ("state route", "st route", "state rte", "st rte"):
            lst.append(bases.replace("sr", expansion))
    # Names without "sr" come back unchanged from replace(); drop those
    # repeats while keeping first-seen order.
    virginia_roads_dict[name] = list(dict.fromkeys(lst))

In [35]:
# Build the spelling variants of every Florida road name.
# Each (old, new) pair below is applied to all three base spellings.
replacements = (
    ("sr", "state road"),
    ("sr", "state rd"),
    ("sr", "st road"),
    ("sr", "st rd"),
    ("causeway", "causwa"),
    ("causeway", "cswy"),
    ("bridge", "brdge"),
    ("bridge", "brg"),
    ("road", "rd"),
    # The key was misspelled "expresway", so it could never match a name
    # containing "expressway" and these variants were never produced.
    ("expressway", "exp"),
    ("expressway", "expr"),
    ("expressway", "express"),
    ("expressway", "expw"),
    ("expressway", "expy"),
)
for roads in florida_roads_df.index:
    name = florida_roads_df.loc[roads, "Name"]
    lower = name.lower()
    # Base spellings: as written, spaces as "-", spaces removed.
    base_list = [lower, lower.replace(" ", "-"), lower.replace(" ", "")]
    lst = list(base_list)
    for keys, values in replacements:
        lst.extend(bases.replace(keys, values) for bases in base_list)
    # A replacement whose key is absent returns the base unchanged, so drop
    # the resulting repeats while keeping first-seen order.
    florida_roads_dict[name] = list(dict.fromkeys(lst))

## Create dictionary containing a list of roads relevant to each state

In [36]:
# For each state: [main interstates, auxiliary interstates, US routes, state roads].
# The US-route dictionary is the same object for every state.
master_road_dict = {
    state_name: [interstates, auxiliaries, us_routes_dict, state_roads]
    for state_name, interstates, auxiliaries, state_roads in (
        ("Florida", florida_interstates, florida_aux, florida_roads_dict),
        ("Virginia", virginia_interstates, virginia_aux, virginia_roads_dict),
        ("North Carolina", ncarolina_interstates, ncarolina_aux, ncarolina_roads_dict),
        ("South Carolina", scarolina_interstates, scarolina_aux, scarolina_roads_dict),
        ("Texas", texas_interstates, texas_aux, texas_roads_dict),
    )
}

### Define function to check for presence of road in string

In [37]:
def _occurs_standalone(text, abbrev):
    """Return True if ``abbrev`` occurs in ``text`` as its own token.

    An occurrence is rejected when the abbreviation starts with a letter or
    digit and is immediately preceded by one, or ends with a digit and is
    immediately followed by another digit.
    """
    start = text.find(abbrev)
    while start != -1:
        end = start + len(abbrev)
        glued_before = (
            abbrev[0].isalnum() and start > 0 and text[start - 1].isalnum()
        )
        glued_after = (
            abbrev[-1].isdigit() and end < len(text) and text[end].isdigit()
        )
        if not (glued_before or glued_after):
            return True
        start = text.find(abbrev, start + 1)
    return False


def check_roads(state, road_dict, text):
    """Return the roads of ``state`` that are mentioned in ``text``.

    Parameters
    ----------
    state : str
        Key into ``road_dict``; an unknown state raises ``KeyError``.
    road_dict : dict
        Maps a state to a list of dictionaries, each mapping a road name to
        its list of spellings.
    text : str
        Text to scan. Matching is case-sensitive.

    Returns
    -------
    list
        Each matching road name once, in the order first encountered.

    Notes
    -----
    A plain substring test reports "US 1" for text that only mentions
    "us 17" and "I-95" for "hi 95", so a spelling only counts when it is not
    glued to a neighbouring letter or digit (see ``_occurs_standalone``).
    """
    relevant_roads = {}
    for dicts in road_dict[state]:
        for road, abbrevs in dicts.items():
            for abbrev in abbrevs:
                # Blank or non-string spellings cannot identify a road.
                if not isinstance(abbrev, str) or not abbrev:
                    continue
                if _occurs_standalone(text, abbrev):
                    relevant_roads[road] = None
                    break
    return list(relevant_roads)

### Cities, Towns, and Counties Matching

In [47]:
# Load the city table (a DataFrame despite the name); later cells use its
# "City" and "State" columns.
cities_list = pd.read_csv("../Data/Cities_List.csv")

In [48]:
# Load the county table (a DataFrame despite the name). The file is decoded
# as ISO-8859-1 instead of pandas' default encoding.
counties_list = pd.read_csv("../Data/list-counties-us.csv",  encoding='ISO-8859-1')

In [49]:
# Lower-case the city names in place.
cities_list["City"] = cities_list["City"].str.lower()

In [50]:
# Lower-case the county names in place, then keep only the rows that belong
# to the five states of interest.
counties_list["County or equivalent"] = counties_list["County or equivalent"].str.lower()
relevant_counties = counties_list[
    counties_list["State or district"].isin(
        ["Virginia", "North Carolina", "South Carolina", "Florida", "Texas"]
    )
]

In [51]:
# States present in relevant_counties (arbitrary order because of set()),
# each given empty "counties" and "cities" lookup tables.
state_list = list(set(relevant_counties["State or district"].to_list()))
state_areas_dict = {
    state_name: {"counties": {}, "cities": {}} for state_name in state_list
}

In [52]:
# Populate the county and city lookup tables of every state. Each county maps
# to its spelling variants; each city maps to a list holding just its name.
for state in state_list:
    # dropna(): a missing name is a float NaN, which has no .replace() and
    # cannot be searched for in text.
    counties = relevant_counties[relevant_counties["State or district"] == state]["County or equivalent"].dropna()
    cities = cities_list[cities_list["State"] == state]["City"].dropna()
    for county in counties:
        variants = [
            county,
            county.replace("county", "cty"),
            county.replace("county", "cnty"),
            county.replace(" county", "county"),
            county.replace(" county", "cty"),
            county.replace(" county", "cnty"),
            county.replace(" county", ""),
        ]
        # Names without "county" yield seven identical strings; keep each
        # distinct variant once, in first-seen order.
        state_areas_dict[state]["counties"][county] = list(dict.fromkeys(variants))
    for city in cities:
        state_areas_dict[state]["cities"][city] = [city]

In [53]:
def check_area(state, state_areas_dict, text, area_type):
    """Return the areas of ``state`` whose spellings occur in ``text``.

    Parameters
    ----------
    state : str
        Key into ``state_areas_dict``.
    state_areas_dict : dict
        Maps state -> area type -> area name -> list of spellings.
    text : str
        Text to scan. Matching is a case-sensitive substring test.
    area_type : str
        Key selecting which table of the state to scan (the tables built in
        this file are "counties" and "cities").

    Returns
    -------
    list
        Matching area names in table order, each at most once.
    """
    text_areas = []
    for area, abbrevs in state_areas_dict[state][area_type].items():
        for abbrev in abbrevs:
            # Blank or non-string spellings (e.g. NaN) would match everything
            # or raise TypeError, so they are ignored.
            if not isinstance(abbrev, str) or not abbrev:
                continue
            if abbrev in text:
                # One hit is enough; stop so the area is reported once.
                text_areas.append(area)
                break
    return text_areas

In [54]:
def check_county_word(text):
    """Tell whether ``text`` contains "county", "cty" or "cnty" as a substring."""
    return any(marker in text for marker in ("county", "cty", "cnty"))