#### Import Libraries for Scraping

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### Define function to be used across sections below

In [2]:
def string_from_list(values_list):
    """Join the items of a collection into one comma-separated string.

    Parameters
    ----------
    values_list : sized iterable
        List, pandas Series or similar. Each item is converted with ``str``.

    Returns
    -------
    str
        The items separated by "," with no trailing comma, or the literal
        string "None" when ``values_list`` is empty.
    """
    # len() instead of truthiness: a pandas Series has no unambiguous truth value.
    if len(values_list) == 0:
        return "None"
    # Iterating (rather than indexing by position) also works for inputs whose
    # labels are not 0..n-1, and join avoids repeated string concatenation.
    return ",".join(str(value) for value in values_list)

#### Import tweets gathered for each of 3 hurricanes

In [108]:
# Load the cleaned tweet tables for hurricanes Florence, Harvey and Michael
florence_df, harvey_df, michael_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/florence_clean.csv",
        "../Data/harvey_clean.csv",
        "../Data/michael_clean.csv",
    )
)

In [110]:
# Show any Harvey tweets whose text is missing
harvey_df.loc[harvey_df["text"].isna()]

Unnamed: 0,id,username,date,text,time,state,split_text,closed,open,cause


In [111]:
# Replace missing tweet text with an empty string. The result is assigned back
# to the column: fillna(inplace=True) on df["text"] is a chained operation that
# pandas warns about and that does not update the dataframe under Copy-on-Write.
harvey_df["text"] = harvey_df["text"].fillna("")

In [19]:
# Replace missing tweet text with an empty string. The result is assigned back
# to the column: fillna(inplace=True) on df["text"] is a chained operation that
# pandas warns about and that does not update the dataframe under Copy-on-Write.
florence_df["text"] = florence_df["text"].fillna("")

In [20]:
# Replace missing tweet text with an empty string. The result is assigned back
# to the column: fillna(inplace=True) on df["text"] is a chained operation that
# pandas warns about and that does not update the dataframe under Copy-on-Write.
michael_df["text"] = michael_df["text"].fillna("")

### Search for Non-Highway Roads

In [21]:
# Load the table of road suffixes and their abbreviations
road_suffix_df = pd.read_csv(filepath_or_buffer="../Data/Road_Suffix_List.csv")

In [22]:
# Lowercase the primary suffix names and keep one copy of each
lowercase_suffix_names = road_suffix_df["Primary Street Name"].str.lower().tolist()
suffix_list = list(set(lowercase_suffix_names))

In [23]:
# Create dictionary with keys as each suffix and values as the list of lowercase
# abbreviations for that suffix. Grouping once replaces re-filtering the whole
# table for every row; sort=False keeps the suffixes in order of first appearance.
lowercase_abbreviations = road_suffix_df["Abbreviations"].str.lower()
suffix_dict = {
    name: group.to_list()
    for name, group in lowercase_abbreviations.groupby(
        road_suffix_df["Primary Street Name"], sort=False
    )
}

In [24]:
# Register suffixes that the source list does not contain
suffix_dict.update({"ROUTE": ["route", "rte"]})

In [107]:
# Display the suffix -> abbreviations mapping to inspect it
suffix_dict

{'ALLEY': ['allee', 'alley', 'ally', 'aly'],
 'ANEX': ['anex', 'annex', 'annx', 'anx'],
 'ARCADE': ['arc', 'arcade'],
 'AVENUE': ['av', 'ave', 'aven', 'avenu', 'avenue', 'avn', 'avnue'],
 'BAYOU': ['bayoo', 'bayou'],
 'BEACH': ['bch', 'beach'],
 'BEND': ['bend', 'bnd'],
 'BLUFF': ['blf', 'bluf', 'bluff'],
 'BLUFFS': ['bluffs'],
 'BOTTOM': ['bot', 'btm', 'bottm', 'bottom'],
 'BOULEVARD': ['blvd', 'boul', 'boulevard', 'boulv'],
 'BRANCH': ['br', 'brnch', 'branch'],
 'BRIDGE': ['brdge', 'brg', 'bridge'],
 'BROOK': ['brk', 'brook'],
 'BROOKS': ['brooks'],
 'BURG': ['burg'],
 'BURGS': ['burgs'],
 'BYPASS': ['byp', 'bypa', 'bypas', 'bypass', 'byps'],
 'CAMP': ['camp', 'cp', 'cmp'],
 'CANYON': ['canyn', 'canyon', 'cnyn'],
 'CAPE': ['cape', 'cpe'],
 'CAUSEWAY': ['causeway', 'causwa', 'cswy'],
 'CENTER': ['cen',
  'cent',
  'center',
  'centr',
  'centre',
  'cnter',
  'cntr',
  'ctr'],
 'CENTERS': ['centers'],
 'CIRCLE': ['cir', 'circ', 'circl', 'circle', 'crcl', 'crcle'],
 'CIRCLES': ['circle

In [25]:
# Search an entire tweet for road suffixes and build candidate road names from
# the words around each suffix that is found.
def check_other_roads(text, suffix_dict):
    """Return candidate non-highway road names found in ``text``.

    Every whitespace-separated word is compared with every abbreviation in
    ``suffix_dict``:

    * If the word *is* an abbreviation ("main st"), candidates are built from
      the one and from the two preceding words.
    * If the word *ends with* an abbreviation ("mainst"), the leading part of
      the word is taken as the road name; candidates are built from it alone
      and together with the one preceding word.

    Parameters
    ----------
    text : str
        Tweet text.
    suffix_dict : dict
        Maps a suffix name (e.g. "STREET") to a list of abbreviations.

    Returns
    -------
    list of str
        Candidates in the order they were found (duplicates possible). Each
        one ends with the lowercased suffix name, e.g. "main street".
    """
    roads_list = []
    words_list = text.split()
    for i, word in enumerate(words_list):
        for suffix, abbrevs in suffix_dict.items():
            suffix_name = suffix.lower()
            for abbrev in abbrevs:
                if not abbrev:
                    # An empty abbreviation would "end" every word; ignore it.
                    continue
                if word == abbrev:
                    if i > 0:
                        roads_list.append(words_list[i - 1] + " " + suffix_name)
                    if i > 1:
                        roads_list.append(words_list[i - 2] + " " +
                                          words_list[i - 1] + " " +
                                          suffix_name)
                elif word.endswith(abbrev):
                    # Strip only the trailing abbreviation, so a word in which
                    # the abbreviation also occurs earlier ("stewartst") is
                    # still recognised and keeps the rest of its letters.
                    first_word = word[:-len(abbrev)]
                    roads_list.append(first_word + " " + suffix_name)
                    if i > 0:
                        roads_list.append(words_list[i - 1] + " " +
                                          first_word + " " +
                                          suffix_name)
    return roads_list

In [26]:
# Run check_other_roads on every tweet of a dataframe and collect the results
def add_other_road_features(df):
    """Return one comma-separated string of candidate roads per row of ``df``.

    Rows are visited in index order; each row's "text" value is searched with
    ``check_other_roads`` using the module-level ``suffix_dict``, and the
    candidates are joined with ``string_from_list``.
    """
    return [
        string_from_list(check_other_roads(df.loc[label, "text"], suffix_dict))
        for label in df.index
    ]

In [112]:
# One row per Harvey tweet holding its candidate road names
harvey_other_roads = pd.DataFrame(
    {"Other_Road_List": add_other_road_features(harvey_df)}
)

In [113]:
# Save the Harvey candidate road names
harvey_other_roads.to_csv(path_or_buf="../Data/harvey_other_roads.csv")

In [29]:
# One row per Florence tweet holding its candidate road names
florence_other_roads = pd.DataFrame(
    {"Other_Road_List": add_other_road_features(florence_df)}
)

In [30]:
# Save the Florence candidate road names
florence_other_roads.to_csv(path_or_buf="../Data/florence_other_roads.csv")

In [31]:
# One row per Michael tweet holding its candidate road names
michael_other_roads = pd.DataFrame(
    {"Other_Road_List": add_other_road_features(michael_df)}
)

In [32]:
# Save the Michael candidate road names
michael_other_roads.to_csv(path_or_buf="../Data/michael_other_roads.csv")

### Search for exit numbers

In [33]:
def exit_search(text):
    """Find highway exit references in a tweet.

    Recognised forms, checked per whitespace-separated word:

    * "exit" or "ex" followed by a word that starts with a digit ("exit 12")
    * a word starting with "exit" whose remainder starts with a digit ("exit12")
    * a word starting with "ex" whose remainder starts with a digit ("ex12")

    Parameters
    ----------
    text : str
        Tweet text.

    Returns
    -------
    str
        Comma-separated "exit <number>" entries in order of appearance, or
        "None" when nothing was found (see ``string_from_list``).
    """
    words = text.split()
    exit_list = []
    for i, word in enumerate(words):
        if word in ("exit", "ex") and i < len(words) - 1:
            candidate = words[i + 1]
        elif word.startswith("exit"):
            candidate = word.replace("exit", "")
        elif word.startswith("ex"):
            candidate = word.replace("ex", "")
        else:
            continue
        # Explicit digit test instead of float() inside a bare except; the
        # [:1] slice makes an empty remainder simply fail the test.
        if candidate[:1].isdecimal():
            exit_list.append("exit " + candidate)
    return string_from_list(exit_list)

In [34]:
def add_exit_lists(df):
    """Return a dataframe with one "exit_list" value per row of ``df``.

    The result shares ``df``'s index, and each value is ``exit_search``
    applied to that row's "text". Building the column in one step avoids
    growing the frame one row at a time and guarantees that the "exit_list"
    column exists even when ``df`` is empty.
    """
    return pd.DataFrame(
        {"exit_list": [exit_search(text) for text in df["text"]]},
        index=df.index,
    )

In [114]:
# Detect exit references in every Harvey tweet
harvey_exits = add_exit_lists(df=harvey_df)

In [115]:
# Show the Harvey rows in which an exit was detected.
# The former extra term `"exit" not in harvey_df["text"]` was always True
# (`in` on a Series tests index labels, not values), so it never filtered rows.
harvey_exits[harvey_exits["exit_list"] != "None"]

Unnamed: 0,exit_list
270,exit 845
308,exit 845
365,exit 845
868,exit 335b


In [116]:
# Print the text of the Harvey tweets in which an exit was detected.
# The former extra term `"exit" not in harvey_df["text"]` was always True
# (`in` on a Series tests index labels, not values), so it never filtered rows.
print(harvey_df.loc[harvey_exits["exit_list"] != "None", "text"])

270    closed due to high water in #chambers on i-10 ...
308    closed due to high water in #chambers on i-10 ...
365    closed due to high water in #chambers on i-10 ...
868    #waco ih 35 sb at exit 335b shutdown for major...
Name: text, dtype: object


In [117]:
# Save the Harvey exit references
harvey_exits.to_csv(path_or_buf="../Data/harvey_exits.csv")

In [39]:
# Detect exit references in every Michael tweet
michael_exits = add_exit_lists(df=michael_df)

In [40]:
# Show the Michael rows in which an exit was detected.
# The former extra term `"exit" not in michael_df["text"]` was always True
# (`in` on a Series tests index labels, not values), so it never filtered rows.
michael_exits[michael_exits["exit_list"] != "None"]

Unnamed: 0,exit_list
5,exit 10a
6,exit 10a
10,exit 12
11,exit 152
12,exit 12
14,exit 45
15,exit 152
17,exit 45
20,exit 10a
21,exit 13


In [41]:
# Save the Michael exit references
michael_exits.to_csv(path_or_buf="../Data/michael_exits.csv")

In [42]:
# Detect exit references in every Florence tweet
florence_exits = add_exit_lists(df=florence_df)

In [43]:
# Save the Florence exit references
florence_exits.to_csv(path_or_buf="../Data/florence_exits.csv")

In [44]:
# Show the Florence rows in which an exit was detected
florence_exits.loc[florence_exits["exit_list"] != "None"]

Unnamed: 0,exit_list
383,"exit 276,exit 285"
385,exit 81
386,exit 328
387,exit 98
402,exit 81
403,exit 328
416,exit 81
417,exit 328
422,exit 81
423,exit 328


### Search for mile markers

In [45]:
def mile_marker_search(text):
    """Find mile-marker references in a tweet.

    Recognised forms, checked per whitespace-separated word:

    * "mile marker" followed by a word that starts with a digit
    * "mm" followed by a word that starts with a digit ("mm 16")
    * a word starting with "mm" whose remainder starts with a digit ("mm16")

    Parameters
    ----------
    text : str
        Tweet text.

    Returns
    -------
    str
        Comma-separated "mile marker <number>" entries in order of
        appearance, or "None" when nothing was found (see
        ``string_from_list``).
    """
    words = text.split()
    marker_list = []
    for i, word in enumerate(words):
        if word == "mile" and words[i + 1:i + 2] == ["marker"]:
            # split() never yields a token containing a space, so the phrase
            # "mile marker" has to be matched as two consecutive words.
            number = words[i + 2] if i + 2 < len(words) else ""
        elif word == "mm" and i < len(words) - 1:
            number = words[i + 1]
        elif word.startswith("mm"):
            number = word.replace("mm", "")
        else:
            continue
        # Explicit digit test instead of float() inside a bare except; the
        # [:1] slice makes an empty remainder simply fail the test.
        if number[:1].isdecimal():
            marker_list.append("mile marker " + number)
    return string_from_list(marker_list)

In [46]:
def add_mm_lists(df):
    """Return a dataframe with one "mm_list" value per row of ``df``.

    The result shares ``df``'s index, and each value is ``mile_marker_search``
    applied to that row's "text". Building the column in one step avoids
    growing the frame one row at a time and guarantees that the "mm_list"
    column exists even when ``df`` is empty.
    """
    return pd.DataFrame(
        {"mm_list": [mile_marker_search(text) for text in df["text"]]},
        index=df.index,
    )

In [118]:
# Detect mile-marker references in every Harvey tweet
harvey_mm = add_mm_lists(df=harvey_df)

In [119]:
# Show the Harvey rows in which a mile marker was detected
harvey_mm.loc[harvey_mm["mm_list"] != "None"]

Unnamed: 0,mm_list
867,mile marker 16


In [120]:
# Save the Harvey mile-marker references
harvey_mm.to_csv(path_or_buf="../Data/harvey_mm.csv")

In [50]:
# Detect mile-marker references in every Michael tweet
michael_mm = add_mm_lists(df=michael_df)

In [51]:
# Show the Michael rows in which a mile marker was detected
michael_mm.loc[michael_mm["mm_list"] != "None"]

Unnamed: 0,mm_list
1,mile marker 88
2,mile marker 88
3,mile marker 88
7,mile marker 121
8,mile marker 49
9,mile marker 49
13,mile marker 49
16,mile marker 49
18,mile marker 153
19,mile marker 153


In [52]:
# Save the Michael mile-marker references
michael_mm.to_csv(path_or_buf="../Data/michael_mm.csv")

In [53]:
# Detect mile-marker references in every Florence tweet
florence_mm = add_mm_lists(df=florence_df)

In [54]:
# Show the Florence rows in which a mile marker was detected
florence_mm.loc[florence_mm["mm_list"] != "None"]

Unnamed: 0,mm_list
77,mile marker 286
78,mile marker 294
79,mile marker 286
81,mile marker 100
82,mile marker 105
83,mile marker 105
84,mile marker 98
85,mile marker 98
86,mile marker 436
87,mile marker 436


In [55]:
# Save the Florence mile-marker references
florence_mm.to_csv(path_or_buf="../Data/florence_mm.csv")

### Scrape Main Interstate Highways

In [56]:
# Load the interstate / state pairs
interstate_df = pd.read_csv(filepath_or_buffer="../Data/Interstate_List.csv")

In [57]:
# Distinct interstate names
unique_interstates = set(interstate_df["Interstate Highway"].tolist())
interstate_list = list(unique_interstates)

In [58]:
# For every interstate: the states listed for it and an (initially empty)
# list of alternative spellings
interstate_dict = {
    interstate: {
        "states": interstate_df.loc[
            interstate_df["Interstate Highway"] == interstate, "State"
        ].to_list(),
        "names": [],
    }
    for interstate in interstate_list
}

In [59]:
# Fill in the alternative spellings of every interstate: the lowercase name
# with "-", with a space and with nothing between letter and number, followed
# by the same three forms with "i" and then with "ih" replaced by "interstate"
for roads in interstate_dict:
    lower = roads.lower()
    base_list = [lower, lower.replace("-", " "), lower.replace("-", "")]
    lst = base_list + [
        bases.replace(prefix, "interstate")
        for prefix in ("i", "ih")
        for bases in base_list
    ]
    interstate_dict[roads]["names"] = lst

In [60]:
def get_state_interstates(state, i_dict):
    """Return ``{road: names}`` for the roads of ``i_dict`` that list ``state``.

    ``i_dict`` maps a road to a dict with a "states" list and a "names" list;
    roads whose "states" do not contain ``state`` are left out.
    """
    return {
        road: details["names"]
        for road, details in i_dict.items()
        if state in details["states"]
    }

In [61]:
# Main interstates of each state of interest
(
    virginia_interstates,
    texas_interstates,
    ncarolina_interstates,
    scarolina_interstates,
    florida_interstates,
) = (
    get_state_interstates(state_name, interstate_dict)
    for state_name in (
        "Virginia",
        "Texas",
        "North Carolina",
        "South Carolina",
        "Florida",
    )
)

## Auxiliary Interstate Roads

In [62]:
# Load the auxiliary interstate route / state pairs
aux_df = pd.read_csv(filepath_or_buffer="../Data/Auxiliary_List.csv")

In [63]:
# Distinct auxiliary route names
unique_aux_routes = set(aux_df["Interstate Route"].tolist())
aux_list = list(unique_aux_routes)

In [64]:
# Container for the auxiliary routes (filled in below)
aux_dict = dict()

In [65]:
# Give every auxiliary route empty "states" and "names" lists
aux_dict.update({route: {"states": [], "names": []} for route in aux_list})

In [66]:
# Record the states listed for each auxiliary route
for highways in aux_dict:
    on_this_route = aux_df["Interstate Route"] == highways
    aux_dict[highways]["states"] = aux_df.loc[on_this_route, "State"].to_list()

In [67]:
# Fill in the alternative spellings of every auxiliary route: the lowercase
# name with "-", with a space and with nothing between letter and number,
# followed by the same three forms with "i" and then with "ih" replaced by
# "interstate"
for roads in aux_dict:
    lower = roads.lower()
    base_list = [lower, lower.replace("-", " "), lower.replace("-", "")]
    lst = base_list + [
        bases.replace(prefix, "interstate")
        for prefix in ("i", "ih")
        for bases in base_list
    ]
    aux_dict[roads]["names"] = lst

In [68]:
# Auxiliary interstates of each state of interest
(
    virginia_aux,
    ncarolina_aux,
    scarolina_aux,
    florida_aux,
    texas_aux,
) = (
    get_state_interstates(state_name, aux_dict)
    for state_name in (
        "Virginia",
        "North Carolina",
        "South Carolina",
        "Florida",
        "Texas",
    )
)

## US Routes

In [69]:
# Load the list of US routes
us_routes_df = pd.read_csv(filepath_or_buffer="../Data/US_Routes.csv")

In [70]:
# One (initially empty) list of spellings per US route
us_routes_dict = {route_name: [] for route_name in us_routes_df["US Routes"]}

In [71]:
# Alternative spellings of every US route: the lowercase name with a space,
# a dash and nothing between "us" and the number, followed by the same three
# forms with "us" written out in each of the ways listed below
us_prefix_forms = ["u.s.",
                   "us route",
                   "u.s. route",
                   "us rte",
                   "u.s. rte",
                   "us-route",
                   "u.s.-route",
                   "us-rte",
                   "u.s.-rte"]
for name in us_routes_df["US Routes"]:
    lower = name.lower()
    base_list = [lower, lower.replace(" ", "-"), lower.replace(" ", "")]
    us_routes_dict[name] = base_list + [
        bases.replace("us", prefix_form)
        for prefix_form in us_prefix_forms
        for bases in base_list
    ]

## State Routes

In [72]:
# Load the state road list (all states)
state_roads_df = pd.read_csv(filepath_or_buffer="../Data/State_Road_List.csv")

In [73]:
# Road type and name of the state roads, one table per state of interest
road_columns = ["Road Type", "Name"]
texas_roads_df = state_roads_df.loc[state_roads_df["State"] == "Texas", road_columns]
ncarolina_roads_df = state_roads_df.loc[state_roads_df["State"] == "North Carolina", road_columns]
scarolina_roads_df = state_roads_df.loc[state_roads_df["State"] == "South Carolina", road_columns]
florida_roads_df = state_roads_df.loc[state_roads_df["State"] == "Florida", road_columns]
virginia_roads_df = state_roads_df.loc[state_roads_df["State"] == "Virginia", road_columns]

### Create dictionaries for each state that contain state roads as keys and list "names" to be populated with alternative spellings

In [74]:
# One (initially empty) list of spellings per Texas road
texas_roads_dict = {road_name: [] for road_name in texas_roads_df["Name"]}

In [75]:
# One (initially empty) list of spellings per North Carolina road
ncarolina_roads_dict = {road_name: [] for road_name in ncarolina_roads_df["Name"]}

In [76]:
# One (initially empty) list of spellings per South Carolina road
scarolina_roads_dict = {road_name: [] for road_name in scarolina_roads_df["Name"]}

In [77]:
# One (initially empty) list of spellings per Florida road
florida_roads_dict = {road_name: [] for road_name in florida_roads_df["Name"]}

In [78]:
# One (initially empty) list of spellings per Virginia road
virginia_roads_dict = {road_name: [] for road_name in virginia_roads_df["Name"]}

### Create list of alternative highway/road spellings for each state highway

In [79]:
# Alternative spellings of every Texas road. Each list starts with the
# lowercase name written with a space, a dash and nothing between its parts.
# State highways additionally get "sh" spelled out as "state <highway>" and
# "st <highway>" for every highway abbreviation (singular, then plural).
# Finally the generic Texas replacements are applied to the three base forms.
texas_replacements = [("loop", "sl"),
                      ("loop", "lp"),
                      ("fm", "farm to market"),
                      ("fm", "farm-to-market"),
                      ("fm", "farm to market road"),
                      ("fm", "farm-to-market road"),
                      ("fm", "farm to market roads"),
                      ("fm", "farm-to-market roads"),
                      ("pr", "park road"),
                      ("recreational road", "re"),
                      ("recreational road", "recreational roads"),
                      ("ranch road", "rr")]
for roads in texas_roads_df.index:
    name = texas_roads_df.loc[roads, "Name"]
    road_type = texas_roads_df.loc[roads, "Road Type"]
    lower = name.lower()
    base_list = [lower, lower.replace(" ", "-"), lower.replace(" ", "")]
    lst = list(base_list)
    if road_type == "State Highway":
        for abbrevs in suffix_dict["HIGHWAY"]:
            highway_word = abbrevs.lower()
            for plural_ending in ("", "s"):
                for bases in base_list:
                    for state_word in ("state ", "st "):
                        lst.append(bases.replace("sh", state_word + highway_word + plural_ending))
    for old_text, new_text in texas_replacements:
        for bases in base_list:
            lst.append(bases.replace(old_text, new_text))
    texas_roads_dict[name] = lst

In [80]:
# Alternative spellings of every North Carolina road: the lowercase name with
# a space, a dash and nothing between its parts, followed by the same three
# forms with "nc" replaced by each highway abbreviation
for roads in ncarolina_roads_df.index:
    name = ncarolina_roads_df.loc[roads, "Name"]
    lower = name.lower()
    base_list = [lower, lower.replace(" ", "-"), lower.replace(" ", "")]
    ncarolina_roads_dict[name] = base_list + [
        bases.replace("nc", abbrevs)
        for abbrevs in suffix_dict["HIGHWAY"]
        for bases in base_list
    ]

In [81]:
# Alternative spellings of every South Carolina road: the lowercase name with
# a space, a dash and nothing between its parts, followed by the same three
# forms with "sc" replaced by each highway abbreviation
for roads in scarolina_roads_df.index:
    name = scarolina_roads_df.loc[roads, "Name"]
    lower = name.lower()
    base_list = [lower, lower.replace(" ", "-"), lower.replace(" ", "")]
    scarolina_roads_dict[name] = base_list + [
        bases.replace("sc", abbrevs)
        for abbrevs in suffix_dict["HIGHWAY"]
        for bases in base_list
    ]

In [82]:
# Alternative spellings of every Virginia road: the lowercase name with a
# space, a dash and nothing between its parts, followed by those three forms
# with "sr" spelled out in four ways.
# The last form used to be "st rte " with a trailing space, which produced
# double-space spellings such as "st rte  10"; it now matches the other three.
for roads in virginia_roads_df.index:
    name = virginia_roads_df.loc[roads, "Name"]
    road_type = virginia_roads_df.loc[roads, "Road Type"]
    lower = name.lower()
    dash = lower.replace(" ", "-")
    remove = lower.replace(" ", "")
    base_list = [lower, dash, remove]
    lst = list(base_list)
    for bases in base_list:
        for spelled_out in ("state route", "st route", "state rte", "st rte"):
            lst.append(bases.replace("sr", spelled_out))
    virginia_roads_dict[name] = lst

In [83]:
# Alternative spellings of every Florida road: the lowercase name with a
# space, a dash and nothing between its parts, followed by those three forms
# with each replacement below applied.
# The expressway replacements used to be keyed on the misspelling "expresway",
# so they never changed any name; the key is now spelled "expressway".
for roads in florida_roads_df.index:
    name = florida_roads_df.loc[roads, "Name"]
    road_type = florida_roads_df.loc[roads, "Road Type"]
    lst = []
    lower = name.lower()
    dash = lower.replace(" ", "-")
    remove = lower.replace(" ", "")
    lst.append(lower)
    lst.append(dash)
    lst.append(remove)
    base_list = [lower, dash, remove]
    replacements = [{"sr" : "state road"},
                    {"sr" : "state rd"},
                    {"sr" : "st road"},
                    {"sr" : "st rd"},
                    {"causeway" : "causwa"},
                    {"causeway" : "cswy"},
                    {"bridge" : "brdge"},
                    {"bridge" : "brg"},
                    {"road" : "rd"},
                    {"expressway" : "exp"},
                    {"expressway" : "expr"},
                    {"expressway" : "express"},
                    {"expressway" : "expw"},
                    {"expressway" : "expy"}]
    for dicts in replacements:
        # Each entry is a one-item dict: text to replace -> replacement
        keys = list(dicts.keys())[0]
        values = list(dicts.values())[0]
        for bases in base_list:
            lst.append(bases.replace(keys, values))
    florida_roads_dict[name] = lst

## Create dictionary containing a list of roads relevant to each state

In [84]:
# For every state: its interstates, auxiliary interstates, the US routes and
# its state roads, in that order
master_road_dict = {
    state_name: [state_interstates, state_aux, us_routes_dict, state_roads]
    for state_name, state_interstates, state_aux, state_roads in (
        ("Florida", florida_interstates, florida_aux, florida_roads_dict),
        ("Virginia", virginia_interstates, virginia_aux, virginia_roads_dict),
        ("North Carolina", ncarolina_interstates, ncarolina_aux, ncarolina_roads_dict),
        ("South Carolina", scarolina_interstates, scarolina_aux, scarolina_roads_dict),
        ("Texas", texas_interstates, texas_aux, texas_roads_dict),
    )
}

### Define function to check for presence of road in string
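This function iterates through the text to find every road mentioned in the string. For each road found it tracks the three values below, which are used to order the results; it then returns a list holding the road names, their order, a found flag and the number of roads: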

    1) name of the road
    2) which word the road is contained within (when text is split into words by a space)
    3) index position of the road within the word

Roads that are substrings of other roads are removed. Each road is reported once, at the position of its first detected match, even if it appears in several separate parts of the text.

In [103]:
def _locate_road_spelling(words, spellings):
    """Return (word index, character index) of the first spelling found, else None.

    Spellings are tried in order and, for each, the words from left to right.
    A spelling containing spaces is compared against that many consecutive
    words joined by single spaces; the match has to start inside the first of
    those words, which is the word the match is attributed to.
    """
    for spelling in spellings:
        span = spelling.count(" ") + 1
        for i, word in enumerate(words):
            window = word if span == 1 else " ".join(words[i:i + span])
            char_idx = window.find(spelling)
            if char_idx != -1 and char_idx < len(word):
                return i, char_idx
    return None


def check_roads(state, text):
    """Find the roads of ``state`` that are mentioned in ``text``.

    The road dictionaries registered for ``state`` in ``master_road_dict`` are
    scanned in order. Each road is recorded at most once, at the first of its
    alternative spellings that occurs in the text. Spellings that contain
    spaces (e.g. "interstate 10") are matched across consecutive words; a
    single whitespace-split word can never contain them.

    When several roads were found at exactly the same place (same word, same
    character offset) only the one(s) with the longest road name are kept, so
    that "us-10" reports US 10 and not also US 1.

    Parameters
    ----------
    state : str
        Key of ``master_road_dict``.
    text : str
        Tweet text.

    Returns
    -------
    list
        ``[roads_string, roads_order, has_road, num_roads]``: the road names in
        order of appearance as a comma-separated string ("None" if empty),
        their ranks "1,2,..." ("None" if empty), 1/0 for whether any road was
        found, and the number of roads.
    """
    words = text.split()

    # (word index, character index, road name) for every road found
    found = []
    seen_roads = set()
    for road_dict in master_road_dict[state]:
        for road, spellings in road_dict.items():
            if road in seen_roads:
                continue
            location = _locate_road_spelling(words, spellings)
            if location is not None:
                seen_roads.add(road)
                found.append((location[0], location[1], road))

    # A road is shadowed when a road with a longer name was found at the very
    # same place; keep only the longest name(s) per place.
    longest_at = {}
    for word_idx, char_idx, road in found:
        place = (word_idx, char_idx)
        longest_at[place] = max(longest_at.get(place, 0), len(road))
    kept = [hit for hit in found if len(hit[2]) == longest_at[(hit[0], hit[1])]]

    # Order by position in the text: word first, then offset inside the word,
    # which separates roads typed without spaces ("i-93andi-95"). The sort is
    # stable, so roads at the same place keep their discovery order.
    kept.sort(key=lambda hit: (hit[0], hit[1]))
    road_names = [hit[2] for hit in kept]

    roads_string = string_from_list(road_names)
    roads_order = string_from_list(list(range(1, len(road_names) + 1)))
    has_road = 1 if roads_string != "None" else 0
    num_roads = len(road_names)
    return [roads_string, roads_order, has_road, num_roads]

### Cities Towns and Counties Matching

In [86]:
# Load the city / state pairs
cities_list = pd.read_csv(filepath_or_buffer="../Data/Cities_List.csv")

In [87]:
# Load the US counties table (the file is ISO-8859-1 encoded)
counties_list = pd.read_csv(filepath_or_buffer="../Data/us_counties.csv", encoding='ISO-8859-1')

In [88]:
# City names are lowercased before they are searched for in tweets
lowercase_city_names = cities_list["City"].str.lower()
cities_list["City"] = lowercase_city_names

In [89]:
# Lowercase the county names, then keep only the five states of interest
counties_list["County or equivalent"] = counties_list["County or equivalent"].str.lower()
states_of_interest = ["Virginia", "North Carolina", "South Carolina", "Florida", "Texas"]
relevant_counties = counties_list[counties_list["State or district"].isin(states_of_interest)]

In [90]:
# One entry per state with (initially empty) county and city dictionaries
state_list = list(set(relevant_counties["State or district"].to_list()))
state_areas_dict = {
    state_name: {"counties": {}, "cities": {}} for state_name in state_list
}

In [91]:
# Fill in, per state, the spellings of every county (the name itself plus the
# forms with "county" / " county" abbreviated) and of every city (name only)
for state in state_list:
    counties = relevant_counties.loc[
        relevant_counties["State or district"] == state, "County or equivalent"
    ]
    cities = cities_list.loc[cities_list["State"] == state, "City"]
    for county in counties:
        spellings = [county]
        spellings += [county.replace("county", short) for short in ("cty", "cnty", "co")]
        spellings += [county.replace(" county", short) for short in ("county", "cty", "cnty", "co")]
        state_areas_dict[state]["counties"][county] = spellings
    for city in cities:
        state_areas_dict[state]["cities"][city] = [city]

In [102]:
def _locate_area_spelling(words, spellings):
    """Return (word index, character index) of the first spelling found, else None.

    Spellings are tried in order and, for each, the words from left to right.
    A spelling containing spaces is compared against that many consecutive
    words joined by single spaces; the match has to start inside the first of
    those words, which is the word the match is attributed to.
    """
    for spelling in spellings:
        span = spelling.count(" ") + 1
        for i, word in enumerate(words):
            window = word if span == 1 else " ".join(words[i:i + span])
            char_idx = window.find(spelling)
            if char_idx != -1 and char_idx < len(word):
                return i, char_idx
    return None


def check_city_county(state, state_areas_dict, text, area_type):
    """Find the cities or counties of ``state`` that are mentioned in ``text``.

    Each area is recorded at most once, at the first of its spellings that
    occurs in the text. Spellings that contain spaces ("harris county",
    "san antonio") are matched across consecutive words; a single
    whitespace-split word can never contain them.

    Parameters
    ----------
    state : str
        Key of ``state_areas_dict``.
    state_areas_dict : dict
        ``{state: {"counties": {name: spellings}, "cities": {name: spellings}}}``.
    text : str
        Tweet text.
    area_type : str
        "cities" or "counties".

    Returns
    -------
    list
        ``[area_string, area_order, has_area, num_areas]``: the area names in
        order of appearance as a comma-separated string ("None" if empty),
        their ranks "1,2,..." ("None" if empty), 1/0 for whether any area was
        found, and the number of areas. The ranks follow the same convention
        as ``check_roads``; raw word indices were returned here before.
    """
    words = text.split()

    # (word index, character index, area name) for every area found
    found = []
    for area, spellings in state_areas_dict[state][area_type].items():
        location = _locate_area_spelling(words, spellings)
        if location is not None:
            found.append((location[0], location[1], area))

    # Order by position in the text; the sort is stable, so areas found at the
    # same place keep their discovery order.
    found.sort(key=lambda hit: (hit[0], hit[1]))
    area_names = [hit[2] for hit in found]

    area_string = string_from_list(area_names)
    area_order = string_from_list(list(range(1, len(area_names) + 1)))
    has_area = 1 if area_string != "None" else 0
    num_areas = len(area_names)
    return [area_string, area_order, has_area, num_areas]

### Add highway, county and city features to new dataframe for each hurricane
for each hurricane, features include

1) a string that represents a comma separated list of highways found
2) order in which highways/cities/counties appear in text
3) boolean for whether or not a highway/city/county was found
4) number of highways, counties, cities found



In [101]:
def annotate(df):
    """Build the highway, county and city features for every tweet in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a "state" column (a key of ``master_road_dict`` and
        ``state_areas_dict``) and a "text" column.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df``, in the same order but with a fresh 0..n-1
        index, with the columns "highway_string", "has_highway",
        "county_string", "has_county", "city_string" and "has_city". The
        ``*_string`` columns are comma-separated names ("None" if nothing was
        found) and the ``has_*`` columns are 1/0 flags.
    """
    columns = {"highway_string": [],
               "has_highway": [],
               "county_string": [],
               "has_county": [],
               "city_string": [],
               "has_city": []}

    for rows in df.index:
        state = df.loc[rows, "state"]
        text = df.loc[rows, "text"]
        roads_values = check_roads(state, text)
        city_values = check_city_county(state, state_areas_dict, text, "cities")
        county_values = check_city_county(state, state_areas_dict, text, "counties")

        # Only the name string (position 0) and the found flag (position 2)
        # of each result are kept; order and count are not stored.
        columns["highway_string"].append(roads_values[0])
        columns["has_highway"].append(roads_values[2])
        columns["county_string"].append(county_values[0])
        columns["has_county"].append(county_values[2])
        columns["city_string"].append(city_values[0])
        columns["has_city"].append(city_values[2])

    return pd.DataFrame(columns, columns=list(columns))

In [104]:
# Compute and save the highway / county / city features for Florence
florence_new_values = annotate(florence_df)
florence_new_values.to_csv("../Data/florence_new_values.csv")

In [None]:
# Compute and save the highway / county / city features for Harvey
harvey_new_values = annotate(harvey_df)
harvey_new_values.to_csv("../Data/harvey_new_values.csv")

In [106]:
# Compute and save the highway / county / city features for Michael
michael_new_values = annotate(michael_df)
michael_new_values.to_csv("../Data/michael_new_values.csv")