# 3. Location Extraction and Spacy Word Vectorization

In [1]:
import pandas as pd
import numpy as np
import re
import spacy
import string
import datetime

from spacy import displacy

In [2]:
# Read in csv with Tweets
twitter_closures = pd.read_csv("../data/Cleaned_Tweets/cleaned_historic_official_07312019.csv")
rt_closures = pd.read_csv("../data/Cleaned_Tweets/cleaned_RT_official_07312019.csv")

exits = pd.read_csv("../data/interstate_exits.csv")

twitter_closures = twitter_closures[['date', 'text', 'type', 'username', 'tweet', 'state', 'road_closure']]
rt_closures = rt_closures[['date', 'text', 'type', 'username', 'tweet', 'state', 'road_closure']]

# Print DF shape
print(twitter_closures.shape)
print(rt_closures.shape)

# Show head of real time tweets
rt_closures.head()

(24054, 7)
(200, 7)


Unnamed: 0,date,text,type,username,tweet,state,road_closure
0,2019-07-31 19:57:12,updated disabled vehicle in duval on i-295 e ...,official,fl511_northeast,Updated Disabled vehicle in Duval on I-295 E ...,Florida,1
1,2019-07-31 19:53:12,updated disabled vehicle in duval on i-295 e ...,official,fl511_northeast,Updated Disabled vehicle in Duval on I-295 E ...,Florida,1
2,2019-07-31 19:49:14,updated disabled vehicle in duval on i-295 e ...,official,fl511_northeast,Updated Disabled vehicle in Duval on I-295 E ...,Florida,1
3,2019-07-31 19:34:13,cleared planned construction in duval on sr-1...,official,fl511_northeast,Cleared Planned construction in Duval on SR-1...,Florida,0
4,2019-07-31 19:34:12,cleared planned construction in duval on sr-1...,official,fl511_northeast,Cleared Planned construction in Duval on SR-1...,Florida,0


In [49]:
exits['crossSt'] = exits['crossSt'].fillna('None')

## SpaCy Preprocessing

In [3]:
# Create new columns to transfer modified tweet text. Five versions of tweets will be created.
twitter_closures['modified_text'] = ''
twitter_closures['location'] = ''
rt_closures['modified_text'] = ''
rt_closures['location'] = ''

# Show modified DF
twitter_closures.head(2)

Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location
0,2016-10-11 16:39:51+00:00,the pioh for the sr 138 i-20 is going on now u...,official,GDOTATL,The PIOH for the SR 138 I-20 is going on now u...,Georgia,0,,
1,2016-10-10 19:10:23+00:00,we appreciate all the hard work our crews have...,official,GDOTATL,We appreciate all the hard work our crews have...,Georgia,0,,


In [4]:
format_dict = {"hwy": "highway ",
            "blvd": "boulevard",
            " st": "street",
           "CR ": "County Road ",
           "SR ": "State Road",
           "I-": "Interstate ",
           "EB ": "Eastbound ",
           "WB ": "Westbound ",
           "SB ": "Southbound",
           "NB ": "Northbound",
           " on ": " at ",
           " E ": " East ",
           " W ": " West ",
           " S ": " South",
           " N ": " North",
           "mi ": "mile ",
           "between ": "at ",
           "Between ": "at ",
           " In ": " in",
           " in ": " at "}

In [5]:
def spacy_cleaner(df, col, word_dict):
    modified_text = "At " + df[col].replace(word_dict, regex=True)
    modified_text = modified_text.str.title()
    return modified_text

In [6]:
# run the text cleaning function and test results
twitter_closures['modified_text'] = spacy_cleaner(twitter_closures, 'tweet', format_dict)
rt_closures['modified_text'] = spacy_cleaner(rt_closures, 'tweet', format_dict)

In [7]:
twitter_closures['username'].unique()

array(['GDOTATL', 'SCDOTMidlands', 'SCDOTPeeDee', 'SCDOTLowCountry',
       'SCDOTPiedmont', '511statewideva', 'fl511_panhandl', '511Georgia',
       'fl511_state', 'fl511_northeast', 'fl511_southeast',
       'fl511_southwest', 'fl511_tampabay', 'fl511_central',
       '511centralva', '511hamptonroads', '511northernva',
       'NCDOT_Westmtn', 'NCDOT_Triangle', 'NCDOT_Piedmont',
       'NCDOT_Charlotte', 'NCDOT_Asheville', 'NCDOT_Scoast',
       'NCDOT_Ncoast'], dtype=object)

In [8]:
# convert date column to datetime
twitter_closures['date'] = pd.to_datetime(twitter_closures['date'])
rt_closures['date'] = pd.to_datetime(rt_closures['date'])

In [9]:
# for ease of use of the historic tweets, only take tweets that happened from
# October 6, 2016 to October 9, 2016
# the day Hurricane Matthew hit Jacksonville
twitter_closures = twitter_closures[(twitter_closures['date'] > '2016-10-6') & (twitter_closures['date'] < '2016-10-9')]

In [10]:
# only use tweets that contain road closures are from 'fl511 northeast'
loc_df = twitter_closures[(twitter_closures['road_closure'] == 1) & (twitter_closures['username'] == 'fl511_northeast')]
loc_df.shape

(285, 9)

In [11]:
# only take tweets that contain road closures from the real time set
rt_loc_df = rt_closures[(rt_closures['road_closure'] == 1)]
rt_loc_df.shape

(125, 9)

## Run SpaCy Location Extraction 

**WARNING** SpaCy is computationally expensive, extracting these locations will take time.

In [12]:
def get_loc(df, text_column, location_column):
    
    # Use Spacy to extract location names from `text` column
    for i in range(len(df)):
        
        #instantiate spacy model
        nlp = spacy.load("en_core_web_sm")
        
        # create documewnt from modified text column
        doc = nlp(df[text_column].iloc[i])
        
        locations = set()
        
        # loop through every entity in the doc
        for ent in doc.ents:
            
            # find entities labelled as places
            if (ent.label_=='GPE') or (ent.label_=='FAC') or (ent.label_ == 'LOC'):
                
                # put locations in a set
                locations.add(ent.text)
                df[location_column].iloc[i] = locations
                
    return df[location_column]

In [13]:
loc = get_loc(loc_df, 'modified_text', 'location')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
rt_loc = get_loc(rt_loc_df, 'modified_text', 'location')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [15]:
loc_df['location'] = loc
print(loc_df.shape)
loc_df.head()

(285, 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location
6649,2016-10-08 23:33:38+00:00,new disabled vehicle in duval on sr-202 but...,official,fl511_northeast,NEW Disabled vehicle in Duval on SR-202 But...,Florida,1,At New Disabled Vehicle At Duval At Sr-202 ...,"{Kernan Blvd Right Shoulder Blocked, Blvd East}"
6651,2016-10-08 23:18:05+00:00,new disabled vehicle in duval on i-295 w nort...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-295 W nort...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,"{San Jose, Interstate 295 West North}"
6657,2016-10-08 21:58:06+00:00,new unconfirmed disabled vehicle in duval on ...,official,fl511_northeast,NEW Unconfirmed disabled vehicle in Duval on ...,Florida,1,At New Unconfirmed Disabled Vehicle At Duval ...,{Interstate 10 East Ramp To Interstate 95}
6660,2016-10-08 21:33:05+00:00,update disabled vehicle in duval on i-295 w n...,official,fl511_northeast,UPDATE Disabled vehicle in Duval on I-295 W n...,Florida,1,At Update Disabled Vehicle At Duval At Inters...,{Interstate 295 West North}
6662,2016-10-08 21:28:40+00:00,new disabled vehicle in duval on i-295 w nort...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-295 W nort...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,{Interstate 295 West North}


In [16]:
rt_loc_df['location'] = rt_loc
print(rt_loc_df.shape)
rt_loc_df.head()

(125, 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location
0,2019-07-31 19:57:12,updated disabled vehicle in duval on i-295 e ...,official,fl511_northeast,Updated Disabled vehicle in Duval on I-295 E ...,Florida,1,At Updated Disabled Vehicle At Duval At Inter...,"{Interstate 295 East South, Dames Point}"
1,2019-07-31 19:53:12,updated disabled vehicle in duval on i-295 e ...,official,fl511_northeast,Updated Disabled vehicle in Duval on I-295 E ...,Florida,1,At Updated Disabled Vehicle At Duval At Inter...,"{Dames Point 2, Interstate 295 East South}"
2,2019-07-31 19:49:14,updated disabled vehicle in duval on i-295 e ...,official,fl511_northeast,Updated Disabled vehicle in Duval on I-295 E ...,Florida,1,At Updated Disabled Vehicle At Duval At Inter...,"{Interstate 295 East South, Dames Point}"
5,2019-07-31 19:33:16,new disabled vehicle in duval on i-295 e sout...,official,fl511_northeast,New Disabled vehicle in Duval on I-295 E sout...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,"{Interstate 295 East South, Dames Point}"
7,2019-07-31 18:58:11,updated object on roadway in st johns on i-95...,official,fl511_northeast,Updated Object on roadway in St Johns on I-95...,Florida,1,At Updated Object At Roadway At St Johns At I...,"{Interstate 95 North At Exit, Sr-16 Right Lane..."


In [17]:
loc_df.to_csv("../data/Loc_Extracted/tweet_locations_sample_08012019.csv", index = False)
rt_loc_df.to_csv("../data/Loc_Extracted/rt_locations_sample_08012019.csv", index = False)

In [18]:
loc_df = pd.read_csv("../data/Loc_Extracted/tweet_locations_sample_07312019.csv")
rt_loc_df = pd.read_csv("../data/Loc_Extracted/rt_locations_sample_07312019.csv")

## GPS Coordinate Extraction using Interstate Exits

In [73]:
# function to extract interstate, exit number, and direction
def exit_extractor (df, col, i_df):
    
    # instantiate lists for exit data
    exits = []
    interstates = []
    direction = []
    cross_st = []
    
    # loop through text column
    for item in df[col]:
        
        # look for "interstate" in text
        if 'Interstate' in item:
            
            # use regex to extract interstate and number from text
            i_string = re.search(r'Interstate (\S+)', item)
            interstates.append(i_string.group(0))
            
            # use regex to extract direction following "interstate"
            d_string = re.search("(i-\d*|Interstate \d*) (South|North|East|West)*", item)
            d_string = d_string.group(0)
            d_string =  re.search("South|North|East|West", d_string)
            
            # try to extract the direction from the regex object
            # append null if an error is thrown
            try:
                d_string = d_string.group(0)
                
            except AttributeError: 
                d_string = np.nan
                
            # append direction to list    
            direction.append(d_string)
                             
            # find "exit" in text
            if 'Exit' in item:
                
                # use regex to extract interstate and number from text
                e_string = re.search(r'Exit (\S+)', item)
                exits.append(e_string.group(0))
            
            # add "none" when no exit is found   
            else:
                exits.append("None")
                
        # add "none" to exits and interstates if no interstate is found
        else:
            interstates.append("None")
            exits.append("None")
            direction.append("None")
            
    # create a new dataframe from the interstate and exit lists
    new_df = pd.DataFrame(data = interstates, columns = ['interstate'])
    new_df['exits'] = exits
    new_df['direction'] = direction
    
    # return new dataframe
    return new_df

In [70]:
# function to extract longitude and latitude, if available
def loc_extractor(new_df, i_df):
    
    lat = []
    long = []
    cross = []
    
    # loop through the new dataframe
    for index, row in new_df.iterrows():
        
        # find rows that have both an interstate and exit extracted
        if (row['interstate'] != "None") and (row['exits'] != "None") and row['direction'] != "None":
            
            # attempt to add lat and long based on exit and interstate strings
            try:    
                mask = (i_df['interstate'].str.contains(row['interstate'])) & (i_df['exits'].str.contains(row['exits']))
                
                # add lat and long to list
                lat.append(i_df[mask].iloc[0]['lat'])
                long.append(i_df[mask].iloc[0]['long'])
            
            # if an error occurs, append null to lat and long
            # print index where error occured
            except:
                print(f"No exit found at {index}")

                lat.append(np.nan)
                long.append(np.nan)
        # if no exit is found, add null values to lat and long
        else:
            lat.append(np.nan)
            long.append(np.nan)
            
    # add lat and long to new dataframe
    new_df['lat'] = lat
    new_df['long'] = long
    
    return new_df

In [71]:
e_df = loc_extractor(exit_extractor(loc_df, 'modified_text', exits), exits)
final_df = pd.concat([loc_df, e_df], axis = 1)

No exit found at 148
No exit found at 154
No exit found at 206
No exit found at 216
No exit found at 237
No exit found at 248
No exit found at 250


In [72]:
rt_e_df = loc_extractor(exit_extractor(rt_loc_df, 'modified_text', exits), exits)
final_rt_df = pd.concat([rt_loc_df, rt_e_df], axis = 1)

No exit found at 34
No exit found at 76
No exit found at 80
No exit found at 81


In [42]:
final_rt_df.dropna()

Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location,interstate,exits,direction,lat,long,cross_st
4,2019-07-31 18:58:11,updated object on roadway in st johns on i-95...,official,fl511_northeast,Updated Object on roadway in St Johns on I-95...,Florida,1,At Updated Object At Roadway At St Johns At I...,"{'Johns', 'Sr-16 Right Lane Blocked', 'Interst...",Interstate 95,Exit 318,North,29.91303,-81.4105,SR 16
5,2019-07-31 18:17:07,updated emergency road construction in st joh...,official,fl511_northeast,Updated Emergency road construction in St Joh...,Florida,1,At Updated Emergency Road Construction At St ...,"{'Johns', 'Exit 298', 'Interstate 95 South'}",Interstate 95,Exit 298,South,29.66032,-81.28373,US-1
6,2019-07-31 18:03:22,updated emergency road construction in st joh...,official,fl511_northeast,Updated Emergency road construction in St Joh...,Florida,1,At Updated Emergency Road Construction At St ...,"{'Johns', 'Exit 298', 'Interstate 95 South'}",Interstate 95,Exit 298,South,29.66032,-81.28373,US-1
7,2019-07-31 17:36:22,updated planned construction in columbia on i...,official,fl511_northeast,Updated Planned construction in Columbia on I...,Florida,1,At Updated Planned Construction At Columbia A...,{'Interstate 10 West Ramp To Exit 296 Intersta...,Interstate 10,Exit 296,West,30.26736,-82.75739,I-75 South
10,2019-07-31 16:19:08,updated vehicle on fire in st johns on i-95 s...,official,fl511_northeast,Updated Vehicle on fire in St Johns on I-95 s...,Florida,1,At Updated Vehicle At Fire At St Johns At Int...,"{'Johns', 'Exit 298', 'Interstate 95 South'}",Interstate 95,Exit 298,South,29.66032,-81.28373,US-1
11,2019-07-31 16:18:07,updated emergency vehicles in st johns on i-9...,official,fl511_northeast,Updated Emergency vehicles in St Johns on I-9...,Florida,1,At Updated Emergency Vehicles At St Johns At ...,"{'Johns', 'Exit 298', 'Interstate 95 South'}",Interstate 95,Exit 298,South,29.66032,-81.28373,US-1
25,2019-07-31 11:38:18,updated vehicle on fire in st johns on i-95 s...,official,fl511_northeast,Updated Vehicle on fire in St Johns on I-95 s...,Florida,1,At Updated Vehicle At Fire At St Johns At Int...,"{'Johns', 'Exit 298', 'Interstate 95 South'}",Interstate 95,Exit 298,South,29.66032,-81.28373,US-1
26,2019-07-31 11:27:06,new vehicle on fire in st johns on i-95 south...,official,fl511_northeast,New Vehicle on fire in St Johns on I-95 south...,Florida,1,At New Vehicle At Fire At St Johns At Interst...,"{'Exit 298', 'Interstate 95 South'}",Interstate 95,Exit 298,South,29.66032,-81.28373,US-1
28,2019-07-31 11:03:08,updated emergency vehicles in st johns on i-9...,official,fl511_northeast,Updated Emergency vehicles in St Johns on I-9...,Florida,1,At Updated Emergency Vehicles At St Johns At ...,"{'Johns', 'Interstate 95 South Ramp To Exit 298'}",Interstate 95,Exit 298,South,29.66032,-81.28373,US-1
29,2019-07-31 10:46:05,updated emergency vehicles in st johns on i-9...,official,fl511_northeast,Updated Emergency vehicles in St Johns on I-9...,Florida,1,At Updated Emergency Vehicles At St Johns At ...,"{'Johns', 'Interstate 95 South Ramp To Exit 298'}",Interstate 95,Exit 298,South,29.66032,-81.28373,US-1


In [43]:
final_df.dropna().head()

Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location,interstate,...,exits.1.1,direction.1.1,lat.1.1,long.1.1,interstate.1,exits,direction,lat,long,cross_st
6,2016-10-08 20:38:21+00:00,new disabled vehicle in duval on i-295 e nort...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-295 E nort...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,{'Interstate 295 East North Ramp'},Interstate 295,...,Exit 60,East,30.16893,-81.53623,Interstate 295,Exit 60,East,30.16893,-81.53623,Philips Hwy
13,2016-10-08 19:08:32+00:00,new disabled vehicle in duval on i-295 w sout...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-295 W sout...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,{'Interstate 295 West South Ramp To Exit 22 Co...,Interstate 295,...,Exit 22,West,30.33213,-81.76245,Interstate 295,Exit 22,West,30.33213,-81.76245,Commonwealth Blvd
25,2016-10-08 15:53:25+00:00,new disabled vehicle in duval on i-295 e nort...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-295 E nort...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,{'Interstate 295 East North Ramp To Exit 51 Be...,Interstate 295,...,Exit 51,East,30.29054,-81.52207,Interstate 295,Exit 51,East,30.29054,-81.52207,Beach Blvd
27,2016-10-08 15:43:35+00:00,new disabled vehicle in duval on i-10 east ra...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-10 east ra...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,"{'Interstate 10 East Ramp', 'Interstate 295 We...",Interstate 10,...,Exit 356,East,30.31518,-81.77517,Interstate 10,Exit 356,East,30.31518,-81.77517,I-295
38,2016-10-08 13:48:03+00:00,new disabled vehicle in duval on i-95 north r...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-95 north r...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,{'Interstate 95 North Ramp To Exit 366'},Interstate 95,...,Exit 366,North,30.51338,-81.63441,Interstate 95,Exit 366,North,30.51338,-81.63441,Pecan Park Rd


In [25]:
final_df.to_csv("../data/Loc_Extracted/tweet_locations_sample_08012019.csv", index = False)
rt_loc_df.to_csv("../data/Loc_Extracted/rt_locations_sample_08012019.csv", index = False)

## SpaCy Visualization

In [26]:
nlp = spacy.load("en_core_web_sm")
text = final_rt_df.iloc[110]['modified_text']
doc = nlp(text)
displacy.render(doc, style="ent", jupyter = True)

In [27]:
text = final_rt_df.iloc[96]['modified_text']
doc = nlp(text)
displacy.render(doc, style="ent", jupyter = True)