# Location Extraction and Spacy Word Vectorization

In [1]:
import pandas as pd
import re
import spacy
import string


In [2]:
# Read in csv with Tweets
twitter_closures = pd.read_csv("../data/Cleaned_Tweets/cleaned_historic_official_07252019.csv")

twitter_closures = twitter_closures[['text', 'type', 'username', 'tweet', 'road_closure']]

# Print DF shape
print(twitter_closures.shape)

# Show head 
twitter_closures.head()

(43016, 5)


Unnamed: 0,text,type,username,tweet,road_closure
0,Douglas County Crash - EB I-20 b4 Post Rd exi...,official,GDOTATL,Douglas County Crash - EB I-20 b4 Post Rd exi...,0
1,With rain on the way - we have an important me...,official,GDOTATL,With rain on the way - we have an important me...,0
2,Much needed rain is headed our way please b...,official,GDOTATL,Much needed rain is headed our way please b...,0
3,Happy Thanksgiving Please drive safe and pati...,official,GDOTATL,Happy Thanksgiving Please drive safe and pati...,0
4,Have a safe Thanksgiving Put away your cell p...,official,GDOTATL,Have a safe Thanksgiving Put away your cell p...,0


## SpaCy Preprocessing

In [3]:
# Create new columns to transfer modified tweet text. Five versions of tweets will be created.
twitter_closures['modified_text'] = ''
twitter_closures['location'] = ''

# Show modified DF
twitter_closures.head(2)

Unnamed: 0,text,type,username,tweet,road_closure,modified_text,location
0,Douglas County Crash - EB I-20 b4 Post Rd exi...,official,GDOTATL,Douglas County Crash - EB I-20 b4 Post Rd exi...,0,,
1,With rain on the way - we have an important me...,official,GDOTATL,With rain on the way - we have an important me...,0,,


In [4]:
format_dict = {"hwy": "Highway ",
            "Blvd": "Boulevard",
            " st": "street",
           "CR ": "Country Road ",
           "SR ": "State Road",
           "I-": "Interstate ",
           "EB ": "Eastbound ",
           "WB ": "Westbound ",
           "SB ": "Southbound",
           "NB ": "Northbound",
           " on ": " at ",
           " E ": " East ",
           " W ": " West ",
           " S ": " South",
           " N ": " North",
           "mi ": "mile ",
           "between ": "at ",
           "Between ": "at ",
           "In ": "in ",
           " in ": " at "}

In [5]:
def spacy_cleaner(df, col, word_dict):
    modified_text = "At " + df[col].replace(word_dict, regex=True)
    modified_text = modified_text.str.title()
    return modified_text

In [6]:
# run the text cleaning function and test results
twitter_closures['modified_text'] = spacy_cleaner(twitter_closures, 'text', format_dict)

In [7]:
twitter_closures['username'].unique()

array(['GDOTATL', 'SCDOTMidlands', 'SCDOTPeeDee', 'SCDOTLowCountry',
       'SCDOTPiedmont', '511statewideva', 'fl511_panhandl', '511Georgia',
       'fl511_state', 'fl511_northeast', 'fl511_southeast',
       'fl511_southwest', 'fl511_tampabay', 'fl511_central',
       '511centralva', '511hamptonroads', '511northernva',
       'NCDOT_Westmtn', 'NCDOT_Triangle', 'NCDOT_Piedmont',
       'NCDOT_Charlotte', 'NCDOT_Asheville', 'NCDOT_Scoast',
       'NCDOT_Ncoast'], dtype=object)

In [8]:
test_df = twitter_closures[(twitter_closures['road_closure'] == 1) & (twitter_closures['username'] == 'fl511_northeast')].head(100)

In [9]:
def get_loc(df, text_column, location_column):
    
    # Use Spacy to extract location names from `text` column
    for i in range(len(df)):
        
        #instantiate spacy model
        nlp = spacy.load("en_core_web_sm")
        
        # create documewnt from modified text column
        doc = nlp(df[text_column].iloc[i])
        
        locations = set()
        
        # loop through every entity in the doc
        for ent in doc.ents:
            
            # find entities labelled as places
            if (ent.label_=='GPE') or (ent.label_=='FAC') or (ent.label_ == 'LOC'):
                
                # put locations in a set
                locations.add(ent.text)
                df[location_column].iloc[i] = locations
                
    return df[location_column]

In [10]:
test = get_loc(test_df, 'modified_text', 'location')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [11]:
test_df['location'] = test
test_df.head(20)

Unnamed: 0,text,type,username,tweet,road_closure,modified_text,location
19213,NEW Crash in Duval on I-295 E north beyond Ph...,official,fl511_northeast,NEW Crash in Duval on I-295 E north beyond Ph...,1,At New Crash At Duval At Interstate 295 East ...,
19214,CLEARED Traffic congestion in Duval on I-95 n...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-95 n...,1,At Cleared Traffic Congestion At Duval At Int...,{Exit 351 Interstate 10}
19215,CLEARED Traffic congestion in Duval on I-295 ...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 ...,1,At Cleared Traffic Congestion At Duval At Int...,"{Buckman, San Jose}"
19216,UPDATE Traffic congestion in Duval on I-95 no...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-95 no...,1,At Update Traffic Congestion At Duval At Inte...,{Exit 351 Interstate 10}
19217,CLEARED Traffic congestion in Duval on I-295 ...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 ...,1,At Cleared Traffic Congestion At Duval At Int...,"{Exit 56, East South}"
19218,CLEARED Traffic congestion in Duval on I-95 s...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-95 s...,1,At Cleared Traffic Congestion At Duval At Int...,{Exit 341 Baymeadows}
19219,CLEARED Traffic congestion in Duval on I-295 ...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 ...,1,At Cleared Traffic Congestion At Duval At Int...,{East North}
19220,CLEARED Traffic congestion in Duval on SR-202...,official,fl511_northeast,CLEARED Traffic congestion in Duval on SR-202...,1,At Cleared Traffic Congestion At Duval At Sr-...,{Kernan Boulevard}
19221,UPDATE Traffic congestion in Duval on I-295 W...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-295 W...,1,At Update Traffic Congestion At Duval At Inte...,"{Buckman, San Jose}"
19222,CLEARED Traffic congestion in Duval on I-95 s...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-95 s...,1,At Cleared Traffic Congestion At Duval At Int...,{Exit 351 Interstate 10}


In [12]:
i95 = pd.read_csv('../df_csv')
i95.head()

Unnamed: 0,exit,crossSt,dir,lat,long
0,1A,,S,25.75506,-80.202
1,1B,,N,25.76255,-80.1996
2,1B,,S,25.77588,-80.19986
3,2A,,N,25.77139,-80.19882
4,2B,,N,25.77339,-80.19866


In [13]:
test_df.loc[19218]['tweet'].lower()

'cleared  traffic congestion in duval on i-95 south from exit 344 butler to at exit 341 baymeadows '

In [14]:
exit_loc = test_df.loc[19218]['tweet'].lower().find('exit')
exit_loc = exit_loc + 5

In [16]:
import numpy as np

In [17]:
import re

#Use tweet text, convert to lower case
txt = "Cleared traffic congestion in Duval on I-295 from exit 344a W butler to at exit 341 W baymeadows "
txt=txt.lower()

#Find the Road
road = re.search("(i-\d*|interstate \d*)", txt)
road = road.group(0)
road = road.strip(' ')
print(f'Road: {road}')

#Find the Direction, specifically that are related to the interstate
direction =  re.search("(i-\d*|interstate \d*) (south|north|s|n)*", txt)
direction = direction.group(0)
direction =  re.search("south|north|s|n", direction)

#If direction is found, log direction, otherwise direction is null
try:
    direction = direction.group(0)
except AttributeError: 
    direction = np.nan
print(f'Direction: {direction}')

Road: i-295
Direction: nan


In [25]:
test_df['tweet'].head()

19213    NEW  Crash in Duval on I-295 E north beyond Ph...
19214    CLEARED  Traffic congestion in Duval on I-95 n...
19215    CLEARED  Traffic congestion in Duval on I-295 ...
19216    UPDATE  Traffic congestion in Duval on I-95 no...
19217    CLEARED  Traffic congestion in Duval on I-295 ...
Name: tweet, dtype: object

In [106]:
test_df['text'].iloc[5]

'CLEARED  Traffic congestion in Duval on I-95 south from Exit 344 Butler to at Exit 341 Baymeadows '

In [109]:
def traffic_search(test_df='test_df', column='text', i_df='i95'):
    
    #Instantiate DataFrame 
    new_df = pd.DataFrame()
    
    #Column data to collect
    exits = []
    roads = []
    lats = []
    longs = []
    directions = []
    match_list = []
    
    for row in range(len(test_df)):
        #Mask of the cell to be iterated
        search_string = df[column].iloc[row]
    
        #Check if it's cleared
        if 'cleared' in search_string:
            pass

        #Else collect data 
        else:        
            #Find the road
            road = re.search("(i-\d*|interstate \d*)", search_string)
            road = road.group(0)
            road = road.strip(' ')
            roads.append(road)

            #Find the exit
            exit = re.search("exit \d*[ab-e]*", search_string)
            exit = exit.group(0)
            exit.strip(' ')
            exits.append(exit)

            #Find the Direction -- specifically that is connected to the road
            direction =  re.search("(i-\d*|interstate \d*) (south|north|west|east|s|n|e|w)*", search_string)
            direction = direction.group(0)
            direction =  re.search("south|north|west|east|s|n|e|w", direction

                                   #If direction is found, log direction, otherwise direction is null
            try:
                direction = direction.group(0)
                directions.append(direction)
            
            except AttributeError: 
                direction = np.nan
                directions.append(direction)

            #Find title case words for cross streets
            title_text = text.split(" ")
            title_text = {word for word in title_text if word == word.title()}

            #Remove county name as a search term.  Interferes with road matching
            title_text.remove('')
            title_text.remove('Duval')
            
            #Find Matches
            matches = []
            for word in title_text:
                for row in range(len(i95)):
                    try:
                        if word in i95['crossSt'].iloc[row]:  
                            matches.append(row)
                    except TypeError:
                        pass
            match_list.append(matches)
    
    new_df['road'] = roads
    new_df['exit'] = exits
    new_df['direction'] = directions
    new_df['match'] = match_list
#    new_df['lat'] = lats
#    new_df['long'] = longs
    return new_df

SyntaxError: invalid syntax (<ipython-input-109-ea7aae7c86ae>, line 42)

In [117]:
text = "At New  Disabled Vehicle At Duval At Interstate 10 East Before Cassat, Right Shoulder Blocked"

#Pull out 'title case' words
title_text = text.split(" ")
title_text = {word for word in title_text if word == word.title()}

title_text.remove('')
title_text.remove('Duval')
title_text

{'10',
 'At',
 'Before',
 'Blocked',
 'Cassat,',
 'Disabled',
 'East',
 'Interstate',
 'New',
 'Right',
 'Shoulder',
 'Vehicle'}

In [122]:
#Make a list of matching rows for the title text search
matches = []
for word in title_text:
    for row in range(len(i10)):
        try:
            if word in i10['crossSt'].iloc[row]:  
                matches.append(row)
        except TypeError:
            pass

i10.iloc[matches]

Unnamed: 0,exit,crossSt,dir,lat,long
8,12,I-110,E,30.50348,-87.23715
9,12,I-110,W,30.50379,-87.22393
114,357,SR 103 Lane Ave,E,30.3155,-81.75616
115,357,SR 103 Lane Ave,W,30.31602,-81.74865
64,209B,US 90 East Mahan Dr,E,30.48486,-84.15795
65,209B,US 90 East Mahan Dr,W,30.4821,-84.15316


In [None]:
road = re.search("(i-\d*|interstate \d*) (south|north|s|n)", txt)
direction = direction.group(0)
direction

In [None]:
exit = re.search("exit \d*[ab-e]*", txt)
exit = exit.group(0)
exit.strip(' ')

In [None]:
temp_df

In [None]:
result = i95[i95['crossSt'].str.contains('SR 202') == True]
result

In [None]:
pd.options.display.max_colwidth = 200
test_df[test_df['text'].str.contains('I-95') == True]['text']

In [None]:
i95['road'] = 'I-95'

In [None]:
i95.head()

In [111]:
i10 = pd.read_csv('../i10_csv')

In [112]:
i10

Unnamed: 0,exit,crossSt,dir,lat,long
0,5,US 90 Alt,E,30.53662,-87.33782
1,5,US 90 Alt,W,30.53215,-87.33082
2,7,SR 297,W,30.51983,-87.31054
3,7A,SR 297 South,E,30.52261,-87.31612
4,7B,SR 297 North,E,30.52072,-87.31273
5,10A,US 29 South,E,30.50267,-87.26841
6,10B,US 29 North,E,30.50226,-87.26566
7,10,US 29 North,W,30.50409,-87.26204
8,12,I-110,E,30.50348,-87.23715
9,12,I-110,W,30.50379,-87.22393
