# Location Extraction and Spacy Word Vectorization

In [1]:
import pandas as pd
import re
import spacy
import string

import geocoder

In [2]:
# Read in csv with Tweets
twitter_closures = pd.read_csv("../data/Cleaned_Tweets/cleaned_historic_official_07242019.csv")

twitter_closures = twitter_closures[['text', 'type', 'username', 'tweet', 'road_closure']]

# Print DF shape
print(twitter_closures.shape)

# Show head 
twitter_closures.head()

(43016, 5)


Unnamed: 0,text,type,username,tweet,road_closure
0,Douglas County Crash - EB I-20 b4 Post Rd exi...,official,GDOTATL,Douglas County Crash - EB I-20 b4 Post Rd exi...,0
1,With rain on the way - we have an important me...,official,GDOTATL,With rain on the way - we have an important me...,0
2,Much needed rain is headed our way please b...,official,GDOTATL,Much needed rain is headed our way please b...,0
3,Happy Thanksgiving Please drive safe and pati...,official,GDOTATL,Happy Thanksgiving Please drive safe and pati...,0
4,Have a safe Thanksgiving Put away your cell p...,official,GDOTATL,Have a safe Thanksgiving Put away your cell p...,0


## SpaCy Preprocessing

In [3]:
# Create new columns to transfer modified tweet text. Five versions of tweets will be created.
twitter_closures['modified_text'] = ''
twitter_closures['location'] = ''

# Show modified DF
twitter_closures.head(2)

Unnamed: 0,text,type,username,tweet,road_closure,modified_text,location
0,Douglas County Crash - EB I-20 b4 Post Rd exi...,official,GDOTATL,Douglas County Crash - EB I-20 b4 Post Rd exi...,0,,
1,With rain on the way - we have an important me...,official,GDOTATL,With rain on the way - we have an important me...,0,,


In [4]:
format_dict = {"hwy": "Highway ",
            "Blvd": "Boulevard",
            " st": "street",
           "CR ": "Country Road ",
           "SR ": "State Road",
           "I-": "Interstate ",
           "EB ": "Eastbound ",
           "WB ": "Westbound ",
           "SB ": "Southbound",
           "NB ": "Northbound",
           " on ": " at ",
           " E ": " East ",
           " W ": " West ",
           " S ": " South",
           " N ": " North",
           "mi ": "mile ",
           "between ": "at ",
           "Between ": "at ",
           "In ": "in ",
           " in ": " at "}

In [5]:
def spacy_cleaner(df, col, word_dict):
    modified_text = "At " + df[col].replace(word_dict, regex=True)
    modified_text = modified_text.str.title()
    return modified_text

In [6]:
# run the text cleaning function and test results
twitter_closures['modified_text'] = spacy_cleaner(twitter_closures, 'text', format_dict)

In [7]:
twitter_closures['username'].unique()

array(['GDOTATL', 'SCDOTMidlands', 'SCDOTPeeDee', 'SCDOTLowCountry',
       'SCDOTPiedmont', '511statewideva', 'fl511_panhandl', '511Georgia',
       'fl511_state', 'fl511_northeast', 'fl511_southeast',
       'fl511_southwest', 'fl511_tampabay', 'fl511_central',
       '511centralva', '511hamptonroads', '511northernva',
       'NCDOT_Westmtn', 'NCDOT_Triangle', 'NCDOT_Piedmont',
       'NCDOT_Charlotte', 'NCDOT_Asheville', 'NCDOT_Scoast',
       'NCDOT_Ncoast'], dtype=object)

In [8]:
test_df = twitter_closures[(twitter_closures['road_closure'] == 1) & (twitter_closures['username'] == 'NCDOT_Triangle')].head(100)

In [9]:
def get_loc(df, text_column, location_column):
    
    # Use Spacy to extract location names from `text` column
    for i in range(len(df)):
        
        #instantiate spacy model
        nlp = spacy.load("en_core_web_sm")
        
        # create documewnt from modified text column
        doc = nlp(df[text_column].iloc[i])
        
        locations = set()
        
        # loop through every entity in the doc
        for ent in doc.ents:
            
            # find entities labelled as places
            if (ent.label_=='GPE') or (ent.label_=='FAC') or (ent.label_ == 'LOC'):
                
                # put locations in a set
                locations.add(ent.text)
                df[location_column].iloc[i] = locations
                
    return df[location_column]

In [10]:
test = get_loc(test_df, 'modified_text', 'location')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [11]:
test_df['location'] = test

In [12]:
test_df

Unnamed: 0,text,type,username,tweet,road_closure,modified_text,location
39347,"I-40, Mile Marker 295, East, Raleigh, Wake, Ve...",official,NCDOT_Triangle,"I-40, Mile Marker 295, East, Raleigh, Wake, Ve...",1,"At Interstate 40, Mile Marker 295, East, Ralei...","{East, Interstate 40}"
39348,"I-40, Mile Marker 295, East, Raleigh, Wake, Ve...",official,NCDOT_Triangle,"I-40, Mile Marker 295, East, Raleigh, Wake, Ve...",1,"At Interstate 40, Mile Marker 295, East, Ralei...","{East, Interstate 40}"
39349,"I-40, Mile Marker 295, East, Raleigh, Wake, Ve...",official,NCDOT_Triangle,"I-40, Mile Marker 295, East, Raleigh, Wake, Ve...",1,"At Interstate 40, Mile Marker 295, East, Ralei...","{East, Interstate 40}"
39350,"I-40, Mile Marker 300, East, Raleigh, Wake, Ve...",official,NCDOT_Triangle,"I-40, Mile Marker 300, East, Raleigh, Wake, Ve...",1,"At Interstate 40, Mile Marker 300, East, Ralei...","{East, Interstate 40}"
39351,"I-540, Mile Marker 2, East, Raleigh, Wake, Veh...",official,NCDOT_Triangle,"I-540, Mile Marker 2, East, Raleigh, Wake, Veh...",1,"At Interstate 540, Mile Marker 2, East, Raleig...","{East, Interstate 540}"
39352,"I-40, Mile Marker 297, East, Raleigh, Wake, Di...",official,NCDOT_Triangle,"I-40, Mile Marker 297, East, Raleigh, Wake, Di...",1,"At Interstate 40, Mile Marker 297, East, Ralei...","{East, Interstate 40}"
39353,"I-40, Mile Marker 279, West, Durham, Durham, V...",official,NCDOT_Triangle,"I-40, Mile Marker 279, West, Durham, Durham, V...",1,"At Interstate 40, Mile Marker 279, West, Durha...","{Durham, Interstate 40, West}"
39354,"I-540, In Raleigh Wake, Vehicle Accident, Lane...",official,NCDOT_Triangle,"I-540, In Raleigh Wake, Vehicle Accident, Lane...",1,"At Interstate 540, In Raleigh Wake, Vehicle Ac...","{Raleigh Wake, Interstate 540}"
39355,"I-40, Mile Marker 297, East, Raleigh, Wake, Di...",official,NCDOT_Triangle,"I-40, Mile Marker 297, East, Raleigh, Wake, Di...",1,"At Interstate 40, Mile Marker 297, East, Ralei...","{East, Interstate 40}"
39356,"I-40, Mile Marker 270, West, Durham, Durham, V...",official,NCDOT_Triangle,"I-40, Mile Marker 270, West, Durham, Durham, V...",1,"At Interstate 40, Mile Marker 270, West, Durha...","{Durham, Interstate 40, West}"
