# Location Extraction and Spacy Word Vectorization

In [2]:
import pandas as pd
import re
import spacy
import string
import datetime

In [3]:
# Read in csv with Tweets
twitter_closures = pd.read_csv("../data/Cleaned_Tweets/cleaned_historic_official_07252019.csv")

twitter_closures = twitter_closures[['date', 'text', 'type', 'username', 'tweet', 'road_closure']]

# Print DF shape
print(twitter_closures.shape)

# Show head 
twitter_closures.head()

(43016, 6)


Unnamed: 0,date,text,type,username,tweet,road_closure
0,2016-11-29 14:33:36+00:00,Douglas County Crash - EB I-20 b4 Post Rd exi...,official,GDOTATL,Douglas County Crash - EB I-20 b4 Post Rd exi...,0
1,2016-11-28 20:35:05+00:00,With rain on the way - we have an important me...,official,GDOTATL,With rain on the way - we have an important me...,0
2,2016-11-28 19:02:02+00:00,Much needed rain is headed our way please b...,official,GDOTATL,Much needed rain is headed our way please b...,0
3,2016-11-24 19:01:16+00:00,Happy Thanksgiving Please drive safe and pati...,official,GDOTATL,Happy Thanksgiving Please drive safe and pati...,0
4,2016-11-24 01:00:30+00:00,Have a safe Thanksgiving Put away your cell p...,official,GDOTATL,Have a safe Thanksgiving Put away your cell p...,0


## SpaCy Preprocessing

In [4]:
# Create new columns to transfer modified tweet text. Five versions of tweets will be created.
twitter_closures['modified_text'] = ''
twitter_closures['location'] = ''

# Show modified DF
twitter_closures.head(2)

Unnamed: 0,date,text,type,username,tweet,road_closure,modified_text,location
0,2016-11-29 14:33:36+00:00,Douglas County Crash - EB I-20 b4 Post Rd exi...,official,GDOTATL,Douglas County Crash - EB I-20 b4 Post Rd exi...,0,,
1,2016-11-28 20:35:05+00:00,With rain on the way - we have an important me...,official,GDOTATL,With rain on the way - we have an important me...,0,,


In [5]:
format_dict = {"hwy": "Highway ",
            "Blvd": "Boulevard",
            " st": "street",
           "CR ": "Country Road ",
           "SR ": "State Road",
           "I-": "Interstate ",
           "EB ": "Eastbound ",
           "WB ": "Westbound ",
           "SB ": "Southbound",
           "NB ": "Northbound",
           " on ": " at ",
           " E ": " East ",
           " W ": " West ",
           " S ": " South",
           " N ": " North",
           "mi ": "mile ",
           "between ": "at ",
           "Between ": "at ",
           "In ": "in ",
           " in ": " at "}

In [6]:
def spacy_cleaner(df, col, word_dict):
    modified_text = "At " + df[col].replace(word_dict, regex=True)
    modified_text = modified_text.str.title()
    return modified_text

In [7]:
# run the text cleaning function and test results
twitter_closures['modified_text'] = spacy_cleaner(twitter_closures, 'text', format_dict)

In [8]:
twitter_closures['username'].unique()

array(['GDOTATL', 'SCDOTMidlands', 'SCDOTPeeDee', 'SCDOTLowCountry',
       'SCDOTPiedmont', '511statewideva', 'fl511_panhandl', '511Georgia',
       'fl511_state', 'fl511_northeast', 'fl511_southeast',
       'fl511_southwest', 'fl511_tampabay', 'fl511_central',
       '511centralva', '511hamptonroads', '511northernva',
       'NCDOT_Westmtn', 'NCDOT_Triangle', 'NCDOT_Piedmont',
       'NCDOT_Charlotte', 'NCDOT_Asheville', 'NCDOT_Scoast',
       'NCDOT_Ncoast'], dtype=object)

In [9]:
twitter_closures['date'] = pd.to_datetime(twitter_closures['date'])

In [10]:
test_df = twitter_closures[(twitter_closures['road_closure'] == 1) & (twitter_closures['username'] == 'fl511_northeast')].head(100)

In [11]:
def get_loc(df, text_column, location_column):
    
    # Use Spacy to extract location names from `text` column
    for i in range(len(df)):
        
        #instantiate spacy model
        nlp = spacy.load("en_core_web_sm")
        
        # create documewnt from modified text column
        doc = nlp(df[text_column].iloc[i])
        
        locations = set()
        
        # loop through every entity in the doc
        for ent in doc.ents:
            
            # find entities labelled as places
            if (ent.label_=='GPE') or (ent.label_=='FAC') or (ent.label_ == 'LOC'):
                
                # put locations in a set
                locations.add(ent.text)
                df[location_column].iloc[i] = locations
                
    return df[location_column]

In [12]:
test = get_loc(test_df, 'modified_text', 'location')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [13]:
test_df['location'] = test
test_df.head()

Unnamed: 0,date,text,type,username,tweet,road_closure,modified_text,location
19213,2016-11-29 23:59:23+00:00,NEW Crash in Duval on I-295 E north beyond Ph...,official,fl511_northeast,NEW Crash in Duval on I-295 E north beyond Ph...,1,At New Crash At Duval At Interstate 295 East ...,"{New Crash At Duval, Right Lane, Interstate 2..."
19214,2016-11-29 23:59:15+00:00,CLEARED Traffic congestion in Duval on I-95 n...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-95 n...,1,At Cleared Traffic Congestion At Duval At Int...,"{Interstate 10, Interstate 95 North}"
19215,2016-11-29 23:54:26+00:00,CLEARED Traffic congestion in Duval on I-295 ...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 ...,1,At Cleared Traffic Congestion At Duval At Int...,"{San Jose, Exit 5, Interstate 295 West North}"
19216,2016-11-29 23:49:10+00:00,UPDATE Traffic congestion in Duval on I-95 no...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-95 no...,1,At Update Traffic Congestion At Duval At Inte...,"{Interstate 10, Interstate 95 North}"
19217,2016-11-29 23:49:10+00:00,CLEARED Traffic congestion in Duval on I-295 ...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 ...,1,At Cleared Traffic Congestion At Duval At Int...,"{Exit 53, Exit 56, Interstate 295 East South}"


In [14]:
test_df.to_csv("../data/Loc_Extracted/tweet_locations_sample_07292019")

In [15]:
pd.options.display.max_colwidth = 200
test_df[test_df['tweet'].str.contains('I-95') == True]['text']

19214       CLEARED  Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10 
19216        UPDATE  Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10 
19218           CLEARED  Traffic congestion in Duval on I-95 south from Exit 344 Butler to at Exit 341 Baymeadows 
19222                                    CLEARED  Traffic congestion in Duval on I-95 south ramp to Exit 351 I-10 
19223                 CLEARED  Traffic congestion in Duval on SR-202   Butler Blvd west from Southside to at I-95 
19224              CLEARED  Traffic congestion in Duval on I-295 W north from Exit 61 I-95 to at Exit 3 St Aug Rd 
19225         CLEARED  Traffic congestion in Duval on I-95 south from Exit 347 Emerson to ramp to Exit 344 Butler 
19226                                 CLEARED  Crash in Duval on I-95 south at Exit 344 Butler, left lane blocked 
19228                                  UPDATE  Crash in Duval on I-95 south at E

In [16]:
test_df.loc[19215][['location', 'modified_text']]

location                                                                    {San Jose, Exit 5, Interstate 295 West North}
modified_text    At Cleared  Traffic Congestion At Duval At Interstate 295 West North From Exit 5 San Jose To At Buckman 
Name: 19215, dtype: object

In [40]:
i95 = pd.read_csv('../df_csv')
i95['interstate'] = 'Interstate 95'
i95['exits'] = 'Exit ' + i95['exit']
i95.head()

Unnamed: 0,exit,crossSt,dir,lat,long,interstate,exits
0,1A,,S,25.75506,-80.202,Interstate 95,Exit 1A
1,1B,,N,25.76255,-80.1996,Interstate 95,Exit 1B
2,1B,,S,25.77588,-80.19986,Interstate 95,Exit 1B
3,2A,,N,25.77139,-80.19882,Interstate 95,Exit 2A
4,2B,,N,25.77339,-80.19866,Interstate 95,Exit 2B


In [18]:
type(test_df['location'][19217])

set

In [50]:
for item in i95.iterrows():
    if 'Exit 1B'

KeyError: 'interstate'

In [33]:
def exit_extractor (df, col, i_df):
    
    exits = []
    interstates = []
    
    for item in df[col]:
        if 'Interstate' in item:
            i_string = re.search(r'Interstate (\S+)', item)
            interstates.append(i_string.group(0))
            if 'Exit' in item:
                e_string = re.search(r'Exit (\S+)', item)
                exits.append(e_string.group(0))
            else:
                exits.append("None")
        else:
            interstates.append("None")
            exits.append("None")
    
    new_df = pd.DataFrame(data = interstates, columns = ['interstate'])
    new_df['exits'] = exits
    
    for row in new_df.iterows():
        if item[1]['interstate'] != "None":
            if item[1]['interstate'] in i_df['interstate']:
            
            
    return new_df
    

In [34]:
exit_extractor(test_df, 'modified_text')

Unnamed: 0,interstate,exits
0,Interstate 295,
1,Interstate 95,Exit 351
2,Interstate 295,Exit 5
3,Interstate 95,Exit 351
4,Interstate 295,Exit 53
5,Interstate 95,Exit 344
6,Interstate 295,Exit 53
7,Interstate 295,
8,Interstate 295,Exit 5
9,Interstate 95,Exit 351
