# Location Extraction and Spacy Word Vectorization

In [135]:
import pandas as pd
import numpy as np
import re
import spacy
import string
import datetime

In [4]:
# Read in csv with Tweets
twitter_closures = pd.read_csv("../data/Cleaned_Tweets/cleaned_historic_official_07252019.csv")

twitter_closures = twitter_closures[['date', 'text', 'type', 'username', 'tweet', 'road_closure']]

# Print DF shape
print(twitter_closures.shape)

# Show head 
twitter_closures.head()

(43016, 6)


Unnamed: 0,date,text,type,username,tweet,road_closure
0,2016-11-29 14:33:36+00:00,Douglas County Crash - EB I-20 b4 Post Rd exi...,official,GDOTATL,Douglas County Crash - EB I-20 b4 Post Rd exi...,0
1,2016-11-28 20:35:05+00:00,With rain on the way - we have an important me...,official,GDOTATL,With rain on the way - we have an important me...,0
2,2016-11-28 19:02:02+00:00,Much needed rain is headed our way please b...,official,GDOTATL,Much needed rain is headed our way please b...,0
3,2016-11-24 19:01:16+00:00,Happy Thanksgiving Please drive safe and pati...,official,GDOTATL,Happy Thanksgiving Please drive safe and pati...,0
4,2016-11-24 01:00:30+00:00,Have a safe Thanksgiving Put away your cell p...,official,GDOTATL,Have a safe Thanksgiving Put away your cell p...,0


## SpaCy Preprocessing

In [5]:
# Create new columns to transfer modified tweet text. Five versions of tweets will be created.
twitter_closures['modified_text'] = ''
twitter_closures['location'] = ''

# Show modified DF
twitter_closures.head(2)

Unnamed: 0,date,text,type,username,tweet,road_closure,modified_text,location
0,2016-11-29 14:33:36+00:00,Douglas County Crash - EB I-20 b4 Post Rd exi...,official,GDOTATL,Douglas County Crash - EB I-20 b4 Post Rd exi...,0,,
1,2016-11-28 20:35:05+00:00,With rain on the way - we have an important me...,official,GDOTATL,With rain on the way - we have an important me...,0,,


In [6]:
format_dict = {"hwy": "Highway ",
            "Blvd": "Boulevard",
            " st": "street",
           "CR ": "Country Road ",
           "SR ": "State Road",
           "I-": "Interstate ",
           "EB ": "Eastbound ",
           "WB ": "Westbound ",
           "SB ": "Southbound",
           "NB ": "Northbound",
           " on ": " at ",
           " E ": " East ",
           " W ": " West ",
           " S ": " South",
           " N ": " North",
           "mi ": "mile ",
           "between ": "at ",
           "Between ": "at ",
           "In ": "in ",
           " in ": " at "}

In [7]:
def spacy_cleaner(df, col, word_dict):
    modified_text = "At " + df[col].replace(word_dict, regex=True)
    modified_text = modified_text.str.title()
    return modified_text

In [8]:
# run the text cleaning function and test results
twitter_closures['modified_text'] = spacy_cleaner(twitter_closures, 'text', format_dict)

In [9]:
twitter_closures['username'].unique()

array(['GDOTATL', 'SCDOTMidlands', 'SCDOTPeeDee', 'SCDOTLowCountry',
       'SCDOTPiedmont', '511statewideva', 'fl511_panhandl', '511Georgia',
       'fl511_state', 'fl511_northeast', 'fl511_southeast',
       'fl511_southwest', 'fl511_tampabay', 'fl511_central',
       '511centralva', '511hamptonroads', '511northernva',
       'NCDOT_Westmtn', 'NCDOT_Triangle', 'NCDOT_Piedmont',
       'NCDOT_Charlotte', 'NCDOT_Asheville', 'NCDOT_Scoast',
       'NCDOT_Ncoast'], dtype=object)

In [10]:
twitter_closures['date'] = pd.to_datetime(twitter_closures['date'])

In [80]:
test_df = twitter_closures[(twitter_closures['road_closure'] == 1) & (twitter_closures['username'] == 'fl511_northeast')].head(100)

In [81]:
def get_loc(df, text_column, location_column):
    
    # Use Spacy to extract location names from `text` column
    for i in range(len(df)):
        
        #instantiate spacy model
        nlp = spacy.load("en_core_web_sm")
        
        # create documewnt from modified text column
        doc = nlp(df[text_column].iloc[i])
        
        locations = set()
        
        # loop through every entity in the doc
        for ent in doc.ents:
            
            # find entities labelled as places
            if (ent.label_=='GPE') or (ent.label_=='FAC') or (ent.label_ == 'LOC'):
                
                # put locations in a set
                locations.add(ent.text)
                df[location_column].iloc[i] = locations
                
    return df[location_column]

In [82]:
test = get_loc(test_df, 'modified_text', 'location')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [83]:
test_df['location'] = test
test_df.head()

Unnamed: 0,date,text,type,username,tweet,road_closure,modified_text,location
19213,2016-11-29 23:59:23+00:00,"NEW Crash in Duval on I-295 E north beyond Philips Hwy, right lane blocked",official,fl511_northeast,"NEW Crash in Duval on I-295 E north beyond Philips Hwy, right lane blocked",1,"At New Crash At Duval At Interstate 295 East North Beyond Philips Hwy, Right Lane Blocked","{Interstate 295 East North, New Crash At Duval, Right Lane}"
19214,2016-11-29 23:59:15+00:00,CLEARED Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10,1,At Cleared Traffic Congestion At Duval At Interstate 95 North From Before Fuller Warren To Ramp To Exit 351 Interstate 10,"{Interstate 10, Interstate 95 North}"
19215,2016-11-29 23:54:26+00:00,CLEARED Traffic congestion in Duval on I-295 W north from Exit 5 San Jose to at Buckman,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 W north from Exit 5 San Jose to at Buckman,1,At Cleared Traffic Congestion At Duval At Interstate 295 West North From Exit 5 San Jose To At Buckman,"{Interstate 295 West North, San Jose, Exit 5}"
19216,2016-11-29 23:49:10+00:00,UPDATE Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10,1,At Update Traffic Congestion At Duval At Interstate 95 North From Before Fuller Warren To Ramp To Exit 351 Interstate 10,"{Interstate 10, Interstate 95 North}"
19217,2016-11-29 23:49:10+00:00,CLEARED Traffic congestion in Duval on I-295 E south from Exit 53 Butler to at Exit 56 Baymeadows,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 E south from Exit 53 Butler to at Exit 56 Baymeadows,1,At Cleared Traffic Congestion At Duval At Interstate 295 East South From Exit 53 Butler To At Exit 56 Baymeadows,"{Exit 56, Exit 53, Interstate 295 East South}"


In [86]:
test_df.to_csv("../data/Loc_Extracted/tweet_locations_sample_07292019.csv", index = False)

In [87]:
test_df = pd.read_csv("../data/Loc_Extracted/tweet_locations_sample_07292019.csv")

In [88]:
pd.options.display.max_colwidth = 200
test_df[test_df['tweet'].str.contains('I-95') == True]['text']

1        CLEARED  Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10 
3         UPDATE  Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10 
5            CLEARED  Traffic congestion in Duval on I-95 south from Exit 344 Butler to at Exit 341 Baymeadows 
9                                     CLEARED  Traffic congestion in Duval on I-95 south ramp to Exit 351 I-10 
10                 CLEARED  Traffic congestion in Duval on SR-202   Butler Blvd west from Southside to at I-95 
11              CLEARED  Traffic congestion in Duval on I-295 W north from Exit 61 I-95 to at Exit 3 St Aug Rd 
12         CLEARED  Traffic congestion in Duval on I-95 south from Exit 347 Emerson to ramp to Exit 344 Butler 
13                                 CLEARED  Crash in Duval on I-95 south at Exit 344 Butler, left lane blocked 
15                                  UPDATE  Crash in Duval on I-95 south at Exit 344 Butler, left lane b

In [89]:
i95 = pd.read_csv('../df_csv')
i95['interstate'] = 'Interstate 95'
i95['exits'] = 'Exit ' + i95['exit']
i95.head()

Unnamed: 0,exit,crossSt,dir,lat,long,interstate,exits
0,1A,,S,25.75506,-80.202,Interstate 95,Exit 1A
1,1B,,N,25.76255,-80.1996,Interstate 95,Exit 1B
2,1B,,S,25.77588,-80.19986,Interstate 95,Exit 1B
3,2A,,N,25.77139,-80.19882,Interstate 95,Exit 2A
4,2B,,N,25.77339,-80.19866,Interstate 95,Exit 2B


In [37]:
for item in i95.iterrows():
    if item[1]['exits'] != "None":
        print(item[1]['exit'])
        if "Exit 337" in item[1]['exits']:
            print(str(item[1]['lat']) + " " + str(item[1]['long']))

1A
1B
1B
2A
2B
2C
2D
3A
3B
2D
4A
4B
4
6A
6A
6B
7
7
8A
8A
8B
8B
9
10A
10A
10B
10B
11
12A
12B
12C
12
14
14
16
16
18
18
19
19
20
20
21
21
22
22
23
23
24
25
25
26
26
27
27
29A
29B
29
31A
31B
31
32
32
33A
33B
33
36
36
38
38A
38B
39
39
41
41
42A
42B
42
44
44
45
45
48A
48B
48
50
50
51
51
52
52
56
56
57
57
59
59
60
60
61
61
63
63
64
64
66
66
68
68
69
69A
69B
70
70
71
71
74
74
76
76
77
77
79A
79B
79
79C
83
83
87A
87A
87B
87B
96
96
101
101
102
102
110
110
114
114
118
118
120
120
121
121
126
126
129
129
131
131
138
138
147
147
156
156
173
173
176
176
180
180
183
183
188
188
191
191
193
193
195
195
201
201
202
202
205A
205A
205B
205B
208
208
212
212
215
215
220
220
223
223
231
231
244
244
249
249A
249B
256
256
260A
260A
260B
260B
261
265
265
268
268
273
273
278
278
284
284
289
289
293
293
298
298
305
305
311
311
318
318
323
323
329
329
333
333
335
335
337
30.15859 -81.55057
337
30.17594 -81.55901999999999
339
339
340
341
341
344
344
345
346A
346B
347
347
348
351A
351B
350B
350A
352A
352B
352C
351C

In [149]:
def exit_extractor (df, col, i_df):
    
    exits = []
    interstates = []
    lat = []
    long = []
    direction = []
    
    for item in df[col]:
        if 'Interstate' in item:
            
            i_string = re.search(r'Interstate (\S+)', item)
            interstates.append(i_string.group(0))
            
            if 'Exit' in item:
                e_string = re.search(r'Exit (\S+)', item)
                exits.append(e_string.group(0))
            else:
                exits.append("None")
        else:
            interstates.append("None")
            exits.append("None")
    
    new_df = pd.DataFrame(data = interstates, columns = ['interstate'])
    new_df['exits'] = exits
    
    for row in new_df.iterrows():
        
        if row[1]['interstate'] != "None":

            if (row[1]['interstate'] in i95['interstate'].unique()) and (row[1]['exits'] != "None"):
                
                lat.append(i_df[i_df['exits'].str.contains(row[1]['exits'])]['lat'].iloc[0])
                long.append(i_df[i_df['exits'].str.contains(row[1]['exits'])]['long'].iloc[0])
                      
            else:
                lat.append(np.nan)
                long.append(np.nan)
    
    new_df['lat'] = lat
    new_df['long'] = long
    
    return new_df

In [150]:
exit_extractor(test_df, 'modified_text', i95)

Unnamed: 0,interstate,exits,lat,long
0,Interstate 295,,,
1,Interstate 95,Exit 351,30.31568,-81.67278
2,Interstate 295,Exit 5,,
3,Interstate 95,Exit 351,30.31568,-81.67278
4,Interstate 295,Exit 53,,
5,Interstate 95,Exit 344,30.24393,-81.58858
6,Interstate 295,Exit 53,,
7,Interstate 295,,,
8,Interstate 295,Exit 5,,
9,Interstate 95,Exit 351,30.31568,-81.67278


In [151]:
exits = exit_extractor(test_df, 'modified_text', i95)
final_df = pd.concat([test_df, exits], axis = 1)
final_df.head()

Unnamed: 0,date,text,type,username,tweet,road_closure,modified_text,location,interstate,exits,lat,long
0,2016-11-29 23:59:23+00:00,"NEW Crash in Duval on I-295 E north beyond Philips Hwy, right lane blocked",official,fl511_northeast,"NEW Crash in Duval on I-295 E north beyond Philips Hwy, right lane blocked",1,"At New Crash At Duval At Interstate 295 East North Beyond Philips Hwy, Right Lane Blocked","{'Interstate 295 East North', 'New Crash At Duval', 'Right Lane'}",Interstate 295,,,
1,2016-11-29 23:59:15+00:00,CLEARED Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10,1,At Cleared Traffic Congestion At Duval At Interstate 95 North From Before Fuller Warren To Ramp To Exit 351 Interstate 10,"{'Interstate 10', 'Interstate 95 North'}",Interstate 95,Exit 351,30.31568,-81.67278
2,2016-11-29 23:54:26+00:00,CLEARED Traffic congestion in Duval on I-295 W north from Exit 5 San Jose to at Buckman,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 W north from Exit 5 San Jose to at Buckman,1,At Cleared Traffic Congestion At Duval At Interstate 295 West North From Exit 5 San Jose To At Buckman,"{'Interstate 295 West North', 'San Jose', 'Exit 5'}",Interstate 295,Exit 5,,
3,2016-11-29 23:49:10+00:00,UPDATE Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10,1,At Update Traffic Congestion At Duval At Interstate 95 North From Before Fuller Warren To Ramp To Exit 351 Interstate 10,"{'Interstate 10', 'Interstate 95 North'}",Interstate 95,Exit 351,30.31568,-81.67278
4,2016-11-29 23:49:10+00:00,CLEARED Traffic congestion in Duval on I-295 E south from Exit 53 Butler to at Exit 56 Baymeadows,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 E south from Exit 53 Butler to at Exit 56 Baymeadows,1,At Cleared Traffic Congestion At Duval At Interstate 295 East South From Exit 53 Butler To At Exit 56 Baymeadows,"{'Exit 56', 'Exit 53', 'Interstate 295 East South'}",Interstate 295,Exit 53,,


In [98]:
i95.head()

Unnamed: 0,exit,crossSt,dir,lat,long,interstate,exits
0,1A,,S,25.75506,-80.202,Interstate 95,Exit 1A
1,1B,,N,25.76255,-80.1996,Interstate 95,Exit 1B
2,1B,,S,25.77588,-80.19986,Interstate 95,Exit 1B
3,2A,,N,25.77139,-80.19882,Interstate 95,Exit 2A
4,2B,,N,25.77339,-80.19866,Interstate 95,Exit 2B


In [119]:
i95[i95['exits'].str.contains('Exit 351')].iloc[0]

exit                   351A
crossSt             Park St
dir                       N
lat                30.31568
long               -81.6728
interstate    Interstate 95
exits             Exit 351A
Name: 253, dtype: object

In [129]:
mylist = []
for row in final_df.iterrows():     
    if row[1]['interstate'] != "None":
            
        if (row[1]['interstate'] in i95['interstate'].unique()) and (row[1]['exits'] != "None"): 
            print(f"{row[1]['exits']} Lat: {i95[i95['exits'].str.contains(row[1]['exits'])]['lat'].iloc[0]}")
            mylist.append(row[1]['exits'])
        else:
            print("No Exit Found")
            mylist.append("None")

No Exit Found
Exit 351 Lat: 30.31568
No Exit Found
Exit 351 Lat: 30.31568
No Exit Found
Exit 344 Lat: 30.24393
No Exit Found
No Exit Found
No Exit Found
Exit 351 Lat: 30.31568
No Exit Found
No Exit Found
Exit 347 Lat: 30.288710000000002
Exit 344 Lat: 30.24393
No Exit Found
Exit 344 Lat: 30.24393
Exit 344 Lat: 30.24393
No Exit Found
No Exit Found
No Exit Found
Exit 337 Lat: 30.15859
No Exit Found
No Exit Found
Exit 347 Lat: 30.288710000000002
No Exit Found
Exit 347 Lat: 30.288710000000002
Exit 346 Lat: 30.27203
No Exit Found
Exit 351 Lat: 30.31568
Exit 337 Lat: 30.15859
No Exit Found
Exit 346 Lat: 30.27203
No Exit Found
No Exit Found
No Exit Found
Exit 344 Lat: 30.24393
No Exit Found
No Exit Found
No Exit Found
No Exit Found
No Exit Found
Exit 337 Lat: 30.15859
Exit 337 Lat: 30.15859
No Exit Found
Exit 353 Lat: 30.33461
No Exit Found
No Exit Found
No Exit Found
No Exit Found
Exit 347 Lat: 30.288710000000002
No Exit Found
Exit 351 Lat: 30.31568
No Exit Found
Exit 353 Lat: 30.33461
No Exi

In [131]:
len(mylist)

100

In [132]:
len(final_df)

100