/
process-helbock.R
514 lines (453 loc) · 32.7 KB
/
process-helbock.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
# This is code to take Richard Helbock's dataset of U.S. Post offices and match them to geographic coordinates of places listed in the USGS Board on Geographic Names
# Cameron Blevins (2021)
library(tidyverse)
library(Hmisc)
library(stringdist)
library(fuzzyjoin)
library(data.table)
library(tictoc)
###Read in, clean up Helbock's full database from .mdb file #####
helbock_data_original<- mdb.get("data/Complete_USPO.mdb")
helbock_data<-helbock_data_original
helbock_data<- helbock_data %>%
mutate_all(as.character) %>% #convert all columns to string
mutate_all(trimws) %>% #trim extra whitespace
mutate(ID=as.numeric(ID)) %>% #convert ID column into number
rename(Name=POST.OFFICE, State=STATE, Established=Estab, Continuous=to, Discontinued=Disc, StampIndex=S.I) %>%
mutate(AltName="", OrigName=Name, OrigCounty=County) %>% #creating empty column for altnerative spellings of names
select(Name, AltName, OrigName, everything())
#Getting rid of parentheses and other characters from names to make it easier to match
helbock_data<-helbock_data %>%
mutate(Discontinued=ifelse(Discontinued=="Date", NA, Discontinued)) %>% #replacing "Date" values with NA values
mutate(Discontinued=ifelse(Discontinued=="", NA, Discontinued)) %>% #replacing blank values with NA values
mutate(Established=ifelse(Established=="", NA, Established)) %>% #replacing blank values with NA values
mutate(Continuous=ifelse(Continuous=="-", TRUE, FALSE)) %>% #whether a post office was continuous or not - changing values to T/F
mutate(Name=str_replace(string = Name, pattern=" \\([0-9]\\)", replacement="")) %>% #find pattern of space, parentheses, digit ex. " (1)")
mutate(Name=str_replace(string = Name, pattern=" \\([CH. ]+\\)", replacement="")) %>% #get rid of all with a C, H, whitetpsace, and period in between parentheses
mutate(Name=str_replace(string = Name, pattern=" \\(C\\. H\\.\\)", replacement="")) %>% #get rid of all names with "(C. H.)" in them
mutate(Name=str_replace(string = Name, pattern=" C\\.H\\.", replacement="")) %>% #get rid of all names with "C.H." in them
mutate(Name=str_replace(string = Name, pattern=" \\CH\\)", replacement="")) %>% #get rid of all names with "CH)" in them
mutate(Name=str_replace(string = Name, pattern=" C\\. H\\.\\)", replacement="")) %>%
mutate(Name=str_replace(string = Name, pattern=" C\\. H\\.", replacement="")) %>%
mutate(Name=toupper(Name)) %>%
mutate(AltName=ifelse(str_detect(Name, pattern="\\("), Name, AltName)) %>% #copy contents of Name into Alternate Name column if Name has a parentheses in it
mutate(AltName=str_replace(AltName, pattern="CENTER\\(RE\\)", replacement="CENTRE")) %>%
mutate(Name=str_replace(Name, pattern="CENTER\\(RE\\)", replacement="CENTER")) %>%
mutate(AltName=str_replace(AltName, pattern="CENTRE\\(ER\\)", replacement="CENTER")) %>%
mutate(Name=str_replace(Name, pattern="CENTRE\\(ER\\)", replacement="CENTRE")) %>%
mutate(Name=str_replace_all(string= Name, pattern="\\(.*?\\)", replacement="")) %>% #delete anything inside parentheses in original name
mutate(AltName=str_replace(string=AltName, pattern="\\b[A-Z]+\\b \\(", replacement="")) %>%
mutate(AltName=str_replace_all(string=AltName, pattern="[()]", replacement="")) %>%
mutate(AltName=ifelse(str_detect(Name, pattern="^MC ") & AltName=="", str_replace_all(string=Name, pattern="^MC ", replacement="MC"), AltName)) %>% #if a Name starst with "MC C___" then add alternate name of "MCC___"
mutate(AltName=ifelse(str_detect(Name, pattern=" COURT HOUSE$| COURTHOUSE$| TOWNSHIP$| STATION$") & AltName=="", str_replace_all(string=Name, pattern=" COURT HOUSE$| COURTHOUSE$| TOWNSHIP$| STATION$", replacement=""), AltName)) %>% #if a Name contains "Courthoue" then add alternate name of without the courthoue"
mutate(AltName=ifelse(str_detect(Name, pattern="'") & AltName=="", str_replace_all(string=Name, pattern="'", replacement=""), AltName)) %>% #if a contains an apostrphe, remove it and add to alternative name
mutate(AltName=ifelse(AltName=="", NA, AltName)) %>%
mutate(AltName=toupper(AltName)) %>%
mutate(Established=str_replace(string=Established, pattern="[-``]", replacement="")) %>% #get rid of negative signs
mutate(Discontinued=str_replace(string=Discontinued, pattern="[-``]", replacement="")) %>% #get rid of negative signs
mutate(County=str_replace_all(string=County, pattern="[-]", replacement="")) %>%
mutate(County=toupper(County)) %>%
separate(County, c("County1", "County2", "County3"), sep="[/\\\\]") %>% #split any county names that have a slash or blackslash in them
mutate(County2 = if_else(County1=="CHRISTIAN" & State == "IL", "CLARK", "")) %>% #Helbock mislabeled Clark county as Christian county a bunch, so adding Clark as second county to check
mutate(County2 = if_else(County1=="NORFOLK" & State == "VA" & County2=="", "CHESAPEAKE", County2)) %>% #confusing county history - momst of Chesapeake used to be Norfolk, so adding it as alternative county
mutate_all(trimws) %>% #get rid of any extra whitespace
mutate(ID=as.integer(ID)) %>% #change ID field to an integer
mutate(Continuous=as.logical(Continuous)) #T/F if the post office was continually operational
rm(helbock_data_original)
######### Processing GNIS file ############
#read in GNIS download of features - this is a list of unique features, each with a unique ID
## NOTE: this is only a sample file used for Github because the full file is too large. Replace this file with the full version: https://dataverse.harvard.edu/api/access/datafile/4491883
gnis_full<-read_delim("data/NationalFile_20210101.txt", delim = "|",col_names = TRUE, escape_double = FALSE, trim_ws = TRUE, progress=TRUE)
gnis_full<-as.data.frame(gnis_full)
gnis_full<- gnis_full %>%
rename(OldName=FEATURE_NAME, Feature.Class=FEATURE_CLASS, State=STATE_ALPHA, County=COUNTY_NAME, Latitude=PRIM_LAT_DEC, Longitude=PRIM_LONG_DEC)
#Taking in the full name list (includes ALTERNATE names for features) - this is a list of all names, including altnerate names for the same feature
#then joining it with the GNIS data (lat, long, etc.) to create a big dataframe, with repeated FEATURE ID's in them)
## NOTE: this is only a sample file used for Github because the full file is too large. Replace this file with the full version: https://dataverse.harvard.edu/api/access/datafile/4491882
all_gnisnames_original<-read_delim(file="data/AllNames_20210101_nocitation.txt", delim = "|", col_names = TRUE, escape_double = FALSE, trim_ws = TRUE, progress=TRUE) #escape_double is to handle quote marks within names (othewrwise gets messed up)
all_gnisnames_original<-as.data.frame(all_gnisnames_original)
#only select first two columns of data, clean up, and join it to the main GNIS dataframe
all_gnisnames<-all_gnisnames_original %>%
select(FEATURE_ID:FEATURE_NAME) %>%
rename(Name=FEATURE_NAME) %>%
left_join(gnis_full, by="FEATURE_ID")
rm(all_gnisnames_original)
rm(gnis_full)
#feature types you want to try to match
features<-c("Post Office", "Populated Place", "Locale", "Mine", "Cemetery", "School", "Military", "Cape", "Civil", "Church", "Census")
#take a subset of all the GNIS features that are part of features list - ex. post offices, etc. and process them so they're ready for matching
gnis<- all_gnisnames %>%
filter(Feature.Class %in% features) %>%
mutate(OrigName=Name) %>%
mutate(Name=toupper(Name)) %>%
mutate(OrigCounty=County) %>%
mutate(County=toupper(str_replace_all(string=County, pattern="\\(.*?\\)", replacement=""))) %>%
select(-c(STATE_NUMERIC, COUNTY_NUMERIC)) %>%
filter(PRIMARY_LAT_DMS != "Unknown" | Latitude != 0) #remove ones that don't have coordinates attached to them
rm(all_gnisnames)
#cleaning up the Name field of the GNIS database
#list of words to delete from the Name field of GNIS database
delete_words<-c(" ELEMENTARY SCHOOL", " MIDDLE SCHOOL", " HIGH SCHOOL", " POST OFFICE", " CEMETERY", " SCHOOL", "^TOWN OF ", " CENSUS DESIGNATED PLACE", " \\(HISTORICAL\\)")
for(w in delete_words){
print(w);
gnis<- gnis %>%
mutate(Name=str_replace_all(string=Name, pattern=w, replacement=""))
}
gnis<- gnis %>%
mutate(Name=ifelse(str_detect(string=Name, pattern="^TOWNSHIP"), yes = Name, no= str_replace_all(string=Name, pattern="TOWNSHIP", replacement=""))) %>%
mutate_all(trimws)
#only select a subset of columns
gnis<- gnis %>% #filter(Name!="" & PRIMARY_LAT_DMS != "Unknown") %>%
select(FEATURE_ID:County, OrigName, OrigCounty, Latitude, Longitude, ELEV_IN_M, ELEV_IN_FT)
#### Cleaning Helbock by comparing to GNIS data to look for typos, etc.
#flag problems - when years aren't 4 characters long, when length of state is more than 2 characters
helbock_data_problems<-helbock_data %>%
mutate(Problem = FALSE) %>%
mutate(Problem = case_when(str_length(Established) != 4 | (as.numeric(Established)<1639 | as.numeric(Established)>2000) ~ TRUE, TRUE ~ Problem)) %>%
mutate(Problem = case_when((str_length(Discontinued)!= 4 & !is.na(Discontinued)) ~ TRUE, TRUE ~ Problem)) %>%
mutate(Problem = case_when(str_length(State) > 2 ~ TRUE, TRUE ~ Problem)) %>%
filter(Problem==TRUE)
#export CSV of full Helbock data with problems flagged
write.csv(helbock_data_problems, file = "analytics/Helbock_Data_Problems.csv", row.names = FALSE)
#looking for counties that don't line up between the two datasets
helbock_counties<-helbock_data %>%
group_by(State, County1) %>%
summarise(n=n())
gnis_counties<-gnis %>%
group_by(State, County) %>%
summarise(n=n())
problem_counties <- helbock_counties %>%
full_join(gnis_counties, by=c("State", "County1" = "County")) %>%
filter(is.na(n.x) | is.na(n.y)) #look at counties from one dataset that don't show up in the other
write.csv(problem_counties, "analytics/problem_counties.csv")
#changing names of counties - using a CSV file where I manually started going through and fixing mistakes
counties_change<-read.csv("analytics/problem_counties_replacements.csv", header = TRUE, stringsAsFactors = FALSE)
counties_change <- counties_change %>%
filter(ToDo=="no") %>% #look only at the counties that I've manually found replacement values for - ToDo = yes are ones haven't done yet
select(State, County1, County1_Replacement, County2_Replacement)
#perform joins to lookup and change problem counties in the helbock dataframe
helbock_data<-helbock_data %>%
left_join(counties_change, by = c("State", "County1")) %>% #temporarily add the replacement county values as new columns (if you don't have a match you'll have NA values)
mutate(County1=ifelse(!is.na(County1_Replacement), yes=County1_Replacement, no=County1)) %>% #replace county values
mutate(County1 = replace(County1, str_detect(County1, " ANA"), (gnis %>% filter(str_detect(County, " ANA")))[1,6])) %>% #replacing n character for Dona Ana so it matches GNIS character
mutate(County2=ifelse(!is.na(County2_Replacement), yes=County2_Replacement, no=County2)) %>% #replace county values
select(-c(County1_Replacement, County2_Replacement)) %>%
mutate(County2=ifelse(County2=="", yes=NA, no=County2)) #replace empty values in Column 2 with NA's
####### MATCHING ROUNDS #########
##initializing the dataframes
gnis_tomatch<-gnis #dataframe of GNIS all features you're going to match
po_tomatch<-helbock_data #dataframe of Helbock's data that you're going to try and find matches for in GNIS data
po_matched<-po_tomatch[0,] #empty dataframe that you're going to fill with matched values
#####MATCHING PHASE I: Look up Name, County, State in GNIS across multiple features #####
#taking just a subset of data for testing purposes - comment out later for full matching
#gnis_tomatch <- gnis %>%
# filter((State=="CO" | State=="AZ"))
# po_tomatch<-helbock_data %>%
# filter(State=="AK" | State=="AZ") %>%
# sample_n(120)
#po_tomatch<-helbock_data %>% filter(!(ID %in% po_matched$ID)) #in case you are starting with a subset
#function to match any individual name, county, and state combination, then add results to external dataframe and winnow down the dataframe you want to look up
#send in three columns: df1 = Post OFfices you want to match, df2 is GNIS dataframe, df3 is a copy of Post Offices you want to match that you'll update
matchfun<-function(df1, df2, df3){
df1 <- df1 %>%
rename(Name=1, County=2, State=3) %>% #rename columns
filter(!(ID %in% po_matched$ID)) %>% #if the Post Office has not already been matched
left_join(df2, by=c("Name", "County", "State")) %>% #join post office dataframe to GNIS dataframe based on full name, county and state matches
mutate(Match=ifelse(is.na(FEATURE_ID), FALSE, TRUE)) %>% #add a column Match to flag ones you've found
select(-c(OldName)) #get rid of original GNIS name (used for reference, don't need to attach it to post office data)
names(df1) <- paste0("GNIS.", names(df1)) #add a prefix to all the column names
temp_matched <- df3 %>% #join to the other Post Office dataframe to createa temporary dataframe of matched values
left_join(df1, by=c("ID"="GNIS.ID")) %>%
filter(GNIS.Match ==TRUE)
po_matched<<- po_matched %>% ## double carrot (<<) changes GLOBAL variable of po_matched outside of this function - adding new rows to dataframe of running matches
bind_rows(filter(temp_matched, !(ID %in% po_matched$ID))) %>%#add matches to the matched dataframe so long as the post office isn't already in there
#next few lines gets rid of duplicates matches by only taking the first one you matched (I think)
group_by(ID) %>%
filter(row_number()==1) %>%
ungroup()
po_tomatch<<- po_tomatch %>% #winnow down the running dataframes that you still have to match
filter(!(ID %in% po_matched$ID))
}
###function to match post offices that don't have a county, just a name and a state (mostly Alaska)
matchfun_bynamestate<-function(df1, df2, df3){
df1 <- df1 %>%
rename(Name=1, State=2) %>% #rename columns
filter(!(ID %in% po_matched$ID)) %>% #if the Post Office has not already been matched
left_join(df2, by=c("Name", "State")) %>% #join post office dataframe to GNIS dataframe based on full name, county and state matches
mutate(Match=ifelse(is.na(FEATURE_ID), FALSE, TRUE)) %>% #add a column Match to flag ones you've found
select(-c(OldName)) #get rid of original GNIS name (used for reference, don't need to attach it to post office data)
names(df1) <- paste0("GNIS.", names(df1)) #add a prefix to all the column names
temp_matched <- df3 %>% #join to the other Post Office dataframe to create a temporary dataframe of matched values
left_join(df1, by=c("ID"="GNIS.ID")) %>%
filter(GNIS.Match ==TRUE)
po_matched<<- po_matched %>% ## double carrot (<<) changes GLOBAL variable - adding new rows to dataframe of running matches
bind_rows(filter(temp_matched, !(ID %in% po_matched$ID))) %>%#add matches to the matched dataframe so long as the post office isn't already in there
#next few lines gets rid of duplicates matches by only taking the first one you matched (I think)
group_by(ID) %>%
filter(row_number()==1) %>%
ungroup()
po_tomatch<<- po_tomatch %>% #winnow down the running dataframes that you still have to match
filter(!(ID %in% po_matched$ID))
}
#going to go through a list of GNIS features, sequentially, trying to first match post offices, then populated places, etc. Matching various combinations of names and counties from Helbock's database.
tic()
for(f in features){
print(f);
gnis_tomatch<-filter(gnis, Feature.Class==f);
#po_tomatch <-filter(po_tomatch, !(ID %in% po_matched$ID)); #use if you don't want to try to match over the full list of post offices, but only unmatched ones
matchfun(select(po_tomatch, Name, County1, State, ID), gnis_tomatch, po_tomatch); #try to match on main name and county
#Alaksa post offices don't have a county, sending to separate function
justnamestate<-po_tomatch %>% filter(State=="AK") %>% select(Name, State, ID);
matchfun_bynamestate(justnamestate, gnis_tomatch, po_tomatch);
justnamestate_altname<-po_tomatch %>% filter(!is.na(AltName) & State=="AK") %>% select(AltName, State, ID);
matchfun_bynamestate(justnamestate, gnis_tomatch, po_tomatch);
#looking at post offices that didn't get a match on their name, but have an alternative name to check
othercombos<-po_tomatch %>% filter(!is.na(AltName));
matchfun(select(othercombos, AltName, County1, State, ID), gnis_tomatch, po_tomatch);
othercombos<-po_tomatch %>% filter(!is.na(County2)); #trying to match on an alternative county
matchfun(select(othercombos, Name, County2, State, ID), gnis_tomatch, po_tomatch);
othercombos<-po_tomatch %>% filter(!is.na(AltName));
matchfun(select(othercombos, AltName, County2, State, ID), gnis_tomatch, po_tomatch);
}
#if you've found a match, it means the levenshtein distance score = 1 (a full match), so adding a column
po_matched<-po_matched %>%
mutate(GNIS.MatchScore=1) %>%
mutate(Continuous=as.logical(Continuous))
toc(log=TRUE) #end timer and store to log
endtime<-unlist(tic.log(format=TRUE)) #create character variable has the elapsed time
tic.clear()
tic.clearlog()
write(paste0("Matching data for Round 1 of geolocating Helbock data with GNIS data. Processed on ", Sys.time(), "\n",
"Time: ", endtime, "\n",
"Number matched: ", nrow(po_matched), "\n",
"Number remaining to match: ", nrow(po_tomatch), "\n",
"Percentage matched this round: ", nrow(po_matched)/(nrow(po_tomatch)+nrow(po_matched))*100),
paste0("analytics/round1_analytics_", Sys.Date(), ".txt"))
paste0("Percentage matched this round: ", nrow(po_matched)/(nrow(po_tomatch)+nrow(po_matched))*100, "%")
#using these to save data that you've already run so you don't have to re-run all the matching (cpu intensive)
po_matched_round1<-po_matched
po_tomatch_round1<-po_tomatch
##write to a file = ****** ONLY use this when you're doing the full dataset, otherwise will overwrite ****
write.csv(po_matched_round1, "analytics/round1.csv", row.names = F)
##### ROUND 2 #######
#now that we've tried doing more exact matches with GNIS features, we're going to modify specific GNIS feature names to cast a wider net
#- ex. looking for Populated Places but removing the "TOWN" from "___ TOWN" within only Populated Places
####initializing dataframes - use this if you're starting from scratch
po_matched_start<-po_matched_round1 #this creates a dataframe of just post offices that weren't matched at the start of Round 2
po_matched <- po_matched_round1 #this is the main dataframe of matched post offices that you're going to add to
po_matched_round2<-po_matched[0,] #initiatilizing empty dataframe of just post offices you match in this round
po_tomatch_round2 <- helbock_data %>% filter(!(ID %in% po_matched$ID)) #pulling in post offices you need to match
gnis_tomatch<-gnis #all GNIS features
#set of matching functions you run through for every post office, sending in two dataframes: a subset of the GNIS features and the remaining post offices to match
matchingcombos<-function(df_gnis, df_pos){
matchfun(select(df_pos, Name, County1, State, ID), df_gnis, df_pos); #try to match on main name and county
othercombos<-df_pos %>% filter(!is.na(AltName)); #if there's an alternative name
matchfun(select(othercombos, AltName, County1, State, ID), df_gnis, df_pos); #try to match on alt name and county1
othercombos<-df_pos %>% filter(!is.na(County2)); #if there's an alternative county
matchfun(select(df_pos, Name, County2, State, ID), df_gnis, df_pos); #try to match on main name and county2
othercombos<-df_pos %>% filter(!is.na(AltName) & !(is.na(County2))); #if there's an alternative name and an altenrative county
matchfun(select(df_pos, AltName, County2, State, ID), df_gnis, df_pos); #try to match on alt name and county2
}
tic()
#Populated Places
pattern_list<-" TOWN$"
gnis_tomatch<-gnis %>%
filter(Feature.Class == "Populated Place") %>%
filter(str_detect(Name, pattern_list)) %>% #looking only at features that end with " TOWN"
mutate(Name = str_replace(Name, pattern_list, "")) #deleting " TOWN"
po_tomatch_round2 <- helbock_data %>% filter(!(ID %in% po_matched$ID))
matchingcombos(gnis_tomatch, po_tomatch_round2)
#Civil
pattern_list<-"^TOWNSHIP OF |^CITY OF |^BOROUGH OF | COUNTY$| INDIAN RESERVATION$| RESERVATION$"
gnis_tomatch <- gnis %>%
filter(Feature.Class == "Civil") %>%
filter(str_detect(Name, pattern_list)) %>% #looking only at features that end with " COUNTY"
mutate(Name = str_replace_all(Name, pattern_list, "")) #deleting " COUNTY" "TOWNSHIP OF " ETC.
po_tomatch_round2 <- helbock_data %>% filter(!(ID %in% po_matched$ID))
matchingcombos(gnis_tomatch, po_tomatch_round2)
#Mines
pattern_list<-" MINE$"
gnis_tomatch <- gnis %>%
filter(Feature.Class == "Mine") %>%
filter(str_detect(Name, pattern_list)) %>% #looking only at features that end with " MINE"
mutate(Name = str_replace_all(Name, pattern_list, "")) #deleting " MINE"
po_tomatch_round2 <- helbock_data %>% filter(!(ID %in% po_matched$ID))
matchingcombos(gnis_tomatch, po_tomatch_round2)
#Churches
pattern_list<-" UNITED METHODIST| FIRST BAPTIST| BAPTIST| CATHOLIC| EPISCOPAL| METHODIST| PRESBYTERIAN| LUTHERAN| MENNONITE| GOSPEL| PENTECOSTAL| CHRISTIAN| CONGREGATIONAL| CHURCH$"
gnis_tomatch <- gnis %>%
filter(Feature.Class == "Church") %>%
filter(str_detect(Name, pattern_list)) %>%
mutate(Name = str_replace_all(Name, pattern_list, ""))
po_tomatch_round2 <- helbock_data %>% filter(!(ID %in% po_matched$ID))
matchingcombos(gnis_tomatch, po_tomatch_round2)
#Schools
pattern_list<-" CHRISTIAN ACADEMY$| MILITARY ACADEMY$| CHILDREN'S ACADEMY$| JUNIOR ACADEMY$| GIFTED ACADEMY$| ACADEMY$"
gnis_tomatch <- gnis %>%
filter(Feature.Class=="School") %>%
filter(str_detect(Name, pattern_list)) %>%
mutate(Name = str_replace_all(Name, pattern_list, "")) #deleting " ACADEMY" variants
po_tomatch_round2 <- helbock_data %>% filter(!(ID %in% po_matched$ID))
matchingcombos(gnis_tomatch, po_tomatch_round2)
#Locales
pattern_list<-" HISTORICAL SITE$| VILLAGE$| COUNTRY CLUB$| RECREATION CENTER$| SHOPPING CENTER$| COUNTRY CLUB$| INDUSTRIAL PARK$| MALL$| RANCH$| CAMPGROUND$| PICNIC GROUND$| LANDING$| RAILROAD STATION$| FARM$"
gnis_tomatch <- gnis %>%
filter(Feature.Class=="Locale") %>%
filter(str_detect(Name, pattern_list)) %>%
mutate(Name = str_replace_all(Name, pattern_list, ""))
po_tomatch_round2 <- helbock_data %>% filter(!(ID %in% po_matched$ID))
matchingcombos(gnis_tomatch, po_tomatch_round2)
#Capes
pattern_list<-"^POINT | POINT$"
gnis_tomatch <- gnis %>%
filter(Feature.Class == "Cape") %>%
filter(str_detect(Name, pattern_list)) %>% #looking only at features that end with " POINT"
mutate(Name = str_replace_all(Name, pattern_list, "")) #deleting " POINT"
po_tomatch_round2 <- helbock_data %>% filter(!(ID %in% po_matched$ID))
matchingcombos(gnis_tomatch, po_tomatch_round2)
#fill dataframe with just post offices that you've matched this round
po_matched_round2<-po_matched %>%
filter(!(ID %in% po_matched_start$ID))
#remaining post offices that weren't matched after round 2
po_tomatch_round2 <- helbock_data %>% filter(!(ID %in% po_matched$ID))
#rm(gnis_tomatch)
toc(log=TRUE) #end timer and store to log
endtime<-unlist(tic.log(format=TRUE)) #create character variable has the elapsed time
tic.clear()
tic.clearlog()
write(paste0("Matching data for Round 2 of geolocating Helbock data with GNIS data. Processed on ", Sys.time(), "\n",
"Time: ", endtime, "\n",
"Total number matched so far: ", nrow(po_matched), "\n",
"Total remaining to match: ", nrow(po_tomatch_round2), "\n",
"Number matched this round: ", nrow(po_matched_round2), "\n",
"Percentage matched this round: ", nrow(po_matched_round2)/(nrow(helbock_data))*100),
paste0("analytics/round2_analytics_", Sys.Date(), ".txt"))
paste0("Percentage matched this round: ", nrow(po_matched_round2)/(nrow(helbock_data)-po_matched_round1)*100, "%")
po_matched<-po_matched %>%
mutate(GNIS.MatchScore=1)
##write to a file = only use this when you're doing the full dataset, otherwise will overwrite ****
write.csv(po_matched_round2, "analytics/round2.csv", row.names = F)
#### MATCHING PHASE 3: Fuzzy matching for Names of post offices you didn't match in first round
#function to take in name, state, county and do fuzzy matching based on name
fuzzy_matchfun <- function(df1, df2, minmatch){
print(paste0("Checking name field: ", names(df1)[1], ", and County field: ", names(df1)[2]));
print(paste0("Number of records checking for feature class ", f, ": ", nrow(df1)));
df1 <- rename(df1, Name=1, County=2, State=3)
for (rownum in 1:nrow(df1)){
if (rownum>nrow(df1)){break} #if you hit the end of hte dataframe
temp_row<-df1[rownum,]; #looping through dataframe of post offices by index number
if (is.na(temp_row$Name)) {next};
name_length<-as.integer(str_length(temp_row$Name)); #length of name in characters
max_dist_int<-as.integer(name_length-(name_length*minmatch)); #calculate the distance metrics for matching names
#if (name_length <= 4) {next} #skip over words that are only four characters long or less (too a high chance for false matches)
gnis_tomatch_temp<-gnis_tomatch[0,]; #initializes an empty dataframe
#select only GNIS features for the state and county of this post office
gnis_tomatch_temp<-gnis_tomatch %>%
filter(State==temp_row$State & County==temp_row$County)
#append a prefix of GNIS to the GNIS dataframe - makes it easier to join these fields to the post office datafrmae
names(gnis_tomatch_temp) <- paste0("GNIS.", names(gnis_tomatch_temp));
#get a datafrmae that's only comprised of the Name and ID of the post office, then do a fuzzy match with the GNIS dataframe to look for matches, appending a distance column that's the number of characters away
holder_df<-temp_row %>%
select(Name, ID) %>%
stringdist_inner_join(gnis_tomatch_temp, by=c(Name="GNIS.Name"), method="lv", distance_col="distance") #using levenshtein distance, defined by the distance column
#if (nrow(holder_df)>1) {print(holder_df)} #print if you are getting more than one match
if (nrow(holder_df)>0) { #if you found at least one fuzzy match
holder_df<-holder_df %>%
mutate(GNIS.MatchScore=((name_length-distance)/name_length)) %>% #calculate the string distance as a percentage of the length ofhte post office string
select(-c(distance, GNIS.OldName)) %>%
filter(GNIS.MatchScore >=minmatch) %>%
filter(!((str_sub(Name, 1, 4)=="EAST" & str_sub(GNIS.Name, 1, 4)=="WEST") | (str_sub(Name, 1, 4)=="WEST" & str_sub(GNIS.Name, 1, 4)=="EAST"))) %>% #This is removing matches that are accidentally matching "East __" and "West __"
filter(!(as.integer(str_length(Name))<=4 & (str_sub(Name, 1, 1)!=str_sub(GNIS.Name, 1, 1)))) %>% #remove any matches in which the length of the name is 4 letters or less AND it doesn't start with the same letter as the GNIS match - ex. "FERN" and "LERN"
#only return the highest scoring match
arrange(desc(GNIS.MatchScore)) %>% #sort it descending by GNIS score and then alphabetically by Post Office Name
group_by(ID) %>%
filter(row_number()==1) %>% #take only the highest sorted one (one with highest GNIS score)
ungroup() %>%
#add a GNIS.Match column if you found a match
mutate(GNIS.Match=ifelse(is.na(GNIS.FEATURE_ID), FALSE, TRUE)) %>%
select(-Name) %>% #prepping it for a join - want to drop the name since you're going to grab it from post office records using the ID key
left_join(po_tomatch, by=c("ID")) #get all the other columns associated with that post office record
#if the post office record is not already in the GLOBAL successfully matched df, then add it
po_matched_round3<<- po_matched_round3 %>%
bind_rows(filter(holder_df, !(ID %in% po_matched_round3$ID)))
}
#remove the records you've already matched from the GLOBAL dataframe of unmatched records
po_tomatch<<- po_tomatch %>%
filter(!(ID %in% po_matched_round3$ID))
}
}
#use this for full data
po_tomatch <- helbock_data %>% filter(!(ID %in% po_matched$ID))
gnis_tomatch<-gnis
## define samples of incoming datasets for debugging- comment out for full dataset
# po_tomatch<-helbock_data %>% filter(!(ID %in% po_matched$ID)) %>%
# sample_n(50)
# gnis_tomatch <- gnis %>%
# filter(Name %in% c("HOME VALLEY", "DAYSBOROUGH", "DAYSBORO", "GALLION", "CALION", "LINE", "PRINDLE", "TOOELLE"))
# po_tomatch<-po_tomatch %>%
# filter(ID %in% c("15893", "108367","66960","141858", "146143")) #getting specific ones for testing purposes
# gnis_tomatch <- gnis %>%
# #filter((State=="CO" | State=="AZ") & Feature.Class == "Populated Place")
# filter(Feature.Class == "Populated Place")
minmatch<-0.75 ###minimum matching score (out of 100) you want to try and match
po_matched_round3<-po_matched[0,] #empty dataframe that you're going to add new matches into
#### this is where you actually send in matches - very CPU intensive, going to take awhile to run
tic("start")
for (f in features){
tic("feature")
print(paste0("*******Feature Class: ", f, "*******"));
gnis_tomatch<-gnis %>%
filter(Feature.Class==f)
fuzzy_matchfun(select(po_tomatch, Name, County1, State, ID), po_tomatch, minmatch) #send post offices for fuzzy matching
fuzzy_matchfun(po_tomatch %>% filter(!(is.na(AltName) | AltName == "")) %>% select(AltName, County1, State, ID), po_tomatch, minmatch) #send only post offices that has an alternative name
fuzzy_matchfun(po_tomatch %>% filter(!(is.na(County2) | County2 == "")) %>% select(Name, County2, State, ID), po_tomatch, minmatch) #send only post offices that have an altnerative county
fuzzy_matchfun(po_tomatch %>% filter(!(is.na(AltName) | AltName == "")) %>% filter(!(is.na(County2) | County2 == "")) %>% select(AltName, County2, State, ID), po_tomatch, minmatch) #send only post offices that have an alternative name and an altenrative county
print(paste0("******Successfully matched: ", nrow(po_matched_round3)));
print(paste0("******Still need to match: ", nrow(po_tomatch)));
toc()
}
toc(log=TRUE) #end timer and store to log
endtime<-unlist(tic.log(format=TRUE)) #create character variable has the elapsed time
tic.clear()
tic.clearlog()
po_matched_round3_backup<-po_matched_round3 #just being careful since this is so CPU intensive don't want to run again
po_tomatch_round3<-po_tomatch
nummatched_overall<-nrow(po_matched_round1)+nrow(po_matched_round2)+nrow(po_matched_round3)
numremaining_overall<-nrow(helbock_data)-nummatched_overall
percentagematched_overall<-nummatched_overall/nrow(helbock_data)
write(paste0("Matching data for Round 3 of geolocating Helbock data with GNIS data. Processed on ", Sys.time(), "\n",
"Using a minimum fuzzy match score: ", minmatch, " (Levenshtein distance method).\n",
"Time: ", endtime, "\n",
"Number matched this round: ", nrow(po_matched_round3), "\n",
"Percentage of all post offices matched this round: ", round((nrow(po_matched_round3)/nrow(helbock_data))*100, digits=3), "\n",
"Number remaining to match after this round: ", nrow(po_tomatch), "\n",
"Percentage matched within this round: ", nrow(po_matched_round3)/(nrow(po_tomatch)+nrow(po_matched_round3))*100, "\n",
"Overall number matched: ", nummatched_overall, "\n",
"Overall remaining to match: ", numremaining_overall, "\n",
"Percentage matched overall: ", round(percentagematched_overall*100, digits=3)),
paste0("analytics/round3_analytics_", Sys.Date(), ".txt"))
write.csv(po_matched_round3, "analytics/round3.csv", row.names=F)
write.csv(po_tomatch_round3, "analytics/needtomatch-afterround3.csv", row.names=F, na="")
po_matched<-bind_rows(po_matched_round1, po_matched_round2, po_matched_round3) #all of the post offices that have been matched
###### EXPORT THE DATA
#export csv files with temporary data of matches and the full dataset (but not with all the added columns yet)
write.csv(po_matched, "output/matched_03_14_2021.csv", row.names=F, na="")
fulldata<-po_matched %>%
bind_rows(po_tomatch) %>%
mutate(GNIS.Match=ifelse(is.na(GNIS.Match), FALSE, TRUE)) #flag non-matches with FALSE for GNIS.Match
write.csv(fulldata, "output/fulldata_03_14_2021.csv", row.names=F, na="")
## Export your full dataset with renamed, reordered, and additional columns
fulldata<-read.csv("output/fulldata_03_14_2021.csv", stringsAsFactors = F)
fulldata<-fulldata %>%
mutate_all(trimws) %>%
#mutate(Established=as.integer(Established), Discontinued=as.integer(Discontinued), ID=as.integer(ID))
mutate_at(c("Established", "Discontinued", "ID", "GNIS.FEATURE_ID", "GNIS.ELEV_IN_M"), as.integer) %>%
mutate_at(c("GNIS.Latitude", "GNIS.Longitude", "GNIS.MatchScore"), as.numeric) %>%
mutate_at(c("Continuous", "GNIS.Match"), as.logical)
export_data<- fulldata %>%
mutate(Latitude = GNIS.Latitude, Longitude=GNIS.Longitude) %>%
mutate(Coordinates=ifelse(GNIS.Match==TRUE, TRUE, FALSE)) %>%
mutate(Duration = as.numeric(Discontinued)-as.numeric(Established)) %>%
select(Name, AltName, OrigName, State, County1, County2, County3, OrigCounty, Established, Discontinued, Continuous, StampIndex, ID, Coordinates, Duration, GNIS.Match, everything(), -(GNIS.ELEV_IN_FT)) %>% #reorder columns
arrange(ID)
write.csv(export_data, "output/us-post-offices.csv", row.names=F, na="")