forked from woobe/rugsmaps
-
Notifications
You must be signed in to change notification settings - Fork 0
/
prep_data.R
72 lines (66 loc) · 2.99 KB
/
prep_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Process study trial data from WHO data sources, adding latitude, longitude
# and other metadata required for mapping
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
require(dplyr)
require(geonames)
source("geonames_username.R")
require(maps)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Import raw data with minimal processing
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Load the raw trial overview; text columns are kept as character vectors.
trials <- read.csv(file = "./data/simple_overview.csv", stringsAsFactors = FALSE)
# Every cell that holds a literal "?" is overwritten with "Unknown". The
# matches are located as (row, col) index pairs so that NA cells are skipped.
trials <- replace(trials, which(trials == "?", arr.ind = TRUE), "Unknown")
# All later steps operate on df_trials; `trials` keeps the cleaned raw import.
df_trials <- trials
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Add lat, long at the country level for each trial
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Lookup table with one row per distinct country named in the trial data.
# Coordinates, country code and GeoNames id start as NA and are filled in by
# the search below; countries without a match keep NA in every column.
country_locs <- data.frame(
  Country = unique(df_trials$Country),
  lat = NA,
  lon = NA,
  countryCode = NA,
  countryGNId = NA,
  stringsAsFactors = FALSE
)
# Iterate over the lookup table itself. The table has one row per unique
# country, so looping over the rows of df_trials would run past its end and
# issue pointless GeoNames queries for NA names.
for (i in seq_len(nrow(country_locs))) {
  country <- country_locs$Country[i]
  # Exact-name search restricted to feature class "A" and feature code "PCLI"
  # (GeoNames' code for independent political entities).
  res <- GNsearch(
    name_equals = country,
    featureClass = "A",
    featureCode = "PCLI"
  )
  if (nrow(res) > 0) {
    # Only the first hit is kept; extra hits are reported below.
    country_locs$lat[i] <- res$lat[1]
    country_locs$lon[i] <- res$lng[1]
    country_locs$countryCode[i] <- res$countryCode[1]
    country_locs$countryGNId[i] <- res$geonameId[1]
  }
  if (nrow(res) > 1) {
    warning(paste0("More than one geonames result for ", country))
  }
}
# Attach the country-level columns to every trial row. Each country in
# df_trials has a row in country_locs, so the inner join drops no trials.
df_trials <-
  as_tibble(df_trials) %>%
  inner_join(country_locs, by = "Country")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Where possible, add resolution down to site
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Lookup table with one row per distinct (countryCode, Site) pair. Site
# coordinates and GeoNames id start as NA and are filled in where a match
# is found.
site_locs <- data.frame(
  unique(df_trials[, c("countryCode", "Site")]),
  sitelat = NA,
  sitelon = NA,
  siteGNId = NA,
  stringsAsFactors = FALSE
)
# Iterate over the lookup table itself rather than over the rows of
# df_trials, which is usually longer than the table of unique sites.
for (i in seq_len(nrow(site_locs))) {
  site <- site_locs$Site[i]
  # Missing or placeholder site names cannot be geocoded.
  if (is.na(site) || site == "Unknown" || site == "TBD") next
  countryCode <- site_locs$countryCode[i]
  # NOTE(review): countryCode is NA when the country lookup found no match;
  # confirm how GNsearch treats a missing `country` value.
  # Exact-name search within the country, restricted to feature class "P"
  # (GeoNames' class for populated places).
  res <- GNsearch(name_equals = site, country = countryCode, featureClass = "P")
  if (nrow(res) > 0) {
    # Only the first hit is kept; extra hits are reported below.
    site_locs$sitelat[i] <- res$lat[1]
    site_locs$sitelon[i] <- res$lng[1]
    site_locs$siteGNId[i] <- res$geonameId[1]
  }
  if (nrow(res) > 1) {
    warning(paste0("More than one geonames result for ", site, " in ", countryCode))
  }
}
# Keep only the sites that were actually resolved.
site_locs <-
  as_tibble(site_locs) %>%
  filter(!is.na(siteGNId))
# Left join so that trials without a resolved site are kept, with NA in the
# site-level columns.
df_trials <-
  as_tibble(df_trials) %>%
  left_join(site_locs, by = c("Site", "countryCode"))
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Write the final table to CSV
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Coerce each coordinate column to double and round it to 5 significant
# digits. The digit count must be an argument of signif(): when it is passed
# to as.double() it is ignored and signif() uses its default of 6 digits.
for (coord_col in c("lat", "lon", "sitelat", "sitelon")) {
  df_trials[[coord_col]] <- signif(as.double(df_trials[[coord_col]]), digits = 5)
}
# Write the enriched trial table next to the raw input, without row names.
write.csv(
  df_trials,
  file = "./data/simple_overview_modified.csv",
  row.names = FALSE
)