-
Notifications
You must be signed in to change notification settings - Fork 14
/
HW-4.Rmd
261 lines (197 loc) · 9.43 KB
/
HW-4.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
---
title: "MATH 216 Homework 4"
author: "Alden Cowap"
output:
html_document:
toc: true
toc_float: true
collapsed: false
smooth_scroll: false
---
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.width=8, fig.height=4.5}
# Suggested packages; feel free to use more!
library(tidyverse)
library(USAboundaries)
library(maptools)
library(sp)
library(broom)
```
## Admistrative:
Please indicate
* Who you collaborated with:
* Roughly how much time you spent on this HW so far: 6 hours
* The URL of the RPubs published URL [here](http://rpubs.com/acowap/HW4).
* What gave you the most trouble:
* Any comments you have:
## Question 1:
**Question**: DHL, a German shipping company, has a ton of money to build a
shipping hub in the United States. Where should they build it?
**Deliverable**: A choropleth map of the lower 48 states (no Puerto Rico) where
the color corresponds to the distance of the given area to the shipping hub,
which is marked on the map with a large red dot.
First, let's make a map that has a blue dot at the center of each county.
```{r, eval=TRUE, echo=FALSE, message=FALSE, warning=FALSE, fig.width=8, fig.height=4.5}
# Perform your computations here:
## Get data tables:
Census_county <- read_csv("Census_county.csv")
county_shp <- us_counties()
county_data <- county_shp@data %>% mutate(n=1:3220)
county_polygon <- tidy(county_shp, region="geoid")
county_centroid <- county_shp %>% sp::coordinates()
centroid <- as.data.frame(county_centroid) %>%
mutate(longitude = V1) %>%
mutate(latitude=V2) %>%
mutate(n=1:3220) %>%
select(longitude, latitude, n)
county_data <- left_join(county_data, centroid, by="n")
counties <- left_join(county_polygon, county_data, by=c("id"="geoid")) %>%
filter( !state_name %in% c("Alaska", "Hawaii", "Puerto Rico")) %>% filter(!is.na(state_name))
pop <- Census_county %>% select(Geo_FIPS, SE_T001_001)
county_pop <- left_join(counties, pop, by=c("id"="Geo_FIPS"))
#trial plot:
ggplot(counties, aes(x=long, y=lat, group=group)) +
geom_path() +
coord_map() +
geom_point(aes(x=longitude, y=latitude), color = "blue", size = .05)
```
First, let's figure out where the middle of the country is by averaging the largest and smallest longitude and latitude and put the hub there.
```{r, eval=TRUE, echo=FALSE, message=FALSE, warning=FALSE, fig.width=8, fig.height=4.5}
find_hub <- county_pop %>% mutate(pop = SE_T001_001) %>%
select(longitude, latitude, pop, id) %>%
unique() %>%
mutate(n=1:3108)
max_lat <- max(find_hub$latitude)
min_lat <- min(find_hub$latitude)
max_long <- max(find_hub$longitude)
min_long <- min(find_hub$longitude)
avg_lat <- (max_lat + min_lat)/2
avg_long <- (max_long + min_long)/2
county_pop_avg <- county_pop %>%
mutate(hub_x_coord = avg_long) %>%
mutate(hub_y_coord = avg_lat) %>%
mutate(distance = (sqrt((longitude - hub_x_coord)^2 + (latitude - hub_y_coord)^2)))
ggplot(data=county_pop_avg, aes(x=long, y=lat, group=group, fill=distance)) +
# Plot choropleth polygons, low -> high being colored white -> dark green
geom_polygon() +
scale_fill_gradient(low="white", high="darkgreen") +
# Trace outlines of areas
geom_path(size=0.1) +
coord_map() +
# Mark hub
geom_point(aes(x=hub_x_coord, y=hub_y_coord), col="red", size=5) +
labs(title= "Hub at middle of country")
```
While the middle of the country means that no one place in the country is very far from the hub, some parts of the country are more populated than others. If we assume that the company will need to ship items more frequently to more populated areas, we should think more about how far the hub is from places with a lot of people vs. places with few people.
To do this, we can find the hub most centrally located based on the distance of each county to the hub. Since county lines are roughly drawn by population, where there are more counties, there should be more people. Therfore, the calculated hub placement will be skewed towards where there are more counties, and thus theoretically more people.
```{r, eval=TRUE, echo=FALSE, message=FALSE, warning=FALSE, fig.width=8, fig.height=4.5}
#get only unique county centroids
find_hub <- county_pop %>% mutate(pop = SE_T001_001) %>%
select(longitude, latitude, pop, id) %>%
unique() %>%
mutate(n=1:3108)
#make a function that will return the sum of the distances from each county centroid to a specific hub.
#This function will run through each county centroid as a potential hub.
sum_of_dist<- function(hub = 1:3108) {
y <- vector()
for(i in 1:length(hub)){
x <- i
y <-c(y, sum((sqrt((find_hub$longitude - find_hub$longitude[x])^2 + (find_hub$latitude - find_hub$latitude[x])^2))))
}
df <- data.frame(hub, y)
colnames(df) <- c("hub", "distance")
return(df)
}
distances2 <- sum_of_dist(1:3108)
#find the hub which is most centrally located by finding the smallest sum
min_dist_hub <- distances2 %>% arrange(distance) %>% head(1)
min_dist_obs <- min_dist_hub$hub
#find lat and long of hub
find_hub2 <- find_hub %>% filter(n==min_dist_hub$hub)
hub_long <- find_hub2$longitude
hub_lat <- find_hub2$latitude
## mutate data so that we can graph
county_pop_dist <- county_pop %>%
mutate(hub_x_coord = hub_long) %>%
mutate(hub_y_coord = hub_lat) %>%
mutate(distance = (sqrt((longitude - hub_x_coord)^2 + (latitude - hub_y_coord)^2)))
```
```{r, eval=TRUE, echo=FALSE, message=FALSE, warning=FALSE, fig.width=8, fig.height=4.5}
# Here is a template of a ggplot call; feel free to change this to suit your
# needs. Once your computations above are complete, set the eval=TRUE on the
# line above to tell R Markdown to run this code block.
ggplot(data=county_pop_dist, aes(x=long, y=lat, group=group, fill=distance)) +
# Plot choropleth polygons, low -> high being colored white -> dark green
geom_polygon() +
scale_fill_gradient(low="white", high="darkgreen") +
# Trace outlines of areas
geom_path(size=0.1) +
coord_map() +
# Mark hub
geom_point(aes(x=hub_x_coord, y=hub_y_coord), col="red", size=5) +
labs(title = "Hub by distance")
```
However, county lines aren't a very accurate representaion of population. This time, let's weight the distance from each county to the hub by the popultion of the county, so that the number we consider is not just distance, but distance/popultion.
```{r, eval=TRUE, echo=FALSE, message=FALSE, warning=FALSE, fig.width=8, fig.height=4.5}
#find hub placement
#make a function that will return the sum of the distances from each county centroid to a specific hub, wieghted by population of that county.
#This function will run through each county centroid as a potential hub.
sum_of_dist_weight <- function(hub = 1:3108) {
y <- vector()
for(i in 1:length(hub)){
x <- i
y <-c(y, sum((sqrt((find_hub$longitude - find_hub$longitude[x])^2 + (find_hub$latitude - find_hub$latitude[x])^2))/find_hub$pop[x]))
}
df <- data.frame(hub, y)
colnames(df) <- c("hub", "distance")
return(df)
}
distances_weight <- sum_of_dist_weight(1:3108)
#find the hub which is most centrally located by finding the smallest sum
min_weight_hub <- distances_weight %>% arrange(distance) %>% head(1)
min_weight_obs <- min_weight_hub$hub
#find lat and long of hub
find_hub2 <- find_hub %>% filter(n==min_weight_hub$hub)
hub_long <- find_hub2$longitude
hub_lat <- find_hub2$latitude
## mutate data so that we can graph
county_pop_weight <- county_pop %>%
mutate(hub_x_coord = hub_long) %>%
mutate(hub_y_coord = hub_lat) %>%
mutate(distance = (sqrt((longitude - hub_x_coord)^2 + (latitude - hub_y_coord)^2)))
```
```{r, eval=TRUE, echo=FALSE, message=FALSE, warning=FALSE, fig.width=8, fig.height=4.5}
ggplot(data=county_pop_weight, aes(x=long, y=lat, group=group, fill=distance)) +
# Plot choropleth polygons, low -> high being colored white -> dark green
geom_polygon() +
scale_fill_gradient(low="white", high="darkgreen") +
# Trace outlines of areas
geom_path(size=0.1) +
coord_map() +
# Mark hub
geom_point(aes(x=hub_x_coord, y=hub_y_coord), col="red", size=5) +
labs(title= "Hub by weighted distance")
```
While the last map found the best distance by weighting counties by population, the sum of the weights were all different and did not sum to one. In this map, the hub placement is finding the "center of mass" of the country. In order to find the center of mass, I found the center of mass for both longitude and latitude. To find each, the sum of population*latiude (or longitude) was divided by total population. This is a more conventional way of weighting objects because all of the weights sum to 1.
```{r, eval=TRUE, echo=FALSE, message=FALSE, warning=FALSE, fig.width=8, fig.height=4.5}
#Center of mass
find_cm <- find_hub %>% mutate(poplat = pop*latitude) %>% mutate(poplong = pop*longitude)
find_cmna <- find_cm %>% filter(!is.na(pop)) %>%
select(pop, poplat, poplong)
findcmna <- colSums(find_cmna)
cm_lat <- findcmna[2]/findcmna[1]
cm_long <- findcmna[3]/findcmna[1]
county_pop_weight2 <- county_pop %>%
mutate(hub_x_coord = cm_long) %>%
mutate(hub_y_coord = cm_lat) %>%
mutate(distance = (sqrt((longitude - hub_x_coord)^2 + (latitude - hub_y_coord)^2)))
ggplot(data=county_pop_weight2, aes(x=long, y=lat, group=group, fill=distance)) +
# Plot choropleth polygons, low -> high being colored white -> dark green
geom_polygon() +
scale_fill_gradient(low="white", high="darkgreen") +
# Trace outlines of areas
geom_path(size=0.1) +
coord_map() +
# Mark hub
geom_point(aes(x=hub_x_coord, y=hub_y_coord), col="red", size=5) +
labs(title= "Hub by center of mass")
```