/
variable-selection.Rmd
279 lines (207 loc) · 11.7 KB
/
variable-selection.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
---
title: "Variable Selection: RFs w/biovars, HWSD, CDL, & farm(er) characteristics"
author: "Britta Schumacher"
date: "Last updated: `r format(Sys.time(), '%d %B, %Y')`"
output:
html_document:
theme: yeti
toc: yes
toc_float: true
code_folding: hide
---
```{r, echo=FALSE}
knitr::opts_chunk$set(error = TRUE)
```
# Data prep
## Load data
```{r, message=F, warning=F}
library(tidyverse)
library(ggcorrplot)
```
### Reduce panel to only counties where yield is reported in a county-year
```{r, message=F, warning=F }
yield <- readRDS("./data/true-train.RDS")
yield <- as.data.frame(yield)
yield <- yield %>% filter(!is.na(YIELD)) # reduces to 16983 unique county-year observations
# add farm(er) characteristics
farm <- readRDS("./data/farm-er-linearinterp.RDS")
farm <- farm %>% select(!county)
yield <- merge(yield, farm, by = c("GEOID", "YEAR"))
```
# Variable selection
```{r, message=F, warning=F}
#find % available
data_avail <- yield %>%
summarize_all(funs(sum(is.na(.)) / length(.)))
#if you want to transpose, use the code below
t_d <- t(data_avail)
```
### Remove missing variables -- these are also more difficult to interpret/qualitative soil variables that are a bit redundant and ranked by J. Cowan as 2-3 (medium to low priority)
```{r }
yield <- yield %>% dplyr::select(-c("PHASE1", "DRAINAGE", "T_USDA_TEX_CLASS", "T_TEXTURE", "REF_DEPTH", "AWC_CLASS", "ADD_PROP"))
```
### Remove variables that are linear products of other variables
```{r }
#BV3 is a direct product of BV2 and BV7
#BV7 is a direct product of BV5 and BV6
# perc_corn = CORN_SQKM/AG_SQKM*100
yield <- yield %>% dplyr::select(-c("BV3", "BV7", "AG_SQKM", "CORN_SQKM"))
```
### Save all possible biophysical
```{r}
all_bioph <- yield %>% dplyr::select("GEOID","YEAR","SLOPE","ELEVATION","GDD","TP","YIELD","SDI_CDL_AG","S_PH_H2O","T_BS","T_CACO3","T_CASO4","T_CEC_CLAY","T_CEC_SOIL","T_CLAY","T_ESP","T_GRAVEL","T_OC","T_PH_H2O","T_REF_BULK_DENSITY","T_SILT","T_TEB","T_ECE","T_SAND","FRR","PERC_IRR","BV1","BV2","BV4","BV5","BV6","BV8","BV9","BV10","BV11","BV12","BV13","BV14","BV15","BV16","BV17","BV18","BV19")
saveRDS(all_bioph, "./data/all-available-BIOPH-attributes.RDS")
```
### Pair-Wise Correlation Matrix ~ Soil
```{r, warning=F, message=F}
# visualize by "natural" groupings of variables
# first, soil -- removing qualitative/coded variables T_TEXTURE, REF_DEPTH, DRAINAGE, AWC_CLASS, ADD_PROP, T_USDA_TEX_CLASS
soil <- yield[c(3:4,25:40)]
corr_soil <- round(cor(soil[c(1:18)]), 2)
p.mat_soil <- cor_pmat(soil[c(1:18)])
corrplot_soil <- ggcorrplot(corr_soil, hc.order = FALSE, p.mat = p.mat_soil, outline.col = "white", type = "lower", insig = "blank",lab = F, ggtheme = ggplot2::theme_minimal, colors = c("#6D9EC1", "white", "#E46726"))
corrplot_soil
# see initial correlations
red_soil <- as.data.frame(as.table(cor(soil[c(1:18)]), 1))
red_soil <- red_soil %>% filter(Freq != 1) %>% mutate(aFreq = abs(Freq)) %>% arrange(desc(aFreq)) %>% filter(aFreq > 0.80) %>% dplyr::select(c(Var1, Var2, Freq))
knitr::kable(red_soil, caption = "Soil variable pairs with high correlation")
# reduce redundant variables
red_soil_s1 <- as.data.frame(as.table(cor(soil[c(1:3,5:8,10:12,14:15,17:18)]), 1))
red_soil_s1 <- red_soil_s1 %>% filter(Freq != 1) %>% mutate(aFreq = abs(Freq)) %>% arrange(desc(aFreq)) %>% filter(aFreq > 0.80) %>% dplyr::select(c(Var1, Var2, Freq))
# remove qualitative coded variables that lack information and save full soil list of interest
soil_quant <- as.data.frame(soil[c(1:3,5:8,10:12,14:15,17:18)])
soil_final_variables <- c("SLOPE","ELEVATION","S_PH_H2O","T_CACO3","T_CASO4","T_CEC_CLAY","T_CEC_SOIL","T_ESP","T_GRAVEL","T_OC","T_REF_BULK_DENSITY","T_SILT","T_ECE","T_SAND")
```
First, we'll get rid of topsoil pH because this can be pretty variable over time in some places. Farmers actively manage for this in the midwest--but pretty time invariable in other places, like the intermountain west. Anyway, let's toss it and see where that takes us. Let's also get rid of T_TEB--J. Cowan prioritized this as a 2 (medium priority) v. T_CEC_SOIL (high priority) and S_PH_H2O (high priority); T_BS--J. Cowan prioritized this as a 2 (medium priority) v. S_PH_H20 (high priority). Let's keep T_REF_BULK_DENSITY because it is an indication of how restricted root growth will be in a soil and changes as the percentage clay-silt-sand changes; so we'll get rid of T_CLAY.
Drop:
- T_PH_H2O
- T_TEB
- T_BS
- T_CLAY
- ^REF_DEPTH
- ^T_TEXTURE
- ^DRAINAGE
- ^AWC_CLASS
- ^ADD_PROP
- ^T_USDA_TEX_CLASS
(^) Variables that do not match the elimination Emily did.
### Pair-Wise Correlation Matrix ~ Climate
```{r, warning=F, message=F}
library(ggcorrplot)
# visualize by "natural" groupings of variables
# second, climate --
climate <- yield[c(5:6,44:60)]
corr_clim <- round(cor(climate[c(1:19)]), 2)
p.mat_clim <- cor_pmat(climate[c(1:19)])
corrplot_clim <- ggcorrplot(corr_clim, hc.order = FALSE, p.mat = p.mat_clim, outline.col = "white", type = "lower", insig = "blank",lab = F, ggtheme = ggplot2::theme_minimal, colors = c("#6D9EC1", "white", "#E46726"))
corrplot_clim
# see initial correlations
red_clim <- as.data.frame(as.table(cor(climate[c(1:19)]), 1))
red_clim <- red_clim %>% filter(Freq != 1) %>% mutate(aFreq = abs(Freq)) %>% arrange(desc(aFreq)) %>% filter(aFreq > 0.80) %>% dplyr::select(c(Var1, Var2, Freq))
knitr::kable(red_clim, caption = "Climate variable pairs with high correlation")
# reduce redundant variables
red_clim_s1 <- as.data.frame(as.table(cor(climate[c(1:2,4:5,8:9,15,18:19)]), 1))
red_clim_s1 <- red_clim_s1 %>% filter(Freq != 1) %>% mutate(aFreq = abs(Freq)) %>% arrange(desc(aFreq)) %>% filter(aFreq > 0.80) %>% dplyr::select(c(Var1, Var2, Freq))
# remove qualitative coded variables that lack information and save full soil list of interest
clim_quant <- as.data.frame(climate[c(1:2,4:5,8:9,15,18:19)])
climate_final_variables <- c("GDD","TP","BV2","BV4","BV8","BV9","BV15","BV18","BV19")
```
Drop - order removed - reason:
**Rules of elimination:
1) Drop any climate variable that measure a range, retain the min and max values
2) Drop any climate variable that is a monthly measurement and retain quarterly measurement
3) Drop any climate variable that is an annual summary
4) Keep crop-specific variables when collinear with biovars
- *BV1: Mean annual temperature - 4 - 3)
- BV3: Isothermality - 1 - 1)
- BV5: Max temperature of warmest month - 2 - 2)
- BV6: Min temperature of coldest month - 2 - 2)
- BV7: Temperature annual range - 1 - 1)
- ^*BV10: Mean temperature of warmest quarter - 6 - 4)
- BV11: Mean temperature of the coldest quarter - 5 - 4)
- ^*BV12: Total Annual Precipitation (mm) - 4 - 3)
- BV13: Precipitation of wettest month (mm) - 3 - 2)
- BV14: Precipitation of driest month (mm) - 3 - 2)
- BV16: Precipitation of wettest quarter - 7 - 4)
- BV17: Precipitation of driest quarter - 8 - ??
(*) Variables that I removed that KS did not.
(^) Variables that I removed that EB did not.
Variables removed by KS/EB that I did not remove, below:
EB: BV9, BV19
KS: BV19
### Pair-Wise Correlation Matrix ~ diversity
```{r, warning=F, message=F}
library(ggcorrplot)
# visualize by "natural" groupings of variables
# third, diversity --
diversity <- yield[c(8:23)]
corr_div <- round(cor(diversity[c(1:16)]), 2)
p.mat_div <- cor_pmat(diversity[c(1:16)])
corrplot_div <- ggcorrplot(corr_div, hc.order = FALSE, p.mat = p.mat_div, outline.col = "white", type = "lower", insig = "blank",lab = F, ggtheme = ggplot2::theme_minimal, colors = c("#6D9EC1", "white", "#E46726"))
corrplot_div
# let's immediately keep only _AG columns
diversity <- diversity[c(3,7,11,15)]
# see initial correlations
red_div <- as.data.frame(as.table(cor(diversity[c(1:4)]), 1))
red_div <- red_div %>% filter(Freq != 1) %>% mutate(aFreq = abs(Freq)) %>% arrange(desc(aFreq)) %>% filter(aFreq > 0.80) %>% dplyr::select(c(Var1, Var2, Freq))
knitr::kable(red_div, caption = "Diversity variable pairs with high correlation")
# all of these variables are essentially trying to say the same thing; let's keep SDI, cause it's what is most commonly used in the literature
div_quant <- as.data.frame(diversity[c(2)])
div_final_variables <- c("SDI_CDL_AG")
```
*Keep one and go with it ~ RPR (max richness)*
Drop:
- all '_ALL' variables
- all '_BBOX' variables
- SIDI_CDL_AG, RICH_CDL_AG, RPR_CDL_AG
Variables removed by KS/EB that I did not remove, below:
EB: NA
KS: NA
### Save all possible farm(er)
```{r}
all_farmer <- yield %>% dplyr::select("GEOID","YEAR","SLOPE","ELEVATION","GDD","TP","YIELD","SDI_CDL_AG","S_PH_H2O","T_BS","T_CACO3","T_CASO4","T_CEC_CLAY","T_CEC_SOIL","T_CLAY","T_ESP","T_GRAVEL","T_OC","T_PH_H2O","T_REF_BULK_DENSITY","T_SILT","T_TEB","T_ECE","T_SAND","FRR","PERC_IRR","BV1","BV2","BV4","BV5","BV6","BV8","BV9","BV10","BV11","BV12","BV13","BV14","BV15","BV16","BV17","BV18","BV19","perc_corn","chem","fert","labor_expense","machinery","insur_acres","gvt_prog","exp","occup","tenant","acres_per_op")
saveRDS(all_farmer, "./data/all-available-FARMER-attributes.RDS")
```
### Pair-Wise Correlation Matrix ~ farm(er) characteristics
```{r, warning=F, message=F}
# visualize by "natural" groupings of variables
# fourth, farm(er) characteristics --
farm <- yield[c(61:71)]
corr_farm <- round(cor(farm[c(1:11)]), 2)
p.mat_farm <- cor_pmat(farm[c(1:11)])
corrplot_farm <- ggcorrplot(corr_farm, hc.order = FALSE, p.mat = p.mat_farm, outline.col = "white", type = "lower", insig = "blank",lab = F, ggtheme = ggplot2::theme_minimal, colors = c("#6D9EC1", "white", "#E46726"))
corrplot_farm
# see initial correlations
red_farm <- as.data.frame(as.table(cor(farm[c(1:11)]), 1))
red_farm <- red_farm %>% filter(Freq != 1) %>% mutate(aFreq = abs(Freq)) %>% arrange(desc(aFreq)) %>% filter(aFreq > 0.80) %>% dplyr::select(c(Var1, Var2, Freq))
#knitr::kable(red_farm, caption = "Farm(er) variable pairs with high correlation")
# all of these variables are essentially trying to say the same thing; let's keep SDI, cause it's what is most commonly used in the literature
farm_quant <- as.data.frame(farm[c(1:11)])
farm_final_variables <- c("perc_corn","chem","fert","labor_expense","machinery","insur_acres","gvt_prog","exp","occup","tenant","acres_per_op")
```
## Pull together all variables we are keeping
```{r}
yield_keep <- yield %>% dplyr::select(c(GEOID,YEAR,YIELD,SDI_CDL_AG,GDD,TP,BV2,BV4,BV8,BV9,BV15,BV18,BV19,SLOPE,ELEVATION,S_PH_H2O,T_CACO3,T_CASO4,T_CEC_CLAY,T_CEC_SOIL,T_ESP,T_GRAVEL,T_OC,T_REF_BULK_DENSITY,T_SILT,T_ECE,T_SAND,PERC_IRR,FRR,perc_corn,chem,fert,labor_expense,machinery,insur_acres,gvt_prog,exp,occup,tenant,acres_per_op))
saveRDS(yield_keep, "./data/yieldRF.RDS")
```
## Pair-wise Correlation Values of Initial Variable Selection
```{r}
red <- as.data.frame(as.table(cor(yield_keep[4:40]), 1))
red <- red %>% filter(Freq != 1) %>% mutate(aFreq = abs(Freq)) %>% arrange(desc(aFreq)) %>% filter(aFreq > 0.80) %>% dplyr::select(c(Var1, Var2, Freq))
# show correlations
yield_keep <- yield_keep[complete.cases(yield_keep), ]
corr_yield <- round(cor(yield_keep[c(4:40)]), 2)
p.mat_yield <- cor_pmat(yield_keep[c(4:40)])
corrplot_yield <- ggcorrplot(corr_yield, hc.order = FALSE, p.mat = p.mat_yield, outline.col = "white", type = "lower", insig = "blank",lab = F, ggtheme = ggplot2::theme_minimal, colors = c("#6D9EC1", "white", "#E46726"))
corrplot_yield
yield_keep_corr <- as.data.frame(as.table(cor(yield_keep[c(4:40)]), 1))
yield_keep_corr
```
###Remove collinear variables
**Rules of elimination:
1) Drop any climate variable that measure a range, retain the min and max values
2) Drop any climate variable that is a monthly measurement and retain quarterly measurement
3) Drop any climate variable that is an annual summary
4) Keep crop-specific variables when collinear with biovars
5) Drop any soil variables that were ranked less important by J. Cowan
6) Choose the variable with the higher availability for other variable pairs