In [1]:
# Clean existing variables
rm(list=ls())
library(plyr)
library(tidyr)
library(dplyr)
library(reshape2)
library(frm)
library(httr)
library(lmtest)
library(sandwich)
library(broom)
library(margins)
library(ggplot2)
library(RColorBrewer)
library(mfx)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:plyr’:

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘reshape2’


The following object is masked from ‘package:tidyr’:

    smiths


Loading required package: zoo


Attaching package: ‘zoo’


The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




In [3]:
getRMA <- function(type, years) {
  # Download the yearly policy or claim archives from the RMA website and
  # stack them into a single data frame.
  #
  # inputs:
  #   type:  "policy" (sobcov_<year> archives) or "claim" (colsom_<year>
  #          archives)
  #   years: vector of years; one archive is downloaded per element
  # output:
  #   data frame with the rows of every requested year bound together
  #   (NULL when `years` is empty)
  # errors:
  #   stops when `type` is not "policy"/"claim", when a download returns an
  #   HTTP error status, or when an archive does not hold exactly one file

  # Fail loudly: returning a message string here would let callers carry on
  # with a character vector where they expect a data frame.
  if (length(type) != 1L || !(type %in% c("policy", "claim"))) {
    stop("type should be 'policy' or 'claim'", call. = FALSE)
  }

  # The URL stem and the column names depend only on `type`, so they are
  # resolved once instead of once per year.
  if (type == "policy") {
    url_stem <- "https://www.rma.usda.gov/-/media/RMAweb/SCC-SOB/State-County-Crop-Coverage/sobcov_"
    cols <- c("year", "stfips", "stabb", "cntyfips", "cntyname", "commoditycode", "commodityname",
              "insplancode", "insplanname", "covcateg", "delivtype", "covlevel", "polsold", "polprem",
              "polindemn", "unitssold", "unitsindemn", "quanttype", "acres", "endorsedacres",
              "liab", "totalpremium", "subsidies", "indemnityamount", "lossratio")
  } else {
    url_stem <- "https://www.rma.usda.gov/-/media/RMAweb/Cause-Of-Loss/Summary-of-Business-with-Month-of-Loss/colsom_"
    cols <- c("year", "stfips", "stabb", "cntyfips", "cntyname", "commoditycode", "commodityname",
              "insplancode", "insplanname", "covcateg", "stagecode", "damagecausecode", "damagecausedesc",
              "monthloss", "monthname", "polprem", "polindemn", "acres", "endorsedacres",
              "liab", "totalpremium", "subsidies", "lostacres", "indemnityamount", "lossratio")
  }
  url_tail <- ".ashx?la=en"

  # Download, extract and read the archive of a single year.
  fetch_year <- function(year) {
    # Work in the session's temporary directory and always clean up, instead
    # of leaving temp.zip in ~/Downloads and a temp/ folder in the working
    # directory.
    zip_path <- tempfile(fileext = ".zip")
    extract_dir <- tempfile("rma_")
    on.exit(unlink(c(zip_path, extract_dir), recursive = TRUE), add = TRUE)

    response <- GET(paste0(url_stem, year, url_tail))
    # Without this check an error page would be written out as the "zip" file.
    stop_for_status(response)
    writeBin(content(response, as = "raw"), zip_path)

    members <- unzip(zip_path, list = TRUE)$Name
    if (length(members) != 1L) {
      stop("expected exactly one file in the archive for year ", year,
           ", found ", length(members), call. = FALSE)
    }
    unzip(zip_path, exdir = extract_dir)

    my_data <- read.table(file.path(extract_dir, members), sep = "|", header = FALSE,
                          dec = ".", quote = "", fill = TRUE)
    # NOTE(review): the recorded notebook output reports 28 (policy) and 30
    # (claim) columns while only 25 names are supplied here. Confirm the name
    # vectors against the current RMA record layouts; a mismatch shifts labels.
    if (ncol(my_data) != length(cols)) {
      warning("file for year ", year, " has ", ncol(my_data), " columns but ",
              length(cols), " column names were supplied", call. = FALSE)
    }
    names(my_data) <- cols

    # Strip trailing padding from the text codes that are compared later on.
    my_data$covcateg <- trimws(my_data$covcateg, which = "right")
    if (type == "policy") {
      my_data$quanttype <- trimws(my_data$quanttype, which = "right")
    }
    my_data
  }

  # lapply() visits each year exactly once, including the empty case where
  # 1:length(years) would have iterated over 1 and 0.
  dfs <- lapply(years, fetch_year)
  do.call("rbind", dfs)
}


In [4]:
# Fetch the 2017 and 2018 policy files and show the dimensions of the result
polacres <- getRMA(
  type = "policy",
  years = c(2017, 2018)
)
print(dim(polacres))

[1] 268113     28


In [5]:
# Fetch the 2017 and 2018 claim files and show the dimensions of the result
claims <- getRMA(
  type = "claim",
  years = c(2017, 2018)
)
print(dim(claims))

[1] 244610     30


In [7]:
format_policies <- function(df,
                            covlevels = c(0.95, 0.9, 0.85, 0.8, 0.75,
                                          0.7, 0.65, 0.6, 0.55, 0.5)) {
  # Collapse policy rows that differ only by coverage level into one row per
  # year / state / county / commodity / insurance plan / coverage category /
  # claim combination.
  #
  # inputs:
  #   df:        data frame holding the identifier columns listed in
  #              `id_vars` below plus "covlevel", "acres" and "unitssold"
  #   covlevels: coverage levels that enter the totals; defaults to the ten
  #              levels from 0.95 down to 0.5
  # output:
  #   data frame with the identifier columns and
  #     acres.total: acres summed over `covlevels`
  #     unitssold:   units sold summed over `covlevels`
  #     meancov:     acre-weighted mean coverage level

  stopifnot(length(covlevels) > 0)

  id_vars <- c("year", "stfips", "stabb", "cntyfips", "cntyname",
               "commoditycode", "commodityname", "insplancode",
               "insplanname", "covcateg", "claim")

  wide <- reshape(df, idvar = id_vars, timevar = "covlevel",
                  v.names = c("acres", "unitssold"), direction = "wide")
  n_rows <- nrow(wide)

  # Values of one reshaped column, e.g. "acres.0.75".
  # `[[` matches the name exactly, whereas `$` would silently fall back to a
  # partial match (acres.0.9 -> acres.0.95) when the exact column is absent.
  # A coverage level that never occurs in the data contributes zeros, and so
  # do the NA cells reshape() creates for combinations lacking that level.
  # Only these value columns are zero-filled; identifier columns are untouched.
  level_values <- function(prefix, level) {
    values <- wide[[paste0(prefix, ".", level)]]
    if (is.null(values)) {
      return(numeric(n_rows))
    }
    values[is.na(values)] <- 0
    values
  }

  acres_by_level <- lapply(covlevels, function(level) level_values("acres", level))
  units_by_level <- lapply(covlevels, function(level) level_values("unitssold", level))

  out <- wide[id_vars]
  out$acres.total <- Reduce(`+`, acres_by_level)
  out$unitssold <- Reduce(`+`, units_by_level)
  # Mean coverage level, weighting each level by the acres insured at it
  out$meancov <- Reduce(`+`, Map(`*`, covlevels, acres_by_level)) / out$acres.total

  out
}

In [8]:
# only keep policies with premium expressed in acres and known county fips and commodity code
# which() drops rows whose condition evaluates to NA; plain logical indexing
# would keep them as rows made entirely of NA.
polacres <- polacres[which(polacres$polprem > 0 & polacres$cntyfips < 999 &
                           polacres$quanttype == "Acres" & polacres$acres > 0 &
                           polacres$commoditycode != 9999), ]
# claim = dummy equal to 1 if at least one claim
polacres$claim <- ifelse(polacres$unitsindemn > 0, 1, 0)
myvars <- c("year", "stfips", "stabb", "cntyfips", "cntyname", "commoditycode", "commodityname",
            "insplancode", "insplanname", "covcateg", "covlevel", "unitssold", "claim", "acres")
polacres <- polacres[myvars]

# one row per year / county / commodity / plan / coverage category / claim,
# with acres.total, unitssold and meancov
polacres <- format_policies(polacres)

# we split the policy file into polacresclaim for which we have claims
polacresclaim <- polacres[polacres$claim == 1, ]
# and polacresnoclaim for which we have no claims. We save this file for later.
polacresnoclaim <- polacres[polacres$claim == 0, ]
polacresnoclaim$lostacres <- 0


# only keep wildlife claims with insured and lost acres and known county fips and commodity code
# (damagecausecode 93 is the code this notebook treats as wildlife)
claimswild <- claims[which(claims$cntyfips < 999 & claims$acres > 0 &
                           claims$commoditycode != 9999 & claims$lostacres > 0 &
                           claims$damagecausecode == 93), ]

# total lost acres per year / county / commodity / plan / coverage category
claimswild <- aggregate(lostacres ~ year + stfips + stabb + cntyfips + cntyname + commoditycode +
                          commodityname + insplancode + insplanname + covcateg,
                        data = claimswild, FUN = sum, na.action = NULL)

# attach the wildlife losses to the policies that had at least one claim;
# policies whose claims were not wildlife-related get NA here
claimspol <- join(polacresclaim, claimswild,
                  by = c("year", "stfips", "stabb", "cntyfips", "cntyname",
                         "commoditycode", "commodityname", "insplancode",
                         "insplanname", "covcateg"),
                  type = "left")

# we can now append the 2 files
claimpoltot <- bind_rows(claimspol, polacresnoclaim)
claimpoltot$lostacres[is.na(claimpoltot$lostacres)] <- 0
# share of insured acres lost, capped at 1
claimpoltot$ratio <- claimpoltot$lostacres / claimpoltot$acres.total
claimpoltot$ratio[claimpoltot$ratio > 1] <- 1
# Recompute lost acres from the capped ratio. format_policies() returns
# acres.total and no `acres` column, so `$acres` only resolved through
# partial matching; the column is now named explicitly.
claimpoltot$lostacres <- claimpoltot$ratio * claimpoltot$acres.total
claimpoltot$fullfips <- claimpoltot$stfips * 1000 + claimpoltot$cntyfips
claimpoltot$logacres <- log(claimpoltot$acres.total)
claimpoltot$logunitssold <- log(claimpoltot$unitssold)
claimpoltot$cat <- ifelse(claimpoltot$covcateg == "C", 1, 0)
claimpoltot$y2017 <- ifelse(claimpoltot$year == 2017, 1, 0)

# create string variables to be treated as categorical variables
claimpoltot$fullfips.ch <- as.character(claimpoltot$fullfips)
claimpoltot$commoditycode.ch <- as.character(claimpoltot$commoditycode)
claimpoltot$insplancode.ch <- as.character(claimpoltot$insplancode)

In [9]:
# Quasibinomial GLM of the lost-acres ratio on mean coverage, log acres,
# log units sold, the category and year dummies, and insurance-plan,
# commodity and county indicators
model_glm <- glm(
  formula = ratio ~ meancov + logacres + logunitssold + cat + y2017 +
    insplancode.ch + commoditycode.ch + fullfips.ch,
  family = quasibinomial,
  data = claimpoltot,
  epsilon = 1e-6
)

# coefficient table based on the "HC" sandwich covariance estimate
tidy(
  coeftest(
    model_glm,
    vcov. = vcovHC(model_glm, type = "HC")
  )
)

# default summary of the fit, for comparison with the robust table
summary(model_glm)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),-16.9779076,1.85621978,-9.1464964,5.880994e-20
meancov,7.6263524,0.55990203,13.6208695,3.009632e-42
logacres,-0.4097054,0.04654582,-8.8021963,1.341642e-18
logunitssold,0.9383247,0.05676036,16.5313384,2.182435e-61
cat,-1.6938777,0.34105343,-4.9666052,6.813512e-07
y2017,-0.4504292,0.05518614,-8.1619976,3.295287e-16
insplancode.ch13,-7.7846590,1.75104558,-4.4457204,8.759784e-06
insplancode.ch16,-12.9803605,0.68960443,-18.8229077,4.901920e-79
insplancode.ch17,-12.6767721,0.37149743,-34.1234454,3.312834e-255
insplancode.ch2,-0.2166943,0.07487866,-2.8939399,3.804409e-03



Call:
glm(formula = ratio ~ meancov + logacres + logunitssold + cat + 
    y2017 + insplancode.ch + commoditycode.ch + fullfips.ch, 
    family = quasibinomial, data = claimpoltot, epsilon = 1e-06)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8593  -0.0578  -0.0007  -0.0001   3.5771  

Coefficients: (5 not defined because of singularities)
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)          -1.698e+01  3.202e+02  -0.053 0.957715    
meancov               7.626e+00  2.976e-01  25.630  < 2e-16 ***
logacres             -4.097e-01  2.395e-02 -17.110  < 2e-16 ***
logunitssold          9.383e-01  2.920e-02  32.132  < 2e-16 ***
cat                  -1.694e+00  1.846e-01  -9.175  < 2e-16 ***
y2017                -4.504e-01  3.072e-02 -14.663  < 2e-16 ***
insplancode.ch13     -7.785e+00  3.214e+02  -0.024 0.980678    
insplancode.ch16     -1.298e+01  2.460e+02  -0.053 0.957919    
insplancode.ch17     -1.268e+01  8.727e+01  -0.145 0.88

In [30]:
# Marginal effects via mfx::logitmfx.
# NOTE(review): the recorded run of this call stops with
# "non-conformable arguments"; presumably related to the coefficients that
# summary(model_glm) reports as not defined because of singularities - confirm.
logitmfx(
  formula = model_glm,
  data = claimpoltot
)

“non-integer #successes in a binomial glm!”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”


ERROR: Error in gr %*% vcv: non-conformable arguments


In [21]:
# Marginal effect of meancov on the response scale, with delta-method variance.
# NOTE(review): confirm that margins() actually honours `atmeans`.
margins(
  model = model_glm,
  data = claimpoltot,
  variables = "meancov",
  type = "response",
  vce = "delta",
  atmeans = TRUE
)

“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficient fit may be misleading”
“prediction from a rank-deficie

In [11]:
# Observation-level derivative of the model prediction with respect to meancov
dydx(
  data = claimpoltot,
  model = model_glm,
  variable = "meancov"
)

“prediction from a rank-deficient fit may be misleading”


Unnamed: 0_level_0,dydx_meancov
Unnamed: 0_level_1,<mrgnlffc>
1,1.983961e-07
2,5.356234e-08
3,4.402215e-08
4,2.258145e-01
5,7.230923e-03
6,2.315009e-01
7,3.320575e-02
8,4.454757e-03
9,1.728111e-07
10,4.072787e-01


In [None]:
, type = c("response", "link"),
change = c("dydx", "minmax", "iqr", "sd"), eps = 1e-07,
as.data.frame = TRUE, ...)