In [110]:
# import Pkg; Pkg.add("CSV")
# import Pkg; Pkg.add("Grep")
# import Pkg; Pkg.add("FixedEffectModels")

[32m[1m    Updating[22m[39m registry at `C:\Users\sandr\.julia\registries\General`
[32m[1m    Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m   Installed[22m[39m GroupedArrays ───── v0.3.1
[32m[1m   Installed[22m[39m Vcov ────────────── v0.5.0
[32m[1m   Installed[22m[39m FixedEffects ────── v2.1.0
[32m[1m   Installed[22m[39m FixedEffectModels ─ v1.6.5
[32m[1m    Updating[22m[39m `C:\Users\sandr\.julia\environments\v1.6\Project.toml`
 [90m [9d5cd8c9] [39m[92m+ FixedEffectModels v1.6.5[39m
[32m[1m    Updating[22m[39m `C:\Users\sandr\.julia\environments\v1.6\Manifest.toml`
 [90m [9d5cd8c9] [39m[92m+ FixedEffectModels v1.6.5[39m
 [90m [c8885935] [39m[92m+ FixedEffects v2.1.0[39m
 [90m [6407cd72] [39m[92m+ GroupedArrays v0.3.1[39m
 [90m [ec2bfdc2] [39m[92m+ Vcov v0.5.0[39m
[32m[1mPrecompiling[22m[39m project...
[32m  ✓ [39m[90mGroupedArrays[39m


In [2]:
using CSV, DataFrames, Grep
using RData, LinearAlgebra, GLM, DataFrames, Statistics, Random, Distributions, DataStructures, NamedArrays, PrettyTables
import CodecBzip2


This notebook contains an example for teaching.

# A Case Study: The Effect of Gun Ownership on Gun-Homicide Rates

We consider the problem of estimating the effect of gun
ownership on the homicide rate. For this purpose, we estimate the following partially
linear model

$$
 Y_{j,t} = \beta D_{j,(t-1)} + g(Z_{j,t}) + \epsilon_{j,t}.
$$

## Data

$Y_{j,t}$ is log homicide rate in county $j$ at time $t$, $D_{j, t-1}$ is log  fraction of suicides committed with a firearm in county $j$ at time $t-1$, which we use as a proxy for gun ownership,  and  $Z_{j,t}$ is a set of demographic and economic characteristics of county $j$ at time $t$. The parameter $\beta$ is the effect of gun ownership on the
homicide rates, controlling for county-level demographic and economic characteristics. 

The sample covers 195 large United States counties between the years 1980 through 1999, giving us 3900 observations.

In [8]:
data = CSV.File("../data/gun_clean.csv") |> DataFrame
size(data)

(3900, 415)

### Preprocessing

To account for heterogeneity across counties and time trends in  all variables, we remove from them county-specific and time-specific effects in the following preprocessing.

In [None]:
# #################################  Find Variable Names from Dataset ########################

# varlist <- function (df=NULL,type=c("numeric","factor","character"), pattern="", exclude=NULL) {
#   vars <- character(0)
#   if (any(type %in% "numeric")) {
#     vars <- c(vars,names(df)[sapply(df,is.numeric)])
#   }
#   if (any(type %in% "factor")) {
#     vars <- c(vars,names(df)[sapply(df,is.factor)])
#   }  
#   if (any(type %in% "character")) {
#     vars <- c(vars,names(df)[sapply(df,is.character)])
#   }  
#   vars[(!vars %in% exclude) & grepl(vars,pattern=pattern)]
# }

In [4]:
################################# Create Variables ###############################


# Dummy Variables for Year and County Fixed Effects
R = r"X_Jfips"
fixed = grep(R, names(data))
year = grep("X_Tyear", names(data))
println("Variable: <<Fixed>> has ", length(fixed), " features")
println("Variable: <<Year>> has ", length(year), " outcomes")

# Census Control Variables

census = []
census_var = ["AGE", "BN", "BP", "BZ", "ED", "EL", "HI", "HS", "INC", "LF", "LN", "PI", "PO", "PP", "PV", "SPR", "VS"]

for i in 1:size(census_var, 1) 
    append!(census, grep(census_var[i], names(data)))
end

println("Variable: <<Census>> has ", length(census), " features")


Variable: <<Fixed>> has 195 features
Variable: <<Year>> has 21 outcomes
Variable: <<Census>> has 186 features


In [9]:
################################ Variables ##################################
# Treatment Variable
d = "logfssl"

# Outcome Variable
y = "logghomr"

# Other Control Variables
X1 = ["logrobr", "logburg", "burg_missing", "robrate_missing"]
X2 = ["newblack", "newfhh", "newmove", "newdens", "newmal"]



#################################  Partial out Fixed Effects ########################

# New Dataset for Partiled-out Variables
rdata = DataFrame(CountyCode = data[:,"CountyCode"])

# Variables to be Partialled-out
varlist2 = vcat(y, d, X1, X2, census)
println("Variable: <<varlist2>> has ", length(varlist2), " features")

form = []

# Partial out Variables in varlist from year and county fixed effect
for i in 1:size(varlist2, 1)
    append!(form, [term.(varlist2[i]) ~ sum(term.(vcat(year, fixed)))])
    rdata[!, varlist2[i]] = residuals(lm(form[i], data))
end

rdata

Variable: <<varlist2>> has 197 features


Unnamed: 0_level_0,CountyCode,logghomr,logfssl,logrobr,logburg,burg_missing,robrate_missing
Unnamed: 0_level_1,Int64,Float64,Float64,Float64,Float64,Float64,Float64
1,1073,-0.134778,0.0961271,0.150893,-0.124395,0.0104613,-0.021229
2,1073,-0.239622,0.0808094,0.0401683,-0.134781,0.0104613,-0.0194181
3,1073,-0.0786772,0.0573399,-0.017679,-0.167909,0.0104613,-0.0220374
4,1073,-0.331465,0.0816945,-0.00963344,-0.22925,0.0104613,-0.0194181
5,1073,-0.31664,0.0253655,-0.0267151,-0.176635,0.00324793,-0.0208037
6,1073,0.105132,-0.00677726,-0.151487,-0.189069,0.0104613,0.016953
7,1073,-0.0373401,0.0773061,-0.166729,-0.117739,0.0104613,0.0245505
8,1073,-0.0520609,-0.108433,-0.0996453,-0.0833094,0.00448964,0.021457
9,1073,0.0547007,-0.0340988,0.151557,0.319282,-0.0448348,-0.0366629
10,1073,0.122094,-0.0824292,0.0476034,-0.0144728,-0.00233214,0.00765442


In [206]:
# load dataset
rdata_read = CSV.File("../data/gun_clean2.csv") |> DataFrame
data = rdata_read[!, names(rdata)]
n = size(data,1)

3900

In [207]:
column_names = names(data)

198-element Vector{String}:
 "CountyCode"
 "logghomr"
 "logfssl"
 "logrobr"
 "logburg"
 "burg_missing"
 "robrate_missing"
 "newblack"
 "newfhh"
 "newmove"
 "newdens"
 "newmal"
 "AGE010D"
 ⋮
 "PVY020D"
 "PVY120D"
 "PVY210D"
 "PVY310D"
 "PVY420D"
 "PVY520D"
 "SPR030D"
 "SPR130D"
 "SPR230D"
 "SPR330D"
 "SPR440D"
 "VST020D"

In [208]:
result = []

for i in 1:size(data,1)
    for j in 1:size(data,2)
        data[i,j] = round(data[i,j], digits=6)
        rdata[i,j] = round(rdata[i,j], digits=6)
    end
end

for col in column_names
    result = sum(data[!,col] .== rdata[!,col])

    if result .== 3900
        println("Column ", col,  " are equal at 6 decimals")
    else
        println("Column ", col,  " are not equal at 6 decimals")
    end

end

Column CountyCode are equal at 6 decimals
Column logghomr are equal at 6 decimals
Column logfssl are equal at 6 decimals
Column logrobr are equal at 6 decimals
Column logburg are equal at 6 decimals
Column burg_missing are equal at 6 decimals
Column robrate_missing are equal at 6 decimals
Column newblack are equal at 6 decimals
Column newfhh are equal at 6 decimals
Column newmove are equal at 6 decimals
Column newdens are equal at 6 decimals
Column newmal are equal at 6 decimals
Column AGE010D are equal at 6 decimals
Column AGE050D are equal at 6 decimals
Column AGE110D are equal at 6 decimals
Column AGE170D are equal at 6 decimals
Column AGE180D are equal at 6 decimals
Column AGE270D are equal at 6 decimals
Column AGE310D are equal at 6 decimals
Column AGE320D are equal at 6 decimals
Column AGE350D are equal at 6 decimals
Column AGE380D are equal at 6 decimals
Column AGE410D are equal at 6 decimals
Column AGE470D are equal at 6 decimals
Column AGE570D are equal at 6 decimals
Column AG

Now, we can construct the treatment variable, the outcome variable and the matrix $Z$ that includes the control variables.

In [209]:
# Treatment Variable
D = rdata[!, d]
println("Variable: <<D>> has ", size(D), " outcomes")

# Outcome Variable
Y = rdata[!, y]
println("Variable: <<Y>> has ", size(Y), " outcomes")

# Construct matrix Z
Z = rdata[!, vcat(X1, X2, census)]
println("Variable: <<Z>> has ", size(Z), " outcomes")

Variable: <<D>> has (3900,) outcomes
Variable: <<Y>> has (3900,) outcomes
Variable: <<Z>> has (3900, 195) outcomes


We have in total 195 control variables. The control variables $Z_{j,t}$ are from the U.S. Census Bureau and  contain demographic and economic characteristics of the counties such as  the age distribution, the income distribution, crime rates, federal spending, home ownership rates, house prices, educational attainment, voting paterns, employment statistics, and migration rates. 

In [210]:
clu = rdata[!, "CountyCode"] #for clustering the standard errors

3900-element Vector{Int64}:
  1073
  1073
  1073
  1073
  1073
  1073
  1073
  1073
  1073
  1073
  1073
  1073
  1073
     ⋮
 55133
 55133
 55133
 55133
 55133
 55133
 55133
 55133
 55133
 55133
 55133
 55133

In [211]:
data = DataFrame(hcat(clu, Y, D), [:CountyCode, :logghomr, :logfssl])
data = [data Z]

Unnamed: 0_level_0,CountyCode,logghomr,logfssl,logrobr,logburg,burg_missing,robrate_missing
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,1073.0,-0.134778,0.096127,0.150893,-0.124395,0.010461,-0.021229
2,1073.0,-0.239622,0.080809,0.040168,-0.134781,0.010461,-0.019418
3,1073.0,-0.078677,0.05734,-0.017679,-0.167909,0.010461,-0.022037
4,1073.0,-0.331465,0.081694,-0.009633,-0.22925,0.010461,-0.019418
5,1073.0,-0.31664,0.025366,-0.026715,-0.176635,0.003248,-0.020804
6,1073.0,0.105132,-0.006777,-0.151487,-0.189069,0.010461,0.016953
7,1073.0,-0.03734,0.077306,-0.166729,-0.117739,0.010461,0.02455
8,1073.0,-0.052061,-0.108433,-0.099645,-0.083309,0.00449,0.021457
9,1073.0,0.054701,-0.034099,0.151557,0.319282,-0.044835,-0.036663
10,1073.0,0.122094,-0.082429,0.047603,-0.014473,-0.002332,0.007654


In [99]:
# CSV.write("../data/gun_clean2.csv", data)

"../data/gun_clean2.csv"

## The effect of gun ownership

### OLS

After preprocessing the data, we first look at simple regression of $Y_{j,t}$ on $D_{j,t-1}$ without controls as a baseline model.

In [100]:
# # Run this line to avoid all the lines of code above
# data = CSV.File("../data/gun_clean2.csv") |> DataFrame

Unnamed: 0_level_0,CountyCode,logghomr,logfssl,logrobr,logburg,burg_missing,robrate_missing
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,1073.0,-0.134778,0.096127,0.150893,-0.124395,0.010461,-0.021229
2,1073.0,-0.239622,0.080809,0.040168,-0.134781,0.010461,-0.019418
3,1073.0,-0.078677,0.05734,-0.017679,-0.167909,0.010461,-0.022037
4,1073.0,-0.331465,0.081694,-0.009633,-0.22925,0.010461,-0.019418
5,1073.0,-0.31664,0.025366,-0.026715,-0.176635,0.003248,-0.020804
6,1073.0,0.105132,-0.006777,-0.151487,-0.189069,0.010461,0.016953
7,1073.0,-0.03734,0.077306,-0.166729,-0.117739,0.010461,0.02455
8,1073.0,-0.052061,-0.108433,-0.099645,-0.083309,0.00449,0.021457
9,1073.0,0.054701,-0.034099,0.151557,0.319282,-0.044835,-0.036663
10,1073.0,0.122094,-0.082429,0.047603,-0.014473,-0.002332,0.007654


In [212]:
unique(clu)

195-element Vector{Int64}:
  1073
  1097
  4019
  5119
  6001
  6013
  6019
  6029
  6037
  6053
  6059
  6065
  6067
     ⋮
 49035
 49049
 51059
 51710
 51810
 53033
 53053
 53061
 53063
 55025
 55079
 55133

In [213]:
using FixedEffectModels

In [276]:
#baseline_formula <- as.formula(paste(y, "~", d ))
#baseline.ols <- lm(baseline_formula,data=rdata)

fm_1 = @formula(logghomr ~ 0 + logfssl + fe(CountyCode))
baseline_ols = reg(data, fm_1, Vcov.cluster(:CountyCode))

                        Fixed Effect Model                        
Number of obs:              3900  Degrees of freedom:            2
R2:                        0.006  R2 Adjusted:               0.006
F-Stat:                  18.9732  p-value:                   0.000
R2 within:                 0.006  Iterations:                    1
logghomr | Estimate Std.Error t value Pr(>|t|) Lower 95% Upper 95%
------------------------------------------------------------------
logfssl  | 0.282304 0.0648108 4.35582    0.000  0.155238   0.40937


In [277]:
println("2.5% : ", coeftable(baseline_ols).cols[5])
println("97.5% : " , coeftable(baseline_ols).cols[6])
println("Estimate: ", coeftable(baseline_ols).cols[1])
println("Cluster s.e. : " , r2(baseline_ols))
println("T-value : ", coeftable(baseline_ols).cols[3])
println("Pr(>|t|) : " , coeftable(baseline_ols).cols[4])

2.5% : [0.15523789539597105]
97.5% : [0.4093704387686366]
Estimate: [0.2823041670823038]
Cluster s.e. : 0.006193251272214595
T-value : [4.355820406592104]
Pr(>|t|) : [1.3597928318408193e-5]


The point estimate is $0.282$ with the confidence interval ranging from 0.155 to 0.41. This
suggests that increases in gun ownership rates are related to gun homicide rates - if gun ownership increases by 1% relative
to a trend then the predicted gun homicide rate goes up by 0.28%, without controlling for counties' characteristics.

Since our goal is to estimate the effect of gun ownership after controlling for a rich set county characteristics we next include the controls. First, we estimate the model by ols and then by an array of the modern regression methods using the double machine learning approach.

In [216]:
# define the variables
y = "logghomr"

data_columns = names(data)
no_relev_col = ["CountyCode", "logghomr"]

# This gives us: new_list = ['carrot' , 'lemon']
z = data[!,Not(no_relev_col)]

Unnamed: 0_level_0,logfssl,logrobr,logburg,burg_missing,robrate_missing,newblack,newfhh
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.096127,0.150893,-0.124395,0.010461,-0.021229,0.030947,-0.020483
2,0.080809,0.040168,-0.134781,0.010461,-0.019418,0.030947,-0.020483
3,0.05734,-0.017679,-0.167909,0.010461,-0.022037,0.030947,-0.020483
4,0.081694,-0.009633,-0.22925,0.010461,-0.019418,0.030947,-0.020483
5,0.025366,-0.026715,-0.176635,0.003248,-0.020804,0.030947,-0.020483
6,-0.006777,-0.151487,-0.189069,0.010461,0.016953,0.030947,-0.020483
7,0.077306,-0.166729,-0.117739,0.010461,0.02455,0.030947,-0.020483
8,-0.108433,-0.099645,-0.083309,0.00449,0.021457,0.030947,-0.020483
9,-0.034099,0.151557,0.319282,-0.044835,-0.036663,0.030947,-0.020483
10,-0.082429,0.047603,-0.014473,-0.002332,0.007654,0.030947,-0.020483


In [298]:
terms = sum(term.(names(z)))

logfssl(unknown)
logrobr(unknown)
logburg(unknown)
burg_missing(unknown)
robrate_missing(unknown)
newblack(unknown)
newfhh(unknown)
newmove(unknown)
newdens(unknown)
newmal(unknown)
AGE010D(unknown)
AGE050D(unknown)
AGE110D(unknown)
AGE170D(unknown)
AGE180D(unknown)
AGE270D(unknown)
AGE310D(unknown)
AGE320D(unknown)
AGE350D(unknown)
AGE380D(unknown)
AGE410D(unknown)
AGE470D(unknown)
AGE570D(unknown)
AGE640D(unknown)
AGE670D(unknown)
AGE760D(unknown)
BNK010D(unknown)
BNK050D(unknown)
BPS030D(unknown)
BPS130D(unknown)
BPS230D(unknown)
BPS020D(unknown)
BPS120D(unknown)
BPS220D(unknown)
BPS820D(unknown)
BZA010D(unknown)
BZA110D(unknown)
BZA210D(unknown)
EDU100D(unknown)
EDU200D(unknown)
EDU600D(unknown)
EDU610D(unknown)
EDU620D(unknown)
EDU630D(unknown)
EDU635D(unknown)
EDU640D(unknown)
EDU650D(unknown)
EDU680D(unknown)
EDU685D(unknown)
ELE010D(unknown)
ELE020D(unknown)
ELE025D(unknown)
ELE030D(unknown)
ELE035D(unknown)
ELE060D(unknown)
ELE065D(unknown)
ELE210D(unknown)
ELE220D(unknown)
HI

In [316]:
names(data)[1:2]

2-element Vector{String}:
 "CountyCode"
 "logghomr"

In [330]:
a+term(:Coun)

logfssl(unknown)
logrobr(unknown)
Coun(unknown)

In [335]:
a + term(:Coun)

logfssl(unknown)
logrobr(unknown)
Coun(unknown)

In [323]:
a = sum(term.(names(z)[1:2]))

logfssl(unknown)
logrobr(unknown)

In [347]:
control_formula = term(:logghomr) ~ sum(term.(names(z)[1:end])) + fe(:CountyCode)

FormulaTerm
Response:
  logghomr(unknown)
Predictors:
  logfssl(unknown)
  logrobr(unknown)
  logburg(unknown)
  burg_missing(unknown)
  robrate_missing(unknown)
  newblack(unknown)
  newfhh(unknown)
  newmove(unknown)
  newdens(unknown)
  newmal(unknown)
  AGE010D(unknown)
  AGE050D(unknown)
  AGE110D(unknown)
  AGE170D(unknown)
  AGE180D(unknown)
  AGE270D(unknown)
  AGE310D(unknown)
  AGE320D(unknown)
  AGE350D(unknown)
  AGE380D(unknown)
  AGE410D(unknown)
  AGE470D(unknown)
  AGE570D(unknown)
  AGE640D(unknown)
  AGE670D(unknown)
  AGE760D(unknown)
  BNK010D(unknown)
  BNK050D(unknown)
  BPS030D(unknown)
  BPS130D(unknown)
  BPS230D(unknown)
  BPS020D(unknown)
  BPS120D(unknown)
  BPS220D(unknown)
  BPS820D(unknown)
  BZA010D(unknown)
  BZA110D(unknown)
  BZA210D(unknown)
  EDU100D(unknown)
  EDU200D(unknown)
  EDU600D(unknown)
  EDU610D(unknown)
  EDU620D(unknown)
  EDU630D(unknown)
  EDU635D(unknown)
  EDU640D(unknown)
  EDU650D(unknown)
  EDU680D(unknown)
  EDU685D(unknown)
  E

In [353]:
control_ols = reg(data, control_formula, Vcov.cluster(:CountyCode))

│                  model tests should be interpreted with caution.
└ @ Vcov C:\Users\sandr\.julia\packages\Vcov\8Fkqk\src\utils.jl:5


                               Fixed Effect Model                               
Number of obs:                     3900  Degrees of freedom:                 181
R2:                               0.203  R2 Adjusted:                      0.164
F-Stat:                      1.01727e12  p-value:                          0.000
R2 within:                        0.203  Iterations:                           1
logghomr        |   Estimate Std.Error      t value Pr(>|t|) Lower 95% Upper 95%
--------------------------------------------------------------------------------
logfssl         |   0.190671   7.06745    0.0269787    0.978  -13.6658   14.0471
logrobr         |    0.18903   10.5858     0.017857    0.986  -20.5655   20.9435
logburg         |   0.219294   4.03497    0.0543483    0.957  -7.69167   8.13026
burg_missing    |     1.5298    5.3219     0.287453    0.774  -8.90434   11.9639
robrate_missing |    1.13297   9.27664     0.122131    0.903  -17.0548   19.3208
newblack        |   -4.34682

In [354]:
println("For <<logfssl>> variable: ")
println("2.5% : ", coeftable(control_ols).cols[5][1])
println("97.5% : " , coeftable(control_ols).cols[6][1])
println("Estimate: ", coeftable(control_ols).cols[1][1])
println("Cluster s.e. : " , r2(control_ols))
println("T-value : ", coeftable(control_ols).cols[3][1])
println("Pr(>|t|) : " , coeftable(control_ols).cols[4][1])

For <<logfssl>> variable: 
2.5% : -13.665785879497696
97.5% : 14.04712694307223
Estimate: 0.19067053178726698
Cluster s.e. : 0.20296178723373282
T-value : 0.02697868976949486
Pr(>|t|) : 0.9784781784430563


After controlling for a rich set of characteristics, the point estimate of gun ownership reduces to $0.19$.

# DML algorithm

Here we perform inference of the predictive coefficient $\beta$ in our partially linear statistical model, 

$$
Y = D\beta + g(Z) + \epsilon, \quad E (\epsilon | D, Z) = 0,
$$

using the **double machine learning** approach. 

For $\tilde Y = Y- E(Y|Z)$ and $\tilde D= D- E(D|Z)$, we can write
$$
\tilde Y = \alpha \tilde D + \epsilon, \quad E (\epsilon |\tilde D) =0.
$$

Using cross-fitting, we employ modern regression methods
to build estimators $\hat \ell(Z)$ and $\hat m(Z)$ of $\ell(Z):=E(Y|Z)$ and $m(Z):=E(D|Z)$ to obtain the estimates of the residualized quantities:

$$
\tilde Y_i = Y_i  - \hat \ell (Z_i),   \quad \tilde D_i = D_i - \hat m(Z_i), \quad \text{ for each } i = 1,\dots,n.
$$

Finally, using ordinary least squares of $\tilde Y_i$ on $\tilde D_i$, we obtain the 
estimate of $\beta$.

The following algorithm comsumes $Y, D, Z$, and a machine learning method for learning the residuals $\tilde Y$ and $\tilde D$, where the residuals are obtained by cross-validation (cross-fitting). Then, it prints the estimated coefficient $\beta$ and the corresponding standard error from the final OLS regression.

In [86]:
I_1 <- c(1,2,3,4,5,6,7,8,9,10)
I_2 <- c(11,12,13,14,15,16,17,18,19,20)
I = c(I_1, I_2)
I

In [87]:
DML2.for.PLM <- function(z, d, y, dreg, yreg, nfold=2, clu) {
  nobs <- nrow(z) #number of observations
  foldid <- rep.int(1:nfold,times = ceiling(nobs/nfold))[sample.int(nobs)] #define folds indices
  I <- split(1:nobs, foldid)  #split observation indices into folds  
  ytil <- dtil <- rep(NA, nobs)
  cat("fold: ")
  for(b in 1:length(I)){
    dfit <- dreg(z[-I[[b]],], d[-I[[b]]]) #take a fold out
    yfit <- yreg(z[-I[[b]],], y[-I[[b]]]) # take a foldt out
    dhat <- predict(dfit, z[I[[b]],], type="response") #predict the left-out fold 
    yhat <- predict(yfit, z[I[[b]],], type="response") #predict the left-out fold  
    dtil[I[[b]]] <- (d[I[[b]]] - dhat) #record residual for the left-out fold
    ytil[I[[b]]] <- (y[I[[b]]] - yhat) #record residial for the left-out fold
    cat(b," ")
        }
  #rfit <- lm(ytil ~ dtil)    #estimate the main parameter by regressing one residual on the other
  data <- data.frame(cbind(ytil, dtil, as.matrix(clu)))
  rfit <- felm(ytil ~ dtil|0|0|CountyCode,data=data) 
  coef.est <- coef(rfit)[2]  #extract coefficient
  #HC <- vcovHC(rfit)
  se    <- summary(rfit,robust=T)$coefficients[2,2] #record robust standard error by County
  cat(sprintf("\ncoef (se) = %g (%g)\n", coef.est , se))  #printing output
  return( list(coef.est =coef.est , se=se, dtil=dtil, ytil=ytil, rfit=rfit) ) #save output and residuals 
}

Now, we apply the Double Machine Learning (DML) approach with different machine learning methods. First, we load the relevant libraries.

In [88]:
library(hdm)
library(glmnet)
library(sandwich)
library(randomForest)

Let us, construct the input matrices.

In [89]:
y <- as.matrix(Y)
d <- as.matrix(D)
z <- as.matrix(Z)
clu <- rdata[which(colnames(rdata) == "CountyCode")]
head(data.frame(cbind(y,d,as.matrix(clu))))

Unnamed: 0_level_0,logghomr,logfssl,CountyCode
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
1,-0.13477752,0.096127077,1073
2,-0.23962152,0.080809373,1073
3,-0.07867716,0.057339916,1073
4,-0.33146546,0.081694483,1073
5,-0.3166398,0.025365514,1073
6,0.1051319,-0.006777264,1073


In the following, we apply the DML approach with the differnt versions of lasso.


## Lasso

In [91]:
#DML with Lasso:
set.seed(123)
dreg <- function(z,d){ rlasso(z,d, post=FALSE) } #ML method= lasso from hdm 
yreg <- function(z,y){ rlasso(z,y, post=FALSE) } #ML method = lasso from hdm

In [92]:
DML2.lasso = DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu)

fold: 1  2  3  4  5  6  7  8  9  10  
coef (se) = 0.222959 (0.0570325)


In [93]:
#DML with Post-Lasso:
dreg <- function(z,d){ rlasso(z,d, post=T) } #ML method= lasso from hdm 
yreg <- function(z,y){ rlasso(z,y, post=T) } #ML method = lasso from hdm
DML2.post = DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu)

fold: 1  2  3  4  5  6  7  8  9  10  
coef (se) = 0.226934 (0.0561918)


In [94]:
#DML with cross-validated Lasso:
dreg <- function(z,d){ cv.glmnet(z,d,family="gaussian", alpha=1) } #ML method = lasso from glmnet 
yreg <- function(z,y){ cv.glmnet(z,y,family="gaussian", alpha=1) }  #ML method = lasso from glmnet 
DML2.lasso.cv = DML2.for.PLM(z, d, y, dreg, yreg, nfold=5, clu)

dreg <- function(z,d){ cv.glmnet(z,d,family="gaussian", alpha=0.5) } #ML method = elastic net from glmnet 
yreg <- function(z,y){ cv.glmnet(z,y,family="gaussian", alpha=0.5) }  #ML method = elastic net from glmnet 
DML2.elnet = DML2.for.PLM(z, d, y, dreg, yreg, nfold=5, clu)

dreg <- function(z,d){ cv.glmnet(z,d,family="gaussian", alpha=0) } #ML method = ridge from glmnet 
yreg <- function(z,y){ cv.glmnet(z,y,family="gaussian", alpha=0) }  #ML method = ridge from glmnet 
DML2.ridge = DML2.for.PLM(z, d, y, dreg, yreg, nfold=5, clu)

fold: 1  2  3  4  5  
coef (se) = 0.194926 (0.0569378)
fold: 1  2  3  4  5  
coef (se) = 0.208474 (0.0600804)
fold: 1  2  3  4  5  
coef (se) = 0.200234 (0.0598422)


Here we also compute DML with OLS used as the ML method

In [95]:
dreg <- function(z,d){  glmnet(z,d,family="gaussian", lambda=0) } #ML method = ols from glmnet 
yreg <- function(z,y){  glmnet(z,y,family="gaussian", lambda=0) }  #ML method = ols from glmnet 
DML2.ols = DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu)

fold: 1  2  3  4  5  6  7  8  9  10  
coef (se) = 0.203079 (0.051136)


Next, we also apply Random Forest for comparison purposes.

### Random Forest


In [66]:
#DML with Random Forest:
dreg <- function(z,d){ randomForest(z, d) } #ML method=Forest 
yreg <- function(z,y){ randomForest(z, y) } #ML method=Forest
set.seed(1)
DML2.RF = DML2.for.PLM(z, d, y, dreg, yreg, nfold=2, clu) # set to 2 due to computation time

fold: 1  2  
coef (se) = 0.153017 (0.0605311)


In [91]:
if (!is.null(d) && !is.factor(d))
             max(floor(ncol(z)/3), 1) else floor(sqrt(ncol(z)))

In [92]:
if (!is.null(d) && !is.factor(d)) 5 else 1

We conclude that the gun ownership rates are related to gun homicide rates - if gun ownership increases by 1% relative
to a trend then the predicted gun homicide rate goes up by about 0.20% controlling for counties' characteristics.

Finally, let's see which method is actually better. We compute RMSE for predicting D and Y, and see which
of the methods works better.


In [96]:
mods<- list(DML2.ols, DML2.lasso, DML2.post, DML2.lasso.cv, DML2.ridge, DML2.elnet, DML2.RF)

RMSE.mdl<- function(mdl) {
RMSEY <- sqrt(mean(mdl$ytil)^2) 
RMSED <- sqrt(mean(mdl$dtil)^2) 
return( list(RMSEY=RMSEY, RMSED=RMSED))
}

#RMSE.mdl(DML2.lasso)

#DML2.lasso$ytil

Res<- lapply(mods, RMSE.mdl)


prRes.Y<- c( Res[[1]]$RMSEY,Res[[2]]$RMSEY, Res[[3]]$RMSEY, Res[[4]]$RMSEY, Res[[5]]$RMSEY,  Res[[6]]$RMSEY, Res[[7]]$RMSEY)
prRes.D<- c( Res[[1]]$RMSED,Res[[2]]$RMSED, Res[[3]]$RMSED, Res[[4]]$RMSED, Res[[5]]$RMSED, Res[[6]]$RMSED, Res[[7]]$RMSED)

prRes<- rbind(prRes.Y, prRes.D); 
rownames(prRes)<- c("RMSE D", "RMSE Y");
colnames(prRes)<- c("OLS", "Lasso", "Post-Lasso", "CV Lasso", "CV Ridge", "CV Elnet", "RF")
print(prRes,digit=6)

               OLS       Lasso  Post-Lasso    CV Lasso    CV Ridge    CV Elnet
RMSE D 0.000407561 3.25471e-05 1.32656e-04 0.000376929 7.24337e-04 9.66559e-04
RMSE Y 0.000134575 3.35791e-05 6.89649e-05 0.000044933 8.41741e-05 5.80479e-19
               RF
RMSE D 0.01086246
RMSE Y 0.00152755


It looks like the best method for predicting D is Lasso, and the best method for predicting Y is CV Ridge.


In [97]:
dreg <- function(z,d){ rlasso(z,d, post=T) } #ML method= lasso from hdm 
yreg <- function(z,y){ cv.glmnet(z,y,family="gaussian", alpha=0) }  #ML method = ridge from glmnet 
DML2.best= DML2.for.PLM(z, d, y, dreg, yreg, nfold=10, clu)

fold: 1  2  3  4  5  6  7  8  9  10  
coef (se) = 0.222066 (0.0565614)


Let's organize the results in a table.

In [84]:
est_baseline

In [18]:
library(xtable)

table <- matrix(0,9,2)
table[1,1] <- as.numeric(est_baseline[1])
table[2,1] <- as.numeric(est_ols[1])
table[3,1]   <- as.numeric(DML2.lasso$coef.est)
table[4,1]   <- as.numeric(DML2.post$coef.est)
table[5,1]  <-as.numeric(DML2.lasso.cv$coef.est)
table[6,1] <-as.numeric(DML2.elnet$coef.est)
table[7,1] <-as.numeric(DML2.ridge$coef.est)
table[8,1] <-as.numeric(DML2.RF$coef.est)
table[9,1] <-as.numeric(DML2.best$coef.est)
table[1,2] <- as.numeric(est_baseline[2])
table[2,2] <- as.numeric(est_ols[2])
table[3,2]   <- as.numeric(DML2.lasso$se)
table[4,2]   <- as.numeric(DML2.post$se)
table[5,2]  <-as.numeric(DML2.lasso.cv$se)
table[6,2] <-as.numeric(DML2.elnet$se)
table[7,2] <-as.numeric(DML2.ridge$se)
table[8,2] <-as.numeric(DML2.RF$se)
table[9,2] <-as.numeric(DML2.best$se)




################################# Print Results #################################

colnames(table) <- c("Estimate","Standard Error")
rownames(table) <- c("Baseline OLS", "Least Squares with controls", "Lasso", "Post-Lasso", "CV Lasso","CV Elnet", "CV Ridge", "Random Forest", 
                     "Best")

table

Unnamed: 0,Estimate,Standard Error
Baseline OLS,0.2823045,0.0648108
Least Squares with controls,0.1906447,0.05244756
Lasso,0.2228074,0.05702673
Post-Lasso,0.2269338,0.05619181
CV Lasso,0.2004742,0.05764115
CV Elnet,0.206117,0.05746222
CV Ridge,0.2013789,0.05790663
Random Forest,0.1921739,0.05814101
Best,0.2190048,0.05721956


In [19]:
print(table, digit=3)


                            Estimate Standard Error
Baseline OLS                   0.282         0.0648
Least Squares with controls    0.191         0.0524
Lasso                          0.223         0.0570
Post-Lasso                     0.227         0.0562
CV Lasso                       0.200         0.0576
CV Elnet                       0.206         0.0575
CV Ridge                       0.201         0.0579
Random Forest                  0.192         0.0581
Best                           0.219         0.0572


In [20]:
tab<- xtable(table, digits=3)
print(tab, type="latex")

% latex table generated in R 3.6.3 by xtable 1.8-4 package
% Sat Feb 13 17:41:19 2021
\begin{table}[ht]
\centering
\begin{tabular}{rrr}
  \hline
 & Estimate & Standard Error \\ 
  \hline
Baseline OLS & 0.282 & 0.065 \\ 
  Least Squares with controls & 0.191 & 0.052 \\ 
  Lasso & 0.223 & 0.057 \\ 
  Post-Lasso & 0.227 & 0.056 \\ 
  CV Lasso & 0.200 & 0.058 \\ 
  CV Elnet & 0.206 & 0.057 \\ 
  CV Ridge & 0.201 & 0.058 \\ 
  Random Forest & 0.192 & 0.058 \\ 
  Best & 0.219 & 0.057 \\ 
   \hline
\end{tabular}
\end{table}
