This notebook contains an example for teaching.

# The Effect of Gun Ownership on Gun-Homicide Rates - proceeding

In this lab, we estimate the effect of gun ownership on the homicide rate by a neural network.

In [55]:
using Pkg
Pkg.add("CSV"), using CSV
Pkg.add("DataFrames"), using DataFrames
Pkg.add("StatsModels"), using StatsModels
Pkg.add("GLM"), using GLM
Pkg.add("Random"), using Random
using Statistics, Plots, FixedEffectModels, MLDataUtils, MLBase

[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `C:\Users\PC\.julia\environments\v1.6\Project.toml`
[32m[1m  No Changes[22m[39m to `C:\Users\PC\.julia\environments\v1.6\Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `C:\Users\PC\.julia\environments\v1.6\Project.toml`
[32m[1m  No Changes[22m[39m to `C:\Users\PC\.julia\environments\v1.6\Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `C:\Users\PC\.julia\environments\v1.6\Project.toml`
[32m[1m  No Changes[22m[39m to `C:\Users\PC\.julia\environments\v1.6\Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `C:\Users\PC\.julia\environments\v1.6\Project.toml`
[32m[1m  No Changes[22m[39m to `C:\Users\PC\.julia\environments\v1.6\Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `C:\U

First, we need to load and preprocess the data.

In [77]:
data = CSV.File("../data/gun_clean.csv") |> DataFrame;
println("Number of rows: ",size(data,1))
println("Number of columns: ",size(data,2))

Number of rows: 3900
Number of columns: 415


In [78]:
################################# Create Variables ###############################

# Dummy Variables for Year and County Fixed Effects
fixed = filter(x->contains(x, "X_Jfips"), names(data));
year = filter(x->contains(x, "X_Tyear"), names(data));

In [79]:
census = []
census_var = ["AGE", "BN", "BP", "BZ", "ED", "EL", "HI", "HS", "INC", "LF", "LN", "PI", "PO", "PP", "PV", "SPR", "VS"]

for i in 1:size(census_var,1) 
    append!(census, filter(x->contains(x, census_var[i]), names(data)))
end

In [80]:
################################ Variables ##################################

# Treatment Variable
d = ["logfssl"];

# Outcome Variable
y = ["logghomr"];

# Other Control Variables
X1 = ["logrobr", "logburg", "burg_missing", "robrate_missing"];
X2 = ["newblack", "newfhh", "newmove", "newdens", "newmal"];

In [81]:
#################################  Partial out Fixed Effects ########################

# Variables to be Partialled-out
variable = [y, d,X1, X2, census]
varlis = []

# Partial out Variables in varlist from year and county fixed effect
for i in variable
    append!(varlis,i)
end

In [82]:
# Running the following lines takes aprox. 10 minutes (depends on your CPU)

example = DataFrame(CountyCode = data[:,"CountyCode"]);
rdata = DataFrame(CountyCode = data[:,"CountyCode"]);

for i in 1:size(varlis,1)
    rdata[!,varlis[i]]= residuals(lm(term(Symbol(varlis[i])) ~ sum(term.(Symbol.(year))) + sum(term.(Symbol.(fixed))), data))
end

# DML for neural nets

The following algorithm comsumes $Y$,$D$ and $Z$, and learns the residuals $\tilde{Y}$ and $\tilde{D}$ via a neural network, where the residuals are obtained by cross-validation (cross-fitting). Then, it prints the estimated coefficient β and the clustered standard error from the final OLS regression.

In [67]:
using Flux
using Flux: crossentropy, @epochs
using Flux.Data: DataLoader
using Flux: throttle
using Flux: onehotbatch, onecold, @epochs
using StatsBase



In [88]:
mean_1 = mean.(eachcol(Z))
    mean_1 = [names(Z) mean_1]

    std_1 = std.(eachcol(Z))
    std_1 = [names(Z) std_1]

    df = DataFrame()
    for i in 1:size(Z)[2]
        p = (Z[!, i] .- mean_1[i,2]) / std_1[i,2]
        colname = names(Z)[i]
        df[!,colname] = p
    end
    Z = df
    z = Matrix(Z);

In [118]:
function DML2_for_NN(z , d , y, nfold, clu, num_epochs)
    
    # Num ob observations
    nobser = size(z,1)
    
    # Define folds indices 
    foldid = collect(Kfold(size(z)[1], nfold))
    
    # Create array to save errors 
    ytil = ones(nobser)
    dtil = ones(nobser)
    println("Folds: " )
    
    # loop to save results
    for i in 1:nfold
        ##############################################
        ################| MODEL D |###################
        model_y= Chain(Dense(size(z,2), 16, relu), 
        Dense(16, 16, relu),
        Dense(16, 1))

        opt = RMSProp()
        loss_y(x, y) = Flux.Losses.mse(model_y(x), y)
        metrics_y(x, y) = Flux.mae(model_y(x), y)
        ps_y = Flux.params(model_y)

        ##############################################
        ################| MODEL Y |###################
        model_d= Chain(Dense(size(z,2), 16, relu), 
        Dense(16, 16, relu),
        Dense(16, 1))

        opt = RMSProp()
        loss_d(x, y) = Flux.Losses.mse(model_d(x), y)
        metrics_d(x, y) = Flux.mae(model_d(x), y)
        ps_d = Flux.params(model_d)

        data_d = DataLoader((z[foldid[i],:]', d[foldid[i]]'))
        data_y = DataLoader((z[foldid[i],:]', y[foldid[i]]'))

    # Lasso regression, excluding folds selected 
    for epoch in 1:num_epochs
        time = @elapsed Flux.train!(loss_y, ps_y, data_y, opt)
    end

    for epoch in 1:num_epochs
        time = @elapsed Flux.train!(loss_d, ps_d, data_d, opt)
    end

    # Predict estimates using the 
    yhat = model_y(z[Not(foldid[i]),:]')';
    ###############################################################################
    dhat = model_d(z[Not(foldid[i]),:]')';
    
        
    # Save errors 
    dtil[Not(foldid[i])] = (d[Not(foldid[i])] - dhat)
    ytil[Not(foldid[i])] = (y[Not(foldid[i])] - yhat)

    println(i)
    end
    
    # Create dataframe 
    data = DataFrame(ytil = ytil, dtil = dtil, clu=clu)
    
    # OLS clustering at the County level
    rfit = reg(data, @formula(ytil ~ dtil +fe(clu)))
    coef_est = coef(rfit)[1]
    se = FixedEffectModels.coeftable(rfit).cols[2]

    println(" coef (se) = ", coef_est ,"(",se,")")
    
    #return rfit, data;
    
end

DML2_for_NN (generic function with 1 method)

# Estimating the effect with DLM for neural nets

In [34]:
# Treatment variable
D = rdata[!,d]

# Outcome variable

Y = rdata[!,y];

# Construct matrix Z
Z = rdata[!, varlis[3:end]];


In [103]:
# Create main variables
z = Matrix(Z);
d = D[!,1];
y = Y[!,1];
clu = rdata[!, :CountyCode];
first(DataFrame(logghomr = y,logfssl = d,CountyCode = clu ),6)

Unnamed: 0_level_0,logghomr,logfssl,CountyCode
Unnamed: 0_level_1,Float64,Float64,Int64
1,-0.134778,0.0961271,1073
2,-0.239622,0.0808094,1073
3,-0.0786772,0.0573399,1073
4,-0.331465,0.0816945,1073
5,-0.31664,0.0253655,1073
6,0.105132,-0.00677726,1073


In [119]:
DML2_for_NN(z,d,y,10,clu,100)

Folds: 
1
2
3
4
5
6
7
8
9
10
 coef (se) = -0.09614707348813742([0.05693337462206755])
