#### Author: Roberto Menzoa
#### Date: 05/03/2022
#### Topic: Analyzing RCT data with Precision Adjustment

In [492]:
#import Pkg; Pkg.add("DataFrames")
#import Pkg; Pkg.add("FilePaths")
#import Pkg; Pkg.add("Queryverse")
#import Pkg; Pkg.add("GLM")
#import Pkg; Pkg.add("StatsModels")
#import Pkg; Pkg.add("Combinatorics")
#import Pkg; Pkg.add("Iterators")
#import Pkg; Pkg.add("CategoricalArrays")
#import Pkg; Pkg.add("StatsBase")
#import Pkg; Pkg.add("Lasso")
#import Pkg; Pkg.add("TypedTables")
#import Pkg; Pkg.add("MacroTools")
using GLM
using DelimitedFiles, DataFrames, Lasso
using FilePaths
using Queryverse
using StatsModels, Combinatorics
using CategoricalArrays
using StatsBase, Statistics
using TypedTables
using MacroTools

In [367]:
pwd()

"C:\\Users\\Roberto Carlos\\Documents\\GitHub\\14.38_Causal_ML\\Julia_Notebooks"

In [499]:
# Loading data

mat, head = readdlm("../data/penn_jae.dat", header=true, Float64)
mat
df =DataFrame(mat, vec(head))
describe(df)

Unnamed: 0_level_0,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Float64,Float64,Float64,Float64,Int64,DataType
1,abdt,10693.6,10404.0,10691.0,10880.0,0,Float64
2,tg,2.56889,0.0,2.0,6.0,0,Float64
3,inuidur1,12.9148,1.0,10.0,52.0,0,Float64
4,inuidur2,12.1938,0.0,9.0,52.0,0,Float64
5,female,0.402142,0.0,0.0,1.0,0,Float64
6,black,0.116653,0.0,0.0,1.0,0,Float64
7,hispanic,0.0363689,0.0,0.0,1.0,0,Float64
8,othrace,0.00575002,0.0,0.0,1.0,0,Float64
9,dep,0.444045,0.0,0.0,2.0,0,Float64
10,q1,0.0136563,0.0,0.0,1.0,0,Float64


In [369]:
#dimenntions of dataframe 

a = size(df,1)
b =  size(df,2)

23

In [370]:
# Filter control group and just treatment group number 4

penn = filter(row -> row[:tg] in [4,0], df)

first(penn,20)

Unnamed: 0_level_0,abdt,tg,inuidur1,inuidur2,female,black,hispanic,othrace,dep
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,10824.0,0.0,18.0,18.0,0.0,0.0,0.0,0.0,2.0
2,10824.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,10747.0,0.0,27.0,27.0,0.0,0.0,0.0,0.0,0.0
4,10607.0,4.0,9.0,9.0,0.0,0.0,0.0,0.0,0.0
5,10831.0,0.0,27.0,27.0,0.0,0.0,0.0,0.0,1.0
6,10845.0,0.0,27.0,27.0,1.0,0.0,0.0,0.0,0.0
7,10831.0,0.0,9.0,9.0,1.0,0.0,0.0,0.0,1.0
8,10859.0,0.0,27.0,27.0,1.0,0.0,0.0,0.0,1.0
9,10516.0,0.0,15.0,15.0,1.0,0.0,0.0,0.0,0.0
10,10663.0,0.0,28.0,11.0,1.0,0.0,0.0,0.0,0.0


In [371]:
# Treatment group n°4
replace!(penn.tg, 4 => 1)
rename!(penn, "tg" => "T4")


# from float to string
penn[!,:dep] = string.(penn[!,:dep]) 

# dep varaible in categorical format 
penn[!,:dep] = categorical(penn[!,:dep])

describe(penn)

Unnamed: 0_level_0,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,abdt,10695.4,10404.0,10698.0,10880.0,0,Float64
2,T4,0.342224,0.0,0.0,1.0,0,Float64
3,inuidur1,13.053,1.0,11.0,52.0,0,Float64
4,inuidur2,12.2812,0.0,10.0,52.0,0,Float64
5,female,0.404001,0.0,0.0,1.0,0,Float64
6,black,0.121985,0.0,0.0,1.0,0,Float64
7,hispanic,0.0325554,0.0,0.0,1.0,0,Float64
8,othrace,0.00725632,0.0,0.0,1.0,0,Float64
9,dep,,0.0,,2.0,0,"CategoricalValue{String, UInt32}"
10,q1,0.0127476,0.0,0.0,1.0,0,Float64


### 2.0 Carry out covariate balance check

In [372]:
# couples variables combinations 
combinations_upto(x, n) = Iterators.flatten(combinations(x, i) for i in 1:n)

# combinations without same couple
expand_exp(args, deg::ConstantTerm) =
    tuple(((&)(terms...) for terms in combinations_upto(args, deg.n))...)

StatsModels.apply_schema(t::FunctionTerm{typeof(^)}, sch::StatsModels.Schema, ctx::Type) =
    apply_schema.(expand_exp(t.args_parsed...), Ref(sch), ctx)

In [373]:
# linear regression

reg1 = @formula(T4 ~ (female+black+othrace+dep+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2)
reg1 = apply_schema(reg1, schema(reg1, penn))

FormulaTerm
Response:
  T4(continuous)
Predictors:
  female(continuous)
  black(continuous)
  othrace(continuous)
  dep(DummyCoding:3→2)
  q2(continuous)
  q3(continuous)
  q4(continuous)
  q5(continuous)
  q6(continuous)
  agelt35(continuous)
  agegt54(continuous)
  durable(continuous)
  lusd(continuous)
  husd(continuous)
  female(continuous) & black(continuous)
  female(continuous) & othrace(continuous)
  female(continuous) & dep(DummyCoding:3→2)
  female(continuous) & q2(continuous)
  female(continuous) & q3(continuous)
  female(continuous) & q4(continuous)
  female(continuous) & q5(continuous)
  female(continuous) & q6(continuous)
  female(continuous) & agelt35(continuous)
  female(continuous) & agegt54(continuous)
  female(continuous) & durable(continuous)
  female(continuous) & lusd(continuous)
  female(continuous) & husd(continuous)
  black(continuous) & othrace(continuous)
  black(continuous) & dep(DummyCoding:3→2)
  black(continuous) & q2(continuous)
  black(continuous) & q3(

In [374]:
m1 = lm(reg1, penn)

StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}}

T4 ~ 1 + female + black + othrace + dep + q2 + q3 + q4 + q5 + q6 + agelt35 + agegt54 + durable + lusd + husd + female & black + female & othrace + female & dep + female & q2 + female & q3 + female & q4 + female & q5 + female & q6 + female & agelt35 + female & agegt54 + female & durable + female & lusd + female & husd + black & othrace + black & dep + black & q2 + black & q3 + black & q4 + black & q5 + black & q6 + black & agelt35 + black & agegt54 + black & durable + black & lusd + black & husd + othrace & dep + othrace & q2 + othrace & q3 + othrace & q4 + othrace & q5 + othrace & q6 + othrace & agelt35 + othrace & agegt54 + othrace & durable + othrace & lusd + othrace & husd + dep & q2 + dep & q3 + dep & q4 + dep & q5 + dep & q6 + dep & agelt35 + dep & agegt54 + dep & durable + dep & lusd + dep & husd + q2 & q

### 3.0 Model specification

In [375]:
# No adjustment (2-sample approach)

ols_cl = lm(@formula(log(inuidur1) ~ T4), penn)

# adding controls
# Omitted dummies: q1, nondurable, muld

reg2 = @formula(log(inuidur1) ~ T4 + (female+black+othrace+dep+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)^2)
reg2 = apply_schema(reg2, schema(reg2, penn))

ols_cra = lm(reg2, penn)


StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}}

:(log(inuidur1)) ~ 1 + T4 + female + black + othrace + dep + q2 + q3 + q4 + q5 + q6 + agelt35 + agegt54 + durable + lusd + husd + female & black + female & othrace + female & dep + female & q2 + female & q3 + female & q4 + female & q5 + female & q6 + female & agelt35 + female & agegt54 + female & durable + female & lusd + female & husd + black & othrace + black & dep + black & q2 + black & q3 + black & q4 + black & q5 + black & q6 + black & agelt35 + black & agegt54 + black & durable + black & lusd + black & husd + othrace & dep + othrace & q2 + othrace & q3 + othrace & q4 + othrace & q5 + othrace & q6 + othrace & agelt35 + othrace & agegt54 + othrace & durable + othrace & lusd + othrace & husd + dep & q2 + dep & q3 + dep & q4 + dep & q5 + dep & q6 + dep & agelt35 + dep & agegt54 + dep & durable + dep & lusd + 

In [493]:

function desv_mean(a)
    A = mean(a, dims = 1)
    M = zeros(Float64, size(X,1), size(X,2))
    for i in 1:size(a,2)
          M[:,i] = a[:,i] .- A[i]
    end
    return M
end    

X = StatsModels.modelmatrix(reg1.rhs,penn)
X = desv_mean(X)

X = DataFrame(X, :auto)

Unnamed: 0_level_0,x1,x2,x3,x4,x5,x6,x7,x8
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,-0.404001,-0.121985,-0.00725632,-0.112179,0.836242,-0.203765,-0.235536,-0.225927
2,-0.404001,-0.121985,-0.00725632,-0.112179,-0.163758,-0.203765,-0.235536,-0.225927
3,-0.404001,-0.121985,-0.00725632,-0.112179,-0.163758,-0.203765,-0.235536,0.774073
4,-0.404001,-0.121985,-0.00725632,-0.112179,-0.163758,-0.203765,0.764464,-0.225927
5,-0.404001,-0.121985,-0.00725632,0.887821,-0.163758,-0.203765,-0.235536,-0.225927
6,0.595999,-0.121985,-0.00725632,-0.112179,-0.163758,-0.203765,-0.235536,-0.225927
7,0.595999,-0.121985,-0.00725632,0.887821,-0.163758,-0.203765,-0.235536,-0.225927
8,0.595999,-0.121985,-0.00725632,0.887821,-0.163758,-0.203765,-0.235536,-0.225927
9,0.595999,-0.121985,-0.00725632,-0.112179,-0.163758,0.796235,-0.235536,-0.225927
10,0.595999,-0.121985,-0.00725632,-0.112179,-0.163758,-0.203765,0.764464,-0.225927


In [464]:
var = Symbol(var)

Symbol("[\"x1\", \"x2\", \"x3\", \"x4\", \"x5\", \"x6\", \"x7\", \"x8\", \"x9\", \"x10\", \"x11\", \"x12\", \"x13\", \"x14\", \"x15\", \"x16\", \"x17\", \"x18\", \"x19\", \"x20\", \"x21\", \"x22\", \"x23\", \"x24\", \"x25\", \"x26\", \"x27\", \"x28\", \"x29\", \"x30\", \"x31\", \"x32\", \"x33\", \"x34\", \"x35\", \"x36\", \"x37\", \"x38\", \"x39\", \"x40\", \"x41\", \"x42\", \"x43\", \"x44\", \"x45\", \"x46\", \"x47\", \"x48\", \"x49\", \"x50\", \"x51\", \"x52\", \"x53\", \"x54\", \"x55\", \"x56\", \"x57\", \"x58\", \"x59\", \"x60\", \"x61\", \"x62\", \"x63\", \"x64\", \"x65\", \"x66\", \"x67\", \"x68\", \"x69\", \"x70\", \"x71\", \"x72\", \"x73\", \"x74\", \"x75\", \"x76\", \"x77\", \"x78\", \"x79\", \"x80\", \"x81\", \"x82\", \"x83\", \"x84\", \"x85\", \"x86\", \"x87\", \"x88\", \"x89\", \"x90\", \"x91\", \"x92\", \"x93\", \"x94\", \"x95\", \"x96\", \"x97\", \"x98\", \"x99\", \"x100\", \"x101\", \"x102\", \"x103\", \"x104\", \"x105\", \"x106\", \"x107\", \"x108\", \"x109\", \"x110\",

In [498]:
Y = select(penn, [:inuidur1,:T4])

base = hcat(Y, X)
var = names(base[:,3:121])

#interactive regression model

reg3 = @formula(log(inuidur1) ~ T4*(x1+x2+x4+x5+x6+x7+x8+x9+x10+x119))
ols_ira = lm(reg3, base)

LoadError: MethodError: no method matching |(::Float64, ::Float64)
[0mClosest candidates are:
[0m  |(::Any, ::Any, [91m::Any[39m, [91m::Any...[39m) at C:\Users\Roberto Carlos\AppData\Local\Programs\Julia-1.7.2\share\julia\base\operators.jl:655
[0m  |(::T1, [91m::DataValue{T2}[39m) where {T1<:Number, T2<:Number} at C:\Users\Roberto Carlos\.julia\packages\DataValues\N7oeL\src\scalar\core.jl:213
[0m  |([91m::DataValue{T1}[39m, ::T2) where {T1<:Number, T2<:Number} at C:\Users\Roberto Carlos\.julia\packages\DataValues\N7oeL\src\scalar\core.jl:212
[0m  ...

In [484]:
#Lasso Regression HDM library from R

X = StatsModels.modelmatrix(reg2.rhs,penn)
X = desv_mean(X)
X = DataFrame(X, :auto)

Y = select(penn, [:inuidur1])

base = hcat(Y, X)
rename!(base, Dict(:x1 => :T4))

# Se debe incluir 119 varaibles, pero no he podido encontrar un shortcase

lasso = fit(LassoModel, @formula(log(inuidur1) ~ T4*(x2+x4+x5+x6+x7+x8+x9+x10+x119)), base)
coef(lasso)[2]

-0.06693920714769246

In [485]:
# Comparative ATE estimation

t = Table(outcome = ["Estimate", "Standard error"], 
    CL = [GLM.coeftable(ols_cl).cols[1][2],GLM.coeftable(ols_cl).cols[2][2]], 
    CRA = [GLM.coeftable(ols_cra).cols[1][2],GLM.coeftable(ols_cra).cols[2][2]], 
    IRA = [GLM.coeftable(ols_ira).cols[1][2],GLM.coeftable(ols_ira).cols[2][2]], 
    IRA_w_Lasso = [coef(lasso)[2],0])

#ols_cl
#GLM.coeftable(ols_cl).cols[1][2]
#[2]

Table with 5 columns and 2 rows:
     outcome         CL          CRA         IRA         IRA_w_Lasso
   ┌────────────────────────────────────────────────────────────────
 1 │ Estimate        -0.0854554  -0.0796801  -0.0807984  -0.0669392
 2 │ Standard error  0.0358387   0.0356467   0.0357938   0.0