In [32]:
using CSV, DataFrames, Statistics
using Plots, StatsPlots
using ColorSchemes, Measures

"""
    se(x)

Return the standard error of the mean of `x`: `std(x)` divided by the
square root of the number of elements in `x`.
"""
se(x) = std(x) / sqrt(length(x))

se (generic function with 1 method)

In [33]:
# Load the results of every synthetic experiment and stack them into one table.
# Each file is read exactly once. The columns (and their order) of the first
# file are the reference; every other file is restricted to those columns
# before stacking, so files may carry extra columns but not fewer.
result_dirs = ["linear_mar/", "linear_censoring/", "nn_mar/", "nn_censoring/"]
results = [CSV.read("aistats-rev/" * "synthetic/" * dir * "FINAL_results.csv", DataFrame)
           for dir in result_dirs]
refcols = names(first(results))
# A single reduce(vcat, ...) avoids re-binding `df` inside a top-level loop
# and the repeated copies made by growing the table one file at a time.
df = reduce(vcat, [res[:, refcols] for res in results]);

#### Formatting fixes

In [None]:
# Derive the integer column :n from the second "_"-separated token of the
# dataset name (presumably the sample size — confirm against the data files).
df[!, :n] .= [parse(Int, split(name, "_")[2]) for name in df[:, :dataset]]

# Replace NaN entries of the :r2 and :osr2 columns by 0.
for col in (:r2, :osr2)
    df[isnan.(df[:, col]), col] .= 0.
end

# Keep only rows with pMissing <= 0.8 and n > 20.
filter!(row -> row[:pMissing] <= 0.8 && row[:n] > 20, df) ;

# Method names are rewritten in the "Rename methods" cell below.

#### Rename methods and generate the "best" variants

In [None]:
# Rename methods
# Substring rewrite rules for the :method column. They are applied one after
# another, in this order, to every method name. The order matters: "Static"
# is rewritten last, so the "Affine" it introduces is not prefixed again.
# NOTE: the "Affine" and "Finite" rules re-insert their own pattern, so this
# cell is not idempotent — running it twice prefixes those names twice.
renaming = [
    "Imp-then-Reg 4" => "Mean Impute-then-Regress",
    "Imp-then-Reg 2" => "mice Impute-then-Regress",
    "Joint Imp-then-Reg" => "Joint Impute-then-Regress",
    "Affine" => "Adaptive LR - Affine",
    "Finite" => "Adaptive LR - Finite",
    "Static" => "Adaptive LR - Affine intercept only",
]
for rule in renaming
    df[!, :method] .= [replace(name, rule) for name in df[:, :method]]
end

In [None]:
# For every method family (name prefix), build a " - best" variant: within each
# experimental configuration, keep the single row of that family with the
# highest :score, relabel it "<prefix> - best", and append those rows to df.
for method in ["Adaptive LR"]
    candidates = filter(row -> startswith(row[:method], method), df)

    # Columns that identify one experimental configuration.
    idcols = [:dataset, :X_setting, :Y_setting, :n, :SNR, :k, :pMissing, :splitnum]

    # Empty table with the same schema, filled with one winning row per group.
    best = similar(candidates, 0)
    for grp in groupby(candidates, idcols)
        _, winner = findmax(grp[:, :score])
        push!(best, grp[winner, names(best)])
    end
    best[!, :method] .= method * " - best"

    df = vcat(df, best)
end

## Section 3: Evaluation of joint impute-then-regress strategies

#### Figure 1: Plot R2 vs pMissing, linear Y, adaptive LR vs. heuristic for joint

Linear Y

In [None]:
# Restrict to the linear-Y experiments and to the three methods compared in
# the figure, then compute mean and standard error of the out-of-sample R2
# for every (pMissing, method, X_setting) combination.
lin_methods = ["Adaptive LR - Affine intercept only",
    "Joint Impute-then-Regress - best",
    "Mean Impute-then-Regress - best"]
df_lin = filter(row -> row[:Y_setting] == "syn_Y_linear" && row[:method] ∈ lin_methods, df)

gd = groupby(df_lin, [:pMissing, :method, :X_setting])
stats = combine(gd, :osr2 => mean, :osr2 => se) ;

In [None]:
# Grouped bar chart of the mean out-of-sample R2 (error bars: standard error)
# against pMissing, one bar per method, for the "syn_X_mar" rows of `stats`.
# NOTE(review): `pal` is not defined in any cell shown here — confirm a
# palette is assigned to it before this cell runs.
stats_mar = filter(row -> row[:X_setting] == "syn_X_mar", stats)
@df stats_mar groupedbar(:pMissing, :osr2_mean,
    yerr = :osr2_se, group = :method, legend = :bottomleft,
    guidefontsize = 12, tickfontsize = 12, legendfontsize = 11,
    color = [pal[3] pal[1] pal[2]])
xaxis!("Fraction of missing entries")
yaxis!("Out-of-sample R2", ylims = (0.2, 0.72))

In [None]:
# Create the output directory if needed, then export the current figure.
figdir = "../figures/adaptive_regression/validation"
mkpath(figdir)
pdf_path = "../figures/adaptive_regression/validation/synthetic_linearY_mar.pdf"
Plots.savefig(pdf_path)

In [None]:
# Grouped bar chart of the mean out-of-sample R2 (error bars: standard error)
# against pMissing, one bar per method, for the "syn_X_censoring" rows of `stats`.
# NOTE(review): `pal` is not defined in any cell shown here — confirm a
# palette is assigned to it before this cell runs.
stats_censoring = filter(row -> row[:X_setting] == "syn_X_censoring", stats)
@df stats_censoring groupedbar(:pMissing, :osr2_mean,
    yerr = :osr2_se, group = :method, legend = :bottomleft,
    guidefontsize = 12, tickfontsize = 12, legendfontsize = 11,
    color = [pal[3] pal[1] pal[2]])
xaxis!("Fraction of missing entries")
yaxis!("Out-of-sample R2", ylims = (0.2, 0.72))

In [None]:
# Export the current figure as PDF.
pdf_path = "../figures/adaptive_regression/validation/synthetic_linearY_censoring.pdf"
Plots.savefig(pdf_path)

NN Y

In [None]:
# Restrict to the NN-Y experiments and to the two impute-then-regress
# variants, then compute mean and standard error of the out-of-sample R2
# for every (pMissing, method, X_setting) combination.
nn_methods = [
    "Joint Impute-then-Regress - best",
    "Mean Impute-then-Regress - best"]
df_nn = filter(row -> row[:Y_setting] == "syn_Y_nn" && row[:method] ∈ nn_methods, df)

gd = groupby(df_nn, [:pMissing, :method, :X_setting])
stats = combine(gd, :osr2 => mean, :osr2 => se)

In [None]:
# Grouped bar chart of the mean out-of-sample R2 (error bars: standard error)
# against pMissing, one bar per method, for the "syn_X_mar" rows of `stats`.
# NOTE(review): `pal` is not defined in any cell shown here — confirm a
# palette is assigned to it before this cell runs.
stats_mar = filter(row -> row[:X_setting] == "syn_X_mar", stats)
@df stats_mar groupedbar(:pMissing, :osr2_mean,
    yerr = :osr2_se, group = :method, legend = :bottomleft,
    guidefontsize = 12, tickfontsize = 12, legendfontsize = 11,
    color = [pal[1] pal[2]])
xaxis!("Fraction of missing entries")
yaxis!("Out-of-sample R2", ylims = (0, 0.55))

In [None]:
# Export the current figure as PDF.
pdf_path = "../figures/adaptive_regression/validation/synthetic_nnY_mar.pdf"
Plots.savefig(pdf_path)

In [None]:
# Grouped bar chart of the mean out-of-sample R2 (error bars: standard error)
# against pMissing, one bar per method, for the "syn_X_censoring" rows of `stats`.
# NOTE(review): `pal` is not defined in any cell shown here — confirm a
# palette is assigned to it before this cell runs.
stats_censoring = filter(row -> row[:X_setting] == "syn_X_censoring", stats)
@df stats_censoring groupedbar(:pMissing, :osr2_mean, yerr = :osr2_se,
    group = :method, legend = :bottomleft,
    guidefontsize = 12, tickfontsize = 12, legendfontsize = 11,
    color = [pal[1] pal[2]])
xaxis!("Fraction of missing entries")
yaxis!("Out-of-sample R2", ylims = (0.3, 0.58))

In [None]:
# Export the current figure as PDF.
pdf_path = "../figures/adaptive_regression/validation/synthetic_nnY_censoring.pdf"
Plots.savefig(pdf_path)

#### Output 2: Summary Table

In [None]:
# Rows entering the summary table: the listed methods, restricted to
# pMissing < 0.9 and n <= 1000.
table_methods = ["Adaptive LR - best",
    "CART MIA",
    "RF MIA",
    "XGBoost",
    "Joint Impute-then-Regress - best",
    "Mean Impute-then-Regress - best",
    "mice Impute-then-Regress - best"]
subdf = filter(
    row -> row[:method] ∈ table_methods && row[:pMissing] < 0.9 && row[:n] <= 1000,
    df)

# Mean and standard error of the out-of-sample R2 per (X_setting, Y_setting, method).
gd = groupby(subdf, [:X_setting, :Y_setting, :method])
stats = combine(gd, :osr2 => mean, :osr2 => se)

In [None]:
# Build the LaTeX body of the summary table from `stats`: one row per method
# (sorted by name), one "mean (se)" cell per (X_setting, Y_setting) pair,
# with the X setting varying slowest. Values are rounded to 3 digits.
settings = [(x, y) for x in ["syn_X_mar", "syn_X_censoring"]
                   for y in "syn_Y_" .* ["linear", "nn"]]

table_rows = map(sort(unique(stats[:, :method]))) do m
    aux = filter(t -> t[:method] == m, stats)
    cells = map(settings) do (x, y)
        # Look the (x, y) row up once and read both statistics from it.
        # Indexing row 1 throws a BoundsError if the combination is absent.
        r = filter(t -> t[:Y_setting] == y && t[:X_setting] == x, aux)[1, :]
        string(round(r[:osr2_mean], digits=3), " (", round(r[:osr2_se], digits=3), ")")
    end
    # join puts " & " only between cells, so the row no longer ends with a
    # dangling "&" (which LaTeX reads as an extra, empty column).
    m * " & " * join(cells, " & ") * " \\\\ \n"
end

s = join(table_rows)
print(s)

In [None]:
# Distinct values of :n present in the summary-table subset, in ascending order.
sort!(unique(subdf[!, :n]))