In [1]:
using CSV, DataFrames, Statistics

In [2]:
function method_category(meth)
    if startswith(meth, "Imp-then-Reg")
        return "Imp-then-Reg"
    elseif startswith(meth, "Joint Imp-then-Reg")
        return "Joint Imp-then-Reg"
    elseif meth ∈ ["Static", "Affine", "Finite"]
        return "Adaptive LR"
    elseif startswith(meth, "Complete Features")
        return "Complete Features"
    else 
        return meth
    end
end

method_category (generic function with 1 method)

In [3]:
pb_datasets = ["cylinder-bands", "ozone-level-detection-eight", "ozone-level-detection-one", "thyroid-disease-thyroid-0387", "trains",
                "credit-approval", "Ecdat-Mofa", "sleep"]

8-element Vector{String}:
 "cylinder-bands"
 "ozone-level-detection-eight"
 "ozone-level-detection-one"
 "thyroid-disease-thyroid-0387"
 "trains"
 "credit-approval"
 "Ecdat-Mofa"
 "sleep"

## For Real X - Syn Y Experiments

In [6]:
setting = "fakey/"
for y_model in ["linear", "nn"]
    for m_model = ["mar", "nmar", "mar_adv"]
        dir = y_model*"_"*m_model*"/"
        directory = setting*dir
        
        filelist = [f for f in readdir(directory*"final/") if endswith(f, ".csv")]
        res = similar(CSV.read(directory*"final/"*filelist[1], DataFrame),0)
        
        for subdir = ["final/", "rf_mia/"]
            filelist = [f for f in readdir(directory*subdir) if endswith(f, ".csv")]
#             res = similar(CSV.read(directory*subdir*filelist[1], DataFrame),0)
            for i in 1:length(filelist)
                res = vcat(res, CSV.read(directory*subdir*filelist[i], DataFrame))
            end
        end

        for subdir = ["xgb/"]
            try 
                filelist = [f for f in readdir("xgboost/"*directory*subdir) if endswith(f, ".csv")]
    #             res = similar(CSV.read(directory*subdir*filelist[1], DataFrame),0)
                for i in 1:length(filelist)
                    res = vcat(res, CSV.read("xgboost/"*directory*subdir*filelist[i], DataFrame))
                end
            catch 
                println("No XGBoost results for "*directory*subdir)
            end
        end

        res[!,:method_cat] = map(t -> method_category(t), res[:,:method])
        res[!,:X_setting] .= "real_X_"*m_model
        res[!,:Y_setting] .= "syn_Y_"*y_model

        CSV.write(directory*"FINAL_results.csv", res)
    end
end

No XGBoost results for fakey/linear_mar/xgb/


No XGBoost results for fakey/linear_nmar/xgb/


No XGBoost results for fakey/linear_mar_adv/xgb/


In [7]:
setting = "fakey/"

for y_model in ["linear", "tree", "nn"]
    for m_model = ["mar", "nmar", "mar_adv"]
        dir = y_model*"_"*m_model*"/"
        directory = setting*dir
  
        res = CSV.read(directory*"FINAL_results.csv", DataFrame)
        
        res[!,:method] .= map(t -> (t == "Affine" ? "Adaptive LR - Affine" : t), res[:,:method]) 
        res[!,:method] .= map(t -> (t == "Finite" ? "Adaptive LR - Finite" : t), res[:,:method]) 
        res[!,:method] .= map(t -> (t == "Static" ? "Adaptive LR - Affine intercept only" : t), res[:,:method])
        
        filter!(t -> t[:dataset] ∉ pb_datasets, res)

        for method in ["Oracle X", "Oracle XM", "Complete Features", "Imp-then-Reg 1", "Imp-then-Reg 2", "Imp-then-Reg 3", "Imp-then-Reg 4", "Imp-then-Reg 5", "Joint Imp-then-Reg", "Adaptive LR"]
            aux = filter(t -> startswith(t[:method], method), res)

            idcols = [:dataset, :X_setting, :Y_setting, :SNR, :k, :kMissing, :splitnum]
            gd = groupby(aux, idcols)

            aux = similar(aux, 0)
            for subdf in gd 
                scoremax = argmax(subdf[:,:score])
                push!(aux, subdf[scoremax,names(aux)])
            end
            aux[!,:method] .= method*" - best"

            res = vcat(res, aux)
        end
        
        CSV.write(directory*"FINAL_results.csv", res)
    end
end

## For Real Data Experiments

In [8]:
1+1

2

In [9]:
for directory = ["realy/"]
    filelist = [f for f in readdir(directory*"2022-08-23/") if endswith(f, ".csv") && f ∉ ["all_results.csv","all_results_new.csv"]]
    res = similar(CSV.read(directory*"2022-08-23/"*filelist[1], DataFrame),0)
    
    for subdir = ["2022-08-23/", "rf_mia/"]
        filelist = [f for f in readdir(directory*subdir) if endswith(f, ".csv") && f ∉ ["all_results.csv","all_results_new.csv"]]
#         res = similar(CSV.read(directory*subdir*filelist[1], DataFrame),0)
        for i in 1:length(filelist)
            res = vcat(res, CSV.read(directory*subdir*filelist[i], DataFrame))
        end
    end

    for subdir = ["xgb/"]
        try 
            filelist = [f for f in readdir("xgboost/"*directory*subdir) if endswith(f, ".csv")]
#             res = similar(CSV.read(directory*subdir*filelist[1], DataFrame),0)
            for i in 1:length(filelist)
                res = vcat(res, CSV.read("xgboost/"*directory*subdir*filelist[i], DataFrame))
            end
        catch 
            println("No XGBoost results for "*directory*subdir)
        end
    end
    
    # filter!(t -> t[:k] > 0, res) #Remove dataset with only a bias term
    res[!,:method_cat] = map(method_category, res[:,:method])
    res[!,:X_setting] .= "real_X"
    res[!,:Y_setting] .= "real_Y"
    
    CSV.write(directory*"FINAL_results.csv", res)
end

Sanity check

In [10]:
res = CSV.read("realy/"*"FINAL_results.csv", DataFrame)

Row,dataset,SNR,k,kMissing,splitnum,method,r2,osr2,r2list,osr2list,time,hp,score,method_cat,X_setting,Y_setting
Unnamed: 0_level_1,String,Float64,Float64,Int64,Int64,String31,Float64,Float64,String,String,Float64,String,Float64,String31,String7,String7
1,COUNT-loomis,,,1,1,Complete Features - linear,0.231652,0.200118,Any[0.23165234974566928],Any[0.20011819111587237],19.3096,"Dict{Symbol, Any}(:alpha => 0.6, :regtype => :lasso)",0.238859,Complete Features,real_X,real_Y
2,COUNT-loomis,,,1,1,Complete Features - tree,0.224957,0.160446,Any[0.22495732198332163],Any[0.16044649345650908],1.25356,Dict(:maxdepth => 2),0.195207,Complete Features,real_X,real_Y
3,COUNT-loomis,,,1,1,Complete Features - rf,0.230347,0.0315769,Any[0.23034703386546473],Any[0.03157686979670138],2.35746,"Dict(:ntrees => 125, :maxdepth => 20)",0.317459,Complete Features,real_X,real_Y
4,COUNT-loomis,,,1,1,CART MIA,0.224957,0.160446,Any[0.22495732198332163],Any[0.16044649345650908],0.0158799,Dict(:maxdepth => 2),0.195207,CART MIA,real_X,real_Y
5,COUNT-loomis,,,1,1,Imp-then-Reg 1 - linear,0.231652,0.200118,Any[0.23165234974566928],Any[0.20011819111587237],0.852322,"Dict{Symbol, Any}(:alpha => 0.6, :regtype => :lasso)",0.238859,Imp-then-Reg,real_X,real_Y
6,COUNT-loomis,,,1,1,Imp-then-Reg 2 - linear,0.231652,0.200118,Any[0.23165234974566928],Any[0.20011819111587237],0.416759,"Dict{Symbol, Any}(:alpha => 0.6, :regtype => :lasso)",0.238859,Imp-then-Reg,real_X,real_Y
7,COUNT-loomis,,,1,1,Imp-then-Reg 3 - linear,0.231652,0.200118,Any[0.23165234974566928],Any[0.20011819111587237],0.0708721,"Dict{Symbol, Any}(:alpha => 0.6, :regtype => :lasso)",0.238859,Imp-then-Reg,real_X,real_Y
8,COUNT-loomis,,,1,1,Imp-then-Reg 4 - linear,0.231652,0.200118,Any[0.23165234974566928],Any[0.20011819111587237],1.33671,"Dict{Symbol, Any}(:alpha => 0.6, :regtype => :lasso)",0.238859,Imp-then-Reg,real_X,real_Y
9,COUNT-loomis,,,1,1,Imp-then-Reg 5 - linear,0.23098,0.202817,Any[0.23098038027744217],Any[0.2028172015357963],1.65744,"Dict{Symbol, Any}(:alpha => 0.6, :regtype => :lasso)",0.221106,Imp-then-Reg,real_X,real_Y
10,COUNT-loomis,,,1,1,Imp-then-Reg 1 - tree,0.224957,0.160446,Any[0.22495732198332163],Any[0.16044649345650908],0.0213122,Dict(:maxdepth => 2),0.195207,Imp-then-Reg,real_X,real_Y


In [13]:
unique(filter( t-> t[:nrow] < 10, combine(groupby(res, [:dataset, :method]), nrow))[:,[:dataset, :nrow]])

Row,dataset,nrow
Unnamed: 0_level_1,String,Int64
1,cylinder-bands,1
2,ozone-level-detection-eight,2
3,ozone-level-detection-one,3
4,thyroid-disease-thyroid-0387,6
5,trains,1


Create `best` variant

In [14]:
res = CSV.read("realy/"*"FINAL_results.csv", DataFrame)

filter!(t -> t[:dataset] ∉ pb_datasets, res)

res[!,:method] .= map(t -> (t == "Affine" ? "Adaptive LR - Affine" : t), res[:,:method]) 
res[!,:method] .= map(t -> (t == "Finite" ? "Adaptive LR - Finite" : t), res[:,:method]) 
res[!,:method] .= map(t -> (t == "Static" ? "Adaptive LR - Affine intercept only" : t), res[:,:method])

for method in ["Complete Features", "Imp-then-Reg 1", "Imp-then-Reg 2", "Imp-then-Reg 3", "Imp-then-Reg 4", "Imp-then-Reg 5", "Joint Imp-then-Reg", "Adaptive LR"]
    aux = filter(t -> startswith(t[:method], method), res)
    # @show size(aux)
    idcols = [:dataset, :SNR, :k, :kMissing, :splitnum]
    gd = groupby(aux, idcols)

    aux = similar(aux, 0)
    for subdf in gd 
        scoremax = argmax(subdf[:,:score])
        push!(aux, subdf[scoremax,names(aux)])
    end
    aux[!,:method] .= method*" - best"

    res = vcat(res, aux)
end

CSV.write("realy/"*"FINAL_results.csv", res)

"realy/FINAL_results.csv"

In [None]:
names(res)

# For Synthetic-Data Experiments

In [None]:
1+1

In [15]:
setting = "synthetic_discrete/"
# for y_model in ["linear", "tree", "nn"]
for y_model in ["linear", "nn"]
    for m_model = ["mar", "censoring"]
        dir = y_model*"_"*m_model*"/"
        directory = setting*dir
        
        filelist = [f for f in readdir(directory*"final/") if endswith(f, ".csv") && f ∉ ["all_results.csv","all_results_new.csv"]]
        res = similar(CSV.read(directory*"final/"*filelist[1], DataFrame),0)

        for subdir = ["final/"]
            filelist = [f for f in readdir(directory*subdir) if endswith(f, ".csv") && f ∉ ["all_results.csv","all_results_new.csv"]]
            @show filelist

    #         res = similar(CSV.read(directory*subdir*filelist[1], DataFrame),0)
            for i in 1:length(filelist)
                aux = CSV.read(directory*subdir*filelist[i], DataFrame)
                # if any(aux[:,:kMissing] .> 0)
                #     missingproba = unique(aux[aux[:,:pMissing] .> 0,:pMissing])[1]
                #     aux[!,:pMissing] .= missingproba
                # end
                try
                    res = vcat(res, aux)
                catch 
                    println("Error with ", directory*subdir*filelist[i])
                end
            end
        end

        for subdir = ["xgb/"]
            try 
                filelist = [f for f in readdir("xgboost/"*directory*subdir) if endswith(f, ".csv")]
                for i in 1:length(filelist)
                    res = vcat(res, CSV.read("xgboost/"*directory*subdir*filelist[i], DataFrame))
                end
            catch 
                println("No XGBoost results for "*directory*subdir)
            end
        end

        res[!,:method] .= map(t -> replace(t, "Encoding as new category" => "Imp-then-Reg 4"), res[:,:method])
        res[!,:method] .= map(t -> replace(t, "Mode impute" => "Imp-then-Reg 5"), res[:,:method])

        # filter!(t -> t[:k] > 0, res) #Remove dataset with only a bias term
        res[!,:method_cat] = map(method_category, res[:,:method])
        res[!,:X_setting] .= "syn_X_"*m_model
        res[!,:Y_setting] .= "syn_Y_"*y_model

        for method in ["Imp-then-Reg 4", "Imp-then-Reg 5"]
            aux = filter(t -> startswith(t[:method], method), res)
            # @show nrow(aux)
            idcols = [:dataset, :X_setting, :Y_setting, :SNR, :k, :pMissing, :splitnum]
            gd = groupby(aux, idcols)

            aux = similar(aux, 0)
            for subdf in gd 
                scoremax = argmax(subdf[:,:score])
                # @show subdf[scoremax,names(aux)]
                push!(aux, subdf[scoremax,names(aux)])
            end
            aux[!,:method] .= method*" - best"

            res = vcat(res, aux)
        end

        CSV.write(directory*"FINAL_results.csv", res)   
    end
end

filelist = ["n_1000_p_10_pmiss_0.1_1.csv", "n_1000_p_10_pmiss_0.1_10.csv", "n_1000_p_10_pmiss_0.1_2.csv", "n_1000_p_10_pmiss_0.1_3.csv", "n_1000_p_10_pmiss_0.1_4.csv", "n_1000_p_10_pmiss_0.1_5.csv", "n_1000_p_10_pmiss_0.1_6.csv", "n_1000_p_10_pmiss_0.1_7.csv", "n_1000_p_10_pmiss_0.1_8.csv", "n_1000_p_10_pmiss_0.1_9.csv", "n_1000_p_10_pmiss_0.2_1.csv", "n_1000_p_10_pmiss_0.2_10.csv", "n_1000_p_10_pmiss_0.2_2.csv", "n_1000_p_10_pmiss_0.2_3.csv", "n_1000_p_10_pmiss_0.2_4.csv", "n_1000_p_10_pmiss_0.2_5.csv", "n_1000_p_10_pmiss_0.2_6.csv", "n_1000_p_10_pmiss_0.2_7.csv", "n_1000_p_10_pmiss_0.2_8.csv", "n_1000_p_10_pmiss_0.2_9.csv", "n_1000_p_10_pmiss_0.3_1.csv", "n_1000_p_10_pmiss_0.3_10.csv", "n_1000_p_10_pmiss_0.3_2.csv", "n_1000_p_10_pmiss_0.3_3.csv", "n_1000_p_10_pmiss_0.3_4.csv", "n_1000_p_10_pmiss_0.3_5.csv", "n_1000_p_10_pmiss_0.3_6.csv", "n_1000_p_10_pmiss_0.3_7.csv", "n_1000_p_10_pmiss_0.3_8.csv", "n_1000_p_10_pmiss_0.3_9.csv", "n_1000_p_10_pmiss_0.4_1.csv", "n_1000_p_10_pmiss_0.4_1

p_10_pmiss_0.3_1.csv", "n_520_p_10_pmiss_0.3_10.csv", "n_520_p_10_pmiss_0.3_2.csv", "n_520_p_10_pmiss_0.3_3.csv", "n_520_p_10_pmiss_0.3_4.csv", "n_520_p_10_pmiss_0.3_5.csv", "n_520_p_10_pmiss_0.3_6.csv", "n_520_p_10_pmiss_0.3_7.csv", "n_520_p_10_pmiss_0.3_8.csv", "n_520_p_10_pmiss_0.3_9.csv", "n_520_p_10_pmiss_0.4_1.csv", "n_520_p_10_pmiss_0.4_10.csv", "n_520_p_10_pmiss_0.4_2.csv", "n_520_p_10_pmiss_0.4_3.csv", "n_520_p_10_pmiss_0.4_4.csv", "n_520_p_10_pmiss_0.4_5.csv", "n_520_p_10_pmiss_0.4_6.csv", "n_520_p_10_pmiss_0.4_7.csv", "n_520_p_10_pmiss_0.4_8.csv", "n_520_p_10_pmiss_0.4_9.csv", "n_520_p_10_pmiss_0.5_1.csv", "n_520_p_10_pmiss_0.5_10.csv", "n_520_p_10_pmiss_0.5_2.csv", "n_520_p_10_pmiss_0.5_3.csv", "n_520_p_10_pmiss_0.5_4.csv", "n_520_p_10_pmiss_0.5_5.csv", "n_520_p_10_pmiss_0.5_6.csv", "n_520_p_10_pmiss_0.5_7.csv", "n_520_p_10_pmiss_0.5_8.csv", "n_520_p_10_pmiss_0.5_9.csv", "n_520_p_10_pmiss_0.6_1.csv", "n_520_p_10_pmiss_0.6_10.csv", "n_520_p_10_pmiss_0.6_2.csv", "n_520_p_10_p


filelist = 

["n_1000_p_10_pmiss_0.1_1.csv", "n_1000_p_10_pmiss_0.1_10.csv", "n_1000_p_10_pmiss_0.1_2.csv", "n_1000_p_10_pmiss_0.1_3.csv", "n_1000_p_10_pmiss_0.1_4.csv", "n_1000_p_10_pmiss_0.1_5.csv", "n_1000_p_10_pmiss_0.1_6.csv", "n_1000_p_10_pmiss_0.1_7.csv", "n_1000_p_10_pmiss_0.1_8.csv", "n_1000_p_10_pmiss_0.1_9.csv", "n_1000_p_10_pmiss_0.2_1.csv", "n_1000_p_10_pmiss_0.2_10.csv", "n_1000_p_10_pmiss_0.2_2.csv", "n_1000_p_10_pmiss_0.2_3.csv", "n_1000_p_10_pmiss_0.2_4.csv", "n_1000_p_10_pmiss_0.2_5.csv", "n_1000_p_10_pmiss_0.2_6.csv", "n_1000_p_10_pmiss_0.2_7.csv", "n_1000_p_10_pmiss_0.2_8.csv", "n_1000_p_10_pmiss_0.2_9.csv", "n_1000_p_10_pmiss_0.3_1.csv", "n_1000_p_10_pmiss_0.3_10.csv", "n_1000_p_10_pmiss_0.3_2.csv", "n_1000_p_10_pmiss_0.3_3.csv", "n_1000_p_10_pmiss_0.3_4.csv", "n_1000_p_10_pmiss_0.3_5.csv", "n_1000_p_10_pmiss_0.3_6.csv", "n_1000_p_10_pmiss_0.3_7.csv", "n_1000_p_10_pmiss_0.3_8.csv", "n_1000_p_10_pmiss_0.3_9.csv", "n_1000_p_10_pmiss_0.4_1.csv", "n_1000_p_10_pmiss_0.4_10.csv", "n_

p_10_pmiss_0.3_1.csv", "n_520_p_10_pmiss_0.3_10.csv", "n_520_p_10_pmiss_0.3_2.csv", "n_520_p_10_pmiss_0.3_3.csv", "n_520_p_10_pmiss_0.3_4.csv", "n_520_p_10_pmiss_0.3_5.csv", "n_520_p_10_pmiss_0.3_6.csv", "n_520_p_10_pmiss_0.3_7.csv", "n_520_p_10_pmiss_0.3_8.csv", "n_520_p_10_pmiss_0.3_9.csv", "n_520_p_10_pmiss_0.4_1.csv", "n_520_p_10_pmiss_0.4_10.csv", "n_520_p_10_pmiss_0.4_2.csv", "n_520_p_10_pmiss_0.4_3.csv", "n_520_p_10_pmiss_0.4_4.csv", "n_520_p_10_pmiss_0.4_5.csv", "n_520_p_10_pmiss_0.4_6.csv", "n_520_p_10_pmiss_0.4_7.csv", "n_520_p_10_pmiss_0.4_8.csv", "n_520_p_10_pmiss_0.4_9.csv", "n_520_p_10_pmiss_0.5_1.csv", "n_520_p_10_pmiss_0.5_10.csv", "n_520_p_10_pmiss_0.5_2.csv", "n_520_p_10_pmiss_0.5_3.csv", "n_520_p_10_pmiss_0.5_4.csv", "n_520_p_10_pmiss_0.5_5.csv", "n_520_p_10_pmiss_0.5_6.csv", "n_520_p_10_pmiss_0.5_7.csv", "n_520_p_10_pmiss_0.5_8.csv", "n_520_p_10_pmiss_0.5_9.csv", "n_520_p_10_pmiss_0.6_1.csv", "n_520_p_10_pmiss_0.6_10.csv", "n_520_p_10_pmiss_0.6_2.csv", "n_520_p_10_p

iss_0.5_3.csv", "n_960_p_10_pmiss_0.5_4.csv", "n_960_p_10_pmiss_0.5_5.csv", "n_960_p_10_pmiss_0.5_6.csv", "n_960_p_10_pmiss_0.5_7.csv", "n_960_p_10_pmiss_0.5_8.csv", "n_960_p_10_pmiss_0.5_9.csv", "n_960_p_10_pmiss_0.6_1.csv", "n_960_p_10_pmiss_0.6_10.csv", "n_960_p_10_pmiss_0.6_2.csv", "n_960_p_10_pmiss_0.6_3.csv", "n_960_p_10_pmiss_0.6_4.csv", "n_960_p_10_pmiss_0.6_5.csv", "n_960_p_10_pmiss_0.6_6.csv", "n_960_p_10_pmiss_0.6_7.csv", "n_960_p_10_pmiss_0.6_8.csv", "n_960_p_10_pmiss_0.6_9.csv", "n_960_p_10_pmiss_0.7_1.csv", "n_960_p_10_pmiss_0.7_10.csv", "n_960_p_10_pmiss_0.7_2.csv", "n_960_p_10_pmiss_0.7_3.csv", "n_960_p_10_pmiss_0.7_4.csv", "n_960_p_10_pmiss_0.7_5.csv", "n_960_p_10_pmiss_0.7_6.csv", "n_960_p_10_pmiss_0.7_7.csv", "n_960_p_10_pmiss_0.7_8.csv", "n_960_p_10_pmiss_0.7_9.csv", "n_960_p_10_pmiss_0.8_1.csv", "n_960_p_10_pmiss_0.8_10.csv", "n_960_p_10_pmiss_0.8_2.csv", "n_960_p_10_pmiss_0.8_3.csv", "n_960_p_10_pmiss_0.8_4.csv", "n_960_p_10_pmiss_0.8_5.csv", "n_960_p_10_pmiss_0.8

filelist = ["n_1000_p_10_pmiss_0.1_1.csv", "n_1000_p_10_pmiss_0.1_10.csv", "n_1000_p_10_pmiss_0.1_2.csv", "n_1000_p_10_pmiss_0.1_3.csv", "n_1000_p_10_pmiss_0.1_4.csv", "n_1000_p_10_pmiss_0.1_5.csv", "n_1000_p_10_pmiss_0.1_6.csv", "n_1000_p_10_pmiss_0.1_7.csv", "n_1000_p_10_pmiss_0.1_8.csv", "n_1000_p_10_pmiss_0.1_9.csv", "n_1000_p_10_pmiss_0.2_1.csv", "n_1000_p_10_pmiss_0.2_10.csv", "n_1000_p_10_pmiss_0.2_2.csv", "n_1000_p_10_pmiss_0.2_3.csv", "n_1000_p_10_pmiss_0.2_4.csv", "n_1000_p_10_pmiss_0.2_5.csv", "n_1000_p_10_pmiss_0.2_6.csv", "n_1000_p_10_pmiss_0.2_7.csv", "n_1000_p_10_pmiss_0.2_8.csv", "n_1000_p_10_pmiss_0.2_9.csv", "n_1000_p_10_pmiss_0.3_1.csv", "n_1000_p_10_pmiss_0.3_10.csv", "n_1000_p_10_pmiss_0.3_2.csv", "n_1000_p_10_pmiss_0.3_3.csv", "n_1000_p_10_pmiss_0.3_4.csv", "n_1000_p_10_pmiss_0.3_5.csv", "n_1000_p_10_pmiss_0.3_6.csv", "n_1000_p_10_pmiss_0.3_7.csv", "n_1000_p_10_pmiss_0.3_8.csv", "n_1000_p_10_pmiss_0.3_9.csv", "n_1000_p_10_pmiss_0.4_1.csv", "n_1000_p_10_pmiss_0.4_1

10_pmiss_0.3_2.csv", "n_520_p_10_pmiss_0.3_3.csv", "n_520_p_10_pmiss_0.3_4.csv", "n_520_p_10_pmiss_0.3_5.csv", "n_520_p_10_pmiss_0.3_6.csv", "n_520_p_10_pmiss_0.3_7.csv", "n_520_p_10_pmiss_0.3_8.csv", "n_520_p_10_pmiss_0.3_9.csv", "n_520_p_10_pmiss_0.4_1.csv", "n_520_p_10_pmiss_0.4_10.csv", "n_520_p_10_pmiss_0.4_2.csv", "n_520_p_10_pmiss_0.4_3.csv", "n_520_p_10_pmiss_0.4_4.csv", "n_520_p_10_pmiss_0.4_5.csv", "n_520_p_10_pmiss_0.4_6.csv", "n_520_p_10_pmiss_0.4_7.csv", "n_520_p_10_pmiss_0.4_8.csv", "n_520_p_10_pmiss_0.4_9.csv", "n_520_p_10_pmiss_0.5_1.csv", "n_520_p_10_pmiss_0.5_10.csv", "n_520_p_10_pmiss_0.5_2.csv", "n_520_p_10_pmiss_0.5_3.csv", "n_520_p_10_pmiss_0.5_4.csv", "n_520_p_10_pmiss_0.5_5.csv", "n_520_p_10_pmiss_0.5_6.csv", "n_520_p_10_pmiss_0.5_7.csv", "n_520_p_10_pmiss_0.5_8.csv", "n_520_p_10_pmiss_0.5_9.csv", "n_520_p_10_pmiss_0.6_1.csv", "n_520_p_10_pmiss_0.6_10.csv", "n_520_p_10_pmiss_0.6_2.csv", "n_520_p_10_pmiss_0.6_3.csv", "n_520_p_10_pmiss_0.6_4.csv", "n_520_p_10_pmis

6_1.csv", "n_960_p_10_pmiss_0.6_10.csv", "n_960_p_10_pmiss_0.6_2.csv", "n_960_p_10_pmiss_0.6_3.csv", "n_960_p_10_pmiss_0.6_4.csv", "n_960_p_10_pmiss_0.6_5.csv", "n_960_p_10_pmiss_0.6_6.csv", "n_960_p_10_pmiss_0.6_7.csv", "n_960_p_10_pmiss_0.6_8.csv", "n_960_p_10_pmiss_0.6_9.csv", "n_960_p_10_pmiss_0.7_1.csv", "n_960_p_10_pmiss_0.7_10.csv", "n_960_p_10_pmiss_0.7_2.csv", "n_960_p_10_pmiss_0.7_3.csv", "n_960_p_10_pmiss_0.7_4.csv", "n_960_p_10_pmiss_0.7_5.csv", "n_960_p_10_pmiss_0.7_6.csv", "n_960_p_10_pmiss_0.7_7.csv", "n_960_p_10_pmiss_0.7_8.csv", "n_960_p_10_pmiss_0.7_9.csv", "n_960_p_10_pmiss_0.8_1.csv", "n_960_p_10_pmiss_0.8_10.csv", "n_960_p_10_pmiss_0.8_2.csv", "n_960_p_10_pmiss_0.8_3.csv", "n_960_p_10_pmiss_0.8_4.csv", "n_960_p_10_pmiss_0.8_5.csv", "n_960_p_10_pmiss_0.8_6.csv", "n_960_p_10_pmiss_0.8_7.csv", "n_960_p_10_pmiss_0.8_8.csv", "n_960_p_10_pmiss_0.8_9.csv", "n_960_p_10_pmiss_0.9_1.csv", "n_960_p_10_pmiss_0.9_2.csv", "n_960_p_10_pmiss_0.9_3.csv", "n_960_p_10_pmiss_0.9_4.csv

filelist = ["n_1000_p_10_pmiss_0.1_1.csv", "n_1000_p_10_pmiss_0.1_10.csv", "n_1000_p_10_pmiss_0.1_2.csv", "n_1000_p_10_pmiss_0.1_3.csv", "n_1000_p_10_pmiss_0.1_4.csv", "n_1000_p_10_pmiss_0.1_5.csv", "n_1000_p_10_pmiss_0.1_6.csv", "n_1000_p_10_pmiss_0.1_7.csv", "n_1000_p_10_pmiss_0.1_8.csv", "n_1000_p_10_pmiss_0.1_9.csv", "n_1000_p_10_pmiss_0.2_1.csv", "n_1000_p_10_pmiss_0.2_10.csv", "n_1000_p_10_pmiss_0.2_2.csv", "n_1000_p_10_pmiss_0.2_3.csv", "n_1000_p_10_pmiss_0.2_4.csv", "n_1000_p_10_pmiss_0.2_5.csv", "n_1000_p_10_pmiss_0.2_6.csv", "n_1000_p_10_pmiss_0.2_7.csv", "n_1000_p_10_pmiss_0.2_8.csv", "n_1000_p_10_pmiss_0.2_9.csv", "n_1000_p_10_pmiss_0.3_1.csv", "n_1000_p_10_pmiss_0.3_10.csv", "n_1000_p_10_pmiss_0.3_2.csv", "n_1000_p_10_pmiss_0.3_3.csv", "n_1000_p_10_pmiss_0.3_4.csv", "n_1000_p_10_pmiss_0.3_5.csv", "n_1000_p_10_pmiss_0.3_6.csv", "n_1000_p_10_pmiss_0.3_7.csv", "n_1000_p_10_pmiss_0.3_8.csv", "n_1000_p_10_pmiss_0.3_9.csv", "n_1000_p_10_pmiss_0.4_1.csv", "n_1000_p_10_pmiss_0.4_1

Excessive output truncated after 536311 bytes.

In [16]:
setting = "synthetic/"
# for y_model in ["linear", "tree", "nn"]
for y_model in ["linear", "nn"]
    for m_model = ["mar", "censoring"]
        dir = y_model*"_"*m_model*"/"
        directory = setting*dir
        
        filelist = [f for f in readdir(directory*"final/") if endswith(f, ".csv") && f ∉ ["all_results.csv","all_results_new.csv"]]
        res = similar(CSV.read(directory*"final/"*filelist[1], DataFrame),0)

        for subdir = ["final/", "rf_mia/"]
            filelist = [f for f in readdir(directory*subdir) if endswith(f, ".csv") && f ∉ ["all_results.csv","all_results_new.csv"]]
    #         res = similar(CSV.read(directory*subdir*filelist[1], DataFrame),0)
            for i in 1:length(filelist)
                aux = CSV.read(directory*subdir*filelist[i], DataFrame)
                if any(aux[:,:pMissing] .> 0)
                    missingproba = unique(aux[aux[:,:pMissing] .> 0,:pMissing])[1]
                    aux[!,:pMissing] .= missingproba
                end
                try
                    res = vcat(res, aux)
                catch 
                    println("Error with ", directory*subdir*filelist[i])
                end
            end
        end

        for subdir = ["xgb/"]
            try 
                filelist = [f for f in readdir("xgboost/"*directory*subdir) if endswith(f, ".csv")]
                for i in 1:length(filelist)
                    res = vcat(res, CSV.read("xgboost/"*directory*subdir*filelist[i], DataFrame))
                end
            catch 
                println("No XGBoost results for "*directory*subdir)
            end
        end

        # filter!(t -> t[:k] > 0, res) #Remove dataset with only a bias term
        res[!,:method_cat] = map(method_category, res[:,:method])
        res[!,:X_setting] .= "syn_X_"*m_model
        res[!,:Y_setting] .= "syn_Y_"*y_model

        for method in ["Oracle X", "Oracle XM", "Complete Features", "Imp-then-Reg 1", "Imp-then-Reg 2", "Imp-then-Reg 3", "Imp-then-Reg 4", "Imp-then-Reg 5", "Joint Imp-then-Reg", "Adaptive LR"]
            aux = filter(t -> startswith(t[:method], method), res)

            idcols = [:dataset, :X_setting, :Y_setting, :SNR, :k, :pMissing, :splitnum]
            gd = groupby(aux, idcols)

            aux = similar(aux, 0)
            for subdf in gd 
                scoremax = argmax(subdf[:,:score])
                push!(aux, subdf[scoremax,names(aux)])
            end
            aux[!,:method] .= method*" - best"

            res = vcat(res, aux)
        end
        
        CSV.write(directory*"FINAL_results.csv", res)   
    end
end

In [None]:
setting = "synthetic/"
# for y_model in ["linear", "tree", "nn"]
for y_model in ["linear", "nn"]
    for m_model = ["mar", "censoring"]
        dir = y_model*"_"*m_model*"/"
        directory = setting*dir
        
        filelist = [f for f in readdir(directory*"final/") if endswith(f, ".csv") && f ∉ ["all_results.csv","all_results_new.csv"]]
        res = similar(CSV.read(directory*"final/"*filelist[1], DataFrame),0)

        for subdir = ["high_n/"]
            filelist = [f for f in readdir(directory*subdir) if endswith(f, ".csv") && f ∉ ["all_results.csv","all_results_new.csv"]]
    #         res = similar(CSV.read(directory*subdir*filelist[1], DataFrame),0)
            for i in 1:length(filelist)
                aux = CSV.read(directory*subdir*filelist[i], DataFrame)
                if any(aux[:,:pMissing] .> 0)
                    missingproba = unique(aux[aux[:,:pMissing] .> 0,:pMissing])[1]
                    aux[!,:pMissing] .= missingproba
                end
                try
                    res = vcat(res, aux)
                catch 
                    println("Error with ", directory*subdir*filelist[i])
                end
            end
        end
        # filter!(t -> t[:k] > 0, res) #Remove dataset with only a bias term
        res[!,:method_cat] = map(method_category, res[:,:method])
        res[!,:X_setting] .= "syn_X_"*m_model
        res[!,:Y_setting] .= "syn_Y_"*y_model

        CSV.write(directory*"HIGHN_results.csv", res)   
    end
end

In [None]:
1+1

Sanity check

In [None]:
df = CSV.read("synthetic/linear_mar/FINAL_results.csv", DataFrame) 
df[!,:setting] .= "1 - Lin-MAR"

aux = CSV.read("synthetic/linear_censoring/FINAL_results.csv", DataFrame) 
aux[!,:setting] .= "2 - Lin-NMAR"
df = vcat(df, aux)
 
aux = CSV.read("synthetic/tree_mar/FINAL_results.csv", DataFrame) 
aux[!,:setting] .= "3 - Tree-MAR"
df = vcat(df, aux)

aux = CSV.read("synthetic/tree_censoring/FINAL_results.csv", DataFrame) 
aux[!,:setting] .= "4 - Tree-NMAR"
df = vcat(df, aux)

aux = CSV.read("synthetic/nn_mar/FINAL_results.csv", DataFrame) 
aux[!,:setting] .= "5 - NN-MAR"
df = vcat(df, aux)

aux = CSV.read("synthetic/nn_censoring/FINAL_results.csv", DataFrame) 
aux[!,:setting] .= "6 - NN-NMAR"
df = vcat(df, aux)

In [5]:
df = CSV.read("synthetic_discrete/linear_mar/FINAL_results.csv", DataFrame) 
df[!,:setting] .= "1 - Lin-MAR"

aux = CSV.read("synthetic_discrete/linear_censoring/FINAL_results.csv", DataFrame) 
aux[!,:setting] .= "2 - Lin-NMAR"
df = vcat(df, aux)
 
aux = CSV.read("synthetic_discrete/tree_mar/FINAL_results.csv", DataFrame) 
aux[!,:setting] .= "3 - Tree-MAR"
df = vcat(df, aux)

aux = CSV.read("synthetic_discrete/tree_censoring/FINAL_results.csv", DataFrame) 
aux[!,:setting] .= "4 - Tree-NMAR"
df = vcat(df, aux)

aux = CSV.read("synthetic_discrete/nn_mar/FINAL_results.csv", DataFrame) 
aux[!,:setting] .= "5 - NN-MAR"
df = vcat(df, aux)

aux = CSV.read("synthetic_discrete/nn_censoring/FINAL_results.csv", DataFrame) 
aux[!,:setting] .= "6 - NN-NMAR"
df = vcat(df, aux)

Row,dataset,SNR,k,pMissing,splitnum,method,r2,osr2,r2list,osr2list,muvec,time,hp,score,method_cat,X_setting,Y_setting,setting
Unnamed: 0_level_1,String15,Int64,Int64,Float64,Int64,String31,Float64,Float64,String31,String31,String7,Float64,String,Float64,String,String15,String15,String
1,n_1000_p_10,2,5,0.1,1,Oracle X,0.664324,0.620391,Any[0.6643244100492505],Any[0.620391048897614],Any[],14.0286,"Dict{Symbol, Any}(:alpha => 1.0, :regtype => :lasso)",0.660321,Oracle X,syn_X_mar,syn_Y_linear,1 - Lin-MAR
2,n_1000_p_10,2,5,0.1,1,Oracle XM,0.664324,0.620391,Any[0.6643244100492505],Any[0.620391048897614],Any[],0.16225,"Dict{Symbol, Any}(:alpha => 1.0, :regtype => :lasso)",0.660321,Oracle XM,syn_X_mar,syn_Y_linear,1 - Lin-MAR
3,n_1000_p_10,2,5,0.1,1,Complete Features,0.0,0.0,Any[],Any[],Any[],0.0,0.0,0.0,Complete Features,syn_X_mar,syn_Y_linear,1 - Lin-MAR
4,n_1000_p_10,2,5,0.1,1,Imp-then-Reg 4 - linear,0.593441,0.53181,Any[0.5934413009795707],Any[0.5318097996975277],Any[],1.13563,"Dict{Symbol, Any}(:alpha => 0.9, :regtype => :lasso)",0.577878,Imp-then-Reg,syn_X_mar,syn_Y_linear,1 - Lin-MAR
5,n_1000_p_10,2,5,0.1,1,Imp-then-Reg 5 - linear,0.560968,0.497329,Any[0.5609675224934351],Any[0.49732897796408093],Any[],1.45368,"Dict{Symbol, Any}(:alpha => 1.0, :regtype => :lasso)",0.554987,Imp-then-Reg,syn_X_mar,syn_Y_linear,1 - Lin-MAR
6,n_1000_p_10,2,5,0.1,1,Imp-then-Reg 4 - tree,0.58702,0.479833,Any[0.5870200577625153],Any[0.4798332977933134],Any[],1.17518,Dict(:maxdepth => 4),0.547871,Imp-then-Reg,syn_X_mar,syn_Y_linear,1 - Lin-MAR
7,n_1000_p_10,2,5,0.1,1,Imp-then-Reg 5 - tree,0.57186,0.472435,Any[0.5718596427687634],Any[0.47243515467020425],Any[],0.0483472,Dict(:maxdepth => 4),0.537809,Imp-then-Reg,syn_X_mar,syn_Y_linear,1 - Lin-MAR
8,n_1000_p_10,2,5,0.1,1,Imp-then-Reg 4 - rf,0.626915,0.469273,Any[0.6269145092191312],Any[0.4692725673834949],Any[],26.6316,"Dict(:ntrees => 50, :maxdepth => 10)",0.514037,Imp-then-Reg,syn_X_mar,syn_Y_linear,1 - Lin-MAR
9,n_1000_p_10,2,5,0.1,1,Imp-then-Reg 5 - rf,0.552731,0.475783,Any[0.552731214936813],Any[0.47578277074979725],Any[],16.6645,"Dict(:ntrees => 50, :maxdepth => 5)",0.515967,Imp-then-Reg,syn_X_mar,syn_Y_linear,1 - Lin-MAR
10,n_1000_p_10,2,5,0.1,10,Oracle X,0.616533,0.630505,Any[0.6165329881971267],Any[0.6305051676301737],Any[],0.130059,"Dict{Symbol, Any}(:alpha => 0.6, :regtype => :lasso)",0.604229,Oracle X,syn_X_mar,syn_Y_linear,1 - Lin-MAR


In [6]:
combine(groupby(df, [:dataset, :method, :setting]), nrow)

Row,dataset,method,setting,nrow
Unnamed: 0_level_1,String15,String31,String,Int64
1,n_1000_p_10,Oracle X,1 - Lin-MAR,40
2,n_1000_p_10,Oracle XM,1 - Lin-MAR,40
3,n_1000_p_10,Complete Features,1 - Lin-MAR,40
4,n_1000_p_10,Imp-then-Reg 4 - linear,1 - Lin-MAR,40
5,n_1000_p_10,Imp-then-Reg 5 - linear,1 - Lin-MAR,40
6,n_1000_p_10,Imp-then-Reg 4 - tree,1 - Lin-MAR,40
7,n_1000_p_10,Imp-then-Reg 5 - tree,1 - Lin-MAR,40
8,n_1000_p_10,Imp-then-Reg 4 - rf,1 - Lin-MAR,39
9,n_1000_p_10,Imp-then-Reg 5 - rf,1 - Lin-MAR,39
10,n_100_p_10,Oracle X,1 - Lin-MAR,40


In [None]:
unique(combine(groupby(df, [:dataset, :method, :setting]), nrow)[:,:nrow])

In [None]:
filter( t-> t[:nrow] < 90, combine(groupby(res, [:dataset, :method, :setting]), nrow))

In [None]:
pbdata = unique(filter( t -> t[:nrow] < 90, combine(groupby(df, [:dataset, :method, :setting]), nrow))[:,[:method, :setting, :dataset]])

In [None]:
aux = filter( t -> t[:dataset] ∈ pbdata && t[:nrow] < 10, combine(groupby(df, [:dataset, :method, :setting, :pMissing]), nrow))
unique(aux[:,[:dataset, :setting, :pMissing, :nrow]])

In [None]:
filter(t -> t[:pMissing] == 0.1, unique(aux[:,[:dataset, :setting, :pMissing, :nrow]]))

In [None]:
filter(t -> t[:pMissing] == 0.3, unique(aux[:,[:dataset, :setting, :pMissing, :nrow]]))

In [None]:
filter(t -> startswith(t[:setting], "5"), unique(aux[:,[:dataset, :setting, :pMissing, :nrow]]))