In [None]:
using Pkg
Pkg.activate("..")

using Revise
using PHD

using Random, Statistics, CSV, DataFrames, LinearAlgebra

# Names of the entries found in ../datasets/, sorted.
# `readdir` is used instead of shelling out to `ls`, so no external process is
# spawned and the cell also runs on platforms without `ls`. Hidden entries
# (leading ".") are skipped because plain `ls` does not list them either, which
# keeps positional indices into this list unchanged.
dataset_list = [d for d in readdir("../datasets/") if !startswith(d, ".")]
sort!(dataset_list)

# Alternative SNR values kept for reference: [2, 1, .5]
# Candidate values from which `k_missingsignal` is selected.
missingsignal_list = collect(0:10)

# Output directory for this experiment; `mkpath` creates "../results" and the
# sub-directory in one call and is a no-op when they already exist.
savedir = "../results/fakey_nmar/"
mkpath(savedir)
SNR = 10

# Empty results table; one row is pushed per (dataset, split, method).
# Columns are deliberately left untyped (eltype Any), as in the original.
results_main = DataFrame(dataset=[], SNR=[], k=[], kMissing=[], splitnum=[], method=[], osr2=[])

In [None]:
# Pick the dataset (by position in `dataset_list`) and the missing-signal
# setting (by position in `missingsignal_list`) for this run.
d_num = 13
aux_num = 2

dname = dataset_list[d_num]  # alternatives tried before: "dermatology", "thyroid-disease-thyroid-0387", dataset_list[1]
k_missingsignal = missingsignal_list[aux_num]
@show dname, k_missingsignal

# Load the data with missing entries; empty strings and "NaN" are parsed as `missing`.
X_missing = CSV.read(string("../datasets/", dname, "/X_missing.csv"),
                     missingstrings=["", "NaN"]) |> DataFrame |> PHD.standardize_colnames
# One Bool per column of X_missing: true when that column holds at least one `missing`.
canbemissing = [any(ismissing, X_missing[!, j]) for j in names(X_missing)]
# Load the ground-truth data (same dataset folder, X_full.csv).
X_full = CSV.read(string("../datasets/", dname, "/X_full.csv")) |> DataFrame |> PHD.standardize_colnames

@show size(X_full)

# Generate the response with a fixed seed so the run is reproducible.
# NOTE(review): presumably PHD.linear_y builds a synthetic linear signal; with
# mar=false the "fakey_nmar" directory name suggests a not-missing-at-random
# setting — confirm against the PHD implementation.
Random.seed!(5234)
@time Y, k, k_missing = PHD.linear_y(
    X_full, X_missing;
    k=1,
    SNR=SNR,
    canbemissing=canbemissing,
    k_missing_in_signal=k_missingsignal,
    mar=false,
);

In [None]:
# Split number and fraction of rows assigned to the test set.
iter = 1
test_prop = 0.3
# Fresh, empty results table with the same columns as `results_main`.
results_table = similar(results_main, 0)

# File name that identifies this (dataset, SNR, missing-signal, split) combination.
filename = "$(dname)_SNR_$(SNR)_nmiss_$(k_missingsignal)_$(iter).csv"

# Draw the train / test split; the seed depends on `iter` so that each split
# number gets its own reproducible random mask.
Random.seed!(56802 + 767 * iter)
test_ind = rand(nrow(X_missing)) .< test_prop;

In [None]:
# "Oracle" baseline: fit on a copy of the fully observed data X_full.
df = X_full[:, :]
# Attach the train/test mask as a :Test column.
df.Test = test_ind
# Cross-validated regression over the listed `alpha` values with lasso enabled.
linear, bestparams = PHD.regress_cv(
    Y, df;
    lasso=[true],
    alpha=[0.7, 0.8, 0.9, 1.0],
)
@show R2, OSR2 = PHD.evaluate(Y, df, linear)
# Record the second value returned by PHD.evaluate (OSR2) for this method.
push!(
    results_table,
    [dname, SNR, k, k_missing, iter, "Oracle", OSR2],
)

In [None]:
# Display the current value of `linear`, the first value returned by the most
# recently executed PHD.regress_cv call.
linear

In [None]:
# "Oracle XM" baseline: the fully observed data X_full, horizontally
# concatenated with the output of PHD.indicatemissing on X_missing
# (called with removezerocols=true).
df = hcat(
    X_full[:, :],
    PHD.indicatemissing(X_missing[:, :]; removezerocols=true),
)
# Attach the train/test mask as a :Test column.
df.Test = test_ind
# Cross-validated regression over the listed `alpha` values with lasso enabled.
linear, bestparams = PHD.regress_cv(
    Y, df;
    lasso=[true],
    alpha=[0.7, 0.8, 0.9, 1.0],
)
@show R2, OSR2 = PHD.evaluate(Y, df, linear)
# Record the second value returned by PHD.evaluate (OSR2) for this method.
push!(
    results_table,
    [dname, SNR, k, k_missing, iter, "Oracle XM", OSR2],
)

In [None]:
# For every column of X_missing for which PHD.indicatemissing returns at least
# one indicator column, print the X_full features (excluding :Id) whose
# correlation with that indicator exceeds 0.9.
#
# The non-Id column names are computed once, outside the loops, and X_full is
# indexed by NAME. The previous version counted the columns of
# `select(X_full, Not(:Id))` but indexed X_full itself by POSITION, so unless
# :Id happened to be the last column it correlated against :Id and silently
# skipped the final feature.
feature_names = names(select(X_full, Not(:Id)))
for i in 1:ncol(X_missing)
    indic = PHD.indicatemissing(X_missing[:, [i]]; removezerocols=true)
    # Nothing to correlate when no indicator column came back for this feature.
    ncol(indic) > 0 || continue
    @show i
    for feature in feature_names
        # Correlation matrix of [feature, indicator column(s)]; entry [1, 2] is
        # the correlation between the feature and the first indicator column.
        correlation = cor(Matrix(hcat(X_full[!, [feature]], indic)))
        # NOTE(review): only strongly POSITIVE correlations are reported;
        # confirm whether strongly negative ones (< -0.9) should be shown too.
        if correlation[1, 2] > 0.9
            @show feature, correlation
        end
    end
end

In [None]:
# Call PHD.intrinsic_indicators on the currently loaded X_missing and display
# the result. NOTE(review): the meaning of the returned value is defined inside
# PHD and is not visible from this notebook.
PHD.intrinsic_indicators(X_missing)

In [None]:
# Inspect one specific dataset. Note that this reassigns the notebook-level
# `dname` and `X_missing` used by the other cells.
dname = "Ecdat-Schooling"
# Load the data with missing entries; empty strings and "NaN" are parsed as `missing`.
X_missing = CSV.read(string("../datasets/", dname, "/X_missing.csv"),
                     missingstrings=["", "NaN"]) |> DataFrame |> PHD.standardize_colnames
# Per-column summary of the loaded table.
desc = describe(X_missing)
# Keep only the summary rows whose reported eltype is not exactly Int.
filter(desc) do row
    row[:eltype] != Int
end

In [None]:
# Call PHD.intrinsic_indicators on whatever X_missing currently holds and
# display the result. NOTE(review): the result depends on which cell last
# assigned X_missing — confirm the intended execution order.
PHD.intrinsic_indicators(X_missing)

In [None]:
# Run PHD.intrinsic_indicators on every dataset and print the ones for which it
# returns at least one non-empty entry.
#
# A failure in PHD.intrinsic_indicators is logged with `@warn` and the dataset
# is skipped. The previous version swallowed the exception and kept the
# placeholder `intrinsic = 0`, which passed the length checks and was then
# printed as if it were a real result.
for dname in dataset_list
    # Load the data with missing entries; empty strings and "NaN" are parsed as `missing`.
    X_missing = PHD.standardize_colnames(DataFrame(CSV.read("../datasets/"*dname*"/X_missing.csv",
                missingstrings=["", "NaN"])))
    @show dname
    intrinsic = try
        PHD.intrinsic_indicators(X_missing)
    catch err
        @warn "PHD.intrinsic_indicators failed; skipping dataset" dname exception = (err, catch_backtrace())
        continue
    end
    # NOTE(review): `intrinsic` is presumably a Dict-like collection whose
    # values are themselves collections — confirm against PHD.
    if !isempty(intrinsic) && any(v -> length(v) > 0, values(intrinsic))
        @show dname, intrinsic
    end
end