# Example notebook

In [None]:
using Pkg
Pkg.activate("..")

In [None]:
using Revise
using PHD

In [None]:
using Random, Statistics, CSV, DataFrames
using LinearAlgebra

In [None]:
# Ask PHD for its list of datasets (called with p_min = 1); a later cell picks one
# entry of this list by index.
dataset_list = PHD.list_datasets(; p_min=1)

In [None]:
# Value forwarded as the `SNR` keyword of PHD.binary_y when the response Y is generated.
SNR = 1

In [None]:
# Make sure the results directory exists. `mkpath` does nothing when the directory is
# already present (and creates missing parents), so no separate `isdir` check is
# needed and the check-then-create race of `isdir` + `mkdir` is avoided.
mkpath("../results")

In [None]:
# Dataset to work on: the third entry of dataset_list. Its name is used to build the
# paths of the CSV files read below.
dname = dataset_list[3]
# Iteration index; it enters the random seed set before the train/test split.
iter = 1

Read in a data file.

In [None]:
# Load the dataset twice: the version with missing entries and the fully observed one.
# `CSV.File` wrapped in `DataFrame` is used instead of the sink-less
# `CSV.read(path; ...)` form, which CSV.jl releases requiring a sink argument reject;
# the `CSV.File` form works on both older and newer releases.
# For X_missing, empty fields and the string "NaN" are listed as missing-value markers.
X_missing = PHD.standardize_colnames(
    DataFrame(CSV.File("../datasets/" * dname * "/X_missing.csv";
                       missingstrings=["", "NaN"])))
X_full = PHD.standardize_colnames(
    DataFrame(CSV.File("../datasets/" * dname * "/X_full.csv")))
# Preview the first five rows of the fully observed data.
first(X_full, 5)

Create the output variable `Y`.

In [None]:
# Seed the global RNG so that the random draws made below are repeatable.
Random.seed!(3)
# Build the binary response from the full and the missing data (timed). Only the first
# of the three returned values is kept.
@time Y, _, _ = PHD.binary_y(
    X_full, X_missing; SNR=SNR, k=10, k_missing_in_signal=5
)
# Show the generated response.
Y

Add Offset

```julia
X_full[:,:One] = Ref(1.)
X_missing[:,:One] = Ref(1.) ;
```

- Method 1: Impute missing data using MICE and regress using LASSO.

In [None]:
# The seed is a function of `iter`, so every iteration index gets its own fixed seed.
Random.seed!(56802 + 767 * iter)
# Test-set indicator produced by PHD's non-random splitter with test_fraction = 0.3.
test_ind = PHD.split_dataset_nonrandom(X_missing; test_fraction=0.3)
# Attach the same indicator vector as column :Test to both dataframes.
X_full[!, :Test] = test_ind
X_missing[!, :Test] = test_ind

In [None]:
# Impute the missing entries of X_missing with PHD.mice (MICE); the trailing `;`
# suppresses the cell output for this statement.
X_imputed = PHD.mice(X_missing);
# Preview the first five rows of the imputed data.
first(X_imputed, 5)

In [None]:
# Plain fit on the MICE-imputed data with fixed settings (LASSO, alpha = 1.0).
linear = PHD.regress(Y, X_imputed; lasso=true, alpha=1.0)
# Fit with parameter validation over a small grid of alpha values; this overwrites
# `linear` from the line above and also returns the chosen parameters.
linear, bestparams = PHD.regress_cv(
    Y, X_imputed;
    lasso=[true],
    alpha=[0.8, 1.0],
)
@show bestparams
# Display the fitted model.
linear

In [None]:
# Score the validated model on the imputed data with metric = "auc".
# NOTE(review): the two results are bound to R2/OSR2 although the requested metric is
# AUC — presumably the in-sample and out-of-sample scores; confirm in PHD.evaluate.
R2, OSR2 = PHD.evaluate(Y, X_imputed, linear; metric="auc")
@show R2 OSR2

- Method 2: Add indicator variables for missingness and impute zeros (finitely adaptive). Regress using LASSO.

In [None]:
# Design matrix for method 2: the zero-imputed data side by side with the missingness
# indicators (removezerocols=true presumably drops indicator columns that are all
# zero — confirm in PHD.indicatemissing).
X_augmented = hcat(
    PHD.zeroimpute(X_missing),
    PHD.indicatemissing(X_missing; removezerocols=true),
)
# Plain fit with fixed settings.
linear2 = PHD.regress(Y, X_augmented; lasso=true, alpha=0.8, missing_penalty=2.0)
# Fit with parameter validation over alpha and missing_penalty; this overwrites
# `linear2` from the line above.
linear2, bestparams2 = PHD.regress_cv(
    Y, X_augmented;
    lasso=[true],
    alpha=[0.8, 1.0],
    missing_penalty=[2.0, 4.0, 8.0, 16.0],
)
@show bestparams2
# Display the fitted model.
linear2

In [None]:
# Score the method-2 model on the augmented data with metric = "auc".
# NOTE(review): the two results are bound to R2/OSR2 although the requested metric is
# AUC — presumably the in-sample and out-of-sample scores; confirm in PHD.evaluate.
R2, OSR2 = PHD.evaluate(Y, X_augmented, linear2; metric="auc")
@show R2 OSR2

- Method 3: Affinely adaptive

In [None]:
# Design matrix for method 3 (affinely adaptive), built by PHD.augmentaffine.
X_affine = PHD.augmentaffine(X_missing; removezerocols=true)
# Plain fit with fixed settings.
linear3 = PHD.regress(Y, X_affine; lasso=true, alpha=0.8, missing_penalty=40.0)
# Fit with parameter validation over missing_penalty (alpha fixed at 0.8); this
# overwrites `linear3` from the line above.
linear3, bestparams3 = PHD.regress_cv(
    Y, X_affine;
    lasso=[true],
    alpha=[0.8],
    missing_penalty=[10.0, 20.0, 40.0, 80.0, 160.0],
)
@show bestparams3
# Display the fitted model.
linear3

In [None]:
# Score the method-3 model on the affine design matrix with metric = "auc".
# NOTE(review): the two results are bound to R2/OSR2 although the requested metric is
# AUC — presumably the in-sample and out-of-sample scores; confirm in PHD.evaluate.
R2, OSR2 = PHD.evaluate(Y, X_affine, linear3; metric="auc")
@show R2 OSR2

- Method 4: Finite adaptability with no zero-imputation

In [None]:
# Standardize the data that still contains missing values; no zero-imputation here.
X_missing_std = PHD.standardize(X_missing)
# Train the greedy model with fixed settings (timed).
@time gm = PHD.trainGreedyModel(
    Y, X_missing_std; maxdepth=10, tolerance=0.01, minbucket=10
)

In [None]:
# Validated version of the greedy model: only `tolerance` varies over the grid, while
# maxdepth and minbucket have a single candidate each. This overwrites `gm`.
@time gm, bestparams_gm = PHD.greedymodel_cv(
    Y, X_missing_std;
    maxdepth=[10],
    tolerance=[0.01, 0.05, 0.10],
    minbucket=[10],
)
# Display the selected parameters.
bestparams_gm

In [None]:
# Print a text rendering of the greedy model selected above.
PHD.print_ascii(gm)

In [None]:
# Score the greedy model on the standardized data with missing values (metric = "auc");
# the result is shown as the cell output.
PHD.evaluate(Y, X_missing_std, gm; metric="auc")

- Method 4.5: Finite adaptability with zero-imputation

In the previous example, we passed only the dataframe with missing values, and we trained regression models that use only the features that are common across all models. We may also wish to run on a zero-imputed dataset; in that case, the dataframe with missing values must additionally be passed through the `missingdata` keyword argument.

In [None]:
# Standardize first, then replace the missing entries by zeros.
X_missing_zero_std = PHD.zeroimpute(PHD.standardize(X_missing))
# Train on the zero-imputed data (timed); the dataframe that still has the missing
# values is supplied through the `missingdata` keyword.
@time gm2 = PHD.trainGreedyModel(
    Y, X_missing_zero_std;
    maxdepth=8,
    tolerance=0.05,
    minbucket=20,
    missingdata=X_missing,
)

In [None]:
# Validated version on the zero-imputed data with val_fraction = 0.3: only `tolerance`
# varies over the grid. This overwrites `gm2`.
@time gm2, bestparams_gm2 = PHD.greedymodel_cv(
    Y, X_missing_zero_std;
    val_fraction=0.3,
    maxdepth=[8],
    tolerance=[0.0, 0.005, 0.01, 0.02, 0.05, 0.10],
    minbucket=[20],
    missingdata=X_missing,
)
# Display the selected parameters.
bestparams_gm2

In [None]:
# Print a text rendering of the greedy model trained on the zero-imputed data.
PHD.print_ascii(gm2)

In [None]:
# Score the zero-imputation greedy model with metric = "auc"; the dataframe with
# missing values goes in as an extra positional argument.
# NOTE(review): training passed `missingdata = X_missing`, whereas this call passes the
# standardized X_missing_std — confirm that the difference is intended.
PHD.evaluate(Y, X_missing_zero_std, gm2, X_missing_std; metric="auc")