# Test if marginal correlation can be used as feature importance statistic

In [2]:
using Revise
using Knockoffs
using LinearAlgebra
using Random
using StatsBase
using Statistics
using ToeplitzMatrices
using Distributions
using ProgressMeter
using DataFrames, CSV
using DelimitedFiles

In [3]:
target_fdr = 0.1
n = 1000
m = 1
p = 500
k = 50
true_mu = zeros(p)

fit_function = fit_marginal

for i in 1:10

    # simulate Σ
    Random.seed!(i)
    Σ = Matrix(SymmetricToeplitz(0.1.^(0:(p-1)))) # true covariance matrix
    L = cholesky(Σ).L

    # simulate beta
    βtrue = zeros(p)
    βtrue[1:k] .= 10 .* randn(k)
    shuffle!(βtrue)

    X = randn(n, p) * L
    zscore!(X, mean(X, dims=1), std(X, dims=1))

    # simulate y
    ϵ = randn(n)
    y = X * βtrue + ϵ

    # define group
    groups = hc_partition_groups(X, cutoff=0.99)
    @show length(unique(groups))

    # find truly causal groups
    correct_groups = groups[findall(!iszero, βtrue)] |> unique

    # equi
    equi_t = @elapsed equi = modelX_gaussian_group_knockoffs(
        X, :equi, groups, true_mu, Σ, 
        m = m,
    )
    equi_ko_filter = fit_function(y, equi)
    fdr_idx = findfirst(x -> x == target_fdr, equi_ko_filter.fdr_target)
    selected = equi_ko_filter.selected[fdr_idx]
    equi_power = length(intersect(correct_groups, selected)) / length(correct_groups)
    equi_fdr = length(setdiff(selected, correct_groups)) / max(1, length(selected))
    equi_ssum = sum(abs.(equi_ko_filter.ko.S))

    # ME
    me_t = @elapsed me = modelX_gaussian_group_knockoffs(
        X, :maxent, groups, true_mu, Σ, 
        m = m, 
    )
    me_ko_filter = fit_function(y, me)
    selected = me_ko_filter.selected[fdr_idx]
    me_power = length(intersect(correct_groups, selected)) / length(correct_groups)
    me_fdr = length(setdiff(selected, correct_groups)) / max(1, length(selected))
    me_ssum = sum(abs.(me_ko_filter.ko.S))

    # MVR
    mvr_t = @elapsed mvr = modelX_gaussian_group_knockoffs(
        X, :mvr, groups, true_mu, Σ, 
        m = m, 
    )
    mvr_ko_filter = fit_function(y, mvr)
    selected = mvr_ko_filter.selected[fdr_idx]
    mvr_power = length(intersect(correct_groups, selected)) / length(correct_groups)
    mvr_fdr = length(setdiff(selected, correct_groups)) / max(1, length(selected))
    mvr_ssum = sum(abs.(mvr_ko_filter.ko.S))

    println("equi (n $n): power = $(equi_power), fdr = $(equi_fdr), t=$(equi_t), sum(s) = $(equi_ssum)")
    println("ME (n $n): power = $(me_power), fdr = $(me_fdr), t=$(me_t), sum(s) = $(me_ssum)")
    println("mvr (n $n): power = $(mvr_power), fdr = $(mvr_fdr), t=$(mvr_t), sum(s) = $(mvr_ssum)\n")
end

length(unique(groups)) = 500


LoadError: UndefVarError: zscore not defined



In [107]:
target_fdr = 0.1
n = 1000
m = 1
p = 500
k = 50
true_mu = zeros(p)

fit_function = fit_marginal

for i in 1:10

    # simulate Σ
    Random.seed!(i)
    Σ = Matrix(SymmetricToeplitz(0.1.^(0:(p-1)))) # true covariance matrix
    L = cholesky(Σ).L

    # simulate beta
    βtrue = zeros(p)
    βtrue[1:k] .= 10 .* randn(k)
    shuffle!(βtrue)

    X = randn(n, p) * L
    zscore!(X, mean(X, dims=1), std(X, dims=1))

    # simulate y
    ϵ = randn(n)
    y = X * βtrue + ϵ

    # define group
    groups = hc_partition_groups(X, cutoff=0.99)
    @show length(unique(groups))

    # find truly causal groups
    correct_groups = groups[findall(!iszero, βtrue)] |> unique

    # equi
    equi_t = @elapsed equi = modelX_gaussian_group_knockoffs(
        X, :equi, groups, true_mu, Σ, 
        m = m,
    )
    equi_ko_filter = fit_function(y, equi)
    fdr_idx = findfirst(x -> x == target_fdr, equi_ko_filter.fdr_target)
    selected = equi_ko_filter.selected[fdr_idx]
    equi_power = length(intersect(correct_groups, selected)) / length(correct_groups)
    equi_fdr = length(setdiff(selected, correct_groups)) / max(1, length(selected))
    equi_ssum = sum(abs.(equi_ko_filter.ko.S))

    # ME
    me_t = @elapsed me = modelX_gaussian_group_knockoffs(
        X, :maxent, groups, true_mu, Σ, 
        m = m, 
    )
    me_ko_filter = fit_function(y, me)
    selected = me_ko_filter.selected[fdr_idx]
    me_power = length(intersect(correct_groups, selected)) / length(correct_groups)
    me_fdr = length(setdiff(selected, correct_groups)) / max(1, length(selected))
    me_ssum = sum(abs.(me_ko_filter.ko.S))

    # MVR
    mvr_t = @elapsed mvr = modelX_gaussian_group_knockoffs(
        X, :mvr, groups, true_mu, Σ, 
        m = m, 
    )
    mvr_ko_filter = fit_function(y, mvr)
    selected = mvr_ko_filter.selected[fdr_idx]
    mvr_power = length(intersect(correct_groups, selected)) / length(correct_groups)
    mvr_fdr = length(setdiff(selected, correct_groups)) / max(1, length(selected))
    mvr_ssum = sum(abs.(mvr_ko_filter.ko.S))

    println("equi (n $n): power = $(equi_power), fdr = $(equi_fdr), t=$(equi_t), sum(s) = $(equi_ssum)")
    println("ME (n $n): power = $(me_power), fdr = $(me_fdr), t=$(me_t), sum(s) = $(me_ssum)")
    println("mvr (n $n): power = $(mvr_power), fdr = $(mvr_fdr), t=$(mvr_t), sum(s) = $(mvr_ssum)\n")
end

length(unique(groups)) = 500
equi (n 1000): power = 0.42, fdr = 0.08695652173913043, t=0.026863333, sum(s) = 500.0
ME (n 1000): power = 0.38, fdr = 0.05, t=0.163278917, sum(s) = 480.9690727399732
mvr (n 1000): power = 0.5, fdr = 0.2647058823529412, t=0.249722583, sum(s) = 472.87343798260235

length(unique(groups)) = 500
equi (n 1000): power = 0.56, fdr = 0.15151515151515152, t=0.030452958, sum(s) = 500.0
ME (n 1000): power = 0.58, fdr = 0.21621621621621623, t=0.151088417, sum(s) = 480.9690727399732
mvr (n 1000): power = 0.52, fdr = 0.07142857142857142, t=0.258631583, sum(s) = 472.87343798260235

length(unique(groups)) = 500
equi (n 1000): power = 0.5, fdr = 0.038461538461538464, t=0.031655042, sum(s) = 500.0
ME (n 1000): power = 0.46, fdr = 0.0, t=0.154680292, sum(s) = 480.9690727399732
mvr (n 1000): power = 0.64, fdr = 0.15789473684210525, t=0.252533208, sum(s) = 472.87343798260235

length(unique(groups)) = 500
equi (n 1000): power = 0.52, fdr = 0.13333333333333333, t=0.025781416, sum