# Check if representative group knockoffs are working

In [1]:
# load packages for this tutorial
using Revise
using Knockoffs
using LinearAlgebra
using Random
using StatsBase
using Statistics
using ToeplitzMatrices
using Distributions
using Clustering
using ProgressMeter
using LowRankApprox
using Plots
using CSV, DataFrames
gr(fmt=:png);

# some helper functions to compute power and empirical FDR
"""
    TP(correct_groups, signif_groups)

Return the fraction of `correct_groups` that also appear in `signif_groups`.
The denominator is clamped to at least 1, so an empty `correct_groups` yields `0.0`.
"""
function TP(correct_groups, signif_groups)
    n_hits = length(intersect(signif_groups, correct_groups))
    n_truth = length(correct_groups)
    return n_hits / max(1, n_truth)
end
"""
    TP(correct_groups, β̂, groups)

Convenience method: derive the selected groups from the coefficient vector `β̂`
via `get_signif_groups(β̂, groups)` and forward them to the two-argument `TP`.
"""
function TP(correct_groups, β̂, groups)
    return TP(correct_groups, get_signif_groups(β̂, groups))
end
"""
    power(correct_snps, discovered_snps)

Return the fraction of `correct_snps` that also appear in `discovered_snps`.

The denominator is clamped to at least 1 (the same convention as `TP`), so an
empty `correct_snps` yields `0.0` instead of `NaN` from a 0/0 division.
"""
function power(correct_snps, discovered_snps)
    n_found = length(discovered_snps ∩ correct_snps)
    return n_found / max(1, length(correct_snps))
end
"""
    FDR(correct_groups, signif_groups)

Return the empirical false discovery proportion: the number of entries of
`signif_groups` not accounted for by `correct_groups`, divided by the number of
selections. The denominator is clamped to at least 1, so no selections yields `0.0`.
"""
function FDR(correct_groups, signif_groups)
    n_selected = length(signif_groups)
    n_true_hits = length(signif_groups ∩ correct_groups)
    n_false = n_selected - n_true_hits # number of false positives
    return n_false / max(1, n_selected)
end
"""
    FDR(correct_groups, β̂, groups)

Convenience method: derive the selected groups from the coefficient vector `β̂`
via `get_signif_groups(β̂, groups)` and forward them to the two-argument `FDR`.
"""
function FDR(correct_groups, β̂, groups)
    return FDR(correct_groups, get_signif_groups(β̂, groups))
end
"""
    get_signif_groups(β, groups)

Return the distinct group labels `groups[i]` over every index `i` at which `β`
is nonzero, as a `Vector{Int}` ordered by first appearance.
"""
function get_signif_groups(β, groups)
    nonzero_idx = findall(!iszero, β)
    labels = Int[groups[i] for i in nonzero_idx]
    return unique!(labels) # keeps the first occurrence of each label
end

┌ Info: Precompiling Knockoffs [878bf26d-0c49-448a-9df5-b057c815d613]
└ @ Base loading.jl:1423


get_signif_groups (generic function with 1 method)

## gnomAD panel

In [75]:
# Read the matrix stored in the text file and turn it into a dense Float64 matrix.
datadir = "/Users/biona001/Benjamin_Folder/research/4th_project/group_knockoff_test_data"
covfile = CSV.read(joinpath(datadir, "CorG_2_127374341_128034347.txt"), DataFrame)
Σ = Matrix{Float64}(covfile)

# Shrink slightly toward the identity: every eigenvalue λ becomes 0.99λ + 0.01,
# which keeps the matrix usable as a covariance (ensure PSD).
Σ = 0.99 * Σ + 0.01 * I

# test on smaller data: keep only the leading idx × idx block
idx = 1000 # 1261 # includes largest group with 192 members
Σ = Σ[1:idx, 1:idx];

In [84]:
# Simulate a design matrix from N(μ, Σ), partition its columns into groups with
# representatives, generate representative group knockoffs, and split both the
# original and the knockoff matrix into representative / non-representative
# columns so their correlation structure can be compared in the cells below.

# simulate data
m = 5 # forwarded as `m` to the knockoff generator
p = size(Σ, 1)
k = 10 # number of causal variables (not used in this cell)
n = 1000 # sample size

# simulate X: n draws from N(μ, Σ), one sample per row
μ = zeros(p)
X = rand(MvNormal(μ, Σ), n)' |> Matrix
zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

# define groups
nrep = 2
groups, group_reps = id_partition_groups(X, rep_method=:rss, nrep=nrep, force_contiguous=false)
unique_groups = unique(groups)
countmap(groups) |> values |> collect |> sort # sorted group sizes

# rep group knockoffs
@time ko = modelX_gaussian_rep_group_knockoffs(X, :maxent, μ, Σ, groups, group_reps, m=m, nrep=nrep)

# column indices that are not group representatives (computed once, used twice)
non_reps = setdiff(1:p, group_reps)

X̃ = ko.X̃
Xr = X[:, group_reps]
Xc = X[:, non_reps]
X̃r = X̃[:, group_reps]
X̃c = X̃[:, non_reps];

328 representatives for 1000 variables
eigmin(Symmetric(D)) = 0.005731717330369396
  2.089163 seconds (9.61 k allocations: 899.285 MiB, 2.66% gc time)


In [85]:
# Pick 5 distinct variable indices at random (sorted). Sampling without
# replacement avoids duplicate indices, which would repeat rows/columns in the
# correlation submatrices inspected below.
v = sort(sample(1:p, 5; replace=false))

# flag which of the sampled variables are group representatives
is_rep = [x in group_reps for x in v]

5-element Vector{Bool}:
 1
 0
 1
 0
 0

In [86]:
# sample correlation of the original data X, restricted to the indices in v
cor(X)[v, v]

5×5 Matrix{Float64}:
  1.0        -0.128538     0.0476025   0.0361166  -0.0293369
 -0.128538    1.0         -0.0127992  -0.039143    0.00126043
  0.0476025  -0.0127992    1.0        -0.216614    0.0569595
  0.0361166  -0.039143    -0.216614    1.0        -0.288986
 -0.0293369   0.00126043   0.0569595  -0.288986    1.0

In [87]:
# sample correlation of the knockoff matrix X̃ at the same indices v, for comparison with cor(X)[v, v]
cor(X̃)[v, v]

5×5 Matrix{Float64}:
  1.0        -0.132074      0.0519285    0.0500152  -0.0395178
 -0.132074    1.0          -0.00445221  -0.0630244   0.000921488
  0.0519285  -0.00445221    1.0         -0.222619    0.0312273
  0.0500152  -0.0630244    -0.222619     1.0        -0.276945
 -0.0395178   0.000921488   0.0312273   -0.276945    1.0

In [88]:
# leading 5×5 block of the sample correlation among the representative columns of X
cor(Xr)[1:5, 1:5]

5×5 Matrix{Float64}:
  1.0         0.0362155   0.36476    -0.147871    0.166614
  0.0362155   1.0         0.101374   -0.115301    0.522624
  0.36476     0.101374    1.0        -0.0779953   0.294344
 -0.147871   -0.115301   -0.0779953   1.0        -0.111885
  0.166614    0.522624    0.294344   -0.111885    1.0

In [89]:
# leading 5×5 block of the sample correlation among the representative columns of the knockoffs X̃
cor(X̃r)[1:5, 1:5]

5×5 Matrix{Float64}:
  1.0         0.0400154   0.332551   -0.0889591   0.14156
  0.0400154   1.0         0.0805449  -0.0728491   0.488461
  0.332551    0.0805449   1.0        -0.0907807   0.294156
 -0.0889591  -0.0728491  -0.0907807   1.0        -0.0466453
  0.14156     0.488461    0.294156   -0.0466453   1.0

In [90]:
# leading 5×5 block of the sample correlation among the non-representative columns of X
cor(Xc)[1:5, 1:5]

5×5 Matrix{Float64}:
  1.0        0.98311   -0.403906  -0.403641  -0.406845
  0.98311    1.0       -0.40092   -0.402446  -0.404
 -0.403906  -0.40092    1.0        0.978953   0.980318
 -0.403641  -0.402446   0.978953   1.0        0.983577
 -0.406845  -0.404      0.980318   0.983577   1.0

In [91]:
# leading 5×5 block of the sample correlation among the non-representative columns of the knockoffs X̃
cor(X̃c)[1:5, 1:5]

5×5 Matrix{Float64}:
  1.0        0.853673  -0.283059  -0.341981  -0.349544
  0.853673   1.0       -0.288434  -0.369715  -0.386392
 -0.283059  -0.288434   1.0        0.894652   0.822844
 -0.341981  -0.369715   0.894652   1.0        0.964723
 -0.349544  -0.386392   0.822844   0.964723   1.0

**Conclusion**
The correlation structure of $X_r$ and $\tilde{X}_r$ agrees fairly well, while that of $X_c$ and $\tilde{X}_c$ agrees only somewhat. Let's try this in simulations.

## One simulation

In [21]:
# One simulated replicate: draw X ~ N(μ, Σ), plant k nonzero effects, and compare
# group-level power / FDR of fully general maximum-entropy (ME) group knockoffs
# against representative ME group knockoffs, both followed by a lasso fit.

# simulate data
m = 5
p = size(Σ, 1)
k = 10 # number of causal variables (their groups may number fewer than k)
n = 500 # sample size
μ = zeros(p)

# Position within the filter's `βs` used for every power/FDR number below.
# NOTE(review): presumably the target FDR = 0.1 entry — confirm against fit_lasso.
fdr_idx = 3

# simulate X
X = rand(MvNormal(μ, Σ), n)' |> Matrix
zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

# define groups
nrep = 5
groups, group_reps = id_partition_groups(X, rep_method=:rss, nrep=nrep, force_contiguous=false)
unique_groups = unique(groups)

# simulate βtrue: k nonzero effects, shuffled into random positions
βtrue = zeros(p)
βtrue[1:k] .= rand(-1:2:1, k) .* randn(k)
shuffle!(βtrue)
causal_groups = get_signif_groups(βtrue, groups)

# simulate y
ϵ = randn(n)
y = X * βtrue + ϵ

# fully general me
@time me = modelX_gaussian_group_knockoffs(
    X, :maxent, groups, μ, Σ, 
    m = m,
    niter = 10,
    tol = 0.01,    # convergence tolerance
    verbose=false, # whether to print informative intermediate results
)
me_ko_filter = fit_lasso(y, me)
me_power = round(TP(causal_groups, me_ko_filter.βs[fdr_idx], groups), digits=3)
me_fdr = round(FDR(causal_groups, me_ko_filter.βs[fdr_idx], groups), digits=3)
me_ssum = sum(abs.(me_ko_filter.ko.S))
@show me_power, me_fdr

# representative ME knockoffs
@time rme = modelX_gaussian_rep_group_knockoffs(
    X, :maxent, μ, Σ, groups, group_reps,
    m = m, 
    nrep = nrep, 
);
rme_ko_filter = fit_lasso(y, rme)
# same helper the ME branch uses (through TP/FDR) to map nonzero coefficients to groups
discovered_groups = get_signif_groups(rme_ko_filter.βs[fdr_idx], groups)
rme_power = round(TP(causal_groups, discovered_groups), digits=3)
rme_fdr = round(FDR(causal_groups, discovered_groups), digits=3)
@show rme_power, rme_fdr

 11.531111 seconds (94.64 k allocations: 249.330 MiB, 0.06% gc time)
(me_power, me_fdr) = (0.5, 0.0)
232 representatives for 500 variables
  0.395546 seconds (6.19 k allocations: 249.188 MiB)
(rme_power, rme_fdr) = (0.625, 0.444)


(0.625, 0.444)

### Interpolative decomposition, target FDR = 0.1, nrep=5, m = 5

In [22]:
# Average the empirical FDR of representative ME group knockoffs over `nsims`
# seeded replicates; the fully general ME knockoffs are fit on the same data
# in each replicate for comparison.
fdr_hat = 0.0
nsims = 10

# simulation settings (identical for every replicate, so defined once)
m = 5
p = size(Σ, 1)
k = 10 # number of causal variables (their groups may number fewer than k)
n = 500 # sample size
μ = zeros(p)
nrep = 5

# Position within the filter's `βs` used for every power/FDR number below.
# NOTE(review): presumably the target FDR = 0.1 entry — confirm against fit_lasso.
fdr_idx = 3

# Depends only on μ and Σ, which do not change between replicates.
X_dist = MvNormal(μ, Σ)

for i in 1:nsims
    Random.seed!(i)

    # simulate X
    X = rand(X_dist, n)' |> Matrix
    zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

    # define groups
    groups, group_reps = id_partition_groups(X, rep_method=:rss, nrep=nrep, force_contiguous=false)
    unique_groups = unique(groups)

    # simulate βtrue: k nonzero effects, shuffled into random positions
    βtrue = zeros(p)
    βtrue[1:k] .= rand(-1:2:1, k) .* randn(k)
    shuffle!(βtrue)
    causal_groups = get_signif_groups(βtrue, groups)

    # simulate y
    ϵ = randn(n)
    y = X * βtrue + ϵ

    # fully general me
    @time me = modelX_gaussian_group_knockoffs(
        X, :maxent, groups, μ, Σ, 
        m = m,
        niter = 10,
        tol = 0.01,    # convergence tolerance
        verbose=false, # whether to print informative intermediate results
    )
    me_ko_filter = fit_lasso(y, me)
    me_power = round(TP(causal_groups, me_ko_filter.βs[fdr_idx], groups), digits=3)
    me_fdr = round(FDR(causal_groups, me_ko_filter.βs[fdr_idx], groups), digits=3)
    me_ssum = sum(abs.(me_ko_filter.ko.S))
    @show me_power, me_fdr

    # representative ME knockoffs
    @time rme = modelX_gaussian_rep_group_knockoffs(
        X, :maxent, μ, Σ, groups, group_reps,
        m = m, 
        nrep = nrep, 
    );
    rme_ko_filter = fit_lasso(y, rme)
    # same helper the ME branch uses (through TP/FDR) to map nonzero coefficients to groups
    discovered_groups = get_signif_groups(rme_ko_filter.βs[fdr_idx], groups)
    rme_power = round(TP(causal_groups, discovered_groups), digits=3)
    rme_fdr = round(FDR(causal_groups, discovered_groups), digits=3)
    @show rme_power, rme_fdr

    # `global` lets the accumulator update the top-level variable even when this
    # code runs as a plain script, where a bare `+=` inside the loop would fail.
    global fdr_hat += rme_fdr
end
fdr_hat /= nsims
println("representative ME knockoff has avg FDR $fdr_hat")

 12.313693 seconds (95.67 k allocations: 249.381 MiB, 0.08% gc time)
(me_power, me_fdr) = (0.429, 0.25)
230 representatives for 500 variables
  0.509597 seconds (6.16 k allocations: 249.041 MiB, 2.71% gc time)
(rme_power, rme_fdr) = (0.714, 0.375)
  7.505731 seconds (60.49 k allocations: 246.737 MiB, 2.55% gc time)
(me_power, me_fdr) = (0.5, 0.0)
244 representatives for 500 variables
  0.471243 seconds (6.89 k allocations: 250.043 MiB, 1.13% gc time)
(rme_power, rme_fdr) = (0.8, 0.333)
 12.215512 seconds (97.08 k allocations: 249.463 MiB, 0.06% gc time)
(me_power, me_fdr) = (0.444, 0.0)
234 representatives for 500 variables
  0.458480 seconds (6.64 k allocations: 249.290 MiB, 0.92% gc time)
(rme_power, rme_fdr) = (0.444, 0.2)
 10.417053 seconds (90.08 k allocations: 248.990 MiB, 0.07% gc time)
(me_power, me_fdr) = (0.333, 0.0)
239 representatives for 500 variables
  0.415355 seconds (6.80 k allocations: 249.696 MiB, 0.92% gc time)
(rme_power, rme_fdr) = (0.778, 0.0)
 10.054782 seconds 

### Increasing nrep to 25

In [23]:
# Average the empirical FDR of representative ME group knockoffs over `nsims`
# seeded replicates, this time with nrep = 25; the fully general ME knockoffs
# are fit on the same data in each replicate for comparison.
fdr_hat = 0.0
nsims = 10

# simulation settings (identical for every replicate, so defined once)
m = 5
p = size(Σ, 1)
k = 10 # number of causal variables (their groups may number fewer than k)
n = 500 # sample size
μ = zeros(p)
nrep = 25

# Position within the filter's `βs` used for every power/FDR number below.
# NOTE(review): presumably the target FDR = 0.1 entry — confirm against fit_lasso.
fdr_idx = 3

# Depends only on μ and Σ, which do not change between replicates.
X_dist = MvNormal(μ, Σ)

for i in 1:nsims
    Random.seed!(i)

    # simulate X
    X = rand(X_dist, n)' |> Matrix
    zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

    # define groups
    groups, group_reps = id_partition_groups(X, rep_method=:rss, nrep=nrep, force_contiguous=false)
    unique_groups = unique(groups)

    # simulate βtrue: k nonzero effects, shuffled into random positions
    βtrue = zeros(p)
    βtrue[1:k] .= rand(-1:2:1, k) .* randn(k)
    shuffle!(βtrue)
    causal_groups = get_signif_groups(βtrue, groups)

    # simulate y
    ϵ = randn(n)
    y = X * βtrue + ϵ

    # fully general me
    @time me = modelX_gaussian_group_knockoffs(
        X, :maxent, groups, μ, Σ, 
        m = m,
        niter = 10,
        tol = 0.01,    # convergence tolerance
        verbose=false, # whether to print informative intermediate results
    )
    me_ko_filter = fit_lasso(y, me)
    me_power = round(TP(causal_groups, me_ko_filter.βs[fdr_idx], groups), digits=3)
    me_fdr = round(FDR(causal_groups, me_ko_filter.βs[fdr_idx], groups), digits=3)
    me_ssum = sum(abs.(me_ko_filter.ko.S))
    @show me_power, me_fdr

    # representative ME knockoffs
    @time rme = modelX_gaussian_rep_group_knockoffs(
        X, :maxent, μ, Σ, groups, group_reps,
        m = m, 
        nrep = nrep, 
    );
    rme_ko_filter = fit_lasso(y, rme)
    # same helper the ME branch uses (through TP/FDR) to map nonzero coefficients to groups
    discovered_groups = get_signif_groups(rme_ko_filter.βs[fdr_idx], groups)
    rme_power = round(TP(causal_groups, discovered_groups), digits=3)
    rme_fdr = round(FDR(causal_groups, discovered_groups), digits=3)
    @show rme_power, rme_fdr

    # `global` lets the accumulator update the top-level variable even when this
    # code runs as a plain script, where a bare `+=` inside the loop would fail.
    global fdr_hat += rme_fdr
end
fdr_hat /= nsims
println("representative ME knockoff has avg FDR $fdr_hat")

 12.011551 seconds (95.67 k allocations: 249.381 MiB, 0.21% gc time)
(me_power, me_fdr) = (0.429, 0.25)
451 representatives for 500 variables
  5.136995 seconds (51.74 k allocations: 266.111 MiB, 0.11% gc time)
(rme_power, rme_fdr) = (0.571, 0.0)
  7.051117 seconds (60.49 k allocations: 246.737 MiB)
(me_power, me_fdr) = (0.5, 0.0)
469 representatives for 500 variables
  7.076789 seconds (64.98 k allocations: 267.282 MiB, 0.07% gc time)
(rme_power, rme_fdr) = (0.7, 0.0)
 11.949892 seconds (97.08 k allocations: 249.463 MiB, 0.06% gc time)
(me_power, me_fdr) = (0.444, 0.0)
444 representatives for 500 variables
  7.579106 seconds (77.50 k allocations: 271.273 MiB, 0.12% gc time)
(rme_power, rme_fdr) = (0.0, 0.0)
 10.252961 seconds (90.08 k allocations: 248.990 MiB)
(me_power, me_fdr) = (0.333, 0.0)
459 representatives for 500 variables
  6.615283 seconds (64.83 k allocations: 265.520 MiB)
(rme_power, rme_fdr) = (0.444, 0.0)
 10.005554 seconds (82.59 k allocations: 248.314 MiB, 0.03% gc tim