# Check if representative group knockoffs are working

In [1]:
# load packages for this tutorial
using Revise
using Knockoffs
using LinearAlgebra
using Random
using StatsBase
using Statistics
using ToeplitzMatrices
using Distributions
using Clustering
using ProgressMeter
using LowRankApprox
using Plots
using CSV, DataFrames
gr(fmt=:png);

# some helper functions to compute power and empirical FDR
function TP(correct_groups, signif_groups)
    return length(signif_groups ∩ correct_groups) / max(1, length(correct_groups))
end
function TP(correct_groups, β̂, groups)
    signif_groups = get_signif_groups(β̂, groups)
    return TP(correct_groups, signif_groups)
end
function power(correct_snps, discovered_snps)
    return length(discovered_snps ∩ correct_snps) / length(correct_snps)
end
function FDR(correct_groups, signif_groups)
    FP = length(signif_groups) - length(signif_groups ∩ correct_groups) # number of false positives
    return FP / max(1, length(signif_groups))
end
function FDR(correct_groups, β̂, groups)
    signif_groups = get_signif_groups(β̂, groups)
    return FDR(correct_groups, signif_groups)
end
function get_signif_groups(β, groups)
    correct_groups = Int[]
    for i in findall(!iszero, β)
        g = groups[i]
        g ∈ correct_groups || push!(correct_groups, g)
    end
    return correct_groups
end

┌ Info: Precompiling Knockoffs [878bf26d-0c49-448a-9df5-b057c815d613]
└ @ Base loading.jl:1423


get_signif_groups (generic function with 1 method)

## gnomdAD panel

In [38]:
datadir = "/Users/biona001/Benjamin_Folder/research/4th_project/group_knockoff_test_data"
covfile = CSV.read(joinpath(datadir, "CorG_2_127374341_128034347.txt"), DataFrame)
Σ = covfile |> Matrix{Float64}
Σ = 0.99Σ + 0.01I #ensure PSD

# test on smaller data
idx = 1261 # 1261 # includes largest group with 192 members
Σ = Σ[1:idx, 1:idx];

In [99]:
# simulate data
m = 5
p = size(Σ, 1)
k = 10 # number of causal groups
n = 1000 # sample size

# simulate X
μ = zeros(p)
X = rand(MvNormal(μ, Σ), n)' |> Matrix
zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

# define groups
nrep = 2
groups, group_reps = id_partition_groups(X, rep_method=:rss, nrep=nrep, force_contiguous=false)
unique_groups = unique(groups)
countmap(groups) |> values |> collect |> sort

# rep group knockoffs
@time ko = modelX_gaussian_rep_group_knockoffs(X, :maxent, μ, Σ, groups, group_reps, m=m, nrep=nrep)
# @time Xr, Σ11 = modelX_gaussian_rep_group_knockoffs(X, :maxent, μ, Σ, groups, group_reps, m=m, nrep=nrep)
# @time Xr, Xc, X̃r, X̃c, X̃r_correct, X̃c_correct = modelX_gaussian_rep_group_knockoffs(X, :maxent, μ, Σ, groups, group_reps, m=m, nrep=nrep)
# @time Xr, Xc, X̃r, X̃c = modelX_gaussian_rep_group_knockoffs(X, :maxent, μ, Σ, groups, group_reps, m=m, nrep=nrep)

X̃ = ko.ko.X̃
Xr = X[:, group_reps]
Xc = X[:, setdiff(1:p, group_reps)]
X̃r = X̃[:, group_reps]
X̃c = X̃[:, setdiff(1:p, group_reps)];

364 representatives for 1261 variables
  3.363169 seconds (10.93 k allocations: 1.445 GiB, 0.74% gc time)


In [100]:
group_reps

364-element Vector{Int64}:
    1
    2
    3
    4
    7
    8
   13
   14
   15
   16
   36
   47
   49
    ⋮
 1236
 1242
 1244
 1246
 1247
 1249
 1250
 1253
 1255
 1258
 1259
 1261

In [101]:
cor(X)[1:10, 1:10]

10×10 Matrix{Float64}:
  1.0         0.0385965   0.354094   -0.140926    …  -0.121515    -0.12824
  0.0385965   1.0         0.0627565  -0.063695       -0.145059    -0.144189
  0.354094    0.0627565   1.0        -0.0228068      -0.39173     -0.388413
 -0.140926   -0.063695   -0.0228068   1.0            -0.00548315  -0.00765585
  0.108633    0.486039    0.217817   -0.0834431      -0.316002    -0.309761
  0.0983057   0.48137     0.209839   -0.0860413   …  -0.329421    -0.323862
  0.0958495   0.489057    0.214312   -0.0760447      -0.32819     -0.322793
  0.0714102   0.0105137   0.0956552  -0.0361292      -0.0518508   -0.0521433
 -0.121515   -0.145059   -0.39173    -0.00548315      1.0          0.977416
 -0.12824    -0.144189   -0.388413   -0.00765585      0.977416     1.0

In [102]:
cor(X̃)[1:10, 1:10]

10×10 Matrix{Float64}:
  1.0        -0.0110717   0.345622   -0.111989   …  -0.0372902   -0.0438857
 -0.0110717   1.0         0.0877963  -0.070183      -0.0604354   -0.0831542
  0.345622    0.0877963   1.0        -0.0899163     -0.153379    -0.157162
 -0.111989   -0.070183   -0.0899163   1.0           -0.0327084    0.0458918
  0.0913083   0.319488    0.159071   -0.0785176     -0.00796609  -0.0420391
  0.113616    0.215618    0.122851   -0.091956   …  -0.147272    -0.19753
  0.0859025   0.339694    0.177065   -0.0633664     -0.154906    -0.115378
  0.0909293   0.0265434   0.149323   -0.0274331     -0.0490022   -0.0524674
 -0.0372902  -0.0604354  -0.153379   -0.0327084      1.0          0.159371
 -0.0438857  -0.0831542  -0.157162    0.0458918      0.159371     1.0

In [103]:
cor(Xr)[1:10, 1:10]

10×10 Matrix{Float64}:
  1.0         0.0385965    0.354094   …  -0.132584     0.0717828
  0.0385965   1.0          0.0627565     -0.147847     0.0502559
  0.354094    0.0627565    1.0           -0.396256     0.0473726
 -0.140926   -0.063695    -0.0228068     -0.00931261   0.00557025
  0.0958495   0.489057     0.214312      -0.329895     0.0490143
  0.0714102   0.0105137    0.0956552  …  -0.0572786   -0.0151829
 -0.118088   -0.00171077  -0.0466596      0.199575    -0.00359033
  0.142427    0.304971     0.324561      -0.258455     0.017858
 -0.132584   -0.147847    -0.396256       1.0          0.0278238
  0.0717828   0.0502559    0.0473726      0.0278238    1.0

In [104]:
cor(X̃r)[1:10, 1:10]

10×10 Matrix{Float64}:
  1.0        -0.0110717   0.345622   -0.111989    …  -0.0812101    0.0527239
 -0.0110717   1.0         0.0877963  -0.070183       -0.170508     0.061914
  0.345622    0.0877963   1.0        -0.0899163      -0.284619     0.0657834
 -0.111989   -0.070183   -0.0899163   1.0            -0.0570336    0.00626099
  0.0859025   0.339694    0.177065   -0.0633664      -0.234108     0.0484252
  0.0909293   0.0265434   0.149323   -0.0274331   …  -0.0219612   -0.0184096
 -0.0756698  -0.029397   -0.0477974   0.113699        0.128917    -0.0411753
  0.183828    0.137694    0.228483   -0.0855473      -0.188393     0.0240128
 -0.0812101  -0.170508   -0.284619   -0.0570336       1.0         -0.00515397
  0.0527239   0.061914    0.0657834   0.00626099     -0.00515397   1.0

In [105]:
cor(Xc)[1:10, 1:10]

10×10 Matrix{Float64}:
  1.0         0.981264   -0.316002  …  -0.317638  -0.132409  -0.316947
  0.981264    1.0        -0.329421     -0.333033  -0.120048  -0.330637
 -0.316002   -0.329421    1.0           0.973351  -0.210581   0.972282
 -0.309761   -0.323862    0.977416      0.97784   -0.206922   0.975301
 -0.312024   -0.326439    0.977265      0.978027  -0.206273   0.977115
  0.0985639   0.0838191   0.65559   …   0.658019   0.2612     0.662686
 -0.319572   -0.332927    0.973712      0.983475  -0.195903   0.982618
 -0.317638   -0.333033    0.973351      1.0       -0.206586   0.981393
 -0.132409   -0.120048   -0.210581     -0.206586   1.0       -0.197846
 -0.316947   -0.330637    0.972282      0.981393  -0.197846   1.0

In [106]:
cor(X̃c)[1:10, 1:10]

10×10 Matrix{Float64}:
  1.0          0.231955   -0.00796609  …  -0.122182    0.0500748  -0.141323
  0.231955     1.0        -0.147272       -0.100869   -0.338159   -0.0888098
 -0.00796609  -0.147272    1.0             0.375901   -0.280176    0.321197
 -0.0420391   -0.19753     0.159371       -0.150479   -0.213948    0.0598459
 -0.13757     -0.0379065   0.188739        0.53763     0.015457    0.554841
  0.0404046    0.0529159  -0.0147074   …   0.0720529   0.557415    0.00332686
  0.00934952  -0.099161    0.312389       -0.0173565  -0.313688   -0.168701
 -0.122182    -0.100869    0.375901        1.0        -0.0741195   0.538016
  0.0500748   -0.338159   -0.280176       -0.0741195   1.0        -0.174749
 -0.141323    -0.0888098   0.321197        0.538016   -0.174749    1.0

**Conclusion**
Seems like $X_r$ and $\tilde{X}_r$ agrees fairly well while $X_c$ and $\tilde{X}_c$ agrees only somewhat. Lets try this in simulations

## One simulation

In [114]:
# simulate data
m = 5
p = 500
k = 10 # number of causal groups
n = 1000 # sample size
μ = zeros(p)

# define groups empirically
nrep = 2
groups, group_reps = id_partition_groups(X, rep_method=:rss, nrep=nrep, force_contiguous=false)
unique_groups = unique(groups)

# simulate X
X = rand(MvNormal(μ, Σ), n)' |> Matrix
zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

# simulate βtrue: SNPs in causal groups will share the same effect 
δ = 2
βtrue = zeros(p)
shuffle!(unique_groups)
causal_groups = unique_groups[1:k]
for g in causal_groups
    β_idx = findall(x -> x == g, groups)
    effect_size = rand(-1:2:1) * rand(Uniform(δ/2, δ)) / length(β_idx)
    βtrue[β_idx] .= effect_size
end

# simulate y
ϵ = randn(n)
y = X * βtrue + ϵ
correct_snps = findall(!iszero, βtrue)

# fully general me
@time me = modelX_gaussian_group_knockoffs(
    X, :maxent, groups, μ, Σ, 
    m = m,
    niter = 10,
    tol = 0.01,    # convergence tolerance
    verbose=false, # whether to print informative intermediate results
)
me_ko_filter = fit_lasso(y, me, debias=:lasso)
me_power = round(TP(causal_groups, me_ko_filter.βs[3], groups), digits=3)
me_fdr = round(FDR(causal_groups, me_ko_filter.βs[3], groups), digits=3)
me_ssum = sum(abs.(me_ko_filter.ko.S))
@show me_power, me_fdr

# representative ME knockoffs
@time rme = modelX_gaussian_rep_group_knockoffs(
    X, :maxent, μ, Σ, groups, group_reps,
    m = m, 
    nrep = nrep, 
);
rme_ko_filter = fit_lasso(y, rme)
discovered_groups = groups[findall(!iszero, rme_ko_filter.βs[3])] |> unique
rme_power = round(TP(causal_groups, discovered_groups), digits=3)
rme_fdr = round(FDR(causal_groups, discovered_groups), digits=3)
@show rme_power, rme_fdr

  0.748810 seconds (13.54 k allocations: 293.404 MiB, 1.16% gc time)
(me_power, me_fdr) = (1.0, 0.0)
465 representatives for 500 variables
  0.501058 seconds (12.66 k allocations: 319.634 MiB, 2.32% gc time)
(rme_power, rme_fdr) = (1.0, 0.091)


(1.0, 0.091)

### Interpolative decomposition, target FDR = 0.1, nrep=2, m = 1

In [117]:
fdr_hat = 0.0
nsims = 10
for i in 1:nsims
    Random.seed!(i)
    
    # simulate data
    m = 1
    p = 500
    k = 10 # number of causal groups
    n = 1000 # sample size
    μ = zeros(p)

    # define groups empirically
    nrep = 2
    groups, group_reps = id_partition_groups(X, rep_method=:rss, nrep=nrep, force_contiguous=false)
    unique_groups = unique(groups)

    # simulate X
    X = rand(MvNormal(μ, Σ), n)' |> Matrix
    zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

    # simulate βtrue: SNPs in causal groups will share the same effect 
    δ = 2
    βtrue = zeros(p)
    shuffle!(unique_groups)
    causal_groups = unique_groups[1:k]
    for g in causal_groups
        β_idx = findall(x -> x == g, groups)
        effect_size = rand(-1:2:1) * rand(Uniform(δ/2, δ)) / length(β_idx)
        βtrue[β_idx] .= effect_size
    end

    # simulate y
    ϵ = randn(n)
    y = X * βtrue + ϵ
    correct_snps = findall(!iszero, βtrue)

    # fully general me
    @time me = modelX_gaussian_group_knockoffs(
        X, :maxent, groups, μ, Σ, 
        m = m,
        niter = 10,
        tol = 0.01,    # convergence tolerance
        verbose=false, # whether to print informative intermediate results
    )
    me_ko_filter = fit_lasso(y, me, debias=:lasso)
    me_power = round(TP(causal_groups, me_ko_filter.βs[3], groups), digits=3)
    me_fdr = round(FDR(causal_groups, me_ko_filter.βs[3], groups), digits=3)
    me_ssum = sum(abs.(me_ko_filter.ko.S))
    @show me_power, me_fdr

    # representative ME knockoffs
    @time rme = modelX_gaussian_rep_group_knockoffs(
        X, :maxent, μ, Σ, groups, group_reps,
        m = m, 
        nrep = nrep, 
        verbose=false
    );
    rme_ko_filter = fit_lasso(y, rme)
    discovered_groups = groups[findall(!iszero, rme_ko_filter.βs[3])] |> unique
    rme_power = round(TP(causal_groups, discovered_groups), digits=3)
    rme_fdr = round(FDR(causal_groups, discovered_groups), digits=3)
    @show rme_power, rme_fdr
    fdr_hat += rme_fdr
end
fdr_hat /= nsims
println("representative ME knockoff has avg FDR $fdr_hat")

  0.383911 seconds (13.55 k allocations: 76.065 MiB)
(me_power, me_fdr) = (0.8, 0.2)
  0.307048 seconds (12.89 k allocations: 81.699 MiB)
(rme_power, rme_fdr) = (0.8, 0.0)
  0.351180 seconds (13.54 k allocations: 76.032 MiB)
(me_power, me_fdr) = (0.9, 0.25)
  0.281787 seconds (12.84 k allocations: 81.112 MiB)
(rme_power, rme_fdr) = (0.9, 0.1)
  0.327460 seconds (13.61 k allocations: 76.037 MiB)
(me_power, me_fdr) = (0.8, 0.0)
  0.241370 seconds (12.80 k allocations: 80.286 MiB)
(rme_power, rme_fdr) = (0.8, 0.2)
  0.345613 seconds (13.63 k allocations: 76.040 MiB)
(me_power, me_fdr) = (0.8, 0.0)
  0.272350 seconds (12.79 k allocations: 80.296 MiB, 1.24% gc time)
(rme_power, rme_fdr) = (0.8, 0.0)
  0.320781 seconds (13.82 k allocations: 76.449 MiB)
(me_power, me_fdr) = (0.7, 0.222)
  0.270935 seconds (13.21 k allocations: 81.991 MiB)
(rme_power, rme_fdr) = (0.7, 0.3)
  0.341603 seconds (12.99 k allocations: 75.295 MiB)
(me_power, me_fdr) = (0.6, 0.0)
  0.260321 seconds (12.21 k allocatio

### hierarchical clustering, target FDR = 0.25, nrep=5, m = 5

In [122]:
fdr_hat = 0.0
nsims = 20 # 10 showed slightly inflated FDR, try 20
for i in 1:nsims
    Random.seed!(i)
    
    # simulate data
    m = 5
    p = 500
    k = 10 # number of causal groups
    n = 1000 # sample size
    μ = zeros(p)

    # define groups empirically
    nrep = 5
    groups, group_reps = hc_partition_groups(X, nrep=nrep, force_contiguous=true)
    unique_groups = unique(groups)

    # simulate X
    X = rand(MvNormal(μ, Σ), n)' |> Matrix
    zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

    # simulate βtrue: SNPs in causal groups will share the same effect 
    δ = 2
    βtrue = zeros(p)
    shuffle!(unique_groups)
    causal_groups = unique_groups[1:k]
    for g in causal_groups
        β_idx = findall(x -> x == g, groups)
        effect_size = rand(-1:2:1) * rand(Uniform(δ/2, δ)) / length(β_idx)
        βtrue[β_idx] .= effect_size
    end

    # simulate y
    ϵ = randn(n)
    y = X * βtrue + ϵ
    correct_snps = findall(!iszero, βtrue)

    # fully general me
    @time me = modelX_gaussian_group_knockoffs(
        X, :maxent, groups, μ, Σ, 
        m = m,
        niter = 10,
        tol = 0.01,    # convergence tolerance
        verbose=false, # whether to print informative intermediate results
    )
    me_ko_filter = fit_lasso(y, me, debias=:lasso)
    me_power = round(TP(causal_groups, me_ko_filter.βs[4], groups), digits=3)
    me_fdr = round(FDR(causal_groups, me_ko_filter.βs[4], groups), digits=3)
    me_ssum = sum(abs.(me_ko_filter.ko.S))
    @show me_power, me_fdr

    # representative ME knockoffs
    @time rme = modelX_gaussian_rep_group_knockoffs(
        X, :maxent, μ, Σ, groups, group_reps,
        m = m, 
        nrep = nrep, 
        verbose=false
    );
    rme_ko_filter = fit_lasso(y, rme)
    discovered_groups = groups[findall(!iszero, rme_ko_filter.βs[4])] |> unique
    rme_power = round(TP(causal_groups, discovered_groups), digits=3)
    rme_fdr = round(FDR(causal_groups, discovered_groups), digits=3)
    @show rme_power, rme_fdr
    fdr_hat += rme_fdr
end
fdr_hat /= nsims

  0.539341 seconds (5.87 k allocations: 279.172 MiB)
(me_power, me_fdr) = (1.0, 0.333)
  0.495837 seconds (5.87 k allocations: 288.882 MiB)
(rme_power, rme_fdr) = (1.0, 0.231)
  0.460896 seconds (5.87 k allocations: 279.172 MiB)
(me_power, me_fdr) = (1.0, 0.231)
  0.472334 seconds (5.87 k allocations: 288.420 MiB)
(rme_power, rme_fdr) = (1.0, 0.286)
  0.419509 seconds (5.87 k allocations: 279.172 MiB, 2.33% gc time)
(me_power, me_fdr) = (1.0, 0.091)
  0.592884 seconds (5.87 k allocations: 288.882 MiB, 1.40% gc time)
(rme_power, rme_fdr) = (1.0, 0.091)
  0.627975 seconds (5.87 k allocations: 279.172 MiB)
(me_power, me_fdr) = (1.0, 0.091)
  0.605533 seconds (5.87 k allocations: 287.919 MiB)
(rme_power, rme_fdr) = (1.0, 0.0)
  0.415533 seconds (5.87 k allocations: 279.172 MiB)
(me_power, me_fdr) = (1.0, 0.167)
  0.479999 seconds (5.87 k allocations: 288.327 MiB, 1.94% gc time)
(rme_power, rme_fdr) = (1.0, 0.167)
  0.496479 seconds (5.87 k allocations: 279.172 MiB)
(me_power, me_fdr) = (1.

0.143