# Check if representative group knockoffs are working

In [1]:
# load packages for this tutorial
using Revise
using Knockoffs
using LinearAlgebra
using Random
using StatsBase
using Statistics
using ToeplitzMatrices
using Distributions
using Clustering
using ProgressMeter
using LowRankApprox
using Plots
gr(fmt=:png);

# some helper functions to compute power and empirical FDR
# Fraction of the truly causal groups that were recovered: the number of entries
# shared by `signif_groups` and `correct_groups`, divided by the number of correct
# groups. The denominator is clamped to at least 1, so an empty `correct_groups`
# gives 0.0 rather than NaN.
function TP(correct_groups, signif_groups)
    n_recovered = length(intersect(signif_groups, correct_groups))
    n_truth = max(1, length(correct_groups))
    return n_recovered / n_truth
end
# Convenience method: derive the selected groups from the nonzero entries of the
# coefficient vector `β̂` (via `get_signif_groups`, where `groups[i]` is the group
# label of coefficient i) and score them with the two-argument `TP`.
function TP(correct_groups, β̂, groups)
    return TP(correct_groups, get_signif_groups(β̂, groups))
end
# Fraction of the truly causal SNPs that were discovered: the number of entries
# shared by `discovered_snps` and `correct_snps`, divided by the number of correct
# SNPs.
#
# The denominator is clamped to at least 1, mirroring `TP` and `FDR` in this file,
# so an empty `correct_snps` returns 0.0 instead of the NaN produced by 0/0.
function power(correct_snps, discovered_snps)
    n_found = length(discovered_snps ∩ correct_snps)
    return n_found / max(1, length(correct_snps))
end
# Empirical false discovery proportion: the number of selected groups that are not
# among `correct_groups`, divided by the number of selected groups. The denominator
# is clamped to at least 1, so selecting nothing yields 0.0.
function FDR(correct_groups, signif_groups)
    n_selected = length(signif_groups)
    n_true_hits = length(intersect(signif_groups, correct_groups))
    n_false_hits = n_selected - n_true_hits # number of false positives
    return n_false_hits / max(1, n_selected)
end
# Convenience method: derive the selected groups from the nonzero entries of the
# coefficient vector `β̂` (via `get_signif_groups`, where `groups[i]` is the group
# label of coefficient i) and score them with the two-argument `FDR`.
function FDR(correct_groups, β̂, groups)
    return FDR(correct_groups, get_signif_groups(β̂, groups))
end
# Collect the distinct group labels `groups[i]` for every index i at which `β` is
# nonzero. Labels are returned as a `Vector{Int}` in order of first appearance.
function get_signif_groups(β, groups)
    selected = Int[]
    for (idx, coef) in pairs(β)
        iszero(coef) && continue
        label = groups[idx]
        if !(label in selected)
            push!(selected, label)
        end
    end
    return selected
end

┌ Info: Precompiling Knockoffs [878bf26d-0c49-448a-9df5-b057c815d613]
└ @ Base loading.jl:1423


get_signif_groups (generic function with 1 method)

## One simulation

In [229]:
# Single end-to-end run comparing three knockoff constructions on one simulated
# data set: general group knockoffs (`me`), representative group knockoffs (`rme`),
# and ordinary knockoffs built on the representative columns only (`sme`).
# NOTE(review): unlike the repeated-simulation loops later in this notebook, this
# cell never calls Random.seed!, so its printed numbers are not reproducible.
# simulate data
m = 1 # forwarded as the `m` keyword to every knockoff constructor below
p = 500 # number of features (columns of X)
k = 10 # number of causal groups
n = 250 # sample size
μ = zeros(p)
# Σ = Matrix(SymmetricToeplitz(0.9.^(0:(p-1))))

# define true groups and true covariance
# (p/5 consecutive groups of 5 features each; `groups[j]` is the label of feature j)
groupsizes = [5 for i in 1:div(p, 5)] 
groups = vcat([i*ones(g) for (i, g) in enumerate(groupsizes)]...) |> Vector{Int}
Σ = simulate_block_covariance(groups, 0.75, 0.0)

# define groups empirically
# (this overwrites the "true" `groups` above with the partition returned for Σ;
#  `group_reps` is later used to index columns of X, so it holds feature indices)
nrep = 1
groups, group_reps = id_partition_groups(Σ, nrep=nrep)
unique_groups = unique(groups)

# simulate X
# (rand returns one sample per column, so transpose to get an n × p design matrix)
X = rand(MvNormal(μ, Σ), n)' |> Matrix
zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

# simulate βtrue: SNPs in causal groups will share the same effect 
δ = 2
βtrue = zeros(p)
shuffle!(unique_groups) # random order, so the first k labels are a random causal set
causal_groups = unique_groups[1:k]
for g in causal_groups
    β_idx = findall(x -> x == g, groups)
    # random sign (±1) times a magnitude drawn from Uniform(δ/2, δ), split evenly
    # across the members of the group
    effect_size = rand(-1:2:1) * rand(Uniform(δ/2, δ)) / length(β_idx)
    βtrue[β_idx] .= effect_size
end

# simulate y
ϵ = randn(n)
y = X * βtrue + ϵ
correct_snps = findall(!iszero, βtrue)

# fully general ME (`:maxent`) group knockoffs over all p features
# NOTE(review): `me_t` (elapsed seconds) and `me_ssum` are computed but never used
# later in this cell.
me_t = @elapsed me = modelX_gaussian_group_knockoffs(
    X, :maxent, groups, μ, Σ, 
    m = m,
    niter = 10,
    tol = 0.01,    # convergence tolerance
    verbose=false, # whether to print informative intermediate results
)
me_ko_filter = fit_lasso(y, X, me, debias=:lasso)
# βs[3]: presumably the coefficients selected at the third target FDR level of
# fit_lasso — TODO confirm which level index 3 corresponds to
me_power = round(TP(causal_groups, me_ko_filter.βs[3], groups), digits=3)
me_fdr = round(FDR(causal_groups, me_ko_filter.βs[3], groups), digits=3)
me_ssum = sum(abs.(me_ko_filter.ko.S))
@show me_power, me_fdr

# representative ME knockoffs
rme = modelX_gaussian_rep_group_knockoffs(
    X, :maxent, μ, Σ, groups, group_reps,
    m = m, 
    nrep = nrep
);
rme_ko_filter = fit_lasso(y, rme)
# map each nonzero coefficient to its group label and de-duplicate
discovered_groups = groups[findall(!iszero, rme_ko_filter.βs[3])] |> unique
rme_power = round(TP(causal_groups, discovered_groups), digits=3)
rme_fdr = round(FDR(causal_groups, discovered_groups), digits=3)
@show rme_power, rme_fdr

# single ME knockoff
# NOTE(review): the knockoffs are generated from X[:, group_reps] only, yet
# fit_lasso receives the full X and the nonzero indices are scored against
# `correct_snps`, which indexes the full-length βtrue. The commented-out variant in
# the later loops instead passes X[:, group_reps] and scores against
# βtrue[group_reps] — confirm which is intended.
sme = modelX_gaussian_knockoffs(
    X[:, group_reps], :maxent, μ[group_reps], Σ[group_reps, group_reps], 
    m = m, 
);
sme_ko_filter = fit_lasso(y, X, sme)
discovered_snps = findall(!iszero, sme_ko_filter.βs[3])
# TP/FDR are reused here at SNP level: they only compare two index collections
sme_power = round(TP(correct_snps, discovered_snps), digits=3)
sme_fdr = round(FDR(correct_snps, discovered_snps), digits=3)
@show sme_power, sme_fdr

(me_power, me_fdr) = (1.0, 0.0)
(rme_power, rme_fdr) = (1.0, 0.091)
(sme_power, sme_fdr) = (1.0, 0.167)


(1.0, 0.167)

In [223]:
rme_ko_filter.βs[3]

500-element Vector{Float64}:
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
 -0.09454328079742436
 -1.613562588404716
  0.0
  ⋮
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0

In [220]:
causal_groups |> sort

10-element Vector{Int64}:
  12
  84
 123
 146
 214
 308
 311
 325
 355
 358

In [221]:
discovered_groups

17-element Vector{Int64}:
  11
  12
  83
  85
 123
 148
 150
 214
 306
 307
 314
 315
 325
 352
 353
 356
 360

### Target FDR = 0.1

In [232]:
# Repeat the simulation `nsims` times (seeded 1..nsims) and average the empirical
# group-level FDR of the representative group knockoffs, reading the coefficients
# from βs[3] (the section heading gives the target FDR as 0.1).
# NOTE(review): `fdr_hat` accumulates `rme_fdr` after it has been rounded to 3
# digits, so the reported mean is an average of rounded values.
fdr_hat = 0.0
nsims = 20
for i in 1:nsims
    Random.seed!(i) # one fixed seed per replicate for reproducibility
    
    # simulate data
    m = 1 # forwarded as the `m` keyword to the knockoff constructors below
    p = 500 # number of features (columns of X)
    k = 10 # number of causal groups
    n = 1250 # sample size
    μ = zeros(p)
    # Σ = Matrix(SymmetricToeplitz(0.9.^(0:(p-1))))

    # define true groups and true covariance
    # (p/5 consecutive groups of 5 features; `groups[j]` is the label of feature j)
    groupsizes = [5 for i in 1:div(p, 5)] 
    groups = vcat([i*ones(g) for (i, g) in enumerate(groupsizes)]...) |> Vector{Int}
    Σ = simulate_block_covariance(groups, 0.75, 0.0)

    # define groups empirically
    # (overwrites the "true" `groups`; this cell uses hc_partition_groups)
    nrep = 2
    groups, group_reps = hc_partition_groups(Σ, nrep=nrep)
    unique_groups = unique(groups)

    # simulate X
    # (rand returns one sample per column, so transpose to get an n × p matrix)
    X = rand(MvNormal(μ, Σ), n)' |> Matrix
    zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

    # simulate βtrue: SNPs in causal groups will share the same effect 
    δ = 1
    βtrue = zeros(p)
    shuffle!(unique_groups) # random order, so the first k labels are a random causal set
    causal_groups = unique_groups[1:k]
    for g in causal_groups
        β_idx = findall(x -> x == g, groups)
        # random sign (±1) times a magnitude drawn from Uniform(δ/2, δ), split
        # evenly across the members of the group
        effect_size = rand(-1:2:1) * rand(Uniform(δ/2, δ)) / length(β_idx)
        βtrue[β_idx] .= effect_size
    end

    # simulate y
    ϵ = randn(n)
    y = X * βtrue + ϵ
    # NOTE(review): `correct_snps` is not used by any active code in this loop
    correct_snps = findall(!iszero, βtrue)

    # fully general ME (`:maxent`) group knockoffs over all p features
    # NOTE(review): `me_t` and `me_ssum` are computed but never used in this loop
    me_t = @elapsed me = modelX_gaussian_group_knockoffs(
        X, :maxent, groups, μ, Σ, 
        m = m,
        niter = 10,
        tol = 0.01,    # convergence tolerance
        verbose=false, # whether to print informative intermediate results
    )
    me_ko_filter = fit_lasso(y, X, me, debias=:lasso)
    me_power = round(TP(causal_groups, me_ko_filter.βs[3], groups), digits=3)
    me_fdr = round(FDR(causal_groups, me_ko_filter.βs[3], groups), digits=3)
    me_ssum = sum(abs.(me_ko_filter.ko.S))
    @show me_power, me_fdr

    # representative ME knockoffs
    rme = modelX_gaussian_rep_group_knockoffs(
        X, :maxent, μ, Σ, groups, group_reps,
        m = m, 
        nrep = nrep
    );
    rme_ko_filter = fit_lasso(y, rme)
    # map each nonzero coefficient to its group label and de-duplicate
    discovered_groups = groups[findall(!iszero, rme_ko_filter.βs[3])] |> unique
    rme_power = round(TP(causal_groups, discovered_groups), digits=3)
    rme_fdr = round(FDR(causal_groups, discovered_groups), digits=3)
    @show rme_power, rme_fdr
    fdr_hat += rme_fdr # only the representative-knockoff FDR is accumulated

    # Disabled experiment (kept for reference): build knockoffs on the
    # representative columns only and score discoveries in representative space.
#     me = modelX_gaussian_rep_group_knockoffs(
#         X[:, group_reps], :maxent, μ[group_reps], Σ[group_reps, group_reps], 
#         m = m, 
#         nrep = nrep
#     );
#     me_ko_filter = fit_lasso(y, X[:, group_reps], me)
#     correct_rep_snps = findall(!iszero, βtrue[group_reps])
#     discovered_rep_snps = findall(!iszero, me_ko_filter.βs[3]);
#     me_power = round(TP(correct_rep_snps, findall(!iszero, me_ko_filter.βs[3])), digits=3)
#     me_fdr = round(FDR(correct_rep_snps, findall(!iszero, me_ko_filter.βs[3])), digits=3)
#     @show me_power, me_fdr
end
# mean empirical FDR of the representative knockoffs over all replicates
fdr_hat /= nsims

(me_power, me_fdr) = (1.0, 0.091)
(rme_power, rme_fdr) = (1.0, 0.0)
(me_power, me_fdr) = (1.0, 0.0)
(rme_power, rme_fdr) = (1.0, 0.091)
(me_power, me_fdr) = (1.0, 0.0)
(rme_power, rme_fdr) = (1.0, 0.0)
(me_power, me_fdr) = (1.0, 0.0)
(rme_power, rme_fdr) = (1.0, 0.091)
(me_power, me_fdr) = (1.0, 0.0)
(rme_power, rme_fdr) = (1.0, 0.0)
(me_power, me_fdr) = (1.0, 0.0)
(rme_power, rme_fdr) = (1.0, 0.091)
(me_power, me_fdr) = (1.0, 0.091)
(rme_power, rme_fdr) = (1.0, 0.231)
(me_power, me_fdr) = (1.0, 0.0)
(rme_power, rme_fdr) = (1.0, 0.091)
(me_power, me_fdr) = (1.0, 0.0)
(rme_power, rme_fdr) = (1.0, 0.0)
(me_power, me_fdr) = (1.0, 0.0)
(rme_power, rme_fdr) = (1.0, 0.091)
(me_power, me_fdr) = (1.0, 0.0)
(rme_power, rme_fdr) = (1.0, 0.091)
(me_power, me_fdr) = (1.0, 0.091)
(rme_power, rme_fdr) = (1.0, 0.091)
(me_power, me_fdr) = (1.0, 0.0)
(rme_power, rme_fdr) = (1.0, 0.0)
(me_power, me_fdr) = (1.0, 0.167)
(rme_power, rme_fdr) = (1.0, 0.0)
(me_power, me_fdr) = (1.0, 0.0)
(rme_power, rme_fdr)

0.056299999999999996

### Target FDR = 0.25

In [231]:
# Repeat the simulation `nsims` times (seeded 1..nsims) and average the empirical
# group-level FDR of the representative group knockoffs, reading the coefficients
# from βs[4] (the section heading gives the target FDR as 0.25). Compared with the
# previous loop, this cell also partitions groups with id_partition_groups.
# NOTE(review): `fdr_hat` accumulates `rme_fdr` after it has been rounded to 3
# digits, so the reported mean is an average of rounded values.
fdr_hat = 0.0
nsims = 20
for i in 1:nsims
    Random.seed!(i) # one fixed seed per replicate for reproducibility
    
    # simulate data
    m = 1 # forwarded as the `m` keyword to the knockoff constructors below
    p = 500 # number of features (columns of X)
    k = 10 # number of causal groups
    n = 1250 # sample size
    μ = zeros(p)
    # Σ = Matrix(SymmetricToeplitz(0.9.^(0:(p-1))))

    # define true groups and true covariance
    # (p/5 consecutive groups of 5 features; `groups[j]` is the label of feature j)
    groupsizes = [5 for i in 1:div(p, 5)] 
    groups = vcat([i*ones(g) for (i, g) in enumerate(groupsizes)]...) |> Vector{Int}
    Σ = simulate_block_covariance(groups, 0.75, 0.0)

    # define groups empirically
    # (overwrites the "true" `groups`; this cell uses id_partition_groups)
    nrep = 2
    groups, group_reps = id_partition_groups(Σ, nrep=nrep)
    unique_groups = unique(groups)

    # simulate X
    # (rand returns one sample per column, so transpose to get an n × p matrix)
    X = rand(MvNormal(μ, Σ), n)' |> Matrix
    zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

    # simulate βtrue: SNPs in causal groups will share the same effect 
    δ = 1
    βtrue = zeros(p)
    shuffle!(unique_groups) # random order, so the first k labels are a random causal set
    causal_groups = unique_groups[1:k]
    for g in causal_groups
        β_idx = findall(x -> x == g, groups)
        # random sign (±1) times a magnitude drawn from Uniform(δ/2, δ), split
        # evenly across the members of the group
        effect_size = rand(-1:2:1) * rand(Uniform(δ/2, δ)) / length(β_idx)
        βtrue[β_idx] .= effect_size
    end

    # simulate y
    ϵ = randn(n)
    y = X * βtrue + ϵ
    # NOTE(review): `correct_snps` is not used by any active code in this loop
    correct_snps = findall(!iszero, βtrue)

    # fully general ME (`:maxent`) group knockoffs over all p features
    # NOTE(review): `me_t` and `me_ssum` are computed but never used in this loop
    me_t = @elapsed me = modelX_gaussian_group_knockoffs(
        X, :maxent, groups, μ, Σ, 
        m = m,
        niter = 10,
        tol = 0.01,    # convergence tolerance
        verbose=false, # whether to print informative intermediate results
    )
    me_ko_filter = fit_lasso(y, X, me, debias=:lasso)
    me_power = round(TP(causal_groups, me_ko_filter.βs[4], groups), digits=3)
    me_fdr = round(FDR(causal_groups, me_ko_filter.βs[4], groups), digits=3)
    me_ssum = sum(abs.(me_ko_filter.ko.S))
    @show me_power, me_fdr

    # representative ME knockoffs
    rme = modelX_gaussian_rep_group_knockoffs(
        X, :maxent, μ, Σ, groups, group_reps,
        m = m, 
        nrep = nrep
    );
    rme_ko_filter = fit_lasso(y, rme)
    # map each nonzero coefficient to its group label and de-duplicate
    discovered_groups = groups[findall(!iszero, rme_ko_filter.βs[4])] |> unique
    rme_power = round(TP(causal_groups, discovered_groups), digits=3)
    rme_fdr = round(FDR(causal_groups, discovered_groups), digits=3)
    @show rme_power, rme_fdr
    fdr_hat += rme_fdr # only the representative-knockoff FDR is accumulated

    # Disabled experiment (kept for reference): build knockoffs on the
    # representative columns only and score discoveries in representative space.
    # NOTE(review): this disabled code still indexes βs[3] while the active code
    # in this cell uses βs[4] — update the index if it is ever re-enabled.
#     me = modelX_gaussian_rep_group_knockoffs(
#         X[:, group_reps], :maxent, μ[group_reps], Σ[group_reps, group_reps], 
#         m = m, 
#         nrep = nrep
#     );
#     me_ko_filter = fit_lasso(y, X[:, group_reps], me)
#     correct_rep_snps = findall(!iszero, βtrue[group_reps])
#     discovered_rep_snps = findall(!iszero, me_ko_filter.βs[3]);
#     me_power = round(TP(correct_rep_snps, findall(!iszero, me_ko_filter.βs[3])), digits=3)
#     me_fdr = round(FDR(correct_rep_snps, findall(!iszero, me_ko_filter.βs[3])), digits=3)
#     @show me_power, me_fdr
end
# mean empirical FDR of the representative knockoffs over all replicates
fdr_hat /= nsims

(me_power, me_fdr) = (1.0, 0.167)
(rme_power, rme_fdr) = (1.0, 0.167)
(me_power, me_fdr) = (1.0, 0.167)
(rme_power, rme_fdr) = (1.0, 0.091)
(me_power, me_fdr) = (1.0, 0.231)
(rme_power, rme_fdr) = (1.0, 0.0)
(me_power, me_fdr) = (1.0, 0.375)
(rme_power, rme_fdr) = (1.0, 0.231)
(me_power, me_fdr) = (1.0, 0.091)
(rme_power, rme_fdr) = (1.0, 0.167)
(me_power, me_fdr) = (1.0, 0.375)
(rme_power, rme_fdr) = (1.0, 0.5)
(me_power, me_fdr) = (1.0, 0.375)
(rme_power, rme_fdr) = (1.0, 0.286)
(me_power, me_fdr) = (1.0, 0.231)
(rme_power, rme_fdr) = (1.0, 0.091)
(me_power, me_fdr) = (1.0, 0.167)
(rme_power, rme_fdr) = (1.0, 0.091)
(me_power, me_fdr) = (1.0, 0.167)
(rme_power, rme_fdr) = (1.0, 0.167)
(me_power, me_fdr) = (1.0, 0.444)
(rme_power, rme_fdr) = (1.0, 0.412)
(me_power, me_fdr) = (1.0, 0.286)
(rme_power, rme_fdr) = (1.0, 0.231)
(me_power, me_fdr) = (1.0, 0.091)
(rme_power, rme_fdr) = (1.0, 0.091)
(me_power, me_fdr) = (1.0, 0.231)
(rme_power, rme_fdr) = (1.0, 0.231)
(me_power, me_fdr) = (1.

0.21405