# Check if representative group knockoffs are working

In [5]:
# load packages for this tutorial
using Revise
using Knockoffs
using LinearAlgebra
using Random
using StatsBase
using Statistics
using ToeplitzMatrices
using Distributions
using Clustering
using ProgressMeter
using LowRankApprox
using CSV, DataFrames
using RCall
using Plots
gr(fmt=:png);

# some helper functions to compute power and empirical FDR
"""
    TP(correct_groups, signif_groups)

Return the fraction of `correct_groups` that also appear in `signif_groups`.
The denominator is clamped to at least 1, so an empty `correct_groups` yields `0.0`.
"""
function TP(correct_groups, signif_groups)
    n_hits = length(intersect(signif_groups, correct_groups))
    n_truth = max(1, length(correct_groups))
    return n_hits / n_truth
end
"""
    TP(correct_groups, β̂, groups)

Convert the nonzero entries of `β̂` into group labels via `get_signif_groups`
and forward to the two-argument `TP`.
"""
function TP(correct_groups, β̂, groups)
    return TP(correct_groups, get_signif_groups(β̂, groups))
end
"""
    power(correct_snps, discovered_snps)

Return the fraction of `correct_snps` that were recovered in `discovered_snps`.

The denominator is clamped to at least 1 (the same convention as `TP`), so an
empty `correct_snps` yields `0.0` instead of `NaN` from a `0 / 0` division.
"""
function power(correct_snps, discovered_snps)
    n_found = length(discovered_snps ∩ correct_snps)
    return n_found / max(1, length(correct_snps))
end
"""
    FDR(correct_groups, signif_groups)

Return the empirical false discovery proportion: the number of entries of
`signif_groups` not matched in `correct_groups`, divided by the number of
selections (clamped to at least 1, so no selections yields `0.0`).
"""
function FDR(correct_groups, signif_groups)
    n_selected = length(signif_groups)
    n_true_pos = length(intersect(signif_groups, correct_groups))
    n_false_pos = n_selected - n_true_pos
    return n_false_pos / max(1, n_selected)
end
"""
    FDR(correct_groups, β̂, groups)

Convert the nonzero entries of `β̂` into group labels via `get_signif_groups`
and forward to the two-argument `FDR`.
"""
function FDR(correct_groups, β̂, groups)
    return FDR(correct_groups, get_signif_groups(β̂, groups))
end
"""
    get_signif_groups(β, groups)

Return the distinct group labels (as a `Vector{Int}`) of the nonzero entries of
`β`, in order of first appearance. `groups[i]` is the label of variable `i`.
"""
function get_signif_groups(β, groups)
    selected = Int[]
    for idx in findall(!iszero, β)
        label = groups[idx]
        if !(label in selected)
            push!(selected, label)
        end
    end
    return selected
end

## Simulate data with gnomAD panel

In [4]:
# Load a correlation matrix from disk, shrink it, subset it, simulate a design
# matrix from it, and partition the variables into groups.
# NOTE: `datadir` is a machine-specific absolute path; point it at a local copy of the data.
datadir = "/Users/biona001/Benjamin_Folder/research/4th_project_PRS/group_knockoff_test_data"
covfile = CSV.read(joinpath(datadir, "CorG_2_127374341_128034347.txt"), DataFrame)
Σ = covfile |> Matrix{Float64}
Σ = 0.99Σ + 0.01I # shrink toward the identity to ensure PSD

# test on smaller data
idx = 500 # 1241 # includes largest group with 192 members
Σ = Σ[1:idx, 1:idx]; # keep only the leading idx × idx block

# simulate data
m = 5 # passed as keyword `m` to the knockoff generators in later cells
p = size(Σ, 1) # number of variables
k = 10 # number of causal groups (not used in this cell)
n = 1000 # sample size

# simulate X
μ = zeros(p)
# rand(MvNormal(μ, Σ), n) has one sample per column; transpose so rows are samples
X = rand(MvNormal(μ, Σ), n)' |> Matrix
zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

# define groups
nrep = 5 # not referenced again in the visible cells
groups = id_partition_groups(X, force_contiguous=false)
unique_groups = unique(groups) # not referenced again in the visible cells
# sorted list of group sizes
countmap(groups) |> values |> collect |> sort

135-element Vector{Int64}:
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  ⋮
 15
 16
 17
 18
 19
 22
 24
 25
 30
 32
 34
 44

In [36]:
# group knockoffs
@time ko = modelX_gaussian_group_knockoffs(X, :maxent, groups, μ, Σ, m=m, verbose=true);

Maxent initial obj = -21697.062824271106
Iter 1 (PCA): obj = -17873.19705916164, δ = 2.7894170882622116, t1 = 0.2, t2 = 0.07
Iter 2 (PCA): obj = -14422.241803784958, δ = 0.9222596242085215, t1 = 0.37, t2 = 0.13
Iter 3 (PCA): obj = -12753.226804435964, δ = 0.5414552226809967, t1 = 0.58, t2 = 0.2
Iter 4 (PCA): obj = -11795.165658886217, δ = 0.4192851451235552, t1 = 0.86, t2 = 0.26
Iter 5 (PCA): obj = -11206.72302765663, δ = 0.48592870354058665, t1 = 1.07, t2 = 0.33
Iter 6 (PCA): obj = -10838.549047856957, δ = 0.6366331320900753, t1 = 1.33, t2 = 0.39
Iter 7 (PCA): obj = -10617.833462405928, δ = 0.6754313214808505, t1 = 1.57, t2 = 0.45
Iter 8 (PCA): obj = -10482.505674070737, δ = 0.27530848796806606, t1 = 1.79, t2 = 0.51
Iter 9 (PCA): obj = -10373.903657779418, δ = 0.17728527440017383, t1 = 1.95, t2 = 0.57
Iter 10 (PCA): obj = -10284.36124155696, δ = 0.2618276162815484, t1 = 2.1, t2 = 0.64
Iter 11 (CCD): obj = -10068.859996265239, δ = 0.5205009289960065, t1 = 2.38, t2 = 1.05, t3 = 0.0
Iter

In [37]:
# rep group knockoffs
# NOTE(review): unlike the group-knockoff call in the previous cell, `m` is not
# passed here, so the function's default value is used — confirm this is intended.
@time rko = modelX_gaussian_rep_group_knockoffs(X, :maxent, μ, Σ, groups, verbose=true)

# Split both the original variables and their knockoffs into the columns indexed
# by `group_reps` (suffix r) and all remaining columns (suffix c).
X̃ = rko.X̃
group_reps = rko.group_reps
Xr = X[:, group_reps]
Xc = X[:, setdiff(1:p, group_reps)]
X̃r = X̃[:, group_reps]
X̃c = X̃[:, setdiff(1:p, group_reps)];

140 representatives for 500 variables, 148 optimization variables
  0.416092 seconds (19.74 k allocations: 209.171 MiB)


In [38]:
count(!iszero, ko.S), count(!iszero, rko.S)

(8836, 230548)

Check whether knockoffs generated under the conditional independence assumption satisfy exchangeability

In [39]:
# right column is the first 10 reps, left column is the group they belong to
[groups[group_reps[1:10]] group_reps[1:10]]

10×2 Matrix{Int64}:
  1   1
  2   2
  3   3
  4   4
  6   6
 90   8
  5  13
 17  38
  7  47
  8  49

In [40]:
groups[1:5]

5-element Vector{Int64}:
 1
 2
 3
 4
 6

In [8]:
cor(Xr)[1:5, 1:5]

5×5 Matrix{Float64}:
  1.0         -0.00637431  -0.122304    0.105602    0.0475231
 -0.00637431   1.0         -0.0649999   0.519294    0.0303531
 -0.122304    -0.0649999    1.0        -0.0991171  -0.0112321
  0.105602     0.519294    -0.0991171   1.0        -0.0572634
  0.0475231    0.0303531   -0.0112321  -0.0572634   1.0

In [9]:
cor(X̃r)[1:5, 1:5]

5×5 Matrix{Float64}:
  1.0         0.0171303   -0.118741     0.128614   -0.0119235
  0.0171303   1.0         -0.00397907   0.519748   -0.0560058
 -0.118741   -0.00397907   1.0         -0.0493806  -0.00172672
  0.128614    0.519748    -0.0493806    1.0        -0.0398285
 -0.0119235  -0.0560058   -0.00172672  -0.0398285   1.0

In [10]:
cor(Xc)[1:5, 1:5]

5×5 Matrix{Float64}:
  1.0        0.244665   0.253973  -0.432722  -0.434066
  0.244665   1.0        0.983266  -0.399884  -0.390473
  0.253973   0.983266   1.0       -0.398894  -0.387487
 -0.432722  -0.399884  -0.398894   1.0        0.979364
 -0.434066  -0.390473  -0.387487   0.979364   1.0

In [11]:
cor(X̃c)[1:5, 1:5]

5×5 Matrix{Float64}:
  1.0        0.381004   0.184924  -0.456294  -0.433225
  0.381004   1.0        0.659963  -0.505704  -0.381982
  0.184924   0.659963   1.0       -0.189855  -0.284863
 -0.456294  -0.505704  -0.189855   1.0        0.837525
 -0.433225  -0.381982  -0.284863   0.837525   1.0

**Conclusion**
The correlations of $X_r$ and $\tilde{X}_r$ agree fairly well, while those of $X_c$ and $\tilde{X}_c$ agree only somewhat. Let's try this in simulations.

## One simulation

In [27]:
# One simulated replicate comparing group knockoffs (`me`) against
# representative group knockoffs (`rme`) on power and empirical FDR.
# No RNG seed is set in this cell, so results differ between runs.

# simulate data
m = 5 # passed as keyword `m` to both knockoff generators below
p = size(Σ, 1)
k = 10 # number of nonzero coefficients; distinct causal groups can be fewer if several land in one group
n = 500 # sample size
μ = zeros(p)

# simulate X
X = rand(MvNormal(μ, Σ), n)' |> Matrix
zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

# define groups
groups = id_partition_groups(X, force_contiguous=false)

# simulate βtrue
βtrue = zeros(p)
βtrue[1:k] .= rand(-1:2:1, k) .* randn(k) # random sign times a standard normal draw
shuffle!(βtrue) # scatter the k nonzero effects over random positions
causal_groups = get_signif_groups(βtrue, groups) # distinct groups containing a nonzero effect

# simulate y
y = X * βtrue + randn(n)

# fully general me
@time me = modelX_gaussian_group_knockoffs(
    X, :maxent, groups, μ, Σ, 
    m = m,
    tol = 0.0001,    # convergence tolerance
    verbose=false, # whether to print informative intermediate results
)
me_ko_filter = fit_lasso(y, me)
# NOTE(review): index 3 of `βs` presumably selects the target FDR level used for
# reporting — confirm against the ordering used by `fit_lasso`.
me_power = round(TP(causal_groups, me_ko_filter.βs[3], groups), digits=3)
me_fdr = round(FDR(causal_groups, me_ko_filter.βs[3], groups), digits=3)
me_ssum = sum(abs.(me_ko_filter.ko.S)) # sum of |S| entries; not used later in this cell
@show me_power, me_fdr

# representative ME knockoffs
@time rme = modelX_gaussian_rep_group_knockoffs(
    X, :maxent, μ, Σ, groups, 
    m = m, 
);
rme_ko_filter = fit_lasso(y, rme)
# distinct groups of the nonzero coefficients (same result as get_signif_groups for integer labels)
discovered_groups = groups[findall(!iszero, rme_ko_filter.βs[3])] |> unique
rme_power = round(TP(causal_groups, discovered_groups), digits=3)
rme_fdr = round(FDR(causal_groups, discovered_groups), digits=3)
@show rme_power, rme_fdr

 10.925319 seconds (88.72 k allocations: 263.535 MiB)
(me_power, me_fdr) = (0.3, 0.0)
  0.994276 seconds (558.12 k allocations: 411.939 MiB, 1.40% gc time, 21.01% compilation time)
(rme_power, rme_fdr) = (0.3, 0.0)


(0.3, 0.0)

### Interpolative decomposition, target FDR = 0.1, m=5

In [25]:
# Repeat the single-replicate comparison `nsims` times (seeded by the loop
# index) and report the average power and empirical FDR of each method.
# The `+=` updates to the *_hat accumulators inside this top-level loop rely on
# the soft-scope rules of interactive sessions (REPL / notebook); in a plain
# script they would need `global` declarations or a wrapping function.
rme_fdr_hat = 0.0
rme_power_hat = 0.0
me_fdr_hat = 0.0
me_power_hat = 0.0

nsims = 10
for i in 1:nsims
    Random.seed!(i) # one fixed seed per replicate
    
    # simulate data
    m = 5 # passed as keyword `m` to both knockoff generators below
    p = size(Σ, 1)
    k = 10 # number of nonzero coefficients; distinct causal groups can be fewer if several land in one group
    n = 500 # sample size
    μ = zeros(p)

    # simulate X
    X = rand(MvNormal(μ, Σ), n)' |> Matrix
    zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

    # define groups
    groups = id_partition_groups(X, force_contiguous=false)

    # simulate βtrue
    βtrue = zeros(p)
    βtrue[1:k] .= rand(-1:2:1, k) .* randn(k) # random sign times a standard normal draw
    shuffle!(βtrue) # scatter the k nonzero effects over random positions
    causal_groups = get_signif_groups(βtrue, groups) # distinct groups containing a nonzero effect

    # simulate y
    y = X * βtrue + randn(n)

    # fully general me
    @time me = modelX_gaussian_group_knockoffs(
        X, :maxent, groups, μ, Σ, 
        m = m,
        tol = 0.0001,    # convergence tolerance
        verbose=false, # whether to print informative intermediate results
    )
    me_ko_filter = fit_lasso(y, me)
    # NOTE(review): index 3 of `βs` presumably corresponds to the target FDR named
    # in the section heading (0.1) — confirm against the ordering used by `fit_lasso`.
    me_power = round(TP(causal_groups, me_ko_filter.βs[3], groups), digits=3)
    me_fdr = round(FDR(causal_groups, me_ko_filter.βs[3], groups), digits=3)
    me_ssum = sum(abs.(me_ko_filter.ko.S)) # sum of |S| entries; not used later in this cell
    @show me_power, me_fdr
    
    # accumulate (note: the per-replicate values were already rounded to 3 digits)
    me_fdr_hat += me_fdr
    me_power_hat += me_power

    # representative ME knockoffs
    @time rme = modelX_gaussian_rep_group_knockoffs(
        X, :maxent, μ, Σ, groups, 
        m = m, 
    );
    rme_ko_filter = fit_lasso(y, rme)
    # distinct groups of the nonzero coefficients (same result as get_signif_groups for integer labels)
    discovered_groups = groups[findall(!iszero, rme_ko_filter.βs[3])] |> unique
    rme_power = round(TP(causal_groups, discovered_groups), digits=3)
    rme_fdr = round(FDR(causal_groups, discovered_groups), digits=3)
    @show rme_power, rme_fdr
    
    rme_fdr_hat += rme_fdr
    rme_power_hat += rme_power
end
# turn the running sums into averages over replicates
rme_fdr_hat /= nsims
rme_power_hat /= nsims
me_fdr_hat /= nsims
me_power_hat /= nsims
println("representative ME knockoff has avg power $rme_power_hat and FDR $rme_fdr_hat")
println("normal ME knockoff has avg power $me_power_hat and FDR $me_fdr_hat")

 13.573214 seconds (116.16 k allocations: 265.621 MiB, 0.07% gc time)
(me_power, me_fdr) = (0.25, 0.0)
  0.781274 seconds (19.64 k allocations: 379.382 MiB, 0.48% gc time)
(rme_power, rme_fdr) = (0.5, 0.0)
  9.439470 seconds (74.35 k allocations: 262.346 MiB, 0.04% gc time)
(me_power, me_fdr) = (0.222, 0.0)
  0.822142 seconds (19.83 k allocations: 390.208 MiB, 0.61% gc time)
(rme_power, rme_fdr) = (0.222, 0.0)
  9.172355 seconds (72.26 k allocations: 262.174 MiB)
(me_power, me_fdr) = (0.4, 0.2)
  0.925949 seconds (20.03 k allocations: 397.905 MiB, 23.96% gc time)
(rme_power, rme_fdr) = (0.5, 0.167)
  9.483176 seconds (70.27 k allocations: 261.993 MiB, 1.77% gc time)
(me_power, me_fdr) = (0.333, 0.0)
  0.752983 seconds (19.92 k allocations: 393.701 MiB, 0.40% gc time)
(rme_power, rme_fdr) = (0.444, 0.2)
 11.075415 seconds (90.86 k allocations: 263.741 MiB, 0.03% gc time)
(me_power, me_fdr) = (0.778, 0.0)
  0.830293 seconds (19.70 k allocations: 379.558 MiB, 0.40% gc time)
(rme_power, rm

In [None]:
representative ME knockoff has avg power 0.7348 and FDR 0.2743
normal ME knockoff has avg power 0.5098999999999999 and FDR 0.057499999999999996

# Get modified correlation matrix

In [1]:
# load packages for this tutorial
using Revise
using Knockoffs
using LinearAlgebra
using Random
using StatsBase
using Statistics
using ToeplitzMatrices
using Distributions
using Clustering
using ProgressMeter
using LowRankApprox
using CSV, DataFrames
using RCall
# using Plots
# gr(fmt=:png);

# some helper functions to compute power and empirical FDR
"""
    TP(correct_groups, signif_groups)

Return the fraction of `correct_groups` that also appear in `signif_groups`.
The denominator is clamped to at least 1, so an empty `correct_groups` yields `0.0`.
"""
function TP(correct_groups, signif_groups)
    n_hits = length(intersect(signif_groups, correct_groups))
    n_truth = max(1, length(correct_groups))
    return n_hits / n_truth
end
"""
    TP(correct_groups, β̂, groups)

Convert the nonzero entries of `β̂` into group labels via `get_signif_groups`
and forward to the two-argument `TP`.
"""
function TP(correct_groups, β̂, groups)
    return TP(correct_groups, get_signif_groups(β̂, groups))
end
"""
    power(correct_snps, discovered_snps)

Return the fraction of `correct_snps` that were recovered in `discovered_snps`.

The denominator is clamped to at least 1 (the same convention as `TP`), so an
empty `correct_snps` yields `0.0` instead of `NaN` from a `0 / 0` division.
"""
function power(correct_snps, discovered_snps)
    n_found = length(discovered_snps ∩ correct_snps)
    return n_found / max(1, length(correct_snps))
end
"""
    FDR(correct_groups, signif_groups)

Return the empirical false discovery proportion: the number of entries of
`signif_groups` not matched in `correct_groups`, divided by the number of
selections (clamped to at least 1, so no selections yields `0.0`).
"""
function FDR(correct_groups, signif_groups)
    n_selected = length(signif_groups)
    n_true_pos = length(intersect(signif_groups, correct_groups))
    n_false_pos = n_selected - n_true_pos
    return n_false_pos / max(1, n_selected)
end
"""
    FDR(correct_groups, β̂, groups)

Convert the nonzero entries of `β̂` into group labels via `get_signif_groups`
and forward to the two-argument `FDR`.
"""
function FDR(correct_groups, β̂, groups)
    return FDR(correct_groups, get_signif_groups(β̂, groups))
end
"""
    get_signif_groups(β, groups)

Return the distinct group labels (as a `Vector{Int}`) of the nonzero entries of
`β`, in order of first appearance. `groups[i]` is the label of variable `i`.
"""
function get_signif_groups(β, groups)
    selected = Int[]
    for idx in findall(!iszero, β)
        label = groups[idx]
        if !(label in selected)
            push!(selected, label)
        end
    end
    return selected
end

# zihuai's code for modifying correlation matrix
R"""
library(Matrix)
# Build a modified correlation matrix Sigma2 from cor.G in which every
# non-representative variable is tied to the rest only through the
# representatives of its own cluster, plus a residual term that is nonzero
# only within that cluster.
# Arguments:
#   cor.G    - square correlation matrix
#   clusters - integer cluster label per variable; labels 1..max(clusters) are visited
#   rep.data - two-column matrix: column 1 = cluster label, column 2 = index of a
#              representative variable (the layout returned by Get.group.rep)
# Returns: a dense symmetric matrix with the same dimensions as cor.G.
modify.cor <- function(cor.G, clusters, rep.data){
  rep.index<-rep.data[,2]
  Sigma2<-matrix(0,nrow(cor.G),ncol(cor.G))
  # representative-by-representative block is copied unchanged
  Sigma2[rep.index,rep.index]<-cor.G[rep.index,rep.index]
  # temp holds, per cluster, the coefficients solve(C_rep,rep) %*% C_rep,res that
  # map the cluster's representatives to its remaining members; all other entries stay 0
  temp<-matrix(0,nrow(cor.G),ncol(cor.G))
  for(k in 1:max(clusters)){
    temp.rep.index<-rep.data[rep.data[,1]==k,2]
    temp.res.index<-setdiff(which(clusters==k),temp.rep.index) # indices of variables in group k that are NOT representatives
    temp[temp.rep.index,temp.res.index]<-solve(cor.G[temp.rep.index,temp.rep.index,drop=F])%*%cor.G[temp.rep.index,temp.res.index,drop=F]
  }
  # representative vs. non-representative block, then its transpose
  Sigma2[rep.index,-rep.index]<-cor.G[rep.index,rep.index]%*%temp[rep.index,-rep.index]
  Sigma2[-rep.index,rep.index]<-t(Sigma2[rep.index,-rep.index])
  # temp2 holds, per cluster, the Schur complement C_res,res - C_res,rep solve(C_rep,rep) C_rep,res
  temp2<-matrix(0,nrow(cor.G),ncol(cor.G))
  for(k in 1:max(clusters)){
    temp.rep.index<-rep.data[rep.data[,1]==k,2]
    temp.res.index<-setdiff(which(clusters==k),temp.rep.index)
    temp2[temp.res.index,temp.res.index]<-cor.G[temp.res.index,temp.res.index]-t(cor.G[temp.rep.index,temp.res.index,drop=F])%*%solve(cor.G[temp.rep.index,temp.rep.index,drop=F])%*%cor.G[temp.rep.index,temp.res.index,drop=F]
  }
  # non-representative block = part implied by the representatives + within-cluster residual
  Sigma2[-rep.index,-rep.index]<-t(temp[rep.index,-rep.index])%*%cor.G[rep.index,rep.index]%*%temp[rep.index,-rep.index]+temp2[-rep.index,-rep.index]
  # remove numerical asymmetry and return an ordinary dense matrix
  Sigma2<-as.matrix(forceSymmetric(Sigma2))
  return(Sigma2)
}

# zihuai's code for finding representative variants per group
# For each cluster label j in 1..max(clusters), choose an ordered list of
# candidate members and keep adding them as representatives until a stopping
# rule is met.
# Arguments:
#   Sigma         - correlation matrix of all variables
#   clusters      - integer cluster label per variable
#   inv.Sigma     - inverse of Sigma; computed with solve(Sigma) when empty and
#                   stop.method is 'R2.ratio'
#   thres         - threshold compared against the mean stopping statistic
#   search.method - 'subsetC' (uses subsetC below) or 'ID'
#   stop.method   - 'R2.ratio' or 'R2'
# Returns: a two-column matrix; column 1 = cluster label, column 2 = index (into
#          Sigma) of a representative. Clusters of size 1 contribute their only member.
Get.group.rep<-function(Sigma,clusters,inv.Sigma=NULL,thres=0.75,search.method='subsetC',stop.method='R2.ratio'){
  if(length(inv.Sigma)==0 & stop.method=='R2.ratio'){inv.Sigma<-solve(Sigma)}
  rep.data<-c()
  for(j in 1:max(clusters)){
    # print(j)
    if(sum(clusters==j)==1){
      rep.data<-rbind(rep.data,cbind(j,which(clusters==j)))
    }else{
      # correlation among the members of cluster j; indices below are local to this submatrix
      cor.G<-Sigma[clusters==j,clusters==j]
      if(search.method=='ID'){
        #interpolative decomposition
        # NOTE(review): rid is not defined in this file and only Matrix is loaded
        # above; presumably it comes from a package that must be attached first - confirm
        A<-chol(cor.G)
        temp.fit<-rid(A,ncol(A),rand=F,idx_only=T)
        index.all<-temp.fit$idx
      }
      if(search.method=='subsetC'){
        # k is given explicitly, so the adaptive early stopping inside subsetC is off
        index.all<-subsetC(cor.G, k=nrow(cor.G), traceit=FALSE)$indices
      }
      # start with the first candidate and grow the representative set one at a time
      index<-index.all[1]
      for(i in 1:(nrow(cor.G)-1)){
        # print(i)
        #for(i in 1:4){
        temp.A<-cor.G[index,index,drop=F]
        #pre-compute some matrices
        # inv.A is inverted directly only on the first pass; later passes update it incrementally
        if(i==1){inv.A<-solve(temp.A)}
        B<-cor.G[index,(1:nrow(cor.G))[-index],drop=F]
        #representative residual R2
        # per remaining member: quadratic form b' inv.A b with b its correlations to the current representatives
        R2.R<-colSums(B*inv.A%*%B)
        # inv.AB is not referenced again in this function
        inv.AB<-inv.A%*%B
        
        if(stop.method=='R2.ratio'){
          #representative plus other groups R2
          index.O<-which(clusters!=j)
          index.OR<-c(which(clusters==j)[index],index.O)
          # Schur complement of inv.Sigma; equals the inverse of Sigma[index.OR,index.OR]
          # when inv.Sigma is the inverse of Sigma
          inv.A.OR<-inv.Sigma[index.OR,index.OR]-
            inv.Sigma[index.OR,-index.OR,drop=F]%*%solve(inv.Sigma[-index.OR,-index.OR])%*%t(inv.Sigma[index.OR,-index.OR,drop=F])
          B.OR<-Sigma[which(clusters==j)[-index],index.OR,drop=F]
          R2.OR<-rowSums(B.OR%*%inv.A.OR*B.OR)#diag(B%*%inv.A%*%t(B))

        #print(R2.R)
        #print(R2.OR)
        #print(B.OR[1:4])
        #print(R2.R/R2.OR)

          # stop once the representatives alone reach, on average, at least thres of
          # the quantity obtained from representatives plus all other clusters
          if(mean(R2.R/R2.OR)>=thres){
            #print(mean(R2.R/R2.OR))
            break
          }
        }
        if(stop.method=='R2'){
          if(mean(R2.R)>=thres){
            # prints the smallest per-member value before stopping
            print(min(R2.R))
            break
          }
        }
        # add the next candidate and update inv.A to the inverse of the enlarged
        # representative block using the block-inverse formula (R is the inverse
        # of the Schur complement of the old block)
        index.add<-index.all[i+1]
        b<-cor.G[index,index.add,drop=F]
        c<-cor.G[index.add,index.add,drop=F]
        #R<-as.numeric(solve(c-t(b)%*%inv.A%*%b))
        R<-solve(c-t(b)%*%inv.A%*%b)
        inv.Ab<-inv.A%*%b
        inv.A<-rbind(cbind(inv.A+inv.Ab%*%R%*%t(inv.Ab),-inv.Ab%*%R),cbind(-R%*%t(inv.Ab),R))
        #update results
        index<-c(index,index.add)
        
      }
      # translate cluster-local positions back to indices into Sigma
      index<-which(clusters==j)[index]
      rep.data<-rbind(rep.data,cbind(j,as.numeric(index)))
    }
  }
  return(rep.data)
}

# One greedy selection step on the (residual) covariance matrix C.
# Picks the column with the largest colSums(C^2)/diag(C), removes its
# contribution from C, and drops variables whose residual diagonal is <= zero.
# Arguments:
#   C     - current residual covariance/correlation matrix
#   vlist - original variable labels of the columns of C
#   RSS0  - reference total used to compute R2 (defaults to the trace of C)
#   zero  - threshold at or below which a residual diagonal entry counts as exhausted
# Returns a list: index (label of the picked variable), variance (remaining
# total after the pick), R2 = 1 - variance/RSS0, C (reduced residual matrix),
# vlist (labels of the variables kept in C).
step1 <- function(C,vlist=seq(ncol(C)),RSS0=sum(diag(C)),zero=1e-12){
  dC <- diag(C)
  rs <- colSums(C^2)/dC
  imax <- order(rs,decreasing=TRUE)[1]
  vmin <- sum(dC) - rs[imax]
  # rank-one update: subtract the part of C accounted for by the picked column
  residC = C - outer(C[,imax],C[,imax],"*")/C[imax,imax]
  index = vlist[imax]
  izero = diag(residC) <= zero
  list(index = index, variance = vmin, R2 = 1-vmin/RSS0, C=residC[!izero,!izero],vlist=vlist[!izero])
}

# Greedy ordered subset selection: apply step1 repeatedly, recording the picked
# variable and the cumulative R2 after each pick.
# Arguments:
#   C       - correlation matrix
#   k       - number of picks; when NA, up to p-1 picks are made and
#             check_early_stopping_rule may end the loop early
#   traceit - when TRUE, print progress for every pick
# Returns a list: indices (picked variables, in order) and R2 (cumulative R2 per pick).
subsetC <- function(C, k=NA, traceit=FALSE){
  ## C correlation matrix
  ## k subset size
  do.adaptive <- is.na(k)
  p <- ncol(C)
  if (do.adaptive) {
    k <- p-1
  }
  indices <- rep(0, k)
  # reference total for R2 is p (the trace of C when its diagonal is all ones)
  RSS0 <- p
  R2 <- double(k)
  vlist = seq(p)
  for(i in 1:k){
    fit1 <- step1(C, RSS0=RSS0, vlist=vlist)
    indices[i] <- fit1$index
    # as.matrix keeps C two-dimensional when only one variable remains
    C <- as.matrix(fit1$C)
    vlist <- fit1$vlist
    R2[i] <- fit1$R2
    if(traceit)cat(i, "index", fit1$index, "Variance Explained", fit1$variance,"R-squared",fit1$R2,"\n")
    
    # if there is at least 3 R2 values,
    # check early stopping rule
    if (do.adaptive && (i >= 3)) {
      rsq_u <- R2[i]
      rsq_m <- R2[i-1]
      rsq_l <- R2[i-2]
      if (check_early_stopping_rule(rsq_l, rsq_m, rsq_u)) {
        # truncate the outputs to the picks made so far
        indices <- indices[1:i]
        R2 <- R2[1:i]
        break
      }
    }
  }
  list(indices = indices, R2=R2)
}

# Decide whether the R2 sequence has flattened, given its last three values
# rsq_l, rsq_m, rsq_u (oldest to newest).
# Returns TRUE when both conditions hold:
#   1. the latest gain (rsq_u - rsq_m) is below cond_0_thresh * rsq_u, and
#   2. delta_m*rsq_u - delta_u*rsq_m is below cond_1_thresh * rsq_m * rsq_u.
check_early_stopping_rule <- function(rsq_l, rsq_m, rsq_u, cond_0_thresh=1e-2, cond_1_thresh=1e-2) 
{
  delta_u <- (rsq_u-rsq_m)
  delta_m <- (rsq_m-rsq_l)
  (delta_u < cond_0_thresh*rsq_u) && ((delta_m*rsq_u-delta_u*rsq_m) < cond_1_thresh*rsq_m*rsq_u)
}
""";

┌ Info: Precompiling Knockoffs [878bf26d-0c49-448a-9df5-b057c815d613]
└ @ Base loading.jl:1423


In [2]:
# sigma from gnomAD
p = 500 # number of leading variables to keep
# NOTE: `datadir` is a machine-specific absolute path; point it at a local copy of the data.
datadir = "/Users/biona001/Benjamin_Folder/research/4th_project_PRS/group_knockoff_test_data"
covfile = CSV.read(joinpath(datadir, "CorG_2_127374341_128034347.txt"), DataFrame) # 3782 SNPs
Sigma = covfile |> Matrix{Float64}
Sigma = 0.99Sigma + 0.01I # shrink toward the identity to ensure PSD
Sigma = Sigma[1:p, 1:p] # keep only the leading p × p block
# group labels, one per variable, computed from the correlation matrix itself
groups = hc_partition_groups(Symmetric(Sigma))

500-element Vector{Int64}:
   1
   2
   3
   4
   5
   5
   5
   6
   7
   7
   7
   8
   9
   ⋮
 126
 126
 126
 126
 116
 126
  29
 126
 126
 126
 127
 126

### Julia selected representatives (output is sorted by default)

In [3]:
@time group_reps = Knockoffs.choose_group_reps(Symmetric(Sigma), groups)

  2.132803 seconds (7.74 M allocations: 599.943 MiB, 9.71% gc time, 87.41% compilation time)


128-element Vector{Int64}:
   1
   2
   3
   4
   6
  13
  20
  47
  49
  55
  56
  57
  59
   ⋮
 415
 417
 427
 435
 441
 454
 456
 466
 469
 470
 496
 499

### Zihuai's selected representatives (output not sorted by default)

In [4]:
# Precompute the inverse in Julia and hand it to R via `inv.Sigma`, so that
# Get.group.rep does not have to call solve(Sigma) itself.
SigmaInv = inv(Sigma)
@rput groups Sigma SigmaInv # copy the Julia variables into the R session
@time begin
    R"""
    rep_data <- Get.group.rep(Sigma,groups,inv.Sigma=SigmaInv,thres=0.5,search.method='subsetC',stop.method="R2.ratio")
    """
end
@rget rep_data # copy the result back into Julia
# column 2 holds the representative indices; sort them for comparison with the Julia output above
sort(rep_data[:, 2])

  0.310382 seconds (11.67 k allocations: 676.069 KiB, 1.74% compilation time)


128-element Vector{Float64}:
   1.0
   2.0
   3.0
   4.0
   6.0
  13.0
  20.0
  47.0
  49.0
  55.0
  56.0
  57.0
  59.0
   ⋮
 415.0
 417.0
 427.0
 435.0
 441.0
 454.0
 456.0
 466.0
 469.0
 470.0
 496.0
 499.0

### Julia code to obtain modified Sigma

In [5]:
groups = hc_partition_groups(Symmetric(Sigma))
group_reps = choose_group_reps(Symmetric(Sigma), groups)
@time Σblock = Knockoffs.cond_indep_corr(Symmetric(Sigma), groups, group_reps)

  1.009973 seconds (3.94 M allocations: 299.847 MiB, 2.85% gc time, 95.32% compilation time)


500×500 Matrix{Float64}:
  1.0          0.00703777   0.36065    …   0.0475407   -0.0153158
  0.00703777   1.0          0.0724068     -0.0111789   -0.0394566
  0.36065      0.0724068    1.0           -0.031883    -0.0722137
 -0.116456    -0.0428011   -0.0750035      0.00356468   0.112799
  0.121003     0.486608     0.244956      -0.0196619   -0.045217
  0.123155     0.49526      0.249311   …  -0.0200115   -0.046021
  0.121709     0.489446     0.246384      -0.0197766   -0.0454807
  0.0531057   -0.0247771    0.0957684      0.00627473  -0.0243865
 -0.141313    -0.171324    -0.41472        0.0447385    0.0768919
 -0.14165     -0.171733    -0.415709       0.0448453    0.0770753
 -0.141832    -0.171954    -0.416243   …   0.0449028    0.0771743
 -0.138011     0.14106     -0.266951       0.0198007    0.0255342
 -0.118319    -0.0342125   -0.0722016     -0.0141803   -0.0382694
  ⋮                                    ⋱               
 -0.0154147   -0.0397113   -0.0726798     -0.0294573    0.982984

### Zihuai's code to obtain modified Sigma

In [6]:
# zihuai's output
@time begin
    R"""
    Sigma2 <- modify.cor(Sigma, groups, rep_data)
    """
end
@rget Sigma2

  0.039085 seconds (42 allocations: 1.141 KiB)


500×500 Matrix{Float64}:
  1.0          0.00703777   0.36065    …   0.0475407   -0.0153158
  0.00703777   1.0          0.0724068     -0.0111789   -0.0394566
  0.36065      0.0724068    1.0           -0.031883    -0.0722137
 -0.116456    -0.0428011   -0.0750035      0.00356468   0.112799
  0.121003     0.486608     0.244956      -0.0196619   -0.045217
  0.123155     0.49526      0.249311   …  -0.0200115   -0.046021
  0.121709     0.489446     0.246384      -0.0197766   -0.0454807
  0.0531057   -0.0247771    0.0957684      0.00627473  -0.0243865
 -0.141313    -0.171324    -0.41472        0.0447385    0.0768919
 -0.14165     -0.171733    -0.415709       0.0448453    0.0770753
 -0.141832    -0.171954    -0.416243   …   0.0449028    0.0771743
 -0.138011     0.14106     -0.266951       0.0198007    0.0255342
 -0.118319    -0.0342125   -0.0722016     -0.0141803   -0.0382694
  ⋮                                    ⋱               
 -0.0154147   -0.0397113   -0.0726798     -0.0294573    0.982984

In [7]:
all(Σblock .≈ Sigma2)

true

### Check whether S is indeed block diagonal

In [13]:
# Generate representative group knockoffs with `enforce_cond_indep=true` on one
# simulated data set and display the resulting `S` matrix for inspection.
# No RNG seed is set in this cell, so results differ between runs.
Σ = Sigma

# simulate data
m = 5 # passed as keyword `m` to the knockoff generator below
p = size(Σ, 1)
k = 10 # number of nonzero coefficients; distinct causal groups can be fewer if several land in one group
n = 500 # sample size
μ = zeros(p)

# simulate X
X = rand(MvNormal(μ, Σ), n)' |> Matrix
zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

# define groups
groups = id_partition_groups(X, force_contiguous=false)

# simulate βtrue
βtrue = zeros(p)
βtrue[1:k] .= rand(-1:2:1, k) .* randn(k) # random sign times a standard normal draw
shuffle!(βtrue) # scatter the k nonzero effects over random positions
causal_groups = get_signif_groups(βtrue, groups) # distinct groups containing a nonzero effect

# simulate y
y = X * βtrue + randn(n)

# representative ME knockoffs
rme = modelX_gaussian_rep_group_knockoffs(
    X, :maxent, μ, Σ, groups, 
    m = m, enforce_cond_indep=true
);

# this is the D variable
rme.S

500×500 Symmetric{Float64, Matrix{Float64}}:
  0.393751      0.0          -6.14744e-18  …   0.0           8.87963e-18
  0.0           0.362128      3.45506e-18      0.0          -5.02554e-18
 -6.14744e-18   3.45506e-18   0.828354         8.56135e-18   1.0154e-16
  0.0           0.0           1.30222e-17      0.0          -7.62857e-18
  3.91045e-17  -5.40245e-17  -6.96433e-16      0.0          -4.15214e-16
  0.0           0.0          -5.28104e-17  …   0.0           1.52638e-17
  2.97126e-17  -3.76915e-18  -6.67596e-16     -6.42101e-18  -3.00153e-16
  0.0           0.0           4.4937e-18       0.0          -2.201e-18
 -2.04915e-17   2.13585e-17  -0.0281838        4.28067e-18  -2.37133e-16
  6.83048e-18   1.52337e-17  -0.0298409       -8.56135e-18   2.05622e-17
  1.63932e-17   4.39735e-17  -0.0291392    …  -1.2842e-17    1.098e-16
 -1.98084e-17  -4.55439e-18   8.44318e-16     -1.07017e-17  -5.00021e-16
 -4.78134e-18   4.39735e-17   1.41984e-16     -6.42101e-18  -9.21813e-17
  ⋮        

### Power/FDR comparison when enforcing conditional independence

It seems FDR is much inflated.

In [8]:
# Repeat the group-knockoff vs. representative-knockoff comparison `nsims`
# times (seeded by the loop index), this time passing `enforce_cond_indep=true`
# to the representative method, and report average power and empirical FDR.
# The `+=` updates to the *_hat accumulators inside this top-level loop rely on
# the soft-scope rules of interactive sessions (REPL / notebook); in a plain
# script they would need `global` declarations or a wrapping function.
rme_fdr_hat = 0.0
rme_power_hat = 0.0
me_fdr_hat = 0.0
me_power_hat = 0.0
Σ = Sigma

nsims = 10
for i in 1:nsims
    Random.seed!(i) # one fixed seed per replicate
    
    # simulate data
    m = 5 # passed as keyword `m` to both knockoff generators below
    p = size(Σ, 1)
    k = 10 # number of nonzero coefficients; distinct causal groups can be fewer if several land in one group
    n = 500 # sample size
    μ = zeros(p)

    # simulate X
    X = rand(MvNormal(μ, Σ), n)' |> Matrix
    zscore!(X, mean(X, dims=1), std(X, dims=1)); # standardize columns of X

    # define groups
    groups = id_partition_groups(X, force_contiguous=false)

    # simulate βtrue
    βtrue = zeros(p)
    βtrue[1:k] .= rand(-1:2:1, k) .* randn(k) # random sign times a standard normal draw
    shuffle!(βtrue) # scatter the k nonzero effects over random positions
    causal_groups = get_signif_groups(βtrue, groups) # distinct groups containing a nonzero effect

    # simulate y
    y = X * βtrue + randn(n)

    # fully general me
    @time me = modelX_gaussian_group_knockoffs(
        X, :maxent, groups, μ, Σ, 
        m = m,
        tol = 0.0001,    # convergence tolerance
        verbose=false, # whether to print informative intermediate results
    )
    me_ko_filter = fit_lasso(y, me)
    # NOTE(review): index 3 of `βs` presumably selects the target FDR level used for
    # reporting — confirm against the ordering used by `fit_lasso`.
    me_power = round(TP(causal_groups, me_ko_filter.βs[3], groups), digits=3)
    me_fdr = round(FDR(causal_groups, me_ko_filter.βs[3], groups), digits=3)
    me_ssum = sum(abs.(me_ko_filter.ko.S)) # sum of |S| entries; not used later in this cell
    @show me_power, me_fdr
    
    # accumulate (note: the per-replicate values were already rounded to 3 digits)
    me_fdr_hat += me_fdr
    me_power_hat += me_power

    # representative ME knockoffs
    @time rme = modelX_gaussian_rep_group_knockoffs(
        X, :maxent, μ, Σ, groups, 
        m = m, enforce_cond_indep=true
    );
    rme_ko_filter = fit_lasso(y, rme)
    # distinct groups of the nonzero coefficients (same result as get_signif_groups for integer labels)
    discovered_groups = groups[findall(!iszero, rme_ko_filter.βs[3])] |> unique
    rme_power = round(TP(causal_groups, discovered_groups), digits=3)
    rme_fdr = round(FDR(causal_groups, discovered_groups), digits=3)
    @show rme_power, rme_fdr
    
    rme_fdr_hat += rme_fdr
    rme_power_hat += rme_power
end
# turn the running sums into averages over replicates
rme_fdr_hat /= nsims
rme_power_hat /= nsims
me_fdr_hat /= nsims
me_power_hat /= nsims
println("representative ME knockoff has avg power $rme_power_hat and FDR $rme_fdr_hat")
println("normal ME knockoff has avg power $me_power_hat and FDR $me_fdr_hat")

 36.821832 seconds (73.29 M allocations: 4.096 GiB, 2.10% gc time, 56.35% compilation time)
(me_power, me_fdr) = (0.25, 0.0)
  3.053414 seconds (8.72 M allocations: 810.523 MiB, 2.67% gc time, 73.54% compilation time)
(rme_power, rme_fdr) = (0.625, 0.375)
 11.298044 seconds (74.36 k allocations: 262.350 MiB, 0.08% gc time)
(me_power, me_fdr) = (0.222, 0.0)
  0.979882 seconds (23.98 k allocations: 421.083 MiB, 1.82% gc time)
(rme_power, rme_fdr) = (0.444, 0.0)
 11.243664 seconds (72.26 k allocations: 262.174 MiB, 1.62% gc time)
(me_power, me_fdr) = (0.4, 0.2)
  1.241485 seconds (24.19 k allocations: 430.095 MiB, 13.82% gc time)
(rme_power, rme_fdr) = (0.7, 0.417)
 10.989873 seconds (70.27 k allocations: 261.993 MiB, 0.03% gc time)
(me_power, me_fdr) = (0.333, 0.0)
  0.819091 seconds (24.07 k allocations: 425.007 MiB, 0.80% gc time)
(rme_power, rme_fdr) = (0.556, 0.5)
 13.210707 seconds (90.86 k allocations: 263.743 MiB, 0.04% gc time)
(me_power, me_fdr) = (0.778, 0.0)
  0.858225 seconds