# Compare MVR, ME, and fast SDP knockoff solvers with KnockPy

The code below was run on Sherlock with Julia v1.6.2 and Python 3.9.10.

In [83]:
using Revise
using Knockoffs
using Random
using GLMNet
using Distributions
using LinearAlgebra
using ToeplitzMatrices
using StatsBase
using PyCall
using BenchmarkTools
# using Plots
# gr(fmt=:png);

py"""
import numpy as np
import knockpy as kpy
from knockpy.knockoff_filter import KnockoffFilter
from knockpy.mrc import solve_mvr
from knockpy.mrc import solve_maxent
from knockpy.mrc import _solve_maxent_sdp_cd
"""

# import FANOK functions
py"""
from sklearn.datasets import make_regression
from fanok import GaussianKnockoffs, KnockoffSelector
from fanok.statistics import EstimatorStatistics
from fanok.sdp.sdp import sdp_full
"""

## First check Accuracy

In [51]:
seed = 2022

# simulate cov
Random.seed!(seed)
n = 600
p = 300
ρ = 0.5
# Symmetric Toeplitz matrix whose first column is ρ^0, ρ^1, …, ρ^(p-1),
# i.e. Σ[i, j] = ρ^|i - j|.
Σ = Matrix(SymmetricToeplitz(ρ.^(0:(p-1))))

# solve s vector in Julia
# (despite the `Xko_` prefix, these hold the solved s vectors, not knockoffs)
Xko_fastSDP = Knockoffs.solve_sdp_fast(Σ)
Xko_maxent = Knockoffs.solve_max_entropy(Σ)
Xko_mvr = Knockoffs.solve_MVR(Σ)

# solve s vector in Python
py"""
s1 = _solve_maxent_sdp_cd($Σ, True, verbose=False)
s2 = solve_maxent($Σ, verbose=False)
s3 = solve_mvr($Σ, verbose=False)
"""
# Only the diagonal of each Python result is compared with the Julia s vectors.
# Every evaluation of `py"s1"` converts the full p×p array to a Julia matrix, so
# convert each result once and take its diagonal, rather than re-evaluating
# `py"s1"[j, j]` for every j.
python_sdp_fast = diag(py"s1")
python_me = diag(py"s2")
python_mvr = diag(py"s3");

Compare coordinate descent SDP solutions

In [52]:
# Column 1: Knockoffs.jl `solve_sdp_fast`; column 2: knockpy `_solve_maxent_sdp_cd`
hcat(Xko_fastSDP, python_sdp_fast)

300×2 Matrix{Float64}:
 1.0       0.999023
 0.657597  0.658304
 0.682226  0.672158
 0.656115  0.669939
 0.672361  0.65783
 0.663902  0.670257
 0.667925  0.668111
 0.666119  0.662868
 0.666898  0.666026
 0.666572  0.668988
 0.666705  0.660951
 0.666651  0.670985
 0.666673  0.66308
 ⋮         
 0.666641  0.66214
 0.66665   0.668238
 0.666724  0.665808
 0.666804  0.667473
 0.666575  0.661187
 0.665975  0.670046
 0.667282  0.664861
 0.671132  0.667198
 0.647843  0.657499
 0.704939  0.686756
 0.640859  0.650471
 1.0       0.999023

Compare MVR solutions

In [55]:
# Column 1: Knockoffs.jl `solve_MVR`; column 2: knockpy `solve_mvr`
hcat(Xko_mvr, python_mvr)

300×2 Matrix{Float64}:
 0.594468  0.593891
 0.430784  0.430363
 0.438428  0.438002
 0.438477  0.438042
 0.438445  0.438027
 0.438447  0.438015
 0.438447  0.438035
 0.438447  0.438014
 0.438447  0.438033
 0.438447  0.438017
 0.438447  0.438024
 0.438447  0.438018
 0.438447  0.43802
 ⋮         
 0.438447  0.438017
 0.438447  0.43802
 0.438447  0.438013
 0.438447  0.438032
 0.438447  0.43802
 0.438447  0.438018
 0.438447  0.438023
 0.438445  0.438016
 0.438477  0.43805
 0.438428  0.437997
 0.430784  0.430365
 0.594468  0.593887

Compare ME solutions

In [56]:
# Column 1: Knockoffs.jl `solve_max_entropy`; column 2: knockpy `solve_maxent`
hcat(Xko_maxent, python_me)

300×2 Matrix{Float64}:
 0.658052  0.657409
 0.470574  0.470114
 0.486666  0.486192
 0.485212  0.484738
 0.485343  0.484869
 0.485331  0.484856
 0.485332  0.48486
 0.485332  0.484858
 0.485332  0.48486
 0.485332  0.484856
 0.485332  0.484859
 0.485332  0.484854
 0.485332  0.484859
 ⋮         
 0.485332  0.484856
 0.485332  0.484859
 0.485332  0.484858
 0.485332  0.484859
 0.485332  0.484856
 0.485332  0.484861
 0.485331  0.484857
 0.485343  0.484867
 0.485212  0.484739
 0.486666  0.486188
 0.470574  0.470118
 0.658052  0.657408

## Use covariance structure estimated from haplotype panel

In [24]:
using VCFTools
using CovarianceEstimation

# Read the chromosome 22 VCF into a Float32 matrix `H`; `save_snp_info=true`
# additionally returns the per-sample and per-SNP metadata vectors.
vcffile = "/scratch/users/bbchu/1000genomes/chr22.1kg.phase3.v5a.vcf.gz"
H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_gt(
    Float32, vcffile; save_snp_info=true, msg="importing"
);

importing 100%|██████████████████████████████████████████| Time: 0:02:35


In [57]:
p = 1000
# Pick p *distinct* SNP columns, kept in their original order.
# `rand(H_pos, p)` draws with replacement and `indexin` maps a repeated position
# back to the same column, so the subset could contain duplicated columns and
# produce a singular (or near-singular) correlation matrix.
idx = sample(eachindex(H_pos), p; replace=false, ordered=true)
pos = H_pos[idx]
# Linear shrinkage covariance estimator: diagonal (unequal variance) target,
# with the shrinkage intensity selected by the `:lw` rule.
covariance_approximator = LinearShrinkage(DiagonalUnequalVariance(), :lw)
Hsubset = Matrix{Float64}(H[:, idx])
Σapprox = cov(covariance_approximator, Hsubset)
# Rescale the covariance in place to a correlation matrix (unit diagonal).
StatsBase.cov2cor!(Σapprox.data, sqrt.(diag(Σapprox)))
Σapprox

1000×1000 Symmetric{Float64, Matrix{Float64}}:
  1.0         -0.00494105  -0.00881853  …  -0.0036799    0.0294882
 -0.00494105   1.0         -0.00619296     -0.00258427   0.0115194
 -0.00881853  -0.00619296   1.0            -0.00461226  -0.0137788
 -0.0214722   -0.0150792    0.0417584      -0.0112304   -0.0877525
 -0.00403193  -0.00283149  -0.00505349     -0.00210878  -0.028092
 -0.00465754  -0.00327083  -0.0058376   …  -0.00243598   3.11329e-5
 -0.00834217  -0.00585842  -0.0104558      -0.00436312  -0.00355534
 -0.0585277    0.0210869    0.0396601       0.00894323  -0.0156688
 -0.0036799   -0.00258427  -0.00461226     -0.00192466  -0.021533
 -0.00842693  -0.00591795  -0.010562       -0.00440745  -0.0623303
 -0.00465754  -0.00327083   0.0512615   …  -0.00243598  -0.0194581
 -0.0375728   -0.0263861    0.0919638       0.0360546   -0.117813
 -0.00659733  -0.00463309   0.113051       -0.00345053  -0.0413651
  ⋮                                     ⋱               
 -0.0036799   -0.00258427 

In [None]:
# solve s vector in Julia
# (despite the `Xko_` prefix, these hold the solved s vectors, not knockoffs)
Xko_fastSDP = Knockoffs.solve_sdp_fast(Σapprox)
Xko_maxent = Knockoffs.solve_max_entropy(Σapprox)
Xko_mvr = Knockoffs.solve_MVR(Σapprox)

# solve s vector in Python
# Pass a plain dense matrix to Python instead of the `Symmetric` wrapper.
Σapprox_dense = Σapprox |> Matrix{Float64}
py"""
s1 = _solve_maxent_sdp_cd($Σapprox_dense, True, verbose=False)
s2 = solve_maxent($Σapprox_dense, verbose=False)
s3 = solve_mvr($Σapprox_dense, verbose=False)
"""
# Only the diagonal of each Python result is compared with the Julia s vectors.
# Every evaluation of `py"s1"` converts the full p×p array to Julia, so convert
# each result once and take its diagonal, rather than re-evaluating
# `py"s1"[j, j]` for every j (p full-matrix conversions per solver).
python_sdp_fast = diag(py"s1")
python_me = diag(py"s2")
python_mvr = diag(py"s3");

# FANOK coordinate descent SDP
py"""
fanok_s, objective = sdp_full($Σapprox_dense, return_objectives=True)
"""
# Convert once, then keep the first p entries (one per variable).
fanok_s = py"fanok_s"[1:p]
# Keep the whole objective trace. NOTE(review): it presumably has one entry per
# solver iteration rather than per variable, so it must not be indexed by 1:p
# (that either truncates it or throws) -- confirm against the FANOK docs.
fanok_obj = py"objective"

In [88]:
# Columns: Knockoffs.jl `solve_sdp_fast` | knockpy `_solve_maxent_sdp_cd` | FANOK `sdp_full`
hcat(Xko_fastSDP, python_sdp_fast, fanok_s)

1000×3 Matrix{Float64}:
 0.0          0.414159   0.413731
 0.0367086    0.795124   0.755108
 0.000939696  0.1243     0.198332
 0.0          0.0495887  0.0854215
 0.0525752    0.136539   0.14108
 0.0          0.323591   0.308282
 0.0          0.164048   0.218462
 0.0371742    0.365272   0.395066
 0.0          0.0929491  0.125177
 0.0          0.184221   0.153254
 0.200141     0.200468   0.193885
 0.0          0.196814   0.180653
 0.184513     0.322894   0.306238
 ⋮                       
 0.343322     0.655016   0.6873
 0.0          0.0752058  0.0553899
 0.0          0.11009    0.0545394
 0.0          0.379152   0.281685
 0.531038     0.431247   0.458284
 0.0          0.04467    0.0373567
 0.0          0.507285   0.418157
 0.477493     0.54709    0.620794
 0.234428     0.584779   0.43726
 0.358622     0.705355   0.696764
 0.0118011    0.185331   0.170402
 0.322787     0.384423   0.405193

In [73]:
# Column 1: Knockoffs.jl `solve_MVR`; column 2: knockpy `solve_mvr`
hcat(Xko_mvr, python_mvr)

1000×2 Matrix{Float64}:
 0.323994  0.323679
 0.46754   0.467083
 0.127592  0.127468
 0.128104  0.127979
 0.133576  0.133449
 0.220463  0.220249
 0.146178  0.146036
 0.310374  0.310073
 0.139745  0.139612
 0.135091  0.134962
 0.152819  0.152671
 0.171994  0.171826
 0.178473  0.178297
 ⋮         
 0.386806  0.386431
 0.139714  0.139577
 0.122189  0.12207
 0.325262  0.324944
 0.286693  0.286413
 0.11044   0.110331
 0.316815  0.31651
 0.36265   0.362296
 0.39878   0.398389
 0.433776  0.433353
 0.138955  0.13882
 0.32342   0.323105

In [74]:
# Column 1: Knockoffs.jl `solve_max_entropy`; column 2: knockpy `solve_maxent`
hcat(Xko_maxent, python_me)

1000×2 Matrix{Float64}:
 0.398377  0.397991
 0.568115  0.567562
 0.138766  0.138629
 0.132038  0.131908
 0.157582  0.157428
 0.271662  0.271393
 0.17012   0.169953
 0.369722  0.369359
 0.153242  0.153095
 0.149692  0.149553
 0.183624  0.183445
 0.1984    0.198205
 0.213967  0.213762
 ⋮         
 0.499938  0.499446
 0.155157  0.155002
 0.133605  0.133474
 0.379493  0.379122
 0.350647  0.350307
 0.1108    0.11069
 0.395266  0.394875
 0.448027  0.447583
 0.487639  0.487163
 0.52835   0.527838
 0.16625   0.166086
 0.389779  0.389391

# Timing results (Julia 1.6)

Here we force all functions to run the same number of iterations of coordinate descent.

In [7]:
# simulate covariance matrix Sigma in python, and bring it over to Julia
py"""
p = 300
rho = 0.5
Sigma = (1-rho) * np.eye(p) + rho * np.ones((p, p))
"""
Sigma = py"Sigma";

Maximum entropy

In [8]:
# Benchmark knockpy's `solve_maxent` through PyCall. The snippet is not
# interpolated, so it reads the Python-side global `Sigma` assigned in the
# earlier py block.
@btime begin
    py"""
    solve_maxent(Sigma, verbose=False) # 5 iter
    """
end

  643.552 ms (3 allocations: 144 bytes)


In [9]:
# `$Sigma` interpolates the non-constant global into the benchmark so that
# BenchmarkTools times the solver itself, not dynamic lookup of an untyped global.
@btime Knockoffs.solve_max_entropy($Sigma, tol=1e-15); # 5 iter

  42.950 ms (22 allocations: 2.86 MiB)


MVR

In [10]:
# Benchmark knockpy's `solve_mvr` through PyCall. The snippet is not
# interpolated, so it reads the Python-side global `Sigma` assigned in the
# earlier py block.
@btime begin
    py"""
    solve_mvr(Sigma, verbose=False) # 5 iter
    """
end

  708.561 ms (3 allocations: 144 bytes)


In [11]:
# `$Sigma` interpolates the non-constant global into the benchmark so that
# BenchmarkTools times the solver itself, not dynamic lookup of an untyped global.
@btime Knockoffs.solve_MVR($Sigma, tol=1e-13); # 5 iter

  70.486 ms (24 allocations: 2.87 MiB)


Coordinate descent SDP

In [12]:
# Benchmark knockpy's coordinate-descent SDP solver through PyCall. The snippet
# is not interpolated, so it reads the Python-side global `Sigma` assigned in
# the earlier py block.
@btime begin
    py"""
    _solve_maxent_sdp_cd(Sigma, True, verbose=False) # 49 iter
    """
end

  4.500 s (3 allocations: 144 bytes)


In [13]:
# `$Sigma` interpolates the non-constant global into the benchmark so that
# BenchmarkTools times the solver itself, not dynamic lookup of an untyped global.
# `niter=49` matches the iteration count noted for the knockpy run.
@btime Knockoffs.solve_sdp_fast($Sigma, verbose=false, niter=49); # 49 iter

  465.666 ms (9 allocations: 1.38 MiB)


## Conclusion

Julia is 9~15x faster in all 3 cases. 

On a separate machine, timed in the Julia REPL, Julia is 11~22x faster in all 3 cases. 