# Make fastPHASE knockoffs

In [29]:
using Revise
using LinearAlgebra
using DelimitedFiles
using Distributions
using ProgressMeter
using SnpArrays
using Random
using Knockoffs
using BenchmarkTools
plinkname = "/Users/biona001/.julia/dev/Knockoffs/fastphase/ukb.10k.chr10"
datadir = "/Users/biona001/.julia/dev/Knockoffs/fastphase"
T = 10
extension="ukb_chr10_n1000_"

"ukb_chr10_n1000_"

## (scaled) Forward backward algorithm to get Z

In [2]:
xdata = SnpData(plinkname)
x = xdata.snparray
n, p = size(x)

# get r, α, θ estimated by fastPHASE
r, θ, α = process_fastphase_output(datadir, T, extension=extension)
K = size(θ, 2) # number of haplotype motifs
statespace = (K * (K + 1)) >> 1
table = MarkovChainTable(K)

# form transition matrices, initial state and emission probabilities
H = get_haplotype_transition_matrix(r, θ, α)
Q = get_genotype_transition_matrix(H, table)
q = get_initial_probabilities(α, table);

In [43]:
xi = zeros(Float64, p)
Z = zeros(Int, p);

Random.seed!(2022)
i = 1
xi = copyto!(xi, @view(x[i, :]))
@time forward_backward_sampling!(Z, xi, Q, q, θ, table)
[Z [index_to_pair(table, i) for i in Z]]

  0.106456 seconds (29.49 k allocations: 27.892 MiB)


29481×2 Matrix{Any}:
 10  (1, 10)
 10  (1, 10)
 10  (1, 10)
 10  (1, 10)
 10  (1, 10)
 10  (1, 10)
 10  (1, 10)
 10  (1, 10)
 10  (1, 10)
 10  (1, 10)
 10  (1, 10)
 10  (1, 10)
 10  (1, 10)
  ⋮  
 43  (6, 8)
 43  (6, 8)
 43  (6, 8)
 43  (6, 8)
 43  (6, 8)
 43  (6, 8)
 43  (6, 8)
 43  (6, 8)
 43  (6, 8)
 43  (6, 8)
 43  (6, 8)
 43  (6, 8)

## Sample knockoff of markov chain

In [30]:
Z̃ = zeros(Int, p)
N = zeros(p, statespace)
d = Categorical([1 / statespace for _ in 1:statespace])
@btime markov_knockoffs!($Z̃, $Z, $N, $d, $Q, $q)

  407.559 ms (0 allocations: 0 bytes)


29481-element Vector{Int64}:
 10
 55
 55
 55
 55
 55
 55
 55
 55
 55
 55
 55
 55
  ⋮
 55
 55
 55
 55
 55
 55
 55
 55
 55
 55
 55
 52

In [41]:
Z̃ = zeros(Int, p)
N = zeros(p, statespace)
d = Categorical([1 / statespace for _ in 1:statespace])
markov_knockoffs!(Z̃, Z, N, d, Q, q)

29481-element Vector{Int64}:
 10
 10
 10
 10
 10
 10
 10
 10
 10
 10
 10
 10
  4
  ⋮
 43
 43
 43
 43
 43
 43
 43
 43
 43
 43
 43
 43

## Numerical issues with sampling knockoffs

In [11]:
j = 1
update_normalizing_constants!(N, Z, Z̃, Q, q, j) # equation 5
sample_dmc_knockoff!(Z̃, Z, d, N, Q, q, j)

j = 2
update_normalizing_constants!(N, Z, Z̃, Q, q, j) # equation 5
sample_dmc_knockoff!(Z̃, Z, d, N, Q, q, j)

j = 3
update_normalizing_constants!(N, Z, Z̃, Q, q, j) # equation 5
sample_dmc_knockoff!(Z̃, Z, d, N, Q, q, j)

# j = 4
# update_normalizing_constants!(N, Z, Z̃, Q, q, j) # equation 5
# sample_dmc_knockoff!(Z̃, Z, d, N, Q, q, j)

In [18]:
for j in 1:p
    update_normalizing_constants!(N, Z, Z̃, Q, q, j) # equation 5
    sample_dmc_knockoff!(Z̃, Z, d, N, Q, q, j)
end

└ @ Main In[17]:57


In [19]:
sum(d.p)

6.519686293830544e6

In [163]:
sum(d.p)

0.9999083501058149

In [174]:
sum(d.p)

0.999860105088087

In [20]:
sum(N, dims=2)

29481×1 Matrix{Float64}:
  1.0000000000000002
 54.2639683695825
  0.018428434849456683
 54.28294099029847
  0.01842202157592164
 54.46560222966858
  0.018421161559418645
 54.76540497321972
  0.018324716116195197
 54.878190326055076
  0.018273299653702453
 55.0139156956132
  0.018204465860137662
  ⋮
  3.6609090374126475e86
  2.73506239070679e-87
  3.65622384277509e86
  2.7386355293396198e-87
  3.656499771203274e86
  2.735025320919982e-87
  3.657915374966508e86
  2.735829710737848e-87
  3.656225791496842e86
  2.7354690477186257e-87
  3.6556919305423893e86
  2.3076678095328526e-92

In [21]:
N

29481×55 Matrix{Float64}:
 0.0104452    0.0202199    0.0202368    …  0.0181799    0.00809235
 0.00206479   0.000988552  0.000876434     0.00160533   0.00250341
 8.28113e-7   4.18914e-7   2.98606e-7      6.68505e-7   9.49961e-7
 0.022767     0.0144971    0.0150734       0.0215867    0.0202142
 2.19784e-5   1.68489e-5   1.40993e-5      1.57211e-5   7.47459e-6
 0.072532     0.028756     0.0587493    …  0.0335778    0.00271332
 1.52591e-5   1.51584e-5   2.2648e-5       2.28177e-5   2.76152e-5
 0.0296311    0.0766969    0.106753        0.0997475    0.121466
 6.97654e-6   1.59861e-5   3.2569e-5       2.34815e-5   2.22566e-5
 0.0697864    0.055066     0.0922631       0.0305128    0.0566642
 3.31068e-5   1.93318e-5   4.08648e-5   …  1.20735e-5   1.20862e-5
 0.126527     0.142382     0.133423        0.149429     0.133096
 0.000110562  9.05297e-5   0.000103102     8.60514e-5   8.62945e-5
 ⋮                                      ⋱               
 3.99625e77   4.28725e78   5.92134e77      4.70307e7

In [27]:
log(1.7789e74)

170.96729207730846

In [26]:
log(4.19576e-94)

-215.00892424987288

In [22]:
[Z Z̃]

29481×2 Matrix{Int64}:
 10  10
 10  10
 10  10
 10  10
 10  10
 10  10
 10  10
 10  10
 10  10
 10  10
 10  10
 10  10
 10  10
  ⋮  
 43  43
 43  43
 43  43
 43  43
 43  43
 43  43
 43  43
 43  43
 43  43
 43  43
 43  43
 43   6

In [25]:
idx = findall(Z .!= Z̃)
[Z[idx] Z̃[idx]]

1035×2 Matrix{Int64}:
  6   4
  6  41
 41  42
  7   9
 25  52
 52  10
 45  49
 37  38
  6   4
  5  14
 12   3
  3  20
 27  26
  ⋮  
  6  15
 27  45
  5  22
 35   5
 35  29
  5   4
 47   7
 47  46
  4  31
 32   4
 50  32
 43   6

In [61]:
sum(N, dims=2)

29481×1 Matrix{Float64}:
  1.0000000000000002
 54.252902658802775
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  ⋮
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0
  0.0

In [52]:
[Z Z̃]

29481×2 Matrix{Int64}:
 10  10
 10  10
 10  10
 10  10
 10  10
 10  10
 10  10
 10  10
 10  10
 10  10
 10  10
 10  10
 10   4
  ⋮  
 43   1
 43   1
 43   1
 43   1
 43   1
 43   1
 43   1
 43   1
 43   1
 43   1
 43   1
 43   1

In [51]:
sum(N, dims=2)

29481×1 Matrix{Float64}:
   1.0000000000000002
  54.252902658802775
   0.01842374543503838
  54.112817757347486
   0.018278940586756772
  53.87084989721795
   0.018208199190451246
  54.169598287482536
   0.018162786236685727
  54.43644276278406
   0.018090173843828914
  53.72953347112866
   0.017324637624574243
   ⋮
 NaN
 NaN
 NaN
 NaN
 NaN
 NaN
 NaN
 NaN
 NaN
 NaN
 NaN
 NaN