# Part 2: Knockoff sampling + GhostBasil

Over 1703 quasi-independent blocks, we have assembled
\begin{align*}
    \Sigma =
    \begin{bmatrix}
        \Sigma_1 & & \\
        & \ddots & \\
        & & \Sigma_{1703}
    \end{bmatrix}, \quad
    S = 
    \begin{bmatrix}
        S_1 & & \\
        & \ddots & \\
        & & S_{1703}
    \end{bmatrix}, \quad
    S_i = 
    \begin{bmatrix}
        S_{i,1} & & \\
        & \ddots & \\
        & & S_{i,G_i}
    \end{bmatrix}
\end{align*}
where $\Sigma_i$ are LD matrices obtained from the Pan-UKBB panel and $S_i$ are the group-block-diagonal matrices obtained by solving the knockoff optimization problem. Given a Z-score vector $z$, we can compute $r = \frac{1}{\sqrt{n}} z$, and `ghostbasil` will solve the following problem with $\lambda \ge 0, p_i \ge 0$, and $0 \le \alpha \le 1$.

\begin{align*}
\min \frac{1}{2}\beta^t A \beta - \beta^tr + \lambda\sum_ip_i\left(\alpha|\beta_i| + \frac{1-\alpha}{2}\beta_i^2\right)
\end{align*}

Here $A = \frac{1}{n}[X,\tilde{X}]^t[X,\tilde{X}]$ and $\beta$ contains the effect sizes for both the original variables and their knockoffs. 

To solve this problem, we will call `ghostbasil(A, r)` where
\begin{align*}
    A &= \text{BlockBlockGroupGhostMatrix}(B_1, ..., B_{1703})\\
    B_i &= \text{BlockGroupGhostMatrix}(C_i, S_i, m+1)\\
    C_i &= \Sigma_i - S_i
\end{align*}  
Note that James's function is defined as
\begin{align*}
    \text{BlockGroupGhostMatrix}(C, S, n.groups) = 
    \begin{bmatrix}
        C+S & C & ... & C\\
        C & C+S & ... & \\
        \vdots & & \ddots & \vdots\\
        C & C & & C + S
    \end{bmatrix}
\end{align*}
Thus we have
\begin{align*}
    A = 
    \begin{bmatrix}
        B_1 & & \\
        & \ddots & \\
        & & B_{1703}
    \end{bmatrix}, \quad
    B_i = 
    \begin{bmatrix}
        \Sigma_i & \Sigma_i-S_i & ... & \Sigma_i-S_i\\
        \Sigma_i-S_i & \Sigma_i & ... & \\
        \vdots & & \ddots & \vdots\\
        \Sigma_i-S_i & \Sigma_i-S_i & & \Sigma_i
    \end{bmatrix} = 
    \begin{bmatrix}
        C_i+S_i & C_i & ... & C_i\\
        C_i & C_i+S_i & ... & \\
        \vdots & & \ddots & \vdots\\
        C_i & C_i & & C_i + S_i
    \end{bmatrix}
\end{align*}

## Software versions

Code was tested on Sherlock with 
+ `julia/1.8.4`
+ `R/4.0.2`
+ `openssl/3.0.7`

More Z scores available at:
+ https://github.com/mikegloudemans/gwas-download

## Script

+ The script below runs the knockoff sampling + ghostbasil procedure. 
+ The paths to the pre-computed knockoff statistics and to the albuminuria Z-scores are hard-coded. 

In [None]:
# ml julia/1.8.4 R/4.0.2 openssl/3.0.7

using GhostKnockoffGWAS
using CSV
using DataFrames
using Random

"""
    read_phenotype_zscores()

Import the albuminuria GWAS summary statistics from the hard-coded file and
convert them to Z-scores.

Z-scores are obtained by calling `pval2zscore` on the `pvalue` and `beta`
columns; entries whose Z-score is `NaN` or infinite are dropped from every
returned vector.

# Returns
A tuple `(z, chr, pos, effect_allele, non_effect_allele, Neffect, hg_build)`
where the first five elements are equal-length vectors (one entry per retained
SNP) and the last two are the constants `382500` and `38`.
NOTE(review): `Neffect` is presumably the study's effective sample size and
`hg_build` the genome build of `snp_pos` -- confirm against the study.
"""
function read_phenotype_zscores()
    # constants attached to this particular study
    Neffect = 382500
    hg_build = 38

    # albuminuria study: https://pubmed.ncbi.nlm.nih.gov/30220432
    file = "/oak/stanford/groups/zihuai/GWAS_Summary_Gloudemans/Albuminuria_Haas_2018/Albuminuria_Haas_2018.txt.gz"
    table = CSV.read(file, DataFrame)

    chrom = table[!, "chr"]
    position = table[!, "snp_pos"]
    # force plain `Vector{String}` columns for the allele labels
    other_allele = Vector{String}(table[!, "non_effect_allele"])
    eff_allele = Vector{String}(table[!, "effect_allele"])

    zscores = pval2zscore(table[!, "pvalue"], table[!, "beta"])

    # retain only SNPs with a finite z-score (drops NaN and ±Inf)
    finite = findall(isfinite, zscores)
    return (
        zscores[finite],
        chrom[finite],
        position[finite],
        eff_allele[finite],
        other_allele[finite],
        Neffect,
        hg_build,
    )
end

# Map a target FDR level to the row of the `*_summary.txt` table from which
# `summary_result` reads the knockoff threshold. Throws `ArgumentError` for any
# level that has no tabulated row.
function _summary_threshold_row(target_fdr)
    target_fdr == 0.05 && return 3
    target_fdr == 0.1 && return 5
    target_fdr == 0.2 && return 9
    throw(ArgumentError("target_fdr must be one of 0.05, 0.1 or 0.2, got $target_fdr"))
end

# Within each of chromosomes 1-22, bin the rows of `result` into 1Mb windows by
# `pos_hg19`, select one row per window by applying `pick` (e.g. `findmin` or
# `findmax`) to `column`, and return the row indices of those selections whose
# value satisfies `is_signif`.
function _window_leaders(result, column, pick, is_signif)
    Mb = 1e6
    leaders = Int[]
    for chr in 1:22
        chr_idx = findall(==(chr), result[!, "chr"])
        blocks = div.(result[chr_idx, :pos_hg19], Mb)
        for block in unique(blocks)
            block_idx = findall(==(block), blocks)
            best, best_idx = pick(result[chr_idx[block_idx], column])
            if is_signif(best)
                push!(leaders, chr_idx[block_idx[best_idx]])
            end
        end
    end
    return leaders
end

"""
    summary_result(outfile::String, target_fdr=0.1)

Summarize a result table by counting discoveries that are separated into
distinct 1Mb windows, for both the marginal and the knockoff analysis.

The companion file `<name>_summary.txt` (where `<name>` is `outfile` without
its extension) is read from the same directory as `outfile`.

# Arguments
- `outfile`: path to the result table. The columns `chr`, `pos_hg19`, `pvals`,
  `W`, `group` and `zscores` are read.
- `target_fdr`: target FDR level; must be `0.05`, `0.1` or `0.2`. The knockoff
  threshold is taken from column 2 of row 3, 5 or 9 of the summary file,
  respectively.

# Returns
A tuple `(marginal_signif_snps, ko_signif_snps, nregions, nsnps)`:
- `marginal_signif_snps`: number of 1Mb windows whose smallest p-value is below
  `5e-8`.
- `ko_signif_snps`: number of 1Mb windows whose largest `W` (after keeping only
  rows with `W > 0` and one row per group, the one with largest `|zscores|`)
  exceeds the knockoff threshold.
- `nregions`, `nsnps`: the values in column 2 of rows 12 and 13 of the summary
  file.

# Throws
- `ArgumentError` if `target_fdr` is not one of the supported levels. This is
  checked before any file is read.
"""
function summary_result(outfile::String, target_fdr=0.1)
    # validate up front so an unsupported level fails fast instead of leaving
    # the threshold undefined after all the files have been parsed
    threshold_row = _summary_threshold_row(target_fdr)

    result = CSV.read(outfile, DataFrame)
    pheno_name = first(splitext(basename(outfile))) # strip the extension

    #
    # Manhattan analysis: find most significant SNP within blocks of 1Mb
    #
    marginal_idx = _window_leaders(result, :pvals, findmin, p -> p < 5e-8)
    marginal_signif_snps = length(marginal_idx)

    #
    # Knockoff analysis
    #
    summary = CSV.read(
        joinpath(dirname(outfile), pheno_name * "_summary.txt"), DataFrame, header=false
    )
    q = summary[threshold_row, 2]

    # keep only W > 0
    result = result[findall(x -> x > 0, result[!, :W]), :]

    # for each group, only keep SNP with largest |Z score|
    keep_idx = Int[]
    for g in unique(result[!, "group"])
        idx = findall(==(g), result[!, "group"])
        push!(keep_idx, idx[argmax(abs.(result[idx, "zscores"]))])
    end
    result = result[keep_idx, :]

    # find SNP with largest W within blocks of 1Mb and compare to the threshold
    ko_idx = _window_leaders(result, :W, findmax, w -> w > q)
    ko_signif_snps = length(ko_idx)

    nregions = summary[12, 2]
    nsnps = summary[13, 2]
    return marginal_signif_snps, ko_signif_snps, nregions, nsnps
end

#
# read phenotype z scores
#
# analysis settings
phenotype = "albuminuria"
LDpopulation = "EUR"
seed = 1
pseudo_validate = true
LD_shrinkage = false

# knockoff statistics are looked up per LD population; results go to a
# `results` subfolder of that same directory
knockoff_dir = "/oak/stanford/groups/zihuai/pan_ukb_group_knockoffs/$LDpopulation"
outdir = joinpath(knockoff_dir, "results")

z, chr, pos, effect_allele, non_effect_allele, Neffect, hg_build = read_phenotype_zscores()

#
# run analysis
#
Random.seed!(seed)
outfile = "$(phenotype)_seed$seed"
ghostbasil(
    knockoff_dir, z, chr, pos, effect_allele, non_effect_allele,
    Neffect, hg_build, outdir;
    pseudo_validate, LD_shrinkage, outname=outfile, seed,
)

#
# summarize result
#
result_path = joinpath(outdir, outfile * ".txt")
marginal_signif, ko_signif, nregions, nsnps = summary_result(result_path)

# write one `key,value` line per summary statistic
discoveries_path = joinpath(outdir, "$(outfile)_independent_discoveries.txt")
open(discoveries_path, "w") do io
    for line in (
        "marginal_signif,$(marginal_signif)",
        "ko_signif,$(ko_signif)",
        "nsnps,$(nsnps)",
        "nregions,$(nregions)",
    )
        println(io, line)
    end
end

println("finished.")