# Run `solveblock` on UKB array data

1. Start with UKB array data filtered to British samples and **restricted to SNPs with MAF > 0.01**
2. Use covariates to adjust LD matrix following [Pan-UKB documentation](https://pan-dev.ukbb.broadinstitute.org/docs/ld#ld-matrices)
    + sex
    + age
    + age^2
    + age*sex
    + age^2*sex
    + first 10 PCs

3. Run `solveblock` on given PLINK file

```shell
ml julia/1.10 R/4.0.2
export OPENBLAS_NUM_THREADS=1
```

In [1]:
using CSV
using DataFrames
using DelimitedFiles

# helper function to submit 1 job to run 1 command
function submit(command::String, ncores::Int, total_mem::Number, 
        joblog_dir::String="/oak/stanford/groups/zihuai/solveblock/joblogs"; 
        jobname="submit", waitfor=Int[], verbose=true, highp=false)
    mem = round(Int, total_mem / ncores) # memory per core
    filename = "$jobname.sh"
    open(filename, "w") do io
        println(io, "#!/bin/bash")
        println(io, "#")
        println(io, "#SBATCH --job-name=$jobname")
        println(io, "#")
        if highp
            println(io, "#SBATCH --time=168:00:00")
        else
            println(io, "#SBATCH --time=24:00:00")
        end
        println(io, "#SBATCH --cpus-per-task=$ncores")
        println(io, "#SBATCH --mem-per-cpu=$(mem)G")
        if highp
            println(io, "#SBATCH --partition=candes,zihuai")
        else
            println(io, "#SBATCH --partition=candes,zihuai,normal,owners")
        end
        println(io, "#SBATCH --output=$(joinpath(joblog_dir, "slurm-%j.out"))")
        println(io, "")
        println(io, "#save job info on joblog:")
        println(io, "echo \"Job \$JOB_ID started on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID started on:   \" `date `")
        println(io, "")
        println(io, "# load the job environment:")
#         println(io, "module load julia/1.9")
#         println(io, "module load biology plink/1.90b5.3")
#         println(io, "module load R/4.0.2")
#         println(io, "export OPENBLAS_NUM_THREADS=1")
#         println(io, "export JULIA_DEPOT_PATH=\"/home/groups/sabatti/.julia\"")
        println(io, "")
        println(io, "# run code")
        println(io, "echo \"$command\"")
        println(io, "$command")
        println(io, "")
        println(io, "#echo job info on joblog:")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `date `")
        println(io, "#echo \" \"")
    end
    # submit job and capture job ID
    io = IOBuffer()
    if length(waitfor) != 0
        run(pipeline(`sbatch --dependency=afterok:$(join(waitfor, ':')) $filename`; stdout=io))
    else
        run(pipeline(`sbatch $filename`; stdout=io))
    end
    msg = String(take!(io))
    verbose && print(stdout, msg)
    jobid = parse(Int, strip(msg)[21:end])
    # clean up and return job ID
    close(io)
    rm(filename, force=true)
    return jobid
end

# helper function to submit 1 job to run multiple commands
function submit(commands::Vector{String}, ncores::Int, total_mem::Number, 
        joblog_dir::String="/oak/stanford/groups/zihuai/solveblock/joblogs"; 
        jobname="submit", waitfor=Int[], verbose=true, highp=false)
    mem = round(Int, total_mem / ncores) # memory per core
    filename = "$jobname.sh"
    open(filename, "w") do io
        println(io, "#!/bin/bash")
        println(io, "#")
        println(io, "#SBATCH --job-name=$jobname")
        println(io, "#")
        if highp
            println(io, "#SBATCH --time=168:00:00")
        else
            println(io, "#SBATCH --time=24:00:00")
        end
        println(io, "#SBATCH --cpus-per-task=$ncores")
        println(io, "#SBATCH --mem-per-cpu=$(mem)G")
        if highp
            println(io, "#SBATCH --partition=candes,zihuai")
        else
            println(io, "#SBATCH --partition=candes,zihuai,normal,owners")
        end
        println(io, "#SBATCH --output=$(joinpath(joblog_dir, "slurm-%j.out"))")
        println(io, "")
        println(io, "#save job info on joblog:")
        println(io, "echo \"Job \$JOB_ID started on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID started on:   \" `date `")
        println(io, "")
        println(io, "# load the job environment:")
#         println(io, "module load julia/1.9")
#         println(io, "module load biology plink/1.90b5.3")
#         println(io, "module load R/4.0.2")
#         println(io, "export OPENBLAS_NUM_THREADS=1")
#         println(io, "export JULIA_DEPOT_PATH=\"/home/groups/sabatti/.julia\"")
        println(io, "")
        for command in commands
            println(io, "echo \"$command\"")
            println(io, "$command")
        end
        println(io, "")
        println(io, "#echo job info on joblog:")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `date `")
        println(io, "#echo \" \"")
    end
    # submit job and capture job ID
    io = IOBuffer()
    if length(waitfor) != 0
        run(pipeline(`sbatch --dependency=afterok:$(join(waitfor, ':')) $filename`; stdout=io))
    else
        run(pipeline(`sbatch $filename`; stdout=io))
    end
    msg = String(take!(io))
    verbose && print(stdout, msg)
    jobid = parse(Int, strip(msg)[21:end])
    # clean up and return job ID
    close(io)
    rm(filename, force=true)
    return jobid
end


"Run a Cmd object, returning the stdout & stderr contents plus the exit code"
function execute(cmd::Cmd)
    out = Pipe()
    err = Pipe()

    process = run(pipeline(ignorestatus(cmd), stdout=out, stderr=err))
    close(out.in)
    close(err.in)

    return (
        stdout = String(read(out)), 
        stderr = String(read(err)),  
        code = process.exitcode
    )
end

function get_job_names()
    data_str, _, _ = execute(`squeue -u bbchu -h -o "%.30j"`)
    lines = split(data_str, "\n")
    jobnames = String[]
    for line in lines
        push!(jobnames, strip(line))
    end
    return jobnames
end

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mPrecompiling CSV [336ed68f-0bac-5ca0-87d4-7b16caf5d00b]


get_job_names (generic function with 1 method)

## Download software

```shell
cd /u/home/b/biona001/project-loes/ghostknockoff
wget https://github.com/biona001/GhostKnockoffGWAS/releases/download/v0.2.2/app_linux_x86.tar.gz
tar -xvzf app_linux_x86.tar.gz
```

## Prepare covariate file

In [7]:
file = "/scratch/groups/sabatti/ukb_phenotypes/phenotypes.QC.britishonly.csv"
df = CSV.read(file, DataFrame)

# interaction covariates
df[!, "age_sex"] = df[!, "age"] .* df[!, "sex"]
df[!, "age_squared_sex"] = df[!, "age_squared"] .* df[!, "sex"]

# sample ID is FID and IID merged
eid = df[!, "eid"]
df[!, "sampleID"] = string.(eid, "_", eid)

# final covariates file
df = df[!, vcat([:sampleID, :sex, :age, :age_squared, :age_sex, :age_squared_sex], [Symbol("PC$i") for i in 1:10])]
CSV.write("/oak/stanford/groups/zihuai/solveblock/covariates.csv", df)
df

Row,sampleID,sex,age,age_squared,age_sex,age_squared_sex,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
Unnamed: 0_level_1,String,Int64,Int64,Int64,Int64,Int64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,5393090_5393090,1,41,1681,41,1681,-12.1725,5.39163,-1.28103,0.841765,-5.26521,-1.78657,3.10992,-2.63085,2.39288,0.307537
2,1532732_1532732,0,46,2116,0,0,-13.0245,6.41514,-0.183365,2.92761,-5.88964,0.940534,1.14106,-1.98213,-2.70226,2.50775
3,3186275_3186275,0,52,2704,0,0,-11.4712,3.48383,-1.15458,3.08383,7.6516,-0.913399,-1.54879,1.47893,-1.20895,1.0649
4,1277047_1277047,0,65,4225,0,0,-12.1327,4.02976,-0.98808,0.750294,-2.36431,0.431658,-0.534071,-0.654367,-6.59593,-1.53356
5,5282298_5282298,0,56,3136,0,0,-12.2171,3.50821,-1.62599,-1.2268,-5.3458,3.81679,0.579155,-1.16958,1.01855,-0.795227
6,1782425_1782425,0,41,1681,0,0,-11.2134,3.8501,-2.73488,0.286941,1.47179,1.1056,2.08845,0.930548,-15.8034,-0.36375
7,3359165_3359165,1,53,2809,53,2809,-12.8808,3.36515,0.233168,-3.95787,1.98784,-0.909523,-2.50145,-1.04002,-8.80647,-1.624
8,5262420_5262420,0,50,2500,0,0,-13.0892,4.15636,-1.62439,4.15643,-1.10979,-0.695489,-3.1308,-1.51867,4.14309,0.4839
9,2168079_2168079,0,55,3025,0,0,-11.9601,4.05693,-1.57496,1.83788,4.11338,-0.291908,-1.32838,0.0791562,2.3278,1.01982
10,4529749_4529749,0,58,3364,0,0,-10.5537,5.14151,-3.22977,1.54827,0.42232,0.192385,-0.9318,2.20073,2.4611,0.828631


In [37]:
exe = "/home/groups/sabatti/.julia/dev/GhostKnockoffGWAS/app_linux_x86/bin/solveblock"
plinkfile = "/oak/stanford/groups/zihuai/solveblock/array/ukb_gen_british.bed"
outdir = "/oak/stanford/groups/zihuai/solveblock/LD_files"
region_file = "/oak/stanford/groups/zihuai/pan_ukb_LD_matrices/LD_block/EUR_hg19/fourier_ls-all.bed"
covfile = "/oak/stanford/groups/zihuai/solveblock/covariates.csv"
hg_build = 19

# quasi independent regions
df = CSV.read(region_file, DataFrame)

failed = 0
running_jobs = get_job_names()
for chr in 1:22
    # regions in current chr
    idx = findall(x -> x == "chr$chr ", df[!, 1])
    start_pos = df[idx, 2]
    end_pos = df[idx, 3] .- 1

    # submit jobs
    for (s, e) in zip(start_pos, end_pos)
        LDfile = joinpath(outdir, "chr$chr", "LD_start$(s)_end$(e).h5")
        summary_file = joinpath(outdir, "chr$chr","summary_start$(s)_end$(e).csv")
        info_file = joinpath(outdir, "chr$chr","Info_start$(s)_end$(e).csv")
        job = "chr$(chr)s$(s)e$e"
        if !(isfile(LDfile) && isfile(summary_file) && isfile(info_file)) && (job ∉ running_jobs)
            rm(LDfile, force=true); rm(summary_file, force=true); rm(info_file, force=true); 
            cmd = "$exe --file $plinkfile --chr $chr --start_bp $s --end_bp $e --outdir $outdir --genome-build $hg_build --covfile $covfile"
            submit(cmd, 1, 24, jobname=job, highp=false)
            failed += 1
#             failed > 5 && fdsa
        end
    end
end
failed

Submitted batch job 55139193
Submitted batch job 55139195
Submitted batch job 55139197
Submitted batch job 55139198
Submitted batch job 55139199
Submitted batch job 55139200
Submitted batch job 55139201
Submitted batch job 55139202
Submitted batch job 55139203
Submitted batch job 55139204
Submitted batch job 55139205
Submitted batch job 55139206
Submitted batch job 55139207
Submitted batch job 55139208
Submitted batch job 55139209
Submitted batch job 55139210
Submitted batch job 55139211
Submitted batch job 55139212
Submitted batch job 55139213
Submitted batch job 55139214
Submitted batch job 55139215
Submitted batch job 55139216
Submitted batch job 55139217
Submitted batch job 55139218
Submitted batch job 55139219
Submitted batch job 55139220
Submitted batch job 55139221
Submitted batch job 55139223
Submitted batch job 55139224
Submitted batch job 55139225
Submitted batch job 55139226
Submitted batch job 55139227
Submitted batch job 55139228
Submitted batch job 55139229
Submitted batc

199

## Run `solveblock` on `snp_ldsplit` output

In [2]:
obj = "default" # LD split objective

exe = "/home/groups/sabatti/.julia/dev/GhostKnockoffGWAS/app_linux_x86/bin/solveblock"
plinkfile = "/oak/stanford/groups/zihuai/solveblock/array/ukb_gen_british.bed"
outdir = "/oak/stanford/groups/zihuai/solveblock/LD_files_ld_split/$obj"
region_dir = "/oak/stanford/groups/zihuai/solveblock/LD_split/$obj"
covfile = "/oak/stanford/groups/zihuai/solveblock/covariates.csv"
hg_build = 19

failed = 0
running_jobs = get_job_names()
for chr in 1:22
    ##### get quasi-independent blocks and handle SNPs falling between windows
    region_file = joinpath(region_dir, "chr$chr.maf0.01.thr0.01.maxr0.3.bed")
    df = CSV.read(region_file, DataFrame)
    start_pos = df[!, "start"]
    end_pos = df[!, "stop"]
    mid_point = [floor((start_pos[i] - end_pos[i-1]) / 2) for i in 2:length(start_pos)]
    end_pos[1:end-1] .+= mid_point
    start_pos[2:end] .-= mid_point
    for i in 2:length(start_pos)
        if start_pos[i] == end_pos[i-1]
            start_pos[i] += 1
        end
    end

    # submit jobs
    for (s, e) in zip(start_pos, end_pos)
        LDfile = joinpath(outdir, "chr$chr", "LD_start$(s)_end$(e).h5")
        summary_file = joinpath(outdir, "chr$chr","summary_start$(s)_end$(e).csv")
        info_file = joinpath(outdir, "chr$chr","Info_start$(s)_end$(e).csv")
        job = "chr$(chr)s$(s)e$e"
        if !(isfile(LDfile) && isfile(summary_file) && isfile(info_file)) && (job ∉ running_jobs)
            rm(LDfile, force=true); rm(summary_file, force=true); rm(info_file, force=true); 
            cmd = "$exe --file $plinkfile --chr $chr --start_bp $s --end_bp $e --outdir $outdir --genome-build $hg_build --covfile $covfile"
            submit(cmd, 1, 128, jobname=job, highp=false)
            failed += 1
        end
    end
end
failed

0

## Liftover from HG19 to HG38 (note: we did not use this in our paper)

This code is kept here for readers interested in how to liftOver coordinates between HG19 and HG38.

```
# on Sherlock: loads julia, R, and the other modules that make liftOver work with R/4.0.2
module restore liftover 
```

In [None]:
using HDF5
using RCall
using CSV, DataFrames
R"library(liftOver)"

# use liftOver R package to add hg38 coordinates to Sigma_info
# SNPs that can't be converted, or if chr does not match after conversion, are deleted
function augment_hg38(Sigma_info, Sigma, chr, liftOver_chain_dir)
    pos = Sigma_info[!, "pos"]
    @rput chr pos liftOver_chain_dir
    R"""
    df<-cbind(data.frame(paste0('chr',chr)),pos,pos)
    colnames(df)<-c('chr','start','end')
    temp.Granges<-makeGRangesFromDataFrame(df)
    chain <- import.chain(liftOver_chain_dir)
    converted<-data.frame(liftOver(temp.Granges,chain))
    """
    @rget converted
    success_idx, pos_hg38 = falses(size(Sigma, 1)), Int[]
    for row in eachrow(converted)
        row["seqnames"] == "chr$chr" || continue # check chr match
        success_idx[row["group"]] = true
        push!(pos_hg38, row["start"])
    end
    Sigma_info_new = Sigma_info[success_idx, :]
    Sigma_new = Sigma[success_idx, success_idx]
    Sigma_info_new[!, :pos_hg19] = pos[success_idx]
    Sigma_info_new[!, :pos_hg38] = pos_hg38
    select!(Sigma_info_new, Not(:pos))
    return Sigma_new, Sigma_info_new
end

function augment_hg38(h5file, info_file)
    h5reader = h5open(h5file)
    Sigma_info = CSV.read(info_file, DataFrame)
end

In [None]:
liftOver_chain_dir = "/oak/stanford/groups/zihuai/GeneticsResources/LiftOver/hg19ToHg38.over.chain"
LD_files = "/oak/stanford/groups/zihuai/solveblock/LD_files_african/default"
for chr in 1:22
    for file in readdir(joinpath(LD_files, "chr$chr"))
        startswith(file, "Info") || continue
        info_file = joinpath(LD_files, "chr$chr", file)
        h5file = joinpath(LD_files, "chr$chr", "LD" * file[5:end-4] * ".h5")
    end
end