# Computing quasi-independent blocks

+ Reference: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8696101/
+ R software: https://privefl.github.io/bigsnpr/reference/snp_ldsplit.html
+ Practical choice of tuning parameters: https://github.com/privefl/paper-misspec/blob/main/code/prepare-corr-1000G-EUR.R

```shell
module load R/4.0.2
```

In [1]:
using CSV
using DataFrames

# helper function to submit 1 job to run 1 command
function submit(command::String, ncores::Int, total_mem::Number, 
        joblog_dir::String="/oak/stanford/groups/zihuai/solveblock/joblogs"; 
        jobname="submit", waitfor=Int[], verbose=true)
    mem = round(Int, total_mem / ncores) # memory per core
    filename = "$jobname.sh"
    open(filename, "w") do io
        println(io, "#!/bin/bash")
        println(io, "#")
        println(io, "#SBATCH --job-name=$jobname")
        println(io, "#")
        println(io, "#SBATCH --time=24:00:00")
        println(io, "#SBATCH --cpus-per-task=$ncores")
        println(io, "#SBATCH --mem-per-cpu=$(mem)G")
        println(io, "#SBATCH --partition=candes,zihuai,normal,owners")
        println(io, "#SBATCH --output=$(joinpath(joblog_dir, "slurm-%j.out"))")
        println(io, "")
        println(io, "#save job info on joblog:")
        println(io, "echo \"Job \$JOB_ID started on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID started on:   \" `date `")
        println(io, "")
        println(io, "# load the job environment:")
        println(io, "module load julia/1.10")
        println(io, "module load biology plink/1.90b5.3")
        println(io, "module load R/4.0.2")
        println(io, "export OPENBLAS_NUM_THREADS=1")
        println(io, "export JULIA_DEPOT_PATH=\"/home/groups/sabatti/.julia\"")
        println(io, "")
        println(io, "# run code")
        println(io, "echo \"$command\"")
        println(io, "$command")
        println(io, "")
        println(io, "#echo job info on joblog:")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `date `")
        println(io, "#echo \" \"")
    end
    # submit job and capture job ID
    io = IOBuffer()
    if length(waitfor) != 0
        run(pipeline(`sbatch --dependency=afterok:$(join(waitfor, ':')) $filename`; stdout=io))
    else
        run(pipeline(`sbatch $filename`; stdout=io))
    end
    msg = String(take!(io))
    verbose && print(stdout, msg)
    jobid = parse(Int, strip(msg)[21:end])
    # clean up and return job ID
    close(io)
    rm(filename, force=true)
    return jobid
end

# helper function to submit 1 job to run multiple commands
function submit(commands::Vector{String}, ncores::Int, total_mem::Number, 
        joblog_dir::String="/oak/stanford/groups/zihuai/solveblock/joblogs"; 
        jobname="submit", waitfor=Int[], verbose=true)
    mem = round(Int, total_mem / ncores) # memory per core
    filename = "$jobname.sh"
    open(filename, "w") do io
        println(io, "#!/bin/bash")
        println(io, "#")
        println(io, "#SBATCH --job-name=$jobname")
        println(io, "#")
        println(io, "#SBATCH --time=24:00:00")
        println(io, "#SBATCH --cpus-per-task=$ncores")
        println(io, "#SBATCH --mem-per-cpu=$(mem)G")
        println(io, "#SBATCH --partition=candes,zihuai,normal,owners")
        println(io, "#SBATCH --output=$(joinpath(joblog_dir, "slurm-%j.out"))")
        println(io, "")
        println(io, "#save job info on joblog:")
        println(io, "echo \"Job \$JOB_ID started on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID started on:   \" `date `")
        println(io, "")
        println(io, "# load the job environment:")
        println(io, "module load julia/1.10")
        println(io, "module load biology plink/1.90b5.3")
        println(io, "module load R/4.0.2")
        println(io, "export OPENBLAS_NUM_THREADS=1")
        println(io, "export JULIA_DEPOT_PATH=\"/home/groups/sabatti/.julia\"")
        println(io, "")
        for command in commands
            println(io, "echo \"$command\"")
            println(io, "$command")
        end
        println(io, "")
        println(io, "#echo job info on joblog:")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `date `")
        println(io, "#echo \" \"")
    end
    # submit job and capture job ID
    io = IOBuffer()
    if length(waitfor) != 0
        run(pipeline(`sbatch --dependency=afterok:$(join(waitfor, ':')) $filename`; stdout=io))
    else
        run(pipeline(`sbatch $filename`; stdout=io))
    end
    msg = String(take!(io))
    verbose && print(stdout, msg)
    jobid = parse(Int, strip(msg)[21:end])
    # clean up and return job ID
    close(io)
    rm(filename, force=true)
    return jobid
end

"Run a Cmd object, returning the stdout & stderr contents plus the exit code"
function execute(cmd::Cmd)
    out = Pipe()
    err = Pipe()

    process = run(pipeline(ignorestatus(cmd), stdout=out, stderr=err))
    close(out.in)
    close(err.in)

    return (
        stdout = String(read(out)), 
        stderr = String(read(err)),  
        code = process.exitcode
    )
end

function get_job_names()
    data_str, _, _ = execute(`squeue -u bbchu -h -o "%.30j"`)
    lines = split(data_str, "\n")
    jobnames = String[]
    for line in lines
        push!(jobnames, strip(line))
    end
    return jobnames
end

get_job_names (generic function with 1 method)

# Compute quasi-independent blocks

+ Reference: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8696101/
+ R software: https://privefl.github.io/bigsnpr/reference/snp_ldsplit.html
+ Practical choice of tuning parameters: https://github.com/privefl/paper-misspec/blob/main/code/prepare-corr-1000G-EUR.R

```shell
ml R/4.0.2
export OPENBLAS_NUM_THREADS=1
```

## Runtime script

In [None]:
# put in /oak/stanford/groups/zihuai/solveblock/ld_split.R

library("bigsnpr")
library("dplyr")
args = commandArgs(TRUE)
chr = as.numeric(args[1])
plinkfile = args[2]
fbmfile = args[3]
outfile = args[4]
thr_r2 = as.numeric(args[5])
max_r2 = as.numeric(args[6]) 
snp_ldsplit_obj = args[7] # 'default', 'min_error', or 'max_num_blocks'

# testing
# chr = 22
# plinkfile = "/oak/stanford/groups/zihuai/solveblock/array/ukb_gen_british_maf0.01_chr22.bed"
# fbmfile = "/oak/stanford/groups/zihuai/solveblock/array/FBM/ukb_gen_british_maf0.01_chr22"
# outfile = "/oak/stanford/groups/zihuai/solveblock/LD_split/default/test/chr6.bed"
# thr_r2 = 0.01
# max_r2 = 0.3

# import PLINK data as FBM (file backed matrix) format
rdsfile <- paste0(fbmfile, ".rds")
if (!file.exists(rdsfile)){snp_readBed2(plinkfile, backingfile = fbmfile)} 
x <- snp_attach(rdsfile)

# estimate correlation matrix
corr <- snp_cor(x$genotypes, infos.pos=x$map$physical.pos, ncores=1)

# compute LD regions
m <- ncol(corr)
max_sizes <- c(1000, 1500, 3000, 6000, 10000)
max_sizes <- max_sizes[max_sizes <= dim(corr)[1]]
splits <- snp_ldsplit(corr, thr_r2 = thr_r2, min_size = 500, max_size = max_sizes, max_r2 = max_r2)

if (snp_ldsplit_obj == 'default'){
    # balances block size with sum of squared correlations outside the blocks
    # Note: this is default objective from snp_ldsplit
    splits$cost2 <- sapply(splits$all_size, function(sizes) sum(sizes^2))
    best_split <- splits %>%
        arrange(cost2 * sqrt(5 + cost)) %>%
        print() %>%
        slice(1) %>%
        print()
    all_size <- best_split$all_size[[1]]
    best_grp <- rep(seq_along(all_size), all_size)
} else if (snp_ldsplit_obj == 'min_error') {
    # minimizes sum of squared correlations outside the blocks
    best_split <- splits %>%
        arrange(cost) %>%
        print() %>%
        slice(1) %>%
        print()
    all_size <- best_split$all_size[[1]]
    best_grp <- rep(seq_along(all_size), all_size)
} else if (snp_ldsplit_obj == 'max_num_blocks') {
    # find LD splits with most blocks
    most_split <- splits %>%
        arrange(desc(n_block)) %>%
        print() %>%
        slice(1) %>%
        print()
    all_size <- most_split$all_size[[1]]
    best_grp <- rep(seq_along(all_size), all_size)
} else {
    stop("snp_ldsplit_obj should be default, min_error, or max_num_blocks")
}

# get position of LD split
unique_grp <- unique(best_grp)
start_pos <- integer(length(unique_grp))
end_pos <- integer(length(unique_grp))
for (i in seq_along(unique_grp)) {
  start_pos[i] <- min(which(best_grp == unique_grp[i]))
  end_pos[i] <- max(which(best_grp == unique_grp[i]))
}

# save result
pos <- x$map$physical.pos
result <- data.frame(
    chr = rep(chr, length(start_pos)),
    start = pos[start_pos], 
    stop = pos[end_pos]
)
write.table(result, outfile, row.names = FALSE, quote=FALSE, sep="\t")

## Tune `snp_ldsplit` parameters

First, try various hyperparameters to see which ones can successfully split the LD matrix into parts. Note that `LDetect` splitted chr22 into 24 regions (on array data). So we would like a combination that splits chr22 into similarly or more parts. 

In [7]:
exe = "/oak/stanford/groups/zihuai/solveblock/ld_split.R"
chr = 22
dir = "/oak/stanford/groups/zihuai/solveblock"
maf = 0.01
for thr_r2 in [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.5]
    for max_r2 in [0.3, 0.5, 0.75]
        for obj in ["default", "min_error", "max_num_blocks"]
            outdir = joinpath(dir, "LD_split", obj, "test")
            isdir(outdir) || mkpath(outdir)

            plinkfile = joinpath(dir, "array/ukb_gen_british_maf$(maf)_chr$chr.bed")
            fbmfile = joinpath(dir, "array/FBM/ukb_gen_british_maf$(maf)_chr$chr")
            outfile = joinpath(outdir, "chr$chr.maf$maf.thr$(thr_r2).maxr$(max_r2).bed")
            cmd = "Rscript --vanilla $exe $chr $plinkfile $fbmfile $outfile $thr_r2 $max_r2 $obj"
            submit(cmd, 1, 16, jobname="chr$chr")
        end
    end
end

Submitted batch job 56349337
Submitted batch job 56349339
Submitted batch job 56349341
Submitted batch job 56349343
Submitted batch job 56349344
Submitted batch job 56349345
Submitted batch job 56349346
Submitted batch job 56349348
Submitted batch job 56349350
Submitted batch job 56349352
Submitted batch job 56349354
Submitted batch job 56349355
Submitted batch job 56349356
Submitted batch job 56349358
Submitted batch job 56349360
Submitted batch job 56349362
Submitted batch job 56349364
Submitted batch job 56349365
Submitted batch job 56349366
Submitted batch job 56349367
Submitted batch job 56349368
Submitted batch job 56349370
Submitted batch job 56349372
Submitted batch job 56349374
Submitted batch job 56349376
Submitted batch job 56349377
Submitted batch job 56349378
Submitted batch job 56349379
Submitted batch job 56349380
Submitted batch job 56349381
Submitted batch job 56349384
Submitted batch job 56349386
Submitted batch job 56349388
Submitted batch job 56349390
Submitted batc

In [10]:
for obj in ["default", "min_error", "max_num_blocks"]
    println("obj $obj:")
    outdir = joinpath(dir, "LD_split", obj, "test")
    for file in readdir(outdir)
        f = joinpath(outdir, file)
        println("\t $file = $(countlines(f)) regions")
    end
    println("")
end

obj default:
	 chr22.maf0.01.thr0.01.maxr0.3.bed = 29 regions
	 chr22.maf0.01.thr0.01.maxr0.5.bed = 29 regions
	 chr22.maf0.01.thr0.01.maxr0.75.bed = 29 regions
	 chr22.maf0.01.thr0.05.maxr0.3.bed = 66 regions
	 chr22.maf0.01.thr0.05.maxr0.5.bed = 67 regions
	 chr22.maf0.01.thr0.05.maxr0.75.bed = 67 regions
	 chr22.maf0.01.thr0.1.maxr0.3.bed = 90 regions
	 chr22.maf0.01.thr0.1.maxr0.5.bed = 65 regions
	 chr22.maf0.01.thr0.1.maxr0.75.bed = 65 regions
	 chr22.maf0.01.thr0.15.maxr0.3.bed = 90 regions
	 chr22.maf0.01.thr0.15.maxr0.5.bed = 76 regions
	 chr22.maf0.01.thr0.15.maxr0.75.bed = 76 regions
	 chr22.maf0.01.thr0.2.maxr0.3.bed = 89 regions
	 chr22.maf0.01.thr0.2.maxr0.5.bed = 89 regions
	 chr22.maf0.01.thr0.2.maxr0.75.bed = 89 regions
	 chr22.maf0.01.thr0.25.maxr0.3.bed = 93 regions
	 chr22.maf0.01.thr0.25.maxr0.5.bed = 93 regions
	 chr22.maf0.01.thr0.25.maxr0.75.bed = 93 regions
	 chr22.maf0.01.thr0.5.maxr0.3.bed = 136 regions
	 chr22.maf0.01.thr0.5.maxr0.5.bed = 136 regions
	 chr22

## Run `snp_ldsplit` for all chrom

In [6]:
exe = "/oak/stanford/groups/zihuai/solveblock/ld_split.R"
dir = "/oak/stanford/groups/zihuai/solveblock"
maf = 0.01
thr_r2 = 0.01
max_r2 = 0.3
obj = "default"
outdir = joinpath(dir, "LD_split", obj)
for chr in 1:22
    plinkfile = joinpath(dir, "array/ukb_gen_british_maf$(maf)_chr$chr.bed")
    fbmfile = joinpath(dir, "array/FBM/ukb_gen_british_maf$(maf)_chr$chr")
    outfile = joinpath(outdir, "chr$chr.maf$maf.thr$(thr_r2).maxr$(max_r2).bed")
    cmd = "Rscript --vanilla $exe $chr $plinkfile $fbmfile $outfile $thr_r2 $max_r2 $obj"
    submit(cmd, 1, 24, jobname="ld_chr$chr")
end

Submitted batch job 58211642
Submitted batch job 58211643
Submitted batch job 58211644
Submitted batch job 58211648
Submitted batch job 58211649
Submitted batch job 58211650
Submitted batch job 58211651
Submitted batch job 58211652
Submitted batch job 58211653
Submitted batch job 58211654
Submitted batch job 58211655
Submitted batch job 58211656
Submitted batch job 58211657
Submitted batch job 58211658
Submitted batch job 58211660
Submitted batch job 58211661
Submitted batch job 58211663
Submitted batch job 58211664
Submitted batch job 58211666
Submitted batch job 58211667
Submitted batch job 58211668
Submitted batch job 58211669


Finally, check number of SNPs in quasi-independent blocks

### `objective=default`

In [7]:
chr = 1
obj = "default"
splits = CSV.read("/oak/stanford/groups/zihuai/solveblock/LD_split/$obj/chr$chr.maf0.01.thr0.01.maxr0.3.bed", DataFrame)
bimfile = CSV.read("/oak/stanford/groups/zihuai/solveblock/array/ukb_gen_british_maf0.01_chr$chr.bim", DataFrame, header=false)
nsnps = zeros(size(splits, 1))
for (i, (chr, start, stop)) in enumerate(eachrow(splits))
    nsnps[i] = count(x -> start ≤ x ≤ stop, bimfile[!, 4])
end
splits[!, "nsnps"] = nsnps

for chr in 2:22
    splits2 = CSV.read("/oak/stanford/groups/zihuai/solveblock/LD_split/$obj/chr$chr.maf0.01.thr0.01.maxr0.3.bed", DataFrame)
    bimfile = CSV.read("/oak/stanford/groups/zihuai/solveblock/array/ukb_gen_british_maf0.01_chr$chr.bim", DataFrame, header=false)
    nsnps = zeros(size(splits2, 1))
    for (i, (chr, start, stop)) in enumerate(eachrow(splits2))
        nsnps[i] = count(x -> start ≤ x ≤ stop, bimfile[!, 4])
    end
    splits2[!, "nsnps"] = nsnps
    append!(splits, splits2)
end

sort(splits, [:nsnps], rev=true)

Row,chr,start,stop,nsnps
Unnamed: 0_level_1,Int64,Int64,Int64,Float64
1,6,22056131,40408156,9847.0
2,6,106316464,139560140,5765.0
3,6,61934516,91842836,4478.0
4,6,40411700,58739368,3376.0
5,17,34816186,48123424,2442.0
6,6,91843655,106315792,2252.0
7,12,76523263,92066467,2242.0
8,13,46750002,61266575,2204.0
9,12,51776494,63237235,2111.0
10,12,37858073,51775276,2090.0


### `objective=min_error`

In [3]:
chr = 1
obj = "min_error"
splits = CSV.read("/oak/stanford/groups/zihuai/solveblock/LD_split/$obj/chr$chr.maf0.01.thr0.01.maxr0.3.bed", DataFrame)
bimfile = CSV.read("/oak/stanford/groups/zihuai/solveblock/array/ukb_gen_british_maf0.01_chr$chr.bim", DataFrame, header=false)
nsnps = zeros(size(splits, 1))
for (i, (chr, start, stop)) in enumerate(eachrow(splits))
    nsnps[i] = count(x -> start ≤ x ≤ stop, bimfile[!, 4])
end
splits[!, "nsnps"] = nsnps

for chr in 2:22
    splits2 = CSV.read("/oak/stanford/groups/zihuai/solveblock/LD_split/$obj/chr$chr.maf0.01.thr0.01.maxr0.3.bed", DataFrame)
    bimfile = CSV.read("/oak/stanford/groups/zihuai/solveblock/array/ukb_gen_british_maf0.01_chr$chr.bim", DataFrame, header=false)
    nsnps = zeros(size(splits2, 1))
    for (i, (chr, start, stop)) in enumerate(eachrow(splits2))
        nsnps[i] = count(x -> start ≤ x ≤ stop, bimfile[!, 4])
    end
    splits2[!, "nsnps"] = nsnps
    append!(splits, splits2)
end

sort(splits, [:nsnps], rev=true)

Row,chr,start,stop,nsnps
Unnamed: 0_level_1,Int64,Int64,Int64,Float64
1,6,22056131,40408156,9847.0
2,6,106316464,148920091,7441.0
3,6,61934516,106315792,6730.0
4,17,34816186,66898415,6000.0
5,13,73110576,105639771,5995.0
6,10,51785728,82413423,5990.0
7,2,95358799,133038729,5969.0
8,5,70671482,106801970,5965.0
9,10,14007025,51594462,5964.0
10,2,133062562,170142105,5950.0
