# Computing quasi-independent blocks

+ Reference: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8696101/
+ R software: https://privefl.github.io/bigsnpr/reference/snp_ldsplit.html
+ Practical choice of tuning parameters: https://github.com/privefl/paper-misspec/blob/main/code/prepare-corr-1000G-EUR.R

```shell
module load R/4.0.2
```

In [1]:
using CSV
using DataFrames

# helper function to submit 1 job to run 1 command
function submit(command::String, ncores::Int, total_mem::Number, 
        joblog_dir::String="/oak/stanford/groups/zihuai/solveblock/joblogs"; 
        jobname="submit", waitfor=Int[], verbose=true)
    mem = round(Int, total_mem / ncores) # memory per core
    filename = "$jobname.sh"
    open(filename, "w") do io
        println(io, "#!/bin/bash")
        println(io, "#")
        println(io, "#SBATCH --job-name=$jobname")
        println(io, "#")
        println(io, "#SBATCH --time=24:00:00")
        println(io, "#SBATCH --cpus-per-task=$ncores")
        println(io, "#SBATCH --mem-per-cpu=$(mem)G")
        println(io, "#SBATCH --partition=candes,zihuai,normal,owners")
        println(io, "#SBATCH --output=$(joinpath(joblog_dir, "slurm-%j.out"))")
        println(io, "")
        println(io, "#save job info on joblog:")
        println(io, "echo \"Job \$JOB_ID started on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID started on:   \" `date `")
        println(io, "")
        println(io, "# load the job environment:")
        println(io, "module load julia/1.10")
        println(io, "module load biology plink/1.90b5.3")
        println(io, "module load R/4.0.2")
        println(io, "export OPENBLAS_NUM_THREADS=1")
        println(io, "export JULIA_DEPOT_PATH=\"/home/groups/sabatti/.julia\"")
        println(io, "")
        println(io, "# run code")
        println(io, "echo \"$command\"")
        println(io, "$command")
        println(io, "")
        println(io, "#echo job info on joblog:")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `date `")
        println(io, "#echo \" \"")
    end
    # submit job and capture job ID
    io = IOBuffer()
    if length(waitfor) != 0
        run(pipeline(`sbatch --dependency=afterok:$(join(waitfor, ':')) $filename`; stdout=io))
    else
        run(pipeline(`sbatch $filename`; stdout=io))
    end
    msg = String(take!(io))
    verbose && print(stdout, msg)
    jobid = parse(Int, strip(msg)[21:end])
    # clean up and return job ID
    close(io)
    rm(filename, force=true)
    return jobid
end

# helper function to submit 1 job to run multiple commands
function submit(commands::Vector{String}, ncores::Int, total_mem::Number, 
        joblog_dir::String="/oak/stanford/groups/zihuai/solveblock/joblogs"; 
        jobname="submit", waitfor=Int[], verbose=true)
    mem = round(Int, total_mem / ncores) # memory per core
    filename = "$jobname.sh"
    open(filename, "w") do io
        println(io, "#!/bin/bash")
        println(io, "#")
        println(io, "#SBATCH --job-name=$jobname")
        println(io, "#")
        println(io, "#SBATCH --time=24:00:00")
        println(io, "#SBATCH --cpus-per-task=$ncores")
        println(io, "#SBATCH --mem-per-cpu=$(mem)G")
        println(io, "#SBATCH --partition=candes,zihuai,normal,owners")
        println(io, "#SBATCH --output=$(joinpath(joblog_dir, "slurm-%j.out"))")
        println(io, "")
        println(io, "#save job info on joblog:")
        println(io, "echo \"Job \$JOB_ID started on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID started on:   \" `date `")
        println(io, "")
        println(io, "# load the job environment:")
        println(io, "module load julia/1.10")
        println(io, "module load biology plink/1.90b5.3")
        println(io, "module load R/4.0.2")
        println(io, "export OPENBLAS_NUM_THREADS=1")
        println(io, "export JULIA_DEPOT_PATH=\"/home/groups/sabatti/.julia\"")
        println(io, "")
        for command in commands
            println(io, "echo \"$command\"")
            println(io, "$command")
        end
        println(io, "")
        println(io, "#echo job info on joblog:")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `date `")
        println(io, "#echo \" \"")
    end
    # submit job and capture job ID
    io = IOBuffer()
    if length(waitfor) != 0
        run(pipeline(`sbatch --dependency=afterok:$(join(waitfor, ':')) $filename`; stdout=io))
    else
        run(pipeline(`sbatch $filename`; stdout=io))
    end
    msg = String(take!(io))
    verbose && print(stdout, msg)
    jobid = parse(Int, strip(msg)[21:end])
    # clean up and return job ID
    close(io)
    rm(filename, force=true)
    return jobid
end

"Run a Cmd object, returning the stdout & stderr contents plus the exit code"
function execute(cmd::Cmd)
    out = Pipe()
    err = Pipe()

    process = run(pipeline(ignorestatus(cmd), stdout=out, stderr=err))
    close(out.in)
    close(err.in)

    return (
        stdout = String(read(out)), 
        stderr = String(read(err)),  
        code = process.exitcode
    )
end

function get_job_names()
    data_str, _, _ = execute(`squeue -u bbchu -h -o "%.30j"`)
    lines = split(data_str, "\n")
    jobnames = String[]
    for line in lines
        push!(jobnames, strip(line))
    end
    return jobnames
end

get_job_names (generic function with 1 method)

# Compute quasi-independent blocks

+ Reference: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8696101/
+ R software: https://privefl.github.io/bigsnpr/reference/snp_ldsplit.html
+ Practical choice of tuning parameters: https://github.com/privefl/paper-misspec/blob/main/code/prepare-corr-1000G-EUR.R

```shell
ml R/4.0.2
export OPENBLAS_NUM_THREADS=1
```

## Runtime script

In [None]:
# put in /oak/stanford/groups/zihuai/solveblock/ld_split.R

library("bigsnpr")
library("dplyr")
args = commandArgs(TRUE)
chr = as.numeric(args[1])
plinkfile = args[2]
fbmfile = args[3]
outfile = args[4]
thr_r2 = as.numeric(args[5])
max_r2 = as.numeric(args[6]) 
snp_ldsplit_obj = args[7] # 'default', 'min_error', or 'max_num_blocks'

# testing
# chr = 22
# plinkfile = "/oak/stanford/groups/zihuai/solveblock/array/ukb_gen_british_maf0.01_chr22.bed"
# fbmfile = "/oak/stanford/groups/zihuai/solveblock/array/FBM/ukb_gen_british_maf0.01_chr22"
# outfile = "/oak/stanford/groups/zihuai/solveblock/LD_split/default/test/chr6.bed"
# thr_r2 = 0.01
# max_r2 = 0.3

# import PLINK data as FBM (file backed matrix) format
rdsfile <- paste0(fbmfile, ".rds")
if (!file.exists(rdsfile)){snp_readBed2(plinkfile, backingfile = fbmfile)} 
x <- snp_attach(rdsfile)

# estimate correlation matrix
corr <- snp_cor(x$genotypes, infos.pos=x$map$physical.pos, ncores=1)

# compute LD regions
m <- ncol(corr)
max_sizes <- c(1000, 1500, 3000, 6000, 10000)
max_sizes <- max_sizes[max_sizes <= dim(corr)[1]]
splits <- snp_ldsplit(corr, thr_r2 = thr_r2, min_size = 500, max_size = max_sizes, max_r2 = max_r2)

if (snp_ldsplit_obj == 'default'){
    # balances block size with sum of squared correlations outside the blocks
    # Note: this is default objective from snp_ldsplit
    splits$cost2 <- sapply(splits$all_size, function(sizes) sum(sizes^2))
    best_split <- splits %>%
        arrange(cost2 * sqrt(5 + cost)) %>%
        print() %>%
        slice(1) %>%
        print()
    all_size <- best_split$all_size[[1]]
    best_grp <- rep(seq_along(all_size), all_size)
} else if (snp_ldsplit_obj == 'min_error') {
    # minimizes sum of squared correlations outside the blocks
    best_split <- splits %>%
        arrange(cost) %>%
        print() %>%
        slice(1) %>%
        print()
    all_size <- best_split$all_size[[1]]
    best_grp <- rep(seq_along(all_size), all_size)
} else if (snp_ldsplit_obj == 'max_num_blocks') {
    # find LD splits with most blocks
    most_split <- splits %>%
        arrange(desc(n_block)) %>%
        print() %>%
        slice(1) %>%
        print()
    all_size <- most_split$all_size[[1]]
    best_grp <- rep(seq_along(all_size), all_size)
} else {
    stop("snp_ldsplit_obj should be default, min_error, or max_num_blocks")
}

# get position of LD split
unique_grp <- unique(best_grp)
start_pos <- integer(length(unique_grp))
end_pos <- integer(length(unique_grp))
for (i in seq_along(unique_grp)) {
  start_pos[i] <- min(which(best_grp == unique_grp[i]))
  end_pos[i] <- max(which(best_grp == unique_grp[i]))
}

# save result
pos <- x$map$physical.pos
result <- data.frame(
    chr = rep(chr, length(start_pos)),
    start = pos[start_pos], 
    stop = pos[end_pos]
)
write.table(result, outfile, row.names = FALSE, quote=FALSE, sep="\t")

## Run `snp_ldsplit` for all chrom

In [5]:
exe = "/oak/stanford/groups/zihuai/solveblock/ld_split.R"
maf = 0.01
thr_r2 = 0.01
max_r2 = 0.3
obj = "default"
for pop in ["indian", "chinese", "caribbean", "african"]
    outdir = "/oak/stanford/groups/zihuai/solveblock/LD_files_$pop/LD_blocks/$obj"
    mkpath(outdir)
    for chr in 1:22
        plinkfile = "/scratch/groups/sabatti/ukb_genotypes/$pop/chr$chr.bed"
        fbmfile = "/scratch/groups/sabatti/ukb_genotypes/$pop/FBM/chr$chr"
        outfile = joinpath(outdir, "chr$chr.maf$maf.thr$(thr_r2).maxr$(max_r2).bed")
        cmd = "Rscript --vanilla $exe $chr $plinkfile $fbmfile $outfile $thr_r2 $max_r2 $obj"
        submit(cmd, 1, 24, jobname="ld_chr$chr")
    end
end

Submitted batch job 61563213
Submitted batch job 61563214
Submitted batch job 61563215
Submitted batch job 61563219
Submitted batch job 61563222
Submitted batch job 61563224
Submitted batch job 61563225
Submitted batch job 61563226
Submitted batch job 61563227
Submitted batch job 61563229
Submitted batch job 61563231
Submitted batch job 61563234
Submitted batch job 61563235
Submitted batch job 61563236
Submitted batch job 61563237
Submitted batch job 61563238
Submitted batch job 61563239
Submitted batch job 61563240
Submitted batch job 61563242
Submitted batch job 61563245
Submitted batch job 61563248
Submitted batch job 61563249
Submitted batch job 61563250
Submitted batch job 61563251
Submitted batch job 61563252
Submitted batch job 61563254
Submitted batch job 61563256
Submitted batch job 61563259
Submitted batch job 61563260
Submitted batch job 61563261
Submitted batch job 61563262
Submitted batch job 61563263
Submitted batch job 61563264
Submitted batch job 61563265
Submitted batc

Finally, check number of SNPs in quasi-independent blocks

### `objective=default`

In [4]:
chr = 1
obj = "default"
splits = CSV.read("/oak/stanford/groups/zihuai/solveblock/LD_files_indians/LD_blocks/$obj/chr$chr.maf0.01.thr0.01.maxr0.3.bed", DataFrame)
bimfile = CSV.read("/scratch/groups/sabatti/ukb_genotypes/indians/chr$chr.bim", DataFrame, header=false)
nsnps = zeros(size(splits, 1))
for (i, (chr, start, stop)) in enumerate(eachrow(splits))
    nsnps[i] = count(x -> start ≤ x ≤ stop, bimfile[!, 4])
end
splits[!, "nsnps"] = nsnps

for chr in 2:22
    splits2 = CSV.read("/oak/stanford/groups/zihuai/solveblock/LD_files_indians/LD_blocks/$obj/chr$chr.maf0.01.thr0.01.maxr0.3.bed", DataFrame)
    bimfile = CSV.read("/scratch/groups/sabatti/ukb_genotypes/indians/chr$chr.bim", DataFrame, header=false)
    nsnps = zeros(size(splits2, 1))
    for (i, (chr, start, stop)) in enumerate(eachrow(splits2))
        nsnps[i] = count(x -> start ≤ x ≤ stop, bimfile[!, 4])
    end
    splits2[!, "nsnps"] = nsnps
    append!(splits, splits2)
end

sort(splits, [:nsnps], rev=true)

Row,chr,start,stop,nsnps
Unnamed: 0_level_1,Int64,Int64,Int64,Float64
1,6,23597831,33956165,8128.0
2,6,110283679,125367101,2025.0
3,6,73177863,88813412,1788.0
4,6,139563937,151128492,1764.0
5,6,153719985,162318396,1644.0
6,3,116839982,126801736,1461.0
7,7,96581811,106052692,1456.0
8,2,132722193,141804323,1452.0
9,3,154801978,168342251,1449.0
10,3,93529128,106335082,1430.0


## Count number of blocks

In [2]:
for pop in ["british", "indian", "chinese", "caribbean", "african"]
    blocks = 0
    for chr in 1:22
        file = "/oak/stanford/groups/zihuai/solveblock/LD_files_$pop/LD_blocks/default/chr$chr.maf0.01.thr0.01.maxr0.3.bed"
        blocks += countlines(file) - 1
    end
    println("$pop has $blocks blocks")
end

british has 636 blocks
indian has 615 blocks
chinese has 489 blocks
caribbean has 513 blocks
african has 505 blocks
