# Run `solveblock` on UKB Indian/African/Chinese/Caribbean

1. Start with UKB array data filtered to the samples of one population at a time (Indian, African, Chinese, or Caribbean), **restricted to SNPs with MAF > 0.01**
2. Use LD blocks defined by `snp_ldsplit`
3. Use covariates to adjust LD matrix following [Pan-UKB documentation](https://pan-dev.ukbb.broadinstitute.org/docs/ld#ld-matrices)
    + sex
    + age
    + age^2
    + age*sex
    + age^2*sex
    + first 10 PCs

4. Run `solveblock` on the given PLINK file, one job per LD block

```shell
ml julia/1.10 R/4.0.2
export OPENBLAS_NUM_THREADS=1
```

In [1]:
using CSV
using DataFrames
using DelimitedFiles

"""
    submit(command::String, ncores, total_mem[, joblog_dir]; jobname, waitfor, verbose, highp)

Submit one SLURM job that runs a single shell `command` and return its job ID.

This is a thin convenience wrapper: it forwards every argument unchanged to the
`Vector{String}` method of `submit` with `[command]`, so the batch-script template
and the job-ID parsing live in exactly one place.

# Arguments
- `command`: shell command line written into the batch script.
- `ncores`: value used for `--cpus-per-task`.
- `total_mem`: total memory in GB, split across `ncores` for `--mem-per-cpu`.
- `joblog_dir`: directory that receives the `slurm-%j.out` log.

# Keywords
- `jobname`: SLURM job name; also the stem of the temporary script file.
- `waitfor`: job IDs that must finish successfully first (`afterok` dependency).
- `verbose`: print the `sbatch` reply to `stdout`.
- `highp`: request the 168-hour limit on the `candes,zihuai` partitions instead of
  the 24-hour limit on `candes,zihuai,normal,owners`.
"""
function submit(command::String, ncores::Int, total_mem::Number, 
        joblog_dir::String="/oak/stanford/groups/zihuai/solveblock/joblogs"; 
        jobname="submit", waitfor=Int[], verbose=true, highp=false)
    return submit([command], ncores, total_mem, joblog_dir;
        jobname=jobname, waitfor=waitfor, verbose=verbose, highp=highp)
end

"""
    submit(commands::Vector{String}, ncores, total_mem[, joblog_dir]; jobname, waitfor, verbose, highp)

Submit one SLURM job that runs every entry of `commands` in order, and return the
job ID reported by `sbatch` as an `Int`.

A batch script named `<jobname>.sh` is written to the current directory, handed to
`sbatch`, and removed again, including when `sbatch` fails.

# Arguments
- `commands`: shell command lines; each one is echoed to the job log and then run.
- `ncores`: value used for `--cpus-per-task`; must be at least 1.
- `total_mem`: total memory in GB. It is divided by `ncores` and rounded *up* to a
  whole number of GB (minimum 1) for `--mem-per-cpu`, so the job never receives
  less than `total_mem` in total.
- `joblog_dir`: directory that receives the `slurm-%j.out` log.

# Keywords
- `jobname`: SLURM job name; also the stem of the temporary script file.
- `waitfor`: job IDs that must finish successfully first (`afterok` dependency).
- `verbose`: print the `sbatch` reply to `stdout`.
- `highp`: request the 168-hour limit on the `candes,zihuai` partitions instead of
  the 24-hour limit on `candes,zihuai,normal,owners`.

# Throws
- `ArgumentError` if `ncores < 1`.
- `ProcessFailedException` if `sbatch` exits with a non-zero status.
- `ErrorException` if no job ID can be found in the `sbatch` reply.
"""
function submit(commands::Vector{String}, ncores::Int, total_mem::Number, 
        joblog_dir::String="/oak/stanford/groups/zihuai/solveblock/joblogs"; 
        jobname="submit", waitfor=Int[], verbose=true, highp=false)
    ncores >= 1 || throw(ArgumentError("ncores must be at least 1, got $ncores"))
    # memory per core, rounded up so that mem * ncores >= total_mem and never 0G
    mem = max(1, ceil(Int, total_mem / ncores))
    walltime = highp ? "168:00:00" : "24:00:00"
    partitions = highp ? "candes,zihuai" : "candes,zihuai,normal,owners"
    filename = "$jobname.sh"
    try
        open(filename, "w") do io
            println(io, "#!/bin/bash")
            println(io, "#")
            println(io, "#SBATCH --job-name=$jobname")
            println(io, "#")
            println(io, "#SBATCH --time=$walltime")
            println(io, "#SBATCH --cpus-per-task=$ncores")
            println(io, "#SBATCH --mem-per-cpu=$(mem)G")
            println(io, "#SBATCH --partition=$partitions")
            println(io, "#SBATCH --output=$(joinpath(joblog_dir, "slurm-%j.out"))")
            println(io, "")
            println(io, "#save job info on joblog:")
            # SLURM exports the job ID as SLURM_JOB_ID (JOB_ID is the SGE name)
            println(io, "echo \"Job \$SLURM_JOB_ID started on:   \" `hostname -s`")
            println(io, "echo \"Job \$SLURM_JOB_ID started on:   \" `date `")
            println(io, "")
            println(io, "# load the job environment:")
            # Environment setup is currently disabled; kept for reference.
#             println(io, "module load julia/1.9")
#             println(io, "module load biology plink/1.90b5.3")
#             println(io, "module load R/4.0.2")
#             println(io, "export OPENBLAS_NUM_THREADS=1")
#             println(io, "export JULIA_DEPOT_PATH=\"/home/groups/sabatti/.julia\"")
            println(io, "")
            for command in commands
                println(io, "echo \"$command\"")
                println(io, "$command")
            end
            println(io, "")
            println(io, "#echo job info on joblog:")
            println(io, "echo \"Job \$SLURM_JOB_ID ended on:   \" `hostname -s`")
            println(io, "echo \"Job \$SLURM_JOB_ID ended on:   \" `date `")
            println(io, "#echo \" \"")
        end

        # submit job and capture sbatch's reply
        sbatch = if isempty(waitfor)
            `sbatch $filename`
        else
            `sbatch --dependency=afterok:$(join(waitfor, ':')) $filename`
        end
        msg = read(sbatch, String)
        verbose && print(stdout, msg)

        # Locate the ID by pattern rather than by character offset, so replies such
        # as "Submitted batch job 123 on cluster foo" are handled too.
        m = match(r"Submitted batch job (\d+)", msg)
        m === nothing && error("could not find a job ID in sbatch output: $(repr(msg))")
        return parse(Int, m.captures[1])
    finally
        # always remove the temporary script, even if sbatch failed
        rm(filename, force=true)
    end
end


"""
    execute(cmd::Cmd) -> NamedTuple

Run `cmd` and return `(stdout = ..., stderr = ..., code = ...)`: the captured
standard output and standard error as `String`s plus the process exit code.

A non-zero exit status does not throw (`ignorestatus`); callers must inspect
`code` themselves.
"""
function execute(cmd::Cmd)
    out = Pipe()
    err = Pipe()

    # Start the process without blocking so the pipes can be drained while it runs.
    # Waiting first would deadlock as soon as the child fills the OS pipe buffer.
    process = run(pipeline(ignorestatus(cmd); stdout=out, stderr=err); wait=false)
    # Close the parent's copies of the write ends so the readers see EOF on exit.
    close(out.in)
    close(err.in)

    out_reader = @async String(read(out))
    err_reader = @async String(read(err))
    wait(process)

    return (
        stdout = fetch(out_reader),
        stderr = fetch(err_reader),
        code = process.exitcode
    )
end

"""
    get_job_names(user="bbchu") -> Vector{String}

Return the names of the jobs that `squeue` currently lists for `user`, one entry
per job, with surrounding whitespace removed and blank lines dropped.

Throws an `ErrorException` when `squeue` exits with a non-zero status. Returning
an empty list in that case would make callers believe nothing is queued and
resubmit jobs that are still running.

NOTE(review): the `%.30j` format presumably limits names to 30 characters, so
longer job names may come back truncated; confirm against the squeue docs.
"""
function get_job_names(user::AbstractString="bbchu")
    result = execute(`squeue -u $user -h -o "%.30j"`)
    if result.code != 0
        error("squeue exited with code $(result.code): $(strip(result.stderr))")
    end
    jobnames = String[]
    for line in split(result.stdout, "\n")
        name = strip(line)
        isempty(name) || push!(jobnames, name)
    end
    return jobnames
end

get_job_names (generic function with 1 method)

## Obtain Indian SNP data

QC of Indian genotype/phenotype was done in `ukb_phenotypes_QC_Indians.ipynb`

## Prepare covariate file

In [11]:
# Build one covariate table per population: sample ID, sex, age terms, the two
# age-by-sex interactions, and PC1..PC10.
for pop in ("indian", "chinese", "caribbean", "african")
    df = CSV.read("/scratch/groups/sabatti/ukb_phenotypes/phenotypes.$pop.csv", DataFrame)

    # interaction terms between sex and (age, age^2)
    df[!, "age_sex"] = df[!, "sex"] .* df[!, "age"]
    df[!, "age_squared_sex"] = df[!, "sex"] .* df[!, "age_squared"]

    # sample ID is "<eid>_<eid>", i.e. FID and IID joined by an underscore
    df[!, "sampleID"] = [string(id, "_", id) for id in df[!, "eid"]]

    # keep only the covariate columns, in the order they are written out
    covariate_cols = [:sampleID, :sex, :age, :age_squared, :age_sex, :age_squared_sex]
    for k in 1:10
        push!(covariate_cols, Symbol("PC$k"))
    end
    df = df[!, covariate_cols]
    CSV.write("/oak/stanford/groups/zihuai/solveblock/covariates.$pop.csv", df)
end

## Run `solveblock`

In [16]:
obj = "default" # LD split objective
exe = "/home/groups/sabatti/.julia/dev/GhostKnockoffGWAS/app_linux_x86/bin/solveblock"
hg_build = 19

# For every population and chromosome, submit one `solveblock` job per LD block
# whose three output files are not all present and which is not already queued.
for pop in ["indian", "chinese", "caribbean", "african"]
    plinkfile = "/scratch/groups/sabatti/ukb_genotypes/$pop/allchr.bed"
    outdir = "/oak/stanford/groups/zihuai/solveblock/LD_files_$pop/$obj"
    region_dir = "/oak/stanford/groups/zihuai/solveblock/LD_files_$pop/LD_blocks/$obj"
    covfile = "/oak/stanford/groups/zihuai/solveblock/covariates.$pop.csv"

    # number of blocks (re)submitted for this population, i.e. blocks whose
    # outputs were missing or incomplete
    failed = 0
    # NOTE(review): job names do not include the population, and this list is
    # refreshed per population, so a block with identical chr/start/end in two
    # populations would be skipped for the later one. Confirm this cannot occur.
    running_jobs = get_job_names()
    for chr in 1:22
        ##### get quasi-independent blocks and handle SNPs falling between windows
        region_file = joinpath(region_dir, "chr$chr.maf0.01.thr0.01.maxr0.3.bed")
        df = CSV.read(region_file, DataFrame)
        start_pos = df[!, "start"]
        end_pos = df[!, "stop"]
        for i in 2:length(start_pos)
            # Split the gap between consecutive blocks in half. `fld` matches
            # floor(x / 2) but keeps the positions as integers.
            half_gap = fld(start_pos[i] - end_pos[i-1], 2)
            end_pos[i-1] += half_gap
            start_pos[i] -= half_gap
            # blocks must not share a boundary position
            if start_pos[i] == end_pos[i-1]
                start_pos[i] += 1
            end
        end

        # submit jobs
        chr_dir = joinpath(outdir, "chr$chr")
        for (s, e) in zip(start_pos, end_pos)
            LDfile = joinpath(chr_dir, "LD_start$(s)_end$(e).h5")
            summary_file = joinpath(chr_dir, "summary_start$(s)_end$(e).csv")
            info_file = joinpath(chr_dir, "Info_start$(s)_end$(e).csv")
            outputs = (LDfile, summary_file, info_file)
            job = "chr$(chr)s$(s)e$e"
            if !all(isfile, outputs) && !(job in running_jobs)
                # remove partial results so the rerun starts clean
                foreach(f -> rm(f, force=true), outputs)
                cmd = "$exe --file $plinkfile --chr $chr --start_bp $s --end_bp $e --outdir $outdir --genome-build $hg_build --covfile $covfile"
                submit(cmd, 1, 24, jobname=job, highp=false)
                failed += 1
            end
        end
    end
    println("$pop failed $failed")
end

indian failed 0
chinese failed 0
caribbean failed 0
african failed 0
