# Compare MendelImpute against Minimac4 and Beagle5 on simulated data

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using SparseArrays
using JLD2, FileIO, JLSO
using ProgressMeter
using GroupSlices
using ThreadPools
# using Plots
# using ProfileView

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1273


# Simulate data

### Step 0. Install `msprime`

[msprime download Link](https://msprime.readthedocs.io/en/stable/installation.html).

Some people might need to activate the conda base environment via `conda config --set auto_activate_base True`. You can turn it off again once the simulation is done by executing `conda config --set auto_activate_base False`.


### Step 1. Simulate data in terminal

```
python3 msprime_script.py 4000 10000 5000000 2e-8 2e-8 2019 > full.vcf
```

Arguments: 
+ Number of haplotypes = 4000
+ Effective population size = 10000 ([source](https://www.the-scientist.com/the-nutshell/ancient-humans-more-diverse-43556))
+ Sequence length = 5 million (the command above passes 5000000; Beagle 5's simulations used 10 million)
+ Recombination rate = 2e-8 (default)
+ mutation rate = 2e-8 (default)
+ seed = 2019

### Step 2: Convert simulated haplotypes to reference haplotypes and target genotype files

+ `haplo_ref.vcf.gz`: haplotype reference files
+ `target.vcf.gz`: complete genotype information
+ `target_masked.vcf.gz`: the same as `target.vcf.gz` except some entries are masked

In [4]:
# count the SNP records and samples in the simulated VCF
records, samples = nrecords("./compare1/full.vcf"), nsamples("./compare1/full.vcf")
@show records
@show samples;

# compute target and reference index:
# the last 1000 samples become imputation targets, all remaining samples form the reference panel
tgt_index = falses(samples)
tgt_index[samples-999:end] .= true
ref_index = .!tgt_index
record_index = trues(records) # save all records (SNPs) 

# create target.vcf.gz and haplo_ref.vcf.gz
@time VCFTools.filter("./compare1/full.vcf", record_index, tgt_index, des = "./compare1/target.vcf.gz")
@time VCFTools.filter("./compare1/full.vcf", record_index, ref_index, des = "./compare1/haplo_ref.vcf.gz")

# import full target matrix. Also transpose so that columns are samples. 
# Read the file that was just written above (same ./compare1/ directory). Only the
# dimensions of X feed into the mask, so no allele-coding keyword is needed; the
# `as_minorallele` keyword is also not accepted by every VCFTools version.
@time X = convert_gt(Float32, "./compare1/target.vcf.gz")
X = copy(X')

# mask 10% entries
p, n = size(X)
Random.seed!(123)
missingprop = 0.1
X .= ifelse.(rand(Float32, p, n) .< missingprop, missing, X)
masks = ismissing.(X)

# save X to new VCF file, next to the unmasked target so later cells can find it
mask_gt("./compare1/target.vcf.gz", masks, des="./compare1/target_masked.vcf.gz")

records = 35897
samples = 2000
 70.297829 seconds (397.28 M allocations: 33.310 GiB, 11.51% gc time)
 67.404677 seconds (395.78 M allocations: 33.237 GiB, 11.69% gc time)
 18.210343 seconds (144.39 M allocations: 12.666 GiB, 17.95% gc time)


# Try compressing haplotype ref panels

In [12]:
# compress as jld2
reffile = "./compare1/haplo_ref.vcf.gz"
tgtfile = "./compare1/target_masked.vcf.gz"
outfile = "./compare1/haplo_ref.jld2"
width = 512
@time compress_haplotypes(reffile, tgtfile, outfile, width);

[32mimporting reference data...100%|████████████████████████| Time: 0:00:06[39m
[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m


 19.165787 seconds (154.79 M allocations: 11.436 GiB, 7.42% gc time)


In [13]:
# compress as jlso
reffile = "./compare1/haplo_ref.vcf.gz"
tgtfile = "./compare1/target_masked.vcf.gz"
outfile = "./compare1/haplo_ref.jlso"
width = 512
@time compress_haplotypes(reffile, tgtfile, outfile, width);

[32mimporting reference data...100%|████████████████████████| Time: 0:00:06[39m
[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m


 18.817403 seconds (152.23 M allocations: 11.387 GiB, 7.13% gc time)


In [14]:
# compress as jlso, once per window width (data is imported only once)
"""
    compress(widths = [32, 64, 128, 256, 512]; reffile, tgtfile, outprefix)

Compress the reference haplotype panel once for every window width in `widths`.

The reference haplotypes and the target genotypes are imported a single time and then
reused for each width. For a width `w` the result is written to the file
`outprefix * ".w" * w * ".jlso"`, and each `compress_haplotypes` call is timed with `@time`.

# Arguments
- `widths`: collection of window widths to compress with.

# Keywords
- `reffile`: VCF file with the reference haplotypes.
- `tgtfile`: VCF file with the target genotypes.
- `outprefix`: path prefix shared by all output files.

# Returns
`nothing`; the function is called for its side effects (output files and timings).
"""
function compress(
    widths = [32, 64, 128, 256, 512];
    reffile = "./compare1/haplo_ref.vcf.gz",
    tgtfile = "./compare1/target_masked.vcf.gz",
    outprefix = "./compare1/haplo_ref",
)
    H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_ht(Bool, reffile, trans=true, save_snp_info=true, msg="importing reference data...")
    # of the target file, only the genotype matrix and the SNP positions are passed on below
    X, _, _, X_pos, _, _, _ = VCFTools.convert_gt(UInt8, tgtfile, trans=true, save_snp_info=true, msg = "Importing genotype file...")
    for width in widths
        outfile = string(outprefix, ".w", width, ".jlso")
        @time compress_haplotypes(H, X, outfile, X_pos, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt, width)
    end
    return nothing
end
compress()

[32mimporting reference data...100%|████████████████████████| Time: 0:00:06[39m
[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m


  6.430602 seconds (978.23 k allocations: 2.129 GiB, 3.79% gc time)
  4.639667 seconds (836.07 k allocations: 1.126 GiB, 7.78% gc time)
  3.349600 seconds (700.40 k allocations: 619.415 MiB, 1.92% gc time)
  2.739742 seconds (583.56 k allocations: 360.084 MiB, 1.68% gc time)
  2.359550 seconds (471.87 k allocations: 211.756 MiB, 1.36% gc time)


In [18]:
# load jld2
@time @load "./compare1/haplo_ref.jld2" compressed_Hunique;

  0.088776 seconds (774.50 k allocations: 58.516 MiB)


In [20]:
# load jlso
@time loaded = JLSO.load("./compare1/haplo_ref.jlso")
compressed_Hunique = loaded[:compressed_Hunique];

  0.225266 seconds (612.38 k allocations: 38.652 MiB, 19.15% gc time)


In [21]:
;ls -al ./compare1/haplo_ref.jld2

-rw-r--r--  1 biona001  staff  23098725 Jul  2 20:58 ./compare1/haplo_ref.jld2


In [22]:
;ls -al ./compare1/haplo_ref.jlso

-rw-r--r--  1 biona001  staff  1959743 Jul  2 20:58 ./compare1/haplo_ref.jlso


In [23]:
;ls -al ./compare1/haplo_ref.vcf.gz

-rw-r--r--@ 1 biona001  staff  5449864 Apr  5 19:59 ./compare1/haplo_ref.vcf.gz


# Haplotype thinning (experiment)

In [4]:
# haplopair_thin (doesn't account for allele freq), keep = 100 (1 thread)
Random.seed!(2020)
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width   = 512
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width, 
    thinning_factor=100, thinning_scale_allelefreq=false);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:07[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.94385 seconds
    Computing haplotype pair        = 3.78641 seconds
        computing dist(X, H)           = 0.0803656 seconds per thread
        BLAS3 mul! to get M and N      = 2.79965 seconds per thread
        haplopair search               = 0.0902262 seconds per thread
        finding redundant happairs     = 0.050725 seconds per thread
    Phasing by dynamic programming  = 1.34666 seconds
    Imputation                      = 3.32163 seconds

 16.552239 seconds (74.83 M allocations: 6.717 GiB, 5.34% gc time)


0.00012427222330556873

In [5]:
# haplopair_thin (accounts for allele freq), keep = 100 (1 thread)
Random.seed!(2020)
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width   = 512
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width, 
    thinning_factor=100, thinning_scale_allelefreq=true);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:08[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 8.71165 seconds
    Computing haplotype pair        = 3.82927 seconds
        computing dist(X, H)           = 0.103507 seconds per thread
        BLAS3 mul! to get M and N      = 3.00298 seconds per thread
        haplopair search               = 0.0957462 seconds per thread
        finding redundant happairs     = 0.0483971 seconds per thread
    Phasing by dynamic programming  = 1.31137 seconds
    Imputation                      = 3.59727 seconds

 17.449979 seconds (74.09 M allocations: 6.688 GiB, 5.87% gc time)


0.00037705100704794273

In [4]:
# haplopair_thin (doesn't account for allele freq), keep = 400 (1 thread)
Random.seed!(2020)
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width   = 512
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width, 
    thinning_factor=400, thinning_scale_allelefreq=false);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:33[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.23692 seconds
    Computing haplotype pair        = 33.8668 seconds
        computing dist(X, H)           = 0.0678813 seconds per thread
        BLAS3 mul! to get M and N      = 31.0557 seconds per thread
        haplopair search               = 1.01014 seconds per thread
        finding redundant happairs     = 0.0575714 seconds per thread
    Phasing by dynamic programming  = 1.59336 seconds
    Imputation                      = 3.47687 seconds

 46.173727 seconds (73.97 M allocations: 6.730 GiB, 1.82% gc time)


0.00012354792879627822

In [5]:
# haplopair_thin (accounts for allele freq), keep = 400 (1 thread)
Random.seed!(2020)
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width   = 512
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width, 
    thinning_factor=400, thinning_scale_allelefreq=true);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:35[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.29256 seconds
    Computing haplotype pair        = 35.8707 seconds
        computing dist(X, H)           = 0.083919 seconds per thread
        BLAS3 mul! to get M and N      = 32.2807 seconds per thread
        haplopair search               = 1.80037 seconds per thread
        finding redundant happairs     = 0.0578252 seconds per thread
    Phasing by dynamic programming  = 1.61832 seconds
    Imputation                      = 3.74514 seconds

 48.526605 seconds (73.88 M allocations: 6.863 GiB, 1.92% gc time)


0.00012257291695684877

# MendelImpute error

In [4]:
# keep best pair only (1 thread)
Random.seed!(2020)
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width   = 512
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:08[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.15743 seconds
    Computing haplotype pair        = 4.59733 seconds
        BLAS3 mul! to get M and N      = 0.0877909 seconds per thread
        haplopair search               = 3.38927 seconds per thread
        supplying constant terms       = 0.0350991 seconds per thread
        finding redundant happairs     = 0.241295 seconds per thread
    Phasing by dynamic programming  = 8.16645 seconds
    Imputation                      = 3.09324 seconds

 23.014640 seconds (73.66 M allocations: 6.632 GiB, 4.34% gc time)


0.00012268434688135499

In [60]:
# keep top matching happairs (1 thread)
Random.seed!(2020)
tgtfile = "target_masked.vcf.gz"
reffile = "haplo_ref.jlso"
outfile = "imputed_target.vcf.gz"
width   = 512
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width, rescreen=true);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "target.vcf.gz")
n, p = size(X_mendel)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:08[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:07[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:09[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 8.60848 seconds
    Computing haplotype pair        = 7.25466 seconds
        BLAS3 mul! to get M and N      = 0.0993171 seconds (on thread 1)
        haplopair search               = 4.07569 seconds (on thread 1)
        min least sq on observed data  = 1.16035 seconds (on thread 1)
        finding redundant happairs     = 0.525039 seconds (on thread 1)
    Phasing by dynamic programming  = 9.25942 seconds
    Imputation                      = 3.39352 seconds

 28.516151 seconds (73.61 M allocations: 7.466 GiB, 7.49% gc time)


0.0001033791124606513

In [4]:
# keep best pair only (8 thread)
Random.seed!(2020)
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width   = 512
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:07[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 8.12152 seconds
    Computing haplotype pair        = 1.1218 seconds
        BLAS3 mul! to get M and N      = 0.0341568 seconds per thread
        haplopair search               = 0.705712 seconds per thread
        supplying constant terms       = 0.00539063 seconds per thread
        finding redundant happairs     = 0.0542955 seconds per thread
    Phasing by dynamic programming  = 1.64034 seconds
    Imputation                      = 3.46126 seconds

 14.345867 seconds (73.67 M allocations: 6.676 GiB, 6.10% gc time)


0.00012268434688135499

In [3]:
# Keep a list of top haplotype pairs (1 thread)
Random.seed!(2020)
tgtfile = "target_masked.vcf.gz"
reffile = "haplo_ref.jlso"
outfile = "imputed_target.vcf.gz"
width   = 500
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "target.vcf.gz")
n, p = size(X_mendel)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:10[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:25[39m


Data import time                    = 6.6587 seconds
Computing haplotype pair time       = 10.2962 seconds
Phasing by dynamic programming time = 25.9828 seconds
Imputing time                       = 3.6637 seconds
 46.601150 seconds (73.68 M allocations: 7.879 GiB, 2.17% gc time)


0.00010181909351756413

In [5]:
# Keep a list of top haplotype pairs (8 thread)
Random.seed!(2020)
tgtfile = "target_masked.vcf.gz"
reffile = "haplo_ref.jlso"
outfile = "imputed_target.vcf.gz"
width   = 500
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "target.vcf.gz")
n, p = size(X_mendel)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m


Data import time                    = 7.00521 seconds
Computing haplotype pair time       = 2.14302 seconds
Phasing by dynamic programming time = 4.11389 seconds
Imputing time                       = 3.09487 seconds
 16.357013 seconds (73.69 M allocations: 7.939 GiB, 5.88% gc time)


0.00010181909351756413

# MendelImpute with intersecting haplotype sets

In [6]:
# keep best pair only (1 thread)
Random.seed!(2020)
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width   = 512
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width,
    dynamic_programming = false);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 6.95714 seconds
    Computing haplotype pair        = 0.80774 seconds
        BLAS3 mul! to get M and N      = 0.0253795 seconds per thread
        haplopair search               = 0.616438 seconds per thread
        finding redundant happairs     = 0.0152108 seconds per thread
    Phasing by win-win intersection = 0.406233 seconds
    Imputation                      = 2.97703 seconds

 11.148283 seconds (78.37 M allocations: 6.367 GiB, 7.62% gc time)


0.0014259408864250494

In [11]:
hs[1].strand1[1]

Set(Int32[493, 549, 1118, 395, 204, 1186, 1288, 1205, 357, 1278])

In [12]:
hs[1].strand1[10]

Set(Int32[1971, 791, 1006, 384, 1566, 159, 285, 1047, 258, 669  …  580, 869, 1512, 1258, 1168, 840, 557, 1661, 56, 1449])

In [13]:
hs[1].strand2[1]

Set(Int32[461, 1492, 381, 882, 1384, 1425, 1886])

In [14]:
hs[1].strand2[10]

Set(Int32[461, 1841, 1492, 381, 345, 882, 637])

In [47]:
# keep best pair only (1 thread)
Random.seed!(2020)
tgtfile = "target_masked.vcf.gz"
reffile = "haplo_ref.jlso"
outfile = "imputed_target.vcf.gz"
width   = 512
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width,
    dynamic_programming = false);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "target.vcf.gz")
n, p = size(X_mendel)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:07[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.65154 seconds
    Computing haplotype pair        = 3.42246 seconds
        BLAS3 mul! to get M and N      = 0.0976265 seconds per thread
        haplopair search               = 2.44649 seconds per thread
        supplying constant terms       = 0.0362332 seconds per thread
        finding redundant happairs     = 0.0447599 seconds per thread
    Phasing by dynamic programming  = 0.127417 seconds
    Imputation                      = 3.12078 seconds

 14.322548 seconds (73.73 M allocations: 6.138 GiB, 6.11% gc time)


0.00014143243167952753

In [6]:
# keep best pair only (8 thread)
Random.seed!(2020)
width   = 512
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width, 
    dynamic_programming = false);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
rm(outfile, force=true)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:07[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 8.20204 seconds
    Computing haplotype pair        = 0.83389 seconds
        BLAS3 mul! to get M and N      = 0.026647 seconds per thread
        haplopair search               = 0.645136 seconds per thread
        supplying constant terms       = 0.00510663 seconds per thread
        finding redundant happairs     = 0.00728226 seconds per thread
    Phasing by dynamic programming  = 0.036382 seconds
    Imputation                      = 3.11191 seconds

 12.184995 seconds (73.73 M allocations: 6.138 GiB, 6.84% gc time)


0.00014143243167952753

# Try Lasso

In [5]:
# phase with lasso = 1 (8 thread)
Random.seed!(2020)
width   = 512
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width, 
    lasso = 1);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
rm(outfile, force=true)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:07[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 8.24451 seconds
    Computing haplotype pair        = 1.29866 seconds
        BLAS3 mul! to get M and N      = 0.965563 seconds per thread
        haplopair search               = 0.0108781 seconds per thread
        finding redundant happairs     = 0.0334909 seconds per thread
    Phasing by dynamic programming  = 0.689294 seconds
    Imputation                      = 3.45493 seconds

 13.687752 seconds (73.92 M allocations: 6.695 GiB, 7.37% gc time)


0.0007754408446388277

In [10]:
# phase with lasso = 5 (8 thread)
Random.seed!(2020)
width   = 512
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width, 
    lasso = 5);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
rm(outfile, force=true)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:07[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.73813 seconds
    Computing haplotype pair        = 1.36674 seconds
        BLAS3 mul! to get M and N      = 0.976143 seconds per thread
        haplopair search               = 0.0872356 seconds per thread
        finding redundant happairs     = 0.0453141 seconds per thread
    Phasing by dynamic programming  = 1.20663 seconds
    Imputation                      = 3.04656 seconds

 13.358349 seconds (74.00 M allocations: 6.687 GiB, 6.65% gc time)


0.0002194612363150124

In [11]:
# phase with lasso = 20 (8 thread)
Random.seed!(2020)
width   = 512
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width, 
    lasso = 20);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
# load the truth in this cell so the error rate does not depend on a global left by an earlier run
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
# the imputed file is only needed for this comparison, so remove it once it is in memory
rm(outfile, force=true)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:07[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.54905 seconds
    Computing haplotype pair        = 1.45052 seconds
        BLAS3 mul! to get M and N      = 0.965816 seconds per thread
        haplopair search               = 0.168008 seconds per thread
        finding redundant happairs     = 0.04538 seconds per thread
    Phasing by dynamic programming  = 1.2754 seconds
    Imputation                      = 3.05638 seconds

 13.331794 seconds (73.95 M allocations: 6.682 GiB, 6.54% gc time)


0.000129342284870602

# Beagle 5.1 Error

In [5]:
# convert to bref3 (run in terminal)
java -jar bref3.18May20.d20.jar haplo_ref.vcf.gz > haplo_ref.bref3 

usage:
  java -jar bref3.18May20.d20.jar help

  java -jar bref3.18May20.d20.jar [vcf] <nseq>  > [bref3]

  cat   [vcf]   | java -jar bref3.18May20.d20.jar <nseq>  > [bref3]

where
  [bref3]  = the output bref3 file
  [vcf]    = A VCF file with phased, non-missing genotype data.  If the
             file is gzip-compressed, its filename must end in ".gz"
             and "cat" must be replaced with "zcat"
  <nseq>   = optional argument for maximum number of unique sequences
             in a bref3 block. If there are N reference samples,
             the default value is: <max-seq>=2^(2*log10(N) + 1)



In [6]:
# run beagle 5 (1 thread)
run(`java -jar beagle.18May20.d20.jar gt=target_masked.vcf.gz ref=haplo_ref.bref3 out=beagle.result nthreads=1`)

beagle.18May20.d20.jar (version 5.1)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.18May20.d20.jar" to list command line argument
Start time: 09:01 PM PDT on 29 Jun 2020

Command line: java -Xmx3641m -jar beagle.18May20.d20.jar
  gt=target_masked.vcf.gz
  ref=haplo_ref.bref3
  out=beagle.result
  nthreads=1

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       1,000
Study samples:           1,000

Window 1 (1:36-4999683)
Reference markers:      35,897
Study markers:          35,897

Burnin  iteration 1:           2 minutes 39 seconds
Burnin  iteration 2:           2 minutes 56 seconds
Burnin  iteration 3:           2 minutes 59 seconds
Burnin  iteration 4:           2 minutes 56 seconds
Burnin  iteration 5:           2 minutes 58 seconds
Burnin  iteration 6:           4 minutes 33 seconds

Phasing iteration 1:           5 minutes 48 seconds
Phasing iteration 2:           2 minutes 50 seconds
Phasing iteration 3:           2 minutes 52 seconds
Pha

MethodError: MethodError: no method matching convert_gt(::Type{Float32}, ::String; as_minorallele=false)
Closest candidates are:
  convert_gt(::Type{T}, ::AbstractString; model, impute, center, scale, trans, msg, save_snp_info) where T<:Real at /Users/biona001/.julia/packages/VCFTools/fTfDS/src/convert.jl:232 got unsupported keyword argument "as_minorallele"
  convert_gt(::Type{T}, !Matched::Tuple{Bool,Bool}) where T<:Real at /Users/biona001/.julia/packages/VCFTools/fTfDS/src/convert.jl:10 got unsupported keyword argument "as_minorallele"
  convert_gt(::Type{T}, !Matched::Tuple{Bool,Bool}, !Matched::Symbol) where T<:Real at /Users/biona001/.julia/packages/VCFTools/fTfDS/src/convert.jl:10 got unsupported keyword argument "as_minorallele"

In [8]:
# beagle 5.1 error rate
X_complete = convert_gt(Float32, "target.vcf.gz")
n, p = size(X_complete)
X_beagle = convert_gt(Float32, "beagle.result.vcf.gz")
error_rate = sum(X_beagle .!= X_complete) / n / p

2.0698108477031506e-5

In [2]:
# run beagle 5.1 (8 thread)
run(`java -jar beagle.18May20.d20.jar gt=./compare1/target_masked.vcf.gz ref=./compare1/haplo_ref.bref3 out=./compare1/beagle.result nthreads=8`)
    
# beagle 5 error rate
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_complete)
X_beagle = convert_gt(Float32, "./compare1/beagle.result.vcf.gz")
error_rate = sum(X_beagle .!= X_complete) / n / p

beagle.18May20.d20.jar (version 5.1)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.18May20.d20.jar" to list command line argument
Start time: 07:33 PM PDT on 30 Jun 2020

Command line: java -Xmx3641m -jar beagle.18May20.d20.jar
  gt=./compare1/target_masked.vcf.gz
  ref=./compare1/haplo_ref.bref3
  out=./compare1/beagle.result
  nthreads=8

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       1,000
Study samples:           1,000

Window 1 (1:36-4999683)
Reference markers:      35,897
Study markers:          35,897

Burnin  iteration 1:           26 seconds
Burnin  iteration 2:           29 seconds
Burnin  iteration 3:           30 seconds
Burnin  iteration 4:           45 seconds
Burnin  iteration 5:           34 seconds
Burnin  iteration 6:           51 seconds

Phasing iteration 1:           1 minute 10 seconds
Phasing iteration 2:           34 seconds
Phasing iteration 3:           35 seconds
Phasing iteration 4:           33 seconds
Phasing i

2.0948825807170516e-5

# Minimac4 error

Need to first convert reference vcf file to m3vcf using minimac3 (on Hoffman)

```Julia
minimac3 = "/u/home/b/biona001/haplotype_comparisons/minimac3/Minimac3/bin/Minimac3"
@time run(`$minimac3 --refHaps haplo_ref.vcf.gz --processReference --prefix haplo_ref`)
```

In [17]:
# run minimac 4
minimac4 = "/Users/biona001/Benjamin_Folder/UCLA/research/softwares/Minimac4/build/minimac4"
run(`$minimac4 --refHaps haplo_ref.m3vcf.gz --haps target_masked.vcf.gz --prefix minimac4.result`)

# minimac 4 error rate
# Load the truth in this cell and read both files with identical convert_gt settings,
# so the comparison does not depend on X_complete, n and p left behind by another cell.
X_complete = convert_gt(Float32, "target.vcf.gz")
n, p = size(X_complete)
X_minimac = convert_gt(Float32, "minimac4.result.dose.vcf.gz")
error_rate = sum(X_minimac .!= X_complete) / n / p



 -------------------------------------------------------------------------------- 
          Minimac4 - Fast Imputation Based on State Space Reduction HMM
 --------------------------------------------------------------------------------
           (c) 2014 - Sayantan Das, Christian Fuchsberger, David Hinds
                             Mary Kate Wing, Goncalo Abecasis 

 Version: 1.0.2;
 Built: Mon Sep 30 11:52:22 PDT 2019 by biona001

 Command Line Options: 
       Reference Haplotypes : --refHaps [haplo_ref.m3vcf.gz], --passOnly,
                              --rsid, --referenceEstimates [ON],
                              --mapFile [docs/geneticMapFile.b38.map.txt.gz]
          Target Haplotypes : --haps [target_masked.vcf.gz]
          Output Parameters : --prefix [minimac4.result], --estimate,
                              --nobgzip, --vcfBuffer [200], --format [GT,DS],
                              --allTypedSites, --meta, --memUsage
        Chunking Parameters : --ChunkLengthMb

0.00018399866284090594

# BLAS 2 

In [5]:
# haplopair_thin (doesn't account for allele freq), keep = 100 (1 thread)
Random.seed!(2020)
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width   = 512
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width, 
    thinning_factor=100, thinning_scale_allelefreq=false);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:08[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:05[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 9.06753 seconds
    Computing haplotype pair        = 4.12708 seconds
        computing dist(X, H)           = 0.102362 seconds per thread
        BLAS3 mul! to get M and N      = 3.08088 seconds per thread
        haplopair search               = 0.115584 seconds per thread
        finding redundant happairs     = 0.077741 seconds per thread
    Phasing by dynamic programming  = 2.44924 seconds
    Imputation                      = 5.92068 seconds

 21.565006 seconds (73.97 M allocations: 6.675 GiB, 4.47% gc time)


0.00012427222330556873

# BLAS 3

In [4]:
# haplopair_thin (doesn't account for allele freq), keep = 100 (1 thread)
Random.seed!(2020)
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width   = 512
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width, 
    thinning_factor=100, thinning_scale_allelefreq=false);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:08[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:06[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 8.99109 seconds
    Computing haplotype pair        = 4.14255 seconds
        computing dist(X, H)           = 0.103014 seconds per thread
        BLAS3 mul! to get M and N      = 3.12272 seconds per thread
        haplopair search               = 0.121277 seconds per thread
        finding redundant happairs     = 0.0745271 seconds per thread
    Phasing by dynamic programming  = 2.65593 seconds
    Imputation                      = 6.88047 seconds

 22.670608 seconds (73.97 M allocations: 6.675 GiB, 4.33% gc time)


0.00012427222330556873