Skip to content

Commit

Permalink
support VCFs with no AD field
Browse files Browse the repository at this point in the history
closes #31
  • Loading branch information
brentp committed Oct 23, 2019
1 parent af2617c commit 35e42e5
Show file tree
Hide file tree
Showing 8 changed files with 56,946 additions and 5 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
v0.2.5 (dev)
============
+ handle more types of GVCF
+ handle VCFs without depth (AD) information. this enables extracting VCFs with only genotypes such as
files converted from array information (#31)

v0.2.4
======
Expand Down
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# somalier: extract informative sites, evaluate relatedness, and perform quality-control
# somalier: extract informative sites, evaluate relatedness, and perform quality-control on BAM/CRAM/BCF/VCF/GVCF

[![Actions Status](https://github.com/brentp/somalier/workflows/Docker%20Image%20CI/badge.svg)](https://github.com/brentp/somalier/actions)

Expand Down Expand Up @@ -56,7 +56,7 @@ of the extracted files.

## VCF

`somalier` can `extract` from a multi or single-sample VCF. This will be much faster, in cases where it's available,
`somalier` can `extract` from a multi or single-sample VCF or a GVCF. This will be much faster, in cases where it's available,
this would look like:

```
Expand All @@ -65,6 +65,10 @@ somalier extract -d extracted/ --sites sites.vcf.gz -f /data/human/g1k_v37_decoy

following this, there will be a `$sample.somalier` file for each sample in the `joint.vcf.gz`

Note that `somalier` uses the `AD` field to extract depth information. If that FORMAT field is not present in the
header, then it will use the genotypes only and use a total depth of 20 (10,10 for heterozygote), etc.


## Install

get a static binary from [here](https://github.com/brentp/somalier/releases)
Expand Down
1 change: 1 addition & 0 deletions somalier.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,5 @@ srcDir = "src"

task test, "run the tests":
exec "nim c -d:useSysAssert -d:useGcAssert --lineDir:on --debuginfo -r tests/test_groups"
exec "bash tests/functional-tests.sh"

39 changes: 36 additions & 3 deletions src/somalier.nim
Original file line number Diff line number Diff line change
Expand Up @@ -43,22 +43,56 @@ proc get_variant(ivcf:VCF, site:Site): Variant =
(v.REF == site.B_allele and v.ALT[0] == site.A_allele)):
return v.copy()

proc ok(v:Variant): bool {.inline.} =
if v == nil: return false
if v.FILTER notin ["PASS", "", ".", "RefCall"]: return false
return true

proc fill(AD: var seq[int32], alts: seq[int8]) =
for i, a in alts:
case a:
of 0:
AD[2 * i] = 20
AD[2 * i + 1] = 0
of 1:
AD[2 * i] = 10
AD[2 * i + 1] = 10
of 2:
AD[2 * i] = 0
AD[2 * i + 1] = 20
else:
AD[2 * i] = 0
AD[2 * i + 1] = 0

proc get_ref_alt_counts(ivcf:VCF, sites:seq[Site], fai:Fai=nil): seq[counts] =
result = newSeq[counts](ivcf.samples.len)
var vcf_samples = ivcf.samples
for j, s in vcf_samples:
result[j].sample_name = s
result[j].sites = newSeqOfCap[allele_count](sites.len)

var has_AD = true
try:
discard ivcf.header.get("AD", BCF_HEADER_TYPE.BCF_HL_FMT)
except KeyError:
has_AD = false

if not has_AD:
stderr.write_line "[somalier] FORMAT field 'AD' not found for depth information. using genotype only"

var AD = newSeq[int32](5*vcf_samples.len)
var x = newSeq[int32](vcf_samples.len)
var n = 0

for i, site in sites:
var v = ivcf.get_variant(site)
if v == nil or ($v.CHROM notin ["chrX", "X", "chrY", "Y"] and v.FILTER notin ["PASS", "", ".", "RefCall"]) or v.format.get("AD", AD) != Status.OK:
AD.setLen(vcf_samples.len * 2)
zeroMem(AD[0].addr, AD.len * sizeof(AD[0]))
if v != nil and v.looks_like_gvcf_variant:
if v.ok and not has_AD:
AD.fill(v.format.genotypes(x).alts)
n += 1
elif v.ok and v.looks_like_gvcf_variant:
var dp:seq[int32]
if v.format.get("MIN_DP", dp) == Status.OK or v.format.get("DP", dp) == Status.OK:
AD[0] = dp[0]
Expand Down Expand Up @@ -199,14 +233,13 @@ proc main() =
return

if len(args) == 0 or args[0] in ["-h", "--help"]:
stderr.write_line "Commands: "
stdout.write_line "Commands: "
for k, v in dispatcher:
echo &" {k:<13}: {v.description}"
else:
echo &"unknown program '{args[0]}'"
quit ""



when isMainModule:
main()
32 changes: 32 additions & 0 deletions tests/functional-tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
test -e ssshtest || wget -q https://raw.githubusercontent.com/ryanlayer/ssshtest/master/ssshtest

. ssshtest

set -o nounset

nim c -d:debug -d:useSysAssert -d:useGcAssert --lineDir:on --debuginfo --boundChecks:on -x:on src/somalier
set +e
exe=./src/somalier

run check_help_works $exe --help
assert_exit_code 0
assert_in_stdout "Commands:"

#wget -O - https://github.com/brentp/somalier/files/3412455/sites.GRCh37.vcf.gz | zcat - | head -200 > tests/test_sites.vcf
# make a GT only VCF

set -e
awk 'BEGIN{FS=OFS="\t"} ($0 ~ /^#CHROM/) { print $0"\tFORMAT\ttest_sample"; next } NR == 3 { print "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"} $0 ~ /^#/ { print $0 } $0 !~ /^#/ { if(NR!=180) {print $0"\tGT\t0/1"} else { print $0"\tGT\t./." }}' \
tests/test_sites.vcf | bgzip -c > tests/gt_only.vcf.gz
tabix tests/gt_only.vcf.gz
set +e


run check_gt_only_extract $exe extract -s tests/test_sites.vcf -f tests/test.fa -d test_gt_only tests/gt_only.vcf.gz
assert_in_stderr "[somalier] FORMAT field 'AD' not found for depth information. using genotype only"
assert_exit_code 0
assert_in_stderr "[somalier] found 49 sites"

rm -rf test_gt_only
#rm -f tests/gt_only.vcf.gz
1 change: 1 addition & 0 deletions tests/nim.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
path = "$projectPath/../src"
Loading

0 comments on commit 35e42e5

Please sign in to comment.