Skip to content

Commit

Permalink
feat: implement seqvar frequency db construction (#2) (#5)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Mar 9, 2023
1 parent 6465c4d commit 01fc64c
Show file tree
Hide file tree
Showing 28 changed files with 30,802 additions and 21 deletions.
10 changes: 10 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,18 @@ name = "mehari"

[dependencies]
anyhow = "1.0.69"
byteorder = "1.4.3"
clap = { version = "4.1.8", features = ["derive"] }
clap-verbosity-flag = "2.0.0"
csv = "1.2.0"
hgvs = "0.2.0"
log = "0.4.17"
noodles = { version = "0.33.0", features = ["vcf", "bcf", "csi", "fasta", "bgzf", "tabix"] }
noodles-util = { version = "0.5.0", features = ["noodles-bcf", "noodles-bgzf", "noodles-vcf", "variant"] }
rocksdb = "0.20.1"
serde = { version = "1.0.152", features = ["derive"] }
tracing = { version = "0.1.37", features = ["log"] }
tracing-subscriber = "0.3.16"

[dev-dependencies]
pretty_assertions = "1.3.0"
106 changes: 106 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,109 @@

Mehari is a software package for annotating VCF files with variant effect.
The program uses [hgvs-rs](https://crates.io/crates/hgvs) for projecting genomic variants to transcripts and proteins and thus has high prediction quality.

## Supported Sequence Variant Frequency Databases

Mehari can import public sequence variant frequency databases.
The supported set slightly differs between import for GRCh37 and GRCh38.

**GRCh37**

- gnomAD r2.1.1 Exomes [`gnomad.exomes.r2.1.1.sites.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v2)
- gnomAD r2.1.1 Genomes [`gnomad.genomes.r2.1.1.sites.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v2)
- gnomAD v3.1 mtDNA [`gnomad.genomes.v3.1.sites.chrM.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v3-mitochondrial-dna)
- HelixMTdb `HelixMTdb_20200327.tsv`

**GRCh38**

- gnomAD r2.1.1 lift-over Exomes [`gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v2)
- gnomAD v3.1 Genomes [`gnomad.genomes.v3.1.2.sites.$CHROM.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v3)
- gnomAD v3.1 mtDNA [`gnomad.genomes.v3.1.sites.chrM.vcf.bgz`](https://gnomad.broadinstitute.org/downloads#v3-mitochondrial-dna)
- HelixMTdb `HelixMTdb_20200327.tsv`

## Internal Notes

```
rm -rf /tmp/out ; cargo run -- db create seqvar-freqs --path-output-db /tmp/out --genome-release grch38 --path-helix-mtdb ~/Downloads/HelixMTdb_20200327.vcf.gz --path-gnomad-mtdna ~/Downloads/gnomad.genomes.v3.1.sites.chrM.vcf.bgz --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrX.vcf --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrY.vcf --path-gnomad-genomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrX.vcf --path-gnomad-genomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrY.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr1.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr2.vcf --path-gnomad-genomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr1.vcf --path-gnomad-genomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr2.vcf
rm -rf /tmp/out ; cargo run -- db create seqvar-freqs --path-output-db /tmp/out --genome-release grch37 --path-gnomad-mtdna ~/Downloads/gnomad.genomes.v3.1.sites.chrM.vcf.bgz --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrX.vcf --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrY.vcf --path-gnomad-genomes-xy tests/data/db/create/seqvar_freqs/xy-37/gnomad.genomes.r2.1.1.sites.chrX.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr1.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr2.vcf --path-gnomad-genomes-auto tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr1.vcf --path-gnomad-genomes-auto tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr2
```

```
prepare()
{
in=$1
out=$2
zcat $in \
| head -n 5000 \
| grep ^# \
> $out
zcat $in \
| grep -v ^# \
| head -n 3 \
>> $out
}
base=/data/sshfs/data/gpfs-1/groups/cubi/work/projects/2021-07-20_varfish-db-downloader-holtgrewe/varfish-db-downloader/
mkdir -p tests/data/db/create/seqvar_freqs/{12,xy}-{37,38}
## 37 exomes
prepare \
$base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr1.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr1.vcf
prepare \
$base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr2.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr2.vcf
prepare \
$base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrX.vcf.bgz \
tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrX.vcf
prepare \
$base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrY.vcf.bgz \
tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrY.vcf
## 37 genomes
prepare \
$base/GRCh37/gnomAD_genomes/r2.1.1/download/gnomad.genomes.r2.1.1.sites.chr1.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr1.vcf
prepare \
$base/GRCh37/gnomAD_genomes/r2.1.1/download/gnomad.genomes.r2.1.1.sites.chr2.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr2.vcf
prepare \
$base/GRCh37/gnomAD_genomes/r2.1.1/download/gnomad.genomes.r2.1.1.sites.chrX.vcf.bgz \
tests/data/db/create/seqvar_freqs/xy-37/gnomad.genomes.r2.1.1.sites.chrX.vcf
## 38 exomes
prepare \
$base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr1.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr1.vcf
prepare \
$base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr2.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr2.vcf
prepare \
$base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrX.vcf.bgz \
tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrX.vcf
prepare \
$base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrY.vcf.bgz \
tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrY.vcf
## 38 genomes
prepare \
$base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chr1.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr1.vcf
prepare \
$base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chr2.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr2.vcf
prepare \
$base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chrX.vcf.bgz \
tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrX.vcf
prepare \
$base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chrY.vcf.bgz \
tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrY.vcf
```
2 changes: 2 additions & 0 deletions codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ignore:
- "misc/*.py"
91 changes: 91 additions & 0 deletions misc/helix-to-vcf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/env python
"""Helper script to convert the HelixMtDb format to VCF.
The resulting file will look like this::
##fileformat=VCFv4.2
##contig=<ID=chrM,length=16569>
##FILTER=<ID=PASS,Description="Variant passes all filters">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Overall allele number (number of samples with non-missing genotype)">
##INFO=<ID=AC_hom,Number=1,Type=Integer,Description="Allele counts called as homoplasmic">
##INFO=<ID=AC_het,Number=1,Type=Integer,Description="Alelle counts called as heteroplasmic">
#CHROM POS ID REF ALT QUAL FILTER INFO
chrM 5 . A C . PASS AN=196554;AC_hom=1;AC_het=0
chrM 10 . T C . PASS AN=196554;AC_hom=7;AC_het=1
chrM 11 . C T . PASS AN=196554;AC_hom=0;AC_het=1
"""

import csv
import json
import sys

import vcfpy

#: Number of individiduals in HelixMtDb
AN = 196_554


def build_writer():
"""Build ``vcfpy`` writer for writing out the VCF file."""
header = vcfpy.Header(samples=vcfpy.SamplesInfos(sample_names=[]))
header.add_line(vcfpy.HeaderLine("fileformat", "VCFv4.2"))
header.add_contig_line({"ID": "chrM", "length": 16569})
header.add_filter_line({"ID": "PASS", "Description": "Variant passes all filters"})
header.add_info_line(
{
"ID": "AN",
"Number": 1,
"Type": "Integer",
"Description": "Overall allele number (number of samples with non-missing genotype)",
}
)
header.add_info_line(
{
"ID": "AC_hom",
"Number": 1,
"Type": "Integer",
"Description": "Allele counts called as homoplasmic",
}
)
header.add_info_line(
{
"ID": "AC_het",
"Number": 1,
"Type": "Integer",
"Description": "Alelle counts called as heteroplasmic",
}
)
return vcfpy.Writer(sys.stdout, header)


def main():
writer = build_writer()
reader = csv.DictReader(sys.stdin, delimiter="\t")
for row in reader:
locus = row["locus"].split(":")
chrom, pos = locus[0], int(locus[1])
alleles = json.loads(row["alleles"])
ac_hom = int(row["counts_hom"])
ac_het = int(row["counts_het"])
writer.write_record(
vcfpy.Record(
CHROM=chrom,
POS=pos,
ID=[],
REF=alleles[0],
ALT=[vcfpy.Substitution(vcfpy.SNV, alleles[1])],
QUAL=None,
FILTER=["PASS"],
INFO={
"AN": AN,
"AC_hom": ac_hom,
"AC_het": ac_het,
},
FORMAT=None,
calls=None,
)
)


if __name__ == "__main__":
sys.exit(main())
20 changes: 0 additions & 20 deletions src/db/create/seqvar_freqs.rs

This file was deleted.

Loading

0 comments on commit 01fc64c

Please sign in to comment.