Skip to content

Commit

Permalink
Merge pull request #660 from davmlaw/main
Browse files Browse the repository at this point in the history
#447 - implement babelfish for VCF
  • Loading branch information
reece committed Sep 10, 2023
2 parents 4761114 + 7c4b2d9 commit 4f838df
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 48 deletions.
86 changes: 60 additions & 26 deletions src/hgvs/extras/babelfish.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
"""translate between HGVS and other formats"""
import os

import bioutils.assemblies

import hgvs
import hgvs.normalizer
from bioutils.assemblies import make_ac_name_map, make_name_ac_map
from bioutils.sequences import reverse_complement
from hgvs.edit import NARefAlt
from hgvs.location import Interval, SimplePosition
from hgvs.normalizer import Normalizer
from hgvs.posedit import PosEdit
from hgvs.sequencevariant import SequenceVariant


def _as_interbase(posedit):
Expand All @@ -18,21 +25,18 @@ def _as_interbase(posedit):

class Babelfish:
def __init__(self, hdp, assembly_name):
self.assembly_name = assembly_name
self.hdp = hdp
self.hn = hgvs.normalizer.Normalizer(hdp, cross_boundaries=False, shuffle_direction=5, validate=False)
self.ac_to_chr_name_map = {
sr["refseq_ac"]: sr["name"] for sr in bioutils.assemblies.get_assembly("GRCh38")["sequences"]
}
self.ac_to_name_map = make_ac_name_map(assembly_name)
self.name_to_ac_map = make_name_ac_map(assembly_name)

def hgvs_to_vcf(self, var_g):
"""**EXPERIMENTAL**
converts a single hgvs allele to (chr, pos, ref, alt) using
the given assembly_name. The chr name uses official chromosome
name (i.e., without a "chr" prefix).
Returns None for non-variation (e.g., NC_000006.12:g.49949407=)
"""

if var_g.type != "g":
Expand All @@ -42,33 +46,63 @@ def hgvs_to_vcf(self, var_g):

(start_i, end_i) = _as_interbase(vleft.posedit)

chr = self.ac_to_chr_name_map[vleft.ac]
chrom = self.ac_to_name_map[vleft.ac]

typ = vleft.posedit.edit.type

if typ == "dup":
start_i -= 1
alt = self.hdp.seqfetcher.fetch_seq(vleft.ac, start_i, end_i)
ref = alt[0]
end_i = start_i
return (chr, start_i + 1, ref, alt, typ)

if vleft.posedit.edit.ref == vleft.posedit.edit.alt:
return None

alt = vleft.posedit.edit.alt or ""

if end_i - start_i == 1 and vleft.posedit.length_change == 0:
# SNVs aren't left anchored
elif typ == 'inv':
ref = vleft.posedit.edit.ref

alt = reverse_complement(ref)
else:
# everything else is left-anchored
start_i -= 1
ref = self.hdp.seqfetcher.fetch_seq(vleft.ac, start_i, end_i)
alt = ref[0] + alt

return (chr, start_i + 1, ref, alt, typ)
alt = vleft.posedit.edit.alt or ""

if typ in ('del', 'ins'): # Left anchored
start_i -= 1
ref = self.hdp.seqfetcher.fetch_seq(vleft.ac, start_i, end_i)
alt = ref[0] + alt
else:
ref = vleft.posedit.edit.ref
if ref == alt:
alt = '.'
return chrom, start_i + 1, ref, alt, typ

def vcf_to_g_hgvs(self, chrom, position, ref, alt):
ac = self.name_to_ac_map[chrom]

# Strip common prefix
if len(alt) > 1 and len(ref) > 1:
pfx = os.path.commonprefix([ref, alt])
lp = len(pfx)
if lp > 0:
ref = ref[lp:]
alt = alt[lp:]
position += lp

if ref == '': # Insert
# Insert uses coordinates around the insert point.
start = position - 1
end = position
else:
start = position
end = position + len(ref) - 1

if alt == '.':
alt = ref

var_g = SequenceVariant(ac=ac,
type='g',
posedit=PosEdit(Interval(start=SimplePosition(start),
end=SimplePosition(end),
uncertain=False),
NARefAlt(ref=ref or None,
alt=alt or None,
uncertain=False)))
n = Normalizer(self.hdp)
return n.normalize(var_g)


if __name__ == "__main__":
Expand Down
Binary file modified tests/data/cache-py3.hdp
Binary file not shown.
75 changes: 53 additions & 22 deletions tests/test_hgvs_extras_babelfish.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,43 @@
NORM_HGVS_VCF = [
# Columns are: (normed-HGVS, non-normalized HGVS, VCF coordinates, non-norm VCF)

# no-op
("NC_000006.12:g.49949407=", [],
("6", 49949407, "A", ".", 'identity'), [("6", 49949407, "A", "A", 'identity')]),

# snv
("NC_000006.12:g.49949407A>T",
[],
# was ("6", 49949406, "AA", "AT", "sub") however VT parsimony rules say it should be those below
("6", 49949407, "A", "T", "sub"), []),
# delins
("NC_000006.12:g.49949413_49949414delinsCC",
[],
# This was ("6", 49949412, "AAA", "ACC", "delins") - however VT parsimony rules say it should be those below
("6", 49949413, "AA", "CC", "delins"), []),

# del, no shift
("NC_000006.12:g.49949415del", [], ("6", 49949414, "AT", "A", "del"), []),

# del, w/ shift
("NC_000006.12:g.49949414del", ["NC_000006.12:g.49949413del"],
("6", 49949409, "GA", "G", "del"), []),
("NC_000006.12:g.49949413_49949414del", [], ("6", 49949409, "GAA", "G", "del"), []),

# ins, no shift
("NC_000006.12:g.49949413_49949414insC", [], ("6", 49949413, "A", "AC", "ins"), []),
("NC_000006.12:g.49949414_49949415insCC", [], ("6", 49949414, "A", "ACC", "ins"), []),

# ins/dup, w/shift
("NC_000006.12:g.49949414dup",
["NC_000006.12:g.49949413_49949414insA", "NC_000006.12:g.49949414_49949415insA"],
("6", 49949409, "G", "GA", "dup"), []),
("NC_000006.12:g.49949413_49949414dup",
["NC_000006.12:g.49949414_49949415insAA"],
("6", 49949409, "G", "GAA", "dup"), []),
]


def test_hgvs_to_vcf(parser, babelfish38):
"""
49949___ 400 410 420
Expand All @@ -9,28 +49,19 @@ def test_hgvs_to_vcf(parser, babelfish38):
def _h2v(h):
return babelfish38.hgvs_to_vcf(parser.parse(h))

# no-op
assert _h2v("NC_000006.12:g.49949407=") == None

# snv
assert _h2v("NC_000006.12:g.49949407A>T") == ("6", 49949406, "AA", "AT", "sub")

# delins
assert _h2v("NC_000006.12:g.49949413_49949414delinsCC") == ("6", 49949412, "AAA", "ACC", "delins")

# del, no shift
assert _h2v("NC_000006.12:g.49949415del") == ("6", 49949414, "AT", "A", "del")
for norm_hgvs_string, alt_hgvs, expected_variant_coordinate, _ in NORM_HGVS_VCF:
for hgvs_string in [norm_hgvs_string] + alt_hgvs:
variant_coordinates = _h2v(hgvs_string)
assert variant_coordinates == expected_variant_coordinate

# del, w/ shift
assert _h2v("NC_000006.12:g.49949413del") == ("6", 49949409, "GA", "G", "del")
assert _h2v("NC_000006.12:g.49949414del") == ("6", 49949409, "GA", "G", "del")
assert _h2v("NC_000006.12:g.49949413_49949414del") == ("6", 49949409, "GAA", "G", "del")

# ins, no shift
assert _h2v("NC_000006.12:g.49949413_49949414insC") == ("6", 49949413, "A", "AC", "ins")
assert _h2v("NC_000006.12:g.49949414_49949415insCC") == ("6", 49949414, "A", "ACC", "ins")
def test_vcf_to_hgvs(parser, babelfish38):
def _v2h(*v):
return babelfish38.vcf_to_g_hgvs(*v)

# ins/dup, w/shift
assert _h2v("NC_000006.12:g.49949413_49949414insA") == ("6", 49949409, "G", "GA", "dup")
assert _h2v("NC_000006.12:g.49949414_49949415insA") == ("6", 49949409, "G", "GA", "dup")
assert _h2v("NC_000006.12:g.49949414_49949415insAA") == ("6", 49949409, "G", "GAA", "dup")
for expected_hgvs_string, _, norm_variant_coordinate, alt_variant_coordinate in NORM_HGVS_VCF:
for variant_coordinate in [norm_variant_coordinate] + alt_variant_coordinate:
*v, typ = variant_coordinate # last column is type ie "dup"
hgvs_g = _v2h(*v)
hgvs_string = hgvs_g.format()
assert hgvs_string == expected_hgvs_string

0 comments on commit 4f838df

Please sign in to comment.