Skip to content

Commit

Permalink
Merge pull request #486 from brentp/fixes_0.16
Browse files Browse the repository at this point in the history
Fixes for 0.16.1
  • Loading branch information
arq5x committed Jun 24, 2015
2 parents b28706f + fc0cc0e commit e1ceb99
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 33 deletions.
11 changes: 11 additions & 0 deletions docs/content/history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,17 @@ Release History
#. Speed up and generalize database loading for multiple genome builds and species.
#. Add an `is_splicing` column.

0.16.2 (forthcoming)
=======================================
#. Hone rules for unphased and partially-phased compound hets.

0.16.1
=======================================
1. Fix regression in loading when AAF is None
2. Fix handing in mendelian error tool where all genotype likelihoods are low (thanks Bianca)
3. Don't phase de-novo's (caused error in comp_het tool). (thanks Bianca)
4. Fix regression in loading VEP with multicore (thanks Andrew)

0.16.0
=======================================
1. The built-in inheritance model tools (``auto_rec``, etc.) have been modified to be more
Expand Down
57 changes: 51 additions & 6 deletions gemini/family.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,23 @@ def famphase(self, gt_types, gt_phases, gt_bases,
... [False, False, False],
... ["A/C", "A/C", "A/C"])
([False, False, False], ['A/C', 'A/C', 'A/C'])
>>> f.famphase([HOM_REF, HET, HET],
... [False, False, False],
... ["AA/A", "AA/C", "C/AA"])
([False, False, True], ['AA/A', 'AA/C', 'AA|C'])
>>> f.famphase([HOM_REF, HET, HET],
... [False, False, False],
... ['G/G', 'AA/C', 'A/C'])
([False, False, False], ['G/G', 'AA/C', 'A/C'])
>>> f.famphase([HOM_REF, HET, HET],
... [False, False, False],
... ['G/G', 'A/C', 'A/C'])
([False, False, False], ['G/G', 'A/C', 'A/C'])
"""

# NOTE: this modifies in-place
# subjects are in same order as gt_types and _i is the index.
HOM_REF, HET, UNKNOWN, HOM_ALT = range(4)
Expand All @@ -241,25 +257,46 @@ def famphase(self, gt_types, gt_phases, gt_bases,
## cant have unknown
if UNKNOWN in (gt_types[s.mom._i], gt_types[s.dad._i]): continue

# should be able to phase here!
gt_phases[s._i] = True
kid_bases = set(_splitter.split(gt_bases[s._i]))
mom_bases = _splitter.split(gt_bases[s.mom._i])
dad_bases = _splitter.split(gt_bases[s.dad._i])

parent_bases = set(mom_bases + dad_bases)
# can't phase kid with de-novo

if kid_bases - parent_bases:
print >>sys.stderr, "skipping due to de_novo"
continue

# no alleles from dad
if len(kid_bases - set(dad_bases)) == len(kid_bases):
print >>sys.stderr, "skipping due no alleles from dad"
continue

if len(kid_bases - set(mom_bases)) == len(kid_bases):
print >>sys.stderr, "skipping due no alleles from mom"
continue

# should be able to phase here
if gt_types[s.mom._i] in (HOM_REF, HOM_ALT):
assert gt_types[s.mom._i] in (HOM_REF, HOM_ALT)
mom_allele = _splitter.split(gt_bases[s.mom._i])[0]
dad_alleles = _splitter.split(gt_bases[s.dad._i])
mom_allele = mom_bases[0]
dad_alleles = dad_bases
dad_allele = next(d for d in dad_alleles if d != mom_allele)


gt_bases[s._i] = "%s|%s" % (mom_allele, dad_allele)
else:
assert gt_types[s.dad._i] in (HOM_REF, HOM_ALT)

dad_allele = _splitter.split(gt_bases[s.dad._i])[0]
mom_alleles = _splitter.split(gt_bases[s.mom._i])
dad_allele = dad_bases[0]
mom_alleles = mom_bases
mom_allele = next(m for m in mom_alleles if m != dad_allele)

gt_bases[s._i] = "%s|%s" % (mom_allele, dad_allele)

gt_phases[s._i] = True

return gt_phases, gt_bases

@classmethod
Expand Down Expand Up @@ -664,6 +701,14 @@ def mendel_violations(self, min_depth=0, gt_ll=False, only_affected=False):
only_affected)
}

def comp_het_pair(self, gt_types1, gt_bases1, gt_types2, gt_bases2):
"""
TODO:
https://mail.google.com/mail/u/0/#inbox/14e22b8e359c9cb2
"""
pass


def comp_het(self, min_depth=0, gt_ll=False, strict=False,
only_affected=True):
"""
Expand Down
13 changes: 8 additions & 5 deletions gemini/gemini_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,9 @@ def add_extras(gemini_db, chunk_dbs, region_only):
",".join(fields),
region_only)
annotate(None, args)
os.unlink(extra_file[:-3] + ".fields")
rm(extra_file[:-3] + ".fields")
rm(extra_file)
rm(extra_file + ".tbi")


def rm(path):
Expand All @@ -402,7 +404,7 @@ def get_extra_vcf(gemini_db, tmpl=None):
path = os.path.join(tempfile.gettempdir(), "extra.%s.vcf" % base)
mode = "r" if tmpl is None else "w"
if mode == "r":
if not os.path.exists(path):
if not os.path.exists(path) and not os.path.exists(path + ".gz"):
return False
if not path.endswith(".gz"):
subprocess.check_call(["bgzip", "-f", path])
Expand All @@ -413,9 +415,10 @@ def get_extra_vcf(gemini_db, tmpl=None):

fh = open(path, "w")
if mode == "w":
atexit.register(rm, fh.name)
atexit.register(rm, fh.name + ".gz")
atexit.register(rm, fh.name + ".gz.tbi")
# can't do this here because we load in a separate process.
#atexit.register(rm, fh.name)
#atexit.register(rm, fh.name + ".gz")
#atexit.register(rm, fh.name + ".gz.tbi")

return vcf.Writer(fh, tmpl)
return vcf.Reader(fh)
2 changes: 2 additions & 0 deletions gemini/gemini_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,8 @@ def load_chunks_multicore(grabix_file, args):
start, stop = chunk
print "Loading chunk " + str(chunk_num) + "."
gemini_load = gemini_pipe_load_cmd().format(**locals())
if os.environ.get('GEMINI_DEBUG') == "TRUE":
print >>sys.stderr, gemini_load
procs.append(subprocess.Popen(submit_command.format(cmd=gemini_load),
shell=True))

Expand Down
4 changes: 3 additions & 1 deletion gemini/gemini_load_chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,9 @@ def _prepare_variation(self, var):
pi_hat = var.nucl_diversity
else:
aaf = infotag.extract_aaf(var)
aaf = max(aaf)
if not isinstance(aaf, (float, int)):
if aaf is not None:
aaf = max(aaf)

############################################################
# collect annotations from gemini's custom annotation files
Expand Down
56 changes: 35 additions & 21 deletions gemini/gim.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/usr/bin/env python
import collections
import sys
from copy import copy
import re
import collections
from collections import Counter
import GeminiQuery
import sql_utils
import compiler
Expand Down Expand Up @@ -218,14 +219,15 @@ def report_candidates(self):
vs = []

if all(c in self.gt_cols for c in ('gt_phred_ll_homref', 'gt_phred_ll_het', 'gt_phred_ll_homalt')):
pdict["violation_prob"] = ""
for s in pdict['samples']:
# mom, dad, kid
mdk = str(s.mom.genotype_lls + s.dad.genotype_lls + s.genotype_lls)
# get all 3 at once so we just do 1 eval.
vals = eval(mdk, cols)
vs.append(mendelian_error(vals[:3], vals[3:6], vals[6:], pls=True))

pdict["violation_prob"] = ",".join("%.5f" % v for v in vs)
pdict["violation_prob"] = ",".join("%.5f" % v for v in vs if v is not None)
pdict['samples'] = ",".join(s.name or str(s.sample_id) for s in pdict['samples'])

s = str(pdict)
Expand All @@ -235,7 +237,15 @@ def report_candidates(self):

to_yield.append(pdict)

if len(kindreds) >= self.args.min_kindreds:
if 0 < len(kindreds) >= self.args.min_kindreds:
if 'comp_het_id' in to_yield[0]:
counts = Counter((item['comp_het_id'], item['family_id']) for item in to_yield)
# remove singletons.
to_yield = [item for item in to_yield if counts[(item['comp_het_id'], item['family_id'])] > 1]
# re-check min_kindreds
if len(set(item['family_id'] for item in to_yield)) < self.args.min_kindreds:
continue

for item in to_yield:
item['family_count'] = len(kindreds)
yield item
Expand Down Expand Up @@ -335,13 +345,14 @@ def find_valid_het_pairs(self, sample_hets):
# it's composite alleles. e.g. A|G -> ['A', 'G']
alleles_site1 = []
alleles_site2 = []
if not args.ignore_phasing:
alleles_site1 = site1.gt.split('|')
alleles_site2 = site2.gt.split('|')
else:
# NOTE: removed this due to family-based phasing
#if not args.ignore_phasing:
# alleles_site1 = site1.gt.split('|')
# alleles_site2 = site2.gt.split('|')
#lse:
# split on phased (|) or unphased (/) genotypes
alleles_site1 = splitter.split(site1.gt)
alleles_site2 = splitter.split(site2.gt)
alleles_site1 = splitter.split(site1.gt)
alleles_site2 = splitter.split(site2.gt)

# it is only a true compound heterozygote IFF
# the alternates are on opposite haplotypes.
Expand All @@ -357,9 +368,12 @@ def find_valid_het_pairs(self, sample_hets):
" alleles are not yet supported. The sites are:"
" %s and %s.\n" % (sample, site1, site2))
continue

alt_hap_1 = alleles_site1.index(site1.row['alt'])
alt_hap_2 = alleles_site2.index(site2.row['alt'])
try:
alt_hap_1 = alleles_site1.index(site1.row['alt'])
alt_hap_2 = alleles_site2.index(site2.row['alt'])
except ValueError:
# had a genotype like, e.g. "./G"
continue

# Keep as a candidate if
# 1. phasing is considered AND the alt alleles are on
Expand All @@ -368,7 +382,7 @@ def find_valid_het_pairs(self, sample_hets):
# TODO: Phase based on parental genotypes.
if (not args.ignore_phasing and alt_hap_1 != alt_hap_2) \
or args.ignore_phasing:
samples_w_hetpair[(site1,site2)].append(sample)
samples_w_hetpair[(site1, site2)].append(sample)

return samples_w_hetpair

Expand All @@ -389,15 +403,12 @@ def filter_candidates(self, samples_w_hetpair,
requested_fams = None if args.families is None else set(args.families.split(","))
for idx, comp_het in enumerate(candidates):
comp_het_counter[0] += 1
#seen = set()

for s in samples_w_hetpair[comp_het]:
family_id = subjects_dict[s].family_id
# NOTE: added this so each family only reported 1x.
#if family_id in seen:
# continue

if requested_fams is not None and not family_id in requested_fams:
continue
#seen.add(family_id)

ch_id = str(comp_het_counter[0])
cid = "%s_%d_%d" % (ch_id, comp_het[0].row['variant_id'],
Expand Down Expand Up @@ -446,10 +457,13 @@ def candidates(self):
if gt_type != HET:
continue
sample = idx_to_sample[idx]
# need to keep unaffecteds so we no if someone has the same
# pair.
#if args.only_affected and not self.subjects_dict[sample].affected:
# need to keep unaffecteds so we know if someone has the same
# if args.only_affected and not self.subjects_dict[sample].affected:
# continue

# e.g. .|G, we can't use this for comp_het
if "." in gt_bases[idx]: continue

sample_site = copy(site)
sample_site.phased = gt_phases[idx]

Expand Down

0 comments on commit e1ceb99

Please sign in to comment.