Skip to content

Commit

Permalink
fixing bugs in CalculateGenotypePosteriers
Browse files Browse the repository at this point in the history
fixing 3 issues in CalclualteGenotypePosteriers
* no longer emit duplicate variants if you have overlapping VCF records in the input
* specifying vcf.gz output will produce a vcf.gz instead of an unzipped file called vcf.gz
* fix potential NPE during shutdown
  • Loading branch information
lbergelson committed Feb 5, 2018
1 parent 2efffce commit fa09fce
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 44 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ public final class CalculateGenotypePosteriors extends VariantWalker {
public List<FeatureInput<VariantContext>> supportVariants = new ArrayList<>();

@Argument(doc="File to which variants should be written", fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, optional = false)
public String out = null;
public File out = null;

/**
* The global prior of a variant site -- i.e. the expected allele frequency distribution knowing only that N alleles
Expand Down Expand Up @@ -223,21 +223,14 @@ public final class CalculateGenotypePosteriors extends VariantWalker {
private File pedigreeFile = null;

private FamilyLikelihoods famUtils;
private SampleDB sampleDB = null;

private VariantContextWriter vcfWriter;

@Override
public void onTraversalStart() {
final VariantContextWriterBuilder builder = new VariantContextWriterBuilder().setOutputFile(out).setOutputFileType(VariantContextWriterBuilder.OutputType.VCF);
if (hasReference()){
vcfWriter = builder.setReferenceDictionary(getBestAvailableSequenceDictionary()).setOption(Options.INDEX_ON_THE_FLY).build();
} else {
vcfWriter = builder.unsetOption(Options.INDEX_ON_THE_FLY).build();
logger.info("Can't make an index for output file " + out + " because a reference dictionary is required for creating Tribble indices on the fly");
}
vcfWriter = createVCFWriter(out);

sampleDB = initializeSampleDB();
SampleDB sampleDB = initializeSampleDB();

// Get list of samples to include in the output
final Map<String, VCFHeader> vcfHeaders = Collections.singletonMap(getDrivingVariantsFeatureInput().getName(), getHeaderForVariants());
Expand All @@ -257,17 +250,6 @@ public void onTraversalStart() {
throw new UserException("VCF has no genotypes");
}

if ( header.hasInfoLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY) ) {
final VCFInfoHeaderLine mleLine = header.getInfoHeaderLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY);
if ( mleLine.getCountType() != VCFHeaderLineCount.A ) {
throw new UserException("VCF does not have a properly formatted MLEAC field: the count type should be \"A\"");
}

if ( mleLine.getType() != VCFHeaderLineType.Integer ) {
throw new UserException("VCF does not have a properly formatted MLEAC field: the field type should be \"Integer\"");
}
}

// Initialize VCF header
final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfHeaders.values(), true);
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY));
Expand Down Expand Up @@ -300,40 +282,39 @@ public void apply(final VariantContext variant,
final ReadsContext readsContext,
final ReferenceContext referenceContext,
final FeatureContext featureContext) {
final Collection<VariantContext> vcs = featureContext.getValues(getDrivingVariantsFeatureInput());

final Collection<VariantContext> otherVCs = featureContext.getValues(supportVariants);

final int missing = supportVariants.size() - otherVCs.size();

for ( final VariantContext vc : vcs ) {
final VariantContext vc_familyPriors;
final VariantContext vc_bothPriors;
final VariantContext vc_familyPriors;
final VariantContext vc_bothPriors;

//do family priors first (if applicable)
final VariantContextBuilder builder = new VariantContextBuilder(vc);
//only compute family priors for biallelelic sites
if (!skipFamilyPriors && vc.isBiallelic()){
final GenotypesContext gc = famUtils.calculatePosteriorGLs(vc);
builder.genotypes(gc);
}
//do family priors first (if applicable)
final VariantContextBuilder builder = new VariantContextBuilder(variant);
//only compute family priors for biallelelic sites
if (!skipFamilyPriors && variant.isBiallelic()){
final GenotypesContext gc = famUtils.calculatePosteriorGLs(variant);
builder.genotypes(gc);
}
VariantContextUtils.calculateChromosomeCounts(builder, false);
vc_familyPriors = builder.make();

if (!skipPopulationPriors) {
vc_bothPriors = PosteriorProbabilitiesUtils.calculatePosteriorProbs(vc_familyPriors, otherVCs, missing * numRefIfMissing, globalPrior, !ignoreInputSamples, defaultToAC, useACoff);
} else {
final VariantContextBuilder builder2 = new VariantContextBuilder(vc_familyPriors);
VariantContextUtils.calculateChromosomeCounts(builder, false);
vc_familyPriors = builder.make();

if (!skipPopulationPriors) {
vc_bothPriors = PosteriorProbabilitiesUtils.calculatePosteriorProbs(vc_familyPriors, otherVCs, missing * numRefIfMissing, globalPrior, !ignoreInputSamples, defaultToAC, useACoff);
} else {
final VariantContextBuilder builder2 = new VariantContextBuilder(vc_familyPriors);
VariantContextUtils.calculateChromosomeCounts(builder, false);
vc_bothPriors = builder2.make();
}
vcfWriter.add(vc_bothPriors);
vc_bothPriors = builder2.make();
}
vcfWriter.add(vc_bothPriors);
}

@Override
public void closeTool(){
vcfWriter.close();
if(vcfWriter != null) {
vcfWriter.close();
}
}
}

Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
package org.broadinstitute.hellbender.tools.walkers.variantutils;

import htsjdk.samtools.util.BlockCompressedInputStream;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder;
import org.broadinstitute.hellbender.utils.test.IntegrationTestSpec;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.IOException;
import java.io.*;
import java.util.Collections;

public final class CalculateGenotypePosteriorsIntegrationTest extends CommandLineProgramTest {
Expand Down Expand Up @@ -104,4 +107,30 @@ public void testFamilyPriorsMixedPloidy() throws IOException {
UserException.class);
spec.executeTest("testFamilyPriorsMixedPloidy", this);
}


@Test //test for https://github.com/broadinstitute/gatk/issues/4346
public void testNoDuplicatesForAdjacentInputs() throws IOException {
final IntegrationTestSpec spec = new IntegrationTestSpec(
" -O %s" +
" -V " + getTestFile("overlappingVariants.vcf") +
" --" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE +" false",
Collections.singletonList(getTestFile("overlappingVariants.expected.no_duplicates.vcf").toString()));
spec.executeTest("testNoDuplicatesForOverlappingVariants", this);
}

@Test //test for https://github.com/broadinstitute/gatk/issues/4346
public void gzipOutputIsGzipped() throws IOException {
final File out = createTempFile("out", ".vcf.gz");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addOutput(out)
.addVCF(getTestFile("overlappingVariants.vcf"));

runCommandLine(args);

try( final InputStream in = new BufferedInputStream(new FileInputStream(out))) {
Assert.assertTrue(BlockCompressedInputStream.isValidFile(in));
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
##fileformat=VCFv4.2
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=VQSRTrancheINDEL99.00to99.90,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -2.4321 <= x < -0.3748">
##FILTER=<ID=VQSRTrancheINDEL99.90to100.00+,Description="Truth sensitivity tranche level for INDEL model at VQS Lod < -103.9799">
##FILTER=<ID=VQSRTrancheINDEL99.90to100.00,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -103.9799 <= x < -2.4321">
##FILTER=<ID=VQSRTrancheSNP99.00to99.90,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -6.7044 <= x < -0.2206">
##FILTER=<ID=VQSRTrancheSNP99.90to100.00+,Description="Truth sensitivity tranche level for SNP model at VQS Lod < -3000.6876">
##FILTER=<ID=VQSRTrancheSNP99.90to100.00,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -3000.6876 <= x < -6.7044">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PP,Number=G,Type=Integer,Description="Phred-scaled Posterior Genotype Probabilities">
##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GATKCommandLine=<ID=ApplyVQSR,CommandLine="ApplyVQSR --recal-file /media/md1200/analysis/Case/Chen_ming/171031/USH_trio.indel.recal --tranches-file USH_trio.indel.tranches --output USH_trio.recal.vcf.gz --truth-sensitivity-filter-level 99.0 --mode INDEL --variant USH_trio.snp.recal.vcf.gz --reference /media/md1200/ref/hsa/genome/GRCh38/fasta/Homo_sapiens_assembly38.fasta --use-allele-specific-annotations false --ignore-all-filters false --exclude-filtered false --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --help false --version false --showHidden false --verbosity INFO --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --disable-tool-default-read-filters false",Version=4.0.0.0,Date="January 22, 2018 4:59:11 PM CST">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=NEGATIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the negative training set of bad variants">
##INFO=<ID=PG,Number=G,Type=Integer,Description="Genotype Likelihood Prior">
##INFO=<ID=POSITIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the positive training set of good variants">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=RAW_MQ,Number=1,Type=Float,Description="Raw data for RMS Mapping Quality">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
##INFO=<ID=VQSLOD,Number=1,Type=Float,Description="Log odds of being a true variant versus being false under the trained gaussian mixture model">
##INFO=<ID=culprit,Number=1,Type=String,Description="The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out">
##contig=<ID=chr1,length=248956422>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT USH_C1 USH_F1 USH_M1
chr1 3872413 . AG A 400.09 PASS AC=1;AF=0.167;AN=6;BaseQRankSum=-4.760e-01;ClippingRankSum=0.00;DP=110;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.167;MQ=60.00;MQRankSum=0.00;PG=0,0,0;QD=10.53;ReadPosRankSum=-1.513e+00;SOR=0.787;VQSLOD=4.73;culprit=ReadPosRankSum GT:AD:DP:GQ:PL:PP 0/0:27,0:27:81:0,81,712:0,81,712 0/0:45,0:45:99:0,111,1665:0,111,1665 0/1:17,21:38:99:440,0,469:440,0,469
chr1 3872414 . GA G 29.18 PASS AC=1;AF=0.167;AN=6;BaseQRankSum=0.246;ClippingRankSum=0.00;DP=126;ExcessHet=3.0103;FS=1.192;MLEAC=1;MLEAF=0.167;MQ=76.93;MQRankSum=0.00;PG=0,0,0;QD=0.50;ReadPosRankSum=0.352;SOR=0.472;VQSLOD=3.93;culprit=QD GT:AD:DP:GQ:PL:PP 0/0:29,0:29:16:0,16,666:0,16,666 0/1:48,10:58:69:69,0,1051:69,0,1051 0/0:35,3:38:54:0,54,912:0,54,912

0 comments on commit fa09fce

Please sign in to comment.