fixing bugs in CalculateGenotypePosteriers

fixing 3 issues in CalclualteGenotypePosteriers * no longer emit duplicate variants if you have overlapping VCF records in the input * specifying vcf.gz output will produce a vcf.gz instead of an unzipped file called vcf.gz * fix potential NPE during shutdown
broadinstitute · Feb 5, 2018 · fa09fce · fa09fce
1 parent 2efffce
commit fa09fce
Show file tree

Hide file tree

Showing 4 changed files with 156 additions and 44 deletions.
diff --git a/...org/broadinstitute/hellbender/tools/walkers/variantutils/CalculateGenotypePosteriors.java b/...org/broadinstitute/hellbender/tools/walkers/variantutils/CalculateGenotypePosteriors.java
@@ -150,7 +150,7 @@ public final class CalculateGenotypePosteriors extends VariantWalker {
     public List<FeatureInput<VariantContext>> supportVariants = new ArrayList<>();
 
     @Argument(doc="File to which variants should be written", fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, optional = false)
-    public String out = null;
+    public File out = null;
 
     /**
      * The global prior of a variant site -- i.e. the expected allele frequency distribution knowing only that N alleles
@@ -223,21 +223,14 @@ public final class CalculateGenotypePosteriors extends VariantWalker {
     private File pedigreeFile = null;
 
     private FamilyLikelihoods famUtils;
-    private SampleDB sampleDB = null;
 
     private VariantContextWriter vcfWriter;
 
     @Override
     public void onTraversalStart() {
-        final VariantContextWriterBuilder builder = new VariantContextWriterBuilder().setOutputFile(out).setOutputFileType(VariantContextWriterBuilder.OutputType.VCF);
-        if (hasReference()){
-            vcfWriter = builder.setReferenceDictionary(getBestAvailableSequenceDictionary()).setOption(Options.INDEX_ON_THE_FLY).build();
-        } else {
-            vcfWriter = builder.unsetOption(Options.INDEX_ON_THE_FLY).build();
-            logger.info("Can't make an index for output file " + out + " because a reference dictionary is required for creating Tribble indices on the fly");
-        }
+        vcfWriter = createVCFWriter(out);
 
-        sampleDB = initializeSampleDB();
+        SampleDB sampleDB = initializeSampleDB();
 
         // Get list of samples to include in the output
         final Map<String, VCFHeader> vcfHeaders = Collections.singletonMap(getDrivingVariantsFeatureInput().getName(), getHeaderForVariants());
@@ -257,17 +250,6 @@ public void onTraversalStart() {
             throw new UserException("VCF has no genotypes");
         }
 
-        if ( header.hasInfoLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY) ) {
-            final VCFInfoHeaderLine mleLine = header.getInfoHeaderLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY);
-            if ( mleLine.getCountType() != VCFHeaderLineCount.A ) {
-                throw new UserException("VCF does not have a properly formatted MLEAC field: the count type should be \"A\"");
-            }
-
-            if ( mleLine.getType() != VCFHeaderLineType.Integer ) {
-                throw new UserException("VCF does not have a properly formatted MLEAC field: the field type should be \"Integer\"");
-            }
-        }
-
         // Initialize VCF header
         final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfHeaders.values(), true);
         headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY));
@@ -300,40 +282,39 @@ public void apply(final VariantContext variant,
                       final ReadsContext readsContext,
                       final ReferenceContext referenceContext,
                       final FeatureContext featureContext) {
-        final Collection<VariantContext> vcs = featureContext.getValues(getDrivingVariantsFeatureInput());
 
         final Collection<VariantContext> otherVCs = featureContext.getValues(supportVariants);
 
         final int missing = supportVariants.size() - otherVCs.size();
 
-        for ( final VariantContext vc : vcs ) {
-            final VariantContext vc_familyPriors;
-            final VariantContext vc_bothPriors;
+        final VariantContext vc_familyPriors;
+        final VariantContext vc_bothPriors;
 
-            //do family priors first (if applicable)
-            final VariantContextBuilder builder = new VariantContextBuilder(vc);
-            //only compute family priors for biallelelic sites
-            if (!skipFamilyPriors && vc.isBiallelic()){
-                final GenotypesContext gc = famUtils.calculatePosteriorGLs(vc);
-                builder.genotypes(gc);
-            }
+        //do family priors first (if applicable)
+        final VariantContextBuilder builder = new VariantContextBuilder(variant);
+        //only compute family priors for biallelelic sites
+        if (!skipFamilyPriors && variant.isBiallelic()){
+            final GenotypesContext gc = famUtils.calculatePosteriorGLs(variant);
+            builder.genotypes(gc);
+        }
+        VariantContextUtils.calculateChromosomeCounts(builder, false);
+        vc_familyPriors = builder.make();
+
+        if (!skipPopulationPriors) {
+            vc_bothPriors = PosteriorProbabilitiesUtils.calculatePosteriorProbs(vc_familyPriors, otherVCs, missing * numRefIfMissing, globalPrior, !ignoreInputSamples, defaultToAC, useACoff);
+        } else {
+            final VariantContextBuilder builder2 = new VariantContextBuilder(vc_familyPriors);
             VariantContextUtils.calculateChromosomeCounts(builder, false);
-            vc_familyPriors = builder.make();
-
-            if (!skipPopulationPriors) {
-                vc_bothPriors = PosteriorProbabilitiesUtils.calculatePosteriorProbs(vc_familyPriors, otherVCs, missing * numRefIfMissing, globalPrior, !ignoreInputSamples, defaultToAC, useACoff);
-            } else {
-                final VariantContextBuilder builder2 = new VariantContextBuilder(vc_familyPriors);
-                VariantContextUtils.calculateChromosomeCounts(builder, false);
-                vc_bothPriors = builder2.make();
-            }
-            vcfWriter.add(vc_bothPriors);
+            vc_bothPriors = builder2.make();
         }
+        vcfWriter.add(vc_bothPriors);
     }
 
     @Override
     public void closeTool(){
-        vcfWriter.close();
+        if(vcfWriter != null) {
+            vcfWriter.close();
+        }
     }
 }
 
diff --git a/...ute/hellbender/tools/walkers/variantutils/CalculateGenotypePosteriorsIntegrationTest.java b/...ute/hellbender/tools/walkers/variantutils/CalculateGenotypePosteriorsIntegrationTest.java
@@ -1,12 +1,15 @@
 package org.broadinstitute.hellbender.tools.walkers.variantutils;
 
+import htsjdk.samtools.util.BlockCompressedInputStream;
 import org.broadinstitute.hellbender.CommandLineProgramTest;
 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
 import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder;
 import org.broadinstitute.hellbender.utils.test.IntegrationTestSpec;
+import org.testng.Assert;
 import org.testng.annotations.Test;
 
-import java.io.IOException;
+import java.io.*;
 import java.util.Collections;
 
 public final class CalculateGenotypePosteriorsIntegrationTest extends CommandLineProgramTest {
@@ -104,4 +107,30 @@ public void testFamilyPriorsMixedPloidy() throws IOException {
                 UserException.class);
         spec.executeTest("testFamilyPriorsMixedPloidy", this);
     }
+
+
+    @Test //test for https://github.com/broadinstitute/gatk/issues/4346
+    public void testNoDuplicatesForAdjacentInputs() throws IOException {
+        final IntegrationTestSpec spec = new IntegrationTestSpec(
+                " -O %s" +
+                        " -V " + getTestFile("overlappingVariants.vcf") +
+                        " --" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE +" false",
+                Collections.singletonList(getTestFile("overlappingVariants.expected.no_duplicates.vcf").toString()));
+        spec.executeTest("testNoDuplicatesForOverlappingVariants", this);
+    }
+
+    @Test //test for https://github.com/broadinstitute/gatk/issues/4346
+    public void gzipOutputIsGzipped() throws IOException {
+        final File out = createTempFile("out", ".vcf.gz");
+
+        final ArgumentsBuilder args = new ArgumentsBuilder();
+        args.addOutput(out)
+                .addVCF(getTestFile("overlappingVariants.vcf"));
+
+        runCommandLine(args);
+
+        try( final InputStream in = new BufferedInputStream(new FileInputStream(out))) {
+            Assert.assertTrue(BlockCompressedInputStream.isValidFile(in));
+        }
+    }
 }
diff --git a/...s/variantutils/CalculateGenotypePosteriors/overlappingVariants.expected.no_duplicates.vcf b/...s/variantutils/CalculateGenotypePosteriors/overlappingVariants.expected.no_duplicates.vcf
@@ -0,0 +1,50 @@
+##fileformat=VCFv4.2
+##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
+##FILTER=<ID=LowQual,Description="Low quality">
+##FILTER=<ID=VQSRTrancheINDEL99.00to99.90,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -2.4321 <= x < -0.3748">
+##FILTER=<ID=VQSRTrancheINDEL99.90to100.00+,Description="Truth sensitivity tranche level for INDEL model at VQS Lod < -103.9799">
+##FILTER=<ID=VQSRTrancheINDEL99.90to100.00,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -103.9799 <= x < -2.4321">
+##FILTER=<ID=VQSRTrancheSNP99.00to99.90,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -6.7044 <= x < -0.2206">
+##FILTER=<ID=VQSRTrancheSNP99.90to100.00+,Description="Truth sensitivity tranche level for SNP model at VQS Lod < -3000.6876">
+##FILTER=<ID=VQSRTrancheSNP99.90to100.00,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -3000.6876 <= x < -6.7044">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
+##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another">
+##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
+##FORMAT=<ID=PP,Number=G,Type=Integer,Description="Phred-scaled Posterior Genotype Probabilities">
+##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
+##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
+##GATKCommandLine=<ID=ApplyVQSR,CommandLine="ApplyVQSR  --recal-file /media/md1200/analysis/Case/Chen_ming/171031/USH_trio.indel.recal --tranches-file USH_trio.indel.tranches --output USH_trio.recal.vcf.gz --truth-sensitivity-filter-level 99.0 --mode INDEL --variant USH_trio.snp.recal.vcf.gz --reference /media/md1200/ref/hsa/genome/GRCh38/fasta/Homo_sapiens_assembly38.fasta  --use-allele-specific-annotations false --ignore-all-filters false --exclude-filtered false --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --help false --version false --showHidden false --verbosity INFO --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --disable-tool-default-read-filters false",Version=4.0.0.0,Date="January 22, 2018 4:59:11 PM CST">
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
+##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
+##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
+##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
+##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
+##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
+##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
+##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
+##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
+##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
+##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
+##INFO=<ID=NEGATIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the negative training set of bad variants">
+##INFO=<ID=PG,Number=G,Type=Integer,Description="Genotype Likelihood Prior">
+##INFO=<ID=POSITIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the positive training set of good variants">
+##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
+##INFO=<ID=RAW_MQ,Number=1,Type=Float,Description="Raw data for RMS Mapping Quality">
+##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
+##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
+##INFO=<ID=VQSLOD,Number=1,Type=Float,Description="Log odds of being a true variant versus being false under the trained gaussian mixture model">
+##INFO=<ID=culprit,Number=1,Type=String,Description="The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out">
+##contig=<ID=chr1,length=248956422>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	USH_C1	USH_F1	USH_M1
+chr1	3872413	.	AG	A	400.09	PASS	AC=1;AF=0.167;AN=6;BaseQRankSum=-4.760e-01;ClippingRankSum=0.00;DP=110;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.167;MQ=60.00;MQRankSum=0.00;PG=0,0,0;QD=10.53;ReadPosRankSum=-1.513e+00;SOR=0.787;VQSLOD=4.73;culprit=ReadPosRankSum	GT:AD:DP:GQ:PL:PP	0/0:27,0:27:81:0,81,712:0,81,712	0/0:45,0:45:99:0,111,1665:0,111,1665	0/1:17,21:38:99:440,0,469:440,0,469
+chr1	3872414	.	GA	G	29.18	PASS	AC=1;AF=0.167;AN=6;BaseQRankSum=0.246;ClippingRankSum=0.00;DP=126;ExcessHet=3.0103;FS=1.192;MLEAC=1;MLEAF=0.167;MQ=76.93;MQRankSum=0.00;PG=0,0,0;QD=0.50;ReadPosRankSum=0.352;SOR=0.472;VQSLOD=3.93;culprit=QD	GT:AD:DP:GQ:PL:PP	0/0:29,0:29:16:0,16,666:0,16,666	0/1:48,10:58:69:69,0,1051:69,0,1051	0/0:35,3:38:54:0,54,912:0,54,912