Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixing bugs in CalculateGenotypePosteriers #4352

Merged
merged 2 commits into from
Feb 20, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ public final class CalculateGenotypePosteriors extends VariantWalker {
public List<FeatureInput<VariantContext>> supportVariants = new ArrayList<>();

@Argument(doc="File to which variants should be written", fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, optional = false)
public String out = null;
public File out = null;

/**
* The global prior of a variant site -- i.e. the expected allele frequency distribution knowing only that N alleles
Expand Down Expand Up @@ -223,21 +223,14 @@ public final class CalculateGenotypePosteriors extends VariantWalker {
private File pedigreeFile = null;

private FamilyLikelihoods famUtils;
private SampleDB sampleDB = null;

private VariantContextWriter vcfWriter;

@Override
public void onTraversalStart() {
final VariantContextWriterBuilder builder = new VariantContextWriterBuilder().setOutputFile(out).setOutputFileType(VariantContextWriterBuilder.OutputType.VCF);
if (hasReference()){
vcfWriter = builder.setReferenceDictionary(getBestAvailableSequenceDictionary()).setOption(Options.INDEX_ON_THE_FLY).build();
} else {
vcfWriter = builder.unsetOption(Options.INDEX_ON_THE_FLY).build();
logger.info("Can't make an index for output file " + out + " because a reference dictionary is required for creating Tribble indices on the fly");
}
vcfWriter = createVCFWriter(out);

sampleDB = initializeSampleDB();
SampleDB sampleDB = initializeSampleDB();

// Get list of samples to include in the output
final Map<String, VCFHeader> vcfHeaders = Collections.singletonMap(getDrivingVariantsFeatureInput().getName(), getHeaderForVariants());
Expand All @@ -254,22 +247,25 @@ public void onTraversalStart() {

final VCFHeader header = vcfHeaders.values().iterator().next();
if ( ! header.hasGenotypingData() ) {
throw new UserException("VCF has no genotypes");
throw new UserException.BadInput("VCF has no genotypes");
}

if ( header.hasInfoLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY) ) {
final VCFInfoHeaderLine mleLine = header.getInfoHeaderLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY);
if ( mleLine.getCountType() != VCFHeaderLineCount.A ) {
throw new UserException("VCF does not have a properly formatted MLEAC field: the count type should be \"A\"");
throw new UserException.BadInput("VCF does not have a properly formatted MLEAC field: the count type should be \"A\"");
}

if ( mleLine.getType() != VCFHeaderLineType.Integer ) {
throw new UserException("VCF does not have a properly formatted MLEAC field: the field type should be \"Integer\"");
throw new UserException.BadInput("VCF does not have a properly formatted MLEAC field: the field type should be \"Integer\"");
}
}

// Initialize VCF header
final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfHeaders.values(), true);
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY));
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY));
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY));
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.GENOTYPE_PRIOR_KEY));
if (!skipFamilyPriors) {
Expand Down Expand Up @@ -300,40 +296,39 @@ public void apply(final VariantContext variant,
final ReadsContext readsContext,
final ReferenceContext referenceContext,
final FeatureContext featureContext) {
final Collection<VariantContext> vcs = featureContext.getValues(getDrivingVariantsFeatureInput());

final Collection<VariantContext> otherVCs = featureContext.getValues(supportVariants);

final int missing = supportVariants.size() - otherVCs.size();

for ( final VariantContext vc : vcs ) {
final VariantContext vc_familyPriors;
final VariantContext vc_bothPriors;
final VariantContext vc_familyPriors;
final VariantContext vc_bothPriors;

//do family priors first (if applicable)
final VariantContextBuilder builder = new VariantContextBuilder(vc);
//only compute family priors for biallelelic sites
if (!skipFamilyPriors && vc.isBiallelic()){
final GenotypesContext gc = famUtils.calculatePosteriorGLs(vc);
builder.genotypes(gc);
}
//do family priors first (if applicable)
final VariantContextBuilder builder = new VariantContextBuilder(variant);
//only compute family priors for biallelelic sites
if (!skipFamilyPriors && variant.isBiallelic()){
final GenotypesContext gc = famUtils.calculatePosteriorGLs(variant);
builder.genotypes(gc);
}
VariantContextUtils.calculateChromosomeCounts(builder, false);
vc_familyPriors = builder.make();

if (!skipPopulationPriors) {
vc_bothPriors = PosteriorProbabilitiesUtils.calculatePosteriorProbs(vc_familyPriors, otherVCs, missing * numRefIfMissing, globalPrior, !ignoreInputSamples, defaultToAC, useACoff);
} else {
final VariantContextBuilder builder2 = new VariantContextBuilder(vc_familyPriors);
VariantContextUtils.calculateChromosomeCounts(builder, false);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops, this should probably be builder2, right? All this time and no one ever said anything.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It definitely looks like it should be. It's effectively the same thing, but it's very confusing and if anyone ever make changes it could lead to problems... I might start a separate PR to address that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wait, what am I talking about... the thing that gets output should be the the result of modifying builder2 and we modify builder. That seems super wrong.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no, i'm back to "it's dumb but fine"

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

new pr to fix this #4431

vc_familyPriors = builder.make();

if (!skipPopulationPriors) {
vc_bothPriors = PosteriorProbabilitiesUtils.calculatePosteriorProbs(vc_familyPriors, otherVCs, missing * numRefIfMissing, globalPrior, !ignoreInputSamples, defaultToAC, useACoff);
} else {
final VariantContextBuilder builder2 = new VariantContextBuilder(vc_familyPriors);
VariantContextUtils.calculateChromosomeCounts(builder, false);
vc_bothPriors = builder2.make();
}
vcfWriter.add(vc_bothPriors);
vc_bothPriors = builder2.make();
}
vcfWriter.add(vc_bothPriors);
}

@Override
public void closeTool(){
vcfWriter.close();
if(vcfWriter != null) {
vcfWriter.close();
}
}
}

Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
package org.broadinstitute.hellbender.tools.walkers.variantutils;

import htsjdk.samtools.util.BlockCompressedInputStream;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder;
import org.broadinstitute.hellbender.utils.test.IntegrationTestSpec;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.IOException;
import java.io.*;
import java.util.Collections;

public final class CalculateGenotypePosteriorsIntegrationTest extends CommandLineProgramTest {
Expand Down Expand Up @@ -104,4 +108,50 @@ public void testFamilyPriorsMixedPloidy() throws IOException {
UserException.class);
spec.executeTest("testFamilyPriorsMixedPloidy", this);
}


@Test //test for https://github.com/broadinstitute/gatk/issues/4346
public void testNoDuplicatesForAdjacentInputs() throws IOException {
final IntegrationTestSpec spec = new IntegrationTestSpec(
" -O %s" +
" -V " + getTestFile("overlappingVariants.vcf") +
" --" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE +" false",
Collections.singletonList(getTestFile("overlappingVariants.expected.no_duplicates.vcf").toString()));
spec.executeTest("testNoDuplicatesForOverlappingVariants", this);
}

@Test //test for https://github.com/broadinstitute/gatk/issues/4346
public void gzipOutputIsGzipped() throws IOException {
final File out = createTempFile("out", ".vcf.gz");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addOutput(out)
.addVCF(getTestFile("overlappingVariants.vcf"));

runCommandLine(args);

try( final InputStream in = new BufferedInputStream(new FileInputStream(out))) {
Assert.assertTrue(BlockCompressedInputStream.isValidFile(in));
}
}

@DataProvider
public Object[][] getBadInputs(){
return new Object[][]{
{"noGenotypes.vcf"},
{"badMLEAC_count_not_A.vcf"},
{"badMLEAC_type_not_int.vcf"}
};
}

@Test(dataProvider = "getBadInputs", expectedExceptions = UserException.BadInput.class)
public void testBadInputFilesAreRejectedWithReasonableError(String badFile) throws IOException {
final File out = createTempFile("out", ".vcf.gz");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addOutput(out)
.addVCF(getTestFile(badFile));

runCommandLine(args);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
##fileformat=VCFv4.2
##INFO=<ID=MLEAC,Number=R,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=chr1,length=248956422>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT USH_C1 USH_F1 USH_M1
chr1 3872413 . AG A 400.09 PASS MLEAC=1 GT 0/0 0/0 0/1
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
##fileformat=VCFv4.2
##INFO=<ID=MLEAC,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=chr1,length=248956422>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT USH_C1 USH_F1 USH_M1
chr1 3872413 . AG A 400.09 PASS MLEAC=1 GT 0/0 0/0 0/1
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
##fileformat=VCFv4.2
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##contig=<ID=chr1,length=248956422>
#CHROM POS ID REF ALT QUAL FILTER INFO
chr1 3872413 . AG A 400.09 PASS MLEAC=1
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
##fileformat=VCFv4.2
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=VQSRTrancheINDEL99.00to99.90,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -2.4321 <= x < -0.3748">
##FILTER=<ID=VQSRTrancheINDEL99.90to100.00+,Description="Truth sensitivity tranche level for INDEL model at VQS Lod < -103.9799">
##FILTER=<ID=VQSRTrancheINDEL99.90to100.00,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -103.9799 <= x < -2.4321">
##FILTER=<ID=VQSRTrancheSNP99.00to99.90,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -6.7044 <= x < -0.2206">
##FILTER=<ID=VQSRTrancheSNP99.90to100.00+,Description="Truth sensitivity tranche level for SNP model at VQS Lod < -3000.6876">
##FILTER=<ID=VQSRTrancheSNP99.90to100.00,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -3000.6876 <= x < -6.7044">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PP,Number=G,Type=Integer,Description="Phred-scaled Posterior Genotype Probabilities">
##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GATKCommandLine=<ID=ApplyVQSR,CommandLine="ApplyVQSR --recal-file /media/md1200/analysis/Case/Chen_ming/171031/USH_trio.indel.recal --tranches-file USH_trio.indel.tranches --output USH_trio.recal.vcf.gz --truth-sensitivity-filter-level 99.0 --mode INDEL --variant USH_trio.snp.recal.vcf.gz --reference /media/md1200/ref/hsa/genome/GRCh38/fasta/Homo_sapiens_assembly38.fasta --use-allele-specific-annotations false --ignore-all-filters false --exclude-filtered false --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --help false --version false --showHidden false --verbosity INFO --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --disable-tool-default-read-filters false",Version=4.0.0.0,Date="January 22, 2018 4:59:11 PM CST">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=NEGATIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the negative training set of bad variants">
##INFO=<ID=PG,Number=G,Type=Integer,Description="Genotype Likelihood Prior">
##INFO=<ID=POSITIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the positive training set of good variants">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=RAW_MQ,Number=1,Type=Float,Description="Raw data for RMS Mapping Quality">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
##INFO=<ID=VQSLOD,Number=1,Type=Float,Description="Log odds of being a true variant versus being false under the trained gaussian mixture model">
##INFO=<ID=culprit,Number=1,Type=String,Description="The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out">
##contig=<ID=chr1,length=248956422>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT USH_C1 USH_F1 USH_M1
chr1 3872413 . AG A 400.09 PASS AC=1;AF=0.167;AN=6;BaseQRankSum=-4.760e-01;ClippingRankSum=0.00;DP=110;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.167;MQ=60.00;MQRankSum=0.00;PG=0,0,0;QD=10.53;ReadPosRankSum=-1.513e+00;SOR=0.787;VQSLOD=4.73;culprit=ReadPosRankSum GT:AD:DP:GQ:PL:PP 0/0:27,0:27:81:0,81,712:0,81,712 0/0:45,0:45:99:0,111,1665:0,111,1665 0/1:17,21:38:99:440,0,469:440,0,469
chr1 3872414 . GA G 29.18 PASS AC=1;AF=0.167;AN=6;BaseQRankSum=0.246;ClippingRankSum=0.00;DP=126;ExcessHet=3.0103;FS=1.192;MLEAC=1;MLEAF=0.167;MQ=76.93;MQRankSum=0.00;PG=0,0,0;QD=0.50;ReadPosRankSum=0.352;SOR=0.472;VQSLOD=3.93;culprit=QD GT:AD:DP:GQ:PL:PP 0/0:29,0:29:16:0,16,666:0,16,666 0/1:48,10:58:69:69,0,1051:69,0,1051 0/0:35,3:38:54:0,54,912:0,54,912