diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java index 678e1cf9a5..d674800adb 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -88,9 +88,6 @@ */ public class ReferenceConfidenceModel { - //public final static String INDEL_INFORMATIVE_DEPTH = "CD"; // temporarily taking this extra genotype level information out for now - public final static String ALTERNATE_ALLELE_STRING = "ALT"; // arbitrary alternate allele - private final GenomeLocParser genomeLocParser; private final SampleList samples; @@ -138,9 +135,7 @@ public ReferenceConfidenceModel(final GenomeLocParser genomeLocParser, */ public Set getVCFHeaderLines() { final Set headerLines = new LinkedHashSet<>(); - // TODO - do we need a new kind of VCF Header subclass for specifying arbitrary alternate alleles? - headerLines.add(new VCFSimpleHeaderLine(ALTERNATE_ALLELE_STRING, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location")); - //headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize)); + headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location")); return headerLines; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java index 443c8d6b98..fffd52e4f7 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java @@ -165,6 +165,8 @@ public void initialize() { // take care of the VCF headers final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); + headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_NAME, "Represents any possible spanning deletion allele at this location")); + headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); final VCFHeader vcfHeader = new VCFHeader(headerLines, samples); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java index 032ee5f0ca..e003bef9d7 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java @@ -217,10 +217,13 @@ public void initialize() { final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions()); headerLines.addAll(genotypingEngine.getAppropriateVCFInfoHeaders()); - // add the pool values for each genotype + + // add headers for annotations added by this tool + headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_NAME, "Represents any possible spanning deletion allele at this location")); headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY)); headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY)); headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY)); + headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags if ( dbsnp != null && dbsnp.dbsnp.isBound() ) VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java index 930932fd25..d16e86eb38 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java @@ -113,20 +113,27 @@ public static VariantContext merge(final List VCs, final GenomeL final Map> annotationMap = new LinkedHashMap<>(); final GenotypesContext genotypes = GenotypesContext.create(); - final int variantContextCount = VCs.size(); // In this list we hold the mapping of each variant context alleles. - final List>> vcAndNewAllelePairs = new ArrayList<>(variantContextCount); + final List>> vcAndNewAllelePairs = new ArrayList<>(VCs.size()); + // Keep track of whether we saw a spanning deletion and a non-spanning event + boolean sawSpanningDeletion = false; + boolean sawNonSpanningEvent = false; + // cycle through and add info from the other VCs for ( final VariantContext vc : VCs ) { // if this context doesn't start at the current location then it must be a spanning event (deletion or ref block) final boolean isSpanningEvent = loc.getStart() != vc.getStart(); + // record whether it's also a spanning deletion/event (we know this because the VariantContext type is no + // longer "symbolic" but "mixed" because there are real alleles mixed in with the symbolic non-ref allele) + sawSpanningDeletion |= ( isSpanningEvent && vc.isMixed() ) || vc.getAlternateAlleles().contains(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE); + sawNonSpanningEvent |= ( !isSpanningEvent && vc.isMixed() ); - vcAndNewAllelePairs.add(new Pair<>(vc,isSpanningEvent ? replaceWithNoCalls(vc.getAlleles()) - : remapAlleles(vc.getAlleles(), refAllele, finalAlleleSet))); + vcAndNewAllelePairs.add(new Pair<>(vc, isSpanningEvent ? replaceWithNoCallsAndDels(vc) : remapAlleles(vc, refAllele, finalAlleleSet))); } - // Add to the end if at all required in in the output. + // Add and to the end if at all required in in the output. + if ( sawSpanningDeletion && (sawNonSpanningEvent || !removeNonRefSymbolicAllele) ) finalAlleleSet.add(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE); if (!removeNonRefSymbolicAllele) finalAlleleSet.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); final List allelesList = new ArrayList<>(finalAlleleSet); @@ -171,13 +178,27 @@ public static VariantContext merge(final List VCs, final GenomeL final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); + // note that in order to calculate the end position, we need a list of alleles that doesn't include anything symbolic final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(allelesList) - .chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(allelesList, loc.getStart(), loc.getStart()) + .chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(nonSymbolicAlleles(allelesList), loc.getStart(), loc.getStart()) .genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to re-genotype later return builder.make(); } + /** + * @param list the original alleles list + * @return a non-null list of non-symbolic alleles + */ + private static List nonSymbolicAlleles(final List list) { + final List result = new ArrayList<>(list.size()); + for ( final Allele allele : list ) { + if ( !allele.isSymbolic() ) + result.add(allele); + } + return result; + } + /** * Determines the ref allele given the provided reference base at this position * @@ -246,27 +267,28 @@ private static void addReferenceConfidenceAttributes(final Map m * collects alternative alleles present in variant context and add them to the {@code finalAlleles} set. * * - * @param vcAlleles the variant context allele list. - * @param refAllele final reference allele. + * @param vc the variant context. + * @param refAllele final reference allele. * @param finalAlleles where to add the final set of non-ref called alleles. * @return never {@code null} */ //TODO as part of a larger refactoring effort {@link #remapAlleles} can be merged with {@link GATKVariantContextUtils#remapAlleles}. - private static List remapAlleles(final List vcAlleles, final Allele refAllele, final LinkedHashSet finalAlleles) { - final Allele vcRef = vcAlleles.get(0); - if (!vcRef.isReference()) throw new IllegalStateException("the first allele of the vc allele list must be reference"); + private static List remapAlleles(final VariantContext vc, final Allele refAllele, final LinkedHashSet finalAlleles) { + + final Allele vcRef = vc.getReference(); final byte[] refBases = refAllele.getBases(); final int extraBaseCount = refBases.length - vcRef.getBases().length; if (extraBaseCount < 0) throw new IllegalStateException("the wrong reference was selected"); - final List result = new ArrayList<>(vcAlleles.size()); - for (final Allele a : vcAlleles) { - if (a.isReference()) { - result.add(refAllele); - } else if (a.isSymbolic()) { + final List result = new ArrayList<>(vc.getNAlleles()); + result.add(refAllele); + + for (final Allele a : vc.getAlternateAlleles()) { + if (a.isSymbolic()) { result.add(a); - // we always skip when adding to finalAlleles this is done outside if applies. - if (!a.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)) + // we always skip when adding to finalAlleles; this is done outside if applies. + // we also skip <*DEL> if there isn't a real alternate allele. + if ( !a.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) && !vc.isSymbolic() ) finalAlleles.add(a); } else if (a.isCalled()) { final Allele newAllele; @@ -287,17 +309,31 @@ private static List remapAlleles(final List vcAlleles, final All } /** - * Replaces any alleles in the list with NO CALLS, except for the generic ALT allele + * Replaces any alleles in the VariantContext with NO CALLS or the symbolic deletion allele as appropriate, except for the generic ALT allele * - * @param alleles list of alleles to replace + * @param vc VariantContext with the alleles to replace * @return non-null list of alleles */ - private static List replaceWithNoCalls(final List alleles) { - if ( alleles == null ) throw new IllegalArgumentException("list of alleles cannot be null"); + private static List replaceWithNoCallsAndDels(final VariantContext vc) { + if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null"); + + final List result = new ArrayList<>(vc.getNAlleles()); - final List result = new ArrayList<>(alleles.size()); - for ( final Allele allele : alleles ) - result.add(allele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) ? allele : Allele.NO_CALL); + // no-call the reference allele + result.add(Allele.NO_CALL); + + // handle the alternate alleles + for ( final Allele allele : vc.getAlternateAlleles() ) { + final Allele replacement; + if ( allele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) ) + replacement = allele; + else if ( allele.length() < vc.getReference().length() ) + replacement = GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE; + else + replacement = Allele.NO_CALL; + + result.add(replacement); + } return result; } @@ -387,43 +423,16 @@ protected static int[] getIndexesOfRelevantAlleles(final List remappedAl throw new UserException("The list of input alleles must contain " + GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records"); final int indexOfNonRef = remappedAlleles.indexOf(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); - - //if the refs don't match then let the non-ref allele be the most likely of the alts - //TODO: eventually it would be nice to be able to trim alleles for spanning events to see if they really do have the same ref - final boolean refsMatch = targetAlleles.get(0).equals(remappedAlleles.get(0),false); - final int indexOfBestAlt; - if (!refsMatch && g.hasPL()) { - final int[] PLs = g.getPL().clone(); - PLs[0] = Integer.MAX_VALUE; //don't pick 0/0 - final int indexOfBestAltPL = MathUtils.minElementIndex(PLs); - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(indexOfBestAltPL); - indexOfBestAlt = pair.alleleIndex2; - } - else - indexOfBestAlt = indexOfNonRef; - final int[] indexMapping = new int[targetAlleles.size()]; // the reference likelihoods should always map to each other (even if the alleles don't) indexMapping[0] = 0; - // create the index mapping, using the allele whenever such a mapping doesn't exist - final int targetNonRef = targetAlleles.indexOf(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); - final boolean targetHasNonRef = targetNonRef != -1; - final int lastConcreteAlt = targetHasNonRef ? targetAlleles.size()-2 : targetAlleles.size()-1; - for ( int i = 1; i <= lastConcreteAlt; i++ ) { + // create the index mapping, using the allele whenever such a mapping doesn't exist + for ( int i = 1; i < targetAlleles.size(); i++ ) { final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i)); indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele; } - if (targetHasNonRef) { - if (refsMatch) { - final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(targetNonRef)); - indexMapping[targetNonRef] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele; - } - else { - indexMapping[targetNonRef] = indexOfBestAlt; - } - } return indexMapping; } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java index 51261fa693..c9fe0eda22 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java @@ -100,7 +100,7 @@ public void testTetraploidRun() { " -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", 1, - Arrays.asList("7b3153135e4f8e1d137d3f4beb46f182")); + Arrays.asList("ebe26077809961f53d5244643d24fd45")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -112,7 +112,7 @@ public void testMixedPloidyRun() { " -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", 1, - Arrays.asList("4f546634213ece6f08ec9258620b92bb")); + Arrays.asList("2d36a5f996cad47e5d05fcd78f6e572e")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -190,7 +190,7 @@ public void testOneHasDeletionAndTwoHasRefBlock() throws Exception { @Test public void testMD5s() throws Exception { final String cmd = baseTestString(" -L 1:69485-69791"); - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("b7c753452ab0c05f9cee538e420b87fa")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("83ea9f4a9aadb1218c21c9d3780e8009")); spec.disableShadowBCF(); executeTest("testMD5s", spec); } @@ -198,7 +198,7 @@ public void testMD5s() throws Exception { @Test public void testBasepairResolutionOutput() throws Exception { final String cmd = baseTestString(" -L 1:69485-69791 --convertToBasePairResolution"); - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("bb6420ead95da4c72e76ca4bf5860ef0")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("f153cb6e986efc9b50f0b8833fe5d3da")); spec.disableShadowBCF(); executeTest("testBasepairResolutionOutput", spec); } @@ -206,16 +206,27 @@ public void testBasepairResolutionOutput() throws Exception { @Test public void testBreakBlocks() throws Exception { final String cmd = baseTestString(" -L 1:69485-69791 --breakBandsAtMultiplesOf 5"); - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("dd31182124c4b78a8a03edb1e0cf618b")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("6626ff272e7e76fba091f5bde4a1f963")); spec.disableShadowBCF(); executeTest("testBreakBlocks", spec); } + @Test + public void testSpanningDeletions() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf", + 1, + Arrays.asList("fba48ce2bf8761366ff2cd0b45d0421f")); + spec.disableShadowBCF(); + executeTest("testSpanningDeletions", spec); + } + @Test public void testWrongReferenceBaseBugFix() throws Exception { final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input1.vcf" + " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input2.vcf") + " -o %s --no_cmdline_in_header"); - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("75d81c247bef5e394b4a2d4126aee2b3")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("331c1a4a6a72ea1617c1697a5d945d56")); spec.disableShadowBCF(); executeTest("testWrongReferenceBaseBugFix",spec); @@ -224,7 +235,7 @@ public void testWrongReferenceBaseBugFix() throws Exception { @Test public void testBasepairResolutionInput() throws Exception { final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -V " + privateTestDir + "gvcf.basepairResolution.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("994ff5c219c683635eadc1624cbbda74")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("207e89b5677fbf0ef4d1ff768262cf0c")); spec.disableShadowBCF(); executeTest("testBasepairResolutionInput", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index b0ede93d13..514eb59017 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -82,7 +82,7 @@ public void testUpdatePGT() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf", b37KGReference), 1, - Arrays.asList("23ff3e22262929138ca1f00fc111cadf")); + Arrays.asList("4dfea9a9b1a77c4c6b9edc61f9ea8da2")); executeTest("testUpdatePGT", spec); } @@ -91,7 +91,7 @@ public void testUpdatePGTStrandAlleleCountsBySample() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf -A StrandAlleleCountsBySample", b37KGReference), 1, - Arrays.asList("88fa4a021e4aac9a0e48bd54b2949ece")); + Arrays.asList("a96b79e7c3689c8d5506083cb6d27390")); executeTest("testUpdatePGT, adding StrandAlleleCountsBySample annotation", spec); } @@ -103,7 +103,7 @@ public void combineSingleSamplePipelineGVCF() { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Arrays.asList("06b4e2589c5b903f7c51ae9968bebe77")); + Arrays.asList("bf3c1982ab6ffee410cb6a1fff6e7105")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -115,7 +115,7 @@ public void testTetraploidRun() { " -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference), 1, - Arrays.asList("599394c205c1d6641b9bebabbd29e13c")); + Arrays.asList("47d454936dc1f17cf4c4f84f02841346")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -127,7 +127,7 @@ public void testMixedPloidyRun() { " -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference), 1, - Arrays.asList("f7d5344a85e6d7fc2437d4253b424cb0")); + Arrays.asList("5d79ea9de8ada8520d01284cf0c9f720")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -139,7 +139,7 @@ public void combineSingleSamplePipelineGVCF_includeNonVariants() { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference), 1, - Arrays.asList("c9e4d1e52ee1f3a5233f1fb100f24d5e")); + Arrays.asList("d69b43cac448f45218e77308fc01e9e6")); executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec); } @@ -152,7 +152,7 @@ public void combineSingleSamplePipelineGVCFHierarchical() { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Arrays.asList("aa19980b9a525afed43e98c821114ae5")); + Arrays.asList("7c93d82758bfb6e7efec257ef8a46217")); executeTest("combineSingleSamplePipelineGVCFHierarchical", spec); } @@ -164,7 +164,7 @@ public void combineSingleSamplePipelineGVCF_addDbsnp() { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference), 1, - Arrays.asList("f23c9d62542a69b5cbf0e9f89fdd235d")); + Arrays.asList("5b60a7a9575ea83407aa61123960a0cc")); executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec); } @@ -174,7 +174,7 @@ public void testJustOneSample() { "-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference + " -V " + privateTestDir + "gvcfExample1.vcf", 1, - Arrays.asList("d602d9e5d336798e4ccb52d2b5f91677")); + Arrays.asList("9e59b94c84dd673b8db9d35cae7e0f68")); executeTest("testJustOneSample", spec); } @@ -185,14 +185,14 @@ public void testSamplesWithDifferentLs() { " -V " + privateTestDir + "gvcfExample1.vcf" + " -V " + privateTestDir + "gvcfExample2.vcf", 1, - Arrays.asList("6c6d6ef90386eb6c6ed649379aac0c13")); + Arrays.asList("8407cb9a1ab34e705e5a54a0d4146d84")); executeTest("testSamplesWithDifferentLs", spec); } @Test(enabled = true) public void testNoPLsException() { // Test with input files with (1) 0/0 and (2) ./. - final String md5 = "d04b32cf2fa97d303ff7fdc779a653d4"; + final String md5 = "3e69805dc1c0ada0a050a65b89ecab30"; WalkerTestSpec spec1 = new WalkerTestSpec( "-T GenotypeGVCFs --no_cmdline_in_header -L 1:1115550-1115551 -o %s -R " + hg19Reference + " --variant " + privateTestDir + "combined_genotype_gvcf_exception.vcf", @@ -212,7 +212,7 @@ public void testNDA() { WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-nda"), 1, - Arrays.asList("7132a43d93a9855d03b27b4b0381194c")); + Arrays.asList("5a036de16b7a87626d2b76727376d9df")); executeTest("testNDA", spec); } @@ -221,7 +221,7 @@ public void testMaxAltAlleles() { WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-maxAltAlleles 1"), 1, - Arrays.asList("07844593a4e1ff1110ef8c1de42cc290")); + Arrays.asList("2f3e6879fa27128a8be7b067ded78966")); executeTest("testMaxAltAlleles", spec); } @@ -230,7 +230,7 @@ public void testStandardConf() { WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-stand_call_conf 300 -stand_emit_conf 100"), 1, - Arrays.asList("56caad762b26479ba5e2cc99222b9030")); + Arrays.asList("2e4a1ad71e8fc127b594077166c0344b")); executeTest("testStandardConf", spec); } @@ -274,7 +274,7 @@ public void testUniquifiedSamples() throws IOException { " -V:combined2 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" + " --uniquifySamples", b37KGReference), 1, - Arrays.asList("ba36b36145e038e3cb004adf11bce96c")); + Arrays.asList("9a472c4e101fff4892efb9255c5cd8b3")); executeTest("testUniquifiedSamples", spec); } @@ -311,4 +311,173 @@ private List getAttributeValues(final File vcfFile, final String attribu return attributeValues; } + /** + * Section to test spanning deletions + */ + @Test + public void testSpanningDeletions() throws IOException { + final String gvcf1 = privateTestDir + "spanningDel.1.g.vcf"; + final String gvcf2 = privateTestDir + "spanningDel.2.g.vcf"; + final String gvcf3 = privateTestDir + "spanningDel.3.g.vcf"; + + // create the genotyped VCF to use as a basis for comparison against all of the combined versions + // case 0: GenotypeGVCFs(1.g.vcf, 2.g.vcf, 3.g.vcf) + final WalkerTestSpec genotypeBase = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + gvcf1 + " -V " + gvcf2 + " -V " + gvcf3, + 1, + Arrays.asList("")); + genotypeBase.disableShadowBCF(); + final File genotypeBaseVCF = executeTest("genotypeBase", genotypeBase).getFirst().get(0); + final List BASE_VARIANT_CONTEXTS = getVariantContexts(genotypeBaseVCF); + + // case 1: GenotypeGVCFs(CombineGVCFs(1.g.vcf, 2.g.vcf), 3.g.vcf) + final WalkerTestSpec combine12 = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + gvcf1 + " -V " + gvcf2, + 1, + Arrays.asList("")); + combine12.disableShadowBCF(); + final File combined_gVCF12 = executeTest("combine12", combine12).getFirst().get(0); + final WalkerTestSpec genotype12_3 = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + combined_gVCF12.getAbsolutePath() + " -V " + gvcf3, + 1, + Arrays.asList("")); + genotype12_3.disableShadowBCF(); + final File genotype12_3VCF = executeTest("genotype12_3", genotype12_3).getFirst().get(0); + final List VARIANT_CONTEXTS_12_3 = getVariantContexts(genotype12_3VCF); + testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_12_3); + + // case 2: GenotypeGVCFs(CombineGVCFs(CombineGVCFs(1.g.vcf, 2.g.vcf), 3.g.vcf)) + final WalkerTestSpec combine12then3 = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + combined_gVCF12 + " -V " + gvcf3, + 1, + Arrays.asList("")); + combine12then3.disableShadowBCF(); + final File combined_gVCF12then3 = executeTest("combined_gVCF12then3", combine12then3).getFirst().get(0); + final WalkerTestSpec genotype12then3 = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + combined_gVCF12then3.getAbsolutePath(), + 1, + Arrays.asList("")); + genotype12then3.disableShadowBCF(); + final File genotype12then3VCF = executeTest("genotype12then3", genotype12then3).getFirst().get(0); + final List VARIANT_CONTEXTS_12then3 = getVariantContexts(genotype12then3VCF); + testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_12then3); + + // case 3: GenotypeGVCFs(CombineGVCFs(CombineGVCFs(1.g.vcf, 3.g.vcf), 2.g.vcf)) + final WalkerTestSpec combine13 = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + gvcf1 + " -V " + gvcf3, + 1, + Arrays.asList("")); + combine13.disableShadowBCF(); + final File combined_gVCF13 = executeTest("combine13", combine13).getFirst().get(0); + final WalkerTestSpec combine13then2 = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + combined_gVCF13 + " -V " + gvcf2, + 1, + Arrays.asList("")); + combine13then2.disableShadowBCF(); + final File combined_gVCF13then2 = executeTest("combined_gVCF13then2", combine13then2).getFirst().get(0); + final WalkerTestSpec genotype13then2 = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + combined_gVCF13then2.getAbsolutePath(), + 1, + Arrays.asList("")); + genotype13then2.disableShadowBCF(); + final File genotype13then2VCF = executeTest("genotype13then2", genotype13then2).getFirst().get(0); + final List VARIANT_CONTEXTS_13then2 = getVariantContexts(genotype13then2VCF); + testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_13then2); + + // case 4: GenotypeGVCFs(CombineGVCFs(1.g.vcf, 2.g.vcf, 3.g.vcf)) + final WalkerTestSpec combine123 = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + gvcf1 + " -V " + gvcf2 + " -V " + gvcf3, + 1, + Arrays.asList("")); + combine123.disableShadowBCF(); + final File combined_gVCF123 = executeTest("combine123", combine123).getFirst().get(0); + final WalkerTestSpec genotype123 = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + combined_gVCF123.getAbsolutePath(), + 1, + Arrays.asList("")); + genotype123.disableShadowBCF(); + final File genotype123VCF = executeTest("genotype123", genotype123).getFirst().get(0); + final List VARIANT_CONTEXTS_123 = getVariantContexts(genotype123VCF); + testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_123); + } + + /** + * Returns a list of VariantContext records from a VCF file + * + * @param vcfFile VCF file + * + * @throws IOException if the file does not exist or can not be opened + * + * @return list of VariantContext records + */ + private static List getVariantContexts(final File vcfFile) throws IOException { + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(vcfFile); + final LineIterator lineIteratorVCF = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIteratorVCF); + + final List VCs = new ArrayList<>(); + while ( lineIteratorVCF.hasNext() ) { + final String line = lineIteratorVCF.next(); + Assert.assertFalse(line == null); + VCs.add(codec.decode(line)); + } + + return VCs; + } + + private static void testVCsAreEqual(final List VCs1, final List VCs2) { + Assert.assertEquals(VCs1.size(), VCs2.size(), "number of Variant Contexts"); + for ( int i = 0; i < VCs1.size(); i++ ) { + final VariantContext vc1 = VCs1.get(i); + final VariantContext vc2 = VCs2.get(i); + Assert.assertEquals(vc1.toStringDecodeGenotypes(), vc2.toStringDecodeGenotypes()); + } + } + + + private static final String simpleSpanningDeletionsMD5 = "e8616a396d40b4918ad30189856ceb01"; + + @Test(enabled = true) + public void testSpanningDeletionsMD5() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf", + 1, + Arrays.asList(simpleSpanningDeletionsMD5)); + spec.disableShadowBCF(); + executeTest("testSpanningDeletionsMD5", spec); + } + + @Test(enabled = true) + public void testSpanningDeletionsFromCombinedGVCF() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.combined.g.vcf", + 1, + Arrays.asList(simpleSpanningDeletionsMD5)); + spec.disableShadowBCF(); + executeTest("testSpanningDeletionsFromCombinedGVCFMD5", spec); + } + + @Test(enabled = true) + public void testMultipleSpanningDeletionsMD5() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf -V " + privateTestDir + "spanningDel.3.g.vcf", + 1, + Arrays.asList("1c418229117bc8f148a69eda9c496309")); + spec.disableShadowBCF(); + executeTest("testMultipleSpanningDeletionsMD5", spec); + } } \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java index 578543652b..6290fdd961 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java @@ -78,6 +78,7 @@ public class VariantContextMergerUnitTest extends BaseTest { Allele ATref; Allele Anoref; Allele GT; + Allele del; private GenomeLocParser genomeLocParser; @@ -94,6 +95,7 @@ public void setup() throws IOException { ATCATCT = Allele.create("ATCATCT"); ATref = Allele.create("AT",true); Anoref = Allele.create("A",false); + del = GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE; GT = Allele.create("GT",false); genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference))); } @@ -217,6 +219,7 @@ public Object[][] makeReferenceConfidenceMergeData() { final List AA_A_ALT = Arrays.asList(AAref, A, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make(); final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make(); + final List A_C_del = Arrays.asList(Aref, C, del); // first test the case of a single record tests.add(new Object[]{"test00",Arrays.asList(vcA_C_ALT), @@ -249,18 +252,19 @@ public Object[][] makeReferenceConfidenceMergeData() { // a SNP with a spanning deletion tests.add(new Object[]{"test06",Arrays.asList(vcA_C_ALT, vcAA_A_ALT), loc, false, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).alleles(noCalls).make()).make()}); + new VariantContextBuilder(VCbase).alleles(A_C_del).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73}).alleles(noCalls).make(), + new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 20, 72, 10}).alleles(noCalls).make()).make()}); // combination of all tests.add(new Object[]{"test07",Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT), loc, false, false, - new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).alleles(noCalls).make(), - new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).alleles(noCalls).make(), - new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).alleles(noCalls).make(), - new GenotypeBuilder("A_C_G").PL(new int[]{40,20,30,20,10,30,71,72,73,74}).alleles(noCalls).make(), - new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).alleles(noCalls).make(), - new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).alleles(noCalls).make(), - new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).alleles(noCalls).make()).make()}); + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC, del)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73, 71, 72, 73, 73, 73}).alleles(noCalls).make(), + new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73, 71, 73, 72, 73, 73}).alleles(noCalls).make(), + new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10, 71, 73, 73, 72, 73}).alleles(noCalls).make(), + new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74, 71, 72, 73, 74, 74}).alleles(noCalls).make(), + new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000, 100, 1000, 1000, 1000, 1000}).alleles(noCalls).make(), + new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800, 80, 800, 800, 800, 800}).alleles(noCalls).make(), + new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73, 20, 72, 72, 72, 10}).alleles(noCalls).make()).make()}); // just spanning ref contexts, trying both instances where we want/do not want ref-only contexts tests.add(new Object[]{"test08",Arrays.asList(vcAA_ALT), diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java index bcf8185fd8..95cf3e593f 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java @@ -123,7 +123,10 @@ their names (or descriptions) depend on some threshold. Those filters are not i public static final String BEAGLE_MONO_FILTER_NAME = "BGL_SET_TO_MONOMORPHIC"; public static final String LOW_QUAL_FILTER_NAME = "LowQual"; - //Alleles + // Symbolic alleles + public final static String SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG = "ALT"; public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site + public final static String SPANNING_DELETION_SYMBOLIC_ALLELE_NAME = "*:DEL"; + public final static Allele SPANNING_DELETION_SYMBOLIC_ALLELE = Allele.create("<"+SPANNING_DELETION_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible spanning deletion allele at this site }