diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java index 86f020394f..be2bf610ad 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java @@ -886,8 +886,8 @@ private void validateDataSourcesAgainstReference(SAMDataSource reads, ReferenceS // Compile a set of sequence names that exist in the BAM files. SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); - if (readsDictionary.size() == 0) { - logger.info("Reads file is unmapped. Skipping validation against reference."); + if (readsDictionary.isEmpty()) { + logger.info("Reads file is unmapped. Skipping validation against reference."); return; } diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java index f1e832d2ca..b8f4fffc61 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java @@ -39,18 +39,18 @@ public class CramIntegrationTest extends WalkerTest { @DataProvider(name="cramData") public Object[][] getCRAMData() { return new Object[][] { - {"PrintReads", "exampleBAM.bam", "", "cram", "95cce9111a377d8e84fd076f7f0744df"}, - {"PrintReads", "exampleCRAM.cram", "", "cram", "95cce9111a377d8e84fd076f7f0744df"}, - {"PrintReads", "exampleCRAM.cram", "", "bam", "7c716d5103c20e95a8115b3c2848bf0d"}, - {"PrintReads", "exampleCRAM.cram", " -L chr1:200", "bam", "5cc9cac96f7f9819688d47a28ed97778"}, + {"PrintReads", "exampleBAM.bam", "", "cram", "424c725c4ffe7215e358ecf5abd5e5e8"}, + {"PrintReads", "exampleCRAM.cram", "", "cram", "424c725c4ffe7215e358ecf5abd5e5e8"}, + {"PrintReads", "exampleCRAM.cram", "", "bam", "247805098718dd74b8a871796424d359"}, + {"PrintReads", "exampleCRAM.cram", " -L chr1:200", "bam", "a5b26631cd89f86f6184bcac7bc9c9ca"}, {"CountLoci", "exampleCRAM.cram", "", "txt", "ade93df31a6150321c1067e749cae9be"}, {"CountLoci", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"}, {"CountReads", "exampleCRAM.cram", "", "txt", "4fbafd6948b6529caa2b78e476359875"}, {"CountReads", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"}, - {"PrintReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "bam", "2e1b175c9b36154e2bbd1a23ebaf4c22"}, + {"PrintReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "bam", "24dbd14b60220461f47ec5517962cb7f"}, {"CountLoci", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"}, {"CountReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"}, - {"PrintReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "bam", "2e1b175c9b36154e2bbd1a23ebaf4c22"}, + {"PrintReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "bam", "84bee5063d8fa0d07e7c3ff7e825ae3a"}, {"CountLoci", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"}, {"CountReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"}, }; diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyIntegrationTest.java index 9e3184f125..2d2ce2bd3e 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyIntegrationTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyIntegrationTest.java @@ -38,7 +38,7 @@ public class GATKKeyIntegrationTest extends WalkerTest { public static final String BASE_COMMAND = String.format("-T TestPrintReadsWalker -R %s -I %s -o %%s", publicTestDir + "exampleFASTA.fasta", publicTestDir + "exampleBAM.bam"); - public static final String MD5_UPON_SUCCESSFUL_RUN = "69b6432aed2d0cebf02e5d414f654425"; + public static final String MD5_UPON_SUCCESSFUL_RUN = "462656ec9632f8c21ee534d35093c3f8"; private void runGATKKeyTest ( String testName, String etArg, String keyArg, Class expectedException, String md5 ) { diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java index 870277d1dc..9f21233aa8 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java @@ -35,7 +35,7 @@ public class BadReadGroupsIntegrationTest extends WalkerTest { @Test public void testMissingReadGroup() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null", + "-T TestPrintReadsWalker -R " + hg18Reference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null", 0, UserException.ReadMissingReadGroup.class); executeTest("test Missing Read Group", spec); @@ -44,7 +44,7 @@ public void testMissingReadGroup() { @Test public void testUndefinedReadGroup() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null", + "-T TestPrintReadsWalker -R " + hg18Reference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null", 0, UserException.ReadHasUndefinedReadGroup.class); executeTest("test Undefined Read Group", spec); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java index 160abd38d2..71d87409fe 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java @@ -25,6 +25,7 @@ package org.broadinstitute.gatk.utils; +import java.math.BigInteger; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; @@ -79,7 +80,7 @@ public enum SequenceDictionaryCompatibility { IDENTICAL, // the dictionaries are identical COMMON_SUBSET, // there exists a common subset of equivalent contigs NO_COMMON_CONTIGS, // no overlap between dictionaries - UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths + UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths and/or MD5s NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for examine) OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different // orders with respect to each other @@ -92,7 +93,7 @@ public enum SequenceDictionaryCompatibility { * @param validationExclusion exclusions to validation * @return Returns true if the engine is in tolerant mode and we'll let through dangerous but not fatal dictionary inconsistency */ - private static boolean allowNonFatalIncompabilities(ValidationExclusion.TYPE validationExclusion) { + private static boolean allowNonFatalIncompabilities(final ValidationExclusion.TYPE validationExclusion) { return ( validationExclusion == ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY || validationExclusion == ValidationExclusion.TYPE.ALL ); } @@ -133,15 +134,23 @@ public static void validateDictionaries( final Logger logger, throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2); case UNEQUAL_COMMON_CONTIGS: { - List x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); - SAMSequenceRecord elt1 = x.get(0); - SAMSequenceRecord elt2 = x.get(1); + final List x = findNotEqualCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); + final SAMSequenceRecord elt1 = x.get(0); + final SAMSequenceRecord elt2 = x.get(1); + + String msg = "Found contigs with the same name but different lengths"; + String contig1 = " contig " + name1 + " is named " + elt1.getSequenceName() + " with length " + Integer.toString(elt1.getSequenceLength()); + if ( elt1.getMd5() != null ) + contig1 += " and MD5 " + elt1.getMd5(); + String contig2 = " contig " + name2 + " is named " + elt2.getSequenceName() + " with length " + Integer.toString(elt2.getSequenceLength()); + if ( elt2.getMd5() != null ) + contig2 += " and MD5 " + elt2.getMd5(); + if ( elt1.getMd5() != null || elt2.getMd5() != null ) + msg += " or MD5s:"; + msg += "\n" + contig1 + "\n" + contig2; // todo -- replace with toString when SAMSequenceRecord has a nice toString routine - UserException ex = new UserException.IncompatibleSequenceDictionaries(String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d", - name1, elt1.getSequenceName(), elt1.getSequenceLength(), - name2, elt2.getSequenceName(), elt2.getSequenceLength()), - name1, dict1, name2, dict2); + final UserException ex = new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2); if ( allowNonFatalIncompabilities(validationExclusion) ) logger.warn(ex.getMessage()); @@ -226,7 +235,7 @@ public static SequenceDictionaryCompatibility compareDictionaries( final SAMSequ final Set commonContigs = getCommonContigsByName(dict1, dict2); - if (commonContigs.size() == 0) + if (commonContigs.isEmpty()) return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS; else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) ) return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS; @@ -250,12 +259,12 @@ else if ( ! commonContigsAreAtSameIndices(commonContigs, dict1, dict2) ) * @param dict2 * @return true if all of the common contigs are equivalent */ - private static boolean commonContigsHaveSameLengths(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { - return findDisequalCommonContigs(commonContigs, dict1, dict2) == null; + private static boolean commonContigsHaveSameLengths(final Set commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) { + return findNotEqualCommonContigs(commonContigs, dict1, dict2) == null; } /** - * Returns a List(x,y) that contains two disequal sequence records among the common contigs in both dicts. Returns + * Returns a List(x,y) that contains two sequence records that are not equal among the common contigs in both dicts. Returns * null if all common contigs are equivalent * * @param commonContigs @@ -263,7 +272,7 @@ private static boolean commonContigsHaveSameLengths(Set commonContigs, S * @param dict2 * @return */ - private static List findDisequalCommonContigs(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + private static List findNotEqualCommonContigs(final Set commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) { for ( String name : commonContigs ) { SAMSequenceRecord elt1 = dict1.getSequence(name); SAMSequenceRecord elt2 = dict2.getSequence(name); @@ -275,32 +284,32 @@ private static List findDisequalCommonContigs(Set com } /** - * Helper routine that returns two sequence records are equivalent, defined as having the same name and - * lengths, if both are non-zero + * Helper routine that determines if two sequence records are equivalent, defined as having the same name, + * lengths (if both are non-zero) and MD5 (if present) * - * @param me - * @param that - * @return + * @param record1 a SAMSequenceRecord + * @param record2 a SAMSequenceRecord + * @return true if the records are equivalent, false otherwise */ - private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord me, final SAMSequenceRecord that) { - if (me == that) return true; - if (that == null) return false; + private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord record1, final SAMSequenceRecord record2) { + if ( record1 == record2 ) return true; + if ( record1 == null || record2 == null ) return false; + + // compare length + if ( record1.getSequenceLength() != 0 && record2.getSequenceLength() != 0 && record1.getSequenceLength() != record2.getSequenceLength() ) + return false; - if (me.getSequenceLength() != 0 && that.getSequenceLength() != 0 && me.getSequenceLength() != that.getSequenceLength()) + // compare name + if ( !record1.getSequenceName().equals(record2.getSequenceName() )) return false; - // todo -- reenable if we want to be really strict here -// if (me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null) { -// final BigInteger thisMd5 = new BigInteger((String)me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16); -// final BigInteger thatMd5 = new BigInteger((String)that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16); -// if (!thisMd5.equals(thatMd5)) { -// return false; -// } -// } -// else { - if (me.getSequenceName() != that.getSequenceName()) - return false; // Compare using == since we intern() the Strings -// } + // compare MD5 + if ( record1.getMd5() != null && record2.getMd5() != null ){ + final BigInteger firstMd5 = new BigInteger(record1.getMd5(), 16); + final BigInteger secondMd5 = new BigInteger(record2.getMd5(), 16); + if ( !firstMd5.equals(secondMd5) ) + return false; + } return true; } @@ -313,13 +322,13 @@ private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord me, * @param dict * @return */ - private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) { + private static boolean nonCanonicalHumanContigOrder(final SAMSequenceDictionary dict) { if ( ! ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN ) // if we don't want to enable this test, just return false return false; SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null; - for ( SAMSequenceRecord elt : dict.getSequences() ) { + for ( final SAMSequenceRecord elt : dict.getSequences() ) { if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19 ) ) chr1 = elt; if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19 ) ) chr2 = elt; if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19 ) ) chr10 = elt; @@ -355,7 +364,7 @@ private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord * @param dict2 second SAMSequenceDictionary * @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false */ - private static boolean commonContigsAreInSameRelativeOrder(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + private static boolean commonContigsAreInSameRelativeOrder(final Set commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) { List list1 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict1)); List list2 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict2)); @@ -376,8 +385,8 @@ private static boolean commonContigsAreInSameRelativeOrder(Set commonCon * @param dict * @return */ - private static List getSequencesOfName(Set commonContigs, SAMSequenceDictionary dict) { - List l = new ArrayList(commonContigs.size()); + private static List getSequencesOfName(final Set commonContigs, final SAMSequenceDictionary dict) { + final List l = new ArrayList(commonContigs.size()); for ( String name : commonContigs ) { l.add(dict.getSequence(name) ); } @@ -401,7 +410,7 @@ public int compare(SAMSequenceRecord x, SAMSequenceRecord y) { * @param unsorted * @return */ - private static List sortSequenceListByIndex(List unsorted) { + private static List sortSequenceListByIndex(final List unsorted) { Collections.sort(unsorted, new CompareSequenceRecordsByIndex()); return unsorted; } @@ -418,8 +427,8 @@ private static List sortSequenceListByIndex(List commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) { for ( String commonContig : commonContigs ) { - SAMSequenceRecord dict1Record = dict1.getSequence(commonContig); - SAMSequenceRecord dict2Record = dict2.getSequence(commonContig); + final SAMSequenceRecord dict1Record = dict1.getSequence(commonContig); + final SAMSequenceRecord dict2Record = dict2.getSequence(commonContig); // Each common contig must have the same index in both dictionaries if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) { @@ -489,13 +498,13 @@ private static Set findMisindexedContigsInIntervals( final GenomeLocSort * @return */ public static Set getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { - Set intersectingSequenceNames = getContigNames(dict1); + final Set intersectingSequenceNames = getContigNames(dict1); intersectingSequenceNames.retainAll(getContigNames(dict2)); return intersectingSequenceNames; } public static Set getContigNames(SAMSequenceDictionary dict) { - Set contigNames = new HashSet(Utils.optimumHashSize(dict.size())); + final Set contigNames = new HashSet(Utils.optimumHashSize(dict.size())); for (SAMSequenceRecord dictionaryEntry : dict.getSequences()) contigNames.add(dictionaryEntry.getSequenceName()); return contigNames; @@ -515,7 +524,7 @@ public static String getDictionaryAsString( final SAMSequenceDictionary dict ) { throw new IllegalArgumentException("Sequence dictionary must be non-null"); } - StringBuilder s = new StringBuilder("[ "); + final StringBuilder s = new StringBuilder("[ "); for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) { s.append(dictionaryEntry.getSequenceName()); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/ValidationExclusion.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/ValidationExclusion.java index 08d84b084a..c09ebe4cc7 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/ValidationExclusion.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/ValidationExclusion.java @@ -40,7 +40,7 @@ public enum TYPE { ALLOW_UNINDEXED_BAM, // allow bam files that do not have an index; we'll traverse them using monolithic shard ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file - ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompabilities + ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompatibilities LENIENT_VCF_PROCESSING, // allow non-standard values for standard VCF header lines. Don't worry about size differences between header and values, etc. @EnumerationArgumentDefault // set the ALL value to the default value, so if they specify just -U, we get the ALL ALL // do not check for all of the above conditions, DEFAULT diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/IndexDictionaryUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/IndexDictionaryUtils.java index 42cf874331..30bd8ec6f6 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/IndexDictionaryUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/IndexDictionaryUtils.java @@ -102,7 +102,7 @@ public static void validateTrackSequenceDictionary(final String trackName, final SAMSequenceDictionary referenceDict, final ValidationExclusion.TYPE validationExclusionType ) { // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation - if (trackDict == null || trackDict.size() == 0) + if (trackDict == null || trackDict.isEmpty()) logger.warn("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); else { Set trackSequences = new TreeSet(); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrack.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrack.java index 7863b022d9..76f2046af5 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrack.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrack.java @@ -93,7 +93,7 @@ public File getFile() { * @param dict the sam sequence dictionary * @param codec the feature codec we use to decode this type */ - public RMDTrack(Class type, String name, File file, AbstractFeatureReader reader, SAMSequenceDictionary dict, GenomeLocParser genomeLocParser, FeatureCodec codec) { + public RMDTrack(final Class type, final String name, final File file, final AbstractFeatureReader reader, final SAMSequenceDictionary dict, final GenomeLocParser genomeLocParser, final FeatureCodec codec) { this.type = type; this.name = name; this.file = file; diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilder.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilder.java index 34be252a7e..0b84c05a21 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilder.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilder.java @@ -26,6 +26,9 @@ package org.broadinstitute.gatk.utils.refdata.tracks; import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.variant.vcf.VCFContigHeaderLine; +import htsjdk.variant.vcf.VCFHeader; import org.apache.log4j.Logger; import htsjdk.tribble.AbstractFeatureReader; import htsjdk.tribble.FeatureCodec; @@ -34,6 +37,7 @@ import htsjdk.tribble.index.Index; import htsjdk.tribble.index.IndexFactory; import htsjdk.tribble.util.LittleEndianOutputStream; +import org.broadinstitute.gatk.utils.SequenceDictionaryUtils; import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor; import org.broadinstitute.gatk.utils.commandline.Tags; import org.broadinstitute.gatk.utils.ValidationExclusion; @@ -49,6 +53,8 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import java.util.Map; @@ -131,7 +137,7 @@ public FeatureManager getFeatureManager() { * * @return an instance of the track */ - public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) { + public RMDTrack createInstanceOfTrack(final RMDTriplet fileDescriptor) { String name = fileDescriptor.getName(); File inputFile = new File(fileDescriptor.getFile()); @@ -146,9 +152,43 @@ public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) { else pair = getFeatureSource(descriptor, name, inputFile, fileDescriptor.getStorageType()); if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file"); + + validateVariantAgainstSequenceDictionary(name, descriptor.getName(), pair.first, pair.second); + return new RMDTrack(descriptor.getCodecClass(), name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(descriptor, name, inputFile)); } + /** + * Validate the VCF dictionary against the sequence dictionary. + * + * @param name the name of this specific track + * @param descriptorName the name of the feature + * @param reader the feature reader to use as the underlying data source + * @param dict the sam sequence dictionary + */ + private void validateVariantAgainstSequenceDictionary(final String name, final String descriptorName, final AbstractFeatureReader reader, final SAMSequenceDictionary dict ) throws UserException { + // only process if the variant is a VCF + if ( name.equals("variant") && descriptorName.equals("VCF") ){ + if ( reader != null && dict != null && reader.getHeader() != null ){ + final List contigs = ((VCFHeader) reader.getHeader()).getContigLines(); + if (contigs != null) { + // make the VCF dictionary from the contig header fields + final List vcfContigRecords = new ArrayList(); + for (final VCFContigHeaderLine contig : contigs) + vcfContigRecords.add(contig.getSAMSequenceRecord()); + + // have VCF contig fields so can make a dictionary and compare it to the sequence dictionary + if (!vcfContigRecords.isEmpty()) { + final SAMSequenceDictionary vcfDictionary = new SAMSequenceDictionary(vcfContigRecords); + final SAMSequenceDictionary sequenceDictionary = new SAMSequenceDictionary(dict.getSequences()); + + SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, name, vcfDictionary, "sequence", sequenceDictionary, false, null); + } + } + } + } + } + /** * Convenience method simplifying track creation. Assume unnamed track based on a file rather than a stream. * @param codecClass Type of Tribble codec class to build. @@ -228,7 +268,7 @@ private Pair getFeatureSource(Feat sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match - if (sequenceDictionary.size() == 0 && dict != null) { + if (sequenceDictionary.isEmpty() && dict != null) { validateAndUpdateIndexSequenceDictionary(inputFile, index, dict); if ( ! disableAutoIndexCreation ) { diff --git a/public/gatk-utils/src/test/resources/exampleBAM.bam b/public/gatk-utils/src/test/resources/exampleBAM.bam index 319dd1a72d..d1c9d8001b 100644 Binary files a/public/gatk-utils/src/test/resources/exampleBAM.bam and b/public/gatk-utils/src/test/resources/exampleBAM.bam differ diff --git a/public/gatk-utils/src/test/resources/exampleBAM.bam.bai b/public/gatk-utils/src/test/resources/exampleBAM.bam.bai index 052ac614bd..33beef3d18 100644 Binary files a/public/gatk-utils/src/test/resources/exampleBAM.bam.bai and b/public/gatk-utils/src/test/resources/exampleBAM.bam.bai differ diff --git a/public/gatk-utils/src/test/resources/exampleCRAM-nobai-nocrai.cram b/public/gatk-utils/src/test/resources/exampleCRAM-nobai-nocrai.cram index 3340900310..d4db8b3dca 100644 Binary files a/public/gatk-utils/src/test/resources/exampleCRAM-nobai-nocrai.cram and b/public/gatk-utils/src/test/resources/exampleCRAM-nobai-nocrai.cram differ diff --git a/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram b/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram index 3340900310..d4db8b3dca 100644 Binary files a/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram and b/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram differ diff --git a/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram.crai b/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram.crai index c5492846c1..c3d728b4ce 100644 Binary files a/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram.crai and b/public/gatk-utils/src/test/resources/exampleCRAM-nobai-withcrai.cram.crai differ diff --git a/public/gatk-utils/src/test/resources/exampleCRAM.cram b/public/gatk-utils/src/test/resources/exampleCRAM.cram index 3340900310..78d606d035 100644 Binary files a/public/gatk-utils/src/test/resources/exampleCRAM.cram and b/public/gatk-utils/src/test/resources/exampleCRAM.cram differ diff --git a/public/gatk-utils/src/test/resources/exampleCRAM.cram.crai b/public/gatk-utils/src/test/resources/exampleCRAM.cram.crai new file mode 100644 index 0000000000..3eee8e0072 Binary files /dev/null and b/public/gatk-utils/src/test/resources/exampleCRAM.cram.crai differ