Skip to content

Commit

Permalink
Validate VCF with sequence dictionary
Browse files Browse the repository at this point in the history
  • Loading branch information
ronlevine committed Nov 20, 2015
1 parent edca4d4 commit bacb6b2
Show file tree
Hide file tree
Showing 16 changed files with 111 additions and 62 deletions.
Expand Up @@ -886,8 +886,8 @@ private void validateDataSourcesAgainstReference(SAMDataSource reads, ReferenceS
// Compile a set of sequence names that exist in the BAM files.
SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary();

if (readsDictionary.size() == 0) {
logger.info("Reads file is unmapped. Skipping validation against reference.");
if (readsDictionary.isEmpty()) {
logger.info("Reads file is unmapped. Skipping validation against reference.");
return;
}

Expand Down
Expand Up @@ -39,18 +39,18 @@ public class CramIntegrationTest extends WalkerTest {
@DataProvider(name="cramData")
public Object[][] getCRAMData() {
return new Object[][] {
{"PrintReads", "exampleBAM.bam", "", "cram", "95cce9111a377d8e84fd076f7f0744df"},
{"PrintReads", "exampleCRAM.cram", "", "cram", "95cce9111a377d8e84fd076f7f0744df"},
{"PrintReads", "exampleCRAM.cram", "", "bam", "7c716d5103c20e95a8115b3c2848bf0d"},
{"PrintReads", "exampleCRAM.cram", " -L chr1:200", "bam", "5cc9cac96f7f9819688d47a28ed97778"},
{"PrintReads", "exampleBAM.bam", "", "cram", "424c725c4ffe7215e358ecf5abd5e5e8"},
{"PrintReads", "exampleCRAM.cram", "", "cram", "424c725c4ffe7215e358ecf5abd5e5e8"},
{"PrintReads", "exampleCRAM.cram", "", "bam", "247805098718dd74b8a871796424d359"},
{"PrintReads", "exampleCRAM.cram", " -L chr1:200", "bam", "a5b26631cd89f86f6184bcac7bc9c9ca"},
{"CountLoci", "exampleCRAM.cram", "", "txt", "ade93df31a6150321c1067e749cae9be"},
{"CountLoci", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"},
{"CountReads", "exampleCRAM.cram", "", "txt", "4fbafd6948b6529caa2b78e476359875"},
{"CountReads", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"},
{"PrintReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "bam", "2e1b175c9b36154e2bbd1a23ebaf4c22"},
{"PrintReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "bam", "24dbd14b60220461f47ec5517962cb7f"},
{"CountLoci", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"},
{"CountReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"},
{"PrintReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "bam", "2e1b175c9b36154e2bbd1a23ebaf4c22"},
{"PrintReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "bam", "84bee5063d8fa0d07e7c3ff7e825ae3a"},
{"CountLoci", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"},
{"CountReads", "exampleCRAM-nobai-withcrai.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"},
};
Expand Down
Expand Up @@ -38,7 +38,7 @@ public class GATKKeyIntegrationTest extends WalkerTest {
public static final String BASE_COMMAND = String.format("-T TestPrintReadsWalker -R %s -I %s -o %%s",
publicTestDir + "exampleFASTA.fasta",
publicTestDir + "exampleBAM.bam");
public static final String MD5_UPON_SUCCESSFUL_RUN = "69b6432aed2d0cebf02e5d414f654425";
public static final String MD5_UPON_SUCCESSFUL_RUN = "462656ec9632f8c21ee534d35093c3f8";


private void runGATKKeyTest ( String testName, String etArg, String keyArg, Class expectedException, String md5 ) {
Expand Down
Expand Up @@ -35,7 +35,7 @@ public class BadReadGroupsIntegrationTest extends WalkerTest {
@Test
public void testMissingReadGroup() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null",
"-T TestPrintReadsWalker -R " + hg18Reference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null",
0,
UserException.ReadMissingReadGroup.class);
executeTest("test Missing Read Group", spec);
Expand All @@ -44,7 +44,7 @@ public void testMissingReadGroup() {
@Test
public void testUndefinedReadGroup() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null",
"-T TestPrintReadsWalker -R " + hg18Reference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null",
0,
UserException.ReadHasUndefinedReadGroup.class);
executeTest("test Undefined Read Group", spec);
Expand Down
Expand Up @@ -25,6 +25,7 @@

package org.broadinstitute.gatk.utils;

import java.math.BigInteger;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
Expand Down Expand Up @@ -79,7 +80,7 @@ public enum SequenceDictionaryCompatibility {
IDENTICAL, // the dictionaries are identical
COMMON_SUBSET, // there exists a common subset of equivalent contigs
NO_COMMON_CONTIGS, // no overlap between dictionaries
UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths
UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths and/or MD5s
NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for examine)
OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different
// orders with respect to each other
Expand All @@ -92,7 +93,7 @@ public enum SequenceDictionaryCompatibility {
* @param validationExclusion exclusions to validation
* @return Returns true if the engine is in tolerant mode and we'll let through dangerous but not fatal dictionary inconsistency
*/
private static boolean allowNonFatalIncompabilities(ValidationExclusion.TYPE validationExclusion) {
private static boolean allowNonFatalIncompabilities(final ValidationExclusion.TYPE validationExclusion) {
return ( validationExclusion == ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY ||
validationExclusion == ValidationExclusion.TYPE.ALL );
}
Expand Down Expand Up @@ -133,15 +134,23 @@ public static void validateDictionaries( final Logger logger,
throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2);

case UNEQUAL_COMMON_CONTIGS: {
List<SAMSequenceRecord> x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
SAMSequenceRecord elt1 = x.get(0);
SAMSequenceRecord elt2 = x.get(1);
final List<SAMSequenceRecord> x = findNotEqualCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2);
final SAMSequenceRecord elt1 = x.get(0);
final SAMSequenceRecord elt2 = x.get(1);

String msg = "Found contigs with the same name but different lengths";
String contig1 = " contig " + name1 + " is named " + elt1.getSequenceName() + " with length " + Integer.toString(elt1.getSequenceLength());
if ( elt1.getMd5() != null )
contig1 += " and MD5 " + elt1.getMd5();
String contig2 = " contig " + name2 + " is named " + elt2.getSequenceName() + " with length " + Integer.toString(elt2.getSequenceLength());
if ( elt2.getMd5() != null )
contig2 += " and MD5 " + elt2.getMd5();
if ( elt1.getMd5() != null || elt2.getMd5() != null )
msg += " or MD5s:";
msg += "\n" + contig1 + "\n" + contig2;

// todo -- replace with toString when SAMSequenceRecord has a nice toString routine
UserException ex = new UserException.IncompatibleSequenceDictionaries(String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d",
name1, elt1.getSequenceName(), elt1.getSequenceLength(),
name2, elt2.getSequenceName(), elt2.getSequenceLength()),
name1, dict1, name2, dict2);
final UserException ex = new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2);

if ( allowNonFatalIncompabilities(validationExclusion) )
logger.warn(ex.getMessage());
Expand Down Expand Up @@ -226,7 +235,7 @@ public static SequenceDictionaryCompatibility compareDictionaries( final SAMSequ

final Set<String> commonContigs = getCommonContigsByName(dict1, dict2);

if (commonContigs.size() == 0)
if (commonContigs.isEmpty())
return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS;
else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) )
return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS;
Expand All @@ -250,20 +259,20 @@ else if ( ! commonContigsAreAtSameIndices(commonContigs, dict1, dict2) )
* @param dict2
* @return true if all of the common contigs are equivalent
*/
private static boolean commonContigsHaveSameLengths(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
return findDisequalCommonContigs(commonContigs, dict1, dict2) == null;
private static boolean commonContigsHaveSameLengths(final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) {
return findNotEqualCommonContigs(commonContigs, dict1, dict2) == null;
}

/**
* Returns a List(x,y) that contains two disequal sequence records among the common contigs in both dicts. Returns
* Returns a List(x,y) that contains two sequence records that are not equal among the common contigs in both dicts. Returns
* null if all common contigs are equivalent
*
* @param commonContigs
* @param dict1
* @param dict2
* @return
*/
private static List<SAMSequenceRecord> findDisequalCommonContigs(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
private static List<SAMSequenceRecord> findNotEqualCommonContigs(final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) {
for ( String name : commonContigs ) {
SAMSequenceRecord elt1 = dict1.getSequence(name);
SAMSequenceRecord elt2 = dict2.getSequence(name);
Expand All @@ -275,32 +284,32 @@ private static List<SAMSequenceRecord> findDisequalCommonContigs(Set<String> com
}

/**
* Helper routine that returns two sequence records are equivalent, defined as having the same name and
* lengths, if both are non-zero
* Helper routine that determines if two sequence records are equivalent, defined as having the same name,
* lengths (if both are non-zero) and MD5 (if present)
*
* @param me
* @param that
* @return
* @param record1 a SAMSequenceRecord
* @param record2 a SAMSequenceRecord
* @return true if the records are equivalent, false otherwise
*/
private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord me, final SAMSequenceRecord that) {
if (me == that) return true;
if (that == null) return false;
private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord record1, final SAMSequenceRecord record2) {
if ( record1 == record2 ) return true;
if ( record1 == null || record2 == null ) return false;

// compare length
if ( record1.getSequenceLength() != 0 && record2.getSequenceLength() != 0 && record1.getSequenceLength() != record2.getSequenceLength() )
return false;

if (me.getSequenceLength() != 0 && that.getSequenceLength() != 0 && me.getSequenceLength() != that.getSequenceLength())
// compare name
if ( !record1.getSequenceName().equals(record2.getSequenceName() ))
return false;

// todo -- reenable if we want to be really strict here
// if (me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null) {
// final BigInteger thisMd5 = new BigInteger((String)me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16);
// final BigInteger thatMd5 = new BigInteger((String)that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16);
// if (!thisMd5.equals(thatMd5)) {
// return false;
// }
// }
// else {
if (me.getSequenceName() != that.getSequenceName())
return false; // Compare using == since we intern() the Strings
// }
// compare MD5
if ( record1.getMd5() != null && record2.getMd5() != null ){
final BigInteger firstMd5 = new BigInteger(record1.getMd5(), 16);
final BigInteger secondMd5 = new BigInteger(record2.getMd5(), 16);
if ( !firstMd5.equals(secondMd5) )
return false;
}

return true;
}
Expand All @@ -313,13 +322,13 @@ private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord me,
* @param dict
* @return
*/
private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) {
private static boolean nonCanonicalHumanContigOrder(final SAMSequenceDictionary dict) {
if ( ! ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN ) // if we don't want to enable this test, just return false
return false;

SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null;

for ( SAMSequenceRecord elt : dict.getSequences() ) {
for ( final SAMSequenceRecord elt : dict.getSequences() ) {
if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19 ) ) chr1 = elt;
if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19 ) ) chr2 = elt;
if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19 ) ) chr10 = elt;
Expand Down Expand Up @@ -355,7 +364,7 @@ private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord
* @param dict2 second SAMSequenceDictionary
* @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false
*/
private static boolean commonContigsAreInSameRelativeOrder(Set<String> commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
private static boolean commonContigsAreInSameRelativeOrder(final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) {
List<SAMSequenceRecord> list1 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict1));
List<SAMSequenceRecord> list2 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict2));

Expand All @@ -376,8 +385,8 @@ private static boolean commonContigsAreInSameRelativeOrder(Set<String> commonCon
* @param dict
* @return
*/
private static List<SAMSequenceRecord> getSequencesOfName(Set<String> commonContigs, SAMSequenceDictionary dict) {
List<SAMSequenceRecord> l = new ArrayList<SAMSequenceRecord>(commonContigs.size());
private static List<SAMSequenceRecord> getSequencesOfName(final Set<String> commonContigs, final SAMSequenceDictionary dict) {
final List<SAMSequenceRecord> l = new ArrayList<SAMSequenceRecord>(commonContigs.size());
for ( String name : commonContigs ) {
l.add(dict.getSequence(name) );
}
Expand All @@ -401,7 +410,7 @@ public int compare(SAMSequenceRecord x, SAMSequenceRecord y) {
* @param unsorted
* @return
*/
private static List<SAMSequenceRecord> sortSequenceListByIndex(List<SAMSequenceRecord> unsorted) {
private static List<SAMSequenceRecord> sortSequenceListByIndex(final List<SAMSequenceRecord> unsorted) {
Collections.sort(unsorted, new CompareSequenceRecordsByIndex());
return unsorted;
}
Expand All @@ -418,8 +427,8 @@ private static List<SAMSequenceRecord> sortSequenceListByIndex(List<SAMSequenceR
*/
private static boolean commonContigsAreAtSameIndices( final Set<String> commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) {
for ( String commonContig : commonContigs ) {
SAMSequenceRecord dict1Record = dict1.getSequence(commonContig);
SAMSequenceRecord dict2Record = dict2.getSequence(commonContig);
final SAMSequenceRecord dict1Record = dict1.getSequence(commonContig);
final SAMSequenceRecord dict2Record = dict2.getSequence(commonContig);

// Each common contig must have the same index in both dictionaries
if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) {
Expand Down Expand Up @@ -489,13 +498,13 @@ private static Set<String> findMisindexedContigsInIntervals( final GenomeLocSort
* @return
*/
public static Set<String> getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
Set<String> intersectingSequenceNames = getContigNames(dict1);
final Set<String> intersectingSequenceNames = getContigNames(dict1);
intersectingSequenceNames.retainAll(getContigNames(dict2));
return intersectingSequenceNames;
}

public static Set<String> getContigNames(SAMSequenceDictionary dict) {
Set<String> contigNames = new HashSet<String>(Utils.optimumHashSize(dict.size()));
final Set<String> contigNames = new HashSet<String>(Utils.optimumHashSize(dict.size()));
for (SAMSequenceRecord dictionaryEntry : dict.getSequences())
contigNames.add(dictionaryEntry.getSequenceName());
return contigNames;
Expand All @@ -515,7 +524,7 @@ public static String getDictionaryAsString( final SAMSequenceDictionary dict ) {
throw new IllegalArgumentException("Sequence dictionary must be non-null");
}

StringBuilder s = new StringBuilder("[ ");
final StringBuilder s = new StringBuilder("[ ");

for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) {
s.append(dictionaryEntry.getSequenceName());
Expand Down
Expand Up @@ -40,7 +40,7 @@ public enum TYPE {
ALLOW_UNINDEXED_BAM, // allow bam files that do not have an index; we'll traverse them using monolithic shard
ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set
NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file
ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompabilities
ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompatibilities
LENIENT_VCF_PROCESSING, // allow non-standard values for standard VCF header lines. Don't worry about size differences between header and values, etc.
@EnumerationArgumentDefault // set the ALL value to the default value, so if they specify just -U, we get the ALL
ALL // do not check for all of the above conditions, DEFAULT
Expand Down

0 comments on commit bacb6b2

Please sign in to comment.