-
Notifications
You must be signed in to change notification settings - Fork 365
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix Duplicate Set Index for queryname sorted input to MarkDuplicates #1843
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -319,12 +319,12 @@ protected int doWork() { | |
RepresentativeReadIndexer rri = null; | ||
int representativeReadIndexInFile = -1; | ||
int duplicateSetSize = -1; | ||
int nextRepresentativeIndex = -1; | ||
int nextReadInDuplicateSetIndex = -1; | ||
if (TAG_DUPLICATE_SET_MEMBERS) { | ||
representativeReadIterator = this.representativeReadIndicesForDuplicates.iterator(); | ||
if (representativeReadIterator.hasNext()) { | ||
rri = representativeReadIterator.next(); | ||
nextRepresentativeIndex = rri.readIndexInFile; | ||
nextReadInDuplicateSetIndex = rri.readIndexInFile; | ||
representativeReadIndexInFile = rri.representativeReadIndexInFile; | ||
duplicateSetSize = rri.setSize; | ||
} | ||
|
@@ -333,6 +333,7 @@ protected int doWork() { | |
final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Written"); | ||
final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator; | ||
String duplicateQueryName = null; | ||
String representativeQueryName = null; | ||
|
||
while (iterator.hasNext()) { | ||
final SAMRecord rec = iterator.next(); | ||
|
@@ -375,22 +376,27 @@ protected int doWork() { | |
|
||
// Tag any read pair that was in a duplicate set with the duplicate set size and a representative read name | ||
if (TAG_DUPLICATE_SET_MEMBERS) { | ||
final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex; | ||
final boolean needNextRepresentativeIndex = recordInFileIndex > nextReadInDuplicateSetIndex && | ||
(sortOrder == SAMFileHeader.SortOrder.coordinate || !rec.getReadName().equals(representativeQueryName)); | ||
if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) { | ||
rri = representativeReadIterator.next(); | ||
nextRepresentativeIndex = rri.readIndexInFile; | ||
nextReadInDuplicateSetIndex = rri.readIndexInFile; | ||
representativeReadIndexInFile = rri.representativeReadIndexInFile; | ||
duplicateSetSize = rri.setSize; | ||
} | ||
final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex || | ||
|
||
/* If this record's index is readInDuplicateSetIndex, then it is in a duplicateset. | ||
For queryname sorted data, we only have one representativeReadIndex entry per read name, so we need | ||
to also look for additional reads with the same name. | ||
*/ | ||
final boolean isInDuplicateSet = recordInFileIndex == nextReadInDuplicateSetIndex || | ||
(sortOrder == SAMFileHeader.SortOrder.queryname && | ||
recordInFileIndex > nextDuplicateIndex); | ||
recordInFileIndex > nextReadInDuplicateSetIndex && rec.getReadName().equals(representativeQueryName)); | ||
if (isInDuplicateSet) { | ||
if (!rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag()) { | ||
if (TAG_DUPLICATE_SET_MEMBERS) { | ||
rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile); | ||
rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize); | ||
} | ||
rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile); | ||
rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize); | ||
representativeQueryName = rec.getReadName(); | ||
} | ||
} | ||
} | ||
|
@@ -400,23 +406,6 @@ protected int doWork() { | |
UmiUtil.setMolecularIdentifier(rec, "", MOLECULAR_IDENTIFIER_TAG, DUPLEX_UMI); | ||
} | ||
|
||
// Tag any read pair that was in a duplicate set with the duplicate set size and a representative read name | ||
if (TAG_DUPLICATE_SET_MEMBERS) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. was this code really copy-pasted like this all along and nobody noticed? 😞 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah... that does appear to be the case |
||
final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex; | ||
if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) { | ||
rri = representativeReadIterator.next(); | ||
nextRepresentativeIndex = rri.readIndexInFile; | ||
representativeReadIndexInFile = rri.representativeReadIndexInFile; | ||
duplicateSetSize = rri.setSize; | ||
} | ||
final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex || | ||
(sortOrder == SAMFileHeader.SortOrder.queryname && | ||
recordInFileIndex > nextDuplicateIndex); | ||
if (isInDuplicateSet && !rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag() && TAG_DUPLICATE_SET_MEMBERS) { | ||
rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile); | ||
rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize); | ||
} | ||
} | ||
|
||
// Note, duplicateQueryName must be incremented after we have marked both optical and sequencing duplicates for queryname sorted files. | ||
if (isDuplicate) { | ||
|
@@ -874,7 +863,9 @@ private void addRepresentativeReadIndex(final List<ReadEndsForMarkDuplicates> li | |
// for read name (for representative read name), add the last of the pair that was examined | ||
for (final ReadEndsForMarkDuplicates end : list) { | ||
addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read1IndexInFile); | ||
addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read2IndexInFile); | ||
if (end.read1IndexInFile != end.read2IndexInFile) { | ||
addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read2IndexInFile); | ||
} | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,6 +29,7 @@ | |
import htsjdk.samtools.metrics.MetricsFile; | ||
import htsjdk.samtools.util.CloseableIterator; | ||
import htsjdk.samtools.util.FormatUtil; | ||
import htsjdk.samtools.util.Histogram; | ||
import htsjdk.samtools.util.IOUtil; | ||
import org.testng.Assert; | ||
import picard.cmdline.CommandLineProgram; | ||
|
@@ -40,7 +41,11 @@ | |
import java.io.FileNotFoundException; | ||
import java.io.FileReader; | ||
import java.io.IOException; | ||
import java.util.Arrays; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Set; | ||
|
||
/** | ||
|
@@ -53,6 +58,7 @@ abstract public class AbstractMarkDuplicatesCommandLineProgramTester extends Sam | |
final DuplicationMetrics expectedMetrics; | ||
|
||
boolean testOpticalDuplicateDTTag = false; | ||
final public Map<List<String>, Double> expectedSetSizeMap = new HashMap<>(); | ||
|
||
public AbstractMarkDuplicatesCommandLineProgramTester(final ScoringStrategy duplicateScoringStrategy, SAMFileHeader.SortOrder sortOrder) { | ||
this(duplicateScoringStrategy, sortOrder, true); | ||
|
@@ -139,26 +145,44 @@ public void setExpectedOpticalDuplicate(final int expectedOpticalDuplicatePairs) | |
expectedMetrics.READ_PAIR_OPTICAL_DUPLICATES = expectedOpticalDuplicatePairs; | ||
} | ||
|
||
@Override | ||
public void test() throws IOException { | ||
testMetrics(); | ||
|
||
/** | ||
* This method is called before iterating through output records in test method. | ||
* Should be used to update any expectations to be used in implementation | ||
* specific tests. | ||
**/ | ||
void updateExpectationsHook() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. solid refactoring of the testing infrastructure. |
||
} | ||
|
||
/** | ||
* Runs test and returns metrics | ||
* @return Duplication metrics | ||
* @throws IOException | ||
* This method is called for each record in the output. Should be used | ||
* to perform any additional implementation specific tests not included | ||
* in default test. | ||
**/ | ||
void testRecordHook(final SAMRecord record){ | ||
} | ||
|
||
/** | ||
* This method is called after iterating through output records. | ||
* Should be used to test any implementation specific methods not | ||
* included in default test | ||
*/ | ||
public MetricsFile<DuplicationMetrics, Double> testMetrics() throws IOException { | ||
void testOutputsHook() { | ||
} | ||
|
||
@Override | ||
public void test() throws IOException { | ||
try { | ||
updateExpectedDuplicationMetrics(); | ||
updateExpectationsHook(); | ||
|
||
// Read the output and check the duplicate flag | ||
int outputRecords = 0; | ||
final Set<String> sequencingDTErrorsSeen = new HashSet<>(); | ||
try(final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(fastaFiles.get(samRecordSetBuilder.getHeader())).open(getOutput())) { | ||
for (final SAMRecord record : reader) { | ||
outputRecords++; | ||
testRecordHook(record); | ||
final String key = samRecordToDuplicatesFlagsKey(record); | ||
Assert.assertTrue(this.duplicateFlags.containsKey(key), "DOES NOT CONTAIN KEY: " + key); | ||
final boolean value = this.duplicateFlags.get(key); | ||
|
@@ -197,7 +221,29 @@ public MetricsFile<DuplicationMetrics, Double> testMetrics() throws IOException | |
Assert.assertEquals(sequencingDTErrorsSeen.size(), expectedMetrics.READ_PAIR_OPTICAL_DUPLICATES, "READ_PAIR_OPTICAL_DUPLICATES does not match duplicate groups observed in the file"); | ||
Assert.assertEquals(sequencingDTErrorsSeen.size(), observedMetrics.READ_PAIR_OPTICAL_DUPLICATES, "READ_PAIR_OPTICAL_DUPLICATES does not match duplicate groups observed in the file"); | ||
} | ||
return metricsOutput; | ||
|
||
if (!expectedSetSizeMap.isEmpty()) { | ||
boolean checked = false; | ||
for (final Histogram<Double> histo : metricsOutput.getAllHistograms()) { | ||
final String label = histo.getValueLabel(); | ||
for (final Double bin : histo.keySet()) { | ||
final String binStr = String.valueOf(bin); | ||
final List<String> labelBinStr = Arrays.asList(label, binStr); | ||
if (expectedSetSizeMap.containsKey(labelBinStr)) { | ||
checked = true; | ||
Histogram.Bin<Double> binValue = histo.get(bin); | ||
final double actual = binValue.getValue(); | ||
final double expected = expectedSetSizeMap.get(labelBinStr); | ||
Assert.assertEquals(actual, expected); | ||
} | ||
} | ||
} | ||
if (!checked) { | ||
Assert.fail("Could not not find matching entry for expectedSetSizeMap in metrics."); | ||
} | ||
} | ||
|
||
testOutputsHook(); | ||
} finally { | ||
IOUtil.recursiveDelete(getOutputDir().toPath()); | ||
} | ||
|
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh boy this code was not in a good state before you got to it...