-
Notifications
You must be signed in to change notification settings - Fork 365
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix Duplicate Set Index for queryname sorted input to MarkDuplicates #1843
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -333,6 +333,7 @@ protected int doWork() { | |
final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Written"); | ||
final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator; | ||
String duplicateQueryName = null; | ||
String representativeQueryName = null; | ||
|
||
while (iterator.hasNext()) { | ||
final SAMRecord rec = iterator.next(); | ||
|
@@ -375,7 +376,8 @@ protected int doWork() { | |
|
||
// Tag any read pair that was in a duplicate set with the duplicate set size and a representative read name | ||
if (TAG_DUPLICATE_SET_MEMBERS) { | ||
final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex; | ||
final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex && | ||
(sortOrder == SAMFileHeader.SortOrder.coordinate || !rec.getReadName().equals(representativeQueryName)); | ||
if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) { | ||
rri = representativeReadIterator.next(); | ||
nextRepresentativeIndex = rri.readIndexInFile; | ||
|
@@ -384,13 +386,12 @@ protected int doWork() { | |
} | ||
final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex || | ||
(sortOrder == SAMFileHeader.SortOrder.queryname && | ||
recordInFileIndex > nextDuplicateIndex); | ||
recordInFileIndex > nextRepresentativeIndex && rec.getReadName().equals(representativeQueryName)); | ||
if (isInDuplicateSet) { | ||
if (!rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag()) { | ||
if (TAG_DUPLICATE_SET_MEMBERS) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh boy this code was not in a good state before you got to it... |
||
rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile); | ||
rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize); | ||
} | ||
rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile); | ||
rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize); | ||
representativeQueryName = rec.getReadName(); | ||
} | ||
} | ||
} | ||
|
@@ -400,23 +401,6 @@ protected int doWork() { | |
UmiUtil.setMolecularIdentifier(rec, "", MOLECULAR_IDENTIFIER_TAG, DUPLEX_UMI); | ||
} | ||
|
||
// Tag any read pair that was in a duplicate set with the duplicate set size and a representative read name | ||
if (TAG_DUPLICATE_SET_MEMBERS) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. was this code really copy-pasted like this all along and nobody noticed? 😞 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah... that does appear to be the case |
||
final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex; | ||
if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) { | ||
rri = representativeReadIterator.next(); | ||
nextRepresentativeIndex = rri.readIndexInFile; | ||
representativeReadIndexInFile = rri.representativeReadIndexInFile; | ||
duplicateSetSize = rri.setSize; | ||
} | ||
final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex || | ||
(sortOrder == SAMFileHeader.SortOrder.queryname && | ||
recordInFileIndex > nextDuplicateIndex); | ||
if (isInDuplicateSet && !rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag() && TAG_DUPLICATE_SET_MEMBERS) { | ||
rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile); | ||
rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize); | ||
} | ||
} | ||
|
||
// Note, duplicateQueryName must be incremented after we have marked both optical and sequencing duplicates for queryname sorted files. | ||
if (isDuplicate) { | ||
|
@@ -874,7 +858,9 @@ private void addRepresentativeReadIndex(final List<ReadEndsForMarkDuplicates> li | |
// for read name (for representative read name), add the last of the pair that was examined | ||
for (final ReadEndsForMarkDuplicates end : list) { | ||
addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read1IndexInFile); | ||
addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read2IndexInFile); | ||
if (end.read1IndexInFile != end.read2IndexInFile) { | ||
addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read2IndexInFile); | ||
} | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,6 +29,7 @@ | |
import htsjdk.samtools.metrics.MetricsFile; | ||
import htsjdk.samtools.util.CloseableIterator; | ||
import htsjdk.samtools.util.FormatUtil; | ||
import htsjdk.samtools.util.Histogram; | ||
import htsjdk.samtools.util.IOUtil; | ||
import org.testng.Assert; | ||
import picard.cmdline.CommandLineProgram; | ||
|
@@ -40,7 +41,11 @@ | |
import java.io.FileNotFoundException; | ||
import java.io.FileReader; | ||
import java.io.IOException; | ||
import java.util.Arrays; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Set; | ||
|
||
/** | ||
|
@@ -53,6 +58,7 @@ abstract public class AbstractMarkDuplicatesCommandLineProgramTester extends Sam | |
final DuplicationMetrics expectedMetrics; | ||
|
||
boolean testOpticalDuplicateDTTag = false; | ||
final public Map<List<String>, Double> expectedSetSizeMap = new HashMap<>(); | ||
|
||
public AbstractMarkDuplicatesCommandLineProgramTester(final ScoringStrategy duplicateScoringStrategy, SAMFileHeader.SortOrder sortOrder) { | ||
this(duplicateScoringStrategy, sortOrder, true); | ||
|
@@ -139,26 +145,44 @@ public void setExpectedOpticalDuplicate(final int expectedOpticalDuplicatePairs) | |
expectedMetrics.READ_PAIR_OPTICAL_DUPLICATES = expectedOpticalDuplicatePairs; | ||
} | ||
|
||
@Override | ||
public void test() throws IOException { | ||
testMetrics(); | ||
|
||
/** | ||
* This method is called before iterating through output records in test method. | ||
* Should be used to update any expectations to be used in implementation | ||
* specific tests. | ||
**/ | ||
void updateExpectationsHook() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. solid refactoring of the testing infrastructure. |
||
} | ||
|
||
/** | ||
* Runs test and returns metrics | ||
* @return Duplication metrics | ||
* @throws IOException | ||
* This method is called for each record in the output. Should be used | ||
* to perform any additional implementation specific tests not included | ||
* in default test. | ||
**/ | ||
void testRecordHook(final SAMRecord record){ | ||
} | ||
|
||
/** | ||
* This method is called after iterating through output records. | ||
* Should be used to test any implementation specific methods not | ||
* included in default test | ||
*/ | ||
public MetricsFile<DuplicationMetrics, Double> testMetrics() throws IOException { | ||
void testOutputsHook() { | ||
} | ||
|
||
@Override | ||
public void test() throws IOException { | ||
try { | ||
updateExpectedDuplicationMetrics(); | ||
updateExpectationsHook(); | ||
|
||
// Read the output and check the duplicate flag | ||
int outputRecords = 0; | ||
final Set<String> sequencingDTErrorsSeen = new HashSet<>(); | ||
try(final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(fastaFiles.get(samRecordSetBuilder.getHeader())).open(getOutput())) { | ||
for (final SAMRecord record : reader) { | ||
outputRecords++; | ||
testRecordHook(record); | ||
final String key = samRecordToDuplicatesFlagsKey(record); | ||
Assert.assertTrue(this.duplicateFlags.containsKey(key), "DOES NOT CONTAIN KEY: " + key); | ||
final boolean value = this.duplicateFlags.get(key); | ||
|
@@ -197,7 +221,29 @@ public MetricsFile<DuplicationMetrics, Double> testMetrics() throws IOException | |
Assert.assertEquals(sequencingDTErrorsSeen.size(), expectedMetrics.READ_PAIR_OPTICAL_DUPLICATES, "READ_PAIR_OPTICAL_DUPLICATES does not match duplicate groups observed in the file"); | ||
Assert.assertEquals(sequencingDTErrorsSeen.size(), observedMetrics.READ_PAIR_OPTICAL_DUPLICATES, "READ_PAIR_OPTICAL_DUPLICATES does not match duplicate groups observed in the file"); | ||
} | ||
return metricsOutput; | ||
|
||
if (!expectedSetSizeMap.isEmpty()) { | ||
boolean checked = false; | ||
for (final Histogram<Double> histo : metricsOutput.getAllHistograms()) { | ||
final String label = histo.getValueLabel(); | ||
for (final Double bin : histo.keySet()) { | ||
final String binStr = String.valueOf(bin); | ||
final List<String> labelBinStr = Arrays.asList(label, binStr); | ||
if (expectedSetSizeMap.containsKey(labelBinStr)) { | ||
checked = true; | ||
Histogram.Bin<Double> binValue = histo.get(bin); | ||
final double actual = binValue.getValue(); | ||
final double expected = expectedSetSizeMap.get(labelBinStr); | ||
Assert.assertEquals(actual, expected); | ||
} | ||
} | ||
} | ||
if (!checked) { | ||
Assert.fail("Could not not find matching entry for expectedSetSizeMap in metrics."); | ||
} | ||
} | ||
|
||
testOutputsHook(); | ||
} finally { | ||
IOUtil.recursiveDelete(getOutputDir().toPath()); | ||
} | ||
|
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you rename this/comment the logic for "isInDuplicateSet" here to clairify this? This took a while to pick through what all of the steps are doing...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe "Is RepresentativeReadForADuplicateSet"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So this essentially says that if we are in queryname sorted mode (It should be noted this won't work if we are in query grouped mode even though the logic should be the same, maybe that check should be updated), we mark both the first read and its mates with the duplicate sets but if we are coordinate sorted the mates don't get the flag? Is that asymmetrical behavior desirable?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah this is super confusing, I actually couldn't figure out how this even works just now until I realized the variables are horribly named.
I believe that mates should also be marked with duplicate sets when the input is coordinate sorted as well (which is confirmed by our tests). The reason is that in
addRepresentativeReadIndex
we add an entry for both reads of the fragment ifend.read1IndexInFile != end.read2IndexInFile
. And when the ReadEnds object gets built we keep those entries the same for queryname sorted, but different for coordinate sorted (see L548 and thereabouts). I think this really goes back to your overall point that this code has become mangled to the point obfuscation. Which I completely agree with, but don't know that anyone has the capacity to fix, unfortunately.Regardless, I will try to at least make this part a little clearer