Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Duplicate Set Index for queryname sorted input to MarkDuplicates #1843

Merged
merged 4 commits into from Feb 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
47 changes: 19 additions & 28 deletions src/main/java/picard/sam/markduplicates/MarkDuplicates.java
Expand Up @@ -319,12 +319,12 @@ protected int doWork() {
RepresentativeReadIndexer rri = null;
int representativeReadIndexInFile = -1;
int duplicateSetSize = -1;
int nextRepresentativeIndex = -1;
int nextReadInDuplicateSetIndex = -1;
if (TAG_DUPLICATE_SET_MEMBERS) {
representativeReadIterator = this.representativeReadIndicesForDuplicates.iterator();
if (representativeReadIterator.hasNext()) {
rri = representativeReadIterator.next();
nextRepresentativeIndex = rri.readIndexInFile;
nextReadInDuplicateSetIndex = rri.readIndexInFile;
representativeReadIndexInFile = rri.representativeReadIndexInFile;
duplicateSetSize = rri.setSize;
}
Expand All @@ -333,6 +333,7 @@ protected int doWork() {
final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Written");
final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator;
String duplicateQueryName = null;
String representativeQueryName = null;

while (iterator.hasNext()) {
final SAMRecord rec = iterator.next();
Expand Down Expand Up @@ -375,22 +376,27 @@ protected int doWork() {

// Tag any read pair that was in a duplicate set with the duplicate set size and a representative read name
if (TAG_DUPLICATE_SET_MEMBERS) {
final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex;
final boolean needNextRepresentativeIndex = recordInFileIndex > nextReadInDuplicateSetIndex &&
(sortOrder == SAMFileHeader.SortOrder.coordinate || !rec.getReadName().equals(representativeQueryName));
if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) {
rri = representativeReadIterator.next();
nextRepresentativeIndex = rri.readIndexInFile;
nextReadInDuplicateSetIndex = rri.readIndexInFile;
representativeReadIndexInFile = rri.representativeReadIndexInFile;
duplicateSetSize = rri.setSize;
}
final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex ||

/* If this record's index is readInDuplicateSetIndex, then it is in a duplicateset.
For queryname sorted data, we only have one representativeReadIndex entry per read name, so we need
to also look for additional reads with the same name.
*/
final boolean isInDuplicateSet = recordInFileIndex == nextReadInDuplicateSetIndex ||
(sortOrder == SAMFileHeader.SortOrder.queryname &&
recordInFileIndex > nextDuplicateIndex);
recordInFileIndex > nextReadInDuplicateSetIndex && rec.getReadName().equals(representativeQueryName));
if (isInDuplicateSet) {
if (!rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag()) {
if (TAG_DUPLICATE_SET_MEMBERS) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh boy this code was not in a good state before you got to it...

rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile);
rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize);
}
rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile);
rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize);
representativeQueryName = rec.getReadName();
}
}
}
Expand All @@ -400,23 +406,6 @@ protected int doWork() {
UmiUtil.setMolecularIdentifier(rec, "", MOLECULAR_IDENTIFIER_TAG, DUPLEX_UMI);
}

// Tag any read pair that was in a duplicate set with the duplicate set size and a representative read name
if (TAG_DUPLICATE_SET_MEMBERS) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

was this code really copy-pasted like this all along and nobody noticed? 😞

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah... that does appear to be the case

final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex;
if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) {
rri = representativeReadIterator.next();
nextRepresentativeIndex = rri.readIndexInFile;
representativeReadIndexInFile = rri.representativeReadIndexInFile;
duplicateSetSize = rri.setSize;
}
final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex ||
(sortOrder == SAMFileHeader.SortOrder.queryname &&
recordInFileIndex > nextDuplicateIndex);
if (isInDuplicateSet && !rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag() && TAG_DUPLICATE_SET_MEMBERS) {
rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile);
rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize);
}
}

// Note, duplicateQueryName must be incremented after we have marked both optical and sequencing duplicates for queryname sorted files.
if (isDuplicate) {
Expand Down Expand Up @@ -874,7 +863,9 @@ private void addRepresentativeReadIndex(final List<ReadEndsForMarkDuplicates> li
// for read name (for representative read name), add the last of the pair that was examined
for (final ReadEndsForMarkDuplicates end : list) {
addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read1IndexInFile);
addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read2IndexInFile);
if (end.read1IndexInFile != end.read2IndexInFile) {
addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read2IndexInFile);
}
}
}

Expand Down
Expand Up @@ -29,6 +29,7 @@
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.FormatUtil;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.IOUtil;
import org.testng.Assert;
import picard.cmdline.CommandLineProgram;
Expand All @@ -40,7 +41,11 @@
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
Expand All @@ -53,6 +58,7 @@ abstract public class AbstractMarkDuplicatesCommandLineProgramTester extends Sam
final DuplicationMetrics expectedMetrics;

boolean testOpticalDuplicateDTTag = false;
final public Map<List<String>, Double> expectedSetSizeMap = new HashMap<>();

public AbstractMarkDuplicatesCommandLineProgramTester(final ScoringStrategy duplicateScoringStrategy, SAMFileHeader.SortOrder sortOrder) {
this(duplicateScoringStrategy, sortOrder, true);
Expand Down Expand Up @@ -139,26 +145,44 @@ public void setExpectedOpticalDuplicate(final int expectedOpticalDuplicatePairs)
expectedMetrics.READ_PAIR_OPTICAL_DUPLICATES = expectedOpticalDuplicatePairs;
}

@Override
public void test() throws IOException {
testMetrics();

/**
* This method is called before iterating through output records in test method.
* Should be used to update any expectations to be used in implementation
* specific tests.
**/
void updateExpectationsHook() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

solid refactoring of the testing infrastructure.

}

/**
* Runs test and returns metrics
* @return Duplication metrics
* @throws IOException
* This method is called for each record in the output. Should be used
* to perform any additional implementation specific tests not included
* in default test.
**/
void testRecordHook(final SAMRecord record){
}

/**
* This method is called after iterating through output records.
* Should be used to test any implementation specific methods not
* included in default test
*/
public MetricsFile<DuplicationMetrics, Double> testMetrics() throws IOException {
void testOutputsHook() {
}

@Override
public void test() throws IOException {
try {
updateExpectedDuplicationMetrics();
updateExpectationsHook();

// Read the output and check the duplicate flag
int outputRecords = 0;
final Set<String> sequencingDTErrorsSeen = new HashSet<>();
try(final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(fastaFiles.get(samRecordSetBuilder.getHeader())).open(getOutput())) {
for (final SAMRecord record : reader) {
outputRecords++;
testRecordHook(record);
final String key = samRecordToDuplicatesFlagsKey(record);
Assert.assertTrue(this.duplicateFlags.containsKey(key), "DOES NOT CONTAIN KEY: " + key);
final boolean value = this.duplicateFlags.get(key);
Expand Down Expand Up @@ -197,7 +221,29 @@ public MetricsFile<DuplicationMetrics, Double> testMetrics() throws IOException
Assert.assertEquals(sequencingDTErrorsSeen.size(), expectedMetrics.READ_PAIR_OPTICAL_DUPLICATES, "READ_PAIR_OPTICAL_DUPLICATES does not match duplicate groups observed in the file");
Assert.assertEquals(sequencingDTErrorsSeen.size(), observedMetrics.READ_PAIR_OPTICAL_DUPLICATES, "READ_PAIR_OPTICAL_DUPLICATES does not match duplicate groups observed in the file");
}
return metricsOutput;

if (!expectedSetSizeMap.isEmpty()) {
boolean checked = false;
for (final Histogram<Double> histo : metricsOutput.getAllHistograms()) {
final String label = histo.getValueLabel();
for (final Double bin : histo.keySet()) {
final String binStr = String.valueOf(bin);
final List<String> labelBinStr = Arrays.asList(label, binStr);
if (expectedSetSizeMap.containsKey(labelBinStr)) {
checked = true;
Histogram.Bin<Double> binValue = histo.get(bin);
final double actual = binValue.getValue();
final double expected = expectedSetSizeMap.get(labelBinStr);
Assert.assertEquals(actual, expected);
}
}
}
if (!checked) {
Assert.fail("Could not not find matching entry for expectedSetSizeMap in metrics.");
}
}

testOutputsHook();
} finally {
IOUtil.recursiveDelete(getOutputDir().toPath());
}
Expand Down
Expand Up @@ -84,9 +84,8 @@ public CollectDuplicateMetricsTester( final SAMFileHeader.SortOrder sortOrder) {
public void recordOpticalDuplicatesMarked() {}

@Override
public void test() throws IOException {
void updateExpectationsHook() {
// CollectDuplicateMetrics cannot identify Optical duplicates....
setExpectedOpticalDuplicate(0);
super.test();
}
}
Expand Up @@ -35,16 +35,16 @@
*/
public class MarkDuplicatesSetSizeHistogramTest extends AbstractMarkDuplicatesCommandLineProgramTest {
@Override
protected MarkDuplicatesSetSizeHistogramTester getTester() {
return new MarkDuplicatesSetSizeHistogramTester();
protected MarkDuplicatesTester getTester() {
return new MarkDuplicatesTester();
}

@Test
public void TestSingleSet() {
// This tests checks that if I add two read pairs with the same alignment start, a duplicate
// set size entry of 2 is put into the histogram. The pair is an optical dup, so a 2 entry for
// the 'optical_sets' should also be marked
final MarkDuplicatesSetSizeHistogramTester tester = getTester();
final MarkDuplicatesTester tester = getTester();
tester.setExpectedOpticalDuplicate(1);
String representativeReadName = "RUNID:1:1:16020:13352";
tester.addMatePair(representativeReadName, 1, 485253, 485253, false, false, false, false, "45M", "45M", false, true, false, false, false, DEFAULT_BASE_QUALITY);
Expand All @@ -59,7 +59,7 @@ public void TestSingleSet() {
public void testOpticalAndNonOpticalSet() {
// This tests checks that if I have two optical dups and one non-optical in the same dup set that the correct histogram entries
// are specified
final MarkDuplicatesSetSizeHistogramTester tester = getTester();
final MarkDuplicatesTester tester = getTester();
tester.setExpectedOpticalDuplicate(2);
String representativeReadName = "RUNID:1:1:16020:13352";
tester.addMatePair(representativeReadName, 1, 485253, 485253, false, false, false, false, "45M", "45M", false, true, false, false, false, DEFAULT_BASE_QUALITY);
Expand All @@ -75,7 +75,7 @@ public void testOpticalAndNonOpticalSet() {
@Test
public void testSingleton() {
// This tests checks if having a read pair that is not duplicated, the correct histogram entry is updated
final MarkDuplicatesSetSizeHistogramTester tester = getTester();
final MarkDuplicatesTester tester = getTester();
tester.setExpectedOpticalDuplicate(0);
// Add non-duplicate read
tester.addMatePair("RUNID:1:1:15993:13375", 1, 485255, 485255, false, false, false, false, "43M2S", "43M2S", false, true, false, false, false, DEFAULT_BASE_QUALITY);
Expand All @@ -86,7 +86,7 @@ public void testSingleton() {
@Test
public void testMultiRepresentativeReadTags() {
// This tests checks multiple different duplicate sets of varying sizes
final MarkDuplicatesSetSizeHistogramTester tester = getTester();
final MarkDuplicatesTester tester = getTester();
tester.setExpectedOpticalDuplicate(3);
// Duplicate set: size 2 - all optical
String representativeReadName1 = "RUNID:1:1:16020:13352";
Expand Down

This file was deleted.