Fix Duplicate Set Index for queryname sorted input to MarkDuplicates (#…

…1843)
broadinstitute · Feb 13, 2023 · 21ccdb5 · 21ccdb5
1 parent 7ff09b8
commit 21ccdb5
Show file tree

Hide file tree

Showing 9 changed files with 183 additions and 256 deletions.
diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicates.java b/src/main/java/picard/sam/markduplicates/MarkDuplicates.java
@@ -319,12 +319,12 @@ protected int doWork() {
             RepresentativeReadIndexer rri = null;
             int representativeReadIndexInFile = -1;
             int duplicateSetSize = -1;
-            int nextRepresentativeIndex = -1;
+            int nextReadInDuplicateSetIndex = -1;
             if (TAG_DUPLICATE_SET_MEMBERS) {
                 representativeReadIterator = this.representativeReadIndicesForDuplicates.iterator();
                 if (representativeReadIterator.hasNext()) {
                     rri = representativeReadIterator.next();
-                    nextRepresentativeIndex = rri.readIndexInFile;
+                    nextReadInDuplicateSetIndex = rri.readIndexInFile;
                     representativeReadIndexInFile = rri.representativeReadIndexInFile;
                     duplicateSetSize = rri.setSize;
                 }
@@ -333,6 +333,7 @@ protected int doWork() {
             final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Written");
             final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator;
             String duplicateQueryName = null;
+            String representativeQueryName = null;
 
             while (iterator.hasNext()) {
                 final SAMRecord rec = iterator.next();
@@ -375,22 +376,27 @@ protected int doWork() {
 
                 // Tag any read pair that was in a duplicate set with the duplicate set size and a representative read name
                 if (TAG_DUPLICATE_SET_MEMBERS) {
-                    final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex;
+                    final boolean needNextRepresentativeIndex = recordInFileIndex > nextReadInDuplicateSetIndex &&
+                            (sortOrder == SAMFileHeader.SortOrder.coordinate || !rec.getReadName().equals(representativeQueryName));
                     if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) {
                         rri = representativeReadIterator.next();
-                        nextRepresentativeIndex = rri.readIndexInFile;
+                        nextReadInDuplicateSetIndex = rri.readIndexInFile;
                         representativeReadIndexInFile = rri.representativeReadIndexInFile;
                         duplicateSetSize = rri.setSize;
                     }
-                    final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex ||
+
+                    /* If this record's index is readInDuplicateSetIndex, then it is in a duplicateset.
+                    For queryname sorted data, we only have one representativeReadIndex entry per read name, so we need
+                    to also look for additional reads with the same name.
+                     */
+                    final boolean isInDuplicateSet = recordInFileIndex == nextReadInDuplicateSetIndex ||
                             (sortOrder == SAMFileHeader.SortOrder.queryname &&
-                                    recordInFileIndex > nextDuplicateIndex);
+                                    recordInFileIndex > nextReadInDuplicateSetIndex && rec.getReadName().equals(representativeQueryName));
                     if (isInDuplicateSet) {
                         if (!rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag()) {
-                            if (TAG_DUPLICATE_SET_MEMBERS) {
-                                rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile);
-                                rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize);
-                            }
+                            rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile);
+                            rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize);
+                            representativeQueryName = rec.getReadName();
                         }
                     }
                 }
@@ -400,23 +406,6 @@ protected int doWork() {
                     UmiUtil.setMolecularIdentifier(rec, "", MOLECULAR_IDENTIFIER_TAG, DUPLEX_UMI);
                 }
 
-                // Tag any read pair that was in a duplicate set with the duplicate set size and a representative read name
-                if (TAG_DUPLICATE_SET_MEMBERS) {
-                    final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex;
-                    if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) {
-                        rri = representativeReadIterator.next();
-                        nextRepresentativeIndex = rri.readIndexInFile;
-                        representativeReadIndexInFile = rri.representativeReadIndexInFile;
-                        duplicateSetSize = rri.setSize;
-                    }
-                    final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex ||
-                            (sortOrder == SAMFileHeader.SortOrder.queryname &&
-                                    recordInFileIndex > nextDuplicateIndex);
-                    if (isInDuplicateSet && !rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag() && TAG_DUPLICATE_SET_MEMBERS) {
-                        rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile);
-                        rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize);
-                    }
-                }
 
                 // Note, duplicateQueryName must be incremented after we have marked both optical and sequencing duplicates for queryname sorted files.
                 if (isDuplicate) {
@@ -874,7 +863,9 @@ private void addRepresentativeReadIndex(final List<ReadEndsForMarkDuplicates> li
         // for read name (for representative read name), add the last of the pair that was examined
         for (final ReadEndsForMarkDuplicates end : list) {
             addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read1IndexInFile);
-            addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read2IndexInFile);
+            if (end.read1IndexInFile != end.read2IndexInFile) {
+                addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read2IndexInFile);
+            }
         }
     }
 

diff --git a/src/test/java/picard/sam/markduplicates/AbstractMarkDuplicatesCommandLineProgramTester.java b/src/test/java/picard/sam/markduplicates/AbstractMarkDuplicatesCommandLineProgramTester.java
@@ -29,6 +29,7 @@
 import htsjdk.samtools.metrics.MetricsFile;
 import htsjdk.samtools.util.CloseableIterator;
 import htsjdk.samtools.util.FormatUtil;
+import htsjdk.samtools.util.Histogram;
 import htsjdk.samtools.util.IOUtil;
 import org.testng.Assert;
 import picard.cmdline.CommandLineProgram;
@@ -40,7 +41,11 @@
 import java.io.FileNotFoundException;
 import java.io.FileReader;
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
 import java.util.Set;
 
 /**
@@ -53,6 +58,7 @@ abstract public class AbstractMarkDuplicatesCommandLineProgramTester extends Sam
     final DuplicationMetrics expectedMetrics;
 
     boolean testOpticalDuplicateDTTag = false;
+    final public Map<List<String>, Double> expectedSetSizeMap = new HashMap<>();
 
     public AbstractMarkDuplicatesCommandLineProgramTester(final ScoringStrategy duplicateScoringStrategy, SAMFileHeader.SortOrder sortOrder) {
         this(duplicateScoringStrategy, sortOrder, true);
@@ -139,26 +145,44 @@ public void setExpectedOpticalDuplicate(final int expectedOpticalDuplicatePairs)
         expectedMetrics.READ_PAIR_OPTICAL_DUPLICATES = expectedOpticalDuplicatePairs;
     }
 
-    @Override
-    public void test() throws IOException {
-        testMetrics();
+
+    /**
+    * This method is called before iterating through output records in test method.
+    * Should be used to update any expectations to be used in implementation
+    * specific tests.
+     **/
+    void updateExpectationsHook() {
     }
 
     /**
-     * Runs test and returns metrics
-     * @return Duplication metrics
-     * @throws IOException
+    * This method is called for each record in the output.  Should be used
+    * to perform any additional implementation specific tests not included
+     * in default test.
+     **/
+    void testRecordHook(final SAMRecord record){
+    }
+
+    /**
+     * This method is called after iterating through output records.
+     * Should be used to test any implementation specific methods not
+     * included in default test
      */
-    public MetricsFile<DuplicationMetrics, Double> testMetrics() throws IOException {
+    void testOutputsHook() {
+    }
+
+    @Override
+    public void test() throws IOException {
         try {
             updateExpectedDuplicationMetrics();
+            updateExpectationsHook();
 
             // Read the output and check the duplicate flag
             int outputRecords = 0;
             final Set<String> sequencingDTErrorsSeen = new HashSet<>();
             try(final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(fastaFiles.get(samRecordSetBuilder.getHeader())).open(getOutput())) {
                 for (final SAMRecord record : reader) {
                     outputRecords++;
+                    testRecordHook(record);
                     final String key = samRecordToDuplicatesFlagsKey(record);
                     Assert.assertTrue(this.duplicateFlags.containsKey(key), "DOES NOT CONTAIN KEY: " + key);
                     final boolean value = this.duplicateFlags.get(key);
@@ -197,7 +221,29 @@ public MetricsFile<DuplicationMetrics, Double> testMetrics() throws IOException
                 Assert.assertEquals(sequencingDTErrorsSeen.size(), expectedMetrics.READ_PAIR_OPTICAL_DUPLICATES, "READ_PAIR_OPTICAL_DUPLICATES does not match duplicate groups observed in the file");
                 Assert.assertEquals(sequencingDTErrorsSeen.size(), observedMetrics.READ_PAIR_OPTICAL_DUPLICATES, "READ_PAIR_OPTICAL_DUPLICATES does not match duplicate groups observed in the file");
             }
-            return metricsOutput;
+
+            if (!expectedSetSizeMap.isEmpty()) {
+                boolean checked = false;
+                for (final Histogram<Double> histo : metricsOutput.getAllHistograms()) {
+                    final String label = histo.getValueLabel();
+                    for (final Double bin : histo.keySet()) {
+                        final String binStr = String.valueOf(bin);
+                        final List<String> labelBinStr = Arrays.asList(label, binStr);
+                        if (expectedSetSizeMap.containsKey(labelBinStr)) {
+                            checked = true;
+                            Histogram.Bin<Double> binValue = histo.get(bin);
+                            final double actual = binValue.getValue();
+                            final double expected = expectedSetSizeMap.get(labelBinStr);
+                            Assert.assertEquals(actual, expected);
+                        }
+                    }
+                }
+                if (!checked) {
+                    Assert.fail("Could not not find matching entry for expectedSetSizeMap in metrics.");
+                }
+            }
+
+            testOutputsHook();
         } finally {
             IOUtil.recursiveDelete(getOutputDir().toPath());
         }

diff --git a/src/test/java/picard/sam/markduplicates/CollectDuplicateMetricsTester.java b/src/test/java/picard/sam/markduplicates/CollectDuplicateMetricsTester.java
@@ -84,9 +84,8 @@ public CollectDuplicateMetricsTester( final SAMFileHeader.SortOrder sortOrder) {
     public void recordOpticalDuplicatesMarked() {}
 
     @Override
-    public void test() throws IOException {
+    void updateExpectationsHook() {
         // CollectDuplicateMetrics cannot identify Optical duplicates....
         setExpectedOpticalDuplicate(0);
-        super.test();
     }
 }
diff --git a/src/test/java/picard/sam/markduplicates/MarkDuplicatesSetSizeHistogramTest.java b/src/test/java/picard/sam/markduplicates/MarkDuplicatesSetSizeHistogramTest.java
@@ -35,16 +35,16 @@
  */
 public class MarkDuplicatesSetSizeHistogramTest extends AbstractMarkDuplicatesCommandLineProgramTest {
     @Override
-    protected MarkDuplicatesSetSizeHistogramTester getTester() {
-        return new MarkDuplicatesSetSizeHistogramTester();
+    protected MarkDuplicatesTester getTester() {
+        return new MarkDuplicatesTester();
     }
 
     @Test
     public void TestSingleSet() {
         // This tests checks that if I add two read pairs with the same alignment start, a duplicate
         // set size entry of 2 is put into the histogram. The pair is an optical dup, so a 2 entry for
         // the 'optical_sets' should also be marked
-        final MarkDuplicatesSetSizeHistogramTester tester = getTester();
+        final MarkDuplicatesTester tester = getTester();
         tester.setExpectedOpticalDuplicate(1);
         String representativeReadName = "RUNID:1:1:16020:13352";
         tester.addMatePair(representativeReadName, 1, 485253, 485253, false, false, false, false, "45M", "45M", false, true, false, false, false, DEFAULT_BASE_QUALITY);
@@ -59,7 +59,7 @@ public void TestSingleSet() {
     public void testOpticalAndNonOpticalSet() {
         // This tests checks that if I have two optical dups and one non-optical in the same dup set that the correct histogram entries
         // are specified
-        final MarkDuplicatesSetSizeHistogramTester tester = getTester();
+        final MarkDuplicatesTester tester = getTester();
         tester.setExpectedOpticalDuplicate(2);
         String representativeReadName = "RUNID:1:1:16020:13352";
         tester.addMatePair(representativeReadName, 1, 485253, 485253, false, false, false, false, "45M", "45M", false, true, false, false, false, DEFAULT_BASE_QUALITY);
@@ -75,7 +75,7 @@ public void testOpticalAndNonOpticalSet() {
     @Test
     public void testSingleton() {
         // This tests checks if having a read pair that is not duplicated, the correct histogram entry is updated
-        final MarkDuplicatesSetSizeHistogramTester tester = getTester();
+        final MarkDuplicatesTester tester = getTester();
         tester.setExpectedOpticalDuplicate(0);
         // Add non-duplicate read
         tester.addMatePair("RUNID:1:1:15993:13375", 1, 485255, 485255, false, false, false, false, "43M2S", "43M2S", false, true, false, false, false, DEFAULT_BASE_QUALITY);
@@ -86,7 +86,7 @@ public void testSingleton() {
     @Test
     public void testMultiRepresentativeReadTags() {
         // This tests checks multiple different duplicate sets of varying sizes
-        final MarkDuplicatesSetSizeHistogramTester tester = getTester();
+        final MarkDuplicatesTester tester = getTester();
         tester.setExpectedOpticalDuplicate(3);
         // Duplicate set: size 2 - all optical
         String representativeReadName1 = "RUNID:1:1:16020:13352";

diff --git a/src/test/java/picard/sam/markduplicates/MarkDuplicatesSetSizeHistogramTester.java b/src/test/java/picard/sam/markduplicates/MarkDuplicatesSetSizeHistogramTester.java