Skip to content

Commit

Permalink
Optional flag to add an attribute in the SAM/BAM file used to store t…
Browse files Browse the repository at this point in the history
…he size of a duplicate set. A separate attribute is also added to store which read was selected as representative out of a duplicate set.
  • Loading branch information
lhogstrom committed Apr 26, 2017
1 parent 7a85c95 commit 335b198
Show file tree
Hide file tree
Showing 10 changed files with 460 additions and 21 deletions.
2 changes: 2 additions & 0 deletions settings.gradle
@@ -1 +1,3 @@
rootProject.name = "picard"
include 'picard_gradle_v2'

150 changes: 134 additions & 16 deletions src/main/java/picard/sam/markduplicates/MarkDuplicates.java
Expand Up @@ -37,16 +37,9 @@
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.samtools.util.SortingLongCollection;
import picard.sam.markduplicates.util.AbstractMarkDuplicatesCommandLineProgram;
import picard.sam.markduplicates.util.DiskBasedReadEndsForMarkDuplicatesMap;
import picard.sam.markduplicates.util.LibraryIdGenerator;
import picard.sam.markduplicates.util.ReadEnds;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicates;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesCodec;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesMap;
import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesWithBarcodes;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesWithBarcodesCodec;
import picard.sam.markduplicates.util.*;
import picard.sam.util.RepresentativeReadIndexer;

import java.io.*;
import java.util.*;
Expand Down Expand Up @@ -125,6 +118,10 @@ public enum DuplicateTaggingPolicy { DontTag, OpticalOnly, All }
public static final String DUPLICATE_TYPE_LIBRARY = "LB";
/** The duplicate type tag value for duplicate type: sequencing (optical & pad-hopping, or "co-localized"). */
public static final String DUPLICATE_TYPE_SEQUENCING = "SQ";
/** The attribute in the SAM/BAM file used to store which read was selected as representative out of a duplicate set */
public static final String DUPLICATE_SET_INDEX_TAG = "DI";
/** The attribute in the SAM/BAM file used to store the size of a duplicate set */
public static final String DUPLICATE_SET_SIZE_TAG = "DS";

/** Enum for the possible values that a duplicate read can be tagged with in the DT attribute. */
public enum DuplicateType {
Expand Down Expand Up @@ -165,6 +162,14 @@ public enum DuplicateType {
@Option(doc = "Read two barcode SAM tag (ex. BX for 10X Genomics)", optional = true)
public String READ_TWO_BARCODE_TAG = null;

@Option(doc = "If a read appears in a duplicate set, add two tags. The first tag, DUPLICATE_SET_SIZE_TAG (DS), " +
"indicates the size of the duplicate set. The smallest possible DS value is 2 which occurs when two " +
"reads map to the same portion of the reference only one of which is marked as duplicate. The second " +
"tag, DUPLICATE_SET_INDEX_TAG (DI), represents a unique identifier for the duplicate set to which the " +
"record belongs. This identifier is the index-in-file of the representative read that was selected out " +
"of the duplicate set.", optional = true)
public boolean TAG_DUPLICATE_SET_MEMBERS = false;

@Option(doc = "If true remove 'optical' duplicates and other duplicates that appear to have arisen from the " +
"sequencing process instead of the library preparation process, even if REMOVE_DUPLICATES is false. " +
"If REMOVE_DUPLICATES is true, all duplicates are removed and this option is ignored.")
Expand All @@ -177,6 +182,7 @@ public enum DuplicateType {
private SortingCollection<ReadEndsForMarkDuplicates> fragSort;
private SortingLongCollection duplicateIndexes;
private SortingLongCollection opticalDuplicateIndexes;
private SortingCollection<RepresentativeReadIndexer> representativeReadIndicesForDuplicates;

private int numDuplicateIndices = 0;
static private final long NO_SUCH_INDEX = Long.MAX_VALUE; // needs to be large so that that >= test fails for query-sorted traversal
Expand Down Expand Up @@ -259,6 +265,22 @@ protected int doWork() {
long nextOpticalDuplicateIndex = this.opticalDuplicateIndexes != null && this.opticalDuplicateIndexes.hasNext() ? this.opticalDuplicateIndexes.next() : NO_SUCH_INDEX;
long nextDuplicateIndex = (this.duplicateIndexes.hasNext() ? this.duplicateIndexes.next() : NO_SUCH_INDEX);

// initialize variables for optional representative read tagging
CloseableIterator<RepresentativeReadIndexer> representativeReadIterator = null;
RepresentativeReadIndexer rri = null;
int representativeReadIndexInFile = -1;
int duplicateSetSize = -1;
int nextRepresentativeIndex = -1;
if (TAG_DUPLICATE_SET_MEMBERS) {
representativeReadIterator = this.representativeReadIndicesForDuplicates.iterator();
if (representativeReadIterator.hasNext()) {
rri = representativeReadIterator.next();
nextRepresentativeIndex = rri.readIndexInFile;
representativeReadIndexInFile = rri.representativeReadIndexInFile;
duplicateSetSize = rri.setSize;
}
}

final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Written");
final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator;
String duplicateQueryName = null;
Expand Down Expand Up @@ -343,6 +365,28 @@ protected int doWork() {
}
}

// Tag any read pair that was in a duplicate set with the duplicate set size and a representative read name
if (TAG_DUPLICATE_SET_MEMBERS) {
final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex;
if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) {
rri = representativeReadIterator.next();
nextRepresentativeIndex = rri.readIndexInFile;
representativeReadIndexInFile = rri.representativeReadIndexInFile;
duplicateSetSize = rri.setSize;
}
final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex ||
(sortOrder == SAMFileHeader.SortOrder.queryname &&
recordInFileIndex > nextDuplicateIndex);
if (isInDuplicateSet) {
if (!rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag()) {
if (TAG_DUPLICATE_SET_MEMBERS) {
rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile);
rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize);
}
}
}
}

// Output the record if desired and bump the record index
recordInFileIndex++;
if (this.REMOVE_DUPLICATES && rec.getDuplicateReadFlag()) continue;
Expand All @@ -357,6 +401,9 @@ protected int doWork() {
iterator.close();

this.duplicateIndexes.cleanup();
if (TAG_DUPLICATE_SET_MEMBERS) {
this.representativeReadIndicesForDuplicates.cleanup();
}

reportMemoryStats("Before output close");
out.close();
Expand Down Expand Up @@ -508,8 +555,8 @@ private void buildSortedReadEndLists(final boolean useBarcodes) {
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
pairedEnds.orientation == ReadEnds.RF) {
pairedEnds.orientation = ReadEnds.FR;
}
pairedEnds.orientation = ReadEnds.FR;
}
} else {
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
Expand Down Expand Up @@ -600,18 +647,35 @@ private ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, fina
* @return an array with an ordered list of indexes into the source file
*/
private void generateDuplicateIndexes(final boolean useBarcodes, final boolean indexOpticalDuplicates) {
int entryOverhead;
if (TAG_DUPLICATE_SET_MEMBERS) {
// Memory requirements for RepresentativeReadIndexer:
// three int entries + overhead: (3 * 4) + 4 = 16 bytes
entryOverhead = 16;
}
else {
entryOverhead = SortingLongCollection.SIZEOF;
}
// Keep this number from getting too large even if there is a huge heap.
int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / SortingLongCollection.SIZEOF, (double) (Integer.MAX_VALUE - 5));
// If we're also tracking optical duplicates, cut maxInMemory in half, since we'll need two sorting collections
int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / entryOverhead, (double) (Integer.MAX_VALUE - 5));
// If we're also tracking optical duplicates, reduce maxInMemory, since we'll need two sorting collections
if (indexOpticalDuplicates) {
maxInMemory /= 2;
maxInMemory /= ((entryOverhead + SortingLongCollection.SIZEOF) / entryOverhead);
this.opticalDuplicateIndexes = new SortingLongCollection(maxInMemory, TMP_DIR.toArray(new File[TMP_DIR.size()]));
}
log.info("Will retain up to " + maxInMemory + " duplicate indices before spilling to disk.");
this.duplicateIndexes = new SortingLongCollection(maxInMemory, TMP_DIR.toArray(new File[TMP_DIR.size()]));
if (TAG_DUPLICATE_SET_MEMBERS) {
final RepresentativeReadIndexerCodec representativeIndexCodec = new RepresentativeReadIndexerCodec();
this.representativeReadIndicesForDuplicates = SortingCollection.newInstance(RepresentativeReadIndexer.class,
representativeIndexCodec,
new RepresentativeReadComparator(),
maxInMemory,
TMP_DIR);
}

ReadEndsForMarkDuplicates firstOfNextChunk = null;
final List<ReadEndsForMarkDuplicates> nextChunk = new ArrayList<ReadEndsForMarkDuplicates>(200);
final List nextChunk = new ArrayList<ReadEndsForMarkDuplicates>(200);

// First just do the pairs
log.info("Traversing read pair information and detecting duplicates.");
Expand All @@ -621,13 +685,17 @@ private void generateDuplicateIndexes(final boolean useBarcodes, final boolean i
} else {
if (nextChunk.size() > 1) {
markDuplicatePairs(nextChunk);
if (TAG_DUPLICATE_SET_MEMBERS) addRepresentativeReadIndex(nextChunk);
}
nextChunk.clear();
nextChunk.add(next);
firstOfNextChunk = next;
}
}
if (nextChunk.size() > 1) markDuplicatePairs(nextChunk);
if (nextChunk.size() > 1) {
markDuplicatePairs(nextChunk);
if (TAG_DUPLICATE_SET_MEMBERS) addRepresentativeReadIndex(nextChunk);
}
this.pairSort.cleanup();
this.pairSort = null;

Expand Down Expand Up @@ -661,6 +729,7 @@ private void generateDuplicateIndexes(final boolean useBarcodes, final boolean i
log.info("Sorting list of duplicate records.");
this.duplicateIndexes.doneAddingStartIteration();
if (this.opticalDuplicateIndexes != null) this.opticalDuplicateIndexes.doneAddingStartIteration();
if (TAG_DUPLICATE_SET_MEMBERS) this.representativeReadIndicesForDuplicates.doneAdding();
}

private boolean areComparableForDuplicates(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs, final boolean compareRead2, final boolean useBarcodes) {
Expand Down Expand Up @@ -693,6 +762,42 @@ private void addIndexAsDuplicate(final long bamIndex) {
++this.numDuplicateIndices;
}

/**
 * Records one member of a duplicate set in the sorting collection used later to tag reads.
 *
 * @param representativeReadIndexInFile index-in-file of the read chosen as the set's representative
 * @param setSize                       number of reads in the duplicate set
 * @param read1IndexInFile              index-in-file of the set member being recorded
 */
private void addRepresentativeReadOfDuplicateSet(final long representativeReadIndexInFile, final int setSize, final long read1IndexInFile) {
    final RepresentativeReadIndexer indexer = new RepresentativeReadIndexer();
    // Narrowing to int matches the int fields of RepresentativeReadIndexer and its codec.
    indexer.readIndexInFile = (int) read1IndexInFile;
    indexer.representativeReadIndexInFile = (int) representativeReadIndexInFile;
    indexer.setSize = setSize;
    this.representativeReadIndicesForDuplicates.add(indexer);
}

/**
 * Takes a list of ReadEndsForMarkDuplicates objects and identifies the representative read based on
 * quality score. For all members of the duplicate set, adds the read1 index-in-file of the
 * representative read to the records of both the first and second read in each pair. This value is
 * used for the 'DI' tag.
 *
 * @param list the duplicate set; callers invoke this only when {@code list.size() > 1}
 */
private void addRepresentativeReadIndex(final List<ReadEndsForMarkDuplicates> list) {
    short maxScore = 0;
    ReadEndsForMarkDuplicates best = null;

    // Select the highest-scoring read end as the representative of the set.
    // (All read ends should have orientation FF, FR, RF, or RR.)
    for (final ReadEndsForMarkDuplicates end : list) {
        if (best == null || end.score > maxScore) {
            maxScore = end.score;
            best = end;
        }
    }

    // Record the representative index for both ends of every pair in the set so that
    // both reads of each pair can be tagged when the output is written.
    for (final ReadEndsForMarkDuplicates end : list) {
        addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read1IndexInFile);
        addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read2IndexInFile);
    }
}


/**
* Takes a list of ReadEndsForMarkDuplicates objects and removes from it all objects that should
* not be marked as duplicates. This assumes that the list contains objects representing pairs.
Expand Down Expand Up @@ -795,4 +900,17 @@ public int compare(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDup
return compareDifference;
}
}

/** Orders representative-read entries by the index of the record in the input file. */
static class RepresentativeReadComparator implements Comparator<RepresentativeReadIndexer> {

    public RepresentativeReadComparator() {}

    @Override
    public int compare(final RepresentativeReadIndexer lhs, final RepresentativeReadIndexer rhs) {
        // Integer.compare avoids the overflow that plain subtraction can produce
        // when the operands are far apart (e.g. near Integer.MAX_VALUE vs. -1).
        return Integer.compare(lhs.readIndexInFile, rhs.readIndexInFile);
    }
}


}
Expand Up @@ -34,10 +34,10 @@ public class ReadEndsForMarkDuplicates extends ReadEnds implements Cloneable {
What do we need to store you ask? Well, we need to store:
- byte: orientation
- short: libraryId, readGroup, tile, x, y, score
- int: read1ReferenceIndex, read1Coordinate, read2ReferenceIndex, read2Coordinate
- int: read1ReferenceIndex, read1Coordinate, read2ReferenceIndex, read2Coordinate, duplicateSetSize
- long: read1IndexInFile, read2IndexInFile
*/
protected static final int SIZE_OF = (1 * 1) + (5 * 2) + (4 * 4) + (8 * 2) + 1
protected static final int SIZE_OF = (1 * 1) + (5 * 2) + (5 * 4) + (8 * 2) + 1
+ 8 + // last 8 == reference overhead
13; // This is determined experimentally with JProfiler

Expand All @@ -48,6 +48,7 @@ public static int getSizeOf() {
public short score = 0;
public long read1IndexInFile = -1;
public long read2IndexInFile = -1;
public int duplicateSetSize = -1;

public ReadEndsForMarkDuplicates() {}

Expand Down Expand Up @@ -76,4 +77,5 @@ public ReadEndsForMarkDuplicates(final ReadEndsForMarkDuplicates read) {
public ReadEndsForMarkDuplicates clone() {
return new ReadEndsForMarkDuplicates(this);
}

}
Expand Up @@ -69,6 +69,7 @@ public void encode(final ReadEndsForMarkDuplicates read) {
this.out.writeShort((short)read.x);
this.out.writeShort((short)read.y);
this.out.writeByte(read.orientationForOpticalDuplicates);
this.out.writeInt(read.duplicateSetSize);
} catch (final IOException ioe) {
throw new PicardException("Exception writing ReadEnds to file.", ioe);
}
Expand Down Expand Up @@ -102,6 +103,7 @@ public ReadEndsForMarkDuplicates decode() {
read.y = this.in.readShort();

read.orientationForOpticalDuplicates = this.in.readByte();
read.duplicateSetSize = this.in.readInt();

return read;
} catch (final IOException ioe) {
Expand Down
@@ -0,0 +1,79 @@
/*
* The MIT License
*
* Copyright (c) 2016 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam.markduplicates.util;

import htsjdk.samtools.util.SortingCollection;
import picard.PicardException;
import picard.sam.util.RepresentativeReadIndexer;

import java.io.*;

/** Codec for read names and integers that outputs the primitive fields and reads them back. */
public class RepresentativeReadIndexerCodec implements SortingCollection.Codec<RepresentativeReadIndexer> {
protected DataInputStream in;
protected DataOutputStream out;

public SortingCollection.Codec<RepresentativeReadIndexer> clone() {
return new RepresentativeReadIndexerCodec();
}

public void setOutputStream(final OutputStream os) { this.out = new DataOutputStream(os); }

public void setInputStream(final InputStream is) { this.in = new DataInputStream(is); }

public DataInputStream getInputStream() {
return in;
}

public DataOutputStream getOutputStream() {
return out;
}

public void encode(final RepresentativeReadIndexer rni) {
try {
this.out.writeInt(rni.readIndexInFile);
this.out.writeInt(rni.setSize);
this.out.writeInt(rni.representativeReadIndexInFile);
} catch (final IOException ioe) {
throw new PicardException("Exception writing ReadEnds to file.", ioe);
}
}

public RepresentativeReadIndexer decode() {
final RepresentativeReadIndexer rni = new RepresentativeReadIndexer();
try {
// If the first read results in an EOF we've exhausted the stream
try {
rni.readIndexInFile = this.in.readInt();
} catch (final EOFException eof) {
return null;
}
rni.setSize = this.in.readInt();
rni.representativeReadIndexInFile = this.in.readInt();
return rni;
} catch (final IOException ioe) {
throw new PicardException("Exception writing ReadEnds to file.", ioe);
}
}
}
11 changes: 11 additions & 0 deletions src/main/java/picard/sam/util/RepresentativeReadIndexer.java
@@ -0,0 +1,11 @@
package picard.sam.util;

/**
 * Little struct-like class to hold a record index, the index of the corresponding representative read, and duplicate set size information.
 */

public class RepresentativeReadIndexer {
    public int readIndexInFile = -1;               // index-in-file of the read this entry describes (-1 = unset)
    public int setSize = -1;                       // number of reads in the duplicate set (-1 = unset)
    public int representativeReadIndexInFile = -1; // index-in-file of the set's representative read (-1 = unset)
}
Expand Up @@ -598,5 +598,4 @@ public void testTwoMappedPairsWithSupplementaryReadsAfterCanonical(final Boolean
tester.addMappedFragment(fragLikeFirst ? "RUNID:1:1:15993:13361" : "RUNID:1:1:16020:13352", 1, 400, markSecondaryAndSupplementaryRecordsLikeTheCanonical() && !fragLikeFirst, null, null, additionalFragSecondary, additionalFragSupplementary, DEFAULT_BASE_QUALITY);
tester.runTest();
}

}

0 comments on commit 335b198

Please sign in to comment.