Permalink
Browse files

Optional flag to add an attribute in the SAM/BAM file used to store the size of a duplicate set. A separate attribute is also added to store which read was selected as representative out of a duplicate set.
  • Loading branch information...
1 parent 7a85c95 commit 335b198306e5b630af32512e601b41d5447bf8f4 @lhogstrom lhogstrom committed with lhogstrom Jun 13, 2016
View
@@ -1 +1,3 @@
rootProject.name = "picard"
+include 'picard_gradle_v2'
+
@@ -37,16 +37,9 @@
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.samtools.util.SortingLongCollection;
-import picard.sam.markduplicates.util.AbstractMarkDuplicatesCommandLineProgram;
-import picard.sam.markduplicates.util.DiskBasedReadEndsForMarkDuplicatesMap;
-import picard.sam.markduplicates.util.LibraryIdGenerator;
-import picard.sam.markduplicates.util.ReadEnds;
-import picard.sam.markduplicates.util.ReadEndsForMarkDuplicates;
-import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesCodec;
-import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesMap;
import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy;
-import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesWithBarcodes;
-import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesWithBarcodesCodec;
+import picard.sam.markduplicates.util.*;
+import picard.sam.util.RepresentativeReadIndexer;
import java.io.*;
import java.util.*;
@@ -125,6 +118,10 @@
public static final String DUPLICATE_TYPE_LIBRARY = "LB";
/** The duplicate type tag value for duplicate type: sequencing (optical & pad-hopping, or "co-localized"). */
public static final String DUPLICATE_TYPE_SEQUENCING = "SQ";
+ /** The attribute in the SAM/BAM file used to store which read was selected as representative out of a duplicate set */
+ public static final String DUPLICATE_SET_INDEX_TAG = "DI";
+ /** The attribute in the SAM/BAM file used to store the size of a duplicate set */
+ public static final String DUPLICATE_SET_SIZE_TAG = "DS";
/** Enum for the possible values that a duplicate read can be tagged with in the DT attribute. */
public enum DuplicateType {
@@ -165,6 +162,14 @@
@Option(doc = "Read two barcode SAM tag (ex. BX for 10X Genomics)", optional = true)
public String READ_TWO_BARCODE_TAG = null;
+ @Option(doc = "If a read appears in a duplicate set, add two tags. The first tag, DUPLICATE_SET_SIZE_TAG (DS), " +
+ "indicates the size of the duplicate set. The smallest possible DS value is 2 which occurs when two " +
+ "reads map to the same portion of the reference only one of which is marked as duplicate. The second " +
+ "tag, DUPLICATE_SET_INDEX_TAG (DI), represents a unique identifier for the duplicate set to which the " +
+ "record belongs. This identifier is the index-in-file of the representative read that was selected out " +
+ "of the duplicate set.", optional = true)
+ public boolean TAG_DUPLICATE_SET_MEMBERS = false;
+
@Option(doc = "If true remove 'optical' duplicates and other duplicates that appear to have arisen from the " +
"sequencing process instead of the library preparation process, even if REMOVE_DUPLICATES is false. " +
"If REMOVE_DUPLICATES is true, all duplicates are removed and this option is ignored.")
@@ -177,6 +182,7 @@
private SortingCollection<ReadEndsForMarkDuplicates> fragSort;
private SortingLongCollection duplicateIndexes;
private SortingLongCollection opticalDuplicateIndexes;
+ private SortingCollection<RepresentativeReadIndexer> representativeReadIndicesForDuplicates;
private int numDuplicateIndices = 0;
static private final long NO_SUCH_INDEX = Long.MAX_VALUE; // needs to be large so that that >= test fails for query-sorted traversal
@@ -259,6 +265,22 @@ protected int doWork() {
long nextOpticalDuplicateIndex = this.opticalDuplicateIndexes != null && this.opticalDuplicateIndexes.hasNext() ? this.opticalDuplicateIndexes.next() : NO_SUCH_INDEX;
long nextDuplicateIndex = (this.duplicateIndexes.hasNext() ? this.duplicateIndexes.next() : NO_SUCH_INDEX);
+ // initialize variables for optional representative read tagging
+ CloseableIterator<RepresentativeReadIndexer> representativeReadIterator = null;
+ RepresentativeReadIndexer rri = null;
+ int representativeReadIndexInFile = -1;
+ int duplicateSetSize = -1;
+ int nextRepresentativeIndex = -1;
+ if (TAG_DUPLICATE_SET_MEMBERS) {
+ representativeReadIterator = this.representativeReadIndicesForDuplicates.iterator();
+ if (representativeReadIterator.hasNext()) {
+ rri = representativeReadIterator.next();
+ nextRepresentativeIndex = rri.readIndexInFile;
+ representativeReadIndexInFile = rri.representativeReadIndexInFile;
+ duplicateSetSize = rri.setSize;
+ }
+ }
+
final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Written");
final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator;
String duplicateQueryName = null;
@@ -343,6 +365,28 @@ protected int doWork() {
}
}
+ // Tag any read pair that was in a duplicate set with the duplicate set size and a representative read name
+ if (TAG_DUPLICATE_SET_MEMBERS) {
+ final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex;
+ if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) {
+ rri = representativeReadIterator.next();
+ nextRepresentativeIndex = rri.readIndexInFile;
+ representativeReadIndexInFile = rri.representativeReadIndexInFile;
+ duplicateSetSize = rri.setSize;
+ }
+ final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex ||
+ (sortOrder == SAMFileHeader.SortOrder.queryname &&
+ recordInFileIndex > nextDuplicateIndex);
+ if (isInDuplicateSet) {
+ if (!rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag()) {
+ if (TAG_DUPLICATE_SET_MEMBERS) {
+ rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile);
+ rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize);
+ }
+ }
+ }
+ }
+
// Output the record if desired and bump the record index
recordInFileIndex++;
if (this.REMOVE_DUPLICATES && rec.getDuplicateReadFlag()) continue;
@@ -357,6 +401,9 @@ protected int doWork() {
iterator.close();
this.duplicateIndexes.cleanup();
+ if (TAG_DUPLICATE_SET_MEMBERS) {
+ this.representativeReadIndicesForDuplicates.cleanup();
+ }
reportMemoryStats("Before output close");
out.close();
@@ -508,8 +555,8 @@ private void buildSortedReadEndLists(final boolean useBarcodes) {
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
pairedEnds.orientation == ReadEnds.RF) {
- pairedEnds.orientation = ReadEnds.FR;
- }
+ pairedEnds.orientation = ReadEnds.FR;
+ }
} else {
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
@@ -600,18 +647,35 @@ private ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, fina
* @return an array with an ordered list of indexes into the source file
*/
private void generateDuplicateIndexes(final boolean useBarcodes, final boolean indexOpticalDuplicates) {
+ int entryOverhead;
+ if (TAG_DUPLICATE_SET_MEMBERS) {
+ // Memory requirements for RepresentativeReadIndexer:
+ // three int entries + overhead: (3 * 4) + 4 = 16 bytes
+ entryOverhead = 16;
+ }
+ else {
+ entryOverhead = SortingLongCollection.SIZEOF;
+ }
// Keep this number from getting too large even if there is a huge heap.
- int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / SortingLongCollection.SIZEOF, (double) (Integer.MAX_VALUE - 5));
- // If we're also tracking optical duplicates, cut maxInMemory in half, since we'll need two sorting collections
+ int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / entryOverhead, (double) (Integer.MAX_VALUE - 5));
+ // If we're also tracking optical duplicates, reduce maxInMemory, since we'll need two sorting collections
if (indexOpticalDuplicates) {
- maxInMemory /= 2;
+ maxInMemory /= ((entryOverhead + SortingLongCollection.SIZEOF) / entryOverhead);
this.opticalDuplicateIndexes = new SortingLongCollection(maxInMemory, TMP_DIR.toArray(new File[TMP_DIR.size()]));
}
log.info("Will retain up to " + maxInMemory + " duplicate indices before spilling to disk.");
this.duplicateIndexes = new SortingLongCollection(maxInMemory, TMP_DIR.toArray(new File[TMP_DIR.size()]));
+ if (TAG_DUPLICATE_SET_MEMBERS) {
+ final RepresentativeReadIndexerCodec representativeIndexCodec = new RepresentativeReadIndexerCodec();
+ this.representativeReadIndicesForDuplicates = SortingCollection.newInstance(RepresentativeReadIndexer.class,
+ representativeIndexCodec,
+ new RepresentativeReadComparator(),
+ maxInMemory,
+ TMP_DIR);
+ }
ReadEndsForMarkDuplicates firstOfNextChunk = null;
- final List<ReadEndsForMarkDuplicates> nextChunk = new ArrayList<ReadEndsForMarkDuplicates>(200);
+ final List nextChunk = new ArrayList<ReadEndsForMarkDuplicates>(200);
// First just do the pairs
log.info("Traversing read pair information and detecting duplicates.");
@@ -621,13 +685,17 @@ private void generateDuplicateIndexes(final boolean useBarcodes, final boolean i
} else {
if (nextChunk.size() > 1) {
markDuplicatePairs(nextChunk);
+ if (TAG_DUPLICATE_SET_MEMBERS) addRepresentativeReadIndex(nextChunk);
}
nextChunk.clear();
nextChunk.add(next);
firstOfNextChunk = next;
}
}
- if (nextChunk.size() > 1) markDuplicatePairs(nextChunk);
+ if (nextChunk.size() > 1) {
+ markDuplicatePairs(nextChunk);
+ if (TAG_DUPLICATE_SET_MEMBERS) addRepresentativeReadIndex(nextChunk);
+ }
this.pairSort.cleanup();
this.pairSort = null;
@@ -661,6 +729,7 @@ private void generateDuplicateIndexes(final boolean useBarcodes, final boolean i
log.info("Sorting list of duplicate records.");
this.duplicateIndexes.doneAddingStartIteration();
if (this.opticalDuplicateIndexes != null) this.opticalDuplicateIndexes.doneAddingStartIteration();
+ if (TAG_DUPLICATE_SET_MEMBERS) this.representativeReadIndicesForDuplicates.doneAdding();
}
private boolean areComparableForDuplicates(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs, final boolean compareRead2, final boolean useBarcodes) {
@@ -693,6 +762,42 @@ private void addIndexAsDuplicate(final long bamIndex) {
++this.numDuplicateIndices;
}
+ private void addRepresentativeReadOfDuplicateSet(final long representativeReadIndexInFile, final int setSize, final long read1IndexInFile) {
+ final RepresentativeReadIndexer rri = new RepresentativeReadIndexer();
+ rri.representativeReadIndexInFile = (int) representativeReadIndexInFile;
+ rri.setSize = setSize;
+ rri.readIndexInFile = (int) read1IndexInFile;
+ this.representativeReadIndicesForDuplicates.add(rri);
+ }
+
+ /**
+ * Takes a list of ReadEndsForMarkDuplicates objects and identify the representative read based on
+ * quality score. For all members of the duplicate set, add the read1 index-in-file of the representative
+ * read to the records of the first and second in a pair. This value becomes is used for
+ * the 'DI' tag.
+ *
+ * @param list
+ */
+ private void addRepresentativeReadIndex(final List<ReadEndsForMarkDuplicates> list) {
+ short maxScore = 0;
+ ReadEndsForMarkDuplicates best = null;
+
+ /** All read ends should have orientation FF, FR, RF, or RR **/
+ for (final ReadEndsForMarkDuplicates end : list) {
+ if (end.score > maxScore || best == null) {
+ maxScore = end.score;
+ best = end;
+ }
+ }
+
+ // for read name (for representative read name), add the last of the pair that was examined
+ for (final ReadEndsForMarkDuplicates end : list) {
+ addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read1IndexInFile);
+ addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read2IndexInFile);
+ }
+ }
+
+
/**
* Takes a list of ReadEndsForMarkDuplicates objects and removes from it all objects that should
* not be marked as duplicates. This assumes that the list contains objects representing pairs.
@@ -795,4 +900,17 @@ public int compare(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDup
return compareDifference;
}
}
+
+ // order representative read entries based on the record index
+ static class RepresentativeReadComparator implements Comparator<RepresentativeReadIndexer> {
+
+ public RepresentativeReadComparator() {}
+
+ public int compare(final RepresentativeReadIndexer lhs, final RepresentativeReadIndexer rhs) {
+ int compareDifference = lhs.readIndexInFile - rhs.readIndexInFile;
+ return compareDifference;
+ }
+ }
+
+
}
@@ -34,10 +34,10 @@
What do we need to store you ask? Well, we need to store:
- byte: orientation
- short: libraryId, readGroup, tile, x, y, score
- - int: read1ReferenceIndex, read1Coordinate, read2ReferenceIndex, read2Coordinate
+ - int: read1ReferenceIndex, read1Coordinate, read2ReferenceIndex, read2Coordinate, duplicateSetSize
- long: read1IndexInFile, read2IndexInFile
*/
- protected static final int SIZE_OF = (1 * 1) + (5 * 2) + (4 * 4) + (8 * 2) + 1
+ protected static final int SIZE_OF = (1 * 1) + (5 * 2) + (5 * 4) + (8 * 2) + 1
+ 8 + // last 8 == reference overhead
13; // This is determined experimentally with JProfiler
@@ -48,6 +48,7 @@ public static int getSizeOf() {
public short score = 0;
public long read1IndexInFile = -1;
public long read2IndexInFile = -1;
+ public int duplicateSetSize = -1;
public ReadEndsForMarkDuplicates() {}
@@ -76,4 +77,5 @@ public ReadEndsForMarkDuplicates(final ReadEndsForMarkDuplicates read) {
public ReadEndsForMarkDuplicates clone() {
return new ReadEndsForMarkDuplicates(this);
}
+
}
@@ -69,6 +69,7 @@ public void encode(final ReadEndsForMarkDuplicates read) {
this.out.writeShort((short)read.x);
this.out.writeShort((short)read.y);
this.out.writeByte(read.orientationForOpticalDuplicates);
+ this.out.writeInt(read.duplicateSetSize);
} catch (final IOException ioe) {
throw new PicardException("Exception writing ReadEnds to file.", ioe);
}
@@ -102,6 +103,7 @@ public ReadEndsForMarkDuplicates decode() {
read.y = this.in.readShort();
read.orientationForOpticalDuplicates = this.in.readByte();
+ read.duplicateSetSize = this.in.readInt();
return read;
} catch (final IOException ioe) {
@@ -0,0 +1,79 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2016 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+package picard.sam.markduplicates.util;
+
+import htsjdk.samtools.util.SortingCollection;
+import picard.PicardException;
+import picard.sam.util.RepresentativeReadIndexer;
+
+import java.io.*;
+
+/** Codec for read names and integers that outputs the primitive fields and reads them back. */
+public class RepresentativeReadIndexerCodec implements SortingCollection.Codec<RepresentativeReadIndexer> {
+ protected DataInputStream in;
+ protected DataOutputStream out;
+
+ public SortingCollection.Codec<RepresentativeReadIndexer> clone() {
+ return new RepresentativeReadIndexerCodec();
+ }
+
+ public void setOutputStream(final OutputStream os) { this.out = new DataOutputStream(os); }
+
+ public void setInputStream(final InputStream is) { this.in = new DataInputStream(is); }
+
+ public DataInputStream getInputStream() {
+ return in;
+ }
+
+ public DataOutputStream getOutputStream() {
+ return out;
+ }
+
+ public void encode(final RepresentativeReadIndexer rni) {
+ try {
+ this.out.writeInt(rni.readIndexInFile);
+ this.out.writeInt(rni.setSize);
+ this.out.writeInt(rni.representativeReadIndexInFile);
+ } catch (final IOException ioe) {
+ throw new PicardException("Exception writing ReadEnds to file.", ioe);
+ }
+ }
+
+ public RepresentativeReadIndexer decode() {
+ final RepresentativeReadIndexer rni = new RepresentativeReadIndexer();
+ try {
+ // If the first read results in an EOF we've exhausted the stream
+ try {
+ rni.readIndexInFile = this.in.readInt();
+ } catch (final EOFException eof) {
+ return null;
+ }
+ rni.setSize = this.in.readInt();
+ rni.representativeReadIndexInFile = this.in.readInt();
+ return rni;
+ } catch (final IOException ioe) {
+ throw new PicardException("Exception writing ReadEnds to file.", ioe);
+ }
+ }
+}
@@ -0,0 +1,11 @@
+package picard.sam.util;
+
/**
 * Struct-like holder linking a record's index-in-file to its duplicate set:
 * the set's size and the index-in-file of the read chosen as the set's representative.
 */
public class RepresentativeReadIndexer {
    /** Index-in-file of the record this entry annotates; -1 until assigned. */
    public int readIndexInFile = -1;
    /** Number of records in the duplicate set; -1 until assigned. */
    public int setSize = -1;
    /** Index-in-file of the set's representative read; -1 until assigned. */
    public int representativeReadIndexInFile = -1;
}
@@ -598,5 +598,4 @@ public void testTwoMappedPairsWithSupplementaryReadsAfterCanonical(final Boolean
tester.addMappedFragment(fragLikeFirst ? "RUNID:1:1:15993:13361" : "RUNID:1:1:16020:13352", 1, 400, markSecondaryAndSupplementaryRecordsLikeTheCanonical() && !fragLikeFirst, null, null, additionalFragSecondary, additionalFragSupplementary, DEFAULT_BASE_QUALITY);
tester.runTest();
}
-
}
Oops, something went wrong.

0 comments on commit 335b198

Please sign in to comment.