Skip to content

Commit

Permalink
Optional flag to add an attribute in the SAM/BAM file used to store t…
Browse files Browse the repository at this point in the history
…he size of a duplicate set. A separate attribute is also added to store which read was selected as representative out of a duplicate set.
  • Loading branch information
lhogstrom committed Apr 26, 2017
1 parent 7a85c95 commit 335b198
Show file tree
Hide file tree
Showing 10 changed files with 460 additions and 21 deletions.
2 changes: 2 additions & 0 deletions settings.gradle
@@ -1 +1,3 @@
rootProject.name = "picard"
include 'picard_gradle_v2'

150 changes: 134 additions & 16 deletions src/main/java/picard/sam/markduplicates/MarkDuplicates.java
Expand Up @@ -37,16 +37,9 @@
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.samtools.util.SortingLongCollection;
import picard.sam.markduplicates.util.AbstractMarkDuplicatesCommandLineProgram;
import picard.sam.markduplicates.util.DiskBasedReadEndsForMarkDuplicatesMap;
import picard.sam.markduplicates.util.LibraryIdGenerator;
import picard.sam.markduplicates.util.ReadEnds;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicates;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesCodec;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesMap;
import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesWithBarcodes;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesWithBarcodesCodec;
import picard.sam.markduplicates.util.*;
import picard.sam.util.RepresentativeReadIndexer;

import java.io.*;
import java.util.*;
Expand Down Expand Up @@ -125,6 +118,10 @@ public enum DuplicateTaggingPolicy { DontTag, OpticalOnly, All }
public static final String DUPLICATE_TYPE_LIBRARY = "LB";
/** The duplicate type tag value for duplicate type: sequencing (optical & pad-hopping, or "co-localized"). */
public static final String DUPLICATE_TYPE_SEQUENCING = "SQ";
/** The attribute in the SAM/BAM file used to store which read was selected as representative out of a duplicate set */
public static final String DUPLICATE_SET_INDEX_TAG = "DI";
/** The attribute in the SAM/BAM file used to store the size of a duplicate set */
public static final String DUPLICATE_SET_SIZE_TAG = "DS";

/** Enum for the possible values that a duplicate read can be tagged with in the DT attribute. */
public enum DuplicateType {
Expand Down Expand Up @@ -165,6 +162,14 @@ public enum DuplicateType {
@Option(doc = "Read two barcode SAM tag (ex. BX for 10X Genomics)", optional = true)
public String READ_TWO_BARCODE_TAG = null;

@Option(doc = "If a read appears in a duplicate set, add two tags. The first tag, DUPLICATE_SET_SIZE_TAG (DS), " +
"indicates the size of the duplicate set. The smallest possible DS value is 2 which occurs when two " +
"reads map to the same portion of the reference only one of which is marked as duplicate. The second " +
"tag, DUPLICATE_SET_INDEX_TAG (DI), represents a unique identifier for the duplicate set to which the " +
"record belongs. This identifier is the index-in-file of the representative read that was selected out " +
"of the duplicate set.", optional = true)
public boolean TAG_DUPLICATE_SET_MEMBERS = false;

@Option(doc = "If true remove 'optical' duplicates and other duplicates that appear to have arisen from the " +
"sequencing process instead of the library preparation process, even if REMOVE_DUPLICATES is false. " +
"If REMOVE_DUPLICATES is true, all duplicates are removed and this option is ignored.")
Expand All @@ -177,6 +182,7 @@ public enum DuplicateType {
private SortingCollection<ReadEndsForMarkDuplicates> fragSort;
private SortingLongCollection duplicateIndexes;
private SortingLongCollection opticalDuplicateIndexes;
private SortingCollection<RepresentativeReadIndexer> representativeReadIndicesForDuplicates;

private int numDuplicateIndices = 0;
static private final long NO_SUCH_INDEX = Long.MAX_VALUE; // needs to be large so that that >= test fails for query-sorted traversal
Expand Down Expand Up @@ -259,6 +265,22 @@ protected int doWork() {
long nextOpticalDuplicateIndex = this.opticalDuplicateIndexes != null && this.opticalDuplicateIndexes.hasNext() ? this.opticalDuplicateIndexes.next() : NO_SUCH_INDEX;
long nextDuplicateIndex = (this.duplicateIndexes.hasNext() ? this.duplicateIndexes.next() : NO_SUCH_INDEX);

// initialize variables for optional representative read tagging
CloseableIterator<RepresentativeReadIndexer> representativeReadIterator = null;
RepresentativeReadIndexer rri = null;
int representativeReadIndexInFile = -1;
int duplicateSetSize = -1;
int nextRepresentativeIndex = -1;
if (TAG_DUPLICATE_SET_MEMBERS) {
representativeReadIterator = this.representativeReadIndicesForDuplicates.iterator();
if (representativeReadIterator.hasNext()) {
rri = representativeReadIterator.next();
nextRepresentativeIndex = rri.readIndexInFile;
representativeReadIndexInFile = rri.representativeReadIndexInFile;
duplicateSetSize = rri.setSize;
}
}

final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Written");
final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator;
String duplicateQueryName = null;
Expand Down Expand Up @@ -343,6 +365,28 @@ protected int doWork() {
}
}

// Tag any read pair that was in a duplicate set with the duplicate set size and a representative read name
if (TAG_DUPLICATE_SET_MEMBERS) {
final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex;
if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) {
rri = representativeReadIterator.next();
nextRepresentativeIndex = rri.readIndexInFile;
representativeReadIndexInFile = rri.representativeReadIndexInFile;
duplicateSetSize = rri.setSize;
}
final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex ||
(sortOrder == SAMFileHeader.SortOrder.queryname &&
recordInFileIndex > nextDuplicateIndex);
if (isInDuplicateSet) {
if (!rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag()) {
if (TAG_DUPLICATE_SET_MEMBERS) {
rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile);
rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize);
}
}
}
}

// Output the record if desired and bump the record index
recordInFileIndex++;
if (this.REMOVE_DUPLICATES && rec.getDuplicateReadFlag()) continue;
Expand All @@ -357,6 +401,9 @@ protected int doWork() {
iterator.close();

this.duplicateIndexes.cleanup();
if (TAG_DUPLICATE_SET_MEMBERS) {
this.representativeReadIndicesForDuplicates.cleanup();
}

reportMemoryStats("Before output close");
out.close();
Expand Down Expand Up @@ -508,8 +555,8 @@ private void buildSortedReadEndLists(final boolean useBarcodes) {
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
pairedEnds.orientation == ReadEnds.RF) {
pairedEnds.orientation = ReadEnds.FR;
}
pairedEnds.orientation = ReadEnds.FR;
}
} else {
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
Expand Down Expand Up @@ -600,18 +647,35 @@ private ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, fina
* @return an array with an ordered list of indexes into the source file
*/
private void generateDuplicateIndexes(final boolean useBarcodes, final boolean indexOpticalDuplicates) {
int entryOverhead;
if (TAG_DUPLICATE_SET_MEMBERS) {
// Memory requirements for RepresentativeReadIndexer:
// three int entries + overhead: (3 * 4) + 4 = 16 bytes
entryOverhead = 16;
}
else {
entryOverhead = SortingLongCollection.SIZEOF;
}
// Keep this number from getting too large even if there is a huge heap.
int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / SortingLongCollection.SIZEOF, (double) (Integer.MAX_VALUE - 5));
// If we're also tracking optical duplicates, cut maxInMemory in half, since we'll need two sorting collections
int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / entryOverhead, (double) (Integer.MAX_VALUE - 5));
// If we're also tracking optical duplicates, reduce maxInMemory, since we'll need two sorting collections
if (indexOpticalDuplicates) {
maxInMemory /= 2;
maxInMemory /= ((entryOverhead + SortingLongCollection.SIZEOF) / entryOverhead);
this.opticalDuplicateIndexes = new SortingLongCollection(maxInMemory, TMP_DIR.toArray(new File[TMP_DIR.size()]));
}
log.info("Will retain up to " + maxInMemory + " duplicate indices before spilling to disk.");
this.duplicateIndexes = new SortingLongCollection(maxInMemory, TMP_DIR.toArray(new File[TMP_DIR.size()]));
if (TAG_DUPLICATE_SET_MEMBERS) {
final RepresentativeReadIndexerCodec representativeIndexCodec = new RepresentativeReadIndexerCodec();
this.representativeReadIndicesForDuplicates = SortingCollection.newInstance(RepresentativeReadIndexer.class,
representativeIndexCodec,
new RepresentativeReadComparator(),
maxInMemory,
TMP_DIR);
}

ReadEndsForMarkDuplicates firstOfNextChunk = null;
final List<ReadEndsForMarkDuplicates> nextChunk = new ArrayList<ReadEndsForMarkDuplicates>(200);
final List nextChunk = new ArrayList<ReadEndsForMarkDuplicates>(200);

// First just do the pairs
log.info("Traversing read pair information and detecting duplicates.");
Expand All @@ -621,13 +685,17 @@ private void generateDuplicateIndexes(final boolean useBarcodes, final boolean i
} else {
if (nextChunk.size() > 1) {
markDuplicatePairs(nextChunk);
if (TAG_DUPLICATE_SET_MEMBERS) addRepresentativeReadIndex(nextChunk);
}
nextChunk.clear();
nextChunk.add(next);
firstOfNextChunk = next;
}
}
if (nextChunk.size() > 1) markDuplicatePairs(nextChunk);
if (nextChunk.size() > 1) {
markDuplicatePairs(nextChunk);
if (TAG_DUPLICATE_SET_MEMBERS) addRepresentativeReadIndex(nextChunk);
}
this.pairSort.cleanup();
this.pairSort = null;

Expand Down Expand Up @@ -661,6 +729,7 @@ private void generateDuplicateIndexes(final boolean useBarcodes, final boolean i
log.info("Sorting list of duplicate records.");
this.duplicateIndexes.doneAddingStartIteration();
if (this.opticalDuplicateIndexes != null) this.opticalDuplicateIndexes.doneAddingStartIteration();
if (TAG_DUPLICATE_SET_MEMBERS) this.representativeReadIndicesForDuplicates.doneAdding();
}

private boolean areComparableForDuplicates(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs, final boolean compareRead2, final boolean useBarcodes) {
Expand Down Expand Up @@ -693,6 +762,42 @@ private void addIndexAsDuplicate(final long bamIndex) {
++this.numDuplicateIndices;
}

/**
 * Records one member of a duplicate set in the sorting collection used later to tag reads.
 *
 * @param representativeReadIndexInFile index-in-file of the read chosen as the set's representative
 * @param setSize                       number of reads in the duplicate set
 * @param read1IndexInFile              index-in-file of the set member being recorded
 */
private void addRepresentativeReadOfDuplicateSet(final long representativeReadIndexInFile, final int setSize, final long read1IndexInFile) {
    final RepresentativeReadIndexer indexer = new RepresentativeReadIndexer();
    // Narrowing to int matches the int fields of RepresentativeReadIndexer and its codec.
    indexer.readIndexInFile = (int) read1IndexInFile;
    indexer.representativeReadIndexInFile = (int) representativeReadIndexInFile;
    indexer.setSize = setSize;
    this.representativeReadIndicesForDuplicates.add(indexer);
}

/**
 * Takes a list of ReadEndsForMarkDuplicates objects and identifies the representative read based on
 * quality score. For all members of the duplicate set, adds the read1 index-in-file of the
 * representative read to the records of both the first and second read in each pair. This value is
 * used for the 'DI' tag.
 *
 * @param list the duplicate set; callers invoke this only when {@code list.size() > 1}
 */
private void addRepresentativeReadIndex(final List<ReadEndsForMarkDuplicates> list) {
    short maxScore = 0;
    ReadEndsForMarkDuplicates best = null;

    // Select the highest-scoring read end as the representative of the set.
    // (All read ends should have orientation FF, FR, RF, or RR.)
    for (final ReadEndsForMarkDuplicates end : list) {
        if (best == null || end.score > maxScore) {
            maxScore = end.score;
            best = end;
        }
    }

    // Record the representative index for both ends of every pair in the set so that
    // both reads of each pair can be tagged when the output is written.
    for (final ReadEndsForMarkDuplicates end : list) {
        addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read1IndexInFile);
        addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read2IndexInFile);
    }
}


/**
* Takes a list of ReadEndsForMarkDuplicates objects and removes from it all objects that should
* not be marked as duplicates. This assumes that the list contains objects representing pairs.
Expand Down Expand Up @@ -795,4 +900,17 @@ public int compare(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDup
return compareDifference;
}
}

/** Orders representative-read entries by the index of the record in the input file. */
static class RepresentativeReadComparator implements Comparator<RepresentativeReadIndexer> {

    public RepresentativeReadComparator() {}

    @Override
    public int compare(final RepresentativeReadIndexer lhs, final RepresentativeReadIndexer rhs) {
        // Integer.compare avoids the overflow that plain subtraction can produce
        // when the operands are far apart (e.g. near Integer.MAX_VALUE vs. -1).
        return Integer.compare(lhs.readIndexInFile, rhs.readIndexInFile);
    }
}


}
Expand Up @@ -34,10 +34,10 @@ public class ReadEndsForMarkDuplicates extends ReadEnds implements Cloneable {
What do we need to store you ask? Well, we need to store:
- byte: orientation
- short: libraryId, readGroup, tile, x, y, score
- int: read1ReferenceIndex, read1Coordinate, read2ReferenceIndex, read2Coordinate
- int: read1ReferenceIndex, read1Coordinate, read2ReferenceIndex, read2Coordinate, duplicateSetSize
- long: read1IndexInFile, read2IndexInFile
*/
protected static final int SIZE_OF = (1 * 1) + (5 * 2) + (4 * 4) + (8 * 2) + 1
protected static final int SIZE_OF = (1 * 1) + (5 * 2) + (5 * 4) + (8 * 2) + 1
+ 8 + // last 8 == reference overhead
13; // This is determined experimentally with JProfiler

Expand All @@ -48,6 +48,7 @@ public static int getSizeOf() {
public short score = 0;
public long read1IndexInFile = -1;
public long read2IndexInFile = -1;
public int duplicateSetSize = -1;

public ReadEndsForMarkDuplicates() {}

Expand Down Expand Up @@ -76,4 +77,5 @@ public ReadEndsForMarkDuplicates(final ReadEndsForMarkDuplicates read) {
public ReadEndsForMarkDuplicates clone() {
return new ReadEndsForMarkDuplicates(this);
}

}
Expand Up @@ -69,6 +69,7 @@ public void encode(final ReadEndsForMarkDuplicates read) {
this.out.writeShort((short)read.x);
this.out.writeShort((short)read.y);
this.out.writeByte(read.orientationForOpticalDuplicates);
this.out.writeInt(read.duplicateSetSize);
} catch (final IOException ioe) {
throw new PicardException("Exception writing ReadEnds to file.", ioe);
}
Expand Down Expand Up @@ -102,6 +103,7 @@ public ReadEndsForMarkDuplicates decode() {
read.y = this.in.readShort();

read.orientationForOpticalDuplicates = this.in.readByte();
read.duplicateSetSize = this.in.readInt();

return read;
} catch (final IOException ioe) {
Expand Down
@@ -0,0 +1,79 @@
/*
* The MIT License
*
* Copyright (c) 2016 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam.markduplicates.util;

import htsjdk.samtools.util.SortingCollection;
import picard.PicardException;
import picard.sam.util.RepresentativeReadIndexer;

import java.io.*;

/** Codec for read names and integers that outputs the primitive fields and reads them back. */
public class RepresentativeReadIndexerCodec implements SortingCollection.Codec<RepresentativeReadIndexer> {
protected DataInputStream in;
protected DataOutputStream out;

public SortingCollection.Codec<RepresentativeReadIndexer> clone() {
return new RepresentativeReadIndexerCodec();
}

public void setOutputStream(final OutputStream os) { this.out = new DataOutputStream(os); }

public void setInputStream(final InputStream is) { this.in = new DataInputStream(is); }

public DataInputStream getInputStream() {
return in;
}

public DataOutputStream getOutputStream() {
return out;
}

public void encode(final RepresentativeReadIndexer rni) {
try {
this.out.writeInt(rni.readIndexInFile);
this.out.writeInt(rni.setSize);
this.out.writeInt(rni.representativeReadIndexInFile);
} catch (final IOException ioe) {
throw new PicardException("Exception writing ReadEnds to file.", ioe);
}
}

public RepresentativeReadIndexer decode() {
final RepresentativeReadIndexer rni = new RepresentativeReadIndexer();
try {
// If the first read results in an EOF we've exhausted the stream
try {
rni.readIndexInFile = this.in.readInt();
} catch (final EOFException eof) {
return null;
}
rni.setSize = this.in.readInt();
rni.representativeReadIndexInFile = this.in.readInt();
return rni;
} catch (final IOException ioe) {
throw new PicardException("Exception writing ReadEnds to file.", ioe);
}
}
}
11 changes: 11 additions & 0 deletions src/main/java/picard/sam/util/RepresentativeReadIndexer.java
@@ -0,0 +1,11 @@
package picard.sam.util;

/**
 * Little struct-like class to hold a record index, the index of the corresponding representative read, and duplicate set size information.
 */

public class RepresentativeReadIndexer {
    public int readIndexInFile = -1;               // index-in-file of the read this entry describes (-1 = unset)
    public int setSize = -1;                       // number of reads in the duplicate set (-1 = unset)
    public int representativeReadIndexInFile = -1; // index-in-file of the set's representative read (-1 = unset)
}
Expand Up @@ -598,5 +598,4 @@ public void testTwoMappedPairsWithSupplementaryReadsAfterCanonical(final Boolean
tester.addMappedFragment(fragLikeFirst ? "RUNID:1:1:15993:13361" : "RUNID:1:1:16020:13352", 1, 400, markSecondaryAndSupplementaryRecordsLikeTheCanonical() && !fragLikeFirst, null, null, additionalFragSecondary, additionalFragSupplementary, DEFAULT_BASE_QUALITY);
tester.runTest();
}

}

0 comments on commit 335b198

Please sign in to comment.