|
|
@@ -37,16 +37,9 @@ |
|
|
import htsjdk.samtools.util.CloseableIterator;
|
|
|
import htsjdk.samtools.util.SortingCollection;
|
|
|
import htsjdk.samtools.util.SortingLongCollection;
|
|
|
-import picard.sam.markduplicates.util.AbstractMarkDuplicatesCommandLineProgram;
|
|
|
-import picard.sam.markduplicates.util.DiskBasedReadEndsForMarkDuplicatesMap;
|
|
|
-import picard.sam.markduplicates.util.LibraryIdGenerator;
|
|
|
-import picard.sam.markduplicates.util.ReadEnds;
|
|
|
-import picard.sam.markduplicates.util.ReadEndsForMarkDuplicates;
|
|
|
-import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesCodec;
|
|
|
-import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesMap;
|
|
|
import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy;
|
|
|
-import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesWithBarcodes;
|
|
|
-import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesWithBarcodesCodec;
|
|
|
+import picard.sam.markduplicates.util.*;
|
|
|
+import picard.sam.util.RepresentativeReadIndexer;
|
|
|
|
|
|
import java.io.*;
|
|
|
import java.util.*;
|
|
|
@@ -125,6 +118,10 @@ |
|
|
public static final String DUPLICATE_TYPE_LIBRARY = "LB";
|
|
|
/** The duplicate type tag value for duplicate type: sequencing (optical & pad-hopping, or "co-localized"). */
|
|
|
public static final String DUPLICATE_TYPE_SEQUENCING = "SQ";
|
|
|
+ /** The attribute in the SAM/BAM file used to store which read was selected as representative out of a duplicate set */
|
|
|
+ public static final String DUPLICATE_SET_INDEX_TAG = "DI";
|
|
|
+ /** The attribute in the SAM/BAM file used to store the size of a duplicate set */
|
|
|
+ public static final String DUPLICATE_SET_SIZE_TAG = "DS";
|
|
|
|
|
|
/** Enum for the possible values that a duplicate read can be tagged with in the DT attribute. */
|
|
|
public enum DuplicateType {
|
|
|
@@ -165,6 +162,14 @@ |
|
|
@Option(doc = "Read two barcode SAM tag (ex. BX for 10X Genomics)", optional = true)
|
|
|
public String READ_TWO_BARCODE_TAG = null;
|
|
|
|
|
|
+ @Option(doc = "If a read appears in a duplicate set, add two tags. The first tag, DUPLICATE_SET_SIZE_TAG (DS), " +
|
|
|
+ "indicates the size of the duplicate set. The smallest possible DS value is 2 which occurs when two " +
|
|
|
+ "reads map to the same portion of the reference only one of which is marked as duplicate. The second " +
|
|
|
+ "tag, DUPLICATE_SET_INDEX_TAG (DI), represents a unique identifier for the duplicate set to which the " +
|
|
|
+ "record belongs. This identifier is the index-in-file of the representative read that was selected out " +
|
|
|
+ "of the duplicate set.", optional = true)
|
|
|
+ public boolean TAG_DUPLICATE_SET_MEMBERS = false;
|
|
|
+
|
|
|
@Option(doc = "If true remove 'optical' duplicates and other duplicates that appear to have arisen from the " +
|
|
|
"sequencing process instead of the library preparation process, even if REMOVE_DUPLICATES is false. " +
|
|
|
"If REMOVE_DUPLICATES is true, all duplicates are removed and this option is ignored.")
|
|
|
@@ -177,6 +182,7 @@ |
|
|
private SortingCollection<ReadEndsForMarkDuplicates> fragSort;
|
|
|
private SortingLongCollection duplicateIndexes;
|
|
|
private SortingLongCollection opticalDuplicateIndexes;
|
|
|
+ private SortingCollection<RepresentativeReadIndexer> representativeReadIndicesForDuplicates;
|
|
|
|
|
|
private int numDuplicateIndices = 0;
|
|
|
static private final long NO_SUCH_INDEX = Long.MAX_VALUE; // needs to be large so that that >= test fails for query-sorted traversal
|
|
|
@@ -259,6 +265,22 @@ protected int doWork() { |
|
|
long nextOpticalDuplicateIndex = this.opticalDuplicateIndexes != null && this.opticalDuplicateIndexes.hasNext() ? this.opticalDuplicateIndexes.next() : NO_SUCH_INDEX;
|
|
|
long nextDuplicateIndex = (this.duplicateIndexes.hasNext() ? this.duplicateIndexes.next() : NO_SUCH_INDEX);
|
|
|
|
|
|
+ // initialize variables for optional representative read tagging
|
|
|
+ CloseableIterator<RepresentativeReadIndexer> representativeReadIterator = null;
|
|
|
+ RepresentativeReadIndexer rri = null;
|
|
|
+ int representativeReadIndexInFile = -1;
|
|
|
+ int duplicateSetSize = -1;
|
|
|
+ int nextRepresentativeIndex = -1;
|
|
|
+ if (TAG_DUPLICATE_SET_MEMBERS) {
|
|
|
+ representativeReadIterator = this.representativeReadIndicesForDuplicates.iterator();
|
|
|
+ if (representativeReadIterator.hasNext()) {
|
|
|
+ rri = representativeReadIterator.next();
|
|
|
+ nextRepresentativeIndex = rri.readIndexInFile;
|
|
|
+ representativeReadIndexInFile = rri.representativeReadIndexInFile;
|
|
|
+ duplicateSetSize = rri.setSize;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Written");
|
|
|
final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator;
|
|
|
String duplicateQueryName = null;
|
|
|
@@ -343,6 +365,28 @@ protected int doWork() { |
|
|
}
|
|
|
}
|
|
|
|
|
|
+ // Tag any read pair that was in a duplicate set with the duplicate set size and the index-in-file of the representative read
|
|
|
+ if (TAG_DUPLICATE_SET_MEMBERS) {
|
|
|
+ final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex;
|
|
|
+ if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) {
|
|
|
+ rri = representativeReadIterator.next();
|
|
|
+ nextRepresentativeIndex = rri.readIndexInFile;
|
|
|
+ representativeReadIndexInFile = rri.representativeReadIndexInFile;
|
|
|
+ duplicateSetSize = rri.setSize;
|
|
|
+ }
|
|
|
+ final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex ||
|
|
|
+ (sortOrder == SAMFileHeader.SortOrder.queryname &&
|
|
|
+ recordInFileIndex > nextDuplicateIndex);
|
|
|
+ if (isInDuplicateSet) {
|
|
|
+ if (!rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag()) {
|
|
|
+ if (TAG_DUPLICATE_SET_MEMBERS) {
|
|
|
+ rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile);
|
|
|
+ rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
// Output the record if desired and bump the record index
|
|
|
recordInFileIndex++;
|
|
|
if (this.REMOVE_DUPLICATES && rec.getDuplicateReadFlag()) continue;
|
|
|
@@ -357,6 +401,9 @@ protected int doWork() { |
|
|
iterator.close();
|
|
|
|
|
|
this.duplicateIndexes.cleanup();
|
|
|
+ if (TAG_DUPLICATE_SET_MEMBERS) {
|
|
|
+ this.representativeReadIndicesForDuplicates.cleanup();
|
|
|
+ }
|
|
|
|
|
|
reportMemoryStats("Before output close");
|
|
|
out.close();
|
|
|
@@ -508,8 +555,8 @@ private void buildSortedReadEndLists(final boolean useBarcodes) { |
|
|
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
|
|
|
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
|
|
|
pairedEnds.orientation == ReadEnds.RF) {
|
|
|
- pairedEnds.orientation = ReadEnds.FR;
|
|
|
- }
|
|
|
+ pairedEnds.orientation = ReadEnds.FR;
|
|
|
+ }
|
|
|
} else {
|
|
|
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
|
|
|
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
|
|
|
@@ -600,18 +647,35 @@ private ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, fina |
|
|
* @return an array with an ordered list of indexes into the source file
|
|
|
*/
|
|
|
private void generateDuplicateIndexes(final boolean useBarcodes, final boolean indexOpticalDuplicates) {
|
|
|
+ int entryOverhead;
|
|
|
+ if (TAG_DUPLICATE_SET_MEMBERS) {
|
|
|
+ // Memory requirements for RepresentativeReadIndexer:
|
|
|
+ // three int entries + overhead: (3 * 4) + 4 = 16 bytes
|
|
|
+ entryOverhead = 16;
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ entryOverhead = SortingLongCollection.SIZEOF;
|
|
|
+ }
|
|
|
// Keep this number from getting too large even if there is a huge heap.
|
|
|
- int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / SortingLongCollection.SIZEOF, (double) (Integer.MAX_VALUE - 5));
|
|
|
- // If we're also tracking optical duplicates, cut maxInMemory in half, since we'll need two sorting collections
|
|
|
+ int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / entryOverhead, (double) (Integer.MAX_VALUE - 5));
|
|
|
+ // If we're also tracking optical duplicates, reduce maxInMemory, since we'll need two sorting collections
|
|
|
if (indexOpticalDuplicates) {
|
|
|
- maxInMemory /= 2;
|
|
|
+ maxInMemory /= ((entryOverhead + SortingLongCollection.SIZEOF) / entryOverhead);
|
|
|
this.opticalDuplicateIndexes = new SortingLongCollection(maxInMemory, TMP_DIR.toArray(new File[TMP_DIR.size()]));
|
|
|
}
|
|
|
log.info("Will retain up to " + maxInMemory + " duplicate indices before spilling to disk.");
|
|
|
this.duplicateIndexes = new SortingLongCollection(maxInMemory, TMP_DIR.toArray(new File[TMP_DIR.size()]));
|
|
|
+ if (TAG_DUPLICATE_SET_MEMBERS) {
|
|
|
+ final RepresentativeReadIndexerCodec representativeIndexCodec = new RepresentativeReadIndexerCodec();
|
|
|
+ this.representativeReadIndicesForDuplicates = SortingCollection.newInstance(RepresentativeReadIndexer.class,
|
|
|
+ representativeIndexCodec,
|
|
|
+ new RepresentativeReadComparator(),
|
|
|
+ maxInMemory,
|
|
|
+ TMP_DIR);
|
|
|
+ }
|
|
|
|
|
|
ReadEndsForMarkDuplicates firstOfNextChunk = null;
|
|
|
- final List<ReadEndsForMarkDuplicates> nextChunk = new ArrayList<ReadEndsForMarkDuplicates>(200);
|
|
|
+ final List nextChunk = new ArrayList<ReadEndsForMarkDuplicates>(200);
|
|
|
|
|
|
// First just do the pairs
|
|
|
log.info("Traversing read pair information and detecting duplicates.");
|
|
|
@@ -621,13 +685,17 @@ private void generateDuplicateIndexes(final boolean useBarcodes, final boolean i |
|
|
} else {
|
|
|
if (nextChunk.size() > 1) {
|
|
|
markDuplicatePairs(nextChunk);
|
|
|
+ if (TAG_DUPLICATE_SET_MEMBERS) addRepresentativeReadIndex(nextChunk);
|
|
|
}
|
|
|
nextChunk.clear();
|
|
|
nextChunk.add(next);
|
|
|
firstOfNextChunk = next;
|
|
|
}
|
|
|
}
|
|
|
- if (nextChunk.size() > 1) markDuplicatePairs(nextChunk);
|
|
|
+ if (nextChunk.size() > 1) {
|
|
|
+ markDuplicatePairs(nextChunk);
|
|
|
+ if (TAG_DUPLICATE_SET_MEMBERS) addRepresentativeReadIndex(nextChunk);
|
|
|
+ }
|
|
|
this.pairSort.cleanup();
|
|
|
this.pairSort = null;
|
|
|
|
|
|
@@ -661,6 +729,7 @@ private void generateDuplicateIndexes(final boolean useBarcodes, final boolean i |
|
|
log.info("Sorting list of duplicate records.");
|
|
|
this.duplicateIndexes.doneAddingStartIteration();
|
|
|
if (this.opticalDuplicateIndexes != null) this.opticalDuplicateIndexes.doneAddingStartIteration();
|
|
|
+ if (TAG_DUPLICATE_SET_MEMBERS) this.representativeReadIndicesForDuplicates.doneAdding();
|
|
|
}
|
|
|
|
|
|
private boolean areComparableForDuplicates(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs, final boolean compareRead2, final boolean useBarcodes) {
|
|
|
@@ -693,6 +762,42 @@ private void addIndexAsDuplicate(final long bamIndex) { |
|
|
++this.numDuplicateIndices;
|
|
|
}
|
|
|
|
|
|
+    /**
|
|
|
+     * Appends one entry to representativeReadIndicesForDuplicates.
|
|
|
+     *
|
|
|
+     * NOTE(review): both index arguments arrive as long but are narrowed to int before being stored;
|
|
|
+     * confirm that inputs cannot contain more than Integer.MAX_VALUE records.
|
|
|
+     *
|
|
|
+     * @param representativeReadIndexInFile index-in-file of the read chosen to represent the duplicate set
|
|
|
+     * @param setSize                       number of entries in the duplicate set
|
|
|
+     * @param read1IndexInFile              index-in-file of the record this entry describes (callers pass
|
|
|
+     *                                      either the read1 or the read2 index of a set member)
|
|
|
+     */
|
|
|
+    private void addRepresentativeReadOfDuplicateSet(final long representativeReadIndexInFile, final int setSize, final long read1IndexInFile) {
|
|
|
+        final RepresentativeReadIndexer entry = new RepresentativeReadIndexer();
|
|
|
+        entry.readIndexInFile = (int) read1IndexInFile;
|
|
|
+        entry.setSize = setSize;
|
|
|
+        entry.representativeReadIndexInFile = (int) representativeReadIndexInFile;
|
|
|
+        this.representativeReadIndicesForDuplicates.add(entry);
|
|
|
+    }
|
|
|
+
|
|
|
+    /**
|
|
|
+     * Picks the representative of a duplicate set and records it for every member of the set.
|
|
|
+     * The representative is the entry with the highest score; on ties the earliest entry in the
|
|
|
+     * list wins. For each entry, two records are queued: one for its read1 index-in-file and one
|
|
|
+     * for its read2 index-in-file. Each record carries the read1 index-in-file of the representative
|
|
|
+     * (the value used for the 'DI' tag) together with the size of the list.
|
|
|
+     *
|
|
|
+     * @param list the entries that make up one duplicate set
|
|
|
+     */
|
|
|
+    private void addRepresentativeReadIndex(final List<ReadEndsForMarkDuplicates> list) {
|
|
|
+        ReadEndsForMarkDuplicates best = null;
|
|
|
+        short maxScore = 0;
|
|
|
+
|
|
|
+        // All read ends should have orientation FF, FR, RF, or RR
|
|
|
+        for (final ReadEndsForMarkDuplicates candidate : list) {
|
|
|
+            final boolean isFirst = best == null;
|
|
|
+            if (isFirst || candidate.score > maxScore) {
|
|
|
+                best = candidate;
|
|
|
+                maxScore = candidate.score;
|
|
|
+            }
|
|
|
+        }
|
|
|
+
|
|
|
+        // Queue both mates of every member, each pointing at the representative's read1 index
|
|
|
+        final int setSize = list.size();
|
|
|
+        for (final ReadEndsForMarkDuplicates member : list) {
|
|
|
+            addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, setSize, member.read1IndexInFile);
|
|
|
+            addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, setSize, member.read2IndexInFile);
|
|
|
+        }
|
|
|
+    }
|
|
|
+
|
|
|
+
|
|
|
/**
|
|
|
* Takes a list of ReadEndsForMarkDuplicates objects and removes from it all objects that should
|
|
|
* not be marked as duplicates. This assumes that the list contains objects representing pairs.
|
|
|
@@ -795,4 +900,17 @@ public int compare(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDup |
|
|
return compareDifference;
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+    /**
|
|
|
+     * Orders representative read entries by ascending record index (readIndexInFile).
|
|
|
+     */
|
|
|
+    static class RepresentativeReadComparator implements Comparator<RepresentativeReadIndexer> {
|
|
|
+
|
|
|
+        public RepresentativeReadComparator() {}
|
|
|
+
|
|
|
+        /**
|
|
|
+         * Compares two entries by their readIndexInFile.
|
|
|
+         * Integer.compare is used instead of subtracting the two values so that the sign of the
|
|
|
+         * result stays correct even when the difference would overflow an int.
|
|
|
+         *
|
|
|
+         * @param lhs the first entry
|
|
|
+         * @param rhs the second entry
|
|
|
+         * @return a negative value, zero, or a positive value as lhs sorts before, equal to, or after rhs
|
|
|
+         */
|
|
|
+        @Override
|
|
|
+        public int compare(final RepresentativeReadIndexer lhs, final RepresentativeReadIndexer rhs) {
|
|
|
+            return Integer.compare(lhs.readIndexInFile, rhs.readIndexInFile);
|
|
|
+        }
|
|
|
+    }
|
|
|
+
|
|
|
+
|
|
|
}
|
0 comments on commit
335b198