Skip to content

Commit

Permalink
Fix Duplicate Set Index for queryname sorted input to MarkDuplicates (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
kachulis committed Feb 13, 2023
1 parent 7ff09b8 commit 21ccdb5
Show file tree
Hide file tree
Showing 9 changed files with 183 additions and 256 deletions.
47 changes: 19 additions & 28 deletions src/main/java/picard/sam/markduplicates/MarkDuplicates.java
Expand Up @@ -319,12 +319,12 @@ protected int doWork() {
RepresentativeReadIndexer rri = null;
int representativeReadIndexInFile = -1;
int duplicateSetSize = -1;
int nextRepresentativeIndex = -1;
int nextReadInDuplicateSetIndex = -1;
if (TAG_DUPLICATE_SET_MEMBERS) {
representativeReadIterator = this.representativeReadIndicesForDuplicates.iterator();
if (representativeReadIterator.hasNext()) {
rri = representativeReadIterator.next();
nextRepresentativeIndex = rri.readIndexInFile;
nextReadInDuplicateSetIndex = rri.readIndexInFile;
representativeReadIndexInFile = rri.representativeReadIndexInFile;
duplicateSetSize = rri.setSize;
}
Expand All @@ -333,6 +333,7 @@ protected int doWork() {
final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Written");
final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator;
String duplicateQueryName = null;
String representativeQueryName = null;

while (iterator.hasNext()) {
final SAMRecord rec = iterator.next();
Expand Down Expand Up @@ -375,22 +376,27 @@ protected int doWork() {

// Tag any read pair that was in a duplicate set with the duplicate set size and a representative read name
if (TAG_DUPLICATE_SET_MEMBERS) {
final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex;
final boolean needNextRepresentativeIndex = recordInFileIndex > nextReadInDuplicateSetIndex &&
(sortOrder == SAMFileHeader.SortOrder.coordinate || !rec.getReadName().equals(representativeQueryName));
if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) {
rri = representativeReadIterator.next();
nextRepresentativeIndex = rri.readIndexInFile;
nextReadInDuplicateSetIndex = rri.readIndexInFile;
representativeReadIndexInFile = rri.representativeReadIndexInFile;
duplicateSetSize = rri.setSize;
}
final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex ||

/* If this record's index is readInDuplicateSetIndex, then it is in a duplicateset.
For queryname sorted data, we only have one representativeReadIndex entry per read name, so we need
to also look for additional reads with the same name.
*/
final boolean isInDuplicateSet = recordInFileIndex == nextReadInDuplicateSetIndex ||
(sortOrder == SAMFileHeader.SortOrder.queryname &&
recordInFileIndex > nextDuplicateIndex);
recordInFileIndex > nextReadInDuplicateSetIndex && rec.getReadName().equals(representativeQueryName));
if (isInDuplicateSet) {
if (!rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag()) {
if (TAG_DUPLICATE_SET_MEMBERS) {
rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile);
rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize);
}
rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile);
rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize);
representativeQueryName = rec.getReadName();
}
}
}
Expand All @@ -400,23 +406,6 @@ protected int doWork() {
UmiUtil.setMolecularIdentifier(rec, "", MOLECULAR_IDENTIFIER_TAG, DUPLEX_UMI);
}

// Tag any read pair that was in a duplicate set with the duplicate set size and a representative read name
if (TAG_DUPLICATE_SET_MEMBERS) {
final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex;
if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) {
rri = representativeReadIterator.next();
nextRepresentativeIndex = rri.readIndexInFile;
representativeReadIndexInFile = rri.representativeReadIndexInFile;
duplicateSetSize = rri.setSize;
}
final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex ||
(sortOrder == SAMFileHeader.SortOrder.queryname &&
recordInFileIndex > nextDuplicateIndex);
if (isInDuplicateSet && !rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag() && TAG_DUPLICATE_SET_MEMBERS) {
rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile);
rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize);
}
}

// Note, duplicateQueryName must be incremented after we have marked both optical and sequencing duplicates for queryname sorted files.
if (isDuplicate) {
Expand Down Expand Up @@ -874,7 +863,9 @@ private void addRepresentativeReadIndex(final List<ReadEndsForMarkDuplicates> li
// for read name (for representative read name), add the last of the pair that was examined
for (final ReadEndsForMarkDuplicates end : list) {
addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read1IndexInFile);
addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read2IndexInFile);
if (end.read1IndexInFile != end.read2IndexInFile) {
addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read2IndexInFile);
}
}
}

Expand Down
Expand Up @@ -29,6 +29,7 @@
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.FormatUtil;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.IOUtil;
import org.testng.Assert;
import picard.cmdline.CommandLineProgram;
Expand All @@ -40,7 +41,11 @@
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
Expand All @@ -53,6 +58,7 @@ abstract public class AbstractMarkDuplicatesCommandLineProgramTester extends Sam
final DuplicationMetrics expectedMetrics;

boolean testOpticalDuplicateDTTag = false;
final public Map<List<String>, Double> expectedSetSizeMap = new HashMap<>();

public AbstractMarkDuplicatesCommandLineProgramTester(final ScoringStrategy duplicateScoringStrategy, SAMFileHeader.SortOrder sortOrder) {
this(duplicateScoringStrategy, sortOrder, true);
Expand Down Expand Up @@ -139,26 +145,44 @@ public void setExpectedOpticalDuplicate(final int expectedOpticalDuplicatePairs)
expectedMetrics.READ_PAIR_OPTICAL_DUPLICATES = expectedOpticalDuplicatePairs;
}

@Override
public void test() throws IOException {
testMetrics();

/**
* This method is called before iterating through output records in test method.
* Should be used to update any expectations to be used in implementation
* specific tests.
**/
void updateExpectationsHook() {
}

/**
* Runs test and returns metrics
* @return Duplication metrics
* @throws IOException
* This method is called for each record in the output. Should be used
* to perform any additional implementation specific tests not included
* in default test.
**/
void testRecordHook(final SAMRecord record){
}

/**
* This method is called after iterating through output records.
* Should be used to test any implementation specific methods not
* included in default test
*/
public MetricsFile<DuplicationMetrics, Double> testMetrics() throws IOException {
void testOutputsHook() {
}

@Override
public void test() throws IOException {
try {
updateExpectedDuplicationMetrics();
updateExpectationsHook();

// Read the output and check the duplicate flag
int outputRecords = 0;
final Set<String> sequencingDTErrorsSeen = new HashSet<>();
try(final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(fastaFiles.get(samRecordSetBuilder.getHeader())).open(getOutput())) {
for (final SAMRecord record : reader) {
outputRecords++;
testRecordHook(record);
final String key = samRecordToDuplicatesFlagsKey(record);
Assert.assertTrue(this.duplicateFlags.containsKey(key), "DOES NOT CONTAIN KEY: " + key);
final boolean value = this.duplicateFlags.get(key);
Expand Down Expand Up @@ -197,7 +221,29 @@ public MetricsFile<DuplicationMetrics, Double> testMetrics() throws IOException
Assert.assertEquals(sequencingDTErrorsSeen.size(), expectedMetrics.READ_PAIR_OPTICAL_DUPLICATES, "READ_PAIR_OPTICAL_DUPLICATES does not match duplicate groups observed in the file");
Assert.assertEquals(sequencingDTErrorsSeen.size(), observedMetrics.READ_PAIR_OPTICAL_DUPLICATES, "READ_PAIR_OPTICAL_DUPLICATES does not match duplicate groups observed in the file");
}
return metricsOutput;

if (!expectedSetSizeMap.isEmpty()) {
boolean checked = false;
for (final Histogram<Double> histo : metricsOutput.getAllHistograms()) {
final String label = histo.getValueLabel();
for (final Double bin : histo.keySet()) {
final String binStr = String.valueOf(bin);
final List<String> labelBinStr = Arrays.asList(label, binStr);
if (expectedSetSizeMap.containsKey(labelBinStr)) {
checked = true;
Histogram.Bin<Double> binValue = histo.get(bin);
final double actual = binValue.getValue();
final double expected = expectedSetSizeMap.get(labelBinStr);
Assert.assertEquals(actual, expected);
}
}
}
if (!checked) {
Assert.fail("Could not not find matching entry for expectedSetSizeMap in metrics.");
}
}

testOutputsHook();
} finally {
IOUtil.recursiveDelete(getOutputDir().toPath());
}
Expand Down
Expand Up @@ -84,9 +84,8 @@ public CollectDuplicateMetricsTester( final SAMFileHeader.SortOrder sortOrder) {
public void recordOpticalDuplicatesMarked() {}

@Override
public void test() throws IOException {
void updateExpectationsHook() {
// CollectDuplicateMetrics cannot identify Optical duplicates....
setExpectedOpticalDuplicate(0);
super.test();
}
}
Expand Up @@ -35,16 +35,16 @@
*/
public class MarkDuplicatesSetSizeHistogramTest extends AbstractMarkDuplicatesCommandLineProgramTest {
@Override
protected MarkDuplicatesSetSizeHistogramTester getTester() {
return new MarkDuplicatesSetSizeHistogramTester();
protected MarkDuplicatesTester getTester() {
return new MarkDuplicatesTester();
}

@Test
public void TestSingleSet() {
// This tests checks that if I add two read pairs with the same alignment start, a duplicate
// set size entry of 2 is put into the histogram. The pair is an optical dup, so a 2 entry for
// the 'optical_sets' should also be marked
final MarkDuplicatesSetSizeHistogramTester tester = getTester();
final MarkDuplicatesTester tester = getTester();
tester.setExpectedOpticalDuplicate(1);
String representativeReadName = "RUNID:1:1:16020:13352";
tester.addMatePair(representativeReadName, 1, 485253, 485253, false, false, false, false, "45M", "45M", false, true, false, false, false, DEFAULT_BASE_QUALITY);
Expand All @@ -59,7 +59,7 @@ public void TestSingleSet() {
public void testOpticalAndNonOpticalSet() {
// This tests checks that if I have two optical dups and one non-optical in the same dup set that the correct histogram entries
// are specified
final MarkDuplicatesSetSizeHistogramTester tester = getTester();
final MarkDuplicatesTester tester = getTester();
tester.setExpectedOpticalDuplicate(2);
String representativeReadName = "RUNID:1:1:16020:13352";
tester.addMatePair(representativeReadName, 1, 485253, 485253, false, false, false, false, "45M", "45M", false, true, false, false, false, DEFAULT_BASE_QUALITY);
Expand All @@ -75,7 +75,7 @@ public void testOpticalAndNonOpticalSet() {
@Test
public void testSingleton() {
// This tests checks if having a read pair that is not duplicated, the correct histogram entry is updated
final MarkDuplicatesSetSizeHistogramTester tester = getTester();
final MarkDuplicatesTester tester = getTester();
tester.setExpectedOpticalDuplicate(0);
// Add non-duplicate read
tester.addMatePair("RUNID:1:1:15993:13375", 1, 485255, 485255, false, false, false, false, "43M2S", "43M2S", false, true, false, false, false, DEFAULT_BASE_QUALITY);
Expand All @@ -86,7 +86,7 @@ public void testSingleton() {
@Test
public void testMultiRepresentativeReadTags() {
// This tests checks multiple different duplicate sets of varying sizes
final MarkDuplicatesSetSizeHistogramTester tester = getTester();
final MarkDuplicatesTester tester = getTester();
tester.setExpectedOpticalDuplicate(3);
// Duplicate set: size 2 - all optical
String representativeReadName1 = "RUNID:1:1:16020:13352";
Expand Down

This file was deleted.

0 comments on commit 21ccdb5

Please sign in to comment.