Skip to content

Commit

Permalink
Add DEDUPLICATE_RECORDS option to RevertSam.
Browse files Browse the repository at this point in the history
In some cases, the same record may be found multiple times in a BAM
file.  This option allows the user to remove the extra copies of a
duplicated record (keeping one), whereas the default behavior is to
discard all of the affected records (and their mates, if present).
  • Loading branch information
nh13 committed Dec 13, 2017
1 parent 2bb2714 commit 175984e
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 7 deletions.
24 changes: 24 additions & 0 deletions src/main/java/picard/sam/RevertSam.java
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,11 @@
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
* Reverts a SAM file by optionally restoring original quality scores and by removing
Expand Down Expand Up @@ -164,6 +167,10 @@ public static enum FileType {sam, bam, cram,dynamic}
"the program will exit with an Exception instead of exiting cleanly. Output BAM will still be valid.")
public double MAX_DISCARD_FRACTION = 0.01;

// Opt-in switch for dropping exact duplicate records during sanitization; defaults to off.
@Argument(doc = "If SANITIZE=true discard duplicate records. Duplicate records will have the same values for all fields " +
        "including tags.")
public boolean DEDUPLICATE_RECORDS = false;

@Argument(doc = "The sample alias to use in the reverted output file. This will override the existing " +
"sample alias in the file and is used only if all the read groups in the input file have the " +
"same sample alias ", shortName = StandardOptionDefinitions.SAMPLE_ALIAS_SHORT_NAME, optional = true)
Expand All @@ -190,6 +197,8 @@ protected String[] customCommandLineValidation() {
ValidationUtil.validateSanitizeSortOrder(SANITIZE, SORT_ORDER, errors);
ValidationUtil.validateOutputParams(OUTPUT_BY_READGROUP, OUTPUT, OUTPUT_MAP, errors);

// DEDUPLICATE_RECORDS is rejected unless SANITIZE is also set.
if (DEDUPLICATE_RECORDS && !SANITIZE) {
    errors.add("DEDUPLICATE_RECORDS cannot be used without SANITIZE");
}

if (!errors.isEmpty()) {
return errors.toArray(new String[errors.size()]);
}
Expand Down Expand Up @@ -374,6 +383,21 @@ private long[] sanitize(final Map<SAMReadGroupRecord, FastqQualityFormat> readGr
}
}

// Remove records that have the same SAMString as an earlier record in this group, keeping the
// first occurrence (*** SLOW ***: a full SAM string is built for every record).
if (DEDUPLICATE_RECORDS) {
    final Set<String> seenSamStrings = new HashSet<>();
    for (final Iterator<SAMRecord> recIterator = recs.iterator(); recIterator.hasNext(); ) {
        // Set.add returns false when the string was already present, i.e. this record is a repeat.
        if (!seenSamStrings.add(recIterator.next().getSAMString())) {
            recIterator.remove();
        }
    }
}

// Check that if the first read is marked as unpaired that there is in fact only one read
if (!recs.get(0).getReadPairedFlag() && recs.size() > 1) {
log.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because they claim to be unpaired.");
Expand Down
40 changes: 33 additions & 7 deletions src/test/java/picard/sam/RevertSamTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,7 @@
*/
package picard.sam;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.*;
import htsjdk.samtools.util.CloserUtil;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.testng.Assert;
Expand Down Expand Up @@ -500,4 +494,36 @@ public void testNoRgInfoSanitize() throws Exception {
Assert.assertEquals(runPicardCommandLine(args), 0);
verifyPositiveResults(output, new RevertSam(), true, true, false, false, null, 240, null, null);
}

/**
 * Runs RevertSam with SANITIZE=true and DEDUPLICATE_RECORDS=true on an input in which some
 * records have been written twice, and checks that the run succeeds and the output passes
 * {@code verifyPositiveResults}.
 */
@Test
public void testSanitizeAndDeduplicateRecords() throws Exception {
    final File input = File.createTempFile("test-input-santize-and-deduplicate-records", ".sam");
    final File output = File.createTempFile("test-output-santize-and-deduplicate-records", ".sam");
    // Do not leave the temporary files behind once the test JVM exits.
    input.deleteOnExit();
    output.deleteOnExit();

    // Create a SAM file that has duplicate records: every unpaired or first-of-pair record is written twice.
    // try-with-resources guarantees both the reader and the writer are closed even if copying fails.
    int numDuplicated = 0;
    try (final SamReader reader = SamReaderFactory.makeDefault().open(Paths.get(basicSamToRevert));
         final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(reader.getFileHeader(), false, input)) {
        for (final SAMRecord rec : reader) {
            writer.addAlignment(rec);
            if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) {
                writer.addAlignment(rec);
                numDuplicated++;
            }
        }
    }

    // Make sure some records are duplicated
    Assert.assertTrue(numDuplicated > 0);

    final String[] args = new String[]{
            "I=" + input.getAbsolutePath(),
            "SANITIZE=true",
            "DEDUPLICATE_RECORDS=true",
            "O=" + output.getAbsolutePath()
    };
    Assert.assertEquals(runPicardCommandLine(args), 0);
    // NOTE(review): 8 is presumably the expected number of reads in the output after deduplication - confirm
    // against verifyPositiveResults and the contents of basicSamToRevert.
    verifyPositiveResults(output, new RevertSam(), true, true, false, false, null, 8, null, null);
}
}

0 comments on commit 175984e

Please sign in to comment.