Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add DEDUPLICATE_RECORDS option to RevertSam. #1029

Merged
merged 1 commit into from
Dec 14, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
68 changes: 47 additions & 21 deletions src/main/java/picard/sam/RevertSam.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,7 @@
import java.nio.file.Paths;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;

/**
* Reverts a SAM file by optionally restoring original quality scores and by removing
Expand Down Expand Up @@ -164,6 +161,11 @@ public static enum FileType {sam, bam, cram,dynamic}
"the program will exit with an Exception instead of exiting cleanly. Output BAM will still be valid.")
public double MAX_DISCARD_FRACTION = 0.01;

@Argument(doc = "If SANITIZE=true keep the first record when we find more than one record with the same name for R1/R2/unpaired reads respectively. " +
"For paired end reads, keeps only the first R1 and R2 found respectively, and discards all unpaired reads. " +
"Duplicates do not refer to the duplicate flag in the FLAG field, but instead reads with the same name.")
public boolean KEEP_FIRST_DUPLICATE = false;

@Argument(doc = "The sample alias to use in the reverted output file. This will override the existing " +
"sample alias in the file and is used only if all the read groups in the input file have the " +
"same sample alias ", shortName = StandardOptionDefinitions.SAMPLE_ALIAS_SHORT_NAME, optional = true)
Expand All @@ -190,6 +192,8 @@ protected String[] customCommandLineValidation() {
ValidationUtil.validateSanitizeSortOrder(SANITIZE, SORT_ORDER, errors);
ValidationUtil.validateOutputParams(OUTPUT_BY_READGROUP, OUTPUT, OUTPUT_MAP, errors);

if (!SANITIZE && KEEP_FIRST_DUPLICATE) errors.add("KEEP_FIRST_DUPLICATE cannot be used without SANITIZE");

if (!errors.isEmpty()) {
return errors.toArray(new String[errors.size()]);
}
Expand Down Expand Up @@ -362,36 +366,58 @@ private long[] sanitize(final Map<SAMReadGroupRecord, FastqQualityFormat> readGr
for (final PeekableIterator<SAMRecord> iterator : iterators) {
readNameLoop:
while (iterator.hasNext()) {
final List<SAMRecord> recs = fetchByReadName(iterator);
List<SAMRecord> recs = fetchByReadName(iterator);
total += recs.size();

// Check that all the reads have bases and qualities of the same length
for (final SAMRecord rec : recs) {
if (rec.getReadBases().length != rec.getBaseQualities().length) {
log.debug("Discarding " + recs.size() + " reads with name " + rec.getReadName() + " for mismatching bases and quals length.");
log.debug("Discarding ", recs.size(), " reads with name ", rec.getReadName(), " for mismatching bases and quals length.");
discarded += recs.size();
continue readNameLoop;
}
}

// Check that if the first read is marked as unpaired that there is in fact only one read
if (!recs.get(0).getReadPairedFlag() && recs.size() > 1) {
log.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because they claim to be unpaired.");
discarded += recs.size();
continue readNameLoop;
// Get the number of R1s, R2s, and unpaired reads respectively.
int firsts = 0, seconds = 0, unpaired = 0;
SAMRecord firstRecord = null, secondRecord = null, unpairedRecord = null;
for (final SAMRecord rec : recs) {
if (!rec.getReadPairedFlag()) {
if (unpairedRecord == null) unpairedRecord = rec;
++unpaired;
}
if (rec.getFirstOfPairFlag()) {
if (firstRecord == null) firstRecord = rec;
++firsts;
}
if (rec.getSecondOfPairFlag()) {
if (secondRecord == null) secondRecord = rec;
++seconds;
}
}

// Check that if we have paired reads there is exactly one first of pair and one second of pair
if (recs.get(0).getReadPairedFlag()) {
int firsts = 0, seconds = 0, unpaired = 0;
for (final SAMRecord rec : recs) {
if (!rec.getReadPairedFlag()) ++unpaired;
if (rec.getFirstOfPairFlag()) ++firsts;
if (rec.getSecondOfPairFlag()) ++seconds;
}
// If we have paired reads, then check that there is exactly one first of pair and one second of pair.
// Otherwise, check that we have only one unpaired read.
if (firsts > 0 || seconds > 0) { // if we have any paired reads
if (firsts != 1 || seconds != 1) { // if we do not have exactly one R1 and one R2
if (KEEP_FIRST_DUPLICATE && firsts >= 1 && seconds >= 1) { // if we have at least one R1 and one R2, we can discard all but the first encountered
discarded += recs.size() - 2;
recs = Arrays.asList(firstRecord, secondRecord);
} else {
log.debug("Discarding ", recs.size(), " reads with name ", recs.get(0).getReadName(), " because we found ", firsts, " R1s ", seconds, " R2s and ", unpaired, " unpaired reads.");
discarded += recs.size();
continue readNameLoop;
}

if (unpaired > 0 || firsts != 1 || seconds != 1) {
log.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because pairing information in corrupt.");
}
}
else if (unpaired > 1) { // only unpaired reads, and we have too many
if (KEEP_FIRST_DUPLICATE) {
discarded += recs.size() - 1;
recs = Collections.singletonList(unpairedRecord);
}
else {
log.debug("Discarding ", recs.size(), " reads with name ", recs.get(0).getReadName(), " because we found ", unpaired, " unpaired reads.");
discarded += recs.size();
continue readNameLoop;
}
Expand Down
41 changes: 34 additions & 7 deletions src/test/java/picard/sam/RevertSamTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,7 @@
*/
package picard.sam;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.*;
import htsjdk.samtools.util.CloserUtil;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.testng.Assert;
Expand Down Expand Up @@ -500,4 +494,37 @@ public void testNoRgInfoSanitize() throws Exception {
Assert.assertEquals(runPicardCommandLine(args), 0);
verifyPositiveResults(output, new RevertSam(), true, true, false, false, null, 240, null, null);
}

@Test
public void testSanitizeAndDeduplicateRecords() throws Exception {
final File input = File.createTempFile("test-input-santize-and-deduplicate-records", ".sam");
final File output = File.createTempFile("test-output-santize-and-deduplicate-records", ".sam");

// Create a SAM file that has duplicate records
final SamReader reader = SamReaderFactory.makeDefault().open(Paths.get(basicSamToRevert));
final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(reader.getFileHeader(), false, input);
int numDuplicated = 0;
for (final SAMRecord rec : reader) {
writer.addAlignment(rec);
if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) {
writer.addAlignment(rec);
numDuplicated++;
}
}
reader.close();
writer.close();

// Make sure some records are duplicated
Assert.assertTrue(numDuplicated > 0);

final String [] args = new String[]{
"I=" + input.getAbsolutePath(),
"SANITIZE=true",
"KEEP_FIRST_DUPLICATE=true",
"MAX_DISCARD_FRACTION=1",
"O=" + output.getAbsolutePath()
};
Assert.assertEquals(runPicardCommandLine(args), 0);
verifyPositiveResults(output, new RevertSam(), true, true, false, false, null, 8, null, null);
}
}