Skip to content

Commit

Permalink
Add per-library option to SplitReads.
Browse files Browse the repository at this point in the history
  • Loading branch information
cmnbroad committed Aug 19, 2015
1 parent 1832d3d commit 12c6626
Show file tree
Hide file tree
Showing 9 changed files with 181 additions and 27 deletions.
54 changes: 46 additions & 8 deletions src/main/java/org/broadinstitute/hellbender/tools/SplitReads.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.readersplitters.LibraryNameSplitter;
import org.broadinstitute.hellbender.tools.readersplitters.ReadGroupIdSplitter;
import org.broadinstitute.hellbender.tools.readersplitters.ReaderSplitter;
import org.broadinstitute.hellbender.tools.readersplitters.SampleNameSplitter;
Expand All @@ -31,6 +33,8 @@ public final class SplitReads extends ReadWalker {

public static final String SAMPLE_SHORT_NAME = "SM";
public static final String READ_GROUP_SHORT_NAME = "RG";
public static final String LIBRARY_NAME_SHORT_NAME = "LB";
public static final String UNKNOWN_OUT_PREFIX = "unknown";


@Argument(shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
Expand All @@ -46,6 +50,10 @@ public final class SplitReads extends ReadWalker {
doc = "Split file by read group.")
public boolean READ_GROUP;

@Argument(shortName = LIBRARY_NAME_SHORT_NAME,
doc = "Split file by library.")
public boolean LIBRARY_NAME;

private final List<ReaderSplitter<?>> splitters = new ArrayList<>();
private Map<String, SAMFileGATKReadWriter> outs = null;

Expand All @@ -62,12 +70,15 @@ public void onTraversalStart() {
if (READ_GROUP) {
splitters.add(new ReadGroupIdSplitter());
}
if (LIBRARY_NAME) {
splitters.add(new LibraryNameSplitter());
}
outs = createWriters(splitters);
}

@Override
public void apply( GATKRead read, ReferenceContext referenceContext, FeatureContext featureContext ) {
outs.get(getKey(splitters, read)).addRead(read);
outs.computeIfAbsent(getKey(splitters, read), this::createUnknownOutOnDemand).addRead(read);
}

@Override
Expand All @@ -78,6 +89,33 @@ public Object onTraversalDone() {
return null;
}

// Create an output stream on demand for holding any reads that do not have a value for one or more of the
// attributes we're grouping by
private SAMFileGATKReadWriter createUnknownOutOnDemand(String attributeValue) {
if (!attributeValue.equals("."+UNKNOWN_OUT_PREFIX)) {
// the only attribute value we should ever discover at runtime is the string ".unknown" which is
// synthesized by "getkey" below when a splitter returns null because we're splitting on some
// attribute for which a given read/group has no value; anything else indicates a coding error
throw new GATKException.ShouldNeverReachHereException("Unrecognized attribute value found: " + attributeValue);
}
final SAMFileWriterFactory samFileWriterFactory = new SAMFileWriterFactory();
final SAMFileHeader samFileHeaderIn = getHeaderForReads();

return prepareSAMFileWriter(samFileWriterFactory, samFileHeaderIn, attributeValue);
}

// Create a new output file and prepare and return the corresponding SAMFileGATKReadWriter.
private SAMFileGATKReadWriter prepareSAMFileWriter(
SAMFileWriterFactory samFileWriterFactory,
SAMFileHeader samFileHeaderIn,
final String keyName) {
final String base = FilenameUtils.getBaseName(readArguments.getReadFiles().get(0).getName());
final String extension = "." + FilenameUtils.getExtension(readArguments.getReadFiles().get(0).getName());
final File outFile = new File(OUTPUT_DIRECTORY, base + keyName + extension);
final SAMFileHeader samFileHeaderOut = ReadUtils.cloneSAMFileHeader(samFileHeaderIn);
return new SAMFileGATKReadWriter(samFileWriterFactory.makeWriter(samFileHeaderOut, true, outFile, referenceArguments.getReferenceFile()));
}

/**
* Creates SAMFileWriter instances for the reader splitters based on the input file.
* @param splitters Reader splitters.
Expand All @@ -87,10 +125,7 @@ private Map<String, SAMFileGATKReadWriter> createWriters(final List<ReaderSplitt
final Map<String, SAMFileGATKReadWriter> outs = new HashMap<>();

final SAMFileWriterFactory samFileWriterFactory = new SAMFileWriterFactory();

final SAMFileHeader samFileHeaderIn = getHeaderForReads();
final String base = FilenameUtils.getBaseName(readArguments.getReadFiles().get(0).getName());
final String extension = "." + FilenameUtils.getExtension(readArguments.getReadFiles().get(0).getName());

// Build up a list of key options at each level.
final List<List<?>> splitKeys = splitters.stream()
Expand All @@ -99,9 +134,7 @@ private Map<String, SAMFileGATKReadWriter> createWriters(final List<ReaderSplitt

// For every combination of keys, add a SAMFileWriter.
addKey(splitKeys, 0, "", key -> {
final SAMFileHeader samFileHeaderOut = ReadUtils.cloneSAMFileHeader(samFileHeaderIn);
final File outFile = new File(OUTPUT_DIRECTORY, base + key + extension);
outs.put(key, new SAMFileGATKReadWriter(samFileWriterFactory.makeWriter(samFileHeaderOut, true, outFile, referenceArguments.getReferenceFile())));
outs.put(key, prepareSAMFileWriter(samFileWriterFactory, samFileHeaderIn, key));
});

return outs;
Expand Down Expand Up @@ -132,8 +165,13 @@ private void addKey(final List<List<?>> listKeys, final int listIndex,
* @return The generated key that may then be used to find the appropriate SAMFileWriter.
*/
private String getKey(final List<ReaderSplitter<?>> splitters, final GATKRead record) {
// if a read is missing the value for the target split, return the value "unknown" which will
// result in a new output stream being created on demand to hold uncategorized reads
return splitters.stream()
.map(s -> s.getSplitBy(record, getHeaderForReads()).toString())
.map(s -> {
final Object key = s.getSplitBy(record, getHeaderForReads());
return key == null ? UNKNOWN_OUT_PREFIX : key.toString();
})
.reduce("", (acc, item) -> acc + "." + item);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package org.broadinstitute.hellbender.tools.readersplitters;

import htsjdk.samtools.SAMReadGroupRecord;

import java.util.function.Function;

/**
* Splits readers by library name.
*/
public final class LibraryNameSplitter extends ReadGroupSplitter<String> {
@Override
protected Function<SAMReadGroupRecord, String> getSplitByFunction() {
return SAMReadGroupRecord::getLibrary;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,17 @@

public final class SplitReadsIntegrationTest extends CommandLineProgramTest {

private static final String TEST_DATA_PREFIX = "split_reads";
private static final String REFERENCE_SEQUENCE = TEST_DATA_PREFIX + ".fasta";
private static final String TEST_DATA_GOOD_READS_PREFIX = "split_reads";
private static final String TEST_DATA_MISSING_LIB__PREFIX = "split_reads_missing_lib";

private File getReferenceSequence() {
return new File(getTestDataDir(), REFERENCE_SEQUENCE);
}

private String getReferenceSequenceName(final String baseName) { return baseName + ".fasta"; }

private boolean isReferenceRequired(final SamReader.Type type) {
return type == SamReader.Type.CRAM_TYPE;
}

@DataProvider(name = "splitReadsData", parallel = true)
@DataProvider(name = "splitReadsData")
public Object[][] getSplitReadsData() {
final Map<String, Integer> byNone = new TreeMap<>();
byNone.put("", 19);
Expand All @@ -46,66 +45,85 @@ public Object[][] getSplitReadsData() {
byRG.put(".0", 17);
byRG.put(".1", 2);

final Map<String, Integer> byLibrary = new TreeMap<>();
byLibrary.put(".whatever", 19);

final Map<String, Integer> bySampleAndRG = new TreeMap<>();
bySampleAndRG.put(".Momma.0", 17);
bySampleAndRG.put(".Poppa.1", 2);

final Map<String, Integer> bySampleAndRGAndLibrary = new TreeMap<>();
bySampleAndRGAndLibrary.put(".Momma.0.whatever", 17);

// test that reads from RGs with no library attribute are output to "unknown"
final Map<String, Integer> byUnknown = new TreeMap<>();
byUnknown.put(".whatever", 2);
byUnknown.put("." + SplitReads.UNKNOWN_OUT_PREFIX, 17);

final Function<SamReader.Type, Stream<Object[]>> argTests = t -> Stream.of(
new Object[]{t, Collections.<String>emptyList(), byNone},
new Object[]{t, Collections.singletonList(SplitReads.SAMPLE_SHORT_NAME), bySample},
new Object[]{t, Collections.singletonList(SplitReads.READ_GROUP_SHORT_NAME), byRG},
new Object[]{t, Arrays.asList(SplitReads.SAMPLE_SHORT_NAME, SplitReads.READ_GROUP_SHORT_NAME), bySampleAndRG}
new Object[]{t, TEST_DATA_GOOD_READS_PREFIX, Collections.<String>emptyList(), byNone},
new Object[]{t, TEST_DATA_GOOD_READS_PREFIX, Collections.singletonList(SplitReads.SAMPLE_SHORT_NAME), bySample},
new Object[]{t, TEST_DATA_GOOD_READS_PREFIX, Collections.singletonList(SplitReads.READ_GROUP_SHORT_NAME), byRG},
new Object[]{t, TEST_DATA_GOOD_READS_PREFIX, Collections.singletonList(SplitReads.LIBRARY_NAME_SHORT_NAME), byLibrary},
new Object[]{t, TEST_DATA_GOOD_READS_PREFIX, Arrays.asList(SplitReads.SAMPLE_SHORT_NAME, SplitReads.READ_GROUP_SHORT_NAME), bySampleAndRG},
new Object[]{t, TEST_DATA_GOOD_READS_PREFIX, Arrays.asList(
SplitReads.SAMPLE_SHORT_NAME,
SplitReads.READ_GROUP_SHORT_NAME,
SplitReads.LIBRARY_NAME_SHORT_NAME),
bySampleAndRGAndLibrary
},
new Object[]{t, TEST_DATA_MISSING_LIB__PREFIX, Collections.singletonList(SplitReads.LIBRARY_NAME_SHORT_NAME), byUnknown}
);

return getSamReaderTypes()
.filter(t -> t != SamReader.Type.CRAM_TYPE) // https://github.com/samtools/htsjdk/issues/148 && https://github.com/samtools/htsjdk/issues/153
.map(argTests)
.flatMap(Function.identity())
.toArray(Object[][]::new);
}

@Test(dataProvider = "splitReadsData")
public void testSplitReadsByReadGroup(final SamReader.Type type,
final String baseName,
final List<String> splitArgs,
final Map<String, Integer> splitCounts) throws Exception {
final String fileExtension = "." + type.fileExtension();
final List<String> args = new ArrayList<>();

Path outputDir = Files.createTempDirectory(
splitArgs.stream().reduce(TEST_DATA_PREFIX, (acc, arg) -> acc + "." + arg) + fileExtension + "."
splitArgs.stream().reduce(baseName, (acc, arg) -> acc + "." + arg) + fileExtension + "."
);
outputDir.toFile().deleteOnExit();

args.add("-"+ StandardArgumentDefinitions.INPUT_SHORT_NAME);
args.add(getTestDataDir() + "/" + TEST_DATA_PREFIX + fileExtension);
args.add(getTestDataDir() + "/" + baseName + fileExtension);

args.add("-"+ StandardArgumentDefinitions.OUTPUT_SHORT_NAME );
args.add(outputDir.toString());

if (isReferenceRequired(type)) {
args.add("-" + StandardArgumentDefinitions.REFERENCE_SHORT_NAME );
args.add(getTestDataDir()+ "/" + REFERENCE_SEQUENCE);
args.add(getTestDataDir()+ "/" + getReferenceSequenceName(baseName));
}

splitArgs.forEach(arg -> {
args.add("-" + arg );
args.add("-" + arg);
});

Assert.assertNull(runCommandLine(args));

for (final Map.Entry<String, Integer> splitCount: splitCounts.entrySet()) {
final String outputFileName = TEST_DATA_PREFIX + splitCount.getKey() + fileExtension;
final String outputFileName = baseName + splitCount.getKey() + fileExtension;
Assert.assertEquals(
getReadCounts(outputDir, outputFileName),
getReadCounts(outputDir, baseName, outputFileName),
(int)splitCount.getValue(),
"unexpected read count for " + outputFileName);
}
}

private int getReadCounts(final Path tempDirectory, final String fileName) {
private int getReadCounts(final Path tempDirectory, final String baseName, final String fileName) {
final File path = tempDirectory.resolve(fileName).toFile();
IOUtil.assertFileIsReadable(path);
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(getReferenceSequence()).open(path);
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(new File(getTestDataDir(), getReferenceSequenceName(baseName))).open(path);
int count = 0;
for (@SuppressWarnings("unused") final SAMRecord rec : in) {
count++;
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
@HD VN:1.5 SO:unsorted
@SQ SN:chr1 LN:101 M5:bd01f7e11515bb6beda8f7257902aa67 UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/split_reads_missing_lib.fasta
@SQ SN:chr2 LN:101 M5:31c33e2155b3de5e2554b693c475b310 UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/split_reads_missing_lib.fasta
@SQ SN:chr3 LN:101 M5:631593c6dd2048ae88dcce2bd505d295 UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/split_reads_missing_lib.fasta
@SQ SN:chr4 LN:101 M5:c60cb92f1ee5b78053c92bdbfa19abf1 UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/split_reads_missing_lib.fasta
@SQ SN:chr5 LN:101 M5:07ebc213c7611db0eacbb1590c3e9bda UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/split_reads_missing_lib.fasta
@SQ SN:chr6 LN:101 M5:7be2f5e7ee39e60a6c3b5b6a41178c6d UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/split_reads_missing_lib.fasta
@SQ SN:chr7 LN:202 M5:93763aaf6a455871c7d7a7718bff9ccf UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/split_reads_missing_lib.fasta
@SQ SN:chr8 LN:202 M5:d339678efce576d5546e88b49a487b63 UR:file:/home/chris/projects/hellbender/src/test/resources/org/broadinstitute/hellbender/tools/split_reads_missing_lib.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
>chr1
TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC
TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA
A
>chr2
CATCTCTACAAGCGCGTCCTACCAGACGCGCTTCCGATCTGAGAGCATAC
TTTTCATTGGATTCCAGCACAACTCCATTTTTGATCCACTTGACACCTTT
T
>chr3
CGTATGCGCTTTTTATGTCGCCCACAGTGCCTAGTATAGCCCCTGCTAAT
AAAAAGAGATGAATACGTTTACTTAAAAAACTGAAACTAGGAATGTGCAA
A
>chr4
CGTGATACCAACTCATGTTCACAGCCAAAGCCTGAAGCTGTCTATTATAT
TTCTCAACCATAAACTTTTGCCTCAGGCATCCGCAGAATGGTTTGCAGCC
C
>chr5
NTCTCATTTAAAAATGGTTATAAAAACATTTATGCTGAAAAGGTGAAGTT
CATTAATGAACAGGCTGACTGTCTCACTATCGCGTTCGCAAGACGTTATC
T
>chr6
NAATTGTTCTTAGTTTCTCGGTTTATGTGCTCTTCCAGGTGGGTAACACA
ATAATGGCCTTCCAGATCGTAAGAGCGACGTGTGTTGCACCAGTGTCGAT
C
>chr7
CAACAGAAGGGGGGATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGG
TTTTCGGGTCCCCCCCCCATCCCGATTTCCTTCCGCAGCTTACCTCCCGA
AACGCGGCATCCCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCA
GCGCGTCGTGTAGGTCACTATGGTACATCTTGTCGTGCGGCCAGAGCATA
CA
>chr8
CACATCGTGAATCTTACAATCTGCGGTTTCAGATGTGGAGCGATGTGTGA
GAGATTGAGCAACTGATCTGAAAAGCAGACACAGCTATTCCTAAGATGAC
CCCAGGTTCAAATGTGCAGCCCCTTTTGAGAGATTTTTTTTTTGGGCTGG
AAAAAAGACACAGCTATTCCTAAGATGACAAGATCAGAAAAAAAGTCAAG
CA
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
chr1 101 6 50 51
chr2 101 116 50 51
chr3 101 226 50 51
chr4 101 336 50 51
chr5 101 446 50 51
chr6 101 556 50 51
chr7 202 666 50 51
chr8 202 879 50 51
Loading

0 comments on commit 12c6626

Please sign in to comment.