Skip to content

Commit

Permalink
A port of FastaAlternateReferenceMaker and FastaReferenceMaker from G…
Browse files Browse the repository at this point in the history
…ATK3 (#5549)

* Ported FastaAlternateReferenceMaker and FastaReferenceMaker from GATK3

* Added a new walker type ReferenceWalker, and a new tool CountBasesInReference to demonstrate it

* Made changes to the close behavior in FastaWriter, and added additional buffering

Resolves #4891
  • Loading branch information
lbergelson authored and droazen committed Jan 29, 2019
1 parent 5cdb1d9 commit 64e2d3a
Show file tree
Hide file tree
Showing 111 changed files with 2,212 additions and 112 deletions.
Expand Up @@ -2,13 +2,12 @@

import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.ReferenceSequence;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.iterators.ByteArrayIterator;

import java.util.Arrays;
import java.util.Iterator;

/**
Expand Down
@@ -0,0 +1,107 @@
package org.broadinstitute.hellbender.engine;

import org.broadinstitute.hellbender.engine.filters.CountingReadFilter;
import org.broadinstitute.hellbender.tools.examples.ExampleReferenceWalker;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.iterators.IntervalLocusIterator;

/**
* A reference walker is a tool which processes each base in a given reference. Each base can be processed individually
* or in a context of multiple bases in a window. Reads and Feature tracks can optionally be included as well. The
* reference bases to process can be subset to a given set of intervals.
*
* ReferenceWalker authors must implement the apply() method to process each position, and may optionally implement
* {@link #onTraversalStart()} and/or {@link #onTraversalSuccess()}.
*
* Tool authors may also implement {@link #getReferenceWindow(SimpleInterval)} to provide additional bases of context
* around each position.
*
* See the {@link ExampleReferenceWalker} walker for an example.
*/
public abstract class ReferenceWalker extends GATKTool {

@Override
public String getProgressMeterRecordLabel() { return "bases"; }

@Override
public final boolean requiresReference() { return true; }

/**
* Initialize data sources for traversal.
*
* Marked final so that tool authors don't override it. Tool authors should override onTraversalStart() instead.
*/
@Override
protected final void onStartup() {
super.onStartup();
}

/**
* Implementation of reference-locus-based traversal.
* Subclasses can override to provide own behavior but default implementation should be suitable for most uses.
*/
@Override
public void traverse() {
final CountingReadFilter readFilter = makeReadFilter();

for(final SimpleInterval locus : getIntervalIterator()){
final SimpleInterval referenceWindow = getReferenceWindow(locus);
final ReferenceContext referenceContext = new ReferenceContext(reference, locus, referenceWindow);
apply(referenceContext,
new ReadsContext(reads, referenceContext.getWindow(), readFilter), // Will create an empty ReadsContext if reads == null
new FeatureContext(features, referenceContext.getWindow())); // Will create an empty FeatureContext if features == null

progressMeter.update(referenceContext.getInterval());
};

}

/**
* Determine the window to use when creating the ReferenceContext in apply. This determines which reference bases are
* seen at each position, as well as which reads / features are considered to overlap the reference site.
*
* The default implementation returns the single reference locus passed that is passed in, but subclasses may override
* this method in order to change the window size.
*
* @param locus the current locus being processed in the traversal
* @return the window to use when creating the ReferenceContext that is passed into {@link #apply(ReferenceContext, ReadsContext, FeatureContext)}
*/
protected SimpleInterval getReferenceWindow(SimpleInterval locus){
return locus;
}

private Iterable<SimpleInterval> getIntervalIterator(){
return () -> new IntervalLocusIterator(getTraversalIntervals().iterator());
}

/**
* Process an individual reference locus (with optional contextual information). Must be implemented by tool authors.
* In general, tool authors should simply stream their output from apply(), and maintain as little internal state
* as possible.
*
* @param referenceContext Reference bases at the current locus. The window around the single locus is provided by {@link #getReferenceWindow(SimpleInterval)}
* Additional bases my be retrieved by invoking the by invoking {@link ReferenceContext#setWindow}
* on this object before calling {@link ReferenceContext#getBases}, however this will not effect
* the set of reads and features that are considered overlapping which are based on the preset
* window passed in.
* @param readsContext Reads spanning the current reference window. Will be an empty, but non-null, context object
* if there is no backing source of Read data (in which case all queries on it will return an
* empty List).
* @param featureContext Features spanning the current reference window. Will be an empty, but non-null, context object
* if there is no backing source of Feature data (in which case all queries on it will return an
* empty List).
*
*/
public abstract void apply(ReferenceContext referenceContext, ReadsContext readsContext, FeatureContext featureContext );

/**
* Shutdown data sources.
*
* Marked final so that tool authors don't override it. Tool authors should override {@link #onTraversalSuccess()} instead.
*/
@Override
protected final void onShutdown() {
// Overridden only to make final so that concrete tool implementations don't override
super.onShutdown();
}
}
@@ -0,0 +1,81 @@
package org.broadinstitute.hellbender.tools.examples;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.ExampleProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

/**
* Counts the number of times each reference context is seen as well as how many times it's overlapped by reads and variants.
* Why you would want to do this is a mystery.
*
* This is a toy example to illustrate how to implement a ReferenceWalker.
*/
@CommandLineProgramProperties(
summary = "Example of how to implement a ReferenceWalker that uses reads and features as well as a custom window",
oneLineSummary = "Example of how to implement a ReferenceWalker",
programGroup = ExampleProgramGroup.class,
omitFromCommandLine = true)
public class ExampleReferenceWalker extends ReferenceWalker {

@Argument(fullName = StandardArgumentDefinitions.VARIANT_LONG_NAME,
shortName = StandardArgumentDefinitions.VARIANT_SHORT_NAME,
doc="variants to count overlaps of")
private FeatureInput<VariantContext> variants;

@VisibleForTesting
final Map<String, OverlapCounts> contextCounts = new HashMap<>();

@Override
protected SimpleInterval getReferenceWindow(SimpleInterval locus) {
// add one base of padding to each side of each reference locus.
return new SimpleInterval(locus.getContig(), Math.max(locus.getStart() -1, 1), locus.getStart()+1);
}

@Override
public void apply(ReferenceContext referenceContext, ReadsContext readsContext, FeatureContext featureContext) {
final byte[] bases = referenceContext.getBases();
final String baseString = new String(bases);
final OverlapCounts counts = contextCounts.getOrDefault(baseString, new OverlapCounts());
if(readsContext.iterator().hasNext()){
counts.overlappedByReads++;
}

if(!featureContext.getValues(variants).isEmpty()){
counts.overlappedByVariants++;
}

counts.timesSeen++;

contextCounts.put(baseString, counts);
}

@Override
public Object onTraversalSuccess(){
contextCounts.entrySet().stream()
.sorted(Comparator.comparing(Entry::getKey))
.forEachOrdered(entry -> System.out.println(entry.getKey() + " " + entry.getValue().toString()));
return 0;
}

@VisibleForTesting
static class OverlapCounts {
long timesSeen = 0;
long overlappedByReads = 0;
long overlappedByVariants = 0;

@Override
public String toString(){
return String.format("Seen: %d Reads %d Variants %d", timesSeen, overlappedByReads, overlappedByVariants);
}
}
}
@@ -0,0 +1,37 @@
package org.broadinstitute.hellbender.tools.walkers.fasta;

import com.google.common.annotations.VisibleForTesting;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.ReferenceWalker;
import picard.cmdline.programgroups.ReferenceProgramGroup;

@DocumentedFeature
@CommandLineProgramProperties(
oneLineSummary = "Count the numbers of each base in a reference file",
summary = "Count the number of times each individual base occurs in a reference file.",
programGroup = ReferenceProgramGroup.class
)
public class CountBasesInReference extends ReferenceWalker {
@VisibleForTesting
final long[] baseCounts = new long[256];

@Override
public void apply(ReferenceContext referenceContext, ReadsContext read, FeatureContext featureContext) {
baseCounts[referenceContext.getBase()]++;
}

@Override
public Object onTraversalSuccess(){
for (int i = 0; i < baseCounts.length; i++) {
final long count = baseCounts[i];
if (count > 0){
System.out.println((char)i + " : " + count );
}
}
return 0;
}
}

0 comments on commit 64e2d3a

Please sign in to comment.