Skip to content

Commit

Permalink
Allow Feature-containing files to be used as sources of intervals in …
Browse files Browse the repository at this point in the history
…the -L/-XL arguments

Replicates a much-used feature from GATK3.

Resolves #605
  • Loading branch information
droazen committed Jul 28, 2015
1 parent fa1f43d commit bbaa73b
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 60 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,14 @@ else if ( candidateCodecs.size() > 1 ) {
return candidateCodecs;
}

/**
 * Tests whether a file can be treated as a source of Features.
 *
 * @param file file to check
 * @return true if the file exists and at least one FeatureCodec is able to decode it
 *         (i.e., it contains Features), otherwise false
 */
public static boolean isFeatureFile( final File file ) {
    // A file that is not on disk cannot contain Features, so skip codec discovery entirely
    if ( ! file.exists() ) {
        return false;
    }

    // Any candidate codec at all means the file is decodable as Features
    final boolean noCodecFound = getCandidateCodecsForFile(file).isEmpty();
    return ! noCodecFound;
}

/**
* Permanently closes this manager by closing all backing data sources
*/
Expand Down
146 changes: 87 additions & 59 deletions src/main/java/org/broadinstitute/hellbender/utils/IntervalUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,14 @@
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.engine.FeatureManager;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.text.XReadLines;
Expand All @@ -25,6 +29,13 @@
*/
public final class IntervalUtils {

/**
 * Recognized extensions for interval files.
 * <p>
 * Each entry is upper-case and includes the leading dot. {@code isIntervalFile} upper-cases the
 * candidate file name before testing it with {@code endsWith} against these entries, so matching
 * does not depend on the case used in the actual file name.
 * <p>
 * The list is wrapped as unmodifiable, so callers cannot add or remove extensions.
 */
public static final List<String> INTERVAL_FILE_EXTENSIONS = Collections.unmodifiableList(Arrays.asList(
".LIST", ".INTERVAL_LIST", ".INTERVALS", ".PICARD"
));

/**
* Lexicographical (contig) order comparator.
* <p>
Expand Down Expand Up @@ -125,7 +136,12 @@ public static List<GenomeLoc> parseIntervalArguments(final GenomeLocParser parse
if (isUnmapped(arg)) {
throw new UserException.BadArgumentValue("-L/-XL", arg, "Currently the only way to view unmapped intervals " +
"is to perform a traversal of the entire file without specifying any intervals");
}// if it's a file, add items to raw interval list
}
// If it's a Feature-containing file, convert it to a list of intervals
else if ( FeatureManager.isFeatureFile(new File(arg)) ) {
rawIntervals.addAll(featureFileToIntervals(parser, arg));
}
// If it's an interval file, add items to raw interval list
else if (isIntervalFile(arg)) {
try {
rawIntervals.addAll(intervalFileToList(parser, arg));
Expand All @@ -146,7 +162,27 @@ else if (isIntervalFile(arg)) {
}

/**
* Read a file of genome locations to process. The file may be in BED, Picard,
* Converts a Feature-containing file to a list of intervals
*
* @param parser GenomeLocParser for creating intervals
* @param featureFileName file containing Features to convert to intervals
* @return a List of intervals corresponding to the locations of the Features in the provided file
* @throws UserException.CouldNotReadInputFile if the provided file is not in a supported Feature file format
*/
public static List<GenomeLoc> featureFileToIntervals( final GenomeLocParser parser, final String featureFileName ) {
    // Fail fast on null arguments, in the same way intervalFileToList() validates its inputs
    Utils.nonNull(parser, "parser is null");
    Utils.nonNull(featureFileName, "feature file name is null");

    // Build the File once and reuse it for both codec discovery and the data source
    final File featureFile = new File(featureFileName);
    final FeatureCodec<? extends Feature, ?> codec = FeatureManager.getCodecForFile(featureFile);
    final List<GenomeLoc> featureIntervals = new ArrayList<>();

    // try-with-resources guarantees the data source is closed once every Feature has been read,
    // including when createGenomeLoc() or the iteration itself throws part-way through
    try ( final FeatureDataSource<? extends Feature> dataSource = new FeatureDataSource<>(featureFile, codec) ) {
        for ( final Feature feature : dataSource ) {
            // One interval per Feature, spanning the Feature's own location
            featureIntervals.add(parser.createGenomeLoc(feature));
        }
    }
    return featureIntervals;
}

/**
* Read a file of genome locations to process. The file may be in Picard
* or GATK interval format.
*
* @param glParser GenomeLocParser
Expand All @@ -156,56 +192,47 @@ else if (isIntervalFile(arg)) {
public static List<GenomeLoc> intervalFileToList(final GenomeLocParser glParser, final String fileName) {
Utils.nonNull(glParser, "glParser is null");
Utils.nonNull(fileName, "file name is null");
// try to open file

final File inputFile = new File(fileName);
final List<GenomeLoc> ret = new ArrayList<>();

// case: BED file
if ( fileName.toUpperCase().endsWith(".BED") ) {
// this is now supported in Tribble
throw new UserException("BED files must be parsed through Tribble; parsing them as intervals through the GATK engine is no longer supported");
}
else {
/**
* IF not a BED file:
* first try to read it as a Picard interval file since that's well structured
* we'll fail quickly if it's not a valid file.
*/
boolean isPicardInterval = false;
try {
// Note: Picard will skip over intervals with contigs not in the sequence dictionary
final IntervalList il = IntervalList.fromFile(inputFile);
isPicardInterval = true;

int nInvalidIntervals = 0;
for (final Interval interval : il.getIntervals()) {
if ( glParser.isValidGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true)) {
ret.add(glParser.createGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true));
} else {
nInvalidIntervals++;
}
}
if ( nInvalidIntervals > 0 ) {
logger.warn("Ignoring " + nInvalidIntervals + " invalid intervals from " + inputFile);
/**
* First try to read the file as a Picard interval file since that's well structured --
* we'll fail quickly if it's not a valid file.
*/
boolean isPicardInterval = false;
try {
// Note: Picard will skip over intervals with contigs not in the sequence dictionary
final IntervalList il = IntervalList.fromFile(inputFile);
isPicardInterval = true;

int nInvalidIntervals = 0;
for (final Interval interval : il.getIntervals()) {
if ( glParser.isValidGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true)) {
ret.add(glParser.createGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true));
} else {
nInvalidIntervals++;
}
}

// if that didn't work, try parsing file as a GATK interval file
catch (final Exception e) {
if ( isPicardInterval ) // definitely a picard file, but we failed to parse
{
throw new UserException.CouldNotReadInputFile(inputFile, e);
} else {
try (XReadLines reader = new XReadLines(new File(fileName))) {
for (final String line : reader) {
if (line.trim().length() > 0) {
ret.add(glParser.parseGenomeLoc(line));
}
if ( nInvalidIntervals > 0 ) {
logger.warn("Ignoring " + nInvalidIntervals + " invalid intervals from " + inputFile);
}
}
// if that didn't work, try parsing file as a GATK interval file
catch (final Exception e) {
if ( isPicardInterval ) // definitely a picard file, but we failed to parse
{
throw new UserException.CouldNotReadInputFile(inputFile, e);
} else {
try (XReadLines reader = new XReadLines(new File(fileName))) {
for (final String line : reader) {
if (line.trim().length() > 0) {
ret.add(glParser.parseGenomeLoc(line));
}
}
catch (final IOException e2) {
throw new UserException.CouldNotReadInputFile(inputFile, e2);
}
}
catch (final IOException e2) {
throw new UserException.CouldNotReadInputFile(inputFile, e2);
}
}
}
Expand Down Expand Up @@ -360,31 +387,32 @@ public static boolean isIntervalFile(final String str) {

/**
* Check if string argument was intended as a file
* Accepted file extensions: .bed .list, .picard, .interval_list, .intervals.
* Accepted file extensions are defined in {@link #INTERVAL_FILE_EXTENSIONS}
* @param str token to identify as a filename.
* @param checkExists if true throws an exception if the file doesn't exist.
* @return true if the token looks like a filename, or false otherwise.
* @param checkExists if true throws an exception if the file doesn't exist and has an interval file extension
* @return true if the token looks like an interval file name, or false otherwise.
*/
public static boolean isIntervalFile(final String str, final boolean checkExists) {
Utils.nonNull(str);
// should we define list of file extensions as a public array somewhere?
// is regex or endsiwth better?
final File file = new File(str);
if (str.toUpperCase().endsWith(".BED") || str.toUpperCase().endsWith(".LIST") ||
str.toUpperCase().endsWith(".PICARD") || str.toUpperCase().endsWith(".INTERVAL_LIST")
|| str.toUpperCase().endsWith(".INTERVALS")) {
if (!checkExists) {
return true;
} else if (file.exists()) {

boolean hasIntervalFileExtension = false;
for ( final String extension : INTERVAL_FILE_EXTENSIONS ) {
if ( str.toUpperCase().endsWith(extension) ) {
hasIntervalFileExtension = true;
}
}

if ( hasIntervalFileExtension ) {
if ( ! checkExists || file.exists() ) {
return true;
} else {
throw new UserException.CouldNotReadInputFile(file, "The interval file does not exist.");
}
}

if(file.exists()) {
else if ( file.exists() ) {
throw new UserException.CouldNotReadInputFile(file, String.format("The interval file %s does not have one of " +
"the supported extensions (.bed, .list, .picard, .interval_list, or .intervals). " +
"the supported extensions: " + INTERVAL_FILE_EXTENSIONS + ". " +
"Please rename your file with the appropriate extension. If %s is NOT supposed to be a file, " +
"please move or rename the file at location %s", str, str, file.getAbsolutePath()));
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ public void testIsIntervalFile() {
Assert.assertTrue(IntervalUtils.isIntervalFile(emptyIntervals));
Assert.assertTrue(IntervalUtils.isIntervalFile(emptyIntervals, true));

List<String> extensions = Arrays.asList("bed", "interval_list", "intervals", "list", "picard");
List<String> extensions = Arrays.asList("interval_list", "intervals", "list", "picard");
for (String extension: extensions) {
Assert.assertTrue(IntervalUtils.isIntervalFile("test_intervals." + extension, false), "Tested interval file extension: " + extension);
}
Expand Down

0 comments on commit bbaa73b

Please sign in to comment.