Skip to content

Commit

Permalink
Merge 3666087 into 3542342
Browse files Browse the repository at this point in the history
  • Loading branch information
heuermh committed Sep 6, 2018
2 parents 3542342 + 3666087 commit fb96920
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 2 deletions.
Expand Up @@ -202,13 +202,30 @@ object FeatureRDD {
* @return Returns the feature as a single line BED string.
*/
private[rdd] def toBed(feature: Feature): String = {
toBed(feature, None, None, None)
}

/**
* @param feature Feature to write in BED format.
* @return Returns the feature as a single line BED string.
*/
private[rdd] def toBed(feature: Feature,
minimumScore: Option[Double],
maximumScore: Option[Double],
missingValue: Option[Int]): String = {

val chrom = feature.getContigName
val start = feature.getStart
val end = feature.getEnd
val name = Features.nameOf(feature)
val score = Option(feature.getScore).getOrElse(".")
val strand = Features.asString(feature.getStrand)

val score = if (minimumScore.isDefined && maximumScore.isDefined && missingValue.isDefined) {
Features.interpolateScore(feature.getScore, minimumScore.get, maximumScore.get, missingValue.get)
} else {
Features.formatScore(feature.getScore)
}

if (!feature.getAttributes.containsKey("thickStart") &&
!feature.getAttributes.containsKey("itemRgb") &&
!feature.getAttributes.containsKey("blockCount")) {
Expand Down Expand Up @@ -661,7 +678,35 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd
}

/**
* Save this FeatureRDD in BED format.
* Save this FeatureRDD in UCSC BED format, where score is formatted as
* integer values between 0 and 1000, with missing value as specified.
*
* @param fileName The path to save BED formatted text file(s) to.
* @param asSingleFile By default (false), writes file to disk as shards with
* one shard per partition. If true, we save the file to disk as a single
* file by merging the shards.
* @param disableFastConcat If asSingleFile is true, disables the use of the
* parallel file merging engine.
* @param minimumScore Minimum score, interpolated to 0.
* @param maximumScore Maximum score, interpolated to 1000.
* @param missingValue Value to use if score is not specified. Defaults to 0.
*/
def saveAsUcscBed(fileName: String,
asSingleFile: Boolean = false,
disableFastConcat: Boolean = false,
minimumScore: Double,
maximumScore: Double,
missingValue: Int = 0) = {

writeTextRdd(rdd.map(FeatureRDD.toBed(_, Some(minimumScore), Some(maximumScore), Some(missingValue))),
fileName,
asSingleFile,
disableFastConcat)
}

/**
* Save this FeatureRDD in bedtools2 BED format, where score is formatted
* as double floating point values with missing values.
*
* @param fileName The path to save BED formatted text file(s) to.
* @param asSingleFile By default (false), writes file to disk as shards with
Expand All @@ -673,6 +718,7 @@ sealed abstract class FeatureRDD extends AvroGenomicDataset[Feature, FeatureProd
def saveAsBed(fileName: String,
asSingleFile: Boolean = false,
disableFastConcat: Boolean = false) = {

writeTextRdd(rdd.map(FeatureRDD.toBed),
fileName,
asSingleFile,
Expand Down
Expand Up @@ -20,6 +20,7 @@ package org.bdgenomics.adam.rdd.feature
import org.bdgenomics.formats.avro.{ Dbxref, Feature, OntologyTerm, Strand }
import scala.collection.JavaConversions._
import scala.collection.mutable.{ ArrayBuffer, HashMap, MutableList }
import scala.math.{ max, min }

/**
* Utility methods on features and related classes.
Expand Down Expand Up @@ -225,4 +226,55 @@ private[feature] object Features {
}
"sequence_feature"
}

/**
* Format the feature score as double floating point values
* with "." as missing value.
*
* @param score Feature score to format.
* @return Return the specified feature score formatted as
* double floating point values with "." as missing value.
*/
def formatScore(score: java.lang.Double): String = {
Option(score).fold(".")(_.toString)
}

/**
* Interpolate the feature score to integer values between 0 and 1000,
* with missing value as specified.
*
* @param score Feature score to interpolate.
* @param minimumScore Minimum score, interpolated to 0.
* @param maximumScore Maximum score, interpolated to 1000.
* @param missingValue Value to use if score is not specified.
* @return Return the specified feature score interpolated to integer values
* between 0 and 1000, with missing value as specified.
*/
def interpolateScore(score: java.lang.Double,
minimumScore: Double,
maximumScore: Double,
missingValue: Int): Int = {

def constrain(v: Double, min: Double, max: Double): Double = {
if (v < min) {
min
} else if (v > max) {
max
} else {
v
}
}

def interp(value: Double,
sourceMin: Double,
sourceMax: Double,
targetMin: Double,
targetMax: Double): Double = {

val v = max(min(sourceMax, value), sourceMin)
targetMin + (targetMax - targetMin) * ((v - sourceMin) / (sourceMax - sourceMin))
}

Option(score).fold(missingValue)(interp(_, minimumScore, maximumScore, 0, 1000).toInt)
}
}
Expand Up @@ -343,6 +343,26 @@ class FeatureRDDSuite extends ADAMFunSuite {
checkFiles(inputPath, outputPath)
}

sparkTest("save to UCSC BED format") {
val inputPath = testFile("dvl1.200.bed")
val expected = sc.loadBed(inputPath)
val outputPath = tempLocation(".bed")
expected.saveAsUcscBed(outputPath,
asSingleFile = true,
minimumScore = 0.0,
maximumScore = 200.0)

val lines = sc.textFile(outputPath)
val bedCols = lines.first.split("\t")
assert(bedCols.size === 6)
assert(bedCols(0) === "1")
assert(bedCols(1) === "1331345")
assert(bedCols(2) === "1331536")
assert(bedCols(3) === "106624")
assert(bedCols(4) === "67")
assert(bedCols(5) === "+")
}

sparkTest("save IntervalList as GTF format") {
val inputPath = testFile("SeqCap_EZ_Exome_v3.hg19.interval_list")
val features = sc.loadIntervalList(inputPath)
Expand Down

0 comments on commit fb96920

Please sign in to comment.