Commit beec466
Merge pull request #404 from tdanford/gene-models
[ADAM-327] Adding gene, transcript, and exon models.
fnothaft committed Oct 13, 2014
2 parents 3d2e5c8 + ba2e4a5 commit beec466
Showing 33 changed files with 1,288 additions and 602 deletions.
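
The Gene, Transcript, and Exon models named in the commit title live in files this truncated listing does not show. Judging only from how the new PrintGenes command (below) consumes them, they have roughly the following shape. This is an inferred Scala sketch, not the committed code; every field and type here is an assumption:

    // Inferred sketch only; the committed models live in org.bdgenomics.adam.models.
    // The fields are exactly those PrintGenes reads; Exon's shape is not visible in this diff.
    class Gene(val id: String,
               val names: Seq[String],
               val transcripts: Iterable[Transcript]) extends Serializable

    class Transcript(val id: String,
                     val region: ReferenceRegion, // referenceName, start, end
                     val strand: Boolean, // rendered as "+" when true by PrintGenes
                     val exons: Iterable[Exon]) extends Serializable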
@@ -34,7 +34,6 @@ object ADAMMain extends Logging {
      FindReads,
      CalculateDepth,
      CountKmers,
-      PileupAggregator,
      Transform,
      /* TODO (nealsid): Reimplement in terms of new schema
      ComputeVariants
@@ -52,6 +51,7 @@
      Features2ADAM)),
    CommandGroup("PRINT", List(
      PrintADAM,
+      PrintGenes,
      FlagStat,
      VizReads,
      PrintTags,
@@ -23,6 +23,7 @@ import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.models.BaseFeature
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.features.ADAMFeaturesContext._
+import org.bdgenomics.formats.avro.Feature
import org.kohsuke.args4j.Argument

object Features2ADAM extends ADAMCommandCompanion {
@@ -52,11 +53,13 @@ class Features2ADAM(val args: Features2ADAMArgs)
    // regex: anything . (extension) EOL
    val extensionPattern = """.*[.]([^.]*)$""".r
    val extensionPattern(extension) = args.featuresFile
-    val features: RDD[_ <: BaseFeature] = extension.toLowerCase match {
+    val features: RDD[Feature] = extension.toLowerCase match {
+      case "gff" => sc.adamGTFFeatureLoad(args.featuresFile) // TODO(Timothy) write a GFF-specific loader?
+      case "gtf" => sc.adamGTFFeatureLoad(args.featuresFile)
      case "bed" => sc.adamBEDFeatureLoad(args.featuresFile)
      case "narrowPeak" => sc.adamNarrowPeakFeatureLoad(args.featuresFile)
    }
-    features.map(f => f.feature).adamSave(args.outputPath, blockSize = args.blockSize,
+    features.adamSave(args.outputPath, blockSize = args.blockSize,
      pageSize = args.pageSize, compressCodec = args.compressionCodec,
      disableDictionaryEncoding = args.disableDictionary)
  }
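
As an aside, the `val extensionPattern(extension) = args.featuresFile` line above uses a Scala regex extractor in a val pattern: the regex is matched against the string and its capture group is bound to `extension`. A minimal self-contained sketch (the path is hypothetical):

    object ExtensionDemo extends App {
      val extensionPattern = """.*[.]([^.]*)$""".r
      // A Regex used as a val pattern binds its capture group,
      // or throws a MatchError if the string contains no ".".
      val extensionPattern(extension) = "features/peaks.narrowPeak" // hypothetical path
      println(extension) // prints "narrowPeak"
    }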

This file was deleted.

72 changes: 72 additions & 0 deletions adam-cli/src/main/scala/org/bdgenomics/adam/cli/PrintGenes.scala
@@ -0,0 +1,72 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.cli

import org.apache.hadoop.mapreduce.Job
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.models._
import org.bdgenomics.adam.rdd.features.ADAMFeaturesContext._
import org.bdgenomics.adam.models.GeneContext._
import org.bdgenomics.adam.rdd.features.GeneFeatureRDD._
import org.bdgenomics.formats.avro.Feature
import org.kohsuke.args4j.{ Option => option, Argument }
import org.bdgenomics.adam.rdd.ADAMContext._
import org.apache.spark.SparkContext

object PrintGenes extends ADAMCommandCompanion {
  val commandName: String = "print_genes"
  val commandDescription: String = "Load a GTF file containing gene annotations and print the corresponding gene models"

  def apply(cmdLine: Array[String]) = {
    new PrintGenes(Args4j[PrintGenesArgs](cmdLine))
  }
}

class PrintGenesArgs extends Args4jBase with ParquetArgs with Serializable {
  @Argument(metaVar = "GTF", required = true, usage = "GTF file with gene model data", index = 0)
  var gtfInput: String = _
}

class PrintGenes(protected val args: PrintGenesArgs)
  extends ADAMSparkCommand[PrintGenesArgs] with Serializable {

  val companion = PrintGenes

  def run(sc: SparkContext, job: Job): Unit = {
    val features: RDD[Feature] = sc.adamGTFFeatureLoad(args.gtfInput)
    val genes: RDD[Gene] = features.asGenes()

    genes.map(printGene).collect().foreach(println)
  }

  def printGene(gene: Gene): String = {
    val builder = new StringBuilder()
    builder.append("Gene %s (%s)".format(gene.id, gene.names.mkString(",")))
    gene.transcripts.foreach(t => builder.append(printTranscript(t)))
    builder.toString()
  }

  def printTranscript(transcript: Transcript): String = {
    "\n\tTranscript %s %s:%d-%d:%s (%d exons)".format(
      transcript.id,
      transcript.region.referenceName,
      transcript.region.start, transcript.region.end,
      if (transcript.strand) "+" else "-",
      transcript.exons.size)
  }
}
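
Wired into ADAMMain above as print_genes, the command would be invoked roughly as follows. The launcher name and GTF path are assumed, and the IDs in the output are illustrative; the output shape follows the format strings in printGene and printTranscript:

    adam print_genes annotations.gtf
    Gene ENSG00000139618 (BRCA2)
        Transcript ENST00000380152 13:32889610-32973805:+ (27 exons)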
13 changes: 2 additions & 11 deletions adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Ref.scala
@@ -50,9 +50,6 @@ class Reads2RefArgs extends Args4jBase with ParquetArgs {
  @option(name = "-mapq", usage = "Minimal mapq value allowed for a read (default = 30)")
  var minMapq: Long = Reads2RefArgs.MIN_MAPQ_DEFAULT

-  @option(name = "-aggregate", usage = "Aggregates data at each pileup position, to reduce storage cost.")
-  var aggregate: Boolean = false
-
  @option(name = "-allowNonPrimaryAlignments", usage = "Converts reads that are not at their primary alignment positions to pileups.")
  var nonPrimary: Boolean = true
}
@@ -71,13 +68,7 @@ class Reads2Ref(protected val args: Reads2RefArgs) extends ADAMSparkCommand[Reads2RefArgs] {

    val coverage = pileupCount / readCount

-    if (args.aggregate) {
-      pileups.adamAggregatePileups(coverage.toInt).adamSave(args.pileupOutput,
-        blockSize = args.blockSize, pageSize = args.pageSize, compressCodec = args.compressionCodec,
-        disableDictionaryEncoding = args.disableDictionary)
-    } else {
-      pileups.adamSave(args.pileupOutput, blockSize = args.blockSize, pageSize = args.pageSize,
-        compressCodec = args.compressionCodec, disableDictionaryEncoding = args.disableDictionary)
-    }
+    pileups.adamSave(args.pileupOutput, blockSize = args.blockSize, pageSize = args.pageSize,
+      compressCodec = args.compressionCodec, disableDictionaryEncoding = args.disableDictionary)
  }
}
@@ -17,33 +17,125 @@
*/
package org.bdgenomics.adam.models

-import org.bdgenomics.formats.avro.Feature
+import org.bdgenomics.formats.avro.{ Strand, Feature }

-class BaseFeature(val feature: Feature) {
-  def featureId = feature.getFeatureId
-  def contig = feature.getContig
-  def start = feature.getStart
-  def end = feature.getEnd
+import scala.collection.JavaConversions
+import scala.collection.JavaConversions._
+import scala.collection._
+
+object BaseFeature {
+
+  def strandChar(strand: Strand): Char =
+    strand match {
+      case Strand.Forward => '+'
+      case Strand.Reverse => '-'
+      case Strand.Independent => '.'
+    }
+
+  def frameChar(feature: Feature): Char = {
+    val opt: Map[CharSequence, CharSequence] = feature.getAttributes
+    opt.get("frame").map(_.charAt(0)).getOrElse('.')
+  }
+
+  def attributeString(feature: Feature): String =
+    feature.getAttributes.mkString(",")
+
+  def attrs(f: Feature): Map[CharSequence, CharSequence] =
+    JavaConversions.mapAsScalaMap(f.getAttributes)
+
+  def attributeString(feature: Feature, attrName: String): Option[String] =
+    attrs(feature).get(attrName).map(_.toString)
+
+  def attributeLong(feature: Feature, attrName: String): Option[Long] =
+    attrs(feature).get(attrName).map(_.toString).map(_.toLong)
+
+  def attributeDouble(feature: Feature, attrName: String): Option[Double] =
+    attrs(feature).get(attrName).map(_.toString).map(_.toDouble)
+
+  def attributeInt(feature: Feature, attrName: String): Option[Int] =
+    attrs(feature).get(attrName).map(_.toString).map(_.toInt)
}

+class BaseFeature(val feature: Feature) extends Serializable {
+  override def toString: String = feature.toString
+}
+
+class GTFFeature(override val feature: Feature) extends BaseFeature(feature) {
+  def getSeqname: String = feature.getContig.getContigName.toString
+
+  def getSource: String = feature.getSource.toString
+
+  def getFeature: String = feature.getFeatureType.toString
+
+  def getStart: Long = feature.getStart
+
+  def getEnd: Long = feature.getEnd
+
+  def getScore: Double = feature.getValue
+
+  def getStrand: Char = BaseFeature.strandChar(feature.getStrand)
+
+  def getFrame: Char = BaseFeature.frameChar(feature)
+
+  def getAttribute: String = BaseFeature.attributeString(feature)
+}
+
-class BEDFeature(feature: Feature) extends BaseFeature(feature) {
-  def name = feature.getTrackName
-  def score = feature.getValue
-  def strand = feature.getStrand
-  def thickStart = feature.getThickStart
-  def thickEnd = feature.getThickEnd
-  def itemRgb = feature.getItemRgb
-  def blockCount = feature.getBlockCount
-  def blockSizes = feature.getBlockSizes
-  def blockStarts = feature.getBlockStarts
+object BEDFeature {
+
+  def parseBlockSizes(attribute: Option[String]): Array[Int] =
+    attribute.getOrElse("").split(",").map(_.toInt)
+
+  def parseBlockStarts(attribute: Option[String]): Array[Int] =
+    attribute.getOrElse("").split(",").map(_.toInt)
}

-class NarrowPeakFeature(feature: Feature) extends BaseFeature(feature) {
-  def name = feature.getTrackName
-  def score = feature.getValue
-  def strand = feature.getStrand
-  def signalValue = feature.getSignalValue
-  def pValue = feature.getPValue
-  def qValue = feature.getQValue
-  def peak = feature.getPeak
+class BEDFeature(override val feature: Feature) extends BaseFeature(feature) {
+  def getChrom: String = feature.getContig.getContigName.toString
+
+  def getChromStart: Long = feature.getStart
+
+  def getChromEnd: Long = feature.getEnd
+
+  def getName: String = feature.getFeatureId.toString
+
+  def getScore: Double = feature.getValue
+
+  def getStrand: Char = BaseFeature.strandChar(feature.getStrand)
+
+  def getThickStart: Option[Long] = BaseFeature.attributeLong(feature, "thickStart")
+
+  def getThickEnd: Option[Long] = BaseFeature.attributeLong(feature, "thickEnd")
+
+  def getItemRGB: Option[String] = BaseFeature.attributeString(feature, "itemRgb")
+
+  def getBlockCount: Option[Int] = BaseFeature.attributeInt(feature, "blockCount")
+
+  def getBlockSizes: Array[Int] = BEDFeature.parseBlockSizes(BaseFeature.attributeString(feature, "blockSizes"))
+
+  def getBlockStarts: Array[Int] = BEDFeature.parseBlockStarts(BaseFeature.attributeString(feature, "blockStarts"))
+}
+
+/**
+ * See: http://genome.ucsc.edu/FAQ/FAQformat.html#format12
+ */
+class NarrowPeakFeature(override val feature: Feature) extends BaseFeature(feature) {
+  def getChrom: String = feature.getContig.getContigName.toString
+
+  def getChromStart: Long = feature.getStart
+
+  def getChromEnd: Long = feature.getEnd
+
+  def getName: String = feature.getFeatureId.toString
+
+  def getScore: Double = feature.getValue
+
+  def getStrand: Char = BaseFeature.strandChar(feature.getStrand)
+
+  def getSignalValue: Option[Double] = BaseFeature.attributeDouble(feature, "signalValue")
+
+  def getPValue: Option[Double] = BaseFeature.attributeDouble(feature, "pValue")
+
+  def getQValue: Option[Double] = BaseFeature.attributeDouble(feature, "qValue")
+
+  def getPeak: Option[Long] = BaseFeature.attributeLong(feature, "peak")
}
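
A short sketch of how the new wrappers compose, assuming a Feature whose attributes map carries BED-style keys; the accessor names come from the code above, but the feature and its values are made up:

    // Hypothetical usage; `feature` is an org.bdgenomics.formats.avro.Feature whose
    // attributes map holds, e.g., "thickStart" -> "1000" and "blockSizes" -> "354,109,1189".
    val bed = new BEDFeature(feature)
    val thickStart: Option[Long] = bed.getThickStart // Some(1000L)
    val sizes: Array[Int] = bed.getBlockSizes // Array(354, 109, 1189)
    val strand: Char = bed.getStrand // '+', '-', or '.'

Note that getBlockSizes and getBlockStarts expect the attribute to be present and numeric: parseBlockSizes falls back to the empty string, and "".toInt throws a NumberFormatException.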
(The remaining changed files in this commit are not shown.)