Commit beec466
Merge pull request #404 from tdanford/gene-models
[ADAM-327] Adding gene, transcript, and exon models.
fnothaft committed Oct 13, 2014
2 parents 3d2e5c8 + ba2e4a5 commit beec466
Showing 33 changed files with 1,288 additions and 602 deletions.
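
The Gene, Transcript, and Exon models named in the commit title live in files this truncated listing does not show. Judging only from how the new PrintGenes command (below) consumes them, they have roughly the following shape. This is an inferred Scala sketch, not the committed code; every field and type here is an assumption:

    // Inferred sketch only; the committed models live in org.bdgenomics.adam.models.
    // The fields are exactly those PrintGenes reads; Exon's shape is not visible in this diff.
    class Gene(val id: String,
               val names: Seq[String],
               val transcripts: Iterable[Transcript]) extends Serializable

    class Transcript(val id: String,
                     val region: ReferenceRegion, // referenceName, start, end
                     val strand: Boolean, // rendered as "+" when true by PrintGenes
                     val exons: Iterable[Exon]) extends Serializable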
@@ -34,7 +34,6 @@ object ADAMMain extends Logging {
      FindReads,
      CalculateDepth,
      CountKmers,
-      PileupAggregator,
      Transform,
      /* TODO (nealsid): Reimplement in terms of new schema
      ComputeVariants
@@ -52,6 +51,7 @@
      Features2ADAM)),
    CommandGroup("PRINT", List(
      PrintADAM,
+      PrintGenes,
      FlagStat,
      VizReads,
      PrintTags,
@@ -23,6 +23,7 @@ import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.models.BaseFeature
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.features.ADAMFeaturesContext._
+import org.bdgenomics.formats.avro.Feature
import org.kohsuke.args4j.Argument

object Features2ADAM extends ADAMCommandCompanion {
@@ -52,11 +53,13 @@ class Features2ADAM(val args: Features2ADAMArgs)
    // regex: anything . (extension) EOL
    val extensionPattern = """.*[.]([^.]*)$""".r
    val extensionPattern(extension) = args.featuresFile
-    val features: RDD[_ <: BaseFeature] = extension.toLowerCase match {
+    val features: RDD[Feature] = extension.toLowerCase match {
+      case "gff" => sc.adamGTFFeatureLoad(args.featuresFile) // TODO(Timothy) write a GFF-specific loader?
+      case "gtf" => sc.adamGTFFeatureLoad(args.featuresFile)
      case "bed" => sc.adamBEDFeatureLoad(args.featuresFile)
      case "narrowPeak" => sc.adamNarrowPeakFeatureLoad(args.featuresFile)
    }
-    features.map(f => f.feature).adamSave(args.outputPath, blockSize = args.blockSize,
+    features.adamSave(args.outputPath, blockSize = args.blockSize,
      pageSize = args.pageSize, compressCodec = args.compressionCodec,
      disableDictionaryEncoding = args.disableDictionary)
  }
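
As an aside, the `val extensionPattern(extension) = args.featuresFile` line above uses a Scala regex extractor in a val pattern: the regex is matched against the string and its capture group is bound to `extension`. A minimal self-contained sketch (the path is hypothetical):

    object ExtensionDemo extends App {
      val extensionPattern = """.*[.]([^.]*)$""".r
      // A Regex used as a val pattern binds its capture group,
      // or throws a MatchError if the string contains no ".".
      val extensionPattern(extension) = "features/peaks.narrowPeak" // hypothetical path
      println(extension) // prints "narrowPeak"
    }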

This file was deleted.

72 changes: 72 additions & 0 deletions adam-cli/src/main/scala/org/bdgenomics/adam/cli/PrintGenes.scala
@@ -0,0 +1,72 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.cli

import org.apache.hadoop.mapreduce.Job
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.models._
import org.bdgenomics.adam.rdd.features.ADAMFeaturesContext._
import org.bdgenomics.adam.models.GeneContext._
import org.bdgenomics.adam.rdd.features.GeneFeatureRDD._
import org.bdgenomics.formats.avro.Feature
import org.kohsuke.args4j.{ Option => option, Argument }
import org.bdgenomics.adam.rdd.ADAMContext._
import org.apache.spark.SparkContext

object PrintGenes extends ADAMCommandCompanion {
  val commandName: String = "print_genes"
  val commandDescription: String = "Load a GTF file containing gene annotations and print the corresponding gene models"

  def apply(cmdLine: Array[String]) = {
    new PrintGenes(Args4j[PrintGenesArgs](cmdLine))
  }
}

class PrintGenesArgs extends Args4jBase with ParquetArgs with Serializable {
  @Argument(metaVar = "GTF", required = true, usage = "GTF file with gene model data", index = 0)
  var gtfInput: String = _
}

class PrintGenes(protected val args: PrintGenesArgs)
  extends ADAMSparkCommand[PrintGenesArgs] with Serializable {

  val companion = PrintGenes

  def run(sc: SparkContext, job: Job): Unit = {
    val features: RDD[Feature] = sc.adamGTFFeatureLoad(args.gtfInput)
    val genes: RDD[Gene] = features.asGenes()

    genes.map(printGene).collect().foreach(println)
  }

  def printGene(gene: Gene): String = {
    val builder = new StringBuilder()
    builder.append("Gene %s (%s)".format(gene.id, gene.names.mkString(",")))
    gene.transcripts.foreach(t => builder.append(printTranscript(t)))
    builder.toString()
  }

  def printTranscript(transcript: Transcript): String = {
    "\n\tTranscript %s %s:%d-%d:%s (%d exons)".format(
      transcript.id,
      transcript.region.referenceName,
      transcript.region.start, transcript.region.end,
      if (transcript.strand) "+" else "-",
      transcript.exons.size)
  }
}
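
Wired into ADAMMain above as print_genes, the command would be invoked roughly as follows. The launcher name and GTF path are assumed, and the IDs in the output are illustrative; the output shape follows the format strings in printGene and printTranscript:

    adam print_genes annotations.gtf
    Gene ENSG00000139618 (BRCA2)
        Transcript ENST00000380152 13:32889610-32973805:+ (27 exons)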
13 changes: 2 additions & 11 deletions adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Ref.scala
@@ -50,9 +50,6 @@ class Reads2RefArgs extends Args4jBase with ParquetArgs {
  @option(name = "-mapq", usage = "Minimal mapq value allowed for a read (default = 30)")
  var minMapq: Long = Reads2RefArgs.MIN_MAPQ_DEFAULT

-  @option(name = "-aggregate", usage = "Aggregates data at each pileup position, to reduce storage cost.")
-  var aggregate: Boolean = false
-
  @option(name = "-allowNonPrimaryAlignments", usage = "Converts reads that are not at their primary alignment positions to pileups.")
  var nonPrimary: Boolean = true
}
@@ -71,13 +68,7 @@ class Reads2Ref(protected val args: Reads2RefArgs) extends ADAMSparkCommand[Reads2RefArgs] {

    val coverage = pileupCount / readCount

-    if (args.aggregate) {
-      pileups.adamAggregatePileups(coverage.toInt).adamSave(args.pileupOutput,
-        blockSize = args.blockSize, pageSize = args.pageSize, compressCodec = args.compressionCodec,
-        disableDictionaryEncoding = args.disableDictionary)
-    } else {
-      pileups.adamSave(args.pileupOutput, blockSize = args.blockSize, pageSize = args.pageSize,
-        compressCodec = args.compressionCodec, disableDictionaryEncoding = args.disableDictionary)
-    }
+    pileups.adamSave(args.pileupOutput, blockSize = args.blockSize, pageSize = args.pageSize,
+      compressCodec = args.compressionCodec, disableDictionaryEncoding = args.disableDictionary)
  }
}
@@ -17,33 +17,125 @@
*/
package org.bdgenomics.adam.models

-import org.bdgenomics.formats.avro.Feature
+import org.bdgenomics.formats.avro.{ Strand, Feature }

-class BaseFeature(val feature: Feature) {
-  def featureId = feature.getFeatureId
-  def contig = feature.getContig
-  def start = feature.getStart
-  def end = feature.getEnd
+import scala.collection.JavaConversions
+import scala.collection.JavaConversions._
+import scala.collection._
+
+object BaseFeature {
+
+  def strandChar(strand: Strand): Char =
+    strand match {
+      case Strand.Forward => '+'
+      case Strand.Reverse => '-'
+      case Strand.Independent => '.'
+    }
+
+  def frameChar(feature: Feature): Char = {
+    val opt: Map[CharSequence, CharSequence] = feature.getAttributes
+    opt.get("frame").map(_.charAt(0)).getOrElse('.')
+  }
+
+  def attributeString(feature: Feature): String =
+    feature.getAttributes.mkString(",")
+
+  def attrs(f: Feature): Map[CharSequence, CharSequence] =
+    JavaConversions.mapAsScalaMap(f.getAttributes)
+
+  def attributeString(feature: Feature, attrName: String): Option[String] =
+    attrs(feature).get(attrName).map(_.toString)
+
+  def attributeLong(feature: Feature, attrName: String): Option[Long] =
+    attrs(feature).get(attrName).map(_.toString).map(_.toLong)
+
+  def attributeDouble(feature: Feature, attrName: String): Option[Double] =
+    attrs(feature).get(attrName).map(_.toString).map(_.toDouble)
+
+  def attributeInt(feature: Feature, attrName: String): Option[Int] =
+    attrs(feature).get(attrName).map(_.toString).map(_.toInt)
}

+class BaseFeature(val feature: Feature) extends Serializable {
+  override def toString: String = feature.toString
+}
+
+class GTFFeature(override val feature: Feature) extends BaseFeature(feature) {
+  def getSeqname: String = feature.getContig.getContigName.toString
+
+  def getSource: String = feature.getSource.toString
+
+  def getFeature: String = feature.getFeatureType.toString
+
+  def getStart: Long = feature.getStart
+
+  def getEnd: Long = feature.getEnd
+
+  def getScore: Double = feature.getValue
+
+  def getStrand: Char = BaseFeature.strandChar(feature.getStrand)
+
+  def getFrame: Char = BaseFeature.frameChar(feature)
+
+  def getAttribute: String = BaseFeature.attributeString(feature)
+}
+
-class BEDFeature(feature: Feature) extends BaseFeature(feature) {
-  def name = feature.getTrackName
-  def score = feature.getValue
-  def strand = feature.getStrand
-  def thickStart = feature.getThickStart
-  def thickEnd = feature.getThickEnd
-  def itemRgb = feature.getItemRgb
-  def blockCount = feature.getBlockCount
-  def blockSizes = feature.getBlockSizes
-  def blockStarts = feature.getBlockStarts
+object BEDFeature {
+
+  def parseBlockSizes(attribute: Option[String]): Array[Int] =
+    attribute.getOrElse("").split(",").map(_.toInt)
+
+  def parseBlockStarts(attribute: Option[String]): Array[Int] =
+    attribute.getOrElse("").split(",").map(_.toInt)
}

-class NarrowPeakFeature(feature: Feature) extends BaseFeature(feature) {
-  def name = feature.getTrackName
-  def score = feature.getValue
-  def strand = feature.getStrand
-  def signalValue = feature.getSignalValue
-  def pValue = feature.getPValue
-  def qValue = feature.getQValue
-  def peak = feature.getPeak
+class BEDFeature(override val feature: Feature) extends BaseFeature(feature) {
+  def getChrom: String = feature.getContig.getContigName.toString
+
+  def getChromStart: Long = feature.getStart
+
+  def getChromEnd: Long = feature.getEnd
+
+  def getName: String = feature.getFeatureId.toString
+
+  def getScore: Double = feature.getValue
+
+  def getStrand: Char = BaseFeature.strandChar(feature.getStrand)
+
+  def getThickStart: Option[Long] = BaseFeature.attributeLong(feature, "thickStart")
+
+  def getThickEnd: Option[Long] = BaseFeature.attributeLong(feature, "thickEnd")
+
+  def getItemRGB: Option[String] = BaseFeature.attributeString(feature, "itemRgb")
+
+  def getBlockCount: Option[Int] = BaseFeature.attributeInt(feature, "blockCount")
+
+  def getBlockSizes: Array[Int] = BEDFeature.parseBlockSizes(BaseFeature.attributeString(feature, "blockSizes"))
+
+  def getBlockStarts: Array[Int] = BEDFeature.parseBlockStarts(BaseFeature.attributeString(feature, "blockStarts"))
+}
+
+/**
+ * See: http://genome.ucsc.edu/FAQ/FAQformat.html#format12
+ */
+class NarrowPeakFeature(override val feature: Feature) extends BaseFeature(feature) {
+  def getChrom: String = feature.getContig.getContigName.toString
+
+  def getChromStart: Long = feature.getStart
+
+  def getChromEnd: Long = feature.getEnd
+
+  def getName: String = feature.getFeatureId.toString
+
+  def getScore: Double = feature.getValue
+
+  def getStrand: Char = BaseFeature.strandChar(feature.getStrand)
+
+  def getSignalValue: Option[Double] = BaseFeature.attributeDouble(feature, "signalValue")
+
+  def getPValue: Option[Double] = BaseFeature.attributeDouble(feature, "pValue")
+
+  def getQValue: Option[Double] = BaseFeature.attributeDouble(feature, "qValue")
+
+  def getPeak: Option[Long] = BaseFeature.attributeLong(feature, "peak")
}
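
A short sketch of how the new wrappers compose, assuming a Feature whose attributes map carries BED-style keys; the accessor names come from the code above, but the feature and its values are made up:

    // Hypothetical usage; `feature` is an org.bdgenomics.formats.avro.Feature whose
    // attributes map holds, e.g., "thickStart" -> "1000" and "blockSizes" -> "354,109,1189".
    val bed = new BEDFeature(feature)
    val thickStart: Option[Long] = bed.getThickStart // Some(1000L)
    val sizes: Array[Int] = bed.getBlockSizes // Array(354, 109, 1189)
    val strand: Char = bed.getStrand // '+', '-', or '.'

Note that getBlockSizes and getBlockStarts expect the attribute to be present and numeric: parseBlockSizes falls back to the empty string, and "".toInt throws a NumberFormatException.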
(The remaining changed files in this commit are not shown.)