Navigation Menu

Skip to content

Commit

Permalink
Merge 11a89a2 into 6f30ec9
Browse files Browse the repository at this point in the history
  • Loading branch information
heuermh committed Jan 30, 2017
2 parents 6f30ec9 + 11a89a2 commit 6b78a58
Show file tree
Hide file tree
Showing 11 changed files with 571 additions and 15 deletions.
@@ -0,0 +1,65 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.rdd.feature

import java.io.{
BufferedWriter,
OutputStream,
OutputStreamWriter
}
import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion }
import org.bdgenomics.formats.avro.Feature
import org.bdgenomics.utils.misc.Logging

/**
 * InFormatter companion that builds a BEDInFormatter to write features in BED format to a pipe.
 */
object BEDInFormatter extends InFormatterCompanion[Feature, FeatureRDD, BEDInFormatter] {

  /**
   * Builds a BEDInFormatter from a FeatureRDD.
   *
   * @param fRdd FeatureRDD to build from. Carries no state the formatter
   *   needs; it is accepted only to satisfy the companion contract.
   * @return Returns a new BEDInFormatter.
   */
  def apply(fRdd: FeatureRDD): BEDInFormatter = new BEDInFormatter()
}

case class BEDInFormatter private () extends InFormatter[Feature, FeatureRDD, BEDInFormatter] {
  protected val companion = BEDInFormatter

  /**
   * Writes features to an output stream in BED format, one feature per line.
   *
   * Output is encoded as UTF-8 explicitly; the previous implementation used
   * the platform default charset, which made the piped bytes platform
   * dependent.
   *
   * @param os An OutputStream connected to a process we are piping to.
   * @param iter An iterator of features to write.
   */
  def write(os: OutputStream, iter: Iterator[Feature]) {
    import java.nio.charset.StandardCharsets

    // fix: pin the charset instead of inheriting the platform default
    val writer = new BufferedWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8))

    // write the features
    iter.foreach(f => {
      writer.write(FeatureRDD.toBed(f))
      writer.newLine()
    })

    // close the writer, else stream may be defective;
    // os is flushed and closed in InFormatterRunner
    writer.close()
  }
}
@@ -0,0 +1,63 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.rdd.feature

import htsjdk.samtools.ValidationStringency
import htsjdk.tribble.readers.{
AsciiLineReader,
AsciiLineReaderIterator
}
import java.io.InputStream
import org.bdgenomics.adam.rdd.OutFormatter
import org.bdgenomics.formats.avro.Feature
import scala.annotation.tailrec
import scala.collection.mutable.ListBuffer

/**
 * OutFormatter that reads streaming BED format.
 */
case class BEDOutFormatter() extends OutFormatter[Feature] {
  // parses individual BED lines into features
  val bedParser = new BEDParser

  // validation level handed to the parser for each line
  val stringency = ValidationStringency.STRICT

  /**
   * Reads features from an input stream in BED format.
   *
   * @param is An InputStream connected to the process we are piping from.
   * @return Returns an iterator of Features read from the stream.
   */
  def read(is: InputStream): Iterator[Feature] = {

    // wrap the stream so it can be consumed line by line
    val lineIterator = new AsciiLineReaderIterator(new AsciiLineReader(is))

    // drain every line, keeping only lines the parser converts to a
    // feature (parse yields an Option; empty results are dropped)
    val parsed = ListBuffer.empty[Feature]
    while (lineIterator.hasNext) {
      bedParser.parse(lineIterator.next, stringency).foreach(parsed += _)
    }

    // release the reader before handing back the results
    lineIterator.close()
    parsed.toIterator
  }
}
Expand Up @@ -101,14 +101,6 @@ private object FeatureOrdering extends FeatureOrdering[Feature] {}

object FeatureRDD {

/**
* @param elem The feature to extract a sequence record from.
* @return Gets the SequenceRecord for this feature.
*/
private def getSequenceRecord(elem: Feature): SequenceRecord = {
SequenceRecord(elem.getContigName, 1L)
}

/**
* Builds a FeatureRDD without SequenceDictionary information by running an
* aggregate to rebuild the SequenceDictionary.
Expand All @@ -121,11 +113,13 @@ object FeatureRDD {
// cache the rdd, since we're making multiple passes
rdd.cache()

// aggregate to create the sequence dictionary
val sd = new SequenceDictionary(rdd.map(getSequenceRecord)
.distinct
.collect
.toVector)
// create sequence records based on largest end coordinate
val featuresByContigName = rdd.keyBy(_.getContigName)
val featureEndsByContigName: RDD[(String, Long)] = featuresByContigName.map(kv => (kv._1, kv._2.getEnd))
val maxFeatureEndsByContigName = featureEndsByContigName.reduceByKey(math.max(_, _))
val sequenceRecords = maxFeatureEndsByContigName.map(kv => SequenceRecord(kv._1, kv._2))

val sd = new SequenceDictionary(sequenceRecords.collect.toVector)

FeatureRDD(rdd, sd)
}
Expand Down
@@ -0,0 +1,65 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.rdd.feature

import java.io.{
BufferedWriter,
OutputStream,
OutputStreamWriter
}
import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion }
import org.bdgenomics.formats.avro.Feature
import org.bdgenomics.utils.misc.Logging

/**
 * InFormatter companion that builds a GFF3InFormatter to write features in GFF3 format to a pipe.
 */
object GFF3InFormatter extends InFormatterCompanion[Feature, FeatureRDD, GFF3InFormatter] {

  /**
   * Builds a GFF3InFormatter from a FeatureRDD.
   *
   * @param fRdd FeatureRDD to build from. Carries no state the formatter
   *   needs; it is accepted only to satisfy the companion contract.
   * @return Returns a new GFF3InFormatter.
   */
  def apply(fRdd: FeatureRDD): GFF3InFormatter = new GFF3InFormatter()
}

case class GFF3InFormatter private () extends InFormatter[Feature, FeatureRDD, GFF3InFormatter] {
  protected val companion = GFF3InFormatter

  /**
   * Writes features to an output stream in GFF3 format, one feature per line.
   *
   * Output is encoded as UTF-8 explicitly; the previous implementation used
   * the platform default charset, which made the piped bytes platform
   * dependent.
   *
   * @param os An OutputStream connected to a process we are piping to.
   * @param iter An iterator of features to write.
   */
  def write(os: OutputStream, iter: Iterator[Feature]) {
    import java.nio.charset.StandardCharsets

    // fix: pin the charset instead of inheriting the platform default
    val writer = new BufferedWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8))

    // write the features
    iter.foreach(f => {
      writer.write(FeatureRDD.toGff3(f))
      writer.newLine()
    })

    // close the writer, else stream may be defective;
    // os is flushed and closed in InFormatterRunner
    writer.close()
  }
}
@@ -0,0 +1,63 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.rdd.feature

import htsjdk.samtools.ValidationStringency
import htsjdk.tribble.readers.{
AsciiLineReader,
AsciiLineReaderIterator
}
import java.io.InputStream
import org.bdgenomics.adam.rdd.OutFormatter
import org.bdgenomics.formats.avro.Feature
import scala.annotation.tailrec
import scala.collection.mutable.ListBuffer

/**
 * OutFormatter that reads streaming GFF3 format.
 */
case class GFF3OutFormatter() extends OutFormatter[Feature] {
  // parses individual GFF3 lines into features
  val gff3Parser = new GFF3Parser

  // validation level handed to the parser for each line
  val stringency = ValidationStringency.STRICT

  /**
   * Reads features from an input stream in GFF3 format.
   *
   * @param is An InputStream connected to the process we are piping from.
   * @return Returns an iterator of Features read from the stream.
   */
  def read(is: InputStream): Iterator[Feature] = {

    // wrap the stream so it can be consumed line by line
    val lineIterator = new AsciiLineReaderIterator(new AsciiLineReader(is))

    // drain every line, keeping only lines the parser converts to a
    // feature (parse yields an Option; empty results are dropped)
    val parsed = ListBuffer.empty[Feature]
    while (lineIterator.hasNext) {
      gff3Parser.parse(lineIterator.next, stringency).foreach(parsed += _)
    }

    // release the reader before handing back the results
    lineIterator.close()
    parsed.toIterator
  }
}
@@ -0,0 +1,65 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.rdd.feature

import java.io.{
BufferedWriter,
OutputStream,
OutputStreamWriter
}
import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion }
import org.bdgenomics.formats.avro.Feature
import org.bdgenomics.utils.misc.Logging

/**
 * InFormatter companion that builds a GTFInFormatter to write features in GTF format to a pipe.
 */
object GTFInFormatter extends InFormatterCompanion[Feature, FeatureRDD, GTFInFormatter] {

  /**
   * Builds a GTFInFormatter from a FeatureRDD.
   *
   * @param fRdd FeatureRDD to build from. Carries no state the formatter
   *   needs; it is accepted only to satisfy the companion contract.
   * @return Returns a new GTFInFormatter.
   */
  def apply(fRdd: FeatureRDD): GTFInFormatter = new GTFInFormatter()
}

case class GTFInFormatter private () extends InFormatter[Feature, FeatureRDD, GTFInFormatter] {
  protected val companion = GTFInFormatter

  /**
   * Writes features to an output stream in GTF format, one feature per line.
   *
   * Output is encoded as UTF-8 explicitly; the previous implementation used
   * the platform default charset, which made the piped bytes platform
   * dependent.
   *
   * @param os An OutputStream connected to a process we are piping to.
   * @param iter An iterator of features to write.
   */
  def write(os: OutputStream, iter: Iterator[Feature]) {
    import java.nio.charset.StandardCharsets

    // fix: pin the charset instead of inheriting the platform default
    val writer = new BufferedWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8))

    // write the features
    iter.foreach(f => {
      writer.write(FeatureRDD.toGtf(f))
      writer.newLine()
    })

    // close the writer, else stream may be defective;
    // os is flushed and closed in InFormatterRunner
    writer.close()
  }
}

0 comments on commit 6b78a58

Please sign in to comment.