Add FASTA in formatter for sequence datasets.

bigdatagenomics · Sep 12, 2019 · f02a1de · f02a1de
1 parent 5359a75
commit f02a1de
Show file tree

Hide file tree

Showing 2 changed files with 76 additions and 0 deletions.
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/FASTAInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/FASTAInFormatter.scala
@@ -0,0 +1,70 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.rdd.sequence
+
+import java.io.OutputStream
+import org.apache.hadoop.conf.Configuration
+import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion }
+import org.bdgenomics.adam.sql.{ Sequence => SequenceProduct }
+import org.bdgenomics.formats.avro.Sequence
+
+/**
+ * InFormatter companion that creates an InFormatter that writes FASTA.
+ */
+object FASTAInFormatter extends InFormatterCompanion[Sequence, SequenceProduct, SequenceDataset, FASTAInFormatter] {
+
+  /**
+   * Creates a FASTAInFormatter configured from the dataset's Hadoop configuration.
+   *
+   * @param gDataset GenomicDataset of Sequences. The Hadoop configuration is taken
+   *   from the context of its underlying RDD.
+   * @return Returns a new FASTA InFormatter.
+   */
+  def apply(gDataset: SequenceDataset): FASTAInFormatter = {
+    val hadoopConf = gDataset.rdd.context.hadoopConfiguration
+    new FASTAInFormatter(hadoopConf)
+  }
+}
+
+/**
+ * InFormatter that writes Sequence records to a piped process as FASTA.
+ *
+ * The line width used to hard wrap sequence residues is read once, at
+ * construction, from the SequenceDataset.FASTA_LINE_WIDTH configuration
+ * property, and defaults to 60 when the property is not set.
+ *
+ * @param conf Hadoop configuration to read the FASTA line width from.
+ */
+class FASTAInFormatter private (
+    conf: Configuration) extends InFormatter[Sequence, SequenceProduct, SequenceDataset, FASTAInFormatter] {
+
+  protected val companion = FASTAInFormatter
+  private val lineWidth = conf.getInt(SequenceDataset.FASTA_LINE_WIDTH, 60)
+
+  // Fail fast with a descriptive message; otherwise a non-positive width only
+  // surfaces later as an opaque IllegalArgumentException thrown by grouped().
+  require(
+    lineWidth > 0,
+    "FASTA line width (%s) must be positive, was %d".format(SequenceDataset.FASTA_LINE_WIDTH, lineWidth))
+
+  /**
+   * Writes sequences to an output stream in FASTA format.
+   *
+   * Each record is written as a ">name[ description]" header line followed by
+   * the sequence hard wrapped at the configured line width. Text is encoded as
+   * UTF-8. The stream is neither flushed nor closed by this method.
+   *
+   * @param os An OutputStream connected to a process we are piping to.
+   * @param iter An iterator of records to write.
+   */
+  def write(os: OutputStream, iter: Iterator[Sequence]): Unit = {
+    // Use an explicit charset so the bytes written do not depend on the JVM's
+    // platform-default encoding. Kept local to this method rather than stored
+    // as a field because Charset is not Serializable.
+    val utf8 = java.nio.charset.StandardCharsets.UTF_8
+
+    def toFasta(sequence: Sequence): String = {
+      val sb = new StringBuilder()
+      sb.append(">")
+      sb.append(sequence.getName)
+      Option(sequence.getDescription).foreach(n => sb.append(" ").append(n))
+      // A null sequence is written like an empty one: header line only.
+      Option(sequence.getSequence).foreach(residues => {
+        residues.grouped(lineWidth).foreach(line => {
+          sb.append("\n")
+          sb.append(line)
+        })
+      })
+      sb.append("\n")
+      sb.toString
+    }
+
+    iter.foreach(sequence => os.write(toFasta(sequence).getBytes(utf8)))
+  }
+}
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/SequenceDataset.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/SequenceDataset.scala
@@ -74,6 +74,12 @@ private[adam] class SequenceArraySerializer extends IntervalArraySerializer[Refe
 
 object SequenceDataset {
 
+  /**
+   * Hadoop configuration path to specify line width at
+   * which to hard wrap FASTA formatted sequences. Defaults to 60.
+   */
+  val FASTA_LINE_WIDTH = "org.bdgenomics.adam.rdd.sequence.SequenceDataset.lineWidth"
+
   /**
    * A genomic dataset that wraps a dataset of Sequence data.
    *