Merge e3084d8 into 18191f9

bigdatagenomics · May 12, 2017 · 48bf67d · 48bf67d
2 parents 18191f9 + e3084d8
commit 48bf67d
Show file tree

Hide file tree

Showing 4 changed files with 230 additions and 0 deletions.
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentRecordConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentRecordConverter.scala
@@ -157,6 +157,72 @@ class AlignmentRecordConverter extends Serializable {
     "%s\t%s\t%s".format(name, sequence, qualityScores)
   }
 
+  /**
+   * Converts a single record to Bowtie tab5 format.
+   *
+   * In Bowtie tab5 format, each alignment record or pair is on a single line.
+   * An unpaired alignment record line is [name]\t[seq]\t[qual]\n.
+   * A paired-end read line is [name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n.
+   *
+   * The index suffix will be trimmed from the read name if present.
+   *
+   * If the base qualities are unknown (qual is null or equals "*"), the quality
+   * scores will be a repeated string of 'B's that is equal to the read length.
+   *
+   * @param adamRecord Read to convert to FASTQ.
+   * @param outputOriginalBaseQualities If true and the original base quality
+   *   field is set (SAM "OQ" tag), outputs the original qualities. Else,
+   *   output the qual field. Defaults to false.
+   * @return Returns this read in string form.
+   */
+  def convertToTab5(
+    adamRecord: AlignmentRecord,
+    outputOriginalBaseQualities: Boolean = false): String = {
+
+    val (name, sequence, qualityScores) =
+      prepareFastq(adamRecord, maybeAddSuffix = false, outputOriginalBaseQualities)
+
+    "%s\t%s\t%s".format(trimSuffix(name), sequence, qualityScores)
+  }
+
+  /**
+   * Converts a single record representing the second read of a pair to Bowtie
+   * tab5 format.
+   *
+   * In Bowtie tab5 format, each alignment record or pair is on a single line.
+   * An unpaired alignment record line is [name]\t[seq]\t[qual]\n.
+   * A paired-end read line is [name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n.
+   *
+   * If the base qualities are unknown (qual is null or equals "*"), the quality
+   * scores will be a repeated string of 'B's that is equal to the read length.
+   *
+   * @param adamRecord Read to convert to FASTQ.
+   * @param outputOriginalBaseQualities If true and the original base quality
+   *   field is set (SAM "OQ" tag), outputs the original qualities. Else,
+   *   output the qual field. Defaults to false.
+   * @return Returns this read in string form.
+   */
+  def convertSecondReadToTab5(
+    adamRecord: AlignmentRecord,
+    outputOriginalBaseQualities: Boolean = false): String = {
+
+    val (name, sequence, qualityScores) =
+      prepareFastq(adamRecord, maybeAddSuffix = false, outputOriginalBaseQualities)
+
+    // name of second read is ignored
+    "%s\t%s".format(sequence, qualityScores)
+  }
+
+  /**
+   * Trim the index suffix from the read name if present.
+   *
+   * @param name Read name to trim.
+   * @return The read name after trimming the index suffix if present.
+   */
+  private def trimSuffix(name: String): String = {
+    name.replace("/[0-9]+^", "")
+  }
+
   /**
    * Converts a single ADAM record into a SAM record.
    *

diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/Tab5InFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/Tab5InFormatter.scala
@@ -0,0 +1,96 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.rdd.fragment
+
+import java.io.OutputStream
+import org.apache.hadoop.conf.Configuration
+import org.bdgenomics.adam.converters.AlignmentRecordConverter
+import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion }
+import org.bdgenomics.formats.avro.Fragment
+import org.bdgenomics.utils.misc.Logging
+
+/**
+ * InFormatter companion that creates an InFormatter that writes Bowtie tab5 format.
+ */
+object Tab5InFormatter extends InFormatterCompanion[Fragment, FragmentRDD, Tab5InFormatter] {
+
+  /**
+   * Builds an Tab5InFormatter to write Bowtie tab5 format.
+   *
+   * @param gRdd GenomicRDD of Fragments. Used to get HadoopConfiguration.
+   * @return Returns a new Tab6InFormatter.
+   */
+  def apply(gRdd: FragmentRDD): Tab5InFormatter = {
+    new Tab5InFormatter(gRdd.rdd.context.hadoopConfiguration)
+  }
+}
+
+class Tab5InFormatter private (
+    conf: Configuration) extends InFormatter[Fragment, FragmentRDD, Tab5InFormatter] with Logging {
+
+  protected val companion = Tab5InFormatter
+  private val newLine = "\n".getBytes
+  private val converter = new AlignmentRecordConverter
+  private val writeOriginalQualities = conf.getBoolean(FragmentRDD.WRITE_ORIGINAL_QUALITIES, false)
+
+  /**
+   * Writes alignment records to an output stream in Bowtie tab5 format.
+   *
+   * In Bowtie tab5 format, each alignment record or pair is on a single line.
+   * An unpaired alignment record line is [name]\t[seq]\t[qual]\n.
+   * A paired-end read line is [name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n.
+   *
+   * The read name for a paired-end read line is the name of the first
+   * read with the suffix trimmed.
+   *
+   * @param os An OutputStream connected to a process we are piping to.
+   * @param iter An iterator of records to write.
+   */
+  def write(os: OutputStream, iter: Iterator[Fragment]) {
+    iter.map(frag => {
+      val reads = converter.convertFragment(frag).toSeq
+
+      if (reads.size < 2) {
+        reads
+      } else {
+        if (reads.size > 2) {
+          log.warn("More than two reads for %s. Taking first 2.".format(frag))
+        }
+        reads.take(2)
+      }
+    }).foreach(reads => {
+
+      // write unpaired read or first of paired-end reads
+      val first = converter.convertToTab5(reads(0),
+        outputOriginalBaseQualities = writeOriginalQualities)
+
+      os.write(first.getBytes)
+
+      // write second of paired-end reads, if present
+      if (reads.size > 1) {
+        val second = "\t" + converter.convertSecondReadToTab5(reads(1),
+          outputOriginalBaseQualities = writeOriginalQualities)
+
+        os.write(second.getBytes)
+      }
+
+      // end line
+      os.write(newLine)
+    })
+  }
+}
diff --git a/adam-core/src/test/resources/tab5_to_usam.py b/adam-core/src/test/resources/tab5_to_usam.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import sys
+
+# read lines from stdin
+lines = sys.stdin.readlines()
+
+# print sam header
+print("@HD\tVN:1.5\tSO:unsorted")
+
+# loop and print sam lines
+for line in lines:
+    fields = line.split()
+    readName = fields[0]
+    firstSequence = fields[1]
+    firstQualities = fields[2]
+    secondSequence = fields[3]
+    secondQualities = fields[4]
+
+    # flags:
+    # 1 = paired (we assume that in this script)
+    # 4 = unmapped
+    # 8 = mate unmapped
+    # 64 = first of pair
+    # 128 = second of pair
+    firstFlags = 64 | 8 | 4 | 1
+    secondFlags = 128 | 8 | 4 | 1
+
+    # sam is the following tab-delimited columns:
+    #
+    # 1. read name
+    # 2. flags
+    # 3. ref (* = unaligned)
+    # 4. pos (0 = unaligned)
+    # 5. map qual (0 if unmapped)
+    # 6. cigar (* = unavailable)
+    # 7. mate ref (* = unaligned)
+    # 8. mate pos (0 = unaligned)
+    # 9. tlen (0 = unknown)
+    # 10. sequence
+    # 11. qualities
+    print("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (readName + "/1",
+                                                   firstFlags,
+                                                   firstSequence,
+                                                   firstQualities))
+    print("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (readName + "/2",
+                                                   secondFlags,
+                                                   secondSequence,
+                                                   secondQualities))
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala
@@ -49,6 +49,24 @@ class FragmentRDDSuite extends ADAMFunSuite {
     assert(2 * records === newRecords)
   }
 
+  sparkTest("don't lose any reads when piping tab5 to sam") {
+    val fragmentsPath = testFile("interleaved_fastq_sample1.ifq")
+    val ardd = sc.loadFragments(fragmentsPath)
+    val records = ardd.rdd.count
+    assert(records === 3)
+
+    implicit val tFormatter = Tab5InFormatter
+    implicit val uFormatter = new AnySAMOutFormatter
+
+    // this script converts tab5 to unaligned sam
+    val scriptPath = testFile("tab5_to_usam.py")
+
+    val pipedRdd: AlignmentRecordRDD = ardd.pipe("python $0",
+      files = Seq(scriptPath))
+    val newRecords = pipedRdd.rdd.count
+    assert(2 * records === newRecords)
+  }
+
   sparkTest("don't lose any reads when piping tab6 to sam") {
     // write suffixes at end of reads
     sc.hadoopConfiguration.setBoolean(FragmentRDD.WRITE_SUFFIXES, true)