diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentRecordConverter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentRecordConverter.scala index 7ea2bdfd89..e07b48ce62 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentRecordConverter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/converters/AlignmentRecordConverter.scala @@ -157,6 +157,72 @@ class AlignmentRecordConverter extends Serializable { "%s\t%s\t%s".format(name, sequence, qualityScores) } + /** + * Converts a single record to Bowtie tab5 format. + * + * In Bowtie tab5 format, each alignment record or pair is on a single line. + * An unpaired alignment record line is [name]\t[seq]\t[qual]\n. + * A paired-end read line is [name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n. + * + * The index suffix will be trimmed from the read name if present. + * + * If the base qualities are unknown (qual is null or equals "*"), the quality + * scores will be a repeated string of 'B's that is equal to the read length. + * + * @param adamRecord Read to convert to FASTQ. + * @param outputOriginalBaseQualities If true and the original base quality + * field is set (SAM "OQ" tag), outputs the original qualities. Else, + * output the qual field. Defaults to false. + * @return Returns this read in string form. + */ + def convertToTab5( + adamRecord: AlignmentRecord, + outputOriginalBaseQualities: Boolean = false): String = { + + val (name, sequence, qualityScores) = + prepareFastq(adamRecord, maybeAddSuffix = false, outputOriginalBaseQualities) + + "%s\t%s\t%s".format(trimSuffix(name), sequence, qualityScores) + } + + /** + * Converts a single record representing the second read of a pair to Bowtie + * tab5 format. + * + * In Bowtie tab5 format, each alignment record or pair is on a single line. + * An unpaired alignment record line is [name]\t[seq]\t[qual]\n. + * A paired-end read line is [name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n. + * + * If the base qualities are unknown (qual is null or equals "*"), the quality + * scores will be a repeated string of 'B's that is equal to the read length. + * + * @param adamRecord Read to convert to FASTQ. + * @param outputOriginalBaseQualities If true and the original base quality + * field is set (SAM "OQ" tag), outputs the original qualities. Else, + * output the qual field. Defaults to false. + * @return Returns this read in string form. + */ + def convertSecondReadToTab5( + adamRecord: AlignmentRecord, + outputOriginalBaseQualities: Boolean = false): String = { + + val (name, sequence, qualityScores) = + prepareFastq(adamRecord, maybeAddSuffix = false, outputOriginalBaseQualities) + + // name of second read is ignored + "%s\t%s".format(sequence, qualityScores) + } + + /** + * Trim the index suffix from the read name if present. + * + * @param name Read name to trim. + * @return The read name after trimming the index suffix if present. + */ + private def trimSuffix(name: String): String = { + name.replace("/[0-9]+^", "") + } + /** * Converts a single ADAM record into a SAM record. * diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/Tab5InFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/Tab5InFormatter.scala new file mode 100644 index 0000000000..bdadd69f79 --- /dev/null +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/Tab5InFormatter.scala @@ -0,0 +1,96 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.adam.rdd.fragment + +import java.io.OutputStream +import org.apache.hadoop.conf.Configuration +import org.bdgenomics.adam.converters.AlignmentRecordConverter +import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion } +import org.bdgenomics.formats.avro.Fragment +import org.bdgenomics.utils.misc.Logging + +/** + * InFormatter companion that creates an InFormatter that writes Bowtie tab5 format. + */ +object Tab5InFormatter extends InFormatterCompanion[Fragment, FragmentRDD, Tab5InFormatter] { + + /** + * Builds an Tab5InFormatter to write Bowtie tab5 format. + * + * @param gRdd GenomicRDD of Fragments. Used to get HadoopConfiguration. + * @return Returns a new Tab6InFormatter. + */ + def apply(gRdd: FragmentRDD): Tab5InFormatter = { + new Tab5InFormatter(gRdd.rdd.context.hadoopConfiguration) + } +} + +class Tab5InFormatter private ( + conf: Configuration) extends InFormatter[Fragment, FragmentRDD, Tab5InFormatter] with Logging { + + protected val companion = Tab5InFormatter + private val newLine = "\n".getBytes + private val converter = new AlignmentRecordConverter + private val writeOriginalQualities = conf.getBoolean(FragmentRDD.WRITE_ORIGINAL_QUALITIES, false) + + /** + * Writes alignment records to an output stream in Bowtie tab5 format. + * + * In Bowtie tab5 format, each alignment record or pair is on a single line. + * An unpaired alignment record line is [name]\t[seq]\t[qual]\n. + * A paired-end read line is [name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n. + * + * The read name for a paired-end read line is the name of the first + * read with the suffix trimmed. + * + * @param os An OutputStream connected to a process we are piping to. + * @param iter An iterator of records to write. + */ + def write(os: OutputStream, iter: Iterator[Fragment]) { + iter.map(frag => { + val reads = converter.convertFragment(frag).toSeq + + if (reads.size < 2) { + reads + } else { + if (reads.size > 2) { + log.warn("More than two reads for %s. Taking first 2.".format(frag)) + } + reads.take(2) + } + }).foreach(reads => { + + // write unpaired read or first of paired-end reads + val first = converter.convertToTab5(reads(0), + outputOriginalBaseQualities = writeOriginalQualities) + + os.write(first.getBytes) + + // write second of paired-end reads, if present + if (reads.size > 1) { + val second = "\t" + converter.convertSecondReadToTab5(reads(1), + outputOriginalBaseQualities = writeOriginalQualities) + + os.write(second.getBytes) + } + + // end line + os.write(newLine) + }) + } +} diff --git a/adam-core/src/test/resources/tab5_to_usam.py b/adam-core/src/test/resources/tab5_to_usam.py new file mode 100644 index 0000000000..02296ca576 --- /dev/null +++ b/adam-core/src/test/resources/tab5_to_usam.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python + +from __future__ import print_function +import sys + +# read lines from stdin +lines = sys.stdin.readlines() + +# print sam header +print("@HD\tVN:1.5\tSO:unsorted") + +# loop and print sam lines +for line in lines: + fields = line.split() + readName = fields[0] + firstSequence = fields[1] + firstQualities = fields[2] + secondSequence = fields[3] + secondQualities = fields[4] + + # flags: + # 1 = paired (we assume that in this script) + # 4 = unmapped + # 8 = mate unmapped + # 64 = first of pair + # 128 = second of pair + firstFlags = 64 | 8 | 4 | 1 + secondFlags = 128 | 8 | 4 | 1 + + # sam is the following tab-delimited columns: + # + # 1. read name + # 2. flags + # 3. ref (* = unaligned) + # 4. pos (0 = unaligned) + # 5. map qual (0 if unmapped) + # 6. cigar (* = unavailable) + # 7. mate ref (* = unaligned) + # 8. mate pos (0 = unaligned) + # 9. tlen (0 = unknown) + # 10. sequence + # 11. qualities + print("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (readName + "/1", + firstFlags, + firstSequence, + firstQualities)) + print("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (readName + "/2", + secondFlags, + secondSequence, + secondQualities)) diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala index fe29626b2b..9f94b764af 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala +++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala @@ -49,6 +49,24 @@ class FragmentRDDSuite extends ADAMFunSuite { assert(2 * records === newRecords) } + sparkTest("don't lose any reads when piping tab5 to sam") { + val fragmentsPath = testFile("interleaved_fastq_sample1.ifq") + val ardd = sc.loadFragments(fragmentsPath) + val records = ardd.rdd.count + assert(records === 3) + + implicit val tFormatter = Tab5InFormatter + implicit val uFormatter = new AnySAMOutFormatter + + // this script converts tab5 to unaligned sam + val scriptPath = testFile("tab5_to_usam.py") + + val pipedRdd: AlignmentRecordRDD = ardd.pipe("python $0", + files = Seq(scriptPath)) + val newRecords = pipedRdd.rdd.count + assert(2 * records === newRecords) + } + sparkTest("don't lose any reads when piping tab6 to sam") { // write suffixes at end of reads sc.hadoopConfiguration.setBoolean(FragmentRDD.WRITE_SUFFIXES, true)