Skip to content

Commit

Permalink
Merge e3084d8 into 18191f9
Browse files Browse the repository at this point in the history
  • Loading branch information
heuermh committed May 12, 2017
2 parents 18191f9 + e3084d8 commit 48bf67d
Show file tree
Hide file tree
Showing 4 changed files with 230 additions and 0 deletions.
Expand Up @@ -157,6 +157,72 @@ class AlignmentRecordConverter extends Serializable {
"%s\t%s\t%s".format(name, sequence, qualityScores)
}

/**
* Converts a single record to Bowtie tab5 format.
*
* In Bowtie tab5 format, each alignment record or pair is on a single line.
* An unpaired alignment record line is [name]\t[seq]\t[qual]\n.
* A paired-end read line is [name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n.
*
* The index suffix will be trimmed from the read name if present.
*
* If the base qualities are unknown (qual is null or equals "*"), the quality
* scores will be a repeated string of 'B's that is equal to the read length.
*
* @param adamRecord Read to convert to FASTQ.
* @param outputOriginalBaseQualities If true and the original base quality
* field is set (SAM "OQ" tag), outputs the original qualities. Else,
* output the qual field. Defaults to false.
* @return Returns this read in string form.
*/
def convertToTab5(
adamRecord: AlignmentRecord,
outputOriginalBaseQualities: Boolean = false): String = {

val (name, sequence, qualityScores) =
prepareFastq(adamRecord, maybeAddSuffix = false, outputOriginalBaseQualities)

"%s\t%s\t%s".format(trimSuffix(name), sequence, qualityScores)
}

/**
* Converts a single record representing the second read of a pair to Bowtie
* tab5 format.
*
* In Bowtie tab5 format, each alignment record or pair is on a single line.
* An unpaired alignment record line is [name]\t[seq]\t[qual]\n.
* A paired-end read line is [name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n.
*
* If the base qualities are unknown (qual is null or equals "*"), the quality
* scores will be a repeated string of 'B's that is equal to the read length.
*
* @param adamRecord Read to convert to FASTQ.
* @param outputOriginalBaseQualities If true and the original base quality
* field is set (SAM "OQ" tag), outputs the original qualities. Else,
* output the qual field. Defaults to false.
* @return Returns this read in string form.
*/
def convertSecondReadToTab5(
adamRecord: AlignmentRecord,
outputOriginalBaseQualities: Boolean = false): String = {

val (name, sequence, qualityScores) =
prepareFastq(adamRecord, maybeAddSuffix = false, outputOriginalBaseQualities)

// name of second read is ignored
"%s\t%s".format(sequence, qualityScores)
}

/**
* Trim the index suffix from the read name if present.
*
* @param name Read name to trim.
* @return The read name after trimming the index suffix if present.
*/
private def trimSuffix(name: String): String = {
name.replace("/[0-9]+^", "")
}

/**
* Converts a single ADAM record into a SAM record.
*
Expand Down
@@ -0,0 +1,96 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.rdd.fragment

import java.io.OutputStream
import org.apache.hadoop.conf.Configuration
import org.bdgenomics.adam.converters.AlignmentRecordConverter
import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion }
import org.bdgenomics.formats.avro.Fragment
import org.bdgenomics.utils.misc.Logging

/**
* InFormatter companion that creates an InFormatter that writes Bowtie tab5 format.
*/
object Tab5InFormatter extends InFormatterCompanion[Fragment, FragmentRDD, Tab5InFormatter] {

/**
* Builds an Tab5InFormatter to write Bowtie tab5 format.
*
* @param gRdd GenomicRDD of Fragments. Used to get HadoopConfiguration.
* @return Returns a new Tab6InFormatter.
*/
def apply(gRdd: FragmentRDD): Tab5InFormatter = {
new Tab5InFormatter(gRdd.rdd.context.hadoopConfiguration)
}
}

class Tab5InFormatter private (
conf: Configuration) extends InFormatter[Fragment, FragmentRDD, Tab5InFormatter] with Logging {

protected val companion = Tab5InFormatter
private val newLine = "\n".getBytes
private val converter = new AlignmentRecordConverter
private val writeOriginalQualities = conf.getBoolean(FragmentRDD.WRITE_ORIGINAL_QUALITIES, false)

/**
* Writes alignment records to an output stream in Bowtie tab5 format.
*
* In Bowtie tab5 format, each alignment record or pair is on a single line.
* An unpaired alignment record line is [name]\t[seq]\t[qual]\n.
* A paired-end read line is [name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n.
*
* The read name for a paired-end read line is the name of the first
* read with the suffix trimmed.
*
* @param os An OutputStream connected to a process we are piping to.
* @param iter An iterator of records to write.
*/
def write(os: OutputStream, iter: Iterator[Fragment]) {
iter.map(frag => {
val reads = converter.convertFragment(frag).toSeq

if (reads.size < 2) {
reads
} else {
if (reads.size > 2) {
log.warn("More than two reads for %s. Taking first 2.".format(frag))
}
reads.take(2)
}
}).foreach(reads => {

// write unpaired read or first of paired-end reads
val first = converter.convertToTab5(reads(0),
outputOriginalBaseQualities = writeOriginalQualities)

os.write(first.getBytes)

// write second of paired-end reads, if present
if (reads.size > 1) {
val second = "\t" + converter.convertSecondReadToTab5(reads(1),
outputOriginalBaseQualities = writeOriginalQualities)

os.write(second.getBytes)
}

// end line
os.write(newLine)
})
}
}
50 changes: 50 additions & 0 deletions adam-core/src/test/resources/tab5_to_usam.py
@@ -0,0 +1,50 @@
#!/usr/bin/env python

from __future__ import print_function
import sys

# read lines from stdin
lines = sys.stdin.readlines()

# print sam header
print("@HD\tVN:1.5\tSO:unsorted")

# loop and print sam lines
for line in lines:
fields = line.split()
readName = fields[0]
firstSequence = fields[1]
firstQualities = fields[2]
secondSequence = fields[3]
secondQualities = fields[4]

# flags:
# 1 = paired (we assume that in this script)
# 4 = unmapped
# 8 = mate unmapped
# 64 = first of pair
# 128 = second of pair
firstFlags = 64 | 8 | 4 | 1
secondFlags = 128 | 8 | 4 | 1

# sam is the following tab-delimited columns:
#
# 1. read name
# 2. flags
# 3. ref (* = unaligned)
# 4. pos (0 = unaligned)
# 5. map qual (0 if unmapped)
# 6. cigar (* = unavailable)
# 7. mate ref (* = unaligned)
# 8. mate pos (0 = unaligned)
# 9. tlen (0 = unknown)
# 10. sequence
# 11. qualities
print("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (readName + "/1",
firstFlags,
firstSequence,
firstQualities))
print("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t%s\t%s" % (readName + "/2",
secondFlags,
secondSequence,
secondQualities))
Expand Up @@ -49,6 +49,24 @@ class FragmentRDDSuite extends ADAMFunSuite {
assert(2 * records === newRecords)
}

sparkTest("don't lose any reads when piping tab5 to sam") {
val fragmentsPath = testFile("interleaved_fastq_sample1.ifq")
val ardd = sc.loadFragments(fragmentsPath)
val records = ardd.rdd.count
assert(records === 3)

implicit val tFormatter = Tab5InFormatter
implicit val uFormatter = new AnySAMOutFormatter

// this script converts tab5 to unaligned sam
val scriptPath = testFile("tab5_to_usam.py")

val pipedRdd: AlignmentRecordRDD = ardd.pipe("python $0",
files = Seq(scriptPath))
val newRecords = pipedRdd.rdd.count
assert(2 * records === newRecords)
}

sparkTest("don't lose any reads when piping tab6 to sam") {
// write suffixes at end of reads
sc.hadoopConfiguration.setBoolean(FragmentRDD.WRITE_SUFFIXES, true)
Expand Down

0 comments on commit 48bf67d

Please sign in to comment.