Skip to content

Commit

Permalink
Use qualityScores for base quality scores.
Browse files Browse the repository at this point in the history
  • Loading branch information
heuermh committed Sep 4, 2019
1 parent 951ba6a commit 041aabf
Show file tree
Hide file tree
Showing 40 changed files with 253 additions and 240 deletions.
10 changes: 5 additions & 5 deletions adam-cli/src/main/scala/org/bdgenomics/adam/cli/ADAM2Fastq.scala
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ class ADAM2FastqArgs extends Args4jBase {
var persistLevel: String = null
@Args4jOption(required = false, name = "-no_projection", usage = "Disable projection on records. No great reason to do this, but useful for testing / comparison.")
var disableProjection: Boolean = false
@Args4jOption(required = false, name = "-output_oq", usage = "Output the original sequencing quality scores")
var outputOriginalBaseQualities = false
@Args4jOption(required = false, name = "-output_oq", usage = "Write the original sequencing quality scores")
var writeOriginalQualityScores = false
}

object ADAM2Fastq extends BDGCommandCompanion {
Expand All @@ -71,9 +71,9 @@ class ADAM2Fastq(val args: ADAM2FastqArgs) extends BDGSparkCommand[ADAM2FastqArg
Projection(
AlignmentRecordField.readName,
AlignmentRecordField.sequence,
AlignmentRecordField.quality,
AlignmentRecordField.qualityScores,
AlignmentRecordField.readInFragment,
AlignmentRecordField.originalQuality
AlignmentRecordField.originalQualityScores
)
)
else
Expand All @@ -91,7 +91,7 @@ class ADAM2Fastq(val args: ADAM2FastqArgs) extends BDGSparkCommand[ADAM2FastqArg
Option(args.outputPath2),
asSingleFile = args.asSingleFile,
disableFastConcat = args.disableFastConcat,
outputOriginalBaseQualities = args.outputOriginalBaseQualities,
writeOriginalQualityScores = args.writeOriginalQualityScores,
validationStringency = args.validationStringency,
persistLevel = Option(args.persistLevel).map(StorageLevel.fromString(_))
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,7 @@ class TransformAlignments(protected val args: TransformAlignmentsArgs) extends B

val proj = if (args.limitProjection) {
Some(Filter(AlignmentRecordField.attributes,
AlignmentRecordField.originalQuality))
AlignmentRecordField.originalQualityScores))
} else {
None
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class TransformAlignmentsSuite extends ADAMFunSuite {
TransformAlignments(Array(inputPath, finalPath, "-bin_quality_scores", "0,20,10;20,40,30;40,60,50")).run(sc)
val qualityScoreCounts = sc.loadAlignments(finalPath)
.rdd
.flatMap(_.getQuality)
.flatMap(_.getQualityScores)
.map(s => s.toInt - 33)
.countByValue

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ class TransformFragmentsSuite extends ADAMFunSuite {
"-bin_quality_scores", "0,20,10;20,40,30;40,60,50")).run(sc)
val qualityScoreCounts = sc.loadAlignments(finalPath)
.rdd
.flatMap(_.getQuality)
.flatMap(_.getQualityScores)
.map(s => s.toInt - 33)
.countByValue

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,22 +33,23 @@ class AlignmentRecordConverter extends Serializable {
* Prepare a single record for conversion to FASTQ and similar formats by
* splitting into a tuple of (name, sequence, qualityScores).
*
* If the base qualities are unknown (qual is null or equals "*"), the quality
* scores will be a repeated string of 'B's that is equal to the read length.
* If the base quality scores are unknown (qualityScores is null or equals "*"),
* the quality scores will be a repeated string of 'B's that is equal to the read
* length.
*
* @param adamRecord Read to prepare for conversion to FASTQ and similar formats.
* @param maybeAddSuffix If true, check if a "/%d" suffix is attached to the
* read. If there is no suffix, a slash and the number of the read in the
* sequenced fragment is appended to the readname. Default is false.
* @param outputOriginalBaseQualities If true and the original base quality
* field is set (SAM "OQ" tag), outputs the original qualities. Else,
* output the qual field. Defaults to false.
* @param writeOriginalQualityScores If true and the original base quality
* scores field is set (SAM "OQ" tag), outputs the original quality scores. Else,
* output the qualityScores field. Defaults to false.
* @return Returns tuple of (name, sequence, qualityScores).
*/
private def prepareFastq(
adamRecord: AlignmentRecord,
maybeAddSuffix: Boolean,
outputOriginalBaseQualities: Boolean): (String, String, String) = {
writeOriginalQualityScores: Boolean): (String, String, String) = {

val readNameSuffix =
if (maybeAddSuffix &&
Expand All @@ -69,15 +70,15 @@ class AlignmentRecordConverter extends Serializable {
else
adamRecord.getSequence.length
val qualityScores =
if (outputOriginalBaseQualities && adamRecord.getOriginalQuality != null)
if (adamRecord.getOriginalQuality == "*")
if (writeOriginalQualityScores && adamRecord.getOriginalQualityScores != null)
if (adamRecord.getOriginalQualityScores == "*")
"B" * seqLength
else
adamRecord.getOriginalQuality
else if (adamRecord.getQuality == null)
adamRecord.getOriginalQualityScores
else if (adamRecord.getQualityScores == null)
"B" * seqLength
else
adamRecord.getQuality
adamRecord.getQualityScores

(
adamRecord.getReadName + readNameSuffix,
Expand All @@ -103,25 +104,26 @@ class AlignmentRecordConverter extends Serializable {
* ASCII quality scores
* }}}
*
* If the base qualities are unknown (qual is null or equals "*"), the quality
* scores will be a repeated string of 'B's that is equal to the read length.
* If the base quality scores are unknown (qualityScores is null or equals "*"),
* the quality scores will be a repeated string of 'B's that is equal to the read
* length.
*
* @param adamRecord Read to convert to FASTQ.
* @param maybeAddSuffix If true, check if a "/%d" suffix is attached to the
* read. If there is no suffix, a slash and the number of the read in the
* sequenced fragment is appended to the readname. Default is false.
* @param outputOriginalBaseQualities If true and the original base quality
* field is set (SAM "OQ" tag), outputs the original qualities. Else,
* output the qual field. Defaults to false.
* @param writeOriginalQualityScores If true and the original base quality
* scores field is set (SAM "OQ" tag), outputs the original quality scores. Else,
* output the qualityScores field. Defaults to false.
* @return Returns this read in string form.
*/
def convertToFastq(
adamRecord: AlignmentRecord,
maybeAddSuffix: Boolean = false,
outputOriginalBaseQualities: Boolean = false): String = {
writeOriginalQualityScores: Boolean = false): String = {

val (name, sequence, qualityScores) =
prepareFastq(adamRecord, maybeAddSuffix, outputOriginalBaseQualities)
prepareFastq(adamRecord, maybeAddSuffix, writeOriginalQualityScores)

"@%s\n%s\n+\n%s".format(name, sequence, qualityScores)
}
Expand All @@ -130,29 +132,30 @@ class AlignmentRecordConverter extends Serializable {
* Converts a single record to Bowtie tab6 format.
*
* In Bowtie tab6 format, each alignment record or pair is on a single line.
* An unpaired alignment record line is [name]\t[seq]\t[qual]\n.
* An unpaired alignment record line is [name]\t[seq]\t[qualityScores]\n.
* For paired-end alignment records, the second end can have a different name
* from the first: [name1]\t[seq1]\t[qual1]\t[name2]\t[seq2]\t[qual2]\n.
* from the first: [name1]\t[seq1]\t[qualityScores1]\t[name2]\t[seq2]\t[qualityScores2]\n.
*
* If the base qualities are unknown (qual is null or equals "*"), the quality
* scores will be a repeated string of 'B's that is equal to the read length.
* If the base quality scores are unknown (qualityScores is null or equals "*"),
* the quality scores will be a repeated string of 'B's that is equal to the read
* length.
*
* @param adamRecord Read to convert to Bowtie tab6 format.
* @param maybeAddSuffix If true, check if a "/%d" suffix is attached to the
* read. If there is no suffix, a slash and the number of the read in the
* sequenced fragment is appended to the readname. Default is false.
* @param outputOriginalBaseQualities If true and the original base quality
* field is set (SAM "OQ" tag), outputs the original qualities. Else,
* output the qual field. Defaults to false.
* @param writeOriginalQualityScores If true and the original base quality
* scores field is set (SAM "OQ" tag), outputs the original quality scores. Else,
* output the qualityScores field. Defaults to false.
* @return Returns this read in string form.
*/
def convertToTab6(
adamRecord: AlignmentRecord,
maybeAddSuffix: Boolean = false,
outputOriginalBaseQualities: Boolean = false): String = {
writeOriginalQualityScores: Boolean = false): String = {

val (name, sequence, qualityScores) =
prepareFastq(adamRecord, maybeAddSuffix, outputOriginalBaseQualities)
prepareFastq(adamRecord, maybeAddSuffix, writeOriginalQualityScores)

"%s\t%s\t%s".format(name, sequence, qualityScores)
}
Expand All @@ -161,26 +164,27 @@ class AlignmentRecordConverter extends Serializable {
* Converts a single record to Bowtie tab5 format.
*
* In Bowtie tab5 format, each alignment record or pair is on a single line.
* An unpaired alignment record line is [name]\t[seq]\t[qual]\n.
* A paired-end read line is [name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n.
* An unpaired alignment record line is [name]\t[seq]\t[qualityScores]\n.
* A paired-end read line is [name]\t[seq1]\t[qualityScores1]\t[seq2]\t[qualityScores2]\n.
*
* The index suffix will be trimmed from the read name if present.
*
* If the base qualities are unknown (qual is null or equals "*"), the quality
* scores will be a repeated string of 'B's that is equal to the read length.
* If the base quality scores are unknown (qualityScores is null or equals "*"),
* the quality scores will be a repeated string of 'B's that is equal to the read
* length.
*
* @param adamRecord Read to convert to Bowtie tab5 format.
* @param outputOriginalBaseQualities If true and the original base quality
* field is set (SAM "OQ" tag), outputs the original qualities. Else,
* output the qual field. Defaults to false.
* @param writeOriginalQualityScores If true and the original base quality
* scores field is set (SAM "OQ" tag), outputs the original quality scores. Else,
* output the qualityScores field. Defaults to false.
* @return Returns this read in string form.
*/
def convertToTab5(
adamRecord: AlignmentRecord,
outputOriginalBaseQualities: Boolean = false): String = {
writeOriginalQualityScores: Boolean = false): String = {

val (name, sequence, qualityScores) =
prepareFastq(adamRecord, maybeAddSuffix = false, outputOriginalBaseQualities)
prepareFastq(adamRecord, maybeAddSuffix = false, writeOriginalQualityScores)

"%s\t%s\t%s".format(trimSuffix(name), sequence, qualityScores)
}
Expand All @@ -190,24 +194,25 @@ class AlignmentRecordConverter extends Serializable {
* tab5 format.
*
* In Bowtie tab5 format, each alignment record or pair is on a single line.
* An unpaired alignment record line is [name]\t[seq]\t[qual]\n.
* A paired-end read line is [name]\t[seq1]\t[qual1]\t[seq2]\t[qual2]\n.
* An unpaired alignment record line is [name]\t[seq]\t[qualityScores]\n.
* A paired-end read line is [name]\t[seq1]\t[qualityScores1]\t[seq2]\t[qualityScores2]\n.
*
* If the base qualities are unknown (qual is null or equals "*"), the quality
* scores will be a repeated string of 'B's that is equal to the read length.
* If the base quality scores are unknown (qualityScores is null or equals "*"),
* the quality scores will be a repeated string of 'B's that is equal to the read
* length.
*
* @param adamRecord Second-of-pair read to convert to Bowtie tab5 format.
* @param outputOriginalBaseQualities If true and the original base quality
* field is set (SAM "OQ" tag), outputs the original qualities. Else,
* output the qual field. Defaults to false.
* @param writeOriginalQualityScores If true and the original base quality
* scores field is set (SAM "OQ" tag), outputs the original quality scores. Else,
* output the qualityScores field. Defaults to false.
* @return Returns this read in string form.
*/
def convertSecondReadToTab5(
adamRecord: AlignmentRecord,
outputOriginalBaseQualities: Boolean = false): String = {
writeOriginalQualityScores: Boolean = false): String = {

val (name, sequence, qualityScores) =
prepareFastq(adamRecord, maybeAddSuffix = false, outputOriginalBaseQualities)
prepareFastq(adamRecord, maybeAddSuffix = false, writeOriginalQualityScores)

// name of second read is ignored
"%s\t%s".format(sequence, qualityScores)
Expand Down Expand Up @@ -242,7 +247,7 @@ class AlignmentRecordConverter extends Serializable {
// set canonically necessary fields
builder.setReadName(adamRecord.getReadName)
builder.setReadString(adamRecord.getSequence)
adamRecord.getQuality match {
adamRecord.getQualityScores match {
case null => builder.setBaseQualityString("*")
case s: String => builder.setBaseQualityString(s)
}
Expand Down Expand Up @@ -323,7 +328,7 @@ class AlignmentRecordConverter extends Serializable {
.foreach(v => builder.setReadFailsVendorQualityCheckFlag(v.booleanValue))
Option(adamRecord.getMismatchingPositions)
.foreach(builder.setAttribute("MD", _))
Option(adamRecord.getOriginalQuality)
Option(adamRecord.getOriginalQualityScores)
.map(s => s.getBytes.map(v => (v - 33).toByte)) // not ascii, but short int
.foreach(builder.setOriginalBaseQualities(_))
Option(adamRecord.getOriginalCigar)
Expand Down

0 comments on commit 041aabf

Please sign in to comment.