Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ADAM-1359] Merge reads2fragments and fragments2reads into transformFragments #1543

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,7 @@ object ADAMMain {
Fasta2ADAM,
ADAM2Fasta,
ADAM2Fastq,
Fragments2Reads,
Reads2Fragments
TransformFragments
)
),
CommandGroup(
Expand Down

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.cli

import org.apache.spark.SparkContext
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.ADAMSaveAnyArgs
import org.bdgenomics.adam.rdd.read.QualityScoreBin
import org.bdgenomics.adam.rdd.fragment.FragmentRDD
import org.bdgenomics.utils.cli._
import org.bdgenomics.utils.misc.Logging
import org.kohsuke.args4j.{ Argument, Option => Args4jOption }

/**
 * Companion for the transformFragments command. Supplies the command name and
 * description, and builds the command from raw command line arguments.
 */
object TransformFragments extends BDGCommandCompanion {
  val commandName: String = "transformFragments"

  // This command replaces both the reads-to-fragments and the
  // fragments-to-reads conversions, so the description covers both directions
  // rather than only the alignment-to-fragment case.
  val commandDescription: String =
    "Transform fragment records, optionally loading from and/or saving as alignment records."

  /**
   * @param cmdLine The raw command line arguments for this command.
   * @return A transformFragments command configured from the parsed arguments.
   */
  def apply(cmdLine: Array[String]): TransformFragments = {
    val parsedArgs = Args4j[TransformFragmentsArgs](cmdLine)
    new TransformFragments(parsedArgs)
  }
}

/**
 * Command line arguments for the transformFragments command.
 *
 * Takes two positional arguments (INPUT at index 0, OUTPUT at index 1) and a
 * set of optional flags. All boolean flags default to false, and
 * binQualityScores defaults to null, meaning no binning was requested.
 */
class TransformFragmentsArgs extends Args4jBase with ADAMSaveAnyArgs with ParquetArgs {
// positional arguments
@Argument(required = true, metaVar = "INPUT", usage = "The Fragment file to apply the transforms to", index = 0)
var inputPath: String = null
@Argument(required = true, metaVar = "OUTPUT", usage = "Location to write the transformed fragments", index = 1)
var outputPath: String = null
// flags selecting whether the input and output are reads instead of fragments
@Args4jOption(required = false, name = "-load_as_reads", usage = "Treats the input data as reads")
var loadAsReads: Boolean = false
@Args4jOption(required = false, name = "-save_as_reads", usage = "Saves the output data as reads")
var saveAsReads: Boolean = false
@Args4jOption(required = false, name = "-single", usage = "Saves OUTPUT as single file")
var asSingleFile: Boolean = false
@Args4jOption(required = false, name = "-sort_reads", usage = "Sort the reads by referenceId and read position. Only valid if run with -save_as_reads")
var sortReads: Boolean = false
@Args4jOption(required = false, name = "-defer_merging", usage = "Defers merging single file output")
var deferMerging: Boolean = false
@Args4jOption(required = false, name = "-disable_fast_concat", usage = "Disables the parallel file concatenation engine.")
var disableFastConcat: Boolean = false
@Args4jOption(required = false, name = "-sort_lexicographically", usage = "Sort the reads lexicographically by contig name, instead of by index.")
var sortLexicographically: Boolean = false
@Args4jOption(required = false, name = "-mark_duplicate_reads", usage = "Mark duplicate reads")
var markDuplicates: Boolean = false
// left as null when no binning is requested
@Args4jOption(required = false, name = "-bin_quality_scores", usage = "Rewrites quality scores of reads into bins from a string of bin descriptions, e.g. 0,20,10;20,40,30.")
var binQualityScores: String = null

// Declared only because the ADAMSaveAnyArgs trait requires it. It carries no
// args4j annotation, so no command line option sets it and it stays false
// unless changed programmatically.
// TODO: consider whether the trait should require this member at all.
var sortFastqOutput = false
}

/**
 * Loads fragments (or reads converted to fragments), optionally bins quality
 * scores and marks duplicates, then saves the result either as fragments or
 * as reads.
 *
 * @param args The parsed command line arguments for this command.
 */
class TransformFragments(protected val args: TransformFragmentsArgs) extends BDGSparkCommand[TransformFragmentsArgs] with Logging {
  val companion = TransformFragments

  /**
   * @param reads An RDD of fragments.
   * @return If the mark duplicates argument is set, the result of calling
   *   markDuplicates on the fragments. Else, returns the input fragments.
   */
  def maybeDedupe(reads: FragmentRDD): FragmentRDD = {
    if (args.markDuplicates) {
      reads.markDuplicates()
    } else {
      reads
    }
  }

  /**
   * @param rdd An RDD of fragments.
   * @return If the binQualityScores argument is set, rewrites the quality scores of the
   *   reads into bins. Else, returns the original RDD.
   */
  private def maybeBin(rdd: FragmentRDD): FragmentRDD = {
    // binQualityScores is null when the option was not given
    Option(args.binQualityScores).fold(rdd)(binDescription => {
      val bins = QualityScoreBin(binDescription)
      rdd.binQualityScores(bins)
    })
  }

  /**
   * Runs the transformation pipeline: load, bin, mark duplicates, then save.
   *
   * @param sc The SparkContext used to load the input.
   * @throws IllegalArgumentException if -sort_reads or -sort_lexicographically
   *   is given without -save_as_reads.
   */
  def run(sc: SparkContext): Unit = {
    if (args.loadAsReads && args.saveAsReads) {
      log.warn("If loading and saving as reads, consider using TransformAlignments instead.")
    }

    // both sort options only make sense when the output is reads
    if (args.sortReads) {
      require(args.saveAsReads,
        "-sort_reads is only valid if -save_as_reads is given.")
    }
    if (args.sortLexicographically) {
      require(args.saveAsReads,
        "-sort_lexicographically is only valid if -save_as_reads is given.")
    }

    val rdd = if (args.loadAsReads) {
      sc.loadAlignments(args.inputPath)
        .toFragments
    } else {
      sc.loadFragments(args.inputPath)
    }

    // should we bin the quality scores?
    val maybeBinnedReads = maybeBin(rdd)

    // should we dedupe the reads?
    val maybeDedupedReads = maybeDedupe(maybeBinnedReads)

    if (args.saveAsReads) {
      // save rdd as reads
      val readRdd = maybeDedupedReads.toReads

      // either sort flag requests sorted output
      val shouldSort = args.sortReads || args.sortLexicographically

      // Per the ADAM API, sortReadsByReferencePosition orders contigs
      // lexicographically by name, while sortReadsByReferencePositionAndIndex
      // orders contigs by their sequence dictionary index. The lexicographic
      // flag must therefore select the former, and a plain -sort_reads the
      // latter, to match the usage text of the two options.
      val finalRdd = if (args.sortLexicographically) {
        readRdd.sortReadsByReferencePosition()
      } else if (args.sortReads) {
        readRdd.sortReadsByReferencePositionAndIndex()
      } else {
        readRdd
      }

      // save the file
      finalRdd.save(args, isSorted = shouldSort)
    } else {
      maybeDedupedReads.saveAsParquet(args)
    }
  }
}

This file was deleted.

Loading