From 4063b2922ec10c7b7a311acdf5c0d834acbca391 Mon Sep 17 00:00:00 2001 From: Frank Austin Nothaft Date: Sun, 13 Nov 2016 15:17:05 -0800 Subject: [PATCH] Finishing up the cleanup on org.bdgenomics.adam.rdd. * Made `MDTagging` private to `read`. Exposed it through `AlignmentRecordRDD`. Cleaned up where it was used in `Transform`. * Made `FlagStat` and most related models private to `read`. * Made all `read.recalibration` and `read.realignment` classes private to package. Depending on the class, this varied from: * `adam` for classes that need to be registered with `kryo` * `read` for the core `RealignIndels` and `BaseQualityRecalibrator` engines * Their subpackage where possible. * Made class values for genomic partitioners as private as possible. * Added documentation to all `InFormatter`s, `InFormatterCompanion`s, and `OutFormatter`s. Made the `InFormatter`s package private with private constructors. * Added method/class level scaladoc where missing. * Moved `org.bdgenomics.adam.cli.FlagStatSuite` to `org.bdgenomics.adam.rdd.read`, along with the `NA12878.sam` test resource, which was otherwise unused in the `adam-cli` submodule. --- .../org/bdgenomics/adam/cli/Transform.scala | 20 +++---- .../org/bdgenomics/adam/rdd/ADAMContext.scala | 9 +++ .../adam/rdd/ADAMRDDFunctions.scala | 51 +++++++++++++++- .../adam/rdd/GenomicPartitioners.scala | 55 ++++++++++++++++-- .../org/bdgenomics/adam/rdd/GenomicRDD.scala | 58 +++++++++++++++++++ .../org/bdgenomics/adam/rdd/InFormatter.scala | 17 ++++++ .../org/bdgenomics/adam/rdd/RegionJoin.scala | 12 ++++ .../adam/rdd/ShuffleRegionJoin.scala | 16 ++++- .../rdd/contig/FlankReferenceFragments.scala | 16 +++++ .../contig/NucleotideContigFragmentRDD.scala | 4 +- .../adam/rdd/feature/CoverageRDD.scala | 6 +- .../adam/rdd/feature/FeatureRDD.scala | 18 ++++-- .../InterleavedFASTQInFormatter.scala | 24 +++++++- .../adam/rdd/read/ADAMBAMOutputFormat.scala | 20 +++++++ .../adam/rdd/read/ADAMCRAMOutputFormat.scala | 20 +++++++ .../adam/rdd/read/AlignmentRecordRDD.scala | 25 ++++++++ .../adam/rdd/read/AnySAMInFormatter.scala | 28 +++++++++ .../adam/rdd/read/AnySAMOutFormatter.scala | 4 ++ .../adam/rdd/read/BAMInFormatter.scala | 10 +++- .../bdgenomics/adam/rdd/read/FlagStat.scala | 18 +++--- .../bdgenomics/adam/rdd/read/MDTagging.scala | 28 +++------ .../adam/rdd/read/MarkDuplicates.scala | 1 - .../adam/rdd/read/SAMInFormatter.scala | 10 +++- .../realignment/IndelRealignmentTarget.scala | 20 +++---- .../rdd/read/realignment/RealignIndels.scala | 5 +- .../realignment/RealignmentTargetFinder.scala | 5 +- .../BaseQualityRecalibration.scala | 4 +- .../rdd/read/recalibration/Covariate.scala | 10 ++-- .../read/recalibration/CycleCovariate.scala | 2 +- .../read/recalibration/DinucCovariate.scala | 2 +- .../read/recalibration/ObservationTable.scala | 14 ++--- .../rdd/read/recalibration/Recalibrator.scala | 12 ++-- .../rdd/variation/ADAMVCFOutputFormat.scala | 4 +- .../adam/rdd/variation/VCFInFormatter.scala | 17 +++++- .../adam/rdd/variation/VCFOutFormatter.scala | 5 +- .../src/test/resources/NA12878.sam | 0 .../adam/rdd/read}/FlagStatSuite.scala | 15 ++--- .../adam/rdd/read/MDTaggingSuite.scala | 4 +- 38 files changed, 463 insertions(+), 126 deletions(-) rename {adam-cli => adam-core}/src/test/resources/NA12878.sam (100%) rename {adam-cli/src/test/scala/org/bdgenomics/adam/cli => adam-core/src/test/scala/org/bdgenomics/adam/rdd/read}/FlagStatSuite.scala (92%) diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala
b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala index 109ac8080b..07b3a990b4 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala @@ -316,19 +316,17 @@ class Transform(protected val args: TransformArgs) extends BDGSparkCommand[Trans * these tags are recomputed or not. If MD tagging isn't requested, we * return the input RDD. */ - def maybeMdTag(rdd: AlignmentRecordRDD, + def maybeMdTag(sc: SparkContext, + rdd: AlignmentRecordRDD, stringencyOpt: Option[ValidationStringency]): AlignmentRecordRDD = { if (args.mdTagsReferenceFile != null) { log.info(s"Adding MDTags to reads based on reference file ${args.mdTagsReferenceFile}") - rdd.transform(r => { - MDTagging( - r, - args.mdTagsReferenceFile, - fragmentLength = args.mdTagsFragmentSize, - overwriteExistingTags = args.mdTagsOverwrite, - validationStringency = stringencyOpt.getOrElse(ValidationStringency.STRICT) - ) - }) + val referenceFile = sc.loadReferenceFile(args.mdTagsReferenceFile, + fragmentLength = args.mdTagsFragmentSize) + rdd.computeMismatchingPositions( + referenceFile, + overwriteExistingTags = args.mdTagsOverwrite, + validationStringency = stringencyOpt.getOrElse(ValidationStringency.STRICT)) } else { rdd } @@ -360,7 +358,7 @@ class Transform(protected val args: TransformArgs) extends BDGSparkCommand[Trans val maybeSortedRdd = maybeSort(finalPreprocessedRdd, sl) // recompute md tags, if requested, and return - maybeMdTag(maybeSortedRdd, stringencyOpt) + maybeMdTag(sc, maybeSortedRdd, stringencyOpt) } def forceNonParquet(): Boolean = { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala index f797b71fc5..6eba62cc60 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala @@ -87,6 +87,10 @@ private case class LocatableReferenceRegion(rr: ReferenceRegion) extends Locatab def getContig(): String = rr.referenceName } +/** + * This singleton provides an implicit conversion from a SparkContext to the + * ADAMContext, as well as implicit functions for the Pipe API. + */ object ADAMContext { // conversion functions for pipes @@ -141,6 +145,11 @@ private class FileFilter(private val name: String) extends PathFilter { import org.bdgenomics.adam.rdd.ADAMContext._ +/** + * The ADAMContext provides functions on top of a SparkContext for loading genomic data. + * + * @param sc The SparkContext to wrap. + */ class ADAMContext private (@transient val sc: SparkContext) extends Serializable with Logging { /** diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala index ca589822cc..7c79894bb9 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala @@ -18,7 +18,6 @@ package org.bdgenomics.adam.rdd import java.util.logging.Level - import java.io.OutputStream import org.apache.avro.Schema import org.apache.avro.file.DataFileWriter @@ -41,9 +40,32 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName import org.apache.parquet.hadoop.util.ContextUtil import scala.reflect.ClassTag +/** + * Argument configuration for saving any output format. 
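The refactor above replaces the old `MDTagging` companion call with two composable steps: `loadReferenceFile` on the `SparkContext` and `computeMismatchingPositions` on the `AlignmentRecordRDD` (both introduced later in this patch). A minimal sketch of the new flow; the file paths are hypothetical:

```scala
import htsjdk.samtools.ValidationStringency
import org.apache.spark.SparkContext
import org.bdgenomics.adam.rdd.ADAMContext._

def addMdTags(sc: SparkContext): Unit = {
  // load reads and a broadcastable reference, fragmented into 10 kbp chunks
  val reads = sc.loadAlignments("reads.bam")
  val ref = sc.loadReferenceFile("ref.fa", fragmentLength = 10000L)

  // recompute MD tags, replacing any stale tags; warn rather than fail on mismatch
  val tagged = reads.computeMismatchingPositions(
    ref,
    overwriteExistingTags = true,
    validationStringency = ValidationStringency.LENIENT)

  tagged.saveAsParquet("reads.tagged.adam")
}
```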
+ */ trait ADAMSaveAnyArgs extends SaveArgs { + + /** + * If true and saving as FASTQ, we will sort by read name. + */ var sortFastqOutput: Boolean + + /** + * If true and saving as a legacy format, we will write shards so that they + * can be merged into a single file. + * + * @see deferMerging + */ var asSingleFile: Boolean + + /** + * If true and asSingleFile is true, we will not merge the shards once we + * write them, and will leave them for the user to merge later. If false and + * asSingleFile is true, then we will merge the shards on write. If + * asSingleFile is false, this is ignored. + * + * @see asSingleFile + */ var deferMerging: Boolean } @@ -104,6 +126,17 @@ private[rdd] abstract class ADAMRDDFunctions[T <% IndexedRecord: Manifest] exten ) } + /** + * Saves an RDD of Avro data to Parquet. + * + * @param filePath The path to save the file to. + * @param blockSize The size in bytes of blocks to write. + * @param pageSize The size in bytes of pages to write. + * @param compressCodec The compression codec to apply to pages. + * @param disableDictionaryEncoding If false, dictionary encoding is used. If + * true, delta encoding is used. + * @param schema The schema to set. + */ protected def saveRddAsParquet( filePath: String, blockSize: Int = 128 * 1024 * 1024, @@ -138,10 +171,26 @@ private[rdd] abstract class ADAMRDDFunctions[T <% IndexedRecord: Manifest] exten since = "0.20.0") private[rdd] class ConcreteADAMRDDFunctions[T <% IndexedRecord: Manifest](val rdd: RDD[T]) extends ADAMRDDFunctions[T] { + /** + * Saves an RDD of Avro data to Parquet. + * + * @param args The output format configuration to use when saving the data. + */ def saveAsParquet(args: SaveArgs): Unit = { saveRddAsParquet(args) } + /** + * Saves an RDD of Avro data to Parquet. + * + * @param filePath The path to save the file to. + * @param blockSize The size in bytes of blocks to write. + * @param pageSize The size in bytes of pages to write. + * @param compressCodec The compression codec to apply to pages. + * @param disableDictionaryEncoding If false, dictionary encoding is used. If + * true, delta encoding is used. + * @param schema The schema to set. 
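Since the interaction between `asSingleFile` and `deferMerging` is easy to misread, here is the decision table that the scaladoc above describes, as a standalone sketch (not ADAM code):

```scala
sealed trait ShardHandling
case object MergeShardsOnWrite extends ShardHandling // single file, merged on write
case object LeaveShardsForUser extends ShardHandling // shards written, merge deferred
case object ShardedOutput extends ShardHandling      // normal sharded output

def shardHandling(asSingleFile: Boolean, deferMerging: Boolean): ShardHandling =
  (asSingleFile, deferMerging) match {
    case (true, false) => MergeShardsOnWrite
    case (true, true)  => LeaveShardsForUser
    case (false, _)    => ShardedOutput // deferMerging is ignored
  }
```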
+ */ def saveAsParquet( filePath: String, blockSize: Int = 128 * 1024 * 1024, diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicPartitioners.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicPartitioners.scala index f0bc80e1e9..e123f66fb2 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicPartitioners.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicPartitioners.scala @@ -41,15 +41,15 @@ case class GenomicPositionPartitioner(numParts: Int, seqLengths: Map[String, Lon log.info("Have genomic position partitioner with " + numParts + " partitions, and sequences:") seqLengths.foreach(kv => log.info("Contig " + kv._1 + " with length " + kv._2)) - val names: Seq[String] = seqLengths.keys.toSeq.sortWith(_ < _) - val lengths: Seq[Long] = names.map(seqLengths(_)) + private val names: Seq[String] = seqLengths.keys.toSeq.sortWith(_ < _) + private val lengths: Seq[Long] = names.map(seqLengths(_)) private val cumuls: Seq[Long] = lengths.scan(0L)(_ + _) // total # of bases in the sequence dictionary - val totalLength: Long = lengths.sum + private val totalLength: Long = lengths.sum // referenceName -> cumulative length before this sequence (using seqDict.records as the implicit ordering) - val cumulativeLengths: Map[String, Long] = Map( + private[rdd] val cumulativeLengths: Map[String, Long] = Map( names.zip(cumuls): _* ) @@ -57,11 +57,28 @@ case class GenomicPositionPartitioner(numParts: Int, seqLengths: Map[String, Lon * 'parts' is the total number of partitions for non-UNMAPPED ReferencePositions -- * the total number of partitions (see numPartitions, below) is parts+1, with the * extra partition being included for handling ReferencePosition.UNMAPPED + * + * @see numPartitions */ private val parts = min(numParts, totalLength).toInt + /** + * This is the total number of partitions for both mapped and unmapped + * positions. All unmapped positions go into the last partition. + * + * @see parts + */ override def numPartitions: Int = parts + 1 + /** + * Computes the partition for a key. + * + * @param key A key to compute the partition for. + * @return The partition that this key belongs to. + * + * @throws IllegalArgumentException if the key is not a ReferencePosition, or + * (ReferencePosition, _) tuple. + */ override def getPartition(key: Any): Int = { // This allows partitions that cross chromosome boundaries. @@ -101,9 +118,11 @@ case class GenomicPositionPartitioner(numParts: Int, seqLengths: Map[String, Lon override def toString(): String = { return "%d parts, %d partitions, %s" format (parts, numPartitions, cumulativeLengths.toString) } - } +/** + * Helper for creating genomic position partitioners. + */ object GenomicPositionPartitioner { /** @@ -120,7 +139,17 @@ object GenomicPositionPartitioner { seqDict.records.toSeq.map(rec => (rec.name, rec.length)).toMap } -case class GenomicRegionPartitioner(partitionSize: Long, seqLengths: Map[String, Long], start: Boolean = true) extends Partitioner with Logging { +/** + * A partitioner for ReferenceRegion-keyed data. + * + * @param partitionSize The number of bases per partition. + * @param seqLengths A map between contig names and contig lengths. + * @param start If true, use the start position (instead of the end position) to + * decide which partition a key belongs to. 
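The cumulative-length arithmetic that `GenomicPositionPartitioner` documents above can be summarized in a standalone sketch. The contig lengths are hypothetical and the scaling below is illustrative of the documented behavior, including the extra trailing partition for unmapped positions:

```scala
val seqLengths = Map("chr1" -> 1000L, "chr2" -> 500L) // hypothetical contigs
val names = seqLengths.keys.toSeq.sortWith(_ < _)
val lengths = names.map(seqLengths)
val totalLength = lengths.sum
val cumulativeLengths = names.zip(lengths.scan(0L)(_ + _)).toMap

val numParts = 4
val parts = math.min(numParts, totalLength).toInt

// a mapped position's global offset is scaled into [0, parts);
// partitions may therefore cross chromosome boundaries
def partitionFor(contig: String, pos: Long): Int =
  ((cumulativeLengths(contig) + pos) * parts / totalLength).toInt

val unmappedPartition = parts // numPartitions == parts + 1
```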
+ */ +case class GenomicRegionPartitioner(partitionSize: Long, + seqLengths: Map[String, Long], + start: Boolean = true) extends Partitioner with Logging { private val names: Seq[String] = seqLengths.keys.toSeq.sortWith(_ < _) private val lengths: Seq[Long] = names.map(seqLengths(_)) private val parts: Seq[Int] = lengths.map(v => round(ceil(v.toDouble / partitionSize)).toInt) @@ -138,8 +167,19 @@ case class GenomicRegionPartitioner(partitionSize: Long, seqLengths: Map[String, (cumulParts(refReg.referenceName) + pos / partitionSize).toInt } + /** + * @return The number of partitions described by this partitioner. Roughly the + * size of the genome divided by the partition length. + */ override def numPartitions: Int = parts.sum + /** + * @param key The key to get the partition index for. + * @return The partition that a key should map to. + * + * @throws IllegalArgumentException Throws an exception if the key is not a + * ReferenceRegion or a tuple of (ReferenceRegion, _). + */ override def getPartition(key: Any): Int = { key match { case region: ReferenceRegion => { @@ -153,6 +193,9 @@ case class GenomicRegionPartitioner(partitionSize: Long, seqLengths: Map[String, } } +/** + * Helper object for creating GenomicRegionPartitioners. + */ object GenomicRegionPartitioner { /** diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicRDD.scala index 7b1f31e25c..2a4a6fe7d4 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicRDD.scala @@ -81,16 +81,39 @@ private[rdd] object GenomicRDD { } } +/** + * A trait that wraps an RDD of genomic data with helpful metadata. + * + * @tparam T The type of the data in the wrapped RDD. + * @tparam U The type of this GenomicRDD. + */ trait GenomicRDD[T, U <: GenomicRDD[T, U]] { + /** + * The RDD of genomic data that we are wrapping. + */ val rdd: RDD[T] + /** + * The sequence dictionary describing the reference assembly this data is + * aligned to. + */ val sequences: SequenceDictionary + /** + * The underlying RDD of genomic data, as a JavaRDD. + */ lazy val jrdd: JavaRDD[T] = { rdd.toJavaRDD() } + /** + * Applies a function that transforms the underlying RDD into a new RDD. + * + * @param tFn A function that transforms the underlying RDD. + * @return A new RDD where the RDD of genomic data has been replaced, but the + * metadata (sequence dictionary, etc.) is copied without modification. + */ def transform(tFn: RDD[T] => RDD[T]): U = { replaceRdd(tFn(rdd)) } @@ -321,6 +344,14 @@ trait GenomicRDD[T, U <: GenomicRDD[T, U]] { }) } + /** + * Runs a filter that selects data in the underlying RDD that overlaps a + * single genomic region. + * + * @param query The region to query for. + * @return Returns a new GenomicRDD containing only data that overlaps the + * query region. + */ def filterByOverlappingRegion(query: ReferenceRegion): U = { replaceRdd(rdd.filter(elem => { @@ -647,13 +678,28 @@ private case class GenericGenomicRDD[T](rdd: RDD[T], } } +/** + * A trait describing a GenomicRDD with data from multiple samples. + */ trait MultisampleGenomicRDD[T, U <: MultisampleGenomicRDD[T, U]] extends GenomicRDD[T, U] { + /** + * The samples whose data are contained in this GenomicRDD. + */ val samples: Seq[Sample] }
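Usage of the two `GenomicRDD` helpers documented above, assuming a `SparkContext` named `sc` and a hypothetical input path (the mapping-quality filter is just an example predicate):

```scala
import org.bdgenomics.adam.models.ReferenceRegion
import org.bdgenomics.adam.rdd.ADAMContext._

val reads = sc.loadAlignments("reads.bam")

// transform: swap out the wrapped RDD; the metadata is carried over unchanged
val highQuality = reads.transform(rdd => {
  rdd.filter(r => r.getMapq != null && r.getMapq > 30)
})

// filterByOverlappingRegion: keep only data overlapping one query region
val slice = reads.filterByOverlappingRegion(ReferenceRegion("chr1", 100000L, 200000L))
```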
+/** + * An abstract class describing a GenomicRDD where: + * + * * The data are Avro IndexedRecords. + * * The data are associated with read groups (i.e., they are reads or fragments). + */ abstract class AvroReadGroupGenomicRDD[T <% IndexedRecord: Manifest, U <: AvroReadGroupGenomicRDD[T, U]] extends AvroGenomicRDD[T, U] { + /** + * A dictionary describing the read groups attached to this GenomicRDD. + */ val recordGroups: RecordGroupDictionary override protected def saveMetadata(filePath: String) { @@ -675,6 +721,10 @@ abstract class AvroReadGroupGenomicRDD[T <% IndexedRecord: Manifest, U <: AvroRe } } +/** + * An abstract class that extends the MultisampleGenomicRDD trait, where the data + * are Avro IndexedRecords. + */ abstract class MultisampleAvroGenomicRDD[T <% IndexedRecord: Manifest, U <: MultisampleAvroGenomicRDD[T, U]] extends AvroGenomicRDD[T, U] with MultisampleGenomicRDD[T, U] { @@ -695,6 +745,11 @@ abstract class MultisampleAvroGenomicRDD[T <% IndexedRecord: Manifest, U <: Mult } } +/** + * An abstract class that extends GenomicRDD, where the underlying data are + * Avro IndexedRecords. This abstract class provides methods for saving to + * Parquet, and provides hooks for writing the metadata. + */ abstract class AvroGenomicRDD[T <% IndexedRecord: Manifest, U <: AvroGenomicRDD[T, U]] extends ADAMRDDFunctions[T] with GenomicRDD[T, U] { @@ -793,6 +848,9 @@ abstract class AvroGenomicRDD[T <% IndexedRecord: Manifest, U <: AvroGenomicRDD[ } } +/** + * A trait for genomic data that is not aligned to a reference (e.g., raw reads). + */ trait Unaligned { val sequences = SequenceDictionary.empty diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/InFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/InFormatter.scala index 80647cce2e..f64099f62b 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/InFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/InFormatter.scala @@ -30,8 +30,25 @@ private[rdd] class InFormatterRunner[T, U <: GenomicRDD[T, U], V <: InFormatter[ } } +/** + * A trait for singleton objects that build an InFormatter from a GenomicRDD. + * + * Often, when creating an output stream, we need to add metadata to the output + * that is not attached to individual records. An example of this is writing a + * header with contig/read group/format info, as is done with SAM/BAM/VCF. + * + * @tparam T The type of the records this InFormatter writes out. + * @tparam U The type of the GenomicRDD this companion object understands. + * @tparam V The type of InFormatter this companion object creates. + */ trait InFormatterCompanion[T, U <: GenomicRDD[T, U], V <: InFormatter[T, U, V]] { + /** + * Creates an InFormatter from a GenomicRDD. + * + * @param gRdd The GenomicRDD to get metadata from. + * @return Returns an InFormatter with attached metadata. + */ def apply(gRdd: U): V } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala index 3ebe5ed5d1..ffa64ed849 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala @@ -24,7 +24,19 @@ import scala.Predef._ import org.apache.spark.SparkContext import scala.reflect.ClassTag +/** + * A trait describing a join in the genomic coordinate space between two RDDs + * where the values are keyed by a ReferenceRegion. + * + * @tparam T The type of the left RDD. + * @tparam U The type of the right RDD. + * @tparam RT The type of data yielded by the left RDD at the output of the + * join. This may not match T if the join is an outer join, etc. + * @tparam RU The type of data yielded by the right RDD at the output of the + * join. + */ trait RegionJoin[T, U, RT, RU] {
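For intuition about the join semantics described above, here is a brute-force reference for an inner overlap join over region-keyed collections. This is a standalone sketch, not ADAM code; the real implementations avoid the quadratic scan:

```scala
import org.bdgenomics.adam.models.ReferenceRegion

def naiveInnerJoin[T, U](left: Seq[(ReferenceRegion, T)],
                         right: Seq[(ReferenceRegion, U)]): Seq[(T, U)] = {
  for {
    (lr, t) <- left
    (rr, u) <- right if lr.overlaps(rr)
  } yield (t, u)
}
```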
+ /** * Performs a region join between two RDDs. * diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala index 07c51ecfe6..0869515de5 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala @@ -25,11 +25,21 @@ import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion } import scala.collection.mutable.ListBuffer import scala.reflect.ClassTag +/** + * A trait describing join implementations that are based on a sort-merge join. + * + * @tparam T The type of the left RDD. + * @tparam U The type of the right RDD. + * @tparam RT The type of data yielded by the left RDD at the output of the + * join. This may not match T if the join is an outer join, etc. + * @tparam RU The type of data yielded by the right RDD at the output of the + * join. + */ sealed trait ShuffleRegionJoin[T, U, RT, RU] extends RegionJoin[T, U, RT, RU] { - val sd: SequenceDictionary - val partitionSize: Long - val sc: SparkContext + protected val sd: SequenceDictionary + protected val partitionSize: Long + protected val sc: SparkContext // Create the set of bins across the genome for parallel processing // partitionSize (in nucleotides) may range from 10000 to 10000000+ diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala index 38615c3950..df98a749af 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala @@ -23,8 +23,24 @@ import org.bdgenomics.adam.models.{ ReferenceRegion, SequenceDictionary } import org.bdgenomics.adam.rdd.ReferencePartitioner import org.bdgenomics.formats.avro.NucleotideContigFragment +/** + * Object that extends all of the fragments in an RDD of contig fragments + * with the sequence flanking each fragment. + */ private[contig] object FlankReferenceFragments extends Serializable { + /** + * Adds flanks to sequence fragments in an RDD. + * + * Assumes that after sorting, all fragments are contiguous. + * + * @param rdd The RDD to flank. + * @param sd The sequence dictionary describing the contigs in this RDD. + * @param flankSize The size of flanking sequence to add to each fragment. + * @return Returns a new RDD where each fragment has been extended with + * flanking sequence.
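The flanking idea above reduces to widening each fragment's region by `flankSize` on both sides, clipped to the contig bounds. A standalone sketch of that step only; the real implementation also stitches in flanking sequence from adjacent fragments:

```scala
import org.bdgenomics.adam.models.ReferenceRegion

def flankedRegion(region: ReferenceRegion,
                  flankSize: Long,
                  contigLength: Long): ReferenceRegion = {
  region.copy(
    start = math.max(0L, region.start - flankSize),
    end = math.min(contigLength, region.end + flankSize))
}
```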
+ */ def apply( rdd: RDD[NucleotideContigFragment], sd: SequenceDictionary, diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala index 7067c1b4a2..d29557e6e7 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala @@ -32,7 +32,7 @@ import org.bdgenomics.formats.avro.{ AlignmentRecord, NucleotideContigFragment } import scala.collection.JavaConversions._ import scala.math.max -object NucleotideContigFragmentRDD extends Serializable { +private[rdd] object NucleotideContigFragmentRDD extends Serializable { /** * Helper function for building a NucleotideContigFragmentRDD when no @@ -42,7 +42,7 @@ object NucleotideContigFragmentRDD extends Serializable { * this RDD. * @return Returns a new NucleotideContigFragmentRDD. */ - private[rdd] def apply(rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { + def apply(rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { // get sequence dictionary val sd = new SequenceDictionary(rdd.flatMap(ncf => { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageRDD.scala index 94fa80a21e..cdab4f80e7 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageRDD.scala @@ -25,13 +25,13 @@ import org.bdgenomics.adam.models.{ SequenceRecord } import org.bdgenomics.adam.rdd.GenomicRDD - import scala.annotation.tailrec /** - * An RDD containing Coverage. + * An RDD containing Coverage data. * - * @param rdd Coverage + * @param rdd An RDD containing data describing how many reads cover a genomic + * locus/region. * @param sequences A dictionary describing the reference genome. */ case class CoverageRDD(rdd: RDD[Coverage], diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala index adfc580a66..b376a9906e 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala @@ -72,7 +72,7 @@ private trait FeatureOrdering[T <: Feature] extends Ordering[T] { } private object FeatureOrdering extends FeatureOrdering[Feature] {} -object FeatureRDD { +private[rdd] object FeatureRDD { /** * @param elem The feature to extract a sequence record from. @@ -89,7 +89,7 @@ object FeatureRDD { * @param rdd The underlying Feature RDD to build from. * @return Returns a new FeatureRDD. */ - private[rdd] def apply(rdd: RDD[Feature]): FeatureRDD = { + def apply(rdd: RDD[Feature]): FeatureRDD = { // cache the rdd, since we're making multiple passes rdd.cache() @@ -128,7 +128,7 @@ object FeatureRDD { * @param feature Feature to write in IntervalList format. * @return Feature as a one line interval list string. */ - private[rdd] def toInterval(feature: Feature): String = { + def toInterval(feature: Feature): String = { val sequenceName = feature.getContigName val start = feature.getStart + 1 // IntervalList ranges are 1-based val end = feature.getEnd // IntervalList ranges are closed @@ -141,7 +141,7 @@ object FeatureRDD { * @param feature Feature to write in the narrow peak format. 
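The `+ 1` in `toInterval` above is the usual shift between ADAM/BED-style 0-based, end-exclusive coordinates and Picard IntervalList 1-based, end-inclusive coordinates. A standalone round trip:

```scala
def toIntervalCoords(start: Long, end: Long): (Long, Long) = (start + 1, end)
def fromIntervalCoords(start: Long, end: Long): (Long, Long) = (start - 1, end)

assert(toIntervalCoords(999L, 2000L) == ((1000L, 2000L)))
assert(fromIntervalCoords(1000L, 2000L) == ((999L, 2000L)))
```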
* @return Returns this feature as a single narrow peak line. */ - private[rdd] def toNarrowPeak(feature: Feature): String = { + def toNarrowPeak(feature: Feature): String = { val chrom = feature.getContigName val start = feature.getStart val end = feature.getEnd @@ -159,7 +159,7 @@ object FeatureRDD { * @param feature Feature to write in BED format. * @return Returns the feature as a single line BED string. */ - private[rdd] def toBed(feature: Feature): String = { + def toBed(feature: Feature): String = { val chrom = feature.getContigName val start = feature.getStart val end = feature.getEnd @@ -186,7 +186,7 @@ object FeatureRDD { * @param feature Feature to write in GFF3 format. * @return Returns this feature as a single line GFF3 string. */ - private[rdd] def toGff3(feature: Feature): String = { + def toGff3(feature: Feature): String = { def escape(entry: (Any, Any)): String = { entry._1 + "=" + entry._2 } @@ -204,6 +204,12 @@ object FeatureRDD { } } +/** + * A GenomicRDD that wraps Feature data. + * + * @param rdd An RDD of genomic Features. + * @param sequences The reference genome this data is aligned to. + */ case class FeatureRDD(rdd: RDD[Feature], sequences: SequenceDictionary) extends AvroGenomicRDD[Feature, FeatureRDD] with Logging { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala index 91bb6796b2..0f25b617b5 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala @@ -24,17 +24,39 @@ import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion } import org.bdgenomics.formats.avro.Fragment import org.bdgenomics.utils.misc.Logging +/** + * InFormatter companion that creates an InFormatter that writes interleaved + * FASTQ. + */ object InterleavedFASTQInFormatter extends InFormatterCompanion[Fragment, FragmentRDD, InterleavedFASTQInFormatter] { + /** + * Hadoop configuration path to check for a boolean value indicating whether + * the current or original read qualities should be written. True indicates + * to write the original qualities. The default is false. + */ val WRITE_ORIGINAL_QUAL = "org.bdgenomics.adam.rdd.fragment.InterleavedFASTQInFormatter.writeOriginalQual" + + /** + * Hadoop configuration path to check for a boolean value indicating whether + * to write the "/1" "/2" suffixes to the read name that indicate whether a + * read is first or second in a pair. Default is false (no suffixes). + */ val WRITE_SUFFIXES = "org.bdgenomics.adam.rdd.fragment.InterleavedFASTQInFormatter.writeSuffixes" + /** + * Builds an InterleavedFASTQInFormatter to write Interleaved FASTQ. + * + * @param gRdd GenomicRDD of Fragments. Used to get HadoopConfiguration. + * @return Returns a new Interleaved FASTQ InFormatter. 
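The two Hadoop configuration keys documented above are read when the formatter is constructed, so they must be set before piping. A sketch, assuming `fragments` is a `FragmentRDD`; the flag values here are arbitrary:

```scala
import org.bdgenomics.adam.rdd.fragment.InterleavedFASTQInFormatter

val conf = fragments.rdd.context.hadoopConfiguration
// emit original (OQ) qualities instead of the current quality scores
conf.setBoolean(InterleavedFASTQInFormatter.WRITE_ORIGINAL_QUAL, true)
// append /1 and /2 suffixes to read names
conf.setBoolean(InterleavedFASTQInFormatter.WRITE_SUFFIXES, true)
```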
+ */ def apply(gRdd: FragmentRDD): InterleavedFASTQInFormatter = { new InterleavedFASTQInFormatter(gRdd.rdd.context.hadoopConfiguration) } } -class InterleavedFASTQInFormatter(conf: Configuration) extends InFormatter[Fragment, FragmentRDD, InterleavedFASTQInFormatter] with Logging { +private[fragment] class InterleavedFASTQInFormatter private ( + conf: Configuration) extends InFormatter[Fragment, FragmentRDD, InterleavedFASTQInFormatter] with Logging { protected val companion = InterleavedFASTQInFormatter private val converter = new AlignmentRecordConverter diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMBAMOutputFormat.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMBAMOutputFormat.scala index 1483046fa9..b4d3320ca6 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMBAMOutputFormat.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMBAMOutputFormat.scala @@ -28,6 +28,11 @@ import org.seqdoop.hadoop_bam.{ SAMRecordWritable } +/** + * Wrapper for Hadoop-BAM to work around requirement for no-args constructor. + * + * @tparam K The key type. Keys are not written. + */ class ADAMBAMOutputFormat[K] extends KeyIgnoringBAMOutputFormat[K] with Serializable { @@ -50,11 +55,21 @@ class ADAMBAMOutputFormat[K] } } +/** + * Wrapper that adds instrumentation to the BAM output format. + * + * @tparam K The key type. Keys are not written. + */ class InstrumentedADAMBAMOutputFormat[K] extends InstrumentedOutputFormat[K, org.seqdoop.hadoop_bam.SAMRecordWritable] { override def timerName(): String = Timers.WriteBAMRecord.timerName override def outputFormatClass(): Class[_ <: OutputFormat[K, SAMRecordWritable]] = classOf[ADAMBAMOutputFormat[K]] } +/** + * Wrapper for Hadoop-BAM to work around requirement for no-args constructor. + * + * @tparam K The key type. Keys are not written. + */ class ADAMBAMOutputFormatHeaderLess[K] extends KeyIgnoringBAMOutputFormat[K] with Serializable { @@ -77,6 +92,11 @@ class ADAMBAMOutputFormatHeaderLess[K] } } +/** + * Wrapper that adds instrumentation to the headerless BAM output format. + * + * @tparam K The key type. Keys are not written. + */ class InstrumentedADAMBAMOutputFormatHeaderLess[K] extends InstrumentedOutputFormat[K, org.seqdoop.hadoop_bam.SAMRecordWritable] { override def timerName(): String = Timers.WriteBAMRecord.timerName override def outputFormatClass(): Class[_ <: OutputFormat[K, SAMRecordWritable]] = classOf[ADAMBAMOutputFormatHeaderLess[K]] diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMCRAMOutputFormat.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMCRAMOutputFormat.scala index 6cffed1ba0..28ce20ab13 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMCRAMOutputFormat.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMCRAMOutputFormat.scala @@ -28,6 +28,11 @@ import org.seqdoop.hadoop_bam.{ SAMRecordWritable } +/** + * Wrapper for Hadoop-BAM to work around requirement for no-args constructor. + * + * @tparam K The key type. Keys are not written. + */ class ADAMCRAMOutputFormat[K] extends KeyIgnoringCRAMOutputFormat[K] with Serializable { @@ -50,11 +55,21 @@ class ADAMCRAMOutputFormat[K] } } +/** + * Wrapper that adds instrumentation to the CRAM output format. + * + * @tparam K The key type. Keys are not written.
+ */ class InstrumentedADAMCRAMOutputFormat[K] extends InstrumentedOutputFormat[K, org.seqdoop.hadoop_bam.SAMRecordWritable] { override def timerName(): String = Timers.WriteCRAMRecord.timerName override def outputFormatClass(): Class[_ <: OutputFormat[K, SAMRecordWritable]] = classOf[ADAMCRAMOutputFormat[K]] } +/** + * Wrapper for Hadoop-BAM to work around requirement for no-args constructor. + * + * @tparam K The key type. Keys are not written. + */ class ADAMCRAMOutputFormatHeaderLess[K] extends KeyIgnoringCRAMOutputFormat[K] with Serializable { @@ -77,6 +92,11 @@ class ADAMCRAMOutputFormatHeaderLess[K] } } +/** + * Wrapper that adds instrumentation to the CRAM output format. + * + * @tparam K The key type. Keys are not written. + */ class InstrumentedADAMCRAMOutputFormatHeaderLess[K] extends InstrumentedOutputFormat[K, org.seqdoop.hadoop_bam.SAMRecordWritable] { override def timerName(): String = Timers.WriteCRAMRecord.timerName override def outputFormatClass(): Class[_ <: OutputFormat[K, SAMRecordWritable]] = classOf[ADAMCRAMOutputFormatHeaderLess[K]] diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala index e5710d003f..5e5fd221ed 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala @@ -57,6 +57,7 @@ import org.bdgenomics.adam.rdd.read.realignment.RealignIndels import org.bdgenomics.adam.rdd.read.recalibration.BaseQualityRecalibration import org.bdgenomics.adam.rdd.fragment.FragmentRDD import org.bdgenomics.adam.rich.RichAlignmentRecord +import org.bdgenomics.adam.util.ReferenceFile import org.bdgenomics.formats.avro._ import org.bdgenomics.utils.misc.Logging import org.seqdoop.hadoop_bam._ @@ -599,6 +600,30 @@ sealed trait AlignmentRecordRDD extends AvroReadGroupGenomicRDD[AlignmentRecord, replaceRdd(RealignIndels(rdd, consensusModel, isSorted, maxIndelSize, maxConsensusNumber, lodThreshold)) } + /** + * Computes the mismatching positions field (SAM "MD" tag). + * + * @param referenceFile A reference file that can be broadcast to all nodes. + * @param overwriteExistingTags If true, overwrites the MD tags on reads where + * it is already populated. If false, we only tag reads that are currently + * missing an MD tag. Default is false. + * @param validationStringency If we are recalculating existing tags and we + * find that the MD tag that was previously on the read doesn't match our + * new tag, LENIENT will log a warning message, STRICT will throw an + * exception, and SILENT will ignore. Default is LENIENT. + * @return Returns a new AlignmentRecordRDD where all reads have the + * mismatchingPositions field populated. + */ + def computeMismatchingPositions( + referenceFile: ReferenceFile, + overwriteExistingTags: Boolean = false, + validationStringency: ValidationStringency = ValidationStringency.LENIENT): AlignmentRecordRDD = { + replaceRdd(MDTagging(rdd, + referenceFile, + overwriteExistingTags = overwriteExistingTags, + validationStringency = validationStringency).taggedReads) + } + /** * Runs a quality control pass akin to the Samtools FlagStat tool. 
* diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala index 37a579ed1e..522be3e2a2 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala @@ -27,11 +27,23 @@ import org.bdgenomics.adam.models.{ import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion } import org.bdgenomics.formats.avro.AlignmentRecord +/** + * Companion object that builds an InFormatter that writes data where the metadata + * is contained in a SAMFileHeaderWritable. + * + * @tparam T The type of the underlying InFormatter. + */ trait AnySAMInFormatterCompanion[T <: AnySAMInFormatter[T]] extends InFormatterCompanion[AlignmentRecord, AlignmentRecordRDD, T] { protected def makeFormatter(header: SAMFileHeaderWritable, recordGroups: RecordGroupDictionary, converter: AlignmentRecordConverter): T + /** + * Makes an AnySAMInFormatter from a GenomicRDD of AlignmentRecords. + * + * @param gRdd AlignmentRecordRDD with reference build and record group info. + * @return Returns an InFormatter that extends AnySAMInFormatter. + */ def apply(gRdd: AlignmentRecordRDD): T = { // make a converter @@ -46,10 +58,26 @@ trait AnySAMInFormatterCompanion[T <: AnySAMInFormatter[T]] extends InFormatterC } } +/** + * A trait that writes reads using an Htsjdk SAMFileWriter. + * + * @tparam T The recursive type of the class that implements this trait. + */ trait AnySAMInFormatter[T <: AnySAMInFormatter[T]] extends InFormatter[AlignmentRecord, AlignmentRecordRDD, T] { + /** + * A serializable form of the SAM File Header. + */ val header: SAMFileHeaderWritable + + /** + * A dictionary describing the read groups these reads are from. + */ val recordGroups: RecordGroupDictionary + + /** + * A converter from AlignmentRecord to SAMRecord. + */ val converter: AlignmentRecordConverter protected def makeWriter(os: OutputStream): SAMFileWriter diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala index fc23606fd7..5f9c9051fd 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala @@ -25,6 +25,10 @@ import org.bdgenomics.formats.avro.AlignmentRecord import scala.annotation.tailrec import scala.collection.mutable.ListBuffer +/** + * An OutFormatter that automatically infers whether the piped input is SAM or + * BAM. Autodetecting streamed CRAM is not currently supported. + */ class AnySAMOutFormatter extends OutFormatter[AlignmentRecord] { /** diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/BAMInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/BAMInFormatter.scala index fbc26f62b5..2b23bc5d7e 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/BAMInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/BAMInFormatter.scala @@ -28,6 +28,9 @@ import org.bdgenomics.adam.models.{ SAMFileHeaderWritable } +/** + * InFormatter companion for building an InFormatter that streams BAM. 
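These formatters exist to serve the Pipe API (see the implicit conversions on the `ADAMContext` singleton above). A rough usage sketch, assuming `reads` is an `AlignmentRecordRDD` and that `pipe` resolves its formatters implicitly; the command is hypothetical and must read and write SAM on stdin/stdout:

```scala
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.read.{ AlignmentRecordRDD, AnySAMOutFormatter, SAMInFormatter }

implicit val tFormatter = SAMInFormatter
implicit val uFormatter = new AnySAMOutFormatter()

val piped: AlignmentRecordRDD = reads.pipe("./process_sam.sh")
```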
+ */ object BAMInFormatter extends AnySAMInFormatterCompanion[BAMInFormatter] { protected def makeFormatter(header: SAMFileHeaderWritable, @@ -37,9 +40,10 @@ object BAMInFormatter extends AnySAMInFormatterCompanion[BAMInFormatter] { } } -case class BAMInFormatter(header: SAMFileHeaderWritable, - recordGroups: RecordGroupDictionary, - converter: AlignmentRecordConverter) extends AnySAMInFormatter[BAMInFormatter] { +private[read] case class BAMInFormatter private ( + header: SAMFileHeaderWritable, + recordGroups: RecordGroupDictionary, + converter: AlignmentRecordConverter) extends AnySAMInFormatter[BAMInFormatter] { protected val companion = BAMInFormatter diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FlagStat.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FlagStat.scala index e1f24cb503..f3d96c22ba 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FlagStat.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FlagStat.scala @@ -20,12 +20,12 @@ package org.bdgenomics.adam.rdd.read import org.apache.spark.rdd.RDD import org.bdgenomics.formats.avro.AlignmentRecord -object FlagStatMetrics { +private[read] object FlagStatMetrics { val emptyFailedQuality = new FlagStatMetrics(0, DuplicateMetrics.empty, DuplicateMetrics.empty, 0, 0, 0, 0, 0, 0, 0, 0, 0, true) val emptyPassedQuality = new FlagStatMetrics(0, DuplicateMetrics.empty, DuplicateMetrics.empty, 0, 0, 0, 0, 0, 0, 0, 0, 0, false) } -object DuplicateMetrics { +private[read] object DuplicateMetrics { val empty = new DuplicateMetrics(0, 0, 0, 0) def apply(record: AlignmentRecord): (DuplicateMetrics, DuplicateMetrics) = { @@ -50,7 +50,7 @@ object DuplicateMetrics { } } -case class DuplicateMetrics(total: Long, bothMapped: Long, onlyReadMapped: Long, crossChromosome: Long) { +private[adam] case class DuplicateMetrics(total: Long, bothMapped: Long, onlyReadMapped: Long, crossChromosome: Long) { def +(that: DuplicateMetrics): DuplicateMetrics = { new DuplicateMetrics( total + that.total, @@ -61,11 +61,11 @@ case class DuplicateMetrics(total: Long, bothMapped: Long, onlyReadMapped: Long, } } -case class FlagStatMetrics(total: Long, duplicatesPrimary: DuplicateMetrics, duplicatesSecondary: DuplicateMetrics, - mapped: Long, pairedInSequencing: Long, - read1: Long, read2: Long, properlyPaired: Long, withSelfAndMateMapped: Long, - singleton: Long, withMateMappedToDiffChromosome: Long, - withMateMappedToDiffChromosomeMapQ5: Long, failedQuality: Boolean) { +private[adam] case class FlagStatMetrics(total: Long, duplicatesPrimary: DuplicateMetrics, duplicatesSecondary: DuplicateMetrics, + mapped: Long, pairedInSequencing: Long, + read1: Long, read2: Long, properlyPaired: Long, withSelfAndMateMapped: Long, + singleton: Long, withMateMappedToDiffChromosome: Long, + withMateMappedToDiffChromosomeMapQ5: Long, failedQuality: Boolean) { def +(that: FlagStatMetrics): FlagStatMetrics = { assert(failedQuality == that.failedQuality, "Can't reduce passedVendorQuality with different failedQuality values") new FlagStatMetrics( @@ -86,7 +86,7 @@ case class FlagStatMetrics(total: Long, duplicatesPrimary: DuplicateMetrics, dup } } -object FlagStat { +private[read] object FlagStat { def b2i(boolean: Boolean) = if (boolean) 1 else 0 def b(boolean: java.lang.Boolean) = Option(boolean).exists(x => x) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala index dc6eeb6a10..e358f23ba7 100644 --- 
a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala @@ -27,10 +27,9 @@ import org.bdgenomics.adam.util.ReferenceFile import org.bdgenomics.formats.avro.AlignmentRecord import org.bdgenomics.utils.misc.Logging -case class MDTagging( +private[read] case class MDTagging( reads: RDD[AlignmentRecord], @transient referenceFile: ReferenceFile, - partitionSize: Long = 1000000, overwriteExistingTags: Boolean = false, validationStringency: ValidationStringency = ValidationStringency.STRICT) extends Logging { @transient val sc = reads.sparkContext @@ -84,24 +83,13 @@ case class MDTagging( } } -object MDTagging { - def apply( - reads: RDD[AlignmentRecord], - referenceFile: String, - fragmentLength: Long, - overwriteExistingTags: Boolean, - validationStringency: ValidationStringency): RDD[AlignmentRecord] = { - val sc = reads.sparkContext - new MDTagging( - reads, - sc.loadReferenceFile(referenceFile, fragmentLength = fragmentLength), - partitionSize = fragmentLength, - overwriteExistingTags, - validationStringency - ).taggedReads - } -} - +/** + * A class describing an exception where a read's MD tag was recomputed and did + * not match the MD tag originally attached to the read. + * + * @param read The read whose MD tag was recomputed, with original MD tag. + * @param mdTag The recomputed MD tag. + */ case class IncorrectMDTagException(read: AlignmentRecord, mdTag: String) extends Exception { override def getMessage: String = s"Read: ${read.getReadName}, pos: ${read.getContigName}:${read.getStart}, cigar: ${read.getCigar}, existing MD tag: ${read.getMismatchingPositions}, correct MD tag: $mdTag" diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala index 419132bece..6170aec56a 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala @@ -154,5 +154,4 @@ private[rdd] object MarkDuplicates extends Serializable with Logging { scoreBucket(x._2) - scoreBucket(y._2) } } - } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/SAMInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/SAMInFormatter.scala index 514d0d986e..66d30c28ce 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/SAMInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/SAMInFormatter.scala @@ -28,6 +28,9 @@ import org.bdgenomics.adam.models.{ SAMFileHeaderWritable } +/** + * InFormatter companion for building an InFormatter that streams SAM. 
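Under `ValidationStringency.STRICT`, a recomputed MD tag that disagrees with the tag already on a read surfaces as the `IncorrectMDTagException` documented above, wrapped in a `SparkException` on the driver since it is thrown on executors. A sketch, with hypothetical paths:

```scala
import htsjdk.samtools.ValidationStringency
import org.apache.spark.SparkException
import org.bdgenomics.adam.rdd.ADAMContext._

val ref = sc.loadReferenceFile("ref.fa", fragmentLength = 10000L)
try {
  sc.loadAlignments("reads.bam")
    .computeMismatchingPositions(ref,
      validationStringency = ValidationStringency.STRICT)
    .rdd.count() // force evaluation; the tagging is lazy
} catch {
  case e: SparkException => println(s"MD tag validation failed: ${e.getMessage}")
}
```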
+ */ object SAMInFormatter extends AnySAMInFormatterCompanion[SAMInFormatter] { protected def makeFormatter(header: SAMFileHeaderWritable, @@ -37,9 +40,10 @@ object SAMInFormatter extends AnySAMInFormatterCompanion[SAMInFormatter] { } } -case class SAMInFormatter(header: SAMFileHeaderWritable, - recordGroups: RecordGroupDictionary, - converter: AlignmentRecordConverter) extends AnySAMInFormatter[SAMInFormatter] { +private[read] case class SAMInFormatter private ( + header: SAMFileHeaderWritable, + recordGroups: RecordGroupDictionary, + converter: AlignmentRecordConverter) extends AnySAMInFormatter[SAMInFormatter] { protected val companion = SAMInFormatter diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala index aff47a0ad7..56edf5e5cd 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala @@ -28,7 +28,7 @@ import org.bdgenomics.adam.instrumentation.Timers._ import scala.collection.JavaConversions._ import scala.collection.immutable.TreeSet -object ZippedTargetOrdering extends Ordering[(IndelRealignmentTarget, Int)] { +private[realignment] object ZippedTargetOrdering extends Ordering[(IndelRealignmentTarget, Int)] { /** * Order two indel realignment targets by earlier starting position. @@ -42,7 +42,7 @@ object ZippedTargetOrdering extends Ordering[(IndelRealignmentTarget, Int)] { } } -object TargetOrdering extends Ordering[IndelRealignmentTarget] { +private[realignment] object TargetOrdering extends Ordering[IndelRealignmentTarget] { /** * Order two indel realignment targets by earlier starting position. @@ -88,7 +88,7 @@ object TargetOrdering extends Ordering[IndelRealignmentTarget] { } } -object IndelRealignmentTarget { +private[realignment] object IndelRealignmentTarget { /** * Generates 1+ indel realignment targets from a single read. 
@@ -136,7 +136,7 @@ object IndelRealignmentTarget { } } -class IndelRealignmentTargetSerializer extends Serializer[IndelRealignmentTarget] { +private[adam] class IndelRealignmentTargetSerializer extends Serializer[IndelRealignmentTarget] { def write(kryo: Kryo, output: Output, obj: IndelRealignmentTarget) = { output.writeString(obj.readRange.referenceName) @@ -161,7 +161,7 @@ class IndelRealignmentTargetSerializer extends Serializer[IndelRealignmentTarget } } -class IndelRealignmentTarget( +private[adam] class IndelRealignmentTarget( val variation: Option[ReferenceRegion], val readRange: ReferenceRegion) extends Logging { @@ -196,7 +196,7 @@ class IndelRealignmentTarget( } } -class TargetSetSerializer extends Serializer[TargetSet] { +private[adam] class TargetSetSerializer extends Serializer[TargetSet] { val irts = new IndelRealignmentTargetSerializer() @@ -217,7 +217,7 @@ class TargetSetSerializer extends Serializer[TargetSet] { } } -class ZippedTargetSetSerializer extends Serializer[ZippedTargetSet] { +private[adam] class ZippedTargetSetSerializer extends Serializer[ZippedTargetSet] { val irts = new IndelRealignmentTargetSerializer() @@ -241,15 +241,15 @@ class ZippedTargetSetSerializer extends Serializer[ZippedTargetSet] { } } -object TargetSet { +private[realignment] object TargetSet { def apply(): TargetSet = { new TargetSet(TreeSet[IndelRealignmentTarget]()(TargetOrdering)) } } // These two case classes are needed to get around some serialization issues -case class TargetSet(set: TreeSet[IndelRealignmentTarget]) extends Serializable { +private[adam] case class TargetSet(set: TreeSet[IndelRealignmentTarget]) extends Serializable { } -case class ZippedTargetSet(set: TreeSet[(IndelRealignmentTarget, Int)]) extends Serializable { +private[adam] case class ZippedTargetSet(set: TreeSet[(IndelRealignmentTarget, Int)]) extends Serializable { } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala index 866bacc7a7..75c457608d 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala @@ -34,7 +34,7 @@ import scala.collection.immutable.{ NumericRange, TreeSet } import scala.collection.mutable import scala.util.Random -private[rdd] object RealignIndels extends Serializable with Logging { +private[read] object RealignIndels extends Serializable with Logging { /** * Realigns an RDD of reads. 
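The `private[adam]` (rather than `private[realignment]`) visibility on the serializers above follows from the commit notes: ADAM's Kryo registrator lives in another package under `org.bdgenomics.adam` and must still reference them. A sketch of that registration; the registrator class name is illustrative, and this only compiles inside the `org.bdgenomics.adam` namespace:

```scala
import com.esotericsoftware.kryo.Kryo
import org.bdgenomics.adam.rdd.read.realignment.{
  IndelRealignmentTarget, IndelRealignmentTargetSerializer, TargetSet, TargetSetSerializer
}

class ExampleRegistrator {
  def registerClasses(kryo: Kryo): Unit = {
    // custom serializers sidestep the serialization issues noted above
    kryo.register(classOf[IndelRealignmentTarget], new IndelRealignmentTargetSerializer)
    kryo.register(classOf[TargetSet], new TargetSetSerializer)
  }
}
```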
@@ -222,7 +222,7 @@ private[rdd] object RealignIndels extends Serializable with Logging { import org.bdgenomics.adam.rdd.read.realignment.RealignIndels._ -private[rdd] class RealignIndels( +private[read] class RealignIndels( val consensusModel: ConsensusGenerator = new ConsensusGeneratorFromReads, val dataIsSorted: Boolean = false, val maxIndelSize: Int = 500, @@ -497,5 +497,4 @@ private[rdd] class RealignIndels( readsMappedToTarget.flatMap(realignTargetGroup).map(r => r.record) } } - } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignmentTargetFinder.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignmentTargetFinder.scala index 927deba3c0..a241aa7dfc 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignmentTargetFinder.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignmentTargetFinder.scala @@ -24,7 +24,7 @@ import org.bdgenomics.adam.instrumentation.Timers._ import scala.annotation.tailrec import scala.collection.immutable.TreeSet -object RealignmentTargetFinder { +private[realignment] object RealignmentTargetFinder { /** * Generates realignment targets from a set of reads. @@ -40,7 +40,7 @@ object RealignmentTargetFinder { } } -class RealignmentTargetFinder extends Serializable with Logging { +private[realignment] class RealignmentTargetFinder extends Serializable with Logging { /** * Joins two sorted sets of targets together. Is tail call recursive. @@ -120,5 +120,4 @@ class RealignmentTargetFinder extends Serializable with Logging { targetSet } - } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala index a13944f87e..ce1acc4a85 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala @@ -34,7 +34,7 @@ import org.bdgenomics.formats.avro.AlignmentRecord * a second pass over the reads to apply the recalibration and assign adjusted * quality scores. */ -class BaseQualityRecalibration( +private class BaseQualityRecalibration( val input: RDD[(Option[DecadentRead], Option[AlignmentRecord])], val knownSnps: Broadcast[SnpTable], val dumpObservationTableFile: Option[String] = None) @@ -122,7 +122,7 @@ class BaseQualityRecalibration( } } -object BaseQualityRecalibration { +private[read] object BaseQualityRecalibration { def apply( rdd: RDD[AlignmentRecord], knownSnps: Broadcast[SnpTable], diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Covariate.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Covariate.scala index c25e17a1ec..0d127cf11c 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Covariate.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Covariate.scala @@ -27,7 +27,7 @@ import org.bdgenomics.adam.rich.DecadentRead * @note Concrete implementations of Covariate should inherit from * AbstractCovariate, not Covariate. 
*/ -trait Covariate { +private[recalibration] trait Covariate { type Value /** @@ -54,7 +54,7 @@ trait Covariate { def csvFieldName: String } -abstract class AbstractCovariate[ValueT] extends Covariate with Serializable { +private[recalibration] abstract class AbstractCovariate[ValueT] extends Covariate with Serializable { override type Value = ValueT } @@ -64,7 +64,7 @@ abstract class AbstractCovariate[ValueT] extends Covariate with Serializable { * The values for mandatory covariates are stored in member fields and optional * covariate values are in `extras`. */ -class CovariateKey( +private[adam] class CovariateKey( val readGroup: String, val quality: QualityScore, val extras: Seq[Option[Covariate#Value]]) extends Serializable { @@ -95,7 +95,7 @@ class CovariateKey( * Represents the abstract space of all possible CovariateKeys for the given set * of Covariates. */ -class CovariateSpace(val extras: IndexedSeq[Covariate]) extends Serializable { +private[adam] class CovariateSpace(val extras: IndexedSeq[Covariate]) extends Serializable { // Computes the covariate values for all residues in this read def apply(read: DecadentRead): Seq[CovariateKey] = { // Ask each 'extra' covariate to compute its values for this read @@ -133,7 +133,7 @@ class CovariateSpace(val extras: IndexedSeq[Covariate]) extends Serializable { } -object CovariateSpace { +private[recalibration] object CovariateSpace { def apply(extras: Covariate*): CovariateSpace = new CovariateSpace(extras.toIndexedSeq) } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/CycleCovariate.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/CycleCovariate.scala index e09fbbe52a..ff180cfc0f 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/CycleCovariate.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/CycleCovariate.scala @@ -20,7 +20,7 @@ package org.bdgenomics.adam.rdd.read.recalibration import org.bdgenomics.adam.rich.DecadentRead // This is based on the CycleCovariate in GATK 1.6. 
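To show how `AbstractCovariate` is meant to be extended, here is a hypothetical covariate in the style of `CycleCovariate`. It assumes `compute` and `csvFieldName` are the abstract members to implement (other members of `Covariate` are elided from this diff), and returns one optional value per residue of the read:

```scala
import org.bdgenomics.adam.rich.DecadentRead

private[recalibration] class PositionParityCovariate extends AbstractCovariate[Boolean] {

  // one value per residue, as CovariateSpace expects
  def compute(read: DecadentRead): Seq[Option[Boolean]] = {
    read.residues.indices.map(i => Some(i % 2 == 0))
  }

  def csvFieldName: String = "PositionParity"
}
```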
-class CycleCovariate extends AbstractCovariate[Int] { +private[adam] class CycleCovariate extends AbstractCovariate[Int] { def compute(read: DecadentRead): Seq[Option[Int]] = { val (initial, increment) = initialization(read) read.residues.indices.map(pos => Some(initial + increment * pos)) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/DinucCovariate.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/DinucCovariate.scala index e17e944e38..cc6c5e8445 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/DinucCovariate.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/DinucCovariate.scala @@ -20,7 +20,7 @@ package org.bdgenomics.adam.rdd.read.recalibration import org.bdgenomics.adam.rich.DecadentRead // TODO: should inherit from something like AbstractCovariate[(DNABase, DNABase)] -class DinucCovariate extends AbstractCovariate[(Char, Char)] { +private[adam] class DinucCovariate extends AbstractCovariate[(Char, Char)] { def compute(read: DecadentRead): Seq[Option[(Char, Char)]] = { val sequence = read.residues.map(_.base) if (read.isNegativeRead) { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/ObservationTable.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/ObservationTable.scala index 21bad7f005..d605fef991 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/ObservationTable.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/ObservationTable.scala @@ -26,7 +26,7 @@ import scala.collection.mutable * * This is used in ObservationTable, which maps from CovariateKey to Observation. */ -class Observation(val total: Long, val mismatches: Long) extends Serializable { +private[adam] class Observation(val total: Long, val mismatches: Long) extends Serializable { require(mismatches >= 0 && mismatches <= total) def this(that: Observation) = this(that.total, that.mismatches) @@ -76,13 +76,13 @@ class Observation(val total: Long, val mismatches: Long) extends Serializable { } -object Observation { +private[recalibration] object Observation { val empty = new Observation(0, 0) def apply(isMismatch: Boolean) = new Observation(1, if (isMismatch) 1 else 0) } -class Aggregate private ( +private[adam] class Aggregate private ( total: Long, // number of total observations mismatches: Long, // number of mismatches observed val expectedMismatches: Double // expected number of mismatches based on reported quality scores @@ -100,7 +100,7 @@ class Aggregate private ( ) } -object Aggregate { +private[recalibration] object Aggregate { val empty: Aggregate = new Aggregate(0, 0, 0) def apply(key: CovariateKey, value: Observation) = @@ -110,7 +110,7 @@ object Aggregate { /** * Table containing the empirical frequency of mismatches for each set of covariate values. 
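An `Observation` reduces to a (total, mismatches) pair, from which an empirical Phred-scaled quality can be derived. An illustrative calculation with +1/+2 pseudocount smoothing, which is a common choice assumed here rather than taken from ADAM's exact estimator:

```scala
import scala.math.log10

def empiricalPhred(total: Long, mismatches: Long): Int = {
  val errorRate = (mismatches + 1.0) / (total + 2.0)
  math.round(-10.0 * log10(errorRate)).toInt
}

val q = empiricalPhred(10000L, 2L) // ~Q35 for 2 mismatches in 10,000 bases
```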
 */
-class ObservationTable(
+private[adam] class ObservationTable(
     val space: CovariateSpace,
     val entries: Map[CovariateKey, Observation]) extends Serializable {
@@ -128,7 +128,7 @@ class ObservationTable(
   def csvHeader: Seq[String] = space.csvHeader ++ Seq("TotalCount", "MismatchCount", "EmpiricalQ", "IsSkipped")
 }
 
-class ObservationAccumulator(val space: CovariateSpace) extends Serializable {
+private[adam] class ObservationAccumulator(val space: CovariateSpace) extends Serializable {
   private val entries = mutable.HashMap[CovariateKey, Observation]()
 
   def +=(that: (CovariateKey, Observation)): ObservationAccumulator = ObservationAccumulatorSeq.time {
@@ -150,6 +150,6 @@ class ObservationAccumulator(val space: CovariateSpace) extends Serializable {
   def result: ObservationTable = new ObservationTable(space, entries.toMap)
 }
 
-object ObservationAccumulator {
+private[recalibration] object ObservationAccumulator {
   def apply(space: CovariateSpace) = new ObservationAccumulator(space)
 }
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Recalibrator.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Recalibrator.scala
index 801a11e811..a608a0c3a6 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Recalibrator.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Recalibrator.scala
@@ -25,7 +25,7 @@ import org.bdgenomics.formats.avro.AlignmentRecord
 import org.bdgenomics.adam.instrumentation.Timers._
 import scala.math.{ exp, log }
 
-class Recalibrator(val table: RecalibrationTable, val minAcceptableQuality: QualityScore)
+private[recalibration] class Recalibrator(val table: RecalibrationTable, val minAcceptableQuality: QualityScore)
     extends Serializable {
 
   def apply(r: (Option[DecadentRead], Option[AlignmentRecord])): AlignmentRecord = RecalibrateRead.time {
@@ -53,13 +53,13 @@ class Recalibrator(val table: RecalibrationTable, val minAcceptableQuality: Qual
   }
 }
 
-object Recalibrator {
+private[recalibration] object Recalibrator {
   def apply(observed: ObservationTable, minAcceptableQuality: QualityScore): Recalibrator = {
     new Recalibrator(RecalibrationTable(observed), minAcceptableQuality)
   }
 }
 
-class RecalibrationTable(
+private[recalibration] class RecalibrationTable(
     // covariates for this recalibration
     val covariates: CovariateSpace,
     // marginal and quality scores by read group,
@@ -169,6 +169,8 @@ object RecalibrationTable {
 }
 
-class QualityTable(val table: Map[QualityScore, (Aggregate, ExtrasTables)]) extends Serializable
+private[recalibration] class QualityTable(
+  val table: Map[QualityScore, (Aggregate, ExtrasTables)]) extends Serializable
 
-class ExtrasTables(val extrasTables: IndexedSeq[Map[Option[Covariate#Value], Aggregate]]) extends Serializable
+private[recalibration] class ExtrasTables(
+  val extrasTables: IndexedSeq[Map[Option[Covariate#Value], Aggregate]]) extends Serializable
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/ADAMVCFOutputFormat.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/ADAMVCFOutputFormat.scala
index c80c7d209f..eeb1a872ea 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/ADAMVCFOutputFormat.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/ADAMVCFOutputFormat.scala
@@ -32,7 +32,7 @@ import org.seqdoop.hadoop_bam.{
 /**
  * Wrapper for Hadoop-BAM to work around requirement for no-args constructor.
  *
- * @tparam K
+ * @tparam K The key type. Keys are not written.
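+ *
+ * Hadoop instantiates OutputFormats reflectively, hence the no-args
+ * constructor requirement this wrapper works around. An illustrative save
+ * call (key and value types assumed from Hadoop-BAM, not prescribed by this
+ * patch):
+ * {{{
+ * rdd.saveAsNewAPIHadoopFile(path,
+ *   classOf[LongWritable], classOf[VariantContextWritable],
+ *   classOf[ADAMVCFOutputFormat[LongWritable]], conf)
+ * }}}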
 */
 class ADAMVCFOutputFormat[K] extends KeyIgnoringVCFOutputFormat[K](VCFFormat.VCF) with Serializable {
@@ -56,7 +56,7 @@ class ADAMVCFOutputFormat[K] extends KeyIgnoringVCFOutputFormat[K](VCFFormat.VCF
 /**
  * Wrapper for Hadoop-BAM to work around requirement for no-args constructor.
  *
- * @tparam K
+ * @tparam K The key type. Keys are not written.
  */
 class ADAMHeaderlessVCFOutputFormat[K] extends KeyIgnoringVCFOutputFormat[K](VCFFormat.VCF) with Serializable {
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFInFormatter.scala
index 3d02d7d70a..7345755078 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFInFormatter.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFInFormatter.scala
@@ -34,15 +34,26 @@ import org.bdgenomics.adam.models.{
 import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion }
 import scala.collection.JavaConversions._
 
+/**
+ * InFormatter companion that builds a VCFInFormatter to write VCF to a pipe.
+ */
 object VCFInFormatter extends InFormatterCompanion[VariantContext, VariantContextRDD, VCFInFormatter] {
 
+  /**
+   * Apply method for building the VCFInFormatter from a VariantContextRDD.
+   *
+   * @param gRdd VariantContextRDD to build VCF header from.
+   * @return A constructed VCFInFormatter with all needed metadata to write a
+   *   VCF header.
+   */
   def apply(gRdd: VariantContextRDD): VCFInFormatter = {
     VCFInFormatter(gRdd.sequences, gRdd.samples.map(_.getSampleId))
   }
 }
 
-case class VCFInFormatter(sequences: SequenceDictionary,
-                          samples: Seq[String]) extends InFormatter[VariantContext, VariantContextRDD, VCFInFormatter] {
+private[variation] case class VCFInFormatter private (
+    sequences: SequenceDictionary,
+    samples: Seq[String]) extends InFormatter[VariantContext, VariantContextRDD, VCFInFormatter] {
 
   protected val companion = VCFInFormatter
@@ -57,7 +68,7 @@ case class VCFInFormatter(sequences: SequenceDictionary,
    */
   def write(os: OutputStream, iter: Iterator[VariantContext]) {
 
-    // create a sam file writer connected to the output stream
+    // create a vcf writer connected to the output stream
     val writer = new VariantContextWriterBuilder()
       .setOutputStream(os)
       .clearIndexCreator()
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFOutFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFOutFormatter.scala
index 131123c258..ade83703fd 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFOutFormatter.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFOutFormatter.scala
@@ -29,10 +29,13 @@ import org.bdgenomics.adam.rdd.OutFormatter
 import scala.annotation.tailrec
 import scala.collection.mutable.ListBuffer
 
+/**
+ * OutFormatter that reads streaming VCF.
+ */
 case class VCFOutFormatter() extends OutFormatter[VariantContext] {
 
   /**
-   * Reads alignment records from an input stream. Autodetects SAM/BAM format.
+   * Reads VariantContexts from an input stream. Autodetects VCF format.
    *
    * @param is An InputStream connected to a process we are piping from.
-   * @return Returns an iterator of AlignmentRecords read from the stream.
+   * @return Returns an iterator of VariantContexts read from the stream.
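Taken together, VCFInFormatter and VCFOutFormatter are the two halves of the
pipe API for variant data: the InFormatter streams each partition to a child
process as VCF, and the OutFormatter parses the VCF that the process writes
back. A minimal sketch of the round trip (the exact pipe signature, the vcRdd
value, and the availability of bcftools on the worker nodes are assumptions,
not part of this patch):

    import org.bdgenomics.adam.models.VariantContext
    import org.bdgenomics.adam.rdd.ADAMContext._
    import org.bdgenomics.adam.rdd.variation.{ VCFInFormatter, VCFOutFormatter, VariantContextRDD }

    // OutFormatter that parses VCF from the child process's stdout
    implicit val outFormatter = VCFOutFormatter()

    // stream each partition through bcftools as VCF and read the filtered
    // variant contexts back
    val filtered: VariantContextRDD =
      vcRdd.pipe[VariantContext, VariantContextRDD, VCFInFormatter](
        "bcftools view -i 'QUAL>30' -")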
diff --git a/adam-cli/src/test/resources/NA12878.sam b/adam-core/src/test/resources/NA12878.sam
similarity index 100%
rename from adam-cli/src/test/resources/NA12878.sam
rename to adam-core/src/test/resources/NA12878.sam
diff --git a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/FlagStatSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/FlagStatSuite.scala
similarity index 92%
rename from adam-cli/src/test/scala/org/bdgenomics/adam/cli/FlagStatSuite.scala
rename to adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/FlagStatSuite.scala
index 61a6922e70..610ff260dc 100644
--- a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/FlagStatSuite.scala
+++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/FlagStatSuite.scala
@@ -15,26 +15,19 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.bdgenomics.adam.cli
+package org.bdgenomics.adam.rdd.read
 
 import org.apache.spark.rdd.RDD
 import org.bdgenomics.adam.projections.{ AlignmentRecordField, Projection }
 import org.bdgenomics.adam.rdd.ADAMContext._
-import org.bdgenomics.adam.rdd.read.FlagStat._
-import org.bdgenomics.adam.rdd.read.{ DuplicateMetrics, FlagStatMetrics }
 import org.bdgenomics.adam.util.ADAMFunSuite
 import org.bdgenomics.formats.avro.AlignmentRecord
-import org.bdgenomics.utils.cli.Args4j
 
 class FlagStatSuite extends ADAMFunSuite {
 
   sparkTest("Standard FlagStat test") {
-    val inputpath = testFile("NA12878.sam")
-    val argLine = "%s".format(inputpath).split("\\s+")
-
-    val args: FlagStatArgs = Args4j.apply[FlagStatArgs](argLine)
-
+    val inputPath = testFile("NA12878.sam")
     val projection = Projection(
       AlignmentRecordField.readMapped,
       AlignmentRecordField.mateMapped,
@@ -50,9 +43,9 @@ class FlagStatSuite extends ADAMFunSuite {
       AlignmentRecordField.mapq,
       AlignmentRecordField.failedVendorQualityChecks)
 
-    val adamFile: RDD[AlignmentRecord] = sc.loadAlignments(args.inputPath, projection = Some(projection)).rdd
+    val adamFile: RDD[AlignmentRecord] = sc.loadAlignments(inputPath, projection = Some(projection)).rdd
 
-    val (failedVendorQuality, passedVendorQuality) = apply(adamFile)
+    val (failedVendorQuality, passedVendorQuality) = FlagStat(adamFile)
 
     def percent(fraction: Long, total: Long) = if (total == 0) 0.0 else 100.00 * fraction.toFloat / total
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala
index da4c691d65..2c5dcf77b9 100644
--- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala
+++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala
@@ -80,9 +80,7 @@ class MDTaggingSuite extends ADAMFunSuite {
       )
     }
 
-    for (i <- List(1, 10)) {
-      check(MDTagging(reads, ReferenceContigMap(makeFrags(fs: _*)), partitionSize = i))
-    }
+    check(MDTagging(reads, ReferenceContigMap(makeFrags(fs: _*))))
   }
 
   sparkTest("test adding MDTags over boundary") {