From 4063b2922ec10c7b7a311acdf5c0d834acbca391 Mon Sep 17 00:00:00 2001 From: Frank Austin Nothaft Date: Sun, 13 Nov 2016 15:17:05 -0800 Subject: [PATCH] Finishing up the cleanup on org.bdgenomics.adam.rdd. * Made `MDTagging` private to `read`. Exposed it through `AlignmentRecordRDD`. Cleaned up where it was used in `Transform`. * Made `FlagStat` and most related models private to `read`. * Made all `read.recalibration` and `read.realignment` classes private to package. Depending on the class, this varied from: * `adam` for classes that need to be registered with `kryo` * `read` for the core `RealignIndels` and `BaseQualityRecalibrator` engines * Their subpackage where possible. * Made class values for genomic partitioners as private as possible. * Added documentation to all `InFormatter`s, `InFormatterCompanion`s, and `OutFormatter`s. Made the `InFormatter`s package private with private constructors. * Added method/class level scaladoc where missing. * Moved `org.bdgenomics.adam.cli.FlagStatSuite` to `org.bdgenomics.adam.rdd.read`, along with the `NA12878.sam` test resource, which was otherwise unused in the `adam-cli` submodule. --- .../org/bdgenomics/adam/cli/Transform.scala | 20 +++---- .../org/bdgenomics/adam/rdd/ADAMContext.scala | 9 +++ .../adam/rdd/ADAMRDDFunctions.scala | 51 +++++++++++++++- .../adam/rdd/GenomicPartitioners.scala | 55 ++++++++++++++++-- .../org/bdgenomics/adam/rdd/GenomicRDD.scala | 58 +++++++++++++++++++ .../org/bdgenomics/adam/rdd/InFormatter.scala | 17 ++++++ .../org/bdgenomics/adam/rdd/RegionJoin.scala | 12 ++++ .../adam/rdd/ShuffleRegionJoin.scala | 16 ++++- .../rdd/contig/FlankReferenceFragments.scala | 16 +++++ .../contig/NucleotideContigFragmentRDD.scala | 4 +- .../adam/rdd/feature/CoverageRDD.scala | 6 +- .../adam/rdd/feature/FeatureRDD.scala | 18 ++++-- .../InterleavedFASTQInFormatter.scala | 24 +++++++- .../adam/rdd/read/ADAMBAMOutputFormat.scala | 20 +++++++ .../adam/rdd/read/ADAMCRAMOutputFormat.scala | 20 +++++++ .../adam/rdd/read/AlignmentRecordRDD.scala | 25 ++++++++ .../adam/rdd/read/AnySAMInFormatter.scala | 28 +++++++++ .../adam/rdd/read/AnySAMOutFormatter.scala | 4 ++ .../adam/rdd/read/BAMInFormatter.scala | 10 +++- .../bdgenomics/adam/rdd/read/FlagStat.scala | 18 +++--- .../bdgenomics/adam/rdd/read/MDTagging.scala | 28 +++------ .../adam/rdd/read/MarkDuplicates.scala | 1 - .../adam/rdd/read/SAMInFormatter.scala | 10 +++- .../realignment/IndelRealignmentTarget.scala | 20 +++---- .../rdd/read/realignment/RealignIndels.scala | 5 +- .../realignment/RealignmentTargetFinder.scala | 5 +- .../BaseQualityRecalibration.scala | 4 +- .../rdd/read/recalibration/Covariate.scala | 10 ++-- .../read/recalibration/CycleCovariate.scala | 2 +- .../read/recalibration/DinucCovariate.scala | 2 +- .../read/recalibration/ObservationTable.scala | 14 ++--- .../rdd/read/recalibration/Recalibrator.scala | 12 ++-- .../rdd/variation/ADAMVCFOutputFormat.scala | 4 +- .../adam/rdd/variation/VCFInFormatter.scala | 17 +++++- .../adam/rdd/variation/VCFOutFormatter.scala | 5 +- .../src/test/resources/NA12878.sam | 0 .../adam/rdd/read}/FlagStatSuite.scala | 15 ++--- .../adam/rdd/read/MDTaggingSuite.scala | 4 +- 38 files changed, 463 insertions(+), 126 deletions(-) rename {adam-cli => adam-core}/src/test/resources/NA12878.sam (100%) rename {adam-cli/src/test/scala/org/bdgenomics/adam/cli => adam-core/src/test/scala/org/bdgenomics/adam/rdd/read}/FlagStatSuite.scala (92%) diff --git a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala
b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala index 109ac8080b..07b3a990b4 100644 --- a/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala +++ b/adam-cli/src/main/scala/org/bdgenomics/adam/cli/Transform.scala @@ -316,19 +316,17 @@ class Transform(protected val args: TransformArgs) extends BDGSparkCommand[Trans * these tags are recomputed or not. If MD tagging isn't requested, we * return the input RDD. */ - def maybeMdTag(rdd: AlignmentRecordRDD, + def maybeMdTag(sc: SparkContext, + rdd: AlignmentRecordRDD, stringencyOpt: Option[ValidationStringency]): AlignmentRecordRDD = { if (args.mdTagsReferenceFile != null) { log.info(s"Adding MDTags to reads based on reference file ${args.mdTagsReferenceFile}") - rdd.transform(r => { - MDTagging( - r, - args.mdTagsReferenceFile, - fragmentLength = args.mdTagsFragmentSize, - overwriteExistingTags = args.mdTagsOverwrite, - validationStringency = stringencyOpt.getOrElse(ValidationStringency.STRICT) - ) - }) + val referenceFile = sc.loadReferenceFile(args.mdTagsReferenceFile, + fragmentLength = args.mdTagsFragmentSize) + rdd.computeMismatchingPositions( + referenceFile, + overwriteExistingTags = args.mdTagsOverwrite, + validationStringency = stringencyOpt.getOrElse(ValidationStringency.STRICT)) } else { rdd } @@ -360,7 +358,7 @@ class Transform(protected val args: TransformArgs) extends BDGSparkCommand[Trans val maybeSortedRdd = maybeSort(finalPreprocessedRdd, sl) // recompute md tags, if requested, and return - maybeMdTag(maybeSortedRdd, stringencyOpt) + maybeMdTag(sc, maybeSortedRdd, stringencyOpt) } def forceNonParquet(): Boolean = { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala index f797b71fc5..6eba62cc60 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala @@ -87,6 +87,10 @@ private case class LocatableReferenceRegion(rr: ReferenceRegion) extends Locatab def getContig(): String = rr.referenceName } +/** + * This singleton provides an implicit conversion from a SparkContext to the + * ADAMContext, as well as implicit functions for the Pipe API. + */ object ADAMContext { // conversion functions for pipes @@ -141,6 +145,11 @@ private class FileFilter(private val name: String) extends PathFilter { import org.bdgenomics.adam.rdd.ADAMContext._ +/** + * The ADAMContext provides functions on top of a SparkContext for loading genomic data. + * + * @param sc The SparkContext to wrap. + */ class ADAMContext private (@transient val sc: SparkContext) extends Serializable with Logging { /** diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala index ca589822cc..7c79894bb9 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMRDDFunctions.scala @@ -18,7 +18,6 @@ package org.bdgenomics.adam.rdd import java.util.logging.Level - import java.io.OutputStream import org.apache.avro.Schema import org.apache.avro.file.DataFileWriter @@ -41,9 +40,32 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName import org.apache.parquet.hadoop.util.ContextUtil import scala.reflect.ClassTag +/** + * Argument configuration for saving any output format. 
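The refactor above replaces the old `MDTagging` companion call with two composable steps: `loadReferenceFile` on the `SparkContext` and `computeMismatchingPositions` on the `AlignmentRecordRDD` (both introduced later in this patch). A minimal sketch of the new flow; the file paths are hypothetical:

```scala
import htsjdk.samtools.ValidationStringency
import org.apache.spark.SparkContext
import org.bdgenomics.adam.rdd.ADAMContext._

def addMdTags(sc: SparkContext): Unit = {
  // load reads and a broadcastable reference, fragmented into 10 kbp chunks
  val reads = sc.loadAlignments("reads.bam")
  val ref = sc.loadReferenceFile("ref.fa", fragmentLength = 10000L)

  // recompute MD tags, replacing any stale tags; warn rather than fail on mismatch
  val tagged = reads.computeMismatchingPositions(
    ref,
    overwriteExistingTags = true,
    validationStringency = ValidationStringency.LENIENT)

  tagged.saveAsParquet("reads.tagged.adam")
}
```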
+ */ trait ADAMSaveAnyArgs extends SaveArgs { + + /** + * If true and saving as FASTQ, we will sort by read name. + */ var sortFastqOutput: Boolean + + /** + * If true and saving as a legacy format, we will write shards so that they + * can be merged into a single file. + * + * @see deferMerging + */ var asSingleFile: Boolean + + /** + * If true and asSingleFile is true, we will not merge the shards once we + * write them, and will leave them for the user to merge later. If false and + * asSingleFile is true, then we will merge the shards on write. If + * asSingleFile is false, this is ignored. + * + * @see asSingleFile + */ var deferMerging: Boolean } @@ -104,6 +126,17 @@ private[rdd] abstract class ADAMRDDFunctions[T <% IndexedRecord: Manifest] exten ) } + /** + * Saves an RDD of Avro data to Parquet. + * + * @param filePath The path to save the file to. + * @param blockSize The size in bytes of blocks to write. + * @param pageSize The size in bytes of pages to write. + * @param compressCodec The compression codec to apply to pages. + * @param disableDictionaryEncoding If false, dictionary encoding is used. If + * true, delta encoding is used. + * @param schema The schema to set. + */ protected def saveRddAsParquet( filePath: String, blockSize: Int = 128 * 1024 * 1024, @@ -138,10 +171,26 @@ private[rdd] abstract class ADAMRDDFunctions[T <% IndexedRecord: Manifest] exten since = "0.20.0") private[rdd] class ConcreteADAMRDDFunctions[T <% IndexedRecord: Manifest](val rdd: RDD[T]) extends ADAMRDDFunctions[T] { + /** + * Saves an RDD of Avro data to Parquet. + * + * @param args The output format configuration to use when saving the data. + */ def saveAsParquet(args: SaveArgs): Unit = { saveRddAsParquet(args) } + /** + * Saves an RDD of Avro data to Parquet. + * + * @param filePath The path to save the file to. + * @param blockSize The size in bytes of blocks to write. + * @param pageSize The size in bytes of pages to write. + * @param compressCodec The compression codec to apply to pages. + * @param disableDictionaryEncoding If false, dictionary encoding is used. If + * true, delta encoding is used. + * @param schema The schema to set. 
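Since the interaction between `asSingleFile` and `deferMerging` is easy to misread, here is the decision table that the scaladoc above describes, as a standalone sketch (not ADAM code):

```scala
sealed trait ShardHandling
case object MergeShardsOnWrite extends ShardHandling // single file, merged on write
case object LeaveShardsForUser extends ShardHandling // shards written, merge deferred
case object ShardedOutput extends ShardHandling      // normal sharded output

def shardHandling(asSingleFile: Boolean, deferMerging: Boolean): ShardHandling =
  (asSingleFile, deferMerging) match {
    case (true, false) => MergeShardsOnWrite
    case (true, true)  => LeaveShardsForUser
    case (false, _)    => ShardedOutput // deferMerging is ignored
  }
```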
+ */ def saveAsParquet( filePath: String, blockSize: Int = 128 * 1024 * 1024, diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicPartitioners.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicPartitioners.scala index f0bc80e1e9..e123f66fb2 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicPartitioners.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicPartitioners.scala @@ -41,15 +41,15 @@ case class GenomicPositionPartitioner(numParts: Int, seqLengths: Map[String, Lon log.info("Have genomic position partitioner with " + numParts + " partitions, and sequences:") seqLengths.foreach(kv => log.info("Contig " + kv._1 + " with length " + kv._2)) - val names: Seq[String] = seqLengths.keys.toSeq.sortWith(_ < _) - val lengths: Seq[Long] = names.map(seqLengths(_)) + private val names: Seq[String] = seqLengths.keys.toSeq.sortWith(_ < _) + private val lengths: Seq[Long] = names.map(seqLengths(_)) private val cumuls: Seq[Long] = lengths.scan(0L)(_ + _) // total # of bases in the sequence dictionary - val totalLength: Long = lengths.sum + private val totalLength: Long = lengths.sum // referenceName -> cumulative length before this sequence (using seqDict.records as the implicit ordering) - val cumulativeLengths: Map[String, Long] = Map( + private[rdd] val cumulativeLengths: Map[String, Long] = Map( names.zip(cumuls): _* ) @@ -57,11 +57,28 @@ case class GenomicPositionPartitioner(numParts: Int, seqLengths: Map[String, Lon * 'parts' is the total number of partitions for non-UNMAPPED ReferencePositions -- * the total number of partitions (see numPartitions, below) is parts+1, with the * extra partition being included for handling ReferencePosition.UNMAPPED + * + * @see numPartitions */ private val parts = min(numParts, totalLength).toInt + /** + * This is the total number of partitions for both mapped and unmapped + * positions. All unmapped positions go into the last partition. + * + * @see parts + */ override def numPartitions: Int = parts + 1 + /** + * Computes the partition for a key. + * + * @param key A key to compute the partition for. + * @return The partition that this key belongs to. + * + * @throws IllegalArgumentException if the key is not a ReferencePosition, or + * (ReferencePosition, _) tuple. + */ override def getPartition(key: Any): Int = { // This allows partitions that cross chromosome boundaries. @@ -101,9 +118,11 @@ case class GenomicPositionPartitioner(numParts: Int, seqLengths: Map[String, Lon override def toString(): String = { return "%d parts, %d partitions, %s" format (parts, numPartitions, cumulativeLengths.toString) } - } +/** + * Helper for creating genomic position partitioners. + */ object GenomicPositionPartitioner { /** @@ -120,7 +139,17 @@ object GenomicPositionPartitioner { seqDict.records.toSeq.map(rec => (rec.name, rec.length)).toMap } -case class GenomicRegionPartitioner(partitionSize: Long, seqLengths: Map[String, Long], start: Boolean = true) extends Partitioner with Logging { +/** + * A partitioner for ReferenceRegion-keyed data. + * + * @param partitionSize The number of bases per partition. + * @param seqLengths A map between contig names and contig lengths. + * @param start If true, use the start position (instead of the end position) to + * decide which partition a key belongs to. 
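The cumulative-length arithmetic that `GenomicPositionPartitioner` documents above can be summarized in a standalone sketch. The contig lengths are hypothetical and the scaling below is illustrative of the documented behavior, including the extra trailing partition for unmapped positions:

```scala
val seqLengths = Map("chr1" -> 1000L, "chr2" -> 500L) // hypothetical contigs
val names = seqLengths.keys.toSeq.sortWith(_ < _)
val lengths = names.map(seqLengths)
val totalLength = lengths.sum
val cumulativeLengths = names.zip(lengths.scan(0L)(_ + _)).toMap

val numParts = 4
val parts = math.min(numParts, totalLength).toInt

// a mapped position's global offset is scaled into [0, parts);
// partitions may therefore cross chromosome boundaries
def partitionFor(contig: String, pos: Long): Int =
  ((cumulativeLengths(contig) + pos) * parts / totalLength).toInt

val unmappedPartition = parts // numPartitions == parts + 1
```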
+ */ +case class GenomicRegionPartitioner(partitionSize: Long, + seqLengths: Map[String, Long], + start: Boolean = true) extends Partitioner with Logging { private val names: Seq[String] = seqLengths.keys.toSeq.sortWith(_ < _) private val lengths: Seq[Long] = names.map(seqLengths(_)) private val parts: Seq[Int] = lengths.map(v => round(ceil(v.toDouble / partitionSize)).toInt) @@ -138,8 +167,19 @@ case class GenomicRegionPartitioner(partitionSize: Long, seqLengths: Map[String, (cumulParts(refReg.referenceName) + pos / partitionSize).toInt } + /** + * @return The number of partitions described by this partitioner. Roughly the + * size of the genome divided by the partition length. + */ override def numPartitions: Int = parts.sum + /** + * @param key The key to get the partition index for. + * @return The partition that a key should map to. + * + * @throws IllegalArgumentException Throws an exception if the key is not a + * ReferenceRegion or a tuple of (ReferenceRegion, _). + */ override def getPartition(key: Any): Int = { key match { case region: ReferenceRegion => { @@ -153,6 +193,9 @@ case class GenomicRegionPartitioner(partitionSize: Long, seqLengths: Map[String, } } +/** + * Helper object for creating GenomicRegionPartitioners. + */ object GenomicRegionPartitioner { /** diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicRDD.scala index 7b1f31e25c..2a4a6fe7d4 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/GenomicRDD.scala @@ -81,16 +81,39 @@ private[rdd] object GenomicRDD { } } +/** + * A trait that wraps an RDD of genomic data with helpful metadata. + * + * @tparam T The type of the data in the wrapped RDD. + * @tparam U The type of this GenomicRDD. + */ trait GenomicRDD[T, U <: GenomicRDD[T, U]] { + /** + * The RDD of genomic data that we are wrapping. + */ val rdd: RDD[T] + /** + * The sequence dictionary describing the reference assembly this data is + * aligned to. + */ val sequences: SequenceDictionary + /** + * The underlying RDD of genomic data, as a JavaRDD. + */ lazy val jrdd: JavaRDD[T] = { rdd.toJavaRDD() } + /** + * Applies a function that transforms the underlying RDD into a new RDD. + * + * @param tFn A function that transforms the underlying RDD. + * @return A new RDD where the RDD of genomic data has been replaced, but the + * metadata (sequence dictionary, etc.) is copied without modification. + */ def transform(tFn: RDD[T] => RDD[T]): U = { replaceRdd(tFn(rdd)) } @@ -321,6 +344,14 @@ trait GenomicRDD[T, U <: GenomicRDD[T, U]] { }) } + /** + * Runs a filter that selects data in the underlying RDD that overlaps a + * single genomic region. + * + * @param query The region to query for. + * @return Returns a new GenomicRDD containing only data that overlaps the + * query region. + */ def filterByOverlappingRegion(query: ReferenceRegion): U = { replaceRdd(rdd.filter(elem => { @@ -647,13 +678,28 @@ private case class GenericGenomicRDD[T](rdd: RDD[T], } } +/** + * A trait describing a GenomicRDD with data from multiple samples. + */ trait MultisampleGenomicRDD[T, U <: MultisampleGenomicRDD[T, U]] extends GenomicRDD[T, U] { + /** + * The samples whose data are contained in this GenomicRDD. + */ val samples: Seq[Sample] }
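Usage of the two `GenomicRDD` helpers documented above, assuming a `SparkContext` named `sc` and a hypothetical input path (the mapping-quality filter is just an example predicate):

```scala
import org.bdgenomics.adam.models.ReferenceRegion
import org.bdgenomics.adam.rdd.ADAMContext._

val reads = sc.loadAlignments("reads.bam")

// transform: swap out the wrapped RDD; the metadata is carried over unchanged
val highQuality = reads.transform(rdd => {
  rdd.filter(r => r.getMapq != null && r.getMapq > 30)
})

// filterByOverlappingRegion: keep only data overlapping one query region
val slice = reads.filterByOverlappingRegion(ReferenceRegion("chr1", 100000L, 200000L))
```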
+/** + * An abstract class describing a GenomicRDD where: + * + * * The data are Avro IndexedRecords. + * * The data are associated with read groups (i.e., they are reads or fragments). + */ abstract class AvroReadGroupGenomicRDD[T <% IndexedRecord: Manifest, U <: AvroReadGroupGenomicRDD[T, U]] extends AvroGenomicRDD[T, U] { + /** + * A dictionary describing the read groups attached to this GenomicRDD. + */ val recordGroups: RecordGroupDictionary override protected def saveMetadata(filePath: String) { @@ -675,6 +721,10 @@ abstract class AvroReadGroupGenomicRDD[T <% IndexedRecord: Manifest, U <: AvroRe } } +/** + * An abstract class that extends the MultisampleGenomicRDD trait, where the data + * are Avro IndexedRecords. + */ abstract class MultisampleAvroGenomicRDD[T <% IndexedRecord: Manifest, U <: MultisampleAvroGenomicRDD[T, U]] extends AvroGenomicRDD[T, U] with MultisampleGenomicRDD[T, U] { @@ -695,6 +745,11 @@ abstract class MultisampleAvroGenomicRDD[T <% IndexedRecord: Manifest, U <: Mult } } +/** + * An abstract class that extends GenomicRDD, where the underlying data are + * Avro IndexedRecords. This abstract class provides methods for saving to + * Parquet, and provides hooks for writing the metadata. + */ abstract class AvroGenomicRDD[T <% IndexedRecord: Manifest, U <: AvroGenomicRDD[T, U]] extends ADAMRDDFunctions[T] with GenomicRDD[T, U] { @@ -793,6 +848,9 @@ abstract class AvroGenomicRDD[T <% IndexedRecord: Manifest, U <: AvroGenomicRDD[ } } +/** + * A trait for genomic data that is not aligned to a reference (e.g., raw reads). + */ trait Unaligned { val sequences = SequenceDictionary.empty diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/InFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/InFormatter.scala index 80647cce2e..f64099f62b 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/InFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/InFormatter.scala @@ -30,8 +30,25 @@ private[rdd] class InFormatterRunner[T, U <: GenomicRDD[T, U], V <: InFormatter[ } } +/** + * A trait for singleton objects that build an InFormatter from a GenomicRDD. + * + * Often, when creating an output stream, we need to add metadata to the output + * that is not attached to individual records. An example of this is writing a + * header with contig/read group/format info, as is done with SAM/BAM/VCF. + * + * @tparam T The type of the records this InFormatter writes out. + * @tparam U The type of the GenomicRDD this companion object understands. + * @tparam V The type of InFormatter this companion object creates. + */ trait InFormatterCompanion[T, U <: GenomicRDD[T, U], V <: InFormatter[T, U, V]] { + /** + * Creates an InFormatter from a GenomicRDD. + * + * @param gRdd The GenomicRDD to get metadata from. + * @return Returns an InFormatter with attached metadata. + */ def apply(gRdd: U): V } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala index 3ebe5ed5d1..ffa64ed849 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/RegionJoin.scala @@ -24,7 +24,19 @@ import scala.Predef._ import org.apache.spark.SparkContext import scala.reflect.ClassTag +/** + * A trait describing a join in the genomic coordinate space between two RDDs + * where the values are keyed by a ReferenceRegion. + * + * @tparam T The type of the left RDD. + * @tparam U The type of the right RDD. + * @tparam RT The type of data yielded by the left RDD at the output of the + * join. This may not match T if the join is an outer join, etc. + * @tparam RU The type of data yielded by the right RDD at the output of the + * join. + */ trait RegionJoin[T, U, RT, RU] {
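For intuition about the join semantics described above, here is a brute-force reference for an inner overlap join over region-keyed collections. This is a standalone sketch, not ADAM code; the real implementations avoid the quadratic scan:

```scala
import org.bdgenomics.adam.models.ReferenceRegion

def naiveInnerJoin[T, U](left: Seq[(ReferenceRegion, T)],
                         right: Seq[(ReferenceRegion, U)]): Seq[(T, U)] = {
  for {
    (lr, t) <- left
    (rr, u) <- right if lr.overlaps(rr)
  } yield (t, u)
}
```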
+ /** * Performs a region join between two RDDs. * diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala index 07c51ecfe6..0869515de5 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/ShuffleRegionJoin.scala @@ -25,11 +25,21 @@ import org.bdgenomics.adam.models.{ SequenceDictionary, ReferenceRegion } import scala.collection.mutable.ListBuffer import scala.reflect.ClassTag +/** + * A trait describing join implementations that are based on a sort-merge join. + * + * @tparam T The type of the left RDD. + * @tparam U The type of the right RDD. + * @tparam RT The type of data yielded by the left RDD at the output of the + * join. This may not match T if the join is an outer join, etc. + * @tparam RU The type of data yielded by the right RDD at the output of the + * join. + */ sealed trait ShuffleRegionJoin[T, U, RT, RU] extends RegionJoin[T, U, RT, RU] { - val sd: SequenceDictionary - val partitionSize: Long - val sc: SparkContext + protected val sd: SequenceDictionary + protected val partitionSize: Long + protected val sc: SparkContext // Create the set of bins across the genome for parallel processing // partitionSize (in nucleotides) may range from 10000 to 10000000+ diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala index 38615c3950..df98a749af 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/FlankReferenceFragments.scala @@ -23,8 +23,24 @@ import org.bdgenomics.adam.models.{ ReferenceRegion, SequenceDictionary } import org.bdgenomics.adam.rdd.ReferencePartitioner import org.bdgenomics.formats.avro.NucleotideContigFragment +/** + * Object that extends all of the fragments in an RDD of contig fragments + * with the sequence flanking each fragment. + */ private[contig] object FlankReferenceFragments extends Serializable { + /** + * Adds flanks to sequence fragments in an RDD. + * + * Assumes that after sorting, all fragments are contiguous. + * + * @param rdd The RDD to flank. + * @param sd The sequence dictionary describing the contigs in this RDD. + * @param flankSize The size of flanking sequence to add to each fragment. + * @return Returns a new RDD where each fragment has been extended with + * flanking sequence.
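The flanking idea above reduces to widening each fragment's region by `flankSize` on both sides, clipped to the contig bounds. A standalone sketch of that step only; the real implementation also stitches in flanking sequence from adjacent fragments:

```scala
import org.bdgenomics.adam.models.ReferenceRegion

def flankedRegion(region: ReferenceRegion,
                  flankSize: Long,
                  contigLength: Long): ReferenceRegion = {
  region.copy(
    start = math.max(0L, region.start - flankSize),
    end = math.min(contigLength, region.end + flankSize))
}
```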
+ */ def apply( rdd: RDD[NucleotideContigFragment], sd: SequenceDictionary, diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala index 7067c1b4a2..d29557e6e7 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/contig/NucleotideContigFragmentRDD.scala @@ -32,7 +32,7 @@ import org.bdgenomics.formats.avro.{ AlignmentRecord, NucleotideContigFragment } import scala.collection.JavaConversions._ import scala.math.max -object NucleotideContigFragmentRDD extends Serializable { +private[rdd] object NucleotideContigFragmentRDD extends Serializable { /** * Helper function for building a NucleotideContigFragmentRDD when no @@ -42,7 +42,7 @@ object NucleotideContigFragmentRDD extends Serializable { * this RDD. * @return Returns a new NucleotideContigFragmentRDD. */ - private[rdd] def apply(rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { + def apply(rdd: RDD[NucleotideContigFragment]): NucleotideContigFragmentRDD = { // get sequence dictionary val sd = new SequenceDictionary(rdd.flatMap(ncf => { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageRDD.scala index 94fa80a21e..cdab4f80e7 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/CoverageRDD.scala @@ -25,13 +25,13 @@ import org.bdgenomics.adam.models.{ SequenceRecord } import org.bdgenomics.adam.rdd.GenomicRDD - import scala.annotation.tailrec /** - * An RDD containing Coverage. + * An RDD containing Coverage data. * - * @param rdd Coverage + * @param rdd An RDD containing data describing how many reads cover a genomic + * locus/region. * @param sequences A dictionary describing the reference genome. */ case class CoverageRDD(rdd: RDD[Coverage], diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala index adfc580a66..b376a9906e 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/feature/FeatureRDD.scala @@ -72,7 +72,7 @@ private trait FeatureOrdering[T <: Feature] extends Ordering[T] { } private object FeatureOrdering extends FeatureOrdering[Feature] {} -object FeatureRDD { +private[rdd] object FeatureRDD { /** * @param elem The feature to extract a sequence record from. @@ -89,7 +89,7 @@ object FeatureRDD { * @param rdd The underlying Feature RDD to build from. * @return Returns a new FeatureRDD. */ - private[rdd] def apply(rdd: RDD[Feature]): FeatureRDD = { + def apply(rdd: RDD[Feature]): FeatureRDD = { // cache the rdd, since we're making multiple passes rdd.cache() @@ -128,7 +128,7 @@ object FeatureRDD { * @param feature Feature to write in IntervalList format. * @return Feature as a one line interval list string. */ - private[rdd] def toInterval(feature: Feature): String = { + def toInterval(feature: Feature): String = { val sequenceName = feature.getContigName val start = feature.getStart + 1 // IntervalList ranges are 1-based val end = feature.getEnd // IntervalList ranges are closed @@ -141,7 +141,7 @@ object FeatureRDD { * @param feature Feature to write in the narrow peak format. 
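The `+ 1` in `toInterval` above is the usual shift between ADAM/BED-style 0-based, end-exclusive coordinates and Picard IntervalList 1-based, end-inclusive coordinates. A standalone round trip:

```scala
def toIntervalCoords(start: Long, end: Long): (Long, Long) = (start + 1, end)
def fromIntervalCoords(start: Long, end: Long): (Long, Long) = (start - 1, end)

assert(toIntervalCoords(999L, 2000L) == ((1000L, 2000L)))
assert(fromIntervalCoords(1000L, 2000L) == ((999L, 2000L)))
```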
* @return Returns this feature as a single narrow peak line. */ - private[rdd] def toNarrowPeak(feature: Feature): String = { + def toNarrowPeak(feature: Feature): String = { val chrom = feature.getContigName val start = feature.getStart val end = feature.getEnd @@ -159,7 +159,7 @@ object FeatureRDD { * @param feature Feature to write in BED format. * @return Returns the feature as a single line BED string. */ - private[rdd] def toBed(feature: Feature): String = { + def toBed(feature: Feature): String = { val chrom = feature.getContigName val start = feature.getStart val end = feature.getEnd @@ -186,7 +186,7 @@ object FeatureRDD { * @param feature Feature to write in GFF3 format. * @return Returns this feature as a single line GFF3 string. */ - private[rdd] def toGff3(feature: Feature): String = { + def toGff3(feature: Feature): String = { def escape(entry: (Any, Any)): String = { entry._1 + "=" + entry._2 } @@ -204,6 +204,12 @@ object FeatureRDD { } } +/** + * A GenomicRDD that wraps Feature data. + * + * @param rdd An RDD of genomic Features. + * @param sequences The reference genome this data is aligned to. + */ case class FeatureRDD(rdd: RDD[Feature], sequences: SequenceDictionary) extends AvroGenomicRDD[Feature, FeatureRDD] with Logging { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala index 91bb6796b2..0f25b617b5 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/fragment/InterleavedFASTQInFormatter.scala @@ -24,17 +24,39 @@ import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion } import org.bdgenomics.formats.avro.Fragment import org.bdgenomics.utils.misc.Logging +/** + * InFormatter companion that creates an InFormatter that writes interleaved + * FASTQ. + */ object InterleavedFASTQInFormatter extends InFormatterCompanion[Fragment, FragmentRDD, InterleavedFASTQInFormatter] { + /** + * Hadoop configuration path to check for a boolean value indicating whether + * the current or original read qualities should be written. True indicates + * to write the original qualities. The default is false. + */ val WRITE_ORIGINAL_QUAL = "org.bdgenomics.adam.rdd.fragment.InterleavedFASTQInFormatter.writeOriginalQual" + + /** + * Hadoop configuration path to check for a boolean value indicating whether + * to write the "/1" "/2" suffixes to the read name that indicate whether a + * read is first or second in a pair. Default is false (no suffixes). + */ val WRITE_SUFFIXES = "org.bdgenomics.adam.rdd.fragment.InterleavedFASTQInFormatter.writeSuffixes" + /** + * Builds an InterleavedFASTQInFormatter to write Interleaved FASTQ. + * + * @param gRdd GenomicRDD of Fragments. Used to get HadoopConfiguration. + * @return Returns a new Interleaved FASTQ InFormatter. 
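The two Hadoop configuration keys documented above are read when the formatter is constructed, so they must be set before piping. A sketch, assuming `fragments` is a `FragmentRDD`; the flag values here are arbitrary:

```scala
import org.bdgenomics.adam.rdd.fragment.InterleavedFASTQInFormatter

val conf = fragments.rdd.context.hadoopConfiguration
// emit original (OQ) qualities instead of the current quality scores
conf.setBoolean(InterleavedFASTQInFormatter.WRITE_ORIGINAL_QUAL, true)
// append /1 and /2 suffixes to read names
conf.setBoolean(InterleavedFASTQInFormatter.WRITE_SUFFIXES, true)
```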
+ */ def apply(gRdd: FragmentRDD): InterleavedFASTQInFormatter = { new InterleavedFASTQInFormatter(gRdd.rdd.context.hadoopConfiguration) } } -class InterleavedFASTQInFormatter(conf: Configuration) extends InFormatter[Fragment, FragmentRDD, InterleavedFASTQInFormatter] with Logging { +private[fragment] class InterleavedFASTQInFormatter private ( + conf: Configuration) extends InFormatter[Fragment, FragmentRDD, InterleavedFASTQInFormatter] with Logging { protected val companion = InterleavedFASTQInFormatter private val converter = new AlignmentRecordConverter diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMBAMOutputFormat.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMBAMOutputFormat.scala index 1483046fa9..b4d3320ca6 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMBAMOutputFormat.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMBAMOutputFormat.scala @@ -28,6 +28,11 @@ import org.seqdoop.hadoop_bam.{ SAMRecordWritable } +/** + * Wrapper for Hadoop-BAM to work around requirement for no-args constructor. + * + * @tparam K The key type. Keys are not written. + */ class ADAMBAMOutputFormat[K] extends KeyIgnoringBAMOutputFormat[K] with Serializable { @@ -50,11 +55,21 @@ class ADAMBAMOutputFormat[K] } } +/** + * Wrapper that adds instrumentation to the BAM output format. + * + * @tparam K The key type. Keys are not written. + */ class InstrumentedADAMBAMOutputFormat[K] extends InstrumentedOutputFormat[K, org.seqdoop.hadoop_bam.SAMRecordWritable] { override def timerName(): String = Timers.WriteBAMRecord.timerName override def outputFormatClass(): Class[_ <: OutputFormat[K, SAMRecordWritable]] = classOf[ADAMBAMOutputFormat[K]] } +/** + * Wrapper for Hadoop-BAM to work around requirement for no-args constructor. + * + * @tparam K The key type. Keys are not written. + */ class ADAMBAMOutputFormatHeaderLess[K] extends KeyIgnoringBAMOutputFormat[K] with Serializable { @@ -77,6 +92,11 @@ class ADAMBAMOutputFormatHeaderLess[K] } } +/** + * Wrapper that adds instrumentation to the headerless BAM output format. + * + * @tparam K The key type. Keys are not written. + */ class InstrumentedADAMBAMOutputFormatHeaderLess[K] extends InstrumentedOutputFormat[K, org.seqdoop.hadoop_bam.SAMRecordWritable] { override def timerName(): String = Timers.WriteBAMRecord.timerName override def outputFormatClass(): Class[_ <: OutputFormat[K, SAMRecordWritable]] = classOf[ADAMBAMOutputFormatHeaderLess[K]] diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMCRAMOutputFormat.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMCRAMOutputFormat.scala index 6cffed1ba0..28ce20ab13 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMCRAMOutputFormat.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/ADAMCRAMOutputFormat.scala @@ -28,6 +28,11 @@ import org.seqdoop.hadoop_bam.{ SAMRecordWritable } +/** + * Wrapper for Hadoop-BAM to work around requirement for no-args constructor. + * + * @tparam K The key type. Keys are not written. + */ class ADAMCRAMOutputFormat[K] extends KeyIgnoringCRAMOutputFormat[K] with Serializable { @@ -50,11 +55,21 @@ class ADAMCRAMOutputFormat[K] } } +/** + * Wrapper that adds instrumentation to the CRAM output format. + * + * @tparam K The key type. Keys are not written.
+ */ class InstrumentedADAMCRAMOutputFormat[K] extends InstrumentedOutputFormat[K, org.seqdoop.hadoop_bam.SAMRecordWritable] { override def timerName(): String = Timers.WriteCRAMRecord.timerName override def outputFormatClass(): Class[_ <: OutputFormat[K, SAMRecordWritable]] = classOf[ADAMCRAMOutputFormat[K]] } +/** + * Wrapper for Hadoop-BAM to work around requirement for no-args constructor. + * + * @tparam K The key type. Keys are not written. + */ class ADAMCRAMOutputFormatHeaderLess[K] extends KeyIgnoringCRAMOutputFormat[K] with Serializable { @@ -77,6 +92,11 @@ class ADAMCRAMOutputFormatHeaderLess[K] } } +/** + * Wrapper that adds instrumentation to the CRAM output format. + * + * @tparam K The key type. Keys are not written. + */ class InstrumentedADAMCRAMOutputFormatHeaderLess[K] extends InstrumentedOutputFormat[K, org.seqdoop.hadoop_bam.SAMRecordWritable] { override def timerName(): String = Timers.WriteCRAMRecord.timerName override def outputFormatClass(): Class[_ <: OutputFormat[K, SAMRecordWritable]] = classOf[ADAMCRAMOutputFormatHeaderLess[K]] diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala index e5710d003f..5e5fd221ed 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDD.scala @@ -57,6 +57,7 @@ import org.bdgenomics.adam.rdd.read.realignment.RealignIndels import org.bdgenomics.adam.rdd.read.recalibration.BaseQualityRecalibration import org.bdgenomics.adam.rdd.fragment.FragmentRDD import org.bdgenomics.adam.rich.RichAlignmentRecord +import org.bdgenomics.adam.util.ReferenceFile import org.bdgenomics.formats.avro._ import org.bdgenomics.utils.misc.Logging import org.seqdoop.hadoop_bam._ @@ -599,6 +600,30 @@ sealed trait AlignmentRecordRDD extends AvroReadGroupGenomicRDD[AlignmentRecord, replaceRdd(RealignIndels(rdd, consensusModel, isSorted, maxIndelSize, maxConsensusNumber, lodThreshold)) } + /** + * Computes the mismatching positions field (SAM "MD" tag). + * + * @param referenceFile A reference file that can be broadcast to all nodes. + * @param overwriteExistingTags If true, overwrites the MD tags on reads where + * it is already populated. If false, we only tag reads that are currently + * missing an MD tag. Default is false. + * @param validationStringency If we are recalculating existing tags and we + * find that the MD tag that was previously on the read doesn't match our + * new tag, LENIENT will log a warning message, STRICT will throw an + * exception, and SILENT will ignore. Default is LENIENT. + * @return Returns a new AlignmentRecordRDD where all reads have the + * mismatchingPositions field populated. + */ + def computeMismatchingPositions( + referenceFile: ReferenceFile, + overwriteExistingTags: Boolean = false, + validationStringency: ValidationStringency = ValidationStringency.LENIENT): AlignmentRecordRDD = { + replaceRdd(MDTagging(rdd, + referenceFile, + overwriteExistingTags = overwriteExistingTags, + validationStringency = validationStringency).taggedReads) + } + /** * Runs a quality control pass akin to the Samtools FlagStat tool. 
* diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala index 37a579ed1e..522be3e2a2 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMInFormatter.scala @@ -27,11 +27,23 @@ import org.bdgenomics.adam.models.{ import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion } import org.bdgenomics.formats.avro.AlignmentRecord +/** + * Companion object that builds an InFormatter that writes data where the metadata + * is contained in a SAMFileHeaderWritable. + * + * @tparam T The type of the underlying InFormatter. + */ trait AnySAMInFormatterCompanion[T <: AnySAMInFormatter[T]] extends InFormatterCompanion[AlignmentRecord, AlignmentRecordRDD, T] { protected def makeFormatter(header: SAMFileHeaderWritable, recordGroups: RecordGroupDictionary, converter: AlignmentRecordConverter): T + /** + * Makes an AnySAMInFormatter from a GenomicRDD of AlignmentRecords. + * + * @param gRdd AlignmentRecordRDD with reference build and record group info. + * @return Returns an InFormatter that extends AnySAMInFormatter. + */ def apply(gRdd: AlignmentRecordRDD): T = { // make a converter @@ -46,10 +58,26 @@ trait AnySAMInFormatterCompanion[T <: AnySAMInFormatter[T]] extends InFormatterC } } +/** + * A trait that writes reads using an Htsjdk SAMFileWriter. + * + * @tparam T The recursive type of the class that implements this trait. + */ trait AnySAMInFormatter[T <: AnySAMInFormatter[T]] extends InFormatter[AlignmentRecord, AlignmentRecordRDD, T] { + /** + * A serializable form of the SAM File Header. + */ val header: SAMFileHeaderWritable + + /** + * A dictionary describing the read groups these reads are from. + */ val recordGroups: RecordGroupDictionary + + /** + * A converter from AlignmentRecord to SAMRecord. + */ val converter: AlignmentRecordConverter protected def makeWriter(os: OutputStream): SAMFileWriter diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala index fc23606fd7..5f9c9051fd 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AnySAMOutFormatter.scala @@ -25,6 +25,10 @@ import org.bdgenomics.formats.avro.AlignmentRecord import scala.annotation.tailrec import scala.collection.mutable.ListBuffer +/** + * An OutFormatter that automatically infers whether the piped input is SAM or + * BAM. Autodetecting streamed CRAM is not currently supported. + */ class AnySAMOutFormatter extends OutFormatter[AlignmentRecord] { /** diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/BAMInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/BAMInFormatter.scala index fbc26f62b5..2b23bc5d7e 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/BAMInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/BAMInFormatter.scala @@ -28,6 +28,9 @@ import org.bdgenomics.adam.models.{ SAMFileHeaderWritable } +/** + * InFormatter companion for building an InFormatter that streams BAM. 
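These formatters exist to serve the Pipe API (see the implicit conversions on the `ADAMContext` singleton above). A rough usage sketch, assuming `reads` is an `AlignmentRecordRDD` and that `pipe` resolves its formatters implicitly; the command is hypothetical and must read and write SAM on stdin/stdout:

```scala
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.read.{ AlignmentRecordRDD, AnySAMOutFormatter, SAMInFormatter }

implicit val tFormatter = SAMInFormatter
implicit val uFormatter = new AnySAMOutFormatter()

val piped: AlignmentRecordRDD = reads.pipe("./process_sam.sh")
```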
+ */ object BAMInFormatter extends AnySAMInFormatterCompanion[BAMInFormatter] { protected def makeFormatter(header: SAMFileHeaderWritable, @@ -37,9 +40,10 @@ object BAMInFormatter extends AnySAMInFormatterCompanion[BAMInFormatter] { } } -case class BAMInFormatter(header: SAMFileHeaderWritable, - recordGroups: RecordGroupDictionary, - converter: AlignmentRecordConverter) extends AnySAMInFormatter[BAMInFormatter] { +private[read] case class BAMInFormatter private ( + header: SAMFileHeaderWritable, + recordGroups: RecordGroupDictionary, + converter: AlignmentRecordConverter) extends AnySAMInFormatter[BAMInFormatter] { protected val companion = BAMInFormatter diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FlagStat.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FlagStat.scala index e1f24cb503..f3d96c22ba 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FlagStat.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/FlagStat.scala @@ -20,12 +20,12 @@ package org.bdgenomics.adam.rdd.read import org.apache.spark.rdd.RDD import org.bdgenomics.formats.avro.AlignmentRecord -object FlagStatMetrics { +private[read] object FlagStatMetrics { val emptyFailedQuality = new FlagStatMetrics(0, DuplicateMetrics.empty, DuplicateMetrics.empty, 0, 0, 0, 0, 0, 0, 0, 0, 0, true) val emptyPassedQuality = new FlagStatMetrics(0, DuplicateMetrics.empty, DuplicateMetrics.empty, 0, 0, 0, 0, 0, 0, 0, 0, 0, false) } -object DuplicateMetrics { +private[read] object DuplicateMetrics { val empty = new DuplicateMetrics(0, 0, 0, 0) def apply(record: AlignmentRecord): (DuplicateMetrics, DuplicateMetrics) = { @@ -50,7 +50,7 @@ object DuplicateMetrics { } } -case class DuplicateMetrics(total: Long, bothMapped: Long, onlyReadMapped: Long, crossChromosome: Long) { +private[adam] case class DuplicateMetrics(total: Long, bothMapped: Long, onlyReadMapped: Long, crossChromosome: Long) { def +(that: DuplicateMetrics): DuplicateMetrics = { new DuplicateMetrics( total + that.total, @@ -61,11 +61,11 @@ case class DuplicateMetrics(total: Long, bothMapped: Long, onlyReadMapped: Long, } } -case class FlagStatMetrics(total: Long, duplicatesPrimary: DuplicateMetrics, duplicatesSecondary: DuplicateMetrics, - mapped: Long, pairedInSequencing: Long, - read1: Long, read2: Long, properlyPaired: Long, withSelfAndMateMapped: Long, - singleton: Long, withMateMappedToDiffChromosome: Long, - withMateMappedToDiffChromosomeMapQ5: Long, failedQuality: Boolean) { +private[adam] case class FlagStatMetrics(total: Long, duplicatesPrimary: DuplicateMetrics, duplicatesSecondary: DuplicateMetrics, + mapped: Long, pairedInSequencing: Long, + read1: Long, read2: Long, properlyPaired: Long, withSelfAndMateMapped: Long, + singleton: Long, withMateMappedToDiffChromosome: Long, + withMateMappedToDiffChromosomeMapQ5: Long, failedQuality: Boolean) { def +(that: FlagStatMetrics): FlagStatMetrics = { assert(failedQuality == that.failedQuality, "Can't reduce passedVendorQuality with different failedQuality values") new FlagStatMetrics( @@ -86,7 +86,7 @@ case class FlagStatMetrics(total: Long, duplicatesPrimary: DuplicateMetrics, dup } } -object FlagStat { +private[read] object FlagStat { def b2i(boolean: Boolean) = if (boolean) 1 else 0 def b(boolean: java.lang.Boolean) = Option(boolean).exists(x => x) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala index dc6eeb6a10..e358f23ba7 100644 --- 
a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MDTagging.scala @@ -27,10 +27,9 @@ import org.bdgenomics.adam.util.ReferenceFile import org.bdgenomics.formats.avro.AlignmentRecord import org.bdgenomics.utils.misc.Logging -case class MDTagging( +private[read] case class MDTagging( reads: RDD[AlignmentRecord], @transient referenceFile: ReferenceFile, - partitionSize: Long = 1000000, overwriteExistingTags: Boolean = false, validationStringency: ValidationStringency = ValidationStringency.STRICT) extends Logging { @transient val sc = reads.sparkContext @@ -84,24 +83,13 @@ case class MDTagging( } } -object MDTagging { - def apply( - reads: RDD[AlignmentRecord], - referenceFile: String, - fragmentLength: Long, - overwriteExistingTags: Boolean, - validationStringency: ValidationStringency): RDD[AlignmentRecord] = { - val sc = reads.sparkContext - new MDTagging( - reads, - sc.loadReferenceFile(referenceFile, fragmentLength = fragmentLength), - partitionSize = fragmentLength, - overwriteExistingTags, - validationStringency - ).taggedReads - } -} - +/** + * A class describing an exception where a read's MD tag was recomputed and did + * not match the MD tag originally attached to the read. + * + * @param read The read whose MD tag was recomputed, with original MD tag. + * @param mdTag The recomputed MD tag. + */ case class IncorrectMDTagException(read: AlignmentRecord, mdTag: String) extends Exception { override def getMessage: String = s"Read: ${read.getReadName}, pos: ${read.getContigName}:${read.getStart}, cigar: ${read.getCigar}, existing MD tag: ${read.getMismatchingPositions}, correct MD tag: $mdTag" diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala index 419132bece..6170aec56a 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/MarkDuplicates.scala @@ -154,5 +154,4 @@ private[rdd] object MarkDuplicates extends Serializable with Logging { scoreBucket(x._2) - scoreBucket(y._2) } } - } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/SAMInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/SAMInFormatter.scala index 514d0d986e..66d30c28ce 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/SAMInFormatter.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/SAMInFormatter.scala @@ -28,6 +28,9 @@ import org.bdgenomics.adam.models.{ SAMFileHeaderWritable } +/** + * InFormatter companion for building an InFormatter that streams SAM. 
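Under `ValidationStringency.STRICT`, a recomputed MD tag that disagrees with the tag already on a read surfaces as the `IncorrectMDTagException` documented above, wrapped in a `SparkException` on the driver since it is thrown on executors. A sketch, with hypothetical paths:

```scala
import htsjdk.samtools.ValidationStringency
import org.apache.spark.SparkException
import org.bdgenomics.adam.rdd.ADAMContext._

val ref = sc.loadReferenceFile("ref.fa", fragmentLength = 10000L)
try {
  sc.loadAlignments("reads.bam")
    .computeMismatchingPositions(ref,
      validationStringency = ValidationStringency.STRICT)
    .rdd.count() // force evaluation; the tagging is lazy
} catch {
  case e: SparkException => println(s"MD tag validation failed: ${e.getMessage}")
}
```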
+ */ object SAMInFormatter extends AnySAMInFormatterCompanion[SAMInFormatter] { protected def makeFormatter(header: SAMFileHeaderWritable, @@ -37,9 +40,10 @@ object SAMInFormatter extends AnySAMInFormatterCompanion[SAMInFormatter] { } } -case class SAMInFormatter(header: SAMFileHeaderWritable, - recordGroups: RecordGroupDictionary, - converter: AlignmentRecordConverter) extends AnySAMInFormatter[SAMInFormatter] { +private[read] case class SAMInFormatter private ( + header: SAMFileHeaderWritable, + recordGroups: RecordGroupDictionary, + converter: AlignmentRecordConverter) extends AnySAMInFormatter[SAMInFormatter] { protected val companion = SAMInFormatter diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala index aff47a0ad7..56edf5e5cd 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/IndelRealignmentTarget.scala @@ -28,7 +28,7 @@ import org.bdgenomics.adam.instrumentation.Timers._ import scala.collection.JavaConversions._ import scala.collection.immutable.TreeSet -object ZippedTargetOrdering extends Ordering[(IndelRealignmentTarget, Int)] { +private[realignment] object ZippedTargetOrdering extends Ordering[(IndelRealignmentTarget, Int)] { /** * Order two indel realignment targets by earlier starting position. @@ -42,7 +42,7 @@ object ZippedTargetOrdering extends Ordering[(IndelRealignmentTarget, Int)] { } } -object TargetOrdering extends Ordering[IndelRealignmentTarget] { +private[realignment] object TargetOrdering extends Ordering[IndelRealignmentTarget] { /** * Order two indel realignment targets by earlier starting position. @@ -88,7 +88,7 @@ object TargetOrdering extends Ordering[IndelRealignmentTarget] { } } -object IndelRealignmentTarget { +private[realignment] object IndelRealignmentTarget { /** * Generates 1+ indel realignment targets from a single read. 
@@ -136,7 +136,7 @@ object IndelRealignmentTarget { } } -class IndelRealignmentTargetSerializer extends Serializer[IndelRealignmentTarget] { +private[adam] class IndelRealignmentTargetSerializer extends Serializer[IndelRealignmentTarget] { def write(kryo: Kryo, output: Output, obj: IndelRealignmentTarget) = { output.writeString(obj.readRange.referenceName) @@ -161,7 +161,7 @@ class IndelRealignmentTargetSerializer extends Serializer[IndelRealignmentTarget } } -class IndelRealignmentTarget( +private[adam] class IndelRealignmentTarget( val variation: Option[ReferenceRegion], val readRange: ReferenceRegion) extends Logging { @@ -196,7 +196,7 @@ class IndelRealignmentTarget( } } -class TargetSetSerializer extends Serializer[TargetSet] { +private[adam] class TargetSetSerializer extends Serializer[TargetSet] { val irts = new IndelRealignmentTargetSerializer() @@ -217,7 +217,7 @@ class TargetSetSerializer extends Serializer[TargetSet] { } } -class ZippedTargetSetSerializer extends Serializer[ZippedTargetSet] { +private[adam] class ZippedTargetSetSerializer extends Serializer[ZippedTargetSet] { val irts = new IndelRealignmentTargetSerializer() @@ -241,15 +241,15 @@ class ZippedTargetSetSerializer extends Serializer[ZippedTargetSet] { } } -object TargetSet { +private[realignment] object TargetSet { def apply(): TargetSet = { new TargetSet(TreeSet[IndelRealignmentTarget]()(TargetOrdering)) } } // These two case classes are needed to get around some serialization issues -case class TargetSet(set: TreeSet[IndelRealignmentTarget]) extends Serializable { +private[adam] case class TargetSet(set: TreeSet[IndelRealignmentTarget]) extends Serializable { } -case class ZippedTargetSet(set: TreeSet[(IndelRealignmentTarget, Int)]) extends Serializable { +private[adam] case class ZippedTargetSet(set: TreeSet[(IndelRealignmentTarget, Int)]) extends Serializable { } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala index 866bacc7a7..75c457608d 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala @@ -34,7 +34,7 @@ import scala.collection.immutable.{ NumericRange, TreeSet } import scala.collection.mutable import scala.util.Random -private[rdd] object RealignIndels extends Serializable with Logging { +private[read] object RealignIndels extends Serializable with Logging { /** * Realigns an RDD of reads. 
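The `private[adam]` (rather than `private[realignment]`) visibility on the serializers above follows from the commit notes: ADAM's Kryo registrator lives in another package under `org.bdgenomics.adam` and must still reference them. A sketch of that registration; the registrator class name is illustrative, and this only compiles inside the `org.bdgenomics.adam` namespace:

```scala
import com.esotericsoftware.kryo.Kryo
import org.bdgenomics.adam.rdd.read.realignment.{
  IndelRealignmentTarget, IndelRealignmentTargetSerializer, TargetSet, TargetSetSerializer
}

class ExampleRegistrator {
  def registerClasses(kryo: Kryo): Unit = {
    // custom serializers sidestep the serialization issues noted above
    kryo.register(classOf[IndelRealignmentTarget], new IndelRealignmentTargetSerializer)
    kryo.register(classOf[TargetSet], new TargetSetSerializer)
  }
}
```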
@@ -222,7 +222,7 @@ private[rdd] object RealignIndels extends Serializable with Logging { import org.bdgenomics.adam.rdd.read.realignment.RealignIndels._ -private[rdd] class RealignIndels( +private[read] class RealignIndels( val consensusModel: ConsensusGenerator = new ConsensusGeneratorFromReads, val dataIsSorted: Boolean = false, val maxIndelSize: Int = 500, @@ -497,5 +497,4 @@ private[rdd] class RealignIndels( readsMappedToTarget.flatMap(realignTargetGroup).map(r => r.record) } } - } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignmentTargetFinder.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignmentTargetFinder.scala index 927deba3c0..a241aa7dfc 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignmentTargetFinder.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignmentTargetFinder.scala @@ -24,7 +24,7 @@ import org.bdgenomics.adam.instrumentation.Timers._ import scala.annotation.tailrec import scala.collection.immutable.TreeSet -object RealignmentTargetFinder { +private[realignment] object RealignmentTargetFinder { /** * Generates realignment targets from a set of reads. @@ -40,7 +40,7 @@ object RealignmentTargetFinder { } } -class RealignmentTargetFinder extends Serializable with Logging { +private[realignment] class RealignmentTargetFinder extends Serializable with Logging { /** * Joins two sorted sets of targets together. Is tail call recursive. @@ -120,5 +120,4 @@ class RealignmentTargetFinder extends Serializable with Logging { targetSet } - } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala index a13944f87e..ce1acc4a85 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/BaseQualityRecalibration.scala @@ -34,7 +34,7 @@ import org.bdgenomics.formats.avro.AlignmentRecord * a second pass over the reads to apply the recalibration and assign adjusted * quality scores. */ -class BaseQualityRecalibration( +private class BaseQualityRecalibration( val input: RDD[(Option[DecadentRead], Option[AlignmentRecord])], val knownSnps: Broadcast[SnpTable], val dumpObservationTableFile: Option[String] = None) @@ -122,7 +122,7 @@ class BaseQualityRecalibration( } } -object BaseQualityRecalibration { +private[read] object BaseQualityRecalibration { def apply( rdd: RDD[AlignmentRecord], knownSnps: Broadcast[SnpTable], diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Covariate.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Covariate.scala index c25e17a1ec..0d127cf11c 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Covariate.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Covariate.scala @@ -27,7 +27,7 @@ import org.bdgenomics.adam.rich.DecadentRead * @note Concrete implementations of Covariate should inherit from * AbstractCovariate, not Covariate. 
*/ -trait Covariate { +private[recalibration] trait Covariate { type Value /** @@ -54,7 +54,7 @@ trait Covariate { def csvFieldName: String } -abstract class AbstractCovariate[ValueT] extends Covariate with Serializable { +private[recalibration] abstract class AbstractCovariate[ValueT] extends Covariate with Serializable { override type Value = ValueT } @@ -64,7 +64,7 @@ abstract class AbstractCovariate[ValueT] extends Covariate with Serializable { * The values for mandatory covariates are stored in member fields and optional * covariate values are in `extras`. */ -class CovariateKey( +private[adam] class CovariateKey( val readGroup: String, val quality: QualityScore, val extras: Seq[Option[Covariate#Value]]) extends Serializable { @@ -95,7 +95,7 @@ class CovariateKey( * Represents the abstract space of all possible CovariateKeys for the given set * of Covariates. */ -class CovariateSpace(val extras: IndexedSeq[Covariate]) extends Serializable { +private[adam] class CovariateSpace(val extras: IndexedSeq[Covariate]) extends Serializable { // Computes the covariate values for all residues in this read def apply(read: DecadentRead): Seq[CovariateKey] = { // Ask each 'extra' covariate to compute its values for this read @@ -133,7 +133,7 @@ class CovariateSpace(val extras: IndexedSeq[Covariate]) extends Serializable { } -object CovariateSpace { +private[recalibration] object CovariateSpace { def apply(extras: Covariate*): CovariateSpace = new CovariateSpace(extras.toIndexedSeq) } diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/CycleCovariate.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/CycleCovariate.scala index e09fbbe52a..ff180cfc0f 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/CycleCovariate.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/CycleCovariate.scala @@ -20,7 +20,7 @@ package org.bdgenomics.adam.rdd.read.recalibration import org.bdgenomics.adam.rich.DecadentRead // This is based on the CycleCovariate in GATK 1.6. 
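To show how `AbstractCovariate` is meant to be extended, here is a hypothetical covariate in the style of `CycleCovariate`. It assumes `compute` and `csvFieldName` are the abstract members to implement (other members of `Covariate` are elided from this diff), and returns one optional value per residue of the read:

```scala
import org.bdgenomics.adam.rich.DecadentRead

private[recalibration] class PositionParityCovariate extends AbstractCovariate[Boolean] {

  // one value per residue, as CovariateSpace expects
  def compute(read: DecadentRead): Seq[Option[Boolean]] = {
    read.residues.indices.map(i => Some(i % 2 == 0))
  }

  def csvFieldName: String = "PositionParity"
}
```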
-class CycleCovariate extends AbstractCovariate[Int] { +private[adam] class CycleCovariate extends AbstractCovariate[Int] { def compute(read: DecadentRead): Seq[Option[Int]] = { val (initial, increment) = initialization(read) read.residues.indices.map(pos => Some(initial + increment * pos)) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/DinucCovariate.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/DinucCovariate.scala index e17e944e38..cc6c5e8445 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/DinucCovariate.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/DinucCovariate.scala @@ -20,7 +20,7 @@ package org.bdgenomics.adam.rdd.read.recalibration import org.bdgenomics.adam.rich.DecadentRead // TODO: should inherit from something like AbstractCovariate[(DNABase, DNABase)] -class DinucCovariate extends AbstractCovariate[(Char, Char)] { +private[adam] class DinucCovariate extends AbstractCovariate[(Char, Char)] { def compute(read: DecadentRead): Seq[Option[(Char, Char)]] = { val sequence = read.residues.map(_.base) if (read.isNegativeRead) { diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/ObservationTable.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/ObservationTable.scala index 21bad7f005..d605fef991 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/ObservationTable.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/ObservationTable.scala @@ -26,7 +26,7 @@ import scala.collection.mutable * * This is used in ObservationTable, which maps from CovariateKey to Observation. */ -class Observation(val total: Long, val mismatches: Long) extends Serializable { +private[adam] class Observation(val total: Long, val mismatches: Long) extends Serializable { require(mismatches >= 0 && mismatches <= total) def this(that: Observation) = this(that.total, that.mismatches) @@ -76,13 +76,13 @@ class Observation(val total: Long, val mismatches: Long) extends Serializable { } -object Observation { +private[recalibration] object Observation { val empty = new Observation(0, 0) def apply(isMismatch: Boolean) = new Observation(1, if (isMismatch) 1 else 0) } -class Aggregate private ( +private[adam] class Aggregate private ( total: Long, // number of total observations mismatches: Long, // number of mismatches observed val expectedMismatches: Double // expected number of mismatches based on reported quality scores @@ -100,7 +100,7 @@ class Aggregate private ( ) } -object Aggregate { +private[recalibration] object Aggregate { val empty: Aggregate = new Aggregate(0, 0, 0) def apply(key: CovariateKey, value: Observation) = @@ -110,7 +110,7 @@ object Aggregate { /** * Table containing the empirical frequency of mismatches for each set of covariate values. 
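An `Observation` reduces to a (total, mismatches) pair, from which an empirical Phred-scaled quality can be derived. An illustrative calculation with +1/+2 pseudocount smoothing, which is a common choice assumed here rather than taken from ADAM's exact estimator:

```scala
import scala.math.log10

def empiricalPhred(total: Long, mismatches: Long): Int = {
  val errorRate = (mismatches + 1.0) / (total + 2.0)
  math.round(-10.0 * log10(errorRate)).toInt
}

val q = empiricalPhred(10000L, 2L) // ~Q35 for 2 mismatches in 10,000 bases
```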
 */
-class ObservationTable(
+private[adam] class ObservationTable(
     val space: CovariateSpace,
     val entries: Map[CovariateKey, Observation]) extends Serializable {
@@ -128,7 +128,7 @@ class ObservationTable(
   def csvHeader: Seq[String] = space.csvHeader ++ Seq("TotalCount", "MismatchCount", "EmpiricalQ", "IsSkipped")
 }
 
-class ObservationAccumulator(val space: CovariateSpace) extends Serializable {
+private[adam] class ObservationAccumulator(val space: CovariateSpace) extends Serializable {
   private val entries = mutable.HashMap[CovariateKey, Observation]()
 
   def +=(that: (CovariateKey, Observation)): ObservationAccumulator = ObservationAccumulatorSeq.time {
@@ -150,6 +150,6 @@ class ObservationAccumulator(val space: CovariateSpace) extends Serializable {
   def result: ObservationTable = new ObservationTable(space, entries.toMap)
 }
 
-object ObservationAccumulator {
+private[recalibration] object ObservationAccumulator {
   def apply(space: CovariateSpace) = new ObservationAccumulator(space)
 }
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Recalibrator.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Recalibrator.scala
index 801a11e811..a608a0c3a6 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Recalibrator.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/recalibration/Recalibrator.scala
@@ -25,7 +25,7 @@ import org.bdgenomics.formats.avro.AlignmentRecord
 import org.bdgenomics.adam.instrumentation.Timers._
 import scala.math.{ exp, log }
 
-class Recalibrator(val table: RecalibrationTable, val minAcceptableQuality: QualityScore)
+private[recalibration] class Recalibrator(val table: RecalibrationTable, val minAcceptableQuality: QualityScore)
     extends Serializable {
 
   def apply(r: (Option[DecadentRead], Option[AlignmentRecord])): AlignmentRecord = RecalibrateRead.time {
@@ -53,13 +53,13 @@ class Recalibrator(val table: RecalibrationTable, val minAcceptableQuality: Qual
   }
 }
 
-object Recalibrator {
+private[recalibration] object Recalibrator {
   def apply(observed: ObservationTable, minAcceptableQuality: QualityScore): Recalibrator = {
     new Recalibrator(RecalibrationTable(observed), minAcceptableQuality)
   }
 }
 
-class RecalibrationTable(
+private[recalibration] class RecalibrationTable(
     // covariates for this recalibration
     val covariates: CovariateSpace,
     // marginal and quality scores by read group,
@@ -169,6 +169,8 @@ object RecalibrationTable {
 }
 
-class QualityTable(val table: Map[QualityScore, (Aggregate, ExtrasTables)]) extends Serializable
+private[recalibration] class QualityTable(
+  val table: Map[QualityScore, (Aggregate, ExtrasTables)]) extends Serializable
 
-class ExtrasTables(val extrasTables: IndexedSeq[Map[Option[Covariate#Value], Aggregate]]) extends Serializable
+private[recalibration] class ExtrasTables(
+  val extrasTables: IndexedSeq[Map[Option[Covariate#Value], Aggregate]]) extends Serializable
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/ADAMVCFOutputFormat.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/ADAMVCFOutputFormat.scala
index c80c7d209f..eeb1a872ea 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/ADAMVCFOutputFormat.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/ADAMVCFOutputFormat.scala
@@ -32,7 +32,7 @@ import org.seqdoop.hadoop_bam.{
 /**
  * Wrapper for Hadoop-BAM to work around requirement for no-args constructor.
  *
- * @tparam K
+ * @tparam K The key type. Keys are not written.
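+ *
+ * Hadoop instantiates OutputFormats reflectively, hence the no-args
+ * constructor requirement this wrapper works around. An illustrative save
+ * call (key and value types assumed from Hadoop-BAM, not prescribed by this
+ * patch):
+ * {{{
+ * rdd.saveAsNewAPIHadoopFile(path,
+ *   classOf[LongWritable], classOf[VariantContextWritable],
+ *   classOf[ADAMVCFOutputFormat[LongWritable]], conf)
+ * }}}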
 */
 class ADAMVCFOutputFormat[K] extends KeyIgnoringVCFOutputFormat[K](VCFFormat.VCF) with Serializable {
@@ -56,7 +56,7 @@ class ADAMVCFOutputFormat[K] extends KeyIgnoringVCFOutputFormat[K](VCFFormat.VCF
 /**
  * Wrapper for Hadoop-BAM to work around requirement for no-args constructor.
  *
- * @tparam K
+ * @tparam K The key type. Keys are not written.
  */
 class ADAMHeaderlessVCFOutputFormat[K] extends KeyIgnoringVCFOutputFormat[K](VCFFormat.VCF) with Serializable {
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFInFormatter.scala
index 3d02d7d70a..7345755078 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFInFormatter.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFInFormatter.scala
@@ -34,15 +34,26 @@ import org.bdgenomics.adam.models.{
 import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion }
 import scala.collection.JavaConversions._
 
+/**
+ * InFormatter companion that builds a VCFInFormatter to write VCF to a pipe.
+ */
 object VCFInFormatter extends InFormatterCompanion[VariantContext, VariantContextRDD, VCFInFormatter] {
 
+  /**
+   * Apply method for building the VCFInFormatter from a VariantContextRDD.
+   *
+   * @param gRdd VariantContextRDD to build VCF header from.
+   * @return A constructed VCFInFormatter with all needed metadata to write a
+   *   VCF header.
+   */
   def apply(gRdd: VariantContextRDD): VCFInFormatter = {
     VCFInFormatter(gRdd.sequences, gRdd.samples.map(_.getSampleId))
   }
 }
 
-case class VCFInFormatter(sequences: SequenceDictionary,
-                          samples: Seq[String]) extends InFormatter[VariantContext, VariantContextRDD, VCFInFormatter] {
+private[variation] case class VCFInFormatter private (
+    sequences: SequenceDictionary,
+    samples: Seq[String]) extends InFormatter[VariantContext, VariantContextRDD, VCFInFormatter] {
 
   protected val companion = VCFInFormatter
@@ -57,7 +68,7 @@ case class VCFInFormatter(sequences: SequenceDictionary,
    */
   def write(os: OutputStream, iter: Iterator[VariantContext]) {
 
-    // create a sam file writer connected to the output stream
+    // create a vcf writer connected to the output stream
     val writer = new VariantContextWriterBuilder()
       .setOutputStream(os)
       .clearIndexCreator()
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFOutFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFOutFormatter.scala
index 131123c258..ade83703fd 100644
--- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFOutFormatter.scala
+++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/variation/VCFOutFormatter.scala
@@ -29,10 +29,13 @@ import org.bdgenomics.adam.rdd.OutFormatter
 import scala.annotation.tailrec
 import scala.collection.mutable.ListBuffer
 
+/**
+ * OutFormatter that reads streaming VCF.
+ */
 case class VCFOutFormatter() extends OutFormatter[VariantContext] {
 
   /**
-   * Reads alignment records from an input stream. Autodetects SAM/BAM format.
+   * Reads VariantContexts from an input stream. Autodetects VCF format.
    *
    * @param is An InputStream connected to a process we are piping from.
-   * @return Returns an iterator of AlignmentRecords read from the stream.
+   * @return Returns an iterator of VariantContexts read from the stream.
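Taken together, VCFInFormatter and VCFOutFormatter are the two halves of the
pipe API for variant data: the InFormatter streams each partition to a child
process as VCF, and the OutFormatter parses the VCF that the process writes
back. A minimal sketch of the round trip (the exact pipe signature, the vcRdd
value, and the availability of bcftools on the worker nodes are assumptions,
not part of this patch):

    import org.bdgenomics.adam.models.VariantContext
    import org.bdgenomics.adam.rdd.ADAMContext._
    import org.bdgenomics.adam.rdd.variation.{ VCFInFormatter, VCFOutFormatter, VariantContextRDD }

    // OutFormatter that parses VCF from the child process's stdout
    implicit val outFormatter = VCFOutFormatter()

    // stream each partition through bcftools as VCF and read the filtered
    // variant contexts back
    val filtered: VariantContextRDD =
      vcRdd.pipe[VariantContext, VariantContextRDD, VCFInFormatter](
        "bcftools view -i 'QUAL>30' -")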
diff --git a/adam-cli/src/test/resources/NA12878.sam b/adam-core/src/test/resources/NA12878.sam
similarity index 100%
rename from adam-cli/src/test/resources/NA12878.sam
rename to adam-core/src/test/resources/NA12878.sam
diff --git a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/FlagStatSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/FlagStatSuite.scala
similarity index 92%
rename from adam-cli/src/test/scala/org/bdgenomics/adam/cli/FlagStatSuite.scala
rename to adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/FlagStatSuite.scala
index 61a6922e70..610ff260dc 100644
--- a/adam-cli/src/test/scala/org/bdgenomics/adam/cli/FlagStatSuite.scala
+++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/FlagStatSuite.scala
@@ -15,26 +15,19 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.bdgenomics.adam.cli
+package org.bdgenomics.adam.rdd.read
 
 import org.apache.spark.rdd.RDD
 import org.bdgenomics.adam.projections.{ AlignmentRecordField, Projection }
 import org.bdgenomics.adam.rdd.ADAMContext._
-import org.bdgenomics.adam.rdd.read.FlagStat._
-import org.bdgenomics.adam.rdd.read.{ DuplicateMetrics, FlagStatMetrics }
 import org.bdgenomics.adam.util.ADAMFunSuite
 import org.bdgenomics.formats.avro.AlignmentRecord
-import org.bdgenomics.utils.cli.Args4j
 
 class FlagStatSuite extends ADAMFunSuite {
 
   sparkTest("Standard FlagStat test") {
-    val inputpath = testFile("NA12878.sam")
-    val argLine = "%s".format(inputpath).split("\\s+")
-
-    val args: FlagStatArgs = Args4j.apply[FlagStatArgs](argLine)
-
+    val inputPath = testFile("NA12878.sam")
     val projection = Projection(
       AlignmentRecordField.readMapped,
       AlignmentRecordField.mateMapped,
@@ -50,9 +43,9 @@ class FlagStatSuite extends ADAMFunSuite {
       AlignmentRecordField.mapq,
       AlignmentRecordField.failedVendorQualityChecks)
 
-    val adamFile: RDD[AlignmentRecord] = sc.loadAlignments(args.inputPath, projection = Some(projection)).rdd
+    val adamFile: RDD[AlignmentRecord] = sc.loadAlignments(inputPath, projection = Some(projection)).rdd
 
-    val (failedVendorQuality, passedVendorQuality) = apply(adamFile)
+    val (failedVendorQuality, passedVendorQuality) = FlagStat(adamFile)
 
     def percent(fraction: Long, total: Long) = if (total == 0) 0.0 else 100.00 * fraction.toFloat / total
diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala
index da4c691d65..2c5dcf77b9 100644
--- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala
+++ b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/read/MDTaggingSuite.scala
@@ -80,9 +80,7 @@ class MDTaggingSuite extends ADAMFunSuite {
       )
     }
 
-    for (i <- List(1, 10)) {
-      check(MDTagging(reads, ReferenceContigMap(makeFrags(fs: _*)), partitionSize = i))
-    }
+    check(MDTagging(reads, ReferenceContigMap(makeFrags(fs: _*))))
   }
 
   sparkTest("test adding MDTags over boundary") {