bigdatagenomics · heuermh · Jul 6, 2016 · Jun 25, 2016 · heuermh · Jun 27, 2016
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/Consensus.scala b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/Consensus.scala
@@ -0,0 +1,126 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.algorithms.consensus
+
+import htsjdk.samtools.{ Cigar, CigarOperator }
+import org.bdgenomics.adam.models.{ ReferencePosition, ReferenceRegion }
+import org.bdgenomics.adam.util.ImplicitJavaConversions._
+
+/**
+ * Singleton object for generating consensus sequences from alignments.
+ *
+ * Provides a helper method for turning a local read alignment into a consensus.
+ */
+private[adam] object Consensus extends Serializable {
+
+  /**
+   * Generates a consensus sequence from a local read alignment.
+   *
+   * Parses the read CIGAR and uses the read sequence to create consensuses. A
+   * consensus sequence is generated if an insertion or deletion is observed.
+   * When an insertion is observed, the insertion is cut from the read sequence
+   * to create the consensus.
+   *
+   * @note This method can only generate a single consensus from a read. Reads
+   *   with more than one INDEL operator are ignored.
+   *
+   * @param sequence Read sequence.
+   * @param start The start position of the read alignment.
+   * @param cigar The CIGAR string for the local alignment of the read.
+   * @return Returns an optional consensus sequence if there is exactly one
+   *   insertion/deletion in the read.
+   */
+  def generateAlternateConsensus(sequence: String,
+                                 start: ReferencePosition,
+                                 cigar: Cigar): Option[Consensus] = {
+
+    var readPos = 0
+    var referencePos = start.pos
+
+    // do we have a single indel alignment block?
+    if (cigar.getCigarElements.count(elem => elem.getOperator == CigarOperator.I ||
+      elem.getOperator == CigarOperator.D) == 1) {
+
+      // loop over elements and generate consensuses for indel blocks
+      cigar.getCigarElements.flatMap(cigarElement => {
+        cigarElement.getOperator match {
+          case CigarOperator.I => Some(new Consensus(sequence.substring(readPos,
+            readPos + cigarElement.getLength),
+            ReferenceRegion(start.referenceName,
+              referencePos,
+              referencePos + 1)))
+          case CigarOperator.D => Some(new Consensus("",
+            ReferenceRegion(start.referenceName,
+              referencePos,
+              referencePos + cigarElement.getLength + 1)))
+          case _ => {
+            if (cigarElement.getOperator.consumesReadBases &&
+              cigarElement.getOperator.consumesReferenceBases) {
+              readPos += cigarElement.getLength
+              referencePos += cigarElement.getLength
+            }
+            None
+          }
+        }
+      }).headOption
+    } else {
+      None
+    }
+  }
+}
+
+/**
+ * An INDEL alt allele to be realigned against.
+ *
+ * A consensus represents an INDEL that will be realigned against. This class
+ * stores the alternate allele string, and the location that this allele spans.
+ *
+ * @param consensus The alternate allele sequence. Empty if a deletion.
+ * @param index The reference region that this allele spans.
+ */
+private[adam] case class Consensus(consensus: String, index: ReferenceRegion) {
+
+  /**
+   * Inserts this consensus sequence into a reference genome sequence.
+   *
+   * @throws IllegalArgumentException If the consensus doesn't overlap with the
+   *   provided reference sequence.
+   *
+   * @param reference The reference genome string of this genomic region.
+   * @param rr The genomic region we are splicing into.
+   * @return Returns the sequence corresponding to the reference with this
+   *   allele spliced in.
+   */
+  def insertIntoReference(reference: String, rr: ReferenceRegion): String = {
+    require(rr.contains(index),
+      "Consensus not contained in reference region: %s vs. %s.".format(
+        index, rr))
+
+    "%s%s%s".format(reference.substring(0, (index.start - rr.start).toInt),
+      consensus,
+      reference.substring((index.end - 1 - rr.start).toInt))
+  }
+
+  override def toString: String = {
+    if (index.start + 1 != index.end) {
+      "Deletion over " + index.toString
+    } else {
+      "Inserted " + consensus + " at " + index.toString
+    }
+  }
+}
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGenerator.scala b/adam-core/src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGenerator.scala
@@ -18,11 +18,21 @@
 package org.bdgenomics.adam.algorithms.consensus
 
 import org.apache.spark.rdd.RDD
-import org.bdgenomics.adam.models.{ Consensus, ReferenceRegion }
+import org.bdgenomics.adam.models.ReferenceRegion
 import org.bdgenomics.adam.rdd.read.realignment.IndelRealignmentTarget
 import org.bdgenomics.adam.rich.RichAlignmentRecord
 
-abstract class ConsensusGenerator extends Serializable {
+/**
+ * Trait for generating consensus sequences for INDEL realignment.
+ *
+ * INDEL realignment scores read alinments against the reference genome and
+ * a set of "consensus" sequences. These consensus sequences represent alternate
+ * alleles/haplotypes, and can be generated via a variety of methods (e.g., seen
+ * in previous projects --> 1kg INDELs, seen in read alignments, etc). This
+ * trait provides an interface that a consensus generation method should
+ * implement to provide it's consensus sequences to the realigner.
+ */
+private[adam] trait ConsensusGenerator extends Serializable {
 
   /**
    * Generates targets to add to initial set of indel realignment targets, if additional

diff --git a/...rc/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromKnowns.scala b/...rc/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromKnowns.scala
@@ -26,9 +26,20 @@ import org.bdgenomics.adam.rich.RichAlignmentRecord
 import org.bdgenomics.formats.avro.Variant
 import scala.transient
 
-class ConsensusGeneratorFromKnowns(file: String, @transient sc: SparkContext) extends ConsensusGenerator {
+/**
+ * Generates consensus sequences from a set of variants.
+ *
+ * Generates a set of consensus sequences by loading variants and filtering on
+ * INDEL variants. The INDEL variants are mapped directly into consensuses using
+ * their alt allele string as the consensus string.
+ *
+ * @param file Path to file containing variants.
+ * @param sc Spark context to use.
+ */
+private[adam] class ConsensusGeneratorFromKnowns(file: String,
+                                                 @transient sc: SparkContext) extends ConsensusGenerator {
 
-  val indelTable = sc.broadcast(IndelTable(file, sc))
+  private val indelTable = sc.broadcast(IndelTable(file, sc))
 
   /**
    * Generates targets to add to initial set of indel realignment targets, if additional
@@ -46,7 +57,7 @@ class ConsensusGeneratorFromKnowns(file: String, @transient sc: SparkContext) ex
 
   /**
    * Performs any preprocessing specific to this consensus generation algorithm, e.g.,
-   * indel normalization.
+   * indel normalization. This is a no-op for the knowns model.
    *
    * @param reads Reads to preprocess.
    * @return Preprocessed reads.

diff --git a/...src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromReads.scala b/...src/main/scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromReads.scala
@@ -18,17 +18,25 @@
 package org.bdgenomics.adam.algorithms.consensus
 
 import org.apache.spark.rdd.RDD
-import org.bdgenomics.adam.models.{ Consensus, ReferenceRegion, ReferencePosition }
+import org.bdgenomics.adam.models.{ ReferencePosition, ReferenceRegion }
 import org.bdgenomics.adam.rdd.read.realignment.IndelRealignmentTarget
-import org.bdgenomics.adam.rich.RichAlignmentRecord
 import org.bdgenomics.adam.rich.RichAlignmentRecord._
 import org.bdgenomics.adam.rich.RichCigar._
-import org.bdgenomics.adam.util.MdTag
+import org.bdgenomics.adam.rich.RichAlignmentRecord
 import org.bdgenomics.adam.util.ImplicitJavaConversions._
 import org.bdgenomics.adam.util.NormalizationUtils._
+import org.bdgenomics.adam.util.MdTag
 import org.bdgenomics.formats.avro.AlignmentRecord
 
-class ConsensusGeneratorFromReads extends ConsensusGenerator {
+/**
+ * Consensus generator that examines the read alignments.
+ *
+ * This consensus generation method preprocesses the reads by left normalizing
+ * all INDELs seen in the reads, and then generates consensus sequences from
+ * reads whose local alignments contain INDELs. These consensuses are deduped
+ * before they are returned.
+ */
+private[adam] class ConsensusGeneratorFromReads extends ConsensusGenerator {
 
   /**
    * No targets to add if generating consensus targets from reads.
@@ -68,7 +76,16 @@ class ConsensusGeneratorFromReads extends ConsensusGenerator {
   }
 
   /**
-   * Generates concensus sequences from reads with indels.
+   * Generates consensus sequences from reads with INDELs.
+   *
+   * Loops over provided reads. Filters all reads without an MD tag, and then
+   * generates consensus sequences. If a read contains a single INDEL aligned to
+   * the reference, we emit that INDEL. Else, we do not emit a consensus from
+   * the read. We dedup the consensuses to remove any INDELs observed in
+   * multiple reads and return.
+   *
+   * @param Reads to search for INDELs.
+   * @return Consensuses generated from reads with a singel INDEL
    */
   def findConsensus(reads: Iterable[RichAlignmentRecord]): Iterable[Consensus] = {
     reads.filter(r => r.mdTag.isDefined)
@@ -87,5 +104,4 @@ class ConsensusGeneratorFromReads extends ConsensusGenerator {
       .toSeq
       .distinct
   }
-
 }
diff --git a/.../scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromSmithWaterman.scala b/.../scala/org/bdgenomics/adam/algorithms/consensus/ConsensusGeneratorFromSmithWaterman.scala
@@ -19,13 +19,27 @@ package org.bdgenomics.adam.algorithms.consensus
 
 import org.bdgenomics.adam.algorithms.smithwaterman.SmithWatermanConstantGapScoring
 import org.bdgenomics.adam.models.ReferenceRegion
-import org.bdgenomics.adam.rich.RichAlignmentRecord
 import org.bdgenomics.adam.rich.RichAlignmentRecord._
 import org.bdgenomics.adam.rich.RichCigar._
+import org.bdgenomics.adam.rich.RichAlignmentRecord
 import org.bdgenomics.adam.util.MdTag
 import org.bdgenomics.formats.avro.AlignmentRecord
 
-class ConsensusGeneratorFromSmithWaterman(
+/**
+ * Generates realignment sequences by brute force locally realigning reads.
+ *
+ * Here, reads are first locally aligned with Smith-Waterman to hopefully
+ * produce consolidated INDEL blocks. Then, the model from the read-based
+ * consensus tool is applied.
+ *
+ * @see ConsensusGeneratorFromReads
+ *
+ * @param wMatch Match weight to use for Smith-Waterman.
+ * @param wMismatch Mismatch penalty to use for Smith-Waterman.
+ * @param wInsert Insert penalty to use for Smith-Waterman.
+ * @param wDelete Deletion penalty to use for Smith-Waterman.
+ */
+private[adam] class ConsensusGeneratorFromSmithWaterman(
     wMatch: Double,
     wMismatch: Double,
     wInsert: Double,

diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/Consensus.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/Consensus.scala
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/models/IndelTable.scala b/adam-core/src/main/scala/org/bdgenomics/adam/models/IndelTable.scala
@@ -20,11 +20,12 @@ package org.bdgenomics.adam.models
 import org.apache.spark.SparkContext
 import org.apache.spark.SparkContext._
 import org.apache.spark.rdd.RDD
+import org.bdgenomics.adam.algorithms.consensus.Consensus
 import org.bdgenomics.adam.rdd.ADAMContext._
 import org.bdgenomics.formats.avro.Variant
 import org.bdgenomics.utils.misc.Logging
 
-class IndelTable(private val table: Map[String, Iterable[Consensus]]) extends Serializable with Logging {
+private[adam] class IndelTable(private val table: Map[String, Iterable[Consensus]]) extends Serializable with Logging {
   log.info("Indel table has %s contigs and %s entries".format(
     table.size,
     table.values.map(_.size).sum
@@ -47,7 +48,7 @@ class IndelTable(private val table: Map[String, Iterable[Consensus]]) extends Se
   }
 }
 
-object IndelTable {
+private[adam] object IndelTable {
 
   /**
    * Creates an indel table from a file containing known indels.

diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctions.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/AlignmentRecordRDDFunctions.scala
@@ -642,7 +642,7 @@ private[rdd] class AlignmentRecordRDDFunctions(val rdd: RDD[AlignmentRecord])
    * @param maxIndelSize The size of the largest indel to use for realignment.
    * @param maxConsensusNumber The maximum number of consensus sequences to realign against per
    * target region.
-   * @param lodThreshold Log-odds threhold to use when realigning; realignments are only finalized
+   * @param lodThreshold Log-odds threshold to use when realigning; realignments are only finalized
    * if the log-odds threshold is exceeded.
    * @param maxTargetSize The maximum width of a single target region for realignment.
    * @return Returns an RDD of mapped reads which have been realigned.

diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/read/realignment/RealignIndels.scala
@@ -21,9 +21,9 @@ import htsjdk.samtools.{ Cigar, CigarElement, CigarOperator }
 import org.bdgenomics.utils.misc.Logging
 import org.apache.spark.rdd.MetricsContext._
 import org.apache.spark.rdd.RDD
-import org.bdgenomics.adam.algorithms.consensus.{ ConsensusGenerator, ConsensusGeneratorFromReads }
+import org.bdgenomics.adam.algorithms.consensus.{ Consensus, ConsensusGenerator, ConsensusGeneratorFromReads }
 import org.bdgenomics.adam.models.ReferenceRegion._
-import org.bdgenomics.adam.models.{ Consensus, ReferencePosition, ReferenceRegion }
+import org.bdgenomics.adam.models.{ ReferencePosition, ReferenceRegion }
 import org.bdgenomics.adam.rich.RichAlignmentRecord
 import org.bdgenomics.adam.rich.RichAlignmentRecord._
 import org.bdgenomics.adam.util.ImplicitJavaConversions._
@@ -282,7 +282,7 @@ private[rdd] class RealignIndels(
         // loop over all consensuses and evaluate
         consensus.foreach(c => {
           // generate a reference sequence from the consensus
-          val consensusSequence = c.insertIntoReference(reference, refStart, refEnd)
+          val consensusSequence = c.insertIntoReference(reference, refRegion)
 
           // evaluate all reads against the new consensus
           val sweptValues = readsToClean.map(r => {

diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala b/adam-core/src/main/scala/org/bdgenomics/adam/serialization/ADAMKryoRegistrator.scala
@@ -43,6 +43,7 @@ import org.apache.avro.io.{ BinaryDecoder, BinaryEncoder, DecoderFactory, Encode
 import org.apache.avro.specific.{ SpecificDatumReader, SpecificDatumWriter, SpecificRecord }
 import org.apache.hadoop.io.{ LongWritable, Text }
 import org.apache.spark.serializer.KryoRegistrator
+import org.bdgenomics.adam.algorithms.consensus.Consensus
 import org.bdgenomics.adam.converters.FastaConverter.FastaDescriptionLine
 import org.bdgenomics.adam.converters.FragmentCollector
 import org.bdgenomics.adam.models._