diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/TreeRegionJoin.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/TreeRegionJoin.scala index b321fe5fb7..e0e59287fc 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/TreeRegionJoin.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/TreeRegionJoin.scala @@ -155,7 +155,7 @@ class ForestSerializer[T: ClassTag, TS <: Serializer[T]]( } /** - * Implements a shuffle free broadcast region join. + * Implements a shuffle free (broadcast) region join. * * The broadcast values are stored in a sorted array. It was going to be an * ensemble of interval trees, but, that didn't work out. @@ -199,10 +199,13 @@ trait TreeRegionJoin[T, U] { } } +/** + * Implements an inner region join where the left side of the join is broadcast. + */ case class InnerTreeRegionJoin[T, U]() extends RegionJoin[T, U, T, U] with TreeRegionJoin[T, U] { /** - * Performs a region join between two RDDs. + * Performs an inner region join between two RDDs. * * @param baseRDD The 'left' side of the join * @param joinedRDD The 'right' side of the join @@ -225,10 +228,14 @@ case class InnerTreeRegionJoin[T, U]() extends RegionJoin[T, U, T, U] with TreeR } } +/** + * Implements a right outer region join where the left side of the join is + * broadcast. + */ case class RightOuterTreeRegionJoin[T, U]() extends RegionJoin[T, U, Option[T], U] with TreeRegionJoin[T, U] { /** - * Performs a region join between two RDDs. + * Performs a right outer region join between two RDDs. * * @param baseRDD The 'left' side of the join * @param joinedRDD The 'right' side of the join @@ -236,8 +243,10 @@ case class RightOuterTreeRegionJoin[T, U]() extends RegionJoin[T, U, Option[T], * @param uManifest implicit type of joinedRDD * @tparam T type of baseRDD * @tparam U type of joinedRDD - * @return An RDD of pairs (x, y), where x is from baseRDD, y is from joinedRDD, and the region - * corresponding to x overlaps the region corresponding to y. 
+ * @return An RDD of pairs (Option[x], y), where the optional x value is from + * baseRDD, y is from joinedRDD, and the region corresponding to x overlaps + * the region corresponding to y. If there are no keys in the baseRDD that + * overlap a given key (y) from the joinedRDD, x will be None. */ def partitionAndJoin( baseRDD: RDD[(ReferenceRegion, T)], @@ -256,10 +265,16 @@ case class RightOuterTreeRegionJoin[T, U]() extends RegionJoin[T, U, Option[T], } } +/** + * Performs an inner region join, followed logically by grouping by the right + * value. This is implemented without any shuffling; the join naturally returns + * values on the left grouped by the right value. + */ case class InnerTreeRegionJoinAndGroupByRight[T, U]() extends RegionJoin[T, U, Iterable[T], U] with TreeRegionJoin[T, U] { /** - * Performs a region join between two RDDs. + * Performs an inner join between two RDDs, followed by a groupBy on the + * right object. * * @param baseRDD The 'left' side of the join * @param joinedRDD The 'right' side of the join @@ -267,8 +282,10 @@ case class InnerTreeRegionJoinAndGroupByRight[T, U]() extends RegionJoin[T, U, I * @param uManifest implicit type of joinedRDD * @tparam T type of baseRDD * @tparam U type of joinedRDD - * @return An RDD of pairs (x, y), where x is from baseRDD, y is from joinedRDD, and the region - * corresponding to x overlaps the region corresponding to y. + * @return An RDD of pairs (Iterable[x], y), where the Iterable[x] is from + * baseRDD, y is from joinedRDD, and all values in the Iterable[x] are + * aligned at regions that overlap the region corresponding to y. If the + * iterable is empty, the key-value pair is filtered out. */ def partitionAndJoin( baseRDD: RDD[(ReferenceRegion, T)], @@ -279,10 +296,17 @@ case class InnerTreeRegionJoinAndGroupByRight[T, U]() extends RegionJoin[T, U, I } } +/** + * Performs a right outer region join, followed logically by grouping by the right + * value. 
This is implemented without any shuffling; the join naturally returns + * values on the left grouped by the right value. In this implementation, empty + * collections on the left side of the join are kept. + */ case class RightOuterTreeRegionJoinAndGroupByRight[T, U]() extends RegionJoin[T, U, Iterable[T], U] with TreeRegionJoin[T, U] { /** - * Performs a region join between two RDDs. + * Performs a right outer region join between two RDDs, followed by a groupBy on the + * right object. * * @param baseRDD The 'left' side of the join * @param joinedRDD The 'right' side of the join @@ -290,8 +314,10 @@ case class RightOuterTreeRegionJoinAndGroupByRight[T, U]() extends RegionJoin[T, * @param uManifest implicit type of joinedRDD * @tparam T type of baseRDD * @tparam U type of joinedRDD - * @return An RDD of pairs (x, y), where x is from baseRDD, y is from joinedRDD, and the region - * corresponding to x overlaps the region corresponding to y. + * @return An RDD of pairs (Iterable[x], y), where the Iterable[x] is from + * baseRDD, y is from joinedRDD, and all values in the Iterable[x] are + * aligned at regions that overlap the region corresponding to y. If the + * iterable is empty, the key-value pair is NOT filtered out. */ def partitionAndJoin( baseRDD: RDD[(ReferenceRegion, T)],