Fixing estimators and causing all things to compile.
etrain committed Jun 7, 2016
1 parent f092b97 · commit bb062e3
Showing 5 changed files with 38 additions and 38 deletions.
10 changes: 5 additions & 5 deletions src/main/scala/nodes/images/FisherVector.scala
@@ -63,7 +63,7 @@ case class FisherVector(gmm: GaussianMixtureModel)
* @param k Number of centers to estimate.
*/
case class ScalaGMMFisherVectorEstimator(k: Int) extends Estimator[DenseMatrix[Float], DenseMatrix[Float]] {
-protected def fit(data: RDD[DenseMatrix[Float]]): FisherVector = {
+def fit(data: RDD[DenseMatrix[Float]]): FisherVector = {
val gmmTrainingData = data.flatMap(x => convert(MatrixUtils.matrixToColArray(x), Double))
val gmmEst = new GaussianMixtureModelEstimator(k)
val gmm = gmmEst.fit(gmmTrainingData)
@@ -84,12 +84,12 @@ case class ScalaGMMFisherVectorEstimator(k: Int) extends Estimator[DenseMatrix[F
case class GMMFisherVectorEstimator(k: Int) extends OptimizableEstimator[DenseMatrix[Float], DenseMatrix[Float]] {
val default = ScalaGMMFisherVectorEstimator(k)

-override def optimize(sample: RDD[DenseMatrix[Float]], numPerPartition: Map[Int, Int])
-: RDD[DenseMatrix[Float]] => Pipeline[DenseMatrix[Float], DenseMatrix[Float]] = {
+def optimize(sample: RDD[DenseMatrix[Float]], numPerPartition: Map[Int, Int])
+: Estimator[DenseMatrix[Float], DenseMatrix[Float]] = {
if (k >= 32) {
-nodes.images.external.EncEvalGMMFisherVectorEstimator(k).withData(_)
+nodes.images.external.EncEvalGMMFisherVectorEstimator(k)
} else {
-ScalaGMMFisherVectorEstimator(k).withData(_)
+ScalaGMMFisherVectorEstimator(k)
}
}
}
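
For orientation, a minimal usage sketch of the new contract (a sketch only: the input RDDs and numPerPartition map are assumptions, not part of this diff). optimize now returns a plain Estimator rather than a function from data to Pipeline, so the caller fits whichever implementation was selected:

    import breeze.linalg.DenseMatrix
    import org.apache.spark.rdd.RDD

    // Assumed inputs: a sample of descriptor matrices and the full dataset.
    val sample: RDD[DenseMatrix[Float]] = ???
    val descriptors: RDD[DenseMatrix[Float]] = ???
    val numPerPartition: Map[Int, Int] = ???

    val fv = GMMFisherVectorEstimator(k = 64)
    // k >= 32 resolves to the external EncEval implementation;
    // smaller k falls back to the pure-Scala estimator.
    val chosen = fv.optimize(sample, numPerPartition)
    val fisherVector = chosen.fit(descriptors)
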
2 changes: 1 addition & 1 deletion src/main/scala/nodes/images/external/FisherVector.scala
@@ -47,7 +47,7 @@ case class FisherVector(
* @param k Number of centers to estimate.
*/
case class EncEvalGMMFisherVectorEstimator(k: Int) extends Estimator[DenseMatrix[Float], DenseMatrix[Float]] {
-protected def fit(data: RDD[DenseMatrix[Float]]): FisherVector = {
+def fit(data: RDD[DenseMatrix[Float]]): FisherVector = {
val gmmTrainingData = data.flatMap(x => convert(MatrixUtils.matrixToColArray(x), Double))
val gmmEst = new GaussianMixtureModelEstimator(k)
val gmm = gmmEst.fit(gmmTrainingData)
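
Since fit is now public on both Fisher vector estimators, either implementation can also be fit directly, bypassing the optimizable dispatch; a short sketch under the same assumptions:

    import breeze.linalg.DenseMatrix
    import org.apache.spark.rdd.RDD

    val descriptors: RDD[DenseMatrix[Float]] = ???  // assumed input

    // Direct fit of the external (EncEval-backed) estimator.
    val external = nodes.images.external.EncEvalGMMFisherVectorEstimator(k = 16)
    val fisher = external.fit(descriptors)
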
48 changes: 24 additions & 24 deletions src/main/scala/nodes/learning/LeastSquaresEstimator.scala
@@ -4,25 +4,25 @@ import breeze.linalg._
import nodes.util.{Densify, Sparsify}
import org.apache.spark.rdd.RDD
import pipelines.Logging
-import workflow.{Pipeline, WeightedNode, LabelEstimator, OptimizableLabelEstimator}
+import workflow._

import scala.reflect._

/**
* A least squares solver that is optimized to use a fast algorithm, based on characteristics of the
* workload and cost models.
*
* Currently selects between Dense LBFGS, Sparse LBFGS, Exact Distributed Solve, and Approximate Block Solve.
*
* The default weights were determined empirically via results run on a 16 r3.4xlarge node cluster.
*
* @param lambda The L2 regularization parameter to use, defaults to 0
* @param numMachines
* @param cpuWeight
* @param memWeight
* @param networkWeight
* @tparam T
*/
class LeastSquaresEstimator[T <: Vector[Double]: ClassTag](
lambda: Double = 0,
numMachines: Option[Int] = None,
@@ -33,22 +33,22 @@ class LeastSquaresEstimator[T <: Vector[Double]: ClassTag](
with WeightedNode
with Logging {

-val options: Seq[(CostModel, (RDD[T], RDD[DenseVector[Double]]) => Pipeline[T, DenseVector[Double]])] = Seq(
+val options: Seq[(CostModel, LabelEstimator[T, DenseVector[Double], DenseVector[Double]])] = Seq(
{
val solver = new DenseLBFGSwithL2[T](new LeastSquaresDenseGradient, regParam = lambda, numIterations = 20)
-(solver, solver.withData(_, _))
+(solver, solver)
},
{
val solver = new SparseLBFGSwithL2(new LeastSquaresSparseGradient, regParam = lambda, numIterations = 20)
-(solver, Sparsify() andThen (solver, _, _))
+(solver, TransformerLabelEstimatorChain(Sparsify(), solver))
},
{
val solver = new BlockLeastSquaresEstimator(1000, 3, lambda = lambda)
-(solver, Densify() andThen (solver, _, _))
+(solver, TransformerLabelEstimatorChain(Densify(), solver))
},
{
val solver = new LinearMapEstimator(Some(lambda))
-(solver, Densify() andThen (solver, _, _))
+(solver, TransformerLabelEstimatorChain(Densify(), solver))
}
)

@@ -57,10 +57,10 @@ class LeastSquaresEstimator[T <: Vector[Double]: ClassTag](
}

override def optimize(
sample: RDD[T],
sampleLabels: RDD[DenseVector[Double]],
numPerPartition: Map[Int, Int])
-: (RDD[T], RDD[DenseVector[Double]]) => Pipeline[T, DenseVector[Double]] = {
+: LabelEstimator[T, DenseVector[Double], DenseVector[Double]] = {
val n = numPerPartition.values.map(_.toLong).sum
val d = sample.first().length
val k = sampleLabels.first().length
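
A hedged usage sketch of the reworked dispatch (the input RDDs are assumptions, and fit on the returned LabelEstimator is assumed to take data and labels). optimize now hands back the LabelEstimator whose cost model scores lowest for the observed data shape:

    import breeze.linalg.DenseVector
    import org.apache.spark.rdd.RDD

    // Assumed inputs: features, labels, and per-partition record counts.
    val features: RDD[DenseVector[Double]] = ???
    val labels: RDD[DenseVector[Double]] = ???
    val numPerPartition: Map[Int, Int] = ???

    val lsq = new LeastSquaresEstimator[DenseVector[Double]](lambda = 1e-5)
    // Selects among Dense LBFGS, Sparse LBFGS, block solve, and exact solve;
    // the Sparsify/Densify preprocessing now travels inside a
    // TransformerLabelEstimatorChain rather than a pipeline closure.
    val solver = lsq.optimize(features, labels, numPerPartition)
    val model = solver.fit(features, labels)
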
12 changes: 6 additions & 6 deletions src/main/scala/nodes/learning/PCA.scala
@@ -53,7 +53,7 @@ case class LocalColumnPCAEstimator(dims: Int) extends Estimator[DenseMatrix[Floa

val pcaEstimator = new PCAEstimator(dims)

-protected def fit(data: RDD[DenseMatrix[Float]]): Transformer[DenseMatrix[Float], DenseMatrix[Float]] = {
+def fit(data: RDD[DenseMatrix[Float]]): Transformer[DenseMatrix[Float], DenseMatrix[Float]] = {
val singleTransformer = pcaEstimator.fit(data.flatMap(x => MatrixUtils.matrixToColArray(x)))
BatchPCATransformer(singleTransformer.pcaMat)
}
@@ -83,7 +83,7 @@ case class DistributedColumnPCAEstimator(dims: Int) extends Estimator[DenseMatri

val pcaEstimator = new DistributedPCAEstimator(dims)

-protected def fit(data: RDD[DenseMatrix[Float]]): Transformer[DenseMatrix[Float], DenseMatrix[Float]] = {
+def fit(data: RDD[DenseMatrix[Float]]): Transformer[DenseMatrix[Float], DenseMatrix[Float]] = {
val singleTransformer = pcaEstimator.fit(data.flatMap(x => MatrixUtils.matrixToColArray(x)))
BatchPCATransformer(singleTransformer.pcaMat)
}
@@ -126,8 +126,8 @@ case class ColumnPCAEstimator(
val distributedEstimator = new DistributedColumnPCAEstimator(dims)
val default = distributedEstimator

-override def optimize(sample: RDD[DenseMatrix[Float]], numPerPartition: Map[Int, Int])
-: RDD[DenseMatrix[Float]] => Pipeline[DenseMatrix[Float], DenseMatrix[Float]] = {
+def optimize(sample: RDD[DenseMatrix[Float]], numPerPartition: Map[Int, Int])
+: Estimator[DenseMatrix[Float], DenseMatrix[Float]] = {
val numColsPerMatrix: Double = sample.map(_.cols.toDouble).sum() / sample.count()
val n = (numColsPerMatrix * numPerPartition.values.sum).toInt
val d = sample.first().rows
@@ -148,9 +148,9 @@ case class ColumnPCAEstimator(
)

if (localCost < distributedCost) {
-localEstimator.withData(_)
+localEstimator
} else {
-distributedEstimator.withData(_)
+distributedEstimator
}
}
}
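
The same pattern as the Fisher vector estimator: optimize compares the modeled local and distributed costs and returns the cheaper estimator outright. A minimal sketch (ColumnPCAEstimator's constructor arguments beyond dims are elided in this hunk, and the inputs are assumed):

    import breeze.linalg.DenseMatrix
    import org.apache.spark.rdd.RDD

    // Assumed inputs: per-image descriptor matrices.
    val sampleMats: RDD[DenseMatrix[Float]] = ???
    val allMats: RDD[DenseMatrix[Float]] = ???
    val numPerPartition: Map[Int, Int] = ???

    val pca = ColumnPCAEstimator(dims = 64)
    // Returns LocalColumnPCAEstimator or DistributedColumnPCAEstimator,
    // whichever the cost model scores lower; no withData closure needed.
    val chosen = pca.optimize(sampleMats, numPerPartition)
    val batchPCA = chosen.fit(allMats)  // a Transformer, per fit above
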
4 changes: 2 additions & 2 deletions src/main/scala/pipelines/images/imagenet/ImageNetSiftLcsFV.scala
@@ -45,12 +45,12 @@ object ImageNetSiftLcsFV extends Serializable with Logging {
// Part 2: Compute dimensionality-reduced PCA features.
val pcaTransformer = pcaFile match {
case Some(fname) =>
-new BatchPCATransformer(convert(csvread(new File(fname)), Float).t)
+new BatchPCATransformer(convert(csvread(new File(fname)), Float).t).toPipeline
case None =>
val sampler = ColumnSampler(numColSamplesPerImage).toPipeline
val pca = ColumnPCAEstimator(numPCADesc) withData (sampler(prefix(trainingData)))

-pca
+pca.toPipeline
}

// Part 2a: If necessary, compute a GMM based on the dimensionality-reduced features, or load from disk.
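
The point of these two lifts: the Some branch yields a bare Transformer and the None branch an estimator applied to data, so both are converted with toPipeline to give the match a single Pipeline result type. A hypothetical minimal illustration (pcaMat is assumed):

    import breeze.linalg.DenseMatrix
    import workflow.{Pipeline, Transformer}

    val pcaMat: DenseMatrix[Float] = ???  // assumed PCA loadings
    val t: Transformer[DenseMatrix[Float], DenseMatrix[Float]] = new BatchPCATransformer(pcaMat)
    // Lifted so it can flow through code expecting Pipeline values, matching
    // the estimator branch, whose withData result is lifted the same way.
    val p: Pipeline[DenseMatrix[Float], DenseMatrix[Float]] = t.toPipeline
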
