Commit
Renamed SolverCostModel trait to CostModel and added PCA optimizer unit tests
tomerk committed Mar 21, 2016
1 parent 91cf4ba commit b9a1013
Showing 8 changed files with 66 additions and 11 deletions.
2 changes: 1 addition & 1 deletion src/main/scala/nodes/learning/BlockLinearMapper.scala
@@ -199,7 +199,7 @@ object BlockLeastSquaresEstimator {
 class BlockLeastSquaresEstimator(blockSize: Int, numIter: Int, lambda: Double = 0.0, numFeaturesOpt: Option[Int] = None)
   extends LabelEstimator[DenseVector[Double], DenseVector[Double], DenseVector[Double]]
   with WeightedNode
-  with SolverCostModel {
+  with CostModel {
 
   override val weight = (3*numIter)+1
 
@@ -3,7 +3,7 @@ package nodes.learning
 /**
  * A trait that represents a known system performance cost model for a solver.
  */
-trait SolverCostModel {
+trait CostModel {
   def cost(
     n: Long,
     d: Int,
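The full cost() signature is cut off by the collapsed hunk above, so the parameters beyond n and d (and the return type) are not visible here. As a rough illustration of what implementing the renamed trait involves, here is a minimal sketch; the simplified two-argument signature and the flop-count formula are assumptions for this example, not the repository's actual API:

// Sketch only: the real CostModel.cost takes more parameters than shown
// in the truncated hunk above; this two-argument form is an assumption.
trait SimplifiedCostModel {
  def cost(n: Long, d: Int): Double
}

// Hypothetical cost model for a normal-equations style solver: roughly
// n * d^2 flops to form X^T X plus d^3 flops to factor it.
object NormalEquationsCost extends SimplifiedCostModel {
  def cost(n: Long, d: Int): Double = n.toDouble * d * d + math.pow(d.toDouble, 3)
}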
2 changes: 1 addition & 1 deletion src/main/scala/nodes/learning/DistributedPCA.scala
@@ -18,7 +18,7 @@ import edu.berkeley.cs.amplab.mlmatrix.{RowPartition, NormalEquations, RowPartit
  * @param dims Dimensions to reduce input dataset to.
  */
 class DistributedPCAEstimator(dims: Int) extends Estimator[DenseVector[Float], DenseVector[Float]]
-  with SolverCostModel with Logging {
+  with CostModel with Logging {
 
   /**
    * Adapted from the "PCA2" matlab code given in appendix B of this paper:
4 changes: 2 additions & 2 deletions src/main/scala/nodes/learning/LBFGS.scala
@@ -139,7 +139,7 @@ class DenseLBFGSwithL2[T <: Vector[Double]](
     val convergenceTol: Double = 1e-4,
     val numIterations: Int = 100,
     val regParam: Double = 0.0)
-  extends LabelEstimator[T, DenseVector[Double], DenseVector[Double]] with WeightedNode with SolverCostModel {
+  extends LabelEstimator[T, DenseVector[Double], DenseVector[Double]] with WeightedNode with CostModel {
 
   override val weight: Int = numIterations + 1
 
@@ -215,7 +215,7 @@ class SparseLBFGSwithL2(
     val sparseOverhead: Double = 8)
   extends LabelEstimator[SparseVector[Double], DenseVector[Double], DenseVector[Double]]
   with WeightedNode
-  with SolverCostModel {
+  with CostModel {
 
   override val weight: Int = numIterations + 1
 
2 changes: 1 addition & 1 deletion src/main/scala/nodes/learning/LeastSquaresEstimator.scala
@@ -33,7 +33,7 @@ class LeastSquaresEstimator[T <: Vector[Double]: ClassTag](
   with WeightedNode
   with Logging {
 
-  val options: Seq[(SolverCostModel, (RDD[T], RDD[DenseVector[Double]]) => Pipeline[T, DenseVector[Double]])] = Seq(
+  val options: Seq[(CostModel, (RDD[T], RDD[DenseVector[Double]]) => Pipeline[T, DenseVector[Double]])] = Seq(
     {
       val solver = new DenseLBFGSwithL2[T](new LeastSquaresDenseGradient, regParam = lambda, numIterations = 20)
       (solver, solver.withData(_, _))
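Each entry of options pairs a candidate solver (exposing its CostModel) with a function that builds the corresponding pipeline from the data. A hedged sketch of the selection pattern this enables, reusing the simplified cost signature assumed earlier; the actual LeastSquaresEstimator optimizer may weigh additional factors such as cluster size and sparsity:

// Illustrative only: pick the option whose cost model predicts the
// smallest cost for the observed data shape, then build that pipeline.
def pickCheapest[P](options: Seq[(SimplifiedCostModel, () => P)], n: Long, d: Int): P = {
  val (_, buildPipeline) = options.minBy { case (model, _) => model.cost(n, d) }
  buildPipeline()
}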
2 changes: 1 addition & 1 deletion src/main/scala/nodes/learning/LinearMapper.scala
@@ -67,7 +67,7 @@ case class LinearMapper[T <: Vector[Double]](
  * @param lambda L2 Regularization parameter
  */
 class LinearMapEstimator(lambda: Option[Double] = None)
-  extends LabelEstimator[DenseVector[Double], DenseVector[Double], DenseVector[Double]] with SolverCostModel {
+  extends LabelEstimator[DenseVector[Double], DenseVector[Double], DenseVector[Double]] with CostModel {
 
   /**
    * Learns a linear model (OLS) based on training features and training labels.
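The solution the doc comment refers to is the standard (optionally L2-regularized) normal-equations estimate w = (X^T X + lambda*I)^{-1} X^T y. A minimal local Breeze sketch for reference; the estimator itself solves this over RDDs, so this is not its implementation:

import breeze.linalg.{DenseMatrix, DenseVector, inv}

// Dense, single-machine normal-equations solve, for illustration only.
// lambda = 0.0 recovers ordinary least squares (OLS).
def ridgeSolve(x: DenseMatrix[Double], y: DenseVector[Double], lambda: Double = 0.0): DenseVector[Double] =
  inv(x.t * x + DenseMatrix.eye[Double](x.cols) * lambda) * (x.t * y)

In practice a linear solve (for example Breeze's \ operator) is preferred over forming the explicit inverse, for numerical stability.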
6 changes: 3 additions & 3 deletions src/main/scala/nodes/learning/PCA.scala
@@ -49,7 +49,7 @@ case class BatchPCATransformer(pcaMat: DenseMatrix[Float]) extends Transformer[D
  * @param dims Dimensions to reduce input dataset to.
  */
 case class LocalColumnPCAEstimator(dims: Int) extends Estimator[DenseMatrix[Float], DenseMatrix[Float]]
-  with SolverCostModel {
+  with CostModel {
 
   val pcaEstimator = new PCAEstimator(dims)
 
@@ -79,7 +79,7 @@ case class LocalColumnPCAEstimator(dims: Int) extends Estimator[DenseMatrix[Floa
  * @param dims Dimensions to reduce input dataset to.
  */
 case class DistributedColumnPCAEstimator(dims: Int) extends Estimator[DenseMatrix[Float], DenseMatrix[Float]]
-  with SolverCostModel {
+  with CostModel {
 
   val pcaEstimator = new DistributedPCAEstimator(dims)
 
@@ -161,7 +161,7 @@ class ColumnPCAEstimator(
  * @param dims Dimensions to reduce input dataset to.
  */
 class PCAEstimator(dims: Int) extends Estimator[DenseVector[Float], DenseVector[Float]]
-  with SolverCostModel with Logging {
+  with CostModel with Logging {
 
   /**
    * Adapted from the "PCA2" matlab code given in appendix B of this paper:
57 changes: 56 additions & 1 deletion src/test/scala/nodes/learning/PCASuite.scala
@@ -8,6 +8,7 @@ import org.apache.spark.SparkContext
 import org.scalatest.FunSuite
 import pipelines._
 import utils.{TestUtils, Stats, MatrixUtils}
+import workflow.WorkflowUtils
 
 class PCASuite extends FunSuite with LocalSparkContext with Logging {
 
@@ -111,7 +112,8 @@ class PCASuite extends FunSuite with LocalSparkContext with Logging {
    * B is Gaussian(0,1) \in R^{k \times d}
    * E is Gaussian(0,eps) in R^{n \times d}
    *
-   * @param n Number of rows.
+   *
+   * @param n Number of rows.
    * @param d Number of columns.
    * @param k Rank of factors.
    * @param eps Variance of the Gaussian noise.
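The generator itself is collapsed in this hunk, and the line defining the left factor is cut off above, so the C factor in the following sketch is an assumption. A local Breeze illustration of the noisy low-rank model the comment describes:

import breeze.linalg.DenseMatrix
import breeze.stats.distributions.Gaussian

// A = C * B + E per the doc comment, with B ~ Gaussian(0,1) in R^{k x d}
// and E ~ Gaussian(0,eps) in R^{n x d}. The n x k factor C is assumed,
// since its definition is collapsed above. Breeze's Gaussian takes a
// standard deviation, hence sqrt(eps) for noise variance eps.
def lowRankWithNoise(n: Int, d: Int, k: Int, eps: Double): DenseMatrix[Double] = {
  val c = DenseMatrix.rand(n, k, Gaussian(0.0, 1.0))
  val b = DenseMatrix.rand(k, d, Gaussian(0.0, 1.0))
  val e = DenseMatrix.rand(n, d, Gaussian(0.0, math.sqrt(eps)))
  c * b + e
}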
@@ -223,4 +225,57 @@

     assert(Stats.aboutEq(offDiagCadm, DenseMatrix.zeros[Double](cadm.rows, cadm.rows), 0.1))
   }
+
+
+  test("small n small d dense column pca") {
+    sc = new SparkContext("local", "test")
+
+    val n = 1000
+    val numColsPerMatrix = 10
+    val d = 1000
+    val k = 100
+    val numMachines = 16
+    val numParts = numMachines
+
+    val data = sc.parallelize(
+      Seq.fill(numParts)(convert(DenseMatrix.rand[Double](d, numColsPerMatrix), Float)), numParts
+    )
+    val numPerPartition = WorkflowUtils.numPerPartition(data).mapValues(x => n / (numColsPerMatrix * numParts))
+
+    val solver = new ColumnPCAEstimator(dims = k, numMachines = Some(numMachines))
+    val optimizedSolver = solver.optimize(data, numPerPartition).apply(data)
+
+    val instructions = WorkflowUtils.pipelineToInstructions(optimizedSolver)
+    val isLocalColumnPCAEstimator = instructions.exists {
+      case _: LocalColumnPCAEstimator => true
+      case _ => false
+    }
+    assert(isLocalColumnPCAEstimator, "Expected local pca estimator")
+  }
+
+  test("big n big d dense column pca") {
+    sc = new SparkContext("local", "test")
+
+    val n = 100000
+    val numColsPerMatrix = 10
+    val d = 10000
+    val k = 100
+    val numMachines = 16
+    val numParts = numMachines
+
+    val data = sc.parallelize(
+      Seq.fill(numParts)(convert(DenseMatrix.rand[Double](d, numColsPerMatrix), Float)), numParts
+    )
+    val numPerPartition = WorkflowUtils.numPerPartition(data).mapValues(x => n / (numColsPerMatrix * numParts))
+
+    val solver = new ColumnPCAEstimator(dims = k, numMachines = Some(numMachines))
+    val optimizedSolver = solver.optimize(data, numPerPartition).apply(data)
+
+    val instructions = WorkflowUtils.pipelineToInstructions(optimizedSolver)
+    val isDistributedColumnPCAEstimator = instructions.exists {
+      case _: DistributedColumnPCAEstimator => true
+      case _ => false
+    }
+    assert(isDistributedColumnPCAEstimator, "Expected distributed pca estimator")
+  }
 }
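Both tests pin down the same decision: optimize() is expected to consult the candidate estimators' cost models for the observed data shape, choosing the local variant for small n and d and the distributed one when both are large. A hedged sketch of that dispatch, again using the simplified cost signature assumed earlier rather than the repository's actual optimizer:

// Illustrative only: compare predicted costs for the observed shape.
def useLocalPca(n: Long, d: Int, local: SimplifiedCostModel, distributed: SimplifiedCostModel): Boolean =
  local.cost(n, d) <= distributed.cost(n, d)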
