Fixing estimators and causing all things to compile.
etrain committed Jun 7, 2016
1 parent f092b97 · commit bb062e3
Showing 5 changed files with 38 additions and 38 deletions.
10 changes: 5 additions & 5 deletions src/main/scala/nodes/images/FisherVector.scala
@@ -63,7 +63,7 @@ case class FisherVector(gmm: GaussianMixtureModel)
* @param k Number of centers to estimate.
*/
case class ScalaGMMFisherVectorEstimator(k: Int) extends Estimator[DenseMatrix[Float], DenseMatrix[Float]] {
-protected def fit(data: RDD[DenseMatrix[Float]]): FisherVector = {
+def fit(data: RDD[DenseMatrix[Float]]): FisherVector = {
val gmmTrainingData = data.flatMap(x => convert(MatrixUtils.matrixToColArray(x), Double))
val gmmEst = new GaussianMixtureModelEstimator(k)
val gmm = gmmEst.fit(gmmTrainingData)
@@ -84,12 +84,12 @@ case class ScalaGMMFisherVectorEstimator(k: Int) extends Estimator[DenseMatrix[F
case class GMMFisherVectorEstimator(k: Int) extends OptimizableEstimator[DenseMatrix[Float], DenseMatrix[Float]] {
val default = ScalaGMMFisherVectorEstimator(k)

-override def optimize(sample: RDD[DenseMatrix[Float]], numPerPartition: Map[Int, Int])
-: RDD[DenseMatrix[Float]] => Pipeline[DenseMatrix[Float], DenseMatrix[Float]] = {
+def optimize(sample: RDD[DenseMatrix[Float]], numPerPartition: Map[Int, Int])
+: Estimator[DenseMatrix[Float], DenseMatrix[Float]] = {
if (k >= 32) {
-nodes.images.external.EncEvalGMMFisherVectorEstimator(k).withData(_)
+nodes.images.external.EncEvalGMMFisherVectorEstimator(k)
} else {
-ScalaGMMFisherVectorEstimator(k).withData(_)
+ScalaGMMFisherVectorEstimator(k)
}
}
}
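
For orientation, a minimal usage sketch of the new contract (a sketch only: the input RDDs and numPerPartition map are assumptions, not part of this diff). optimize now returns a plain Estimator rather than a function from data to Pipeline, so the caller fits whichever implementation was selected:

    import breeze.linalg.DenseMatrix
    import org.apache.spark.rdd.RDD

    // Assumed inputs: a sample of descriptor matrices and the full dataset.
    val sample: RDD[DenseMatrix[Float]] = ???
    val descriptors: RDD[DenseMatrix[Float]] = ???
    val numPerPartition: Map[Int, Int] = ???

    val fv = GMMFisherVectorEstimator(k = 64)
    // k >= 32 resolves to the external EncEval implementation;
    // smaller k falls back to the pure-Scala estimator.
    val chosen = fv.optimize(sample, numPerPartition)
    val fisherVector = chosen.fit(descriptors)
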
2 changes: 1 addition & 1 deletion src/main/scala/nodes/images/external/FisherVector.scala
@@ -47,7 +47,7 @@ case class FisherVector(
* @param k Number of centers to estimate.
*/
case class EncEvalGMMFisherVectorEstimator(k: Int) extends Estimator[DenseMatrix[Float], DenseMatrix[Float]] {
-protected def fit(data: RDD[DenseMatrix[Float]]): FisherVector = {
+def fit(data: RDD[DenseMatrix[Float]]): FisherVector = {
val gmmTrainingData = data.flatMap(x => convert(MatrixUtils.matrixToColArray(x), Double))
val gmmEst = new GaussianMixtureModelEstimator(k)
val gmm = gmmEst.fit(gmmTrainingData)
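
Since fit is now public on both Fisher vector estimators, either implementation can also be fit directly, bypassing the optimizable dispatch; a short sketch under the same assumptions:

    import breeze.linalg.DenseMatrix
    import org.apache.spark.rdd.RDD

    val descriptors: RDD[DenseMatrix[Float]] = ???  // assumed input

    // Direct fit of the external (EncEval-backed) estimator.
    val external = nodes.images.external.EncEvalGMMFisherVectorEstimator(k = 16)
    val fisher = external.fit(descriptors)
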
48 changes: 24 additions & 24 deletions src/main/scala/nodes/learning/LeastSquaresEstimator.scala
@@ -4,25 +4,25 @@ import breeze.linalg._
import nodes.util.{Densify, Sparsify}
import org.apache.spark.rdd.RDD
import pipelines.Logging
-import workflow.{Pipeline, WeightedNode, LabelEstimator, OptimizableLabelEstimator}
+import workflow._

import scala.reflect._

/**
* A least squares solver that is optimized to use a fast algorithm, based on characteristics of the
* workload and cost models.
*
* Currently selects between Dense LBFGS, Sparse LBFGS, Exact Distributed Solve, and Approximate Block Solve.
*
* The default weights were determined empirically via results run on a 16 r3.4xlarge node cluster.
*
* @param lambda The L2 regularization parameter to use, defaults to 0
* @param numMachines
* @param cpuWeight
* @param memWeight
* @param networkWeight
* @tparam T
*/
class LeastSquaresEstimator[T <: Vector[Double]: ClassTag](
lambda: Double = 0,
numMachines: Option[Int] = None,
@@ -33,22 +33,22 @@ class LeastSquaresEstimator[T <: Vector[Double]: ClassTag](
with WeightedNode
with Logging {

-val options: Seq[(CostModel, (RDD[T], RDD[DenseVector[Double]]) => Pipeline[T, DenseVector[Double]])] = Seq(
+val options: Seq[(CostModel, LabelEstimator[T, DenseVector[Double], DenseVector[Double]])] = Seq(
{
val solver = new DenseLBFGSwithL2[T](new LeastSquaresDenseGradient, regParam = lambda, numIterations = 20)
-(solver, solver.withData(_, _))
+(solver, solver)
},
{
val solver = new SparseLBFGSwithL2(new LeastSquaresSparseGradient, regParam = lambda, numIterations = 20)
-(solver, Sparsify() andThen (solver, _, _))
+(solver, TransformerLabelEstimatorChain(Sparsify(), solver))
},
{
val solver = new BlockLeastSquaresEstimator(1000, 3, lambda = lambda)
-(solver, Densify() andThen (solver, _, _))
+(solver, TransformerLabelEstimatorChain(Densify(), solver))
},
{
val solver = new LinearMapEstimator(Some(lambda))
-(solver, Densify() andThen (solver, _, _))
+(solver, TransformerLabelEstimatorChain(Densify(), solver))
}
)

@@ -57,10 +57,10 @@ class LeastSquaresEstimator[T <: Vector[Double]: ClassTag](
}

override def optimize(
sample: RDD[T],
sampleLabels: RDD[DenseVector[Double]],
numPerPartition: Map[Int, Int])
-: (RDD[T], RDD[DenseVector[Double]]) => Pipeline[T, DenseVector[Double]] = {
+: LabelEstimator[T, DenseVector[Double], DenseVector[Double]] = {
val n = numPerPartition.values.map(_.toLong).sum
val d = sample.first().length
val k = sampleLabels.first().length
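
A hedged usage sketch of the reworked dispatch (the input RDDs are assumptions, and fit on the returned LabelEstimator is assumed to take data and labels). optimize now hands back the LabelEstimator whose cost model scores lowest for the observed data shape:

    import breeze.linalg.DenseVector
    import org.apache.spark.rdd.RDD

    // Assumed inputs: features, labels, and per-partition record counts.
    val features: RDD[DenseVector[Double]] = ???
    val labels: RDD[DenseVector[Double]] = ???
    val numPerPartition: Map[Int, Int] = ???

    val lsq = new LeastSquaresEstimator[DenseVector[Double]](lambda = 1e-5)
    // Selects among Dense LBFGS, Sparse LBFGS, block solve, and exact solve;
    // the Sparsify/Densify preprocessing now travels inside a
    // TransformerLabelEstimatorChain rather than a pipeline closure.
    val solver = lsq.optimize(features, labels, numPerPartition)
    val model = solver.fit(features, labels)
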
12 changes: 6 additions & 6 deletions src/main/scala/nodes/learning/PCA.scala
@@ -53,7 +53,7 @@ case class LocalColumnPCAEstimator(dims: Int) extends Estimator[DenseMatrix[Floa

val pcaEstimator = new PCAEstimator(dims)

-protected def fit(data: RDD[DenseMatrix[Float]]): Transformer[DenseMatrix[Float], DenseMatrix[Float]] = {
+def fit(data: RDD[DenseMatrix[Float]]): Transformer[DenseMatrix[Float], DenseMatrix[Float]] = {
val singleTransformer = pcaEstimator.fit(data.flatMap(x => MatrixUtils.matrixToColArray(x)))
BatchPCATransformer(singleTransformer.pcaMat)
}
@@ -83,7 +83,7 @@ case class DistributedColumnPCAEstimator(dims: Int) extends Estimator[DenseMatri

val pcaEstimator = new DistributedPCAEstimator(dims)

-protected def fit(data: RDD[DenseMatrix[Float]]): Transformer[DenseMatrix[Float], DenseMatrix[Float]] = {
+def fit(data: RDD[DenseMatrix[Float]]): Transformer[DenseMatrix[Float], DenseMatrix[Float]] = {
val singleTransformer = pcaEstimator.fit(data.flatMap(x => MatrixUtils.matrixToColArray(x)))
BatchPCATransformer(singleTransformer.pcaMat)
}
@@ -126,8 +126,8 @@ case class ColumnPCAEstimator(
val distributedEstimator = new DistributedColumnPCAEstimator(dims)
val default = distributedEstimator

-override def optimize(sample: RDD[DenseMatrix[Float]], numPerPartition: Map[Int, Int])
-: RDD[DenseMatrix[Float]] => Pipeline[DenseMatrix[Float], DenseMatrix[Float]] = {
+def optimize(sample: RDD[DenseMatrix[Float]], numPerPartition: Map[Int, Int])
+: Estimator[DenseMatrix[Float], DenseMatrix[Float]] = {
val numColsPerMatrix: Double = sample.map(_.cols.toDouble).sum() / sample.count()
val n = (numColsPerMatrix * numPerPartition.values.sum).toInt
val d = sample.first().rows
@@ -148,9 +148,9 @@ case class ColumnPCAEstimator(
)

if (localCost < distributedCost) {
-localEstimator.withData(_)
+localEstimator
} else {
-distributedEstimator.withData(_)
+distributedEstimator
}
}
}
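
The same pattern as the Fisher vector estimator: optimize compares the modeled local and distributed costs and returns the cheaper estimator outright. A minimal sketch (ColumnPCAEstimator's constructor arguments beyond dims are elided in this hunk, and the inputs are assumed):

    import breeze.linalg.DenseMatrix
    import org.apache.spark.rdd.RDD

    // Assumed inputs: per-image descriptor matrices.
    val sampleMats: RDD[DenseMatrix[Float]] = ???
    val allMats: RDD[DenseMatrix[Float]] = ???
    val numPerPartition: Map[Int, Int] = ???

    val pca = ColumnPCAEstimator(dims = 64)
    // Returns LocalColumnPCAEstimator or DistributedColumnPCAEstimator,
    // whichever the cost model scores lower; no withData closure needed.
    val chosen = pca.optimize(sampleMats, numPerPartition)
    val batchPCA = chosen.fit(allMats)  // a Transformer, per fit above
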
4 changes: 2 additions & 2 deletions src/main/scala/pipelines/images/imagenet/ImageNetSiftLcsFV.scala
@@ -45,12 +45,12 @@ object ImageNetSiftLcsFV extends Serializable with Logging {
// Part 2: Compute dimensionality-reduced PCA features.
val pcaTransformer = pcaFile match {
case Some(fname) =>
-new BatchPCATransformer(convert(csvread(new File(fname)), Float).t)
+new BatchPCATransformer(convert(csvread(new File(fname)), Float).t).toPipeline
case None =>
val sampler = ColumnSampler(numColSamplesPerImage).toPipeline
val pca = ColumnPCAEstimator(numPCADesc) withData (sampler(prefix(trainingData)))

-pca
+pca.toPipeline
}

// Part 2a: If necessary, compute a GMM based on the dimensionality-reduced features, or load from disk.
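
The point of these two lifts: the Some branch yields a bare Transformer and the None branch an estimator applied to data, so both are converted with toPipeline to give the match a single Pipeline result type. A hypothetical minimal illustration (pcaMat is assumed):

    import breeze.linalg.DenseMatrix
    import workflow.{Pipeline, Transformer}

    val pcaMat: DenseMatrix[Float] = ???  // assumed PCA loadings
    val t: Transformer[DenseMatrix[Float], DenseMatrix[Float]] = new BatchPCATransformer(pcaMat)
    // Lifted so it can flow through code expecting Pipeline values, matching
    // the estimator branch, whose withData result is lifted the same way.
    val p: Pipeline[DenseMatrix[Float], DenseMatrix[Float]] = t.toPipeline
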
