Skip to content

Commit

Permalink
Merge branch 'master' of github.com:apache/spark into handle-configs-…
Browse files Browse the repository at this point in the history
…bash
  • Loading branch information
andrewor14 committed Aug 16, 2014
2 parents 7396be2 + 2e069ca commit 7a4190a
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 131 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.{LogisticRegressionWithSGD, SVMWithSGD}
import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.optimization.{SquaredL2Updater, L1Updater}
Expand Down Expand Up @@ -66,7 +66,8 @@ object BinaryClassification {
.text("number of iterations")
.action((x, c) => c.copy(numIterations = x))
opt[Double]("stepSize")
.text(s"initial step size, default: ${defaultParams.stepSize}")
.text("initial step size (ignored by logistic regression), " +
s"default: ${defaultParams.stepSize}")
.action((x, c) => c.copy(stepSize = x))
opt[String]("algorithm")
.text(s"algorithm (${Algorithm.values.mkString(",")}), " +
Expand Down Expand Up @@ -125,10 +126,9 @@ object BinaryClassification {

val model = params.algorithm match {
case LR =>
val algorithm = new LogisticRegressionWithSGD()
val algorithm = new LogisticRegressionWithLBFGS()
algorithm.optimizer
.setNumIterations(params.numIterations)
.setStepSize(params.stepSize)
.setUpdater(updater)
.setRegParam(params.regParam)
algorithm.run(training).clearThreshold()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ class LogisticRegressionModel (
/**
* Train a classification model for Logistic Regression using Stochastic Gradient Descent.
* NOTE: Labels used in Logistic Regression should be {0, 1}
*
* Using [[LogisticRegressionWithLBFGS]] is recommended over this.
*/
class LogisticRegressionWithSGD private (
private var stepSize: Double,
Expand Down Expand Up @@ -191,51 +193,19 @@ object LogisticRegressionWithSGD {

/**
* Train a classification model for Logistic Regression using Limited-memory BFGS.
* Standard feature scaling and L2 regularization are used by default.
* NOTE: Labels used in Logistic Regression should be {0, 1}
*/
class LogisticRegressionWithLBFGS private (
private var convergenceTol: Double,
private var maxNumIterations: Int,
private var regParam: Double)
class LogisticRegressionWithLBFGS
extends GeneralizedLinearAlgorithm[LogisticRegressionModel] with Serializable {

/**
* Construct a LogisticRegression object with default parameters
*/
def this() = this(1E-4, 100, 0.0)

this.setFeatureScaling(true)

private val gradient = new LogisticGradient()
private val updater = new SimpleUpdater()
// Have to return new LBFGS object every time since users can reset the parameters anytime.
override def optimizer = new LBFGS(gradient, updater)
.setNumCorrections(10)
.setConvergenceTol(convergenceTol)
.setMaxNumIterations(maxNumIterations)
.setRegParam(regParam)
override val optimizer = new LBFGS(new LogisticGradient, new SquaredL2Updater)

override protected val validators = List(DataValidators.binaryLabelValidator)

/**
* Set the convergence tolerance of iterations for L-BFGS. Default 1E-4.
* A smaller value leads to higher accuracy at the cost of more iterations.
*
* @param convergenceTol the new convergence tolerance, stored on this instance
* @return this instance, so that setter calls can be chained
*/
def setConvergenceTol(convergenceTol: Double): this.type = {
// `this.` is required: the parameter shadows the field of the same name.
this.convergenceTol = convergenceTol
this
}

/**
* Set the maximal number of iterations for L-BFGS. Default 100.
*
* @param numIterations the new iteration cap, stored in `maxNumIterations`
* @return this instance, so that setter calls can be chained
*/
def setNumIterations(numIterations: Int): this.type = {
this.maxNumIterations = numIterations
this
}

/**
* Wrap the given weights and intercept in a new [[LogisticRegressionModel]].
*
* @param weights the weight vector passed to the model constructor
* @param intercept the intercept passed to the model constructor
*/
override protected def createModel(weights: Vector, intercept: Double) = {
new LogisticRegressionModel(weights, intercept)
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,17 @@ class LBFGS(private var gradient: Gradient, private var updater: Updater)

/**
* Set the maximal number of iterations for L-BFGS. Default 100.
*
* Kept as a deprecated alias: the call is forwarded unchanged to `setNumIterations`.
*
* @param iters the iteration cap, passed straight through to `setNumIterations`
* @return the result of `setNumIterations`, typed as this instance
* @deprecated use [[LBFGS#setNumIterations]] instead
*/
@deprecated("use setNumIterations instead", "1.1.0")
def setMaxNumIterations(iters: Int): this.type = {
this.setNumIterations(iters)
}

/**
* Set the maximal number of iterations for L-BFGS. Default 100.
*
* @param iters the new iteration cap, stored in `maxNumIterations`
* @return this instance, so that setter calls can be chained
*/
def setNumIterations(iters: Int): this.type = {
this.maxNumIterations = iters
this
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ package org.apache.spark.mllib.stat.correlation

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.{Logging, HashPartitioner}
import org.apache.spark.Logging
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.linalg.{DenseVector, Matrix, Vector}
import org.apache.spark.rdd.{CoGroupedRDD, RDD}
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.rdd.RDD

/**
* Compute Spearman's correlation for two RDDs of the type RDD[Double] or the correlation matrix
Expand All @@ -43,87 +43,51 @@ private[stat] object SpearmanCorrelation extends Correlation with Logging {
/**
* Compute Spearman's correlation matrix S, for the input matrix, where S(i, j) is the
* correlation between column i and j.
*
* Input RDD[Vector] should be cached or checkpointed if possible since it would be split into
* numCol RDD[Double]s, each of which is sorted, and then joined back into a single RDD[Vector].
*/
override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = {
val indexed = X.zipWithUniqueId()

val numCols = X.first.size
if (numCols > 50) {
logWarning("Computing the Spearman correlation matrix can be slow for large RDDs with more"
+ " than 50 columns.")
}
val ranks = new Array[RDD[(Long, Double)]](numCols)

// Note: we use a for loop here instead of a while loop with a single index variable
// to avoid a race condition caused by closure serialization
for (k <- 0 until numCols) {
val column = indexed.map { case (vector, index) => (vector(k), index) }
ranks(k) = getRanks(column)
// ((columnIndex, value), rowUid)
val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) =>
vec.toArray.view.zipWithIndex.map { case (v, j) =>
((j, v), uid)
}
}

val ranksMat: RDD[Vector] = makeRankMatrix(ranks, X)
PearsonCorrelation.computeCorrelationMatrix(ranksMat)
}

/**
* Compute the ranks for elements in the input RDD, using the average method for ties.
*
* With the average method, elements with the same value receive the same rank that's computed
* by taking the average of their positions in the sorted list.
* e.g. ranks([2, 1, 0, 2]) = [2.5, 1.0, 0.0, 2.5]
* Note that positions here are 0-indexed, rather than 1-indexed as in the standard definition
* of ranks for Spearman's correlation. This does not affect the final results and is slightly
* more performant.
*
* @param indexed RDD[(Double, Long)] containing pairs of the format (originalValue, uniqueId)
* @return RDD[(Long, Double)] containing pairs of the format (uniqueId, rank), where uniqueId is
* copied from the input RDD.
*/
private def getRanks(indexed: RDD[(Double, Long)]): RDD[(Long, Double)] = {
// Get elements' positions in the sorted list for computing average rank for duplicate values
val sorted = indexed.sortByKey().zipWithIndex()

val ranks: RDD[(Long, Double)] = sorted.mapPartitions { iter =>
// add an extra element to signify the end of the list so that flatMap can flush the last
// batch of duplicates
val end = -1L
val padded = iter ++ Iterator[((Double, Long), Long)](((Double.NaN, end), end))
val firstEntry = padded.next()
var lastVal = firstEntry._1._1
var firstRank = firstEntry._2.toDouble
val idBuffer = ArrayBuffer(firstEntry._1._2)
padded.flatMap { case ((v, id), rank) =>
if (v == lastVal && id != end) {
idBuffer += id
Iterator.empty
} else {
val entries = if (idBuffer.size == 1) {
Iterator((idBuffer(0), firstRank))
} else {
val averageRank = firstRank + (idBuffer.size - 1.0) / 2.0
idBuffer.map(id => (id, averageRank))
}
lastVal = v
firstRank = rank
idBuffer.clear()
idBuffer += id
entries
// global sort by (columnIndex, value)
val sorted = colBased.sortByKey()
// assign global ranks (using average ranks for tied values)
val globalRanks = sorted.zipWithIndex().mapPartitions { iter =>
var preCol = -1
var preVal = Double.NaN
var startRank = -1.0
var cachedUids = ArrayBuffer.empty[Long]
val flush: () => Iterable[(Long, (Int, Double))] = () => {
val averageRank = startRank + (cachedUids.size - 1) / 2.0
val output = cachedUids.map { uid =>
(uid, (preCol, averageRank))
}
cachedUids.clear()
output
}
iter.flatMap { case (((j, v), uid), rank) =>
// If we see a new value or cachedUids is too big, we flush ids with their average rank.
if (j != preCol || v != preVal || cachedUids.size >= 10000000) {
val output = flush()
preCol = j
preVal = v
startRank = rank
cachedUids += uid
output
} else {
cachedUids += uid
Iterator.empty
}
} ++ flush()
}
ranks
}

private def makeRankMatrix(ranks: Array[RDD[(Long, Double)]], input: RDD[Vector]): RDD[Vector] = {
val partitioner = new HashPartitioner(input.partitions.size)
val cogrouped = new CoGroupedRDD[Long](ranks, partitioner)
cogrouped.map {
case (_, values: Array[Iterable[_]]) =>
val doubles = values.asInstanceOf[Array[Iterable[Double]]]
new DenseVector(doubles.flatten.toArray)
// Replace values in the input matrix by their ranks compared with values in the same column.
// Note that shifting all ranks in a column by a constant value doesn't affect the result.
val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) =>
// sort by column index and then convert values to a vector
Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray)
}
PearsonCorrelation.computeCorrelationMatrix(groupedRanks)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -272,8 +272,9 @@ class LogisticRegressionClusterSuite extends FunSuite with LocalClusterSparkCont
}.cache()
// If we serialize data directly in the task closure, the size of the serialized task would be
// greater than 1MB and hence Spark would throw an error.
val model =
(new LogisticRegressionWithLBFGS().setIntercept(true).setNumIterations(2)).run(points)
val lr = new LogisticRegressionWithLBFGS().setIntercept(true)
lr.optimizer.setNumIterations(2)
val model = lr.run(points)

val predictions = model.predict(points.map(_.features))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,15 @@ class LBFGSSuite extends FunSuite with LocalSparkContext with Matchers {

val initialWeightsWithIntercept = Vectors.dense(1.0 +: initialWeights.toArray)
val convergenceTol = 1e-12
val maxNumIterations = 10
val numIterations = 10

val (_, loss) = LBFGS.runLBFGS(
dataRDD,
gradient,
simpleUpdater,
numCorrections,
convergenceTol,
maxNumIterations,
numIterations,
regParam,
initialWeightsWithIntercept)

Expand Down Expand Up @@ -99,15 +99,15 @@ class LBFGSSuite extends FunSuite with LocalSparkContext with Matchers {
// Prepare another set of non-zero weights to compare the loss in the first iteration.
val initialWeightsWithIntercept = Vectors.dense(0.3, 0.12)
val convergenceTol = 1e-12
val maxNumIterations = 10
val numIterations = 10

val (weightLBFGS, lossLBFGS) = LBFGS.runLBFGS(
dataRDD,
gradient,
squaredL2Updater,
numCorrections,
convergenceTol,
maxNumIterations,
numIterations,
regParam,
initialWeightsWithIntercept)

Expand Down Expand Up @@ -140,10 +140,10 @@ class LBFGSSuite extends FunSuite with LocalSparkContext with Matchers {

/**
* For the first run, we set the convergenceTol to 0.0, so that the algorithm will
* run up to the maxNumIterations which is 8 here.
* run up to the numIterations which is 8 here.
*/
val initialWeightsWithIntercept = Vectors.dense(0.0, 0.0)
val maxNumIterations = 8
val numIterations = 8
var convergenceTol = 0.0

val (_, lossLBFGS1) = LBFGS.runLBFGS(
Expand All @@ -152,7 +152,7 @@ class LBFGSSuite extends FunSuite with LocalSparkContext with Matchers {
squaredL2Updater,
numCorrections,
convergenceTol,
maxNumIterations,
numIterations,
regParam,
initialWeightsWithIntercept)

Expand All @@ -167,7 +167,7 @@ class LBFGSSuite extends FunSuite with LocalSparkContext with Matchers {
squaredL2Updater,
numCorrections,
convergenceTol,
maxNumIterations,
numIterations,
regParam,
initialWeightsWithIntercept)

Expand All @@ -182,7 +182,7 @@ class LBFGSSuite extends FunSuite with LocalSparkContext with Matchers {
squaredL2Updater,
numCorrections,
convergenceTol,
maxNumIterations,
numIterations,
regParam,
initialWeightsWithIntercept)

Expand All @@ -200,12 +200,12 @@ class LBFGSSuite extends FunSuite with LocalSparkContext with Matchers {
// Prepare another set of non-zero weights to compare the loss in the first iteration.
val initialWeightsWithIntercept = Vectors.dense(0.3, 0.12)
val convergenceTol = 1e-12
val maxNumIterations = 10
val numIterations = 10

val lbfgsOptimizer = new LBFGS(gradient, squaredL2Updater)
.setNumCorrections(numCorrections)
.setConvergenceTol(convergenceTol)
.setMaxNumIterations(maxNumIterations)
.setNumIterations(numIterations)
.setRegParam(regParam)

val weightLBFGS = lbfgsOptimizer.optimize(dataRDD, initialWeightsWithIntercept)
Expand Down Expand Up @@ -241,7 +241,7 @@ class LBFGSClusterSuite extends FunSuite with LocalClusterSparkContext {
val lbfgs = new LBFGS(new LogisticGradient, new SquaredL2Updater)
.setNumCorrections(1)
.setConvergenceTol(1e-12)
.setMaxNumIterations(1)
.setNumIterations(1)
.setRegParam(1.0)
val random = new Random(0)
// If we serialize data directly in the task closure, the size of the serialized task would be
Expand Down

0 comments on commit 7a4190a

Please sign in to comment.