[SPARK-10234] [MLLIB] update since version in mllib.clustering #8435

Closed
wants to merge 1 commit
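The commit applies one pattern throughout mllib.clustering: `@Since` annotations are added to classes, their primary constructors, constructor parameters exposed as public vals, and individual members, recording the Spark version in which each element first appeared. A minimal sketch of the possible placements follows; `Since` here is a stand-in for Spark's internal annotation, and `ExampleModel` is a hypothetical class, not one touched by this PR.

```scala
import scala.annotation.StaticAnnotation

// Stand-in for Spark's internal @Since annotation.
class Since(version: String) extends StaticAnnotation

@Since("1.3.0")                                  // the class itself
class ExampleModel @Since("1.3.0") (             // its primary constructor
    @Since("1.3.0") val weights: Array[Double],  // a constructor parameter exposed as a val
    @Since("1.5.0") val concentration: Double) { // a member that appeared in a later release

  @Since("1.3.0")
  def k: Int = weights.length                    // an individual method
}
```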
@@ -53,6 +53,7 @@ import org.apache.spark.util.Utils
* @param maxIterations The maximum number of iterations to perform
*/
@Experimental
@Since("1.3.0")
class GaussianMixture private (
private var k: Int,
private var convergenceTol: Double,
@@ -46,9 +46,9 @@ import org.apache.spark.sql.{SQLContext, Row}
*/
@Since("1.3.0")
@Experimental
class GaussianMixtureModel(
val weights: Array[Double],
val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable {
class GaussianMixtureModel @Since("1.3.0") (
@Since("1.3.0") val weights: Array[Double],
@Since("1.3.0") val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable {

require(weights.length == gaussians.length, "Length of weight and Gaussian arrays must match")

Contributor commented:

Should we add since to formatVersion?

Contributor Author replied:

That is a protected method. Ideally we should mark it as well, but we are not really expecting users to implement Loader or Saveable.
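For illustration only, a sketch of what marking formatVersion could look like if it were annotated; `Since` is again a stand-in for Spark's internal annotation and `ExampleSaveableModel` is hypothetical, since the PR leaves the protected member unannotated.

```scala
import scala.annotation.StaticAnnotation
import org.apache.spark.SparkContext
import org.apache.spark.mllib.util.Saveable

// Stand-in for Spark's internal @Since annotation.
class Since(version: String) extends StaticAnnotation

// Hypothetical model that marks the protected formatVersion member as well.
class ExampleSaveableModel extends Saveable {

  @Since("1.4.0")
  override protected def formatVersion: String = "1.0"

  @Since("1.4.0")
  override def save(sc: SparkContext, path: String): Unit = {
    // persistence logic omitted in this sketch
  }
}
```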

@@ -178,7 +178,7 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] {
(weight, new MultivariateGaussian(mu, sigma))
}.unzip

return new GaussianMixtureModel(weights.toArray, gaussians.toArray)
new GaussianMixtureModel(weights.toArray, gaussians.toArray)
}
}

@@ -37,6 +37,7 @@ import org.apache.spark.util.random.XORShiftRandom
* This is an iterative algorithm that will make multiple passes over the data, so any RDDs given
* to it should be cached by the user.
*/
@Since("0.8.0")
class KMeans private (
private var k: Int,
private var maxIterations: Int,
@@ -37,8 +37,8 @@ import org.apache.spark.sql.Row
* A clustering model for K-means. Each point belongs to the cluster with the closest center.
*/
@Since("0.8.0")
class KMeansModel (
val clusterCenters: Array[Vector]) extends Saveable with Serializable with PMMLExportable {
class KMeansModel @Since("1.1.0") (@Since("1.0.0") val clusterCenters: Array[Vector])
extends Saveable with Serializable with PMMLExportable {

/**
* A Java-friendly constructor that takes an Iterable of Vectors.
@@ -43,12 +43,15 @@ import org.apache.spark.util.BoundedPriorityQueue
* including local and distributed data structures.
*/
@Experimental
@Since("1.3.0")
abstract class LDAModel private[clustering] extends Saveable {

/** Number of topics */
@Since("1.3.0")
def k: Int

/** Vocabulary size (number of terms or terms in the vocabulary) */
@Since("1.3.0")
def vocabSize: Int

/**
@@ -57,6 +60,7 @@ abstract class LDAModel private[clustering] extends Saveable {
*
* This is the parameter to a Dirichlet distribution.
*/
@Since("1.5.0")
def docConcentration: Vector

/**
@@ -68,6 +72,7 @@ abstract class LDAModel private[clustering] extends Saveable {
* Note: The topics' distributions over terms are called "beta" in the original LDA paper
* by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009.
*/
@Since("1.5.0")
def topicConcentration: Double

/**
@@ -81,6 +86,7 @@ abstract class LDAModel private[clustering] extends Saveable {
* This is a matrix of size vocabSize x k, where each column is a topic.
* No guarantees are given about the ordering of the topics.
*/
@Since("1.3.0")
def topicsMatrix: Matrix

/**
@@ -91,6 +97,7 @@ abstract class LDAModel private[clustering] extends Saveable {
* (term indices, term weights in topic).
* Each topic's terms are sorted in order of decreasing weight.
*/
@Since("1.3.0")
def describeTopics(maxTermsPerTopic: Int): Array[(Array[Int], Array[Double])]

/**
@@ -102,6 +109,7 @@ abstract class LDAModel private[clustering] extends Saveable {
* (term indices, term weights in topic).
* Each topic's terms are sorted in order of decreasing weight.
*/
@Since("1.3.0")
def describeTopics(): Array[(Array[Int], Array[Double])] = describeTopics(vocabSize)

/* TODO (once LDA can be trained with Strings or given a dictionary)
@@ -185,10 +193,11 @@ abstract class LDAModel private[clustering] extends Saveable {
* @param topics Inferred topics (vocabSize x k matrix).
*/
@Experimental
@Since("1.3.0")
class LocalLDAModel private[clustering] (
val topics: Matrix,
override val docConcentration: Vector,
override val topicConcentration: Double,
@Since("1.3.0") val topics: Matrix,
@Since("1.5.0") override val docConcentration: Vector,
@Since("1.5.0") override val topicConcentration: Double,
override protected[clustering] val gammaShape: Double = 100)
extends LDAModel with Serializable {

@@ -376,6 +385,7 @@ class LocalLDAModel private[clustering] (
}

@Experimental
@Since("1.5.0")
object LocalLDAModel extends Loader[LocalLDAModel] {

private object SaveLoadV1_0 {
@@ -480,13 +490,14 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
* than the [[LocalLDAModel]].
*/
@Experimental
@Since("1.3.0")
class DistributedLDAModel private[clustering] (
private[clustering] val graph: Graph[LDA.TopicCounts, LDA.TokenCount],
private[clustering] val globalTopicTotals: LDA.TopicCounts,
val k: Int,
val vocabSize: Int,
override val docConcentration: Vector,
override val topicConcentration: Double,
@Since("1.3.0") val k: Int,
@Since("1.3.0") val vocabSize: Int,
@Since("1.5.0") override val docConcentration: Vector,
@Since("1.5.0") override val topicConcentration: Double,
private[spark] val iterationTimes: Array[Double],
override protected[clustering] val gammaShape: Double = 100)
extends LDAModel {
@@ -604,6 +615,7 @@ class DistributedLDAModel private[clustering] (
* (term indices, topic indices). Note that terms will be omitted if not present in
* the document.
*/
@Since("1.5.0")
lazy val topicAssignments: RDD[(Long, Array[Int], Array[Int])] = {
// For reference, compare the below code with the core part of EMLDAOptimizer.next().
val eta = topicConcentration
@@ -635,6 +647,7 @@ }
}

/** Java-friendly version of [[topicAssignments]] */
@Since("1.5.0")
lazy val javaTopicAssignments: JavaRDD[(java.lang.Long, Array[Int], Array[Int])] = {
topicAssignments.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[Int])]].toJavaRDD()
}
@@ -771,6 +784,7 @@ class DistributedLDAModel private[clustering] (


@Experimental
@Since("1.5.0")
object DistributedLDAModel extends Loader[DistributedLDAModel] {

private object SaveLoadV1_0 {
@@ -42,9 +42,10 @@ import org.apache.spark.{Logging, SparkContext, SparkException}
*/
@Since("1.3.0")
@Experimental
class PowerIterationClusteringModel(
val k: Int,
val assignments: RDD[PowerIterationClustering.Assignment]) extends Saveable with Serializable {
class PowerIterationClusteringModel @Since("1.3.0") (
@Since("1.3.0") val k: Int,
@Since("1.3.0") val assignments: RDD[PowerIterationClustering.Assignment])
extends Saveable with Serializable {

@Since("1.4.0")
override def save(sc: SparkContext, path: String): Unit = {
@@ -56,6 +57,8 @@ class PowerIterationClusteringModel(

@Since("1.4.0")
object PowerIterationClusteringModel extends Loader[PowerIterationClusteringModel] {

@Since("1.4.0")
override def load(sc: SparkContext, path: String): PowerIterationClusteringModel = {
PowerIterationClusteringModel.SaveLoadV1_0.load(sc, path)
}
@@ -120,6 +123,7 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode
* @see [[http://en.wikipedia.org/wiki/Spectral_clustering Spectral clustering (Wikipedia)]]
*/
@Experimental
@Since("1.3.0")
class PowerIterationClustering private[clustering] (
private var k: Int,
private var maxIterations: Int,
@@ -66,9 +66,10 @@ import org.apache.spark.util.random.XORShiftRandom
*/
@Since("1.2.0")
@Experimental
class StreamingKMeansModel(
override val clusterCenters: Array[Vector],
val clusterWeights: Array[Double]) extends KMeansModel(clusterCenters) with Logging {
class StreamingKMeansModel @Since("1.2.0") (
@Since("1.2.0") override val clusterCenters: Array[Vector],
@Since("1.2.0") val clusterWeights: Array[Double])
extends KMeansModel(clusterCenters) with Logging {

/**
* Perform a k-means update on a batch of data.
@@ -168,10 +169,10 @@ class StreamingKMeansModel(
*/
@Since("1.2.0")
@Experimental
class StreamingKMeans(
var k: Int,
var decayFactor: Double,
var timeUnit: String) extends Logging with Serializable {
class StreamingKMeans @Since("1.2.0") (
@Since("1.2.0") var k: Int,
@Since("1.2.0") var decayFactor: Double,
@Since("1.2.0") var timeUnit: String) extends Logging with Serializable {

@Since("1.2.0")
def this() = this(2, 1.0, StreamingKMeans.BATCHES)