[SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and mllib.util #8430

Closed · wants to merge 1 commit
mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala

@@ -23,7 +23,7 @@ import javax.xml.transform.stream.StreamResult
 import org.jpmml.model.JAXBUtil
 
 import org.apache.spark.SparkContext
-import org.apache.spark.annotation.{DeveloperApi, Experimental}
+import org.apache.spark.annotation.{DeveloperApi, Experimental, Since}
 import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory
 
 /**
@@ -33,6 +33,7 @@ import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory
  * developed by the Data Mining Group (www.dmg.org).
  */
 @DeveloperApi
+@Since("1.4.0")
 trait PMMLExportable {
 
   /**
@@ -48,6 +49,7 @@ trait PMMLExportable {
    * Export the model to a local file in PMML format
    */
   @Experimental
+  @Since("1.4.0")
   def toPMML(localPath: String): Unit = {
     toPMML(new StreamResult(new File(localPath)))
   }
@@ -57,6 +59,7 @@ trait PMMLExportable {
    * Export the model to a directory on a distributed file system in PMML format
    */
   @Experimental
+  @Since("1.4.0")
   def toPMML(sc: SparkContext, path: String): Unit = {
     val pmml = toPMML()
     sc.parallelize(Array(pmml), 1).saveAsTextFile(path)
@@ -67,6 +70,7 @@ trait PMMLExportable {
    * Export the model to the OutputStream in PMML format
    */
   @Experimental
+  @Since("1.4.0")
   def toPMML(outputStream: OutputStream): Unit = {
     toPMML(new StreamResult(outputStream))
   }
@@ -76,6 +80,7 @@ trait PMMLExportable {
    * Export the model to a String in PMML format
    */
   @Experimental
+  @Since("1.4.0")
   def toPMML(): String = {
     val writer = new StringWriter
     toPMML(new StreamResult(writer))
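
For context, a minimal sketch of the annotated export methods in use, assuming Spark 1.4+ where KMeansModel mixes in PMMLExportable (the data values and paths are placeholders):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object PMMLExportExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("pmml-demo").setMaster("local[2]"))
    val points = sc.parallelize(Seq(
      Vectors.dense(0.0, 0.0), Vectors.dense(9.0, 9.0), Vectors.dense(0.2, 0.1)))
    val model = KMeans.train(points, 2, 10)  // k = 2, maxIterations = 10
    println(model.toPMML())                  // export to an in-memory String
    model.toPMML("/tmp/kmeans.pmml")         // export to a local file
    sc.stop()
  }
}
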
mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala

@@ -17,23 +17,25 @@
 
 package org.apache.spark.mllib.util
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.Logging
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.{DeveloperApi, Since}
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
 
 /**
  * :: DeveloperApi ::
  * A collection of methods used to validate data before applying ML algorithms.
  */
 @DeveloperApi
+@Since("0.8.0")
 object DataValidators extends Logging {
 
   /**
    * Function to check if labels used for classification are either zero or one.
    *
    * @return True if labels are all zero or one, false otherwise.
    */
+  @Since("1.0.0")
   val binaryLabelValidator: RDD[LabeledPoint] => Boolean = { data =>
     val numInvalid = data.filter(x => x.label != 1.0 && x.label != 0.0).count()
     if (numInvalid != 0) {
@@ -48,6 +50,7 @@ object DataValidators extends Logging {
    *
    * @return True if labels are all in the range of {0, 1, ..., k-1}, false otherwise.
    */
+  @Since("1.3.0")
   def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
     val numInvalid = data.filter(x =>
       x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
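
For context, a short sketch of the two validators in use (assumes an existing SparkContext named sc; the sample data is made up):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.DataValidators

val points = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(1.0, 2.0)),
  LabeledPoint(1.0, Vectors.dense(3.0, 4.0))))

// Both validators are RDD[LabeledPoint] => Boolean functions.
val binaryOk = DataValidators.binaryLabelValidator(points)   // true: every label is 0 or 1
val multiOk = DataValidators.multiLabelValidator(3)(points)  // true: every label is in {0, 1, 2}
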
mllib/src/main/scala/org/apache/spark/mllib/util/KMeansDataGenerator.scala

@@ -19,8 +19,8 @@ package org.apache.spark.mllib.util
 
 import scala.util.Random
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.SparkContext
+import org.apache.spark.annotation.{DeveloperApi, Since}
 import org.apache.spark.rdd.RDD
 
 /**
@@ -30,6 +30,7 @@ import org.apache.spark.rdd.RDD
  * cluster with scale 1 around each center.
  */
 @DeveloperApi
+@Since("0.8.0")
 object KMeansDataGenerator {
 
   /**
@@ -42,6 +43,7 @@
    * @param r Scaling factor for the distribution of the initial centers
    * @param numPartitions Number of partitions of the generated RDD; default 2
    */
+  @Since("0.8.0")
   def generateKMeansRDD(
       sc: SparkContext,
       numPoints: Int,
@@ -62,6 +64,7 @@
     }
   }
 
+  @Since("0.8.0")
   def main(args: Array[String]) {
     if (args.length < 6) {
       // scalastyle:off println
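
For context, a sketch of calling the generator programmatically rather than through main (assumes an existing SparkContext sc; the sizes are arbitrary):

import org.apache.spark.mllib.util.KMeansDataGenerator

// 500 points in 3 dimensions around 5 centers, with the center spread scaled
// by r = 10.0, across 2 partitions. Each generated point is an Array[Double].
val points = KMeansDataGenerator.generateKMeansRDD(sc, 500, 5, 3, 10.0, 2)
points.take(3).foreach(p => println(p.mkString(",")))
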
mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala

@@ -22,11 +22,11 @@ import scala.util.Random
 
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.SparkContext
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.{DeveloperApi, Since}
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
 
 /**
  * :: DeveloperApi ::
@@ -35,6 +35,7 @@ import org.apache.spark.mllib.regression.LabeledPoint
  * response variable `Y`.
  */
 @DeveloperApi
+@Since("0.8.0")
 object LinearDataGenerator {
 
   /**
@@ -46,6 +47,7 @@
    * @param seed Random seed
    * @return Java List of input.
    */
+  @Since("0.8.0")
   def generateLinearInputAsList(
       intercept: Double,
       weights: Array[Double],
@@ -68,6 +70,7 @@
    * @param eps Epsilon scaling factor.
    * @return Seq of input.
    */
+  @Since("0.8.0")
   def generateLinearInput(
       intercept: Double,
       weights: Array[Double],
@@ -92,6 +95,7 @@
    * @param eps Epsilon scaling factor.
    * @return Seq of input.
    */
+  @Since("0.8.0")
   def generateLinearInput(
       intercept: Double,
       weights: Array[Double],
@@ -132,6 +136,7 @@ object LinearDataGenerator {
    *
    * @return RDD of LabeledPoint containing sample data.
    */
+  @Since("0.8.0")
   def generateLinearRDD(
       sc: SparkContext,
       nexamples: Int,
@@ -151,6 +156,7 @@
     data
   }
 
+  @Since("0.8.0")
   def main(args: Array[String]) {
     if (args.length < 2) {
       // scalastyle:off println
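
For context, a sketch of generateLinearRDD with the trailing parameters left at their defaults (assumes an existing SparkContext sc):

import org.apache.spark.mllib.util.LinearDataGenerator

// 1000 examples with 10 features and noise scale eps = 0.1; the partition
// count and intercept keep their defaults (2 and 0.0).
val data = LinearDataGenerator.generateLinearRDD(sc, 1000, 10, 0.1)
data.take(2).foreach(println)  // prints LabeledPoint(label, [x1,x2,...])
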
mllib/src/main/scala/org/apache/spark/mllib/util/LogisticRegressionDataGenerator.scala

@@ -19,7 +19,7 @@ package org.apache.spark.mllib.util
 
 import scala.util.Random
 
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{Since, DeveloperApi}
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -31,6 +31,7 @@ import org.apache.spark.mllib.linalg.Vectors
  * with probability `probOne` and scales features for positive examples by `eps`.
  */
 @DeveloperApi
+@Since("0.8.0")
 object LogisticRegressionDataGenerator {
 
   /**
@@ -43,6 +44,7 @@ object LogisticRegressionDataGenerator {
    * @param nparts Number of partitions of the generated RDD. Default value is 2.
    * @param probOne Probability that a label is 1 (and not 0). Default value is 0.5.
    */
+  @Since("0.8.0")
   def generateLogisticRDD(
       sc: SparkContext,
       nexamples: Int,
@@ -62,6 +64,7 @@
     data
   }
 
+  @Since("0.8.0")
   def main(args: Array[String]) {
     if (args.length != 5) {
       // scalastyle:off println
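
For context, a sketch of generateLogisticRDD with all parameters spelled out (assumes an existing SparkContext sc):

import org.apache.spark.mllib.util.LogisticRegressionDataGenerator

// 1000 examples with 5 features; eps = 1.0 scales the positive examples,
// 2 partitions, and each label is 1 with probability 0.5.
val data = LogisticRegressionDataGenerator.generateLogisticRDD(sc, 1000, 5, 1.0, 2, 0.5)
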
mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala

@@ -23,7 +23,7 @@ import scala.language.postfixOps
 import scala.util.Random
 
 import org.apache.spark.SparkContext
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{Since, DeveloperApi}
 import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix}
 import org.apache.spark.rdd.RDD
 
@@ -52,7 +52,9 @@ import org.apache.spark.rdd.RDD
  * testSampFact (Double) Percentage of training data to use as test data.
  */
 @DeveloperApi
+@Since("0.8.0")
 object MFDataGenerator {
+  @Since("0.8.0")
   def main(args: Array[String]) {
     if (args.length < 2) {
       // scalastyle:off println
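
MFDataGenerator is driven entirely through main. A sketch of a programmatic launch, under the assumption that the two required arguments are the master URL and the output directory (the values are placeholders; optional arguments such as m, n, and rank keep their defaults):

import org.apache.spark.mllib.util.MFDataGenerator

// Generates a synthetic low-rank matrix sample and writes it under /tmp/mf-output.
MFDataGenerator.main(Array("local[2]", "/tmp/mf-output"))
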
mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala

@@ -36,6 +36,7 @@ import org.apache.spark.streaming.dstream.DStream
 /**
  * Helper methods to load, save and pre-process data used in ML Lib.
  */
+@Since("0.8.0")
 object MLUtils {
 
   private[mllib] lazy val EPSILON = {
@@ -168,6 +169,7 @@ object MLUtils {
    *
    * @see [[org.apache.spark.mllib.util.MLUtils#loadLibSVMFile]]
    */
+  @Since("1.0.0")
   def saveAsLibSVMFile(data: RDD[LabeledPoint], dir: String) {
     // TODO: allow to specify label precision and feature precision.
     val dataStr = data.map { case LabeledPoint(label, features) =>
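
For context, a round trip through the LIBSVM text format using the newly annotated saveAsLibSVMFile together with loadLibSVMFile (assumes a SparkContext sc; the path is a placeholder):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils

val data = sc.parallelize(Seq(
  LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
  LabeledPoint(0.0, Vectors.dense(0.0, 2.0, 0.0))))

// Write in LIBSVM text format, then read it back as RDD[LabeledPoint].
MLUtils.saveAsLibSVMFile(data, "/tmp/libsvm-demo")
val reloaded = MLUtils.loadLibSVMFile(sc, "/tmp/libsvm-demo")
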
mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala

@@ -21,20 +21,22 @@ import scala.util.Random
 
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.SparkContext
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.{DeveloperApi, Since}
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
 
 /**
  * :: DeveloperApi ::
 * Generate sample data used for SVM. This class generates uniform random values
 * for the features and adds Gaussian noise with weight 0.1 to generate labels.
  */
 @DeveloperApi
+@Since("0.8.0")
 object SVMDataGenerator {
 
+  @Since("0.8.0")
   def main(args: Array[String]) {
     if (args.length < 2) {
       // scalastyle:off println
mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala

@@ -24,7 +24,7 @@ import org.json4s._
 import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.SparkContext
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{DeveloperApi, Since}
 import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.types.{DataType, StructField, StructType}
 
@@ -35,6 +35,7 @@ import org.apache.spark.sql.types.{DataType, StructField, StructType}
  * This should be inherited by the class which implements model instances.
  */
 @DeveloperApi
+@Since("1.3.0")
 trait Saveable {
 
   /**
@@ -50,6 +51,7 @@ trait Saveable {
    * @param path Path specifying the directory in which to save this model.
    *             If the directory already exists, this method throws an exception.
    */
+  @Since("1.3.0")
   def save(sc: SparkContext, path: String): Unit
 
   /** Current version of model save/load format. */
@@ -64,6 +66,7 @@
  * This should be inherited by an object paired with the model class.
  */
 @DeveloperApi
+@Since("1.3.0")
 trait Loader[M <: Saveable] {
 
   /**
@@ -75,6 +78,7 @@
    * @param path Path specifying the directory to which the model was saved.
    * @return Model instance
    */
+  @Since("1.3.0")
   def load(sc: SparkContext, path: String): M
 
 }
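
For context, a sketch of the save/load round trip these traits define, using LogisticRegressionModel, which implements both sides since 1.3 (assumes a SparkContext sc and an RDD[LabeledPoint] named training; the path is a placeholder):

import org.apache.spark.mllib.classification.{LogisticRegressionModel, LogisticRegressionWithLBFGS}

val model = new LogisticRegressionWithLBFGS().setNumClasses(2).run(training)
model.save(sc, "/tmp/lr-model")  // Saveable.save: throws if the directory already exists
val restored = LogisticRegressionModel.load(sc, "/tmp/lr-model")  // Loader.load
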