diff --git a/mllib/src/main/scala/org/apache/spark/ml/LabeledPoint.scala b/mllib/src/main/scala/org/apache/spark/ml/LabeledPoint.scala
index 5c5bdfbbd80c1..8a310fc7b1fee 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/LabeledPoint.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/LabeledPoint.scala
@@ -1,14 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.spark.ml
 
+import scala.beans.BeanInfo
+
+import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.mllib.linalg.Vector
 
 /**
+ * :: AlphaComponent ::
  * Class that represents an instance (data point) for prediction tasks.
  *
  * @param label Label to predict
  * @param features List of features describing this instance
  * @param weight Instance weight
  */
+@AlphaComponent
+@BeanInfo
 case class LabeledPoint(label: Double, features: Vector, weight: Double) {
 
   override def toString: String = {
@@ -16,6 +39,10 @@ case class LabeledPoint(label: Double, features: Vector, weight: Double) {
   }
 }
 
+/**
+ * :: AlphaComponent ::
+ */
+@AlphaComponent
 object LabeledPoint {
   /** Constructor which sets instance weight to 1.0 */
   def apply(label: Double, features: Vector) = new LabeledPoint(label, features, 1.0)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
index 10fdcf8751118..eeef116c876db 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
@@ -21,12 +21,17 @@ import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor, PredictorParams}
 import org.apache.spark.mllib.linalg.Vector
 
-@AlphaComponent
+/**
+ * Params for classification.
+ * Currently empty, but may add functionality later.
+ */
 private[classification] trait ClassifierParams extends PredictorParams
 
 /**
  * Single-label binary or multiclass classification
+ * Classes are indexed {0, 1, ..., numClasses - 1}.
  */
+@AlphaComponent
 abstract class Classifier[Learner <: Classifier[Learner, M], M <: ClassificationModel[M]]
   extends Predictor[Learner, M]
   with ClassifierParams {
@@ -34,14 +39,22 @@ abstract class Classifier[Learner <: Classifier[Learner, M], M <: Classification
   // TODO: defaultEvaluator (follow-up PR)
 }
 
-
-private[ml] abstract class ClassificationModel[M <: ClassificationModel[M]]
+/**
+ * :: AlphaComponent ::
+ * Model produced by a [[Classifier]].
+ * Classes are indexed {0, 1, ..., numClasses - 1}.
+ *
+ * @tparam M Model type.
+ */
+@AlphaComponent
+abstract class ClassificationModel[M <: ClassificationModel[M]]
   extends PredictionModel[M] with ClassifierParams {
 
+  /** Number of classes (values which the label can take). */
   def numClasses: Int
 
   /**
-   * Predict label for the given features.  Labels are indexed {0, 1, ..., numClasses - 1}.
+   * Predict label for the given features.
    * This default implementation for classification predicts the index of the maximum value
    * from [[predictRaw()]].
    */
@@ -50,8 +63,12 @@ private[ml] abstract class ClassificationModel[M <: ClassificationModel[M]]
   }
 
   /**
-   * Raw prediction for each possible label
-   * @return vector where element i is the raw score for label i
+   * Raw prediction for each possible label.
+   * The meaning of a "raw" prediction may vary between algorithms, but it intuitively gives
+   * a magnitude of confidence in each possible label.
+   * @return vector where element i is the raw prediction for label i.
+   *         This raw prediction may be any real number, where a larger value indicates greater
+   *         confidence for that label.
    */
   def predictRaw(features: Vector): Vector
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala
index 89e82c36dd52e..b8fde6b08ebdb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala
@@ -1,6 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.spark.ml.impl.estimator
 
-import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.ml.{Estimator, LabeledPoint, Model}
 import org.apache.spark.ml.param._
 import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
@@ -8,7 +24,6 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.analysis.Star
 
-@AlphaComponent
 private[ml] trait PredictorParams extends Params
   with HasLabelCol with HasFeaturesCol with HasPredictionCol {
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/ProbabilisticClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/ProbabilisticClassificationModel.scala
index a44d4c10a78a4..52096ce6b2589 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/ProbabilisticClassificationModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/ProbabilisticClassificationModel.scala
@@ -1,11 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.spark.ml.impl.estimator
 
 import org.apache.spark.mllib.linalg.Vector
 
+/**
+ * Trait for a [[org.apache.spark.ml.classification.ClassificationModel]] which can output
+ * class conditional probabilities.
+ */
 private[ml] trait ProbabilisticClassificationModel {
 
   /**
-   * Predict the probability of each label.
+   * Predict the probability of each class given the features.
+   * These predictions are also called class conditional probabilities.
+   *
+   * WARNING: Not all models output well-calibrated probability estimates!  These probabilities
+   * should be treated as confidences, not precise probabilities.
    */
   def predictProbabilities(features: Vector): Vector
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 5fb4379e23c2f..33cfd9bdc364f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -279,7 +279,7 @@ class ParamMap private[ml] (private val map: mutable.Map[Param[Any], Any]) exten
   def copy: ParamMap = new ParamMap(map.clone())
 
   override def toString: String = {
-    map.map { case (param, value) =>
+    map.toSeq.sorted.map { case (param, value) =>
       s"\t${param.parent.uid}-${param.name}: $value"
     }.mkString("{\n", ",\n", "\n}")
   }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 8a6af90857dd1..0a95acb8a74b5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -1,3 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.spark.ml.regression
 
 import org.apache.spark.annotation.AlphaComponent
@@ -9,22 +26,20 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 
 /**
- * :: AlphaComponent ::
  * Params for linear regression.
  */
 @AlphaComponent
 private[regression] trait LinearRegressionParams extends RegressorParams
   with HasRegParam with HasMaxIter
 
-
 /**
+ * :: AlphaComponent ::
  * Logistic regression.
  */
+@AlphaComponent
 class LinearRegression extends Regressor[LinearRegression, LinearRegressionModel]
   with LinearRegressionParams {
 
-  // TODO: Extend IterativeEstimator
-
   setRegParam(0.1)
   setMaxIter(100)
 
@@ -52,7 +67,6 @@ class LinearRegression extends Regressor[LinearRegression, LinearRegressionModel
   }
 }
 
-
 /**
  * :: AlphaComponent ::
  * Model produced by [[LinearRegression]].
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala
index d2009d8610a1c..78086fe16fd60 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala
@@ -1,28 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.spark.ml.regression
 
 import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.Evaluator
-import org.apache.spark.ml.evaluation.RegressionEvaluator
-import org.apache.spark.ml.impl.estimator.{PredictionModel, HasDefaultEvaluator, Predictor,
-  PredictorParams}
+import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor, PredictorParams}
 import org.apache.spark.mllib.linalg.Vector
 
-@AlphaComponent
+/**
+ * Params for regression.
+ * Currently empty, but may add functionality later.
+ */
 private[regression] trait RegressorParams extends PredictorParams
 
 /**
+ * :: AlphaComponent ::
  * Single-label regression
  */
+@AlphaComponent
 abstract class Regressor[Learner <: Regressor[Learner, M], M <: RegressionModel[M]]
   extends Predictor[Learner, M]
-  with RegressorParams
-  with HasDefaultEvaluator {
+  with RegressorParams {
 
-  override def defaultEvaluator: Evaluator = new RegressionEvaluator
+  // TODO: defaultEvaluator (follow-up PR)
 }
 
-
-private[ml] abstract class RegressionModel[M <: RegressionModel[M]]
+/**
+ * :: AlphaComponent ::
+ * Model produced by a [[Regressor]].
+ * @tparam M Model type.
+ */
+@AlphaComponent
+abstract class RegressionModel[M <: RegressionModel[M]]
   extends PredictionModel[M] with RegressorParams {
 
   /**
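
For illustration, a minimal standalone sketch of the API contracts documented in this patch: predictRaw returns one real-valued score per class, the default predict picks the index of the largest score, and the one-argument LabeledPoint constructor defaults the instance weight to 1.0. The RawPredictionExample object below is hypothetical; it is not part of the patch and does not extend ClassificationModel, and it assumes only the org.apache.spark.ml.LabeledPoint and org.apache.spark.mllib.linalg APIs shown above.

import org.apache.spark.ml.LabeledPoint
import org.apache.spark.mllib.linalg.{Vector, Vectors}

// Hypothetical stand-in for a concrete classification model, mirroring the documented contract:
// predictRaw gives one raw confidence value per class, predict is the argmax over those scores.
object RawPredictionExample {

  val numClasses: Int = 2

  // Toy raw scores over classes {0, 1}: class 1 is favored when the first feature is positive.
  def predictRaw(features: Vector): Vector =
    Vectors.dense(-features(0), features(0))

  // Index of the maximum raw score, as the default ClassificationModel.predict does.
  def predict(features: Vector): Double =
    predictRaw(features).toArray.zipWithIndex.maxBy(_._1)._2.toDouble

  def main(args: Array[String]): Unit = {
    // Two-argument constructor sets the instance weight to 1.0.
    val point = LabeledPoint(1.0, Vectors.dense(2.0, -0.5))
    println(predict(point.features)) // 1.0, since the first feature is positive
  }
}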