Modified ParamMap to sort parameters in toString. Cleaned up classes in class hierarchy, before implementing tests and examples.
apache · Feb 5, 2015 · 601e792 · 601e792
1 parent d705e87
commit 601e792
Show file tree

Hide file tree

Showing 7 changed files with 146 additions and 25 deletions.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/LabeledPoint.scala b/mllib/src/main/scala/org/apache/spark/ml/LabeledPoint.scala
@@ -1,21 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.spark.ml
 
+import scala.beans.BeanInfo
+
+import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.mllib.linalg.Vector
 
 /**
+ * :: AlphaComponent ::
  * Class that represents an instance (data point) for prediction tasks.
  *
  * @param label Label to predict
  * @param features List of features describing this instance
  * @param weight Instance weight
  */
+@AlphaComponent
+@BeanInfo
 case class LabeledPoint(label: Double, features: Vector, weight: Double) {
 
   override def toString: String = {
     "(%s,%s,%s)".format(label, features, weight)
   }
 }
 
+/**
+ * :: AlphaComponent ::
+ */
+@AlphaComponent
 object LabeledPoint {
   /** Constructor which sets instance weight to 1.0 */
   def apply(label: Double, features: Vector) = new LabeledPoint(label, features, 1.0)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
@@ -21,27 +21,40 @@ import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor, PredictorParams}
 import org.apache.spark.mllib.linalg.Vector
 
-@AlphaComponent
+/**
+ * Params for classification.
+ * Currently empty, but may add functionality later.
+ */
 private[classification] trait ClassifierParams extends PredictorParams
 
 /**
  * Single-label binary or multiclass classification
+ * Classes are indexed {0, 1, ..., numClasses - 1}.
  */
+@AlphaComponent
 abstract class Classifier[Learner <: Classifier[Learner, M], M <: ClassificationModel[M]]
   extends Predictor[Learner, M]
   with ClassifierParams {
 
   // TODO: defaultEvaluator (follow-up PR)
 }
 
-
-private[ml] abstract class ClassificationModel[M <: ClassificationModel[M]]
+/**
+ * :: AlphaComponent ::
+ * Model produced by a [[Classifier]].
+ * Classes are indexed {0, 1, ..., numClasses - 1}.
+ *
+ * @tparam M  Model type.
+ */
+@AlphaComponent
+abstract class ClassificationModel[M <: ClassificationModel[M]]
   extends PredictionModel[M] with ClassifierParams {
 
+  /** Number of classes (values which the label can take). */
   def numClasses: Int
 
   /**
-   * Predict label for the given features.  Labels are indexed {0, 1, ..., numClasses - 1}.
+   * Predict label for the given features.
    * This default implementation for classification predicts the index of the maximum value
    * from [[predictRaw()]].
    */
@@ -50,8 +63,12 @@ private[ml] abstract class ClassificationModel[M <: ClassificationModel[M]]
   }
 
   /**
-   * Raw prediction for each possible label
-   * @return  vector where element i is the raw score for label i
+   * Raw prediction for each possible label.
+   * The meaning of a "raw" prediction may vary between algorithms, but it intuitively gives
+   * a magnitude of confidence in each possible label.
+   * @return  vector where element i is the raw prediction for label i.
+   *          This raw prediction may be any real number, where a larger value indicates greater
+   *          confidence for that label.
    */
   def predictRaw(features: Vector): Vector
 

diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala
@@ -1,14 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.spark.ml.impl.estimator
 
-import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.ml.{Estimator, LabeledPoint, Model}
 import org.apache.spark.ml.param._
 import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.analysis.Star
 
-@AlphaComponent
 private[ml] trait PredictorParams extends Params
   with HasLabelCol with HasFeaturesCol with HasPredictionCol {
 

diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/ProbabilisticClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/ProbabilisticClassificationModel.scala
@@ -1,11 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.spark.ml.impl.estimator
 
 import org.apache.spark.mllib.linalg.Vector
 
+/**
+ * Trait for a [[org.apache.spark.ml.classification.ClassificationModel]] which can output
+ * class conditional probabilities.
+ */
 private[ml] trait ProbabilisticClassificationModel {
 
   /**
-   * Predict the probability of each label.
+   * Predict the probability of each class given the features.
+   * These predictions are also called class conditional probabilities.
+   *
+   * WARNING: Not all models output well-calibrated probability estimates!  These probabilities
+   *          should be treated as confidences, not precise probabilities.
    */
   def predictProbabilities(features: Vector): Vector
 

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -279,7 +279,8 @@ class ParamMap private[ml] (private val map: mutable.Map[Param[Any], Any]) exten
   def copy: ParamMap = new ParamMap(map.clone())
 
   override def toString: String = {
-    map.map { case (param, value) =>
+    // Sort by the rendered "uid-name" key; (Param[Any], Any) tuples have no implicit Ordering.
+    map.toSeq.sortBy { case (p, _) => s"${p.parent.uid}-${p.name}" }.map { case (param, value) =>
       s"\t${param.parent.uid}-${param.name}: $value"
     }.mkString("{\n", ",\n", "\n}")
   }

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -1,3 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.spark.ml.regression
 
 import org.apache.spark.annotation.AlphaComponent
@@ -9,22 +26,20 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 
 /**
- * :: AlphaComponent ::
  * Params for linear regression.
  */
 @AlphaComponent
 private[regression] trait LinearRegressionParams extends RegressorParams
   with HasRegParam with HasMaxIter
 
-
 /**
+ * :: AlphaComponent ::
  * Linear regression.
  */
+@AlphaComponent
 class LinearRegression extends Regressor[LinearRegression, LinearRegressionModel]
   with LinearRegressionParams {
 
-  // TODO: Extend IterativeEstimator
-
   setRegParam(0.1)
   setMaxIter(100)
 
@@ -52,7 +67,6 @@ class LinearRegression extends Regressor[LinearRegression, LinearRegressionModel
   }
 }
 
-
 /**
  * :: AlphaComponent ::
  * Model produced by [[LinearRegression]].

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala
@@ -1,28 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.spark.ml.regression
 
 import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.Evaluator
-import org.apache.spark.ml.evaluation.RegressionEvaluator
-import org.apache.spark.ml.impl.estimator.{PredictionModel, HasDefaultEvaluator, Predictor,
-  PredictorParams}
+import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor, PredictorParams}
 import org.apache.spark.mllib.linalg.Vector
 
-@AlphaComponent
+/**
+ * Params for regression.
+ * Currently empty, but may add functionality later.
+ */
 private[regression] trait RegressorParams extends PredictorParams
 
 /**
+ * :: AlphaComponent ::
  * Single-label regression
  */
+@AlphaComponent
 abstract class Regressor[Learner <: Regressor[Learner, M], M <: RegressionModel[M]]
   extends Predictor[Learner, M]
-  with RegressorParams
-  with HasDefaultEvaluator {
+  with RegressorParams {
 
-  override def defaultEvaluator: Evaluator = new RegressionEvaluator
+  // TODO: defaultEvaluator (follow-up PR)
 }
 
-
-private[ml] abstract class RegressionModel[M <: RegressionModel[M]]
+/**
+ * :: AlphaComponent ::
+ * Model produced by a [[Regressor]].
+ * @tparam M  Model type.
+ */
+@AlphaComponent
+abstract class RegressionModel[M <: RegressionModel[M]]
   extends PredictionModel[M] with RegressorParams {
 
   /**