From b2f6926cc5d4e5020b6811f6952101d6882877e1 Mon Sep 17 00:00:00 2001 From: Ed Date: Thu, 4 Dec 2014 13:11:41 -0800 Subject: [PATCH] Update DecisionTree.scala Hello, Hope you are well. We've been using DecisionTree at Samsung and hope to help in some small way. I was interested in setting the seed for the sampling, i.e. at line 988. We're in the process of creating tests for our code, and being able to set the seed is helpful. To that end, I also think the sample method here depends on a PartitionwiseSampledRDD. In there, I think the 'compute' method uses a different seed from the one that can be passed to the constructor of PartitionwiseSampledRDD; it uses split.seed (below). Well, I hope we can discuss more! Thank you. Best Wishes -Ed override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } --- .../main/scala/org/apache/spark/mllib/tree/DecisionTree.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index 73e7e32c6db31..222a46bd469af 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -985,6 +985,7 @@ object DecisionTree extends Serializable with Logging { 1.0 } logDebug("fraction of data used for calculating quantiles = " + fraction) + // To discuss: being able to set the seed via train method, or in Strategy of constructor. input.sample(withReplacement = false, fraction, new XORShiftRandom().nextInt()).collect() } else { new Array[LabeledPoint](0)