Skip to content
Permalink
Browse files

Remove unnecessary persist.

  • Loading branch information...
viirya committed Aug 15, 2019
1 parent 673fba5 commit c26014f4d1f628dda819afdb5a3dd7c3fb406516
Showing with 1 addition and 10 deletions.
  1. +1 −10 mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -219,26 +219,17 @@ class StringIndexer @Since("1.4.0") (
/**
 * Collects the distinct labels of every input column and sorts them alphabetically.
 *
 * The dataset is read by a single `select(...).collect()` in this method, so it is
 * deliberately not persisted here: caching would not save any work within this method
 * and would change the storage level of the caller's dataset as a side effect.
 *
 * @param dataset   dataset holding the input columns
 * @param ascending if true the labels are returned in ascending order, otherwise descending
 * @return the sorted, non-null labels, one array per collected column
 */
private def sortByAlphabet(dataset: Dataset[_], ascending: Boolean): Array[Array[String]] = {
  val (inputCols, _) = getInOutCols()

  // One collect_set aggregate per selected column gathers that column's distinct values.
  val selectedCols = getSelectedCols(dataset, inputCols).map(collect_set(_))
  val allLabels = dataset.select(selectedCols: _*)
    .collect().toSeq.flatMap(_.toSeq).asInstanceOf[Seq[Seq[String]]]

  // Sort each column's labels independently via parmap
  // (the argument 8 is presumably the maximum number of threads; confirm in ThreadUtils).
  ThreadUtils.parmap(allLabels, "sortingStringLabels", 8) { labels =>
    // Null values are dropped before sorting and never become labels.
    val sorted = labels.filter(_ != null).sorted
    if (ascending) {
      sorted.toArray
    } else {
      sorted.reverse.toArray
    }
  }.toArray
}

@Since("2.0.0")

0 comments on commit c26014f

Please sign in to comment.
You can’t perform that action at this time.