From 8a5a8e4111ca9ac0745af52e7a1c29d96711b5f4 Mon Sep 17 00:00:00 2001 From: Vyacheslav Baranov Date: Mon, 24 Aug 2015 17:03:49 +0300 Subject: [PATCH 1/3] [SPARK-10182] [MLlib] GeneralizedLinearModel doesn't unpersist cached data --- .../regression/GeneralizedLinearAlgorithm.scala | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala index 509f6a2d169c4..c1b0c3e2d2d90 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala @@ -239,11 +239,6 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] numFeatures = input.map(_.features.size).first() } - if (input.getStorageLevel == StorageLevel.NONE) { - logWarning("The input data is not directly cached, which may hurt performance if its" - + " parent RDDs are also uncached.") - } - // Check the data properties before running the optimizer if (validateData && !validators.forall(func => func(input))) { throw new SparkException("Input validation failed.") @@ -287,7 +282,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] if (useFeatureScaling) { input.map(lp => (lp.label, scaler.transform(lp.features))).cache() } else { - input.map(lp => (lp.label, lp.features)) + input.map(lp => (lp.label, lp.features)).cache() } } @@ -351,11 +346,8 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] } } - // Warn at the end of the run as well, for increased visibility. - if (input.getStorageLevel == StorageLevel.NONE) { - logWarning("The input data was not directly cached, which may hurt performance if its" - + " parent RDDs are also uncached.") - } + // Unpersist cached data + data.unpersist(false) createModel(weights, intercept) } From a3c589ebebec07e281ee2ace42a7332d3027a7c6 Mon Sep 17 00:00:00 2001 From: Vyacheslav Baranov Date: Tue, 25 Aug 2015 12:01:07 +0300 Subject: [PATCH 2/3] SPARK-10182: Update after discussion --- .../regression/GeneralizedLinearAlgorithm.scala | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala index c1b0c3e2d2d90..6d7947b7ba126 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala @@ -239,6 +239,11 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] numFeatures = input.map(_.features.size).first() } + if (input.getStorageLevel == StorageLevel.NONE) { + logWarning("The input data is not directly cached, which may hurt performance if its" + + " parent RDDs are also uncached.") + } + // Check the data properties before running the optimizer if (validateData && !validators.forall(func => func(input))) { throw new SparkException("Input validation failed.") @@ -282,7 +287,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] if (useFeatureScaling) { input.map(lp => (lp.label, scaler.transform(lp.features))).cache() } else { - input.map(lp => (lp.label, lp.features)).cache() + input.map(lp => (lp.label, lp.features)) } } @@ -346,8 +351,16 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] } } + // Warn at the end of the run as well, for increased visibility. + if (input.getStorageLevel == StorageLevel.NONE) { + logWarning("The input data is not directly cached, which may hurt performance if its" + + " parent RDDs are also uncached.") + } + // Unpersist cached data - data.unpersist(false) + if (data.getStorageLevel != StorageLevel.NONE) { + data.unpersist(false) + } createModel(weights, intercept) } From 352e8f3d3609a74c0ff5356071a24358b523aae1 Mon Sep 17 00:00:00 2001 From: Vyacheslav Baranov Date: Tue, 25 Aug 2015 12:02:20 +0300 Subject: [PATCH 3/3] Fixed typo --- .../spark/mllib/regression/GeneralizedLinearAlgorithm.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala index 6d7947b7ba126..b8415acf1a896 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala @@ -353,7 +353,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] // Warn at the end of the run as well, for increased visibility. if (input.getStorageLevel == StorageLevel.NONE) { - logWarning("The input data is not directly cached, which may hurt performance if its" + logWarning("The input data was not directly cached, which may hurt performance if its" + " parent RDDs are also uncached.") }