Skip to content

Commit

Permalink
[SPARK-34757][CORE][DEPLOY] Ignore cache for SNAPSHOT dependencies in…
Browse files Browse the repository at this point in the history
… spark-submit

### What changes were proposed in this pull request?
This change is to ignore cache for SNAPSHOT dependencies in spark-submit.

### Why are the changes needed?
When spark-submit is executed with --packages, it will not download the dependency jars when they are available in cache (e.g. ivy cache), even when the dependencies are SNAPSHOT.

This might block developers who work on external modules in Spark (e.g. spark-avro), since they need to remove the cache manually every time they update the code during development (which generates SNAPSHOT jars). Without knowing this, they could be blocked wondering why their code changes are not reflected in spark-submit executions.

### Does this PR introduce _any_ user-facing change?
Yes. With this change, developers/users who run spark-submit with SNAPSHOT dependencies no longer need to remove the cache every time the SNAPSHOT dependencies are updated.

### How was this patch tested?
Added a unit test.

Closes #31849 from bozhang2820/spark-submit-cache-ignore.

Authored-by: Bo Zhang <bo.zhang@databricks.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
  • Loading branch information
bozhang2820 authored and HyukjinKwon committed Mar 18, 2021
1 parent 25e7d1c commit 86ea520
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 1 deletion.
Expand Up @@ -44,7 +44,7 @@ import org.apache.ivy.core.report.ResolveReport
import org.apache.ivy.core.resolve.ResolveOptions
import org.apache.ivy.core.retrieve.RetrieveOptions
import org.apache.ivy.core.settings.IvySettings
import org.apache.ivy.plugins.matcher.GlobPatternMatcher
import org.apache.ivy.plugins.matcher.{GlobPatternMatcher, PatternMatcher}
import org.apache.ivy.plugins.repository.file.FileRepository
import org.apache.ivy.plugins.resolver.{ChainResolver, FileSystemResolver, IBiblioResolver}

Expand Down Expand Up @@ -1153,6 +1153,8 @@ private[spark] object SparkSubmitUtils extends Logging {
// We need a chain resolver if we want to check multiple repositories
val cr = new ChainResolver
cr.setName("spark-list")
cr.setChangingMatcher(PatternMatcher.REGEXP)
cr.setChangingPattern(".*-SNAPSHOT")

val localM2 = new IBiblioResolver
localM2.setM2compatible(true)
Expand Down Expand Up @@ -1312,6 +1314,8 @@ private[spark] object SparkSubmitUtils extends Logging {
remoteRepos.filterNot(_.trim.isEmpty).map(_.split(",")).foreach { repositoryList =>
val cr = new ChainResolver
cr.setName("user-list")
cr.setChangingMatcher(PatternMatcher.REGEXP)
cr.setChangingPattern(".*-SNAPSHOT")

// add current default resolver, if any
Option(ivySettings.getDefaultResolver).foreach(cr.add)
Expand Down
Expand Up @@ -304,4 +304,22 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
s" Resolved jars are: $jarPath")
}
}

test("SPARK-34757: should ignore cache for SNAPSHOT dependencies") {
  // A SNAPSHOT artifact: resolving it twice must re-download it rather than
  // serve the first copy from the ivy cache.
  val artifact = new MavenCoordinate("my.great.lib", "mylib", "0.1-SNAPSHOT")
  IvyTestUtils.withRepository(artifact, None, None) { repo =>
    val settings = SparkSubmitUtils.buildIvySettings(Some(repo), Some(tempIvyPath))
    // isTest = false so the resolved jar is actually materialized on disk,
    // letting us inspect its modification time.
    val resolvedJars = SparkSubmitUtils.resolveMavenCoordinates(
      artifact.toString, settings, transitive = true, isTest = false)
    val firstTimestamp = Files.getLastModifiedTime(Paths.get(resolvedJars.head))
    // Publish a fresh SNAPSHOT jar into the same repository, then resolve again.
    IvyTestUtils.createLocalRepositoryForTests(artifact, None, Some(new File(new URI(repo))))
    SparkSubmitUtils.resolveMavenCoordinates(
      artifact.toString, settings, transitive = true, isTest = false)
    // The cached SNAPSHOT must have been refreshed: the jar's timestamp changes.
    val secondTimestamp = Files.getLastModifiedTime(Paths.get(resolvedJars.head))
    assert(firstTimestamp.compareTo(secondTimestamp) != 0)
  }
}
}

0 comments on commit 86ea520

Please sign in to comment.