From afe54b76a69fcbc18d37db969de6088847329de6 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 12 May 2015 01:39:21 -0700 Subject: [PATCH] [SPARK-7485] [BUILD] Remove pyspark files from assembly. The sbt part of the build is hacky; it basically tricks sbt into generating the zip by using a generator, but returns an empty list for the generated files so that nothing is actually added to the assembly. Author: Marcelo Vanzin Closes #6022 from vanzin/SPARK-7485 and squashes the following commits: 22c1e04 [Marcelo Vanzin] Remove unneeded code. 4893622 [Marcelo Vanzin] [SPARK-7485] [build] Remove pyspark files from assembly. (cherry picked from commit 82e890fb19d6fbaffa69856eecb4699f2f8a81eb) Signed-off-by: Andrew Or --- core/pom.xml | 47 ---------------------------------------- mllib/pom.xml | 11 ---------- project/SparkBuild.scala | 44 +++---------------------------------- sql/core/pom.xml | 8 ------- streaming/pom.xml | 8 ------- 5 files changed, 3 insertions(+), 115 deletions(-) diff --git a/core/pom.xml b/core/pom.xml index fc42f48973fe9..262a3320db106 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -381,35 +381,6 @@ target/scala-${scala.binary.version}/classes target/scala-${scala.binary.version}/test-classes - - - org.apache.maven.plugins - maven-antrun-plugin - - - generate-resources - - run - - - - - - - - - - - maven-clean-plugin - - - - ${basedir}/../python/build - - - true - - org.apache.maven.plugins maven-dependency-plugin @@ -438,24 +409,6 @@ - - - - src/main/resources - - - ../python - - pyspark/*.py - - - - ../python/build - - py4j/*.py - - - diff --git a/mllib/pom.xml b/mllib/pom.xml index a3c57ae26000b..0c07ca1a62fd3 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -141,16 +141,5 @@ target/scala-${scala.binary.version}/classes target/scala-${scala.binary.version}/test-classes - - - ../python - - pyspark/mllib/*.py - pyspark/mllib/stat/*.py - pyspark/ml/*.py - pyspark/ml/param/*.py - - - diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 186345af0e60e..1b87e4e98bd83 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -168,7 +168,7 @@ object SparkBuild extends PomBuild { /* Enable Assembly for all assembly projects */ assemblyProjects.foreach(enable(Assembly.settings)) - /* Package pyspark artifacts in the main assembly. */ + /* Package pyspark artifacts in a separate zip file for YARN. */ enable(PySparkAssembly.settings)(assembly) /* Enable unidoc only for the root spark project */ @@ -373,22 +373,15 @@ object PySparkAssembly { import java.util.zip.{ZipOutputStream, ZipEntry} lazy val settings = Seq( - unmanagedJars in Compile += { BuildCommons.sparkHome / "python/lib/py4j-0.8.2.1-src.zip" }, // Use a resource generator to copy all .py files from python/pyspark into a managed directory // to be included in the assembly. We can't just add "python/" to the assembly's resource dir // list since that will copy unneeded / unwanted files. resourceGenerators in Compile <+= resourceManaged in Compile map { outDir: File => val src = new File(BuildCommons.sparkHome, "python/pyspark") - val zipFile = new File(BuildCommons.sparkHome , "python/lib/pyspark.zip") zipFile.delete() zipRecursive(src, zipFile) - - val dst = new File(outDir, "pyspark") - if (!dst.isDirectory()) { - require(dst.mkdirs()) - } - copy(src, dst) + Seq[File]() } ) @@ -416,42 +409,11 @@ object PySparkAssembly { output.write(buf, 0, n) } } + output.closeEntry() in.close() } } - private def copy(src: File, dst: File): Seq[File] = { - src.listFiles().flatMap { f => - val child = new File(dst, f.getName()) - if (f.isDirectory()) { - child.mkdir() - copy(f, child) - } else if (f.getName().endsWith(".py")) { - var in: Option[FileInputStream] = None - var out: Option[FileOutputStream] = None - try { - in = Some(new FileInputStream(f)) - out = Some(new FileOutputStream(child)) - - val bytes = new Array[Byte](1024) - var read = 0 - while (read >= 0) { - read = in.get.read(bytes) - if (read > 0) { - out.get.write(bytes, 0, read) - } - } - - Some(child) - } finally { - in.foreach(_.close()) - out.foreach(_.close()) - } - } else { - None - } - } - } } object Unidoc { diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 7d274a73e079f..ffe95bb49188f 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -103,13 +103,5 @@ target/scala-${scala.binary.version}/classes target/scala-${scala.binary.version}/test-classes - - - ../../python - - pyspark/sql/*.py - - - diff --git a/streaming/pom.xml b/streaming/pom.xml index 5ca55a4f680bb..5ab7f4472c38b 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -105,13 +105,5 @@ - - - ../python - - pyspark/streaming/*.py - - -