[SPARK-7485] [BUILD] Remove pyspark files from assembly.
The sbt part of the build is hacky; it basically tricks sbt
into generating the zip by using a generator, but returns
an empty list for the generated files so that nothing is
actually added to the assembly.
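In sbt terms, the hack is a resource generator whose body does its real work as a side effect and then reports zero outputs, so the assembly plugin finds nothing to package. A minimal sketch of the pattern, using the same sbt 0.13-era <+= syntax as this build (the path is illustrative and the zip-writing step is elided; the real code is in the project/SparkBuild.scala diff below):

import java.io.File

resourceGenerators in Compile <+= resourceManaged in Compile map { outDir: File =>
  // Side effect: build the artifact outside the managed resource directory.
  val zip = new File("python/lib/pyspark.zip")  // illustrative path
  zip.delete()                                  // always rebuild from current sources
  // ... write the zip here (the real build calls its zipRecursive helper) ...
  // Returning an empty Seq tells sbt that no resources were generated, so
  // nothing is copied into the assembly even though the generator ran.
  Seq[File]()
}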

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #6022 from vanzin/SPARK-7485 and squashes the following commits:

22c1e04 [Marcelo Vanzin] Remove unneeded code.
4893622 [Marcelo Vanzin] [SPARK-7485] [build] Remove pyspark files from assembly.

(cherry picked from commit 82e890f)
Signed-off-by: Andrew Or <andrew@databricks.com>
Marcelo Vanzin authored and Andrew Or committed May 12, 2015
1 parent 4092a2e commit afe54b7
Showing 5 changed files with 3 additions and 115 deletions.
47 changes: 0 additions & 47 deletions core/pom.xml
@@ -381,35 +381,6 @@
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
     <plugins>
-      <!-- Unzip py4j so we can include its files in the jar -->
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-antrun-plugin</artifactId>
-        <executions>
-          <execution>
-            <phase>generate-resources</phase>
-            <goals>
-              <goal>run</goal>
-            </goals>
-          </execution>
-        </executions>
-        <configuration>
-          <target>
-            <unzip src="../python/lib/py4j-0.8.2.1-src.zip" dest="../python/build" />
-          </target>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-clean-plugin</artifactId>
-        <configuration>
-          <filesets>
-            <fileset>
-              <directory>${basedir}/../python/build</directory>
-            </fileset>
-          </filesets>
-          <verbose>true</verbose>
-        </configuration>
-      </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-dependency-plugin</artifactId>
@@ -438,24 +409,6 @@
         </executions>
       </plugin>
     </plugins>
-
-    <resources>
-      <resource>
-        <directory>src/main/resources</directory>
-      </resource>
-      <resource>
-        <directory>../python</directory>
-        <includes>
-          <include>pyspark/*.py</include>
-        </includes>
-      </resource>
-      <resource>
-        <directory>../python/build</directory>
-        <includes>
-          <include>py4j/*.py</include>
-        </includes>
-      </resource>
-    </resources>
   </build>
 
   <profiles>
11 changes: 0 additions & 11 deletions mllib/pom.xml
@@ -141,16 +141,5 @@
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
-    <resources>
-      <resource>
-        <directory>../python</directory>
-        <includes>
-          <include>pyspark/mllib/*.py</include>
-          <include>pyspark/mllib/stat/*.py</include>
-          <include>pyspark/ml/*.py</include>
-          <include>pyspark/ml/param/*.py</include>
-        </includes>
-      </resource>
-    </resources>
   </build>
 </project>
44 changes: 3 additions & 41 deletions project/SparkBuild.scala
@@ -168,7 +168,7 @@ object SparkBuild extends PomBuild {
   /* Enable Assembly for all assembly projects */
   assemblyProjects.foreach(enable(Assembly.settings))
 
-  /* Package pyspark artifacts in the main assembly. */
+  /* Package pyspark artifacts in a separate zip file for YARN. */
   enable(PySparkAssembly.settings)(assembly)
 
   /* Enable unidoc only for the root spark project */
@@ -373,22 +373,15 @@ object PySparkAssembly {
   import java.util.zip.{ZipOutputStream, ZipEntry}
 
   lazy val settings = Seq(
-    unmanagedJars in Compile += { BuildCommons.sparkHome / "python/lib/py4j-0.8.2.1-src.zip" },
     // Use a resource generator to copy all .py files from python/pyspark into a managed directory
     // to be included in the assembly. We can't just add "python/" to the assembly's resource dir
     // list since that will copy unneeded / unwanted files.
     resourceGenerators in Compile <+= resourceManaged in Compile map { outDir: File =>
       val src = new File(BuildCommons.sparkHome, "python/pyspark")
-
       val zipFile = new File(BuildCommons.sparkHome , "python/lib/pyspark.zip")
       zipFile.delete()
       zipRecursive(src, zipFile)
-
-      val dst = new File(outDir, "pyspark")
-      if (!dst.isDirectory()) {
-        require(dst.mkdirs())
-      }
-      copy(src, dst)
+      Seq[File]()
     }
   )
 
@@ -416,42 +409,11 @@
           output.write(buf, 0, n)
         }
       }
+      output.closeEntry()
       in.close()
     }
   }
 
-  private def copy(src: File, dst: File): Seq[File] = {
-    src.listFiles().flatMap { f =>
-      val child = new File(dst, f.getName())
-      if (f.isDirectory()) {
-        child.mkdir()
-        copy(f, child)
-      } else if (f.getName().endsWith(".py")) {
-        var in: Option[FileInputStream] = None
-        var out: Option[FileOutputStream] = None
-        try {
-          in = Some(new FileInputStream(f))
-          out = Some(new FileOutputStream(child))
-
-          val bytes = new Array[Byte](1024)
-          var read = 0
-          while (read >= 0) {
-            read = in.get.read(bytes)
-            if (read > 0) {
-              out.get.write(bytes, 0, read)
-            }
-          }
-
-          Some(child)
-        } finally {
-          in.foreach(_.close())
-          out.foreach(_.close())
-        }
-      } else {
-        None
-      }
-    }
-  }
 }
 
 object Unidoc {
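Taken together, the hunks above leave PySparkAssembly doing nothing but building the zip. Reassembled from the unchanged and "+" lines of this diff, the post-commit generator looks roughly like this (a sketch, not the verbatim resulting file):

lazy val settings = Seq(
  resourceGenerators in Compile <+= resourceManaged in Compile map { outDir: File =>
    // outDir is unused now that the copy-into-resources step is gone.
    val src = new File(BuildCommons.sparkHome, "python/pyspark")
    val zipFile = new File(BuildCommons.sparkHome, "python/lib/pyspark.zip")
    zipFile.delete()
    zipRecursive(src, zipFile)  // zips python/pyspark into python/lib/pyspark.zip
    Seq[File]()                 // no reported outputs, so the assembly jar stays Python-free
  }
)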
8 changes: 0 additions & 8 deletions sql/core/pom.xml
@@ -103,13 +103,5 @@
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
-    <resources>
-      <resource>
-        <directory>../../python</directory>
-        <includes>
-          <include>pyspark/sql/*.py</include>
-        </includes>
-      </resource>
-    </resources>
   </build>
 </project>
8 changes: 0 additions & 8 deletions streaming/pom.xml
@@ -105,13 +105,5 @@
         </configuration>
       </plugin>
     </plugins>
-    <resources>
-      <resource>
-        <directory>../python</directory>
-        <includes>
-          <include>pyspark/streaming/*.py</include>
-        </includes>
-      </resource>
-    </resources>
   </build>
 </project>
