[SPARK-7485] [build] Remove pyspark files from assembly.

The sbt part of the build is hacky; it basically tricks sbt into generating the zip by using a resource generator, but returns an empty list for the generated files so that nothing is actually added to the assembly.
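
For readers unfamiliar with the trick described above, the following is a minimal, hypothetical sketch of the pattern in sbt 0.13-era syntax (matching the build of that time); the object name, zip path, and directory layout are illustrative assumptions, not code from this commit:

import sbt._
import Keys._

// Sketch only, not the actual Spark build code: it hooks a side effect
// (writing the pyspark zip) into sbt's resource generation, then returns
// an empty Seq so none of the generated output lands in the assembly jar.
object PySparkZipSketch {
  lazy val settings = Seq(
    resourceGenerators in Compile += Def.task {
      val zipFile = target.value / "pyspark.zip"                  // assumed output path
      val pythonDir = baseDirectory.value / "python" / "pyspark"  // assumed layout
      // Side effect: build the zip with sbt's built-in IO helpers.
      IO.zip(Path.allSubpaths(pythonDir), zipFile)
      // Returning an empty list satisfies the generator contract while
      // contributing no files, so nothing is added to the assembly.
      Seq.empty[File]
    }.taskValue
  )
}

The actual change, in project/SparkBuild.scala below, keeps the existing zip-writing code and simply replaces the final copy step with Seq[File]().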
Marcelo Vanzin committed May 8, 2015
1 parent 6dad76e commit 4893622
Showing 5 changed files with 4 additions and 109 deletions.
47 changes: 0 additions & 47 deletions core/pom.xml
@@ -381,35 +381,6 @@
    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
    <plugins>
-      <!-- Unzip py4j so we can include its files in the jar -->
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-antrun-plugin</artifactId>
-        <executions>
-          <execution>
-            <phase>generate-resources</phase>
-            <goals>
-              <goal>run</goal>
-            </goals>
-          </execution>
-        </executions>
-        <configuration>
-          <target>
-            <unzip src="../python/lib/py4j-0.8.2.1-src.zip" dest="../python/build" />
-          </target>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-clean-plugin</artifactId>
-        <configuration>
-          <filesets>
-            <fileset>
-              <directory>${basedir}/../python/build</directory>
-            </fileset>
-          </filesets>
-          <verbose>true</verbose>
-        </configuration>
-      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-dependency-plugin</artifactId>
@@ -438,24 +409,6 @@
        </executions>
      </plugin>
    </plugins>
-
-    <resources>
-      <resource>
-        <directory>src/main/resources</directory>
-      </resource>
-      <resource>
-        <directory>../python</directory>
-        <includes>
-          <include>pyspark/*.py</include>
-        </includes>
-      </resource>
-      <resource>
-        <directory>../python/build</directory>
-        <includes>
-          <include>py4j/*.py</include>
-        </includes>
-      </resource>
-    </resources>
  </build>

  <profiles>
11 changes: 0 additions & 11 deletions mllib/pom.xml
@@ -141,16 +141,5 @@
  <build>
    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
-    <resources>
-      <resource>
-        <directory>../python</directory>
-        <includes>
-          <include>pyspark/mllib/*.py</include>
-          <include>pyspark/mllib/stat/*.py</include>
-          <include>pyspark/ml/*.py</include>
-          <include>pyspark/ml/param/*.py</include>
-        </includes>
-      </resource>
-    </resources>
  </build>
</project>
39 changes: 4 additions & 35 deletions project/SparkBuild.scala
@@ -168,7 +168,7 @@ object SparkBuild extends PomBuild {
  /* Enable Assembly for all assembly projects */
  assemblyProjects.foreach(enable(Assembly.settings))

-  /* Package pyspark artifacts in the main assembly. */
+  /* Package pyspark artifacts in a separate zip file for YARN. */
  enable(PySparkAssembly.settings)(assembly)

  /* Enable unidoc only for the root spark project */
@@ -373,7 +373,6 @@ object PySparkAssembly {
  import java.util.zip.{ZipOutputStream, ZipEntry}

  lazy val settings = Seq(
-    unmanagedJars in Compile += { BuildCommons.sparkHome / "python/lib/py4j-0.8.2.1-src.zip" },
    // Use a resource generator to copy all .py files from python/pyspark into a managed directory
    // to be included in the assembly. We can't just add "python/" to the assembly's resource dir
    // list since that will copy unneeded / unwanted files.
@@ -388,7 +387,8 @@ object PySparkAssembly {
      if (!dst.isDirectory()) {
        require(dst.mkdirs())
      }
-      copy(src, dst)
+
+      Seq[File]()
    }
  )

@@ -416,42 +416,11 @@
          output.write(buf, 0, n)
        }
      }
+      output.closeEntry()
      in.close()
    }
  }

-  private def copy(src: File, dst: File): Seq[File] = {
-    src.listFiles().flatMap { f =>
-      val child = new File(dst, f.getName())
-      if (f.isDirectory()) {
-        child.mkdir()
-        copy(f, child)
-      } else if (f.getName().endsWith(".py")) {
-        var in: Option[FileInputStream] = None
-        var out: Option[FileOutputStream] = None
-        try {
-          in = Some(new FileInputStream(f))
-          out = Some(new FileOutputStream(child))
-
-          val bytes = new Array[Byte](1024)
-          var read = 0
-          while (read >= 0) {
-            read = in.get.read(bytes)
-            if (read > 0) {
-              out.get.write(bytes, 0, read)
-            }
-          }
-
-          Some(child)
-        } finally {
-          in.foreach(_.close())
-          out.foreach(_.close())
-        }
-      } else {
-        None
-      }
-    }
-  }
}

object Unidoc {
Expand Down
8 changes: 0 additions & 8 deletions sql/core/pom.xml
@@ -103,13 +103,5 @@
  <build>
    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
-    <resources>
-      <resource>
-        <directory>../../python</directory>
-        <includes>
-          <include>pyspark/sql/*.py</include>
-        </includes>
-      </resource>
-    </resources>
  </build>
</project>
8 changes: 0 additions & 8 deletions streaming/pom.xml
@@ -105,13 +105,5 @@
        </configuration>
      </plugin>
    </plugins>
-    <resources>
-      <resource>
-        <directory>../python</directory>
-        <includes>
-          <include>pyspark/streaming/*.py</include>
-        </includes>
-      </resource>
-    </resources>
  </build>
</project>
