[ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node

- Since Spark 1.4, pyspark can run on a yarn cluster without deploying python libraries to every node (see the sketch after this list)
 - https://issues.apache.org/jira/browse/SPARK-6869
 - apache/spark#5580, apache/spark#5478
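
A rough, untested sketch of the Spark 1.4 mechanism this builds on (not part of the commit; the paths, and whether these conf keys behave the same when passed as submit flags, are assumptions):

    # Sketch: ship the python libraries with the job instead of installing them on each node.
    # The two conf keys below are the ones this patch sets from Zeppelin's Java code.
    spark-submit \
      --master yarn-client \
      --conf spark.yarn.dist.files=file:/opt/zeppelin/interpreter/spark/pyspark/pyspark.zip,file:/opt/zeppelin/interpreter/spark/pyspark/py4j-0.8.2.1-src.zip \
      --conf spark.submit.pyArchives=pyspark.zip:py4j-0.8.2.1-src.zip \
      app.py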

Author: Jongyoul Lee <jongyoul@gmail.com>

Closes #118 from jongyoul/ZEPPELIN-18 and squashes the following commits:

a47e27c [Jongyoul Lee] - Fixed test script for spark 1.4.0
72a65fd [Jongyoul Lee] - Fixed test script for spark 1.4.0
ee6d100 [Jongyoul Lee] - Cleanup codes
47fd9c9 [Jongyoul Lee] - Cleanup codes
248e330 [Jongyoul Lee] - Cleanup codes
4cd10b5 [Jongyoul Lee] - Removed meaningless codes comments
c9cda29 [Jongyoul Lee] - Removed setting SPARK_HOME - Changed the location of pyspark's directory into interpreter/spark
ef240f5 [Jongyoul Lee] - Fixed typo
06002fd [Jongyoul Lee] - Fixed typo
4b35c8d [Jongyoul Lee] [ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node - Dummy for trigger
682986e [Jongyoul Lee] rebased
8a7bf47 [Jongyoul Lee] [ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node - rebasing
ad610fb [Jongyoul Lee] rebased
94bdf30 [Jongyoul Lee] [ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node - Fixed checkstyle
929333d [Jongyoul Lee] rebased
64b8195 [Jongyoul Lee] [ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node - rebasing
0a2d90e [Jongyoul Lee] rebased
b05ae6e [Jongyoul Lee] [ZEPPELIN-18] Remove setting SPARK_HOME for PySpark - Excludes python/** from apache-rat
71e2a92 [Jongyoul Lee] [ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node - Removed verbose setting
0ddb436 [Jongyoul Lee] [ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node - Followed spark's way to support pyspark - https://issues.apache.org/jira/browse/SPARK-6869 - apache/spark#5580 - https://github.com/apache/spark/pull/5478/files
1b192f6 [Jongyoul Lee] [ZEPPELIN-18] Remove setting SPARK_HOME for PySpark - Removed redundant dependency setting
32fd9e1 [Jongyoul Lee] [ZEPPELIN-18] Running pyspark without deploying python libraries to every yarn node - rebasing

(cherry picked from commit 3bd2b21)
Signed-off-by: Lee moon soo <moon@apache.org>
jongyoul authored and Leemoonsoo committed Jul 5, 2015
1 parent 8d15d7c commit 66b93b9
Showing 6 changed files with 117 additions and 27 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -73,3 +73,4 @@ auto-save-list
 tramp
 .\#*
 *.swp
+**/dependency-reduced-pom.xml
6 changes: 3 additions & 3 deletions .travis.yml
@@ -22,16 +22,16 @@ before_install:
 - "sh -e /etc/init.d/xvfb start"

 install:
-- mvn package -DskipTests -Phadoop-2.3 -B
+- mvn package -DskipTests -Phadoop-2.3 -Ppyspark -B

 before_script:
 -

 script:
 # spark 1.4
-- mvn package -Pbuild-distr -Phadoop-2.3 -B
+- mvn package -Pbuild-distr -Phadoop-2.3 -Ppyspark -B
 - ./testing/startSparkCluster.sh 1.4.0 2.3
-- SPARK_HOME=./spark-1.4.1-bin-hadoop2.3 mvn verify -Pusing-packaged-distr -Phadoop-2.3 -B
+- SPARK_HOME=`pwd`/spark-1.4.0-bin-hadoop2.3 mvn verify -Pusing-packaged-distr -Phadoop-2.3 -Ppyspark -B
 - ./testing/stopSparkCluster.sh 1.4.0 2.3
 # spark 1.3
 - mvn clean package -DskipTests -Pspark-1.3 -Phadoop-2.3 -B -pl 'zeppelin-interpreter,spark'
13 changes: 13 additions & 0 deletions bin/interpreter.sh
@@ -73,6 +73,19 @@ if [[ ! -d "${ZEPPELIN_LOG_DIR}" ]]; then
   $(mkdir -p "${ZEPPELIN_LOG_DIR}")
 fi

+if [[ ! -z "${SPARK_HOME}" ]]; then
+  PYSPARKPATH="${SPARK_HOME}/python/lib/pyspark.zip:${SPARK_HOME}/python/lib/py4j-0.8.2.1-src.zip"
+else
+  PYSPARKPATH="${ZEPPELIN_HOME}/interpreter/spark/pyspark/pyspark.zip:${ZEPPELIN_HOME}/interpreter/spark/pyspark/py4j-0.8.2.1-src.zip"
+fi
+
+if [[ x"" == x"${PYTHONPATH}" ]]; then
+  export PYTHONPATH="${PYSPARKPATH}"
+else
+  export PYTHONPATH="${PYTHONPATH}:${PYSPARKPATH}"
+fi
+
+unset PYSPARKPATH

 ${ZEPPELIN_RUNNER} ${JAVA_INTP_OPTS} -cp ${CLASSPATH} ${ZEPPELIN_SERVER} ${PORT} &
 pid=$!
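
Taken on its own, the PYTHONPATH logic added above reduces to the following standalone sketch (assuming SPARK_HOME=/opt/spark; the x"" comparison in the script is an older portability idiom equivalent to a -z test):

    SPARK_HOME=/opt/spark
    PYSPARKPATH="${SPARK_HOME}/python/lib/pyspark.zip:${SPARK_HOME}/python/lib/py4j-0.8.2.1-src.zip"
    # Append to any existing PYTHONPATH; otherwise use the pyspark archives alone.
    if [[ -z "${PYTHONPATH}" ]]; then
      export PYTHONPATH="${PYSPARKPATH}"
    else
      export PYTHONPATH="${PYTHONPATH}:${PYSPARKPATH}"
    fi
    echo "${PYTHONPATH}"
    # -> /opt/spark/python/lib/pyspark.zip:/opt/spark/python/lib/py4j-0.8.2.1-src.zip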
77 changes: 70 additions & 7 deletions spark/pom.xml
@@ -48,6 +48,8 @@

     <akka.group>org.spark-project.akka</akka.group>
     <akka.version>2.3.4-spark</akka.version>
+
+    <spark.download.url>http://www.apache.org/dist/spark/spark-${spark.version}/spark-${spark.version}.tgz</spark.download.url>
   </properties>

   <repositories>
@@ -473,13 +475,6 @@
       </exclusions>
     </dependency>

-    <!-- pyspark -->
-    <dependency>
-      <groupId>net.sf.py4j</groupId>
-      <artifactId>py4j</artifactId>
-      <version>0.8.2.1</version>
-    </dependency>
-
     <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-exec</artifactId>
@@ -723,6 +718,74 @@
     </dependencies>
   </profile>

+  <profile>
+    <id>pyspark</id>
+    <properties>
+      <spark.download.url>http://www.apache.org/dist/spark/spark-${spark.version}/spark-${spark.version}.tgz
+      </spark.download.url>
+    </properties>
+    <build>
+      <plugins>
+        <plugin>
+          <groupId>com.googlecode.maven-download-plugin</groupId>
+          <artifactId>download-maven-plugin</artifactId>
+          <version>1.2.1</version>
+          <executions>
+            <execution>
+              <id>download-pyspark-files</id>
+              <phase>validate</phase>
+              <goals>
+                <goal>wget</goal>
+              </goals>
+              <configuration>
+                <url>${spark.download.url}</url>
+                <unpack>true</unpack>
+                <outputDirectory>${project.build.directory}/spark-dist</outputDirectory>
+              </configuration>
+            </execution>
+          </executions>
+        </plugin>
+        <plugin>
+          <artifactId>maven-clean-plugin</artifactId>
+          <configuration>
+            <filesets>
+              <fileset>
+                <directory>${basedir}/../python/build</directory>
+              </fileset>
+              <fileset>
+                <directory>${project.build.directory}/spark-dist</directory>
+              </fileset>
+            </filesets>
+          </configuration>
+        </plugin>
+        <plugin>
+          <groupId>org.apache.maven.plugins</groupId>
+          <artifactId>maven-antrun-plugin</artifactId>
+          <version>1.7</version>
+          <executions>
+            <execution>
+              <id>download-and-zip-pyspark-files</id>
+              <phase>generate-resources</phase>
+              <goals>
+                <goal>run</goal>
+              </goals>
+              <configuration>
+                <target>
+                  <delete dir="../interpreter/spark/pyspark"/>
+                  <copy todir="../interpreter/spark/pyspark"
+                        file="${project.build.directory}/spark-dist/spark-${spark.version}/python/lib/py4j-0.8.2.1-src.zip"/>
+                  <zip destfile="${project.build.directory}/../../interpreter/spark/pyspark/pyspark.zip"
+                       basedir="${project.build.directory}/spark-dist/spark-${spark.version}/python"
+                       includes="pyspark/*.py,pyspark/**/*.py"/>
+                </target>
+              </configuration>
+            </execution>
+          </executions>
+        </plugin>
+      </plugins>
+    </build>
+  </profile>
+
   <!-- Build without Hadoop dependencies that are included in some runtime environments. -->
   <profile>
     <id>hadoop-provided</id>
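
With this profile enabled, the Spark distribution is downloaded during the validate phase and its python sources are repackaged into interpreter/spark/pyspark. To produce pyspark.zip and py4j-0.8.2.1-src.zip locally, build with the profile active, e.g. with the same invocation the Travis configuration above uses:

    mvn package -DskipTests -Phadoop-2.3 -Ppyspark -B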
12 changes: 0 additions & 12 deletions spark/src/main/java/org/apache/zeppelin/spark/PySparkInterpreter.java
@@ -159,18 +159,6 @@ public void open() {
     try {
       Map env = EnvironmentUtils.getProcEnvironment();

-      String pythonPath = (String) env.get("PYTHONPATH");
-      if (pythonPath == null) {
-        pythonPath = "";
-      } else {
-        pythonPath += ":";
-      }
-
-      pythonPath += getSparkHome() + "/python/lib/py4j-0.8.2.1-src.zip:"
-          + getSparkHome() + "/python";
-
-      env.put("PYTHONPATH", pythonPath);
-
       executor.execute(cmd, env, this);
       pythonscriptRunning = true;
     } catch (IOException e) {
35 changes: 30 additions & 5 deletions spark/src/main/java/org/apache/zeppelin/spark/SparkInterpreter.java
@@ -26,12 +26,9 @@
 import java.lang.reflect.Method;
 import java.net.URL;
 import java.net.URLClassLoader;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Properties;
-import java.util.Set;
+import java.util.*;

+import com.google.common.base.Joiner;
 import org.apache.spark.HttpServer;
 import org.apache.spark.SparkConf;
 import org.apache.spark.SparkContext;
@@ -273,6 +270,34 @@ public SparkContext createSparkContext() {
       }
     }

+    //TODO(jongyoul): Move these codes into PySparkInterpreter.java
+
+    String pysparkBasePath = getSystemDefault("SPARK_HOME", "spark.home", null);
+    File pysparkPath;
+    if (null == pysparkBasePath) {
+      pysparkBasePath = getSystemDefault("ZEPPELIN_HOME", "zeppelin.home", "../");
+      pysparkPath = new File(pysparkBasePath,
+          "interpreter" + File.separator + "spark" + File.separator + "pyspark");
+    } else {
+      pysparkPath = new File(pysparkBasePath,
+          "python" + File.separator + "lib");
+    }
+
+    String[] pythonLibs = new String[]{"pyspark.zip", "py4j-0.8.2.1-src.zip"};
+    ArrayList<String> pythonLibUris = new ArrayList<>();
+    for (String lib : pythonLibs) {
+      File libFile = new File(pysparkPath, lib);
+      if (libFile.exists()) {
+        pythonLibUris.add(libFile.toURI().toString());
+      }
+    }
+    pythonLibUris.trimToSize();
+    if (pythonLibs.length == pythonLibUris.size()) {
+      conf.set("spark.yarn.dist.files", Joiner.on(",").join(pythonLibUris));
+      conf.set("spark.files", conf.get("spark.yarn.dist.files"));
+      conf.set("spark.submit.pyArchives", Joiner.on(":").join(pythonLibs));
+    }
+
     SparkContext sparkContext = new SparkContext(conf);
     return sparkContext;
   }
