Close #41: [HIVEMALL-54][SPARK] Add an easy-to-use script for spark-shell
maropu committed Feb 10, 2017
1 parent 247e1ae commit 6b462ae45fcf44987e0a0ec49dfa5e4b4a8855e1
Showing 7 changed files with 244 additions and 0 deletions.
@@ -11,6 +11,8 @@ logs
.DS_Store
*~
bin/scala*
bin/spark-*-bin-*
bin/apache-maven-*
scalastyle-output.xml
scalastyle.txt
derby.log
@@ -19,3 +21,4 @@ spark/bin/zinc-*
*.so
.classpath
.project
metastore_db
@@ -0,0 +1,137 @@
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Determine the current working directory
_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# Preserve the calling directory
_CALLING_DIR="$(pwd)"

# Download any application given a URL
## Arg1 - Remote URL
## Arg2 - Local file name
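## e.g. download_app "http://example.com/app.tgz" "app.tgz"  (hypothetical URL, for illustration only)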
download_app() {
local remote_url="$1"
local local_name="$2"

# set up `curl` and `wget` options
local curl_opts="--progress-bar -L"
local wget_opts="--progress=bar:force"

# check if we already have the given application
# check if we have curl installed
# download application
[ ! -f "${local_name}" ] && [ $(command -v curl) ] && \
echo "exec: curl ${curl_opts} ${remote_url}" 1>&2 && \
curl ${curl_opts} "${remote_url}" > "${local_name}"
# if the file still doesn't exist, let's try `wget` and cross our fingers
[ ! -f "${local_name}" ] && [ $(command -v wget) ] && \
echo "exec: wget ${wget_opts} ${remote_url}" 1>&2 && \
wget ${wget_opts} -O "${local_name}" "${remote_url}"
# if both were unsuccessful, exit
[ ! -f "${local_name}" ] && \
echo -n "ERROR: Cannot download $2 with cURL or wget; " && \
echo "please install manually and try again." && \
exit 2
}

# Installs any application tarball given a URL, the expected tarball name,
# and, optionally, a checkable binary path to determine if the binary has
# already been installed
## Arg1 - URL
## Arg2 - Tarball Name
## Arg3 - Checkable Binary
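## e.g. install_app "http://example.com" "app.tgz" "app/bin/app"  (hypothetical values, for illustration only)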
install_app() {
local remote_tarball="$1/$2"
local local_tarball="${_DIR}/$2"
local binary="${_DIR}/$3"

if [ -z "$3" -o ! -f "$binary" ]; then
download_app "${remote_tarball}" "${local_tarball}"
cd "${_DIR}" && tar -xzf "$2"
rm -rf "$local_tarball"
fi
}

# Determine the Spark version from the root pom.xml file and
# install Spark under the bin/ folder if needed.
install_spark() {
local SPARK_VERSION=`grep "<spark.version>" "${_DIR}/../pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'`
local HADOOP_VERSION=`grep "<hadoop.version>" "${_DIR}/../pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'`
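# e.g. a <spark.version>2.1.0</spark.version> entry in pom.xml yields "2.1.0"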
local SPARK_DIR="${_DIR}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
local APACHE_MIRROR=${APACHE_MIRROR:-'http://d3kbcqa49mib13.cloudfront.net'}

install_app \
"${APACHE_MIRROR}" \
"spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" \
"spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}/bin/spark-shell"

SPARK_BIN="${SPARK_DIR}/bin/spark-shell"
}

# Check that a sufficiently recent Maven is available and
# install Maven under the bin/ folder if needed.
install_mvn() {
local MVN_VERSION="3.3.9"
MVN_BIN="$(command -v mvn)"
if [ "$MVN_BIN" ]; then
local MVN_DETECTED_VERSION="$(mvn --version | head -n1 | awk '{print $3}')"
fi
# See simple version normalization: http://stackoverflow.com/questions/16989598/bash-comparing-version-numbers
function version { echo "$@" | awk -F. '{ printf("%03d%03d%03d\n", $1,$2,$3); }'; }
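# e.g. `version 3.3.9` prints 003003009, so version strings compare as plain integers below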
if [ $(version $MVN_DETECTED_VERSION) -lt $(version $MVN_VERSION) ]; then
local APACHE_MIRROR=${APACHE_MIRROR:-'https://www.apache.org/dyn/closer.lua?action=download&filename='}

install_app \
"${APACHE_MIRROR}/maven/maven-3/${MVN_VERSION}/binaries" \
"apache-maven-${MVN_VERSION}-bin.tar.gz" \
"apache-maven-${MVN_VERSION}/bin/mvn"

MVN_BIN="${_DIR}/apache-maven-${MVN_VERSION}/bin/mvn"
fi
}

# Compile hivemall for the latest Spark release
compile_hivemall() {
local HIVEMALL_VERSION=`grep "<version>" "${_DIR}/../pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'`
local SCALA_VERSION=`grep "<scala.binary.version>" "${_DIR}/../pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'`
local SPARK_VERSION=`grep "<spark.binary.version>" "${_DIR}/../pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'`

HIVEMALL_BIN="${_DIR}/../target/hivemall-spark-${SPARK_VERSION}_${SCALA_VERSION}-${HIVEMALL_VERSION}-with-dependencies.jar"
if [ ! -f "${HIVEMALL_BIN}" ]; then
install_mvn && ${MVN_BIN} -f "${_DIR}/../pom.xml" clean package -P"spark-${SPARK_VERSION}" -DskipTests
if [ $? -ne 0 ]; then
echo "Failed to compile Hivemall for spark-${SPARK_VERSION}"
exit 1
fi
fi
}

# Install the proper version of Spark for launching spark-shell
install_spark

# Compile hivemall for the Spark version
compile_hivemall

# Reset the current working directory
cd "${_CALLING_DIR}"

echo "Using \`spark-shell\` from path: $SPARK_BIN" 1>&2

# Last, call the `spark-shell` command as usual
"${SPARK_BIN}" --properties-file "${_DIR}/../conf/spark-defaults.conf" "$@"
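
Since the script forwards all of its arguments to `spark-shell` via `"$@"`, the usual spark-shell options can be passed straight through. A hypothetical invocation (the option values are illustrative only):

```
$ ./bin/spark-shell --master local[2] --driver-memory 4g
```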

@@ -0,0 +1,31 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

# Example:
# spark.master spark://master:7077
# spark.eventLog.enabled true
# spark.eventLog.dir hdfs://namenode:8021/directory
# spark.serializer org.apache.spark.serializer.KryoSerializer
# spark.driver.memory 5g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"

# This configuration is assumed to be loaded when the latest Spark is launched via ./bin/spark-shell
spark.jars ./target/hivemall-spark-2.1_2.11-0.4.2-rc.2-with-dependencies.jar
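# Note: `spark.jars` takes a comma-separated list of jars, so additional
# dependencies can be appended to the entry above if needed.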

@@ -0,0 +1,20 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

# Summary
@@ -0,0 +1,49 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

Prerequisites
============

* Spark v2.0 or later
* Java 7 or later
* hivemall-spark-xxx-with-dependencies.jar
* [define-all.spark](https://github.com/apache/incubator-hivemall/blob/master/resources/ddl/define-all.spark)
* [import-packages.spark](https://github.com/apache/incubator-hivemall/blob/master/resources/ddl/import-packages.spark)

Installation
============

First, download a pre-built Spark package from [the Spark official web page](http://spark.apache.org/downloads.html) and
invoke spark-shell with a compiled Hivemall binary:

```
$ ./bin/spark-shell --jars hivemall-spark-xxx-with-dependencies.jar
```

> #### Notice
> If you would like to try Hivemall functions on the latest Spark release, you can simply run `bin/spark-shell` in a Hivemall package.
> This command automatically downloads the latest Spark version, compiles Hivemall for that version, and invokes spark-shell with the compiled Hivemall binary.

Then, load the scripts that register the Hivemall functions:

```
scala> :load define-all.spark
scala> :load import-packages.spark
```
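
Once the scripts have been loaded, a quick sanity check is to call one of the registered functions, e.g. `hivemall_version` (a minimal example; the reported version depends on your build):

```
scala> spark.sql("SELECT hivemall_version()").show()
```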

@@ -265,6 +265,7 @@
<module>spark/spark-common</module>
</modules>
<properties>
<hadoop.version>2.7</hadoop.version>
<spark.version>2.1.0</spark.version>
<spark.binary.version>2.1</spark.binary.version>
</properties>
@@ -276,6 +277,7 @@
<module>spark/spark-common</module>
</modules>
<properties>
<hadoop.version>2.7</hadoop.version>
<spark.version>2.0.2</spark.version>
<spark.binary.version>2.0</spark.binary.version>
</properties>
@@ -2,6 +2,8 @@
* Hivemall: Hive scalable Machine Learning Library
*/

val sqlContext = spark.sqlContext

sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS hivemall_version")
sqlContext.sql("CREATE TEMPORARY FUNCTION hivemall_version AS 'hivemall.HivemallVersionUDF'")
