Close #68: [HIVEMALL-84] Add Docker Support
amaya382 authored and myui committed Apr 25, 2017
1 parent 0c4798e commit bffd2c78d6a938b3ac9b6b5d5f5c4d40a7cfffbf
Showing 13 changed files with 228 additions and 1 deletion.
@@ -0,0 +1,6 @@
.dockerignore
resources/docker/Dockerfile
resources/docker/docker-compose.yml
.git/
target/
.*.swp
@@ -173,7 +173,11 @@
* [Top-k Join processing](spark/misc/topk_join.md)
* [Other utility functions](spark/misc/functions.md)

## Part XIII - External References
## Part XIII - Hivemall on Docker

* [Getting Started](docker/getting_started.md)

## Part XIV - External References

* [Hivemall on Apache Spark](https://github.com/maropu/hivemall-spark)
* [Hivemall on Apache Pig](https://github.com/daijyc/hivemall/wiki/PigHome)
@@ -0,0 +1,68 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

This page introduces how to run Hivemall on Docker.

<!-- toc -->

> #### Caution
> This Docker image contains a single-node Hadoop environment for evaluating Hivemall. It is not suited for production use.

# Requirements

* Docker Engine 1.6+
* Docker Compose 1.10+
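
You can check whether your local installation satisfies these requirements with the following sketch (the exact output format varies by version):

```sh
docker --version           # should report Docker Engine 1.6 or later
docker-compose --version   # should report Docker Compose 1.10 or later
```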

# 1. Build image

## Build using docker-compose

`docker-compose -f resources/docker/docker-compose.yml build`

## Build using docker engine

`docker build -f resources/docker/Dockerfile .` (run from the repository root so that the build context contains the Hivemall sources)
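
For example, the following sketch builds from the repository root and tags the image so it is easy to find later; the name `hivemall` matches the image name used in `docker-compose.yml`:

```sh
docker build -f resources/docker/Dockerfile -t hivemall .
docker images hivemall   # confirm that the image was created
```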

# 2. Run container

## Run using docker-compose

1. Edit `resources/docker/docker-compose.yml` as needed (e.g., uncomment the data volume entry to mount local resources into the container)
2. `docker-compose -f resources/docker/docker-compose.yml up -d && docker attach hivemall`

## Run using the docker command

1. Find the local Docker image with `docker images`.
2. Run `docker run -it ${docker_image_id}`, as in the sketch below.

Refer to the [Docker run reference](https://docs.docker.com/engine/reference/run/) for command details.
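
The following sketch runs the image interactively and publishes the same web UI ports that `docker-compose.yml` exposes (ResourceManager, JobHistoryServer, and NameNode):

```sh
# ${docker_image_id} is the IMAGE ID reported by `docker images`
docker run -it -p 8088:8088 -p 19888:19888 -p 50070:50070 ${docker_image_id}
```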

# 3. Run Hivemall on Docker

1. Type `hive` to launch an interactive session (the bundled `.hiverc` automatically loads the Hivemall functions)
2. Try your Hivemall queries!
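
As a quick sanity check, the following sketch confirms that the Hivemall functions were loaded; `hivemall_version()` is one of the functions registered by `define-all.hive`:

```sh
hive -e "select hivemall_version();"
```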

## Load data into HDFS (optional)

You can find an example script that loads data into HDFS at `./bin/prepare_iris.sh`.
The script loads the Iris dataset into the `iris` database.
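
Once the script finishes, a sketch like the following verifies the load (the `iris.iris_raw` table is created by `prepare_iris.sh`, included later in this commit):

```sh
hive -e "use iris; select * from iris_raw limit 5;"
```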

## Build Hivemall (optional)

In the container, the Hivemall source tree is located at `$HIVEMALL_PATH`.
You can build the Hivemall package with `cd $HIVEMALL_PATH && ./bin/build.sh`.
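
Because `/root/hivemall/hivemall-core-with-dependencies.jar` is a symlink into `$HIVEMALL_PATH/target/`, a rebuilt jar is picked up automatically the next time `.hiverc` runs, e.g.:

```sh
cd $HIVEMALL_PATH && ./bin/build.sh   # rebuild the package in place
hive                                  # a new session re-adds the symlinked jar
```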
@@ -44,6 +44,17 @@ add jar /tmp/hivemall-core-xxx-with-dependencies.jar;
source /tmp/define-all.hive;
```


Other choices
=============

You can also run Hivemall on the following platforms:

* [Apache Spark](../spark/getting_started/installation.md)
* [Apache Pig](https://github.com/daijyc/hivemall/wiki/PigHome)
* [Apache Hive on Docker](../docker/getting_started.md) for testing


Build from Source
==================

@@ -0,0 +1,63 @@
FROM openjdk:7

WORKDIR /root/

ARG PREBUILD=true
ARG HADOOP_VERSION=2.7.3
ARG HIVE_VERSION=2.1.1

ENV BASE_URL='https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename='
ENV HADOOP_HOME='/usr/local/hadoop'
ENV HIVE_HOME='/usr/local/hive'
ENV HIVEMALL_PATH='/opt/hivemall'
ENV HADOOP_OPTS=' \
-Dsystem:java.io.tmpdir=/tmp \
-Dsystem:user.name=root \
-Dderby.stream.error.file=/root/derby.log'
ENV PATH="${HADOOP_HOME}/bin:${HIVE_HOME}/bin:${PATH}"

COPY . ${HIVEMALL_PATH}/

RUN set -eux && \
apt update && \
apt install -y --no-install-recommends openssh-server maven ruby npm && \
ln -s /usr/bin/nodejs /usr/bin/node && \
npm install -g gitbook-cli && \
\
wget ${BASE_URL}hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz -O - \
| tar xz && \
mv hadoop-${HADOOP_VERSION} ${HADOOP_HOME} && \
sed -i -e 's!${JAVA_HOME}!'"${JAVA_HOME}!" ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh && \
ssh-keygen -q -P '' -f ~/.ssh/id_rsa && \
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
echo 'host *\n StrictHostKeyChecking no' > ~/.ssh/config && \
mv ${HIVEMALL_PATH}/resources/docker/etc/hadoop/*.xml ${HADOOP_HOME}/etc/hadoop && \
hdfs namenode -format && \
\
wget ${BASE_URL}hive/hive-${HIVE_VERSION}/apache-hive-${HIVE_VERSION}-bin.tar.gz -O - \
| tar xz && \
mv apache-hive-${HIVE_VERSION}-bin ${HIVE_HOME} && \
cat ${HIVE_HOME}/conf/hive-default.xml.template \
| sed -e 's!databaseName=metastore_db!databaseName=/root/metastore_db!' \
> ${HIVE_HOME}/conf/hive-site.xml && \
\
cd ${HIVEMALL_PATH} && \
HIVEMALL_VERSION=`cat VERSION` && \
mkdir -p /root/bin /root/hivemall && \
find ${HIVEMALL_PATH}/resources/docker/home/bin -mindepth 1 -maxdepth 1 \
-exec sh -c 'f={} && ln -s $f /root/bin/${f##*/}' \; && \
ln -s ${HIVEMALL_PATH}/resources/docker/home/.hiverc /root && \
ln -s ${HIVEMALL_PATH}/resources/ddl/define-all.hive /root/hivemall/define-all.hive && \
ln -s ${HIVEMALL_PATH}/target/hivemall-core-${HIVEMALL_VERSION}-with-dependencies.jar \
/root/hivemall/hivemall-core-with-dependencies.jar && \
\
(if ${PREBUILD}; then \
mvn package -Dmaven.test.skip=true -pl core; \
fi) && \
\
rm -rf /var/cache/apt/archives/* /var/lib/apt/lists/* /root/.m2/* /root/.npm/*

VOLUME ["/opt/hivemall/", "/root/data/"]
EXPOSE 8088 19888 50070

CMD ["sh", "-c", "./bin/init.sh && bash"]
@@ -0,0 +1,19 @@
version: '2'
services:
hivemall:
build:
context: ../../
dockerfile: resources/docker/Dockerfile
args:
- PREBUILD=false
image: hivemall
container_name: hivemall
ports:
- "8088:8088" # ResourceManager
- "19888:19888" # JobHistoryServer
- "50070:50070" # NameNode
volumes:
- "../../:/opt/hivemall/" # mount current hivemall dir
#- "/path/to/data/:/root/data/" # mount resources
tty: true
stdin_open: true
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
@@ -0,0 +1,7 @@
<?xml version="1.0"?>
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
@@ -0,0 +1,7 @@
<?xml version="1.0"?>
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
@@ -0,0 +1,2 @@
add jar /root/hivemall/hivemall-core-with-dependencies.jar;
source /root/hivemall/define-all.hive;
@@ -0,0 +1,7 @@
#!/bin/sh -eux

/etc/init.d/ssh start
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/sbin/start-yarn.sh
$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver
schematool -initSchema -dbType derby
@@ -0,0 +1,17 @@
#!/bin/sh -eux

DATA_DIR='/root/data'
HDFS_DATA_DIR='/dataset/iris/raw'
DATA='iris.data'
mkdir -p $DATA_DIR
[ -f $DATA_DIR/$DATA ] || wget http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data -O $DATA_DIR/$DATA
hadoop fs -mkdir -p $HDFS_DATA_DIR
awk -F',' 'NF >0 {OFS="|"; print NR,$5,$1","$2","$3","$4}' $DATA_DIR/$DATA \
| hadoop fs -put - $HDFS_DATA_DIR/$DATA
hive -e " \
create database if not exists iris; \
use iris; \
create external table iris_raw (rowid int, label string, features array<float>) \
row format delimited fields terminated by '|' \
collection items terminated by ',' \
stored as textfile location \"$HDFS_DATA_DIR\";"
