diff --git a/Makefile b/Makefile index da7cb7d9..9d643724 100644 --- a/Makefile +++ b/Makefile @@ -1,19 +1,20 @@ DOCKER_NETWORK = docker-hadoop_default ENV_FILE = hadoop.env current_branch := $(shell git rev-parse --abbrev-ref HEAD) +base_version := --build-arg HADOOP_BASE_VERSION=$(current_branch) build: docker build -t bde2020/hadoop-base:$(current_branch) ./base - docker build -t bde2020/hadoop-namenode:$(current_branch) ./namenode - docker build -t bde2020/hadoop-datanode:$(current_branch) ./datanode - docker build -t bde2020/hadoop-resourcemanager:$(current_branch) ./resourcemanager - docker build -t bde2020/hadoop-nodemanager:$(current_branch) ./nodemanager - docker build -t bde2020/hadoop-historyserver:$(current_branch) ./historyserver - docker build -t bde2020/hadoop-submit:$(current_branch) ./submit + docker build -t bde2020/hadoop-namenode:$(current_branch) $(base_version) ./namenode + docker build -t bde2020/hadoop-datanode:$(current_branch) $(base_version) ./datanode + docker build -t bde2020/hadoop-resourcemanager:$(current_branch) $(base_version) ./resourcemanager + docker build -t bde2020/hadoop-nodemanager:$(current_branch) $(base_version) ./nodemanager + docker build -t bde2020/hadoop-historyserver:$(current_branch) $(base_version) ./historyserver + docker build -t bde2020/hadoop-submit:$(current_branch) $(base_version) ./submit wordcount: docker build -t hadoop-wordcount ./submit docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -mkdir -p /input/ - docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -copyFromLocal -f /opt/hadoop-3.2.1/README.txt /input/ + docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -copyFromLocal -f /opt/hadoop/README.txt /input/ docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} hadoop-wordcount docker run --network ${DOCKER_NETWORK} 
--env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -cat /output/* docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -rm -r /output diff --git a/README.md b/README.md index e836e345..275b4b71 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Version 2.0.0 introduces uses wait_for_it script for the cluster startup # Hadoop Docker ## Supported Hadoop Versions -See repository branches for supported hadoop versions +See repository branches for supported Hadoop versions ## Quick Start @@ -26,16 +26,17 @@ Or deploy in swarm: docker stack deploy -c docker-compose-v3.yml hadoop ``` -`docker-compose` creates a docker network that can be found by running `docker network list`, e.g. `dockerhadoop_default`. +`docker-compose` creates a docker network that can be found by running `docker network list`, e.g. `docker-hadoop_default`. -Run `docker network inspect` on the network (e.g. `dockerhadoop_default`) to find the IP the hadoop interfaces are published on. Access these interfaces with the following URLs: +Run `docker network inspect` on the network (e.g. `docker-hadoop_default`) to find the IP the Hadoop interfaces are published on. Access these interfaces with the following URLs: * Namenode: http://<dockerhadoop_IP_address>:9870/dfshealth.html#tab-overview * History server: http://<dockerhadoop_IP_address>:8188/applicationhistory -* Datanode: http://<dockerhadoop_IP_address>:9864/ -* Nodemanager: http://<dockerhadoop_IP_address>:8042/node * Resource manager: http://<dockerhadoop_IP_address>:8088/ +All other Hadoop communication ports are not exposed and are only accessible from inside the Docker network using the service name and port, e.g. `http://namenode:9000/`. + + ## Configure Environment Variables The configuration parameters can be specified in the hadoop.env file or as environmental variables for specific services (e.g. 
namenode, datanode etc.): diff --git a/base/Dockerfile b/base/Dockerfile index dec673e2..19ca087d 100644 --- a/base/Dockerfile +++ b/base/Dockerfile @@ -1,27 +1,31 @@ -FROM debian:9 +FROM debian:10 MAINTAINER Ivan Ermilov MAINTAINER Giannis Mouchakis RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - openjdk-8-jdk \ + openjdk-11-jdk \ net-tools \ curl \ netcat \ gnupg \ libsnappy-dev \ + libssl-dev \ && rm -rf /var/lib/apt/lists/* -ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ - RUN curl -O https://dist.apache.org/repos/dist/release/hadoop/common/KEYS RUN gpg --import KEYS -ENV HADOOP_VERSION 3.2.1 -ENV HADOOP_URL https://www.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz +ENV HADOOP_VERSION=3.3.1 +# base URL for downloads: the name of the tar file depends +# on the target platform (amd64/x86_64 vs. arm64/aarch64) +ENV HADOOP_BASE_URL=https://www.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION RUN set -x \ + && ARCH=$(uname -m) \ + && ARCH=$(if test "$ARCH" = "x86_64"; then echo ""; else echo "-$ARCH"; fi) \ + && HADOOP_URL="$HADOOP_BASE_URL/hadoop-$HADOOP_VERSION$ARCH.tar.gz" \ && curl -fSL "$HADOOP_URL" -o /tmp/hadoop.tar.gz \ && curl -fSL "$HADOOP_URL.asc" -o /tmp/hadoop.tar.gz.asc \ && gpg --verify /tmp/hadoop.tar.gz.asc \ @@ -29,16 +33,24 @@ RUN set -x \ && rm /tmp/hadoop.tar.gz* RUN ln -s /opt/hadoop-$HADOOP_VERSION/etc/hadoop /etc/hadoop +RUN ln -s /opt/hadoop-$HADOOP_VERSION /opt/hadoop RUN mkdir /opt/hadoop-$HADOOP_VERSION/logs RUN mkdir /hadoop-data +ENV JAVA_HOME=/usr/lib/jvm/default-java +# create the symlink "/usr/lib/jvm/default-java" in case +# it is not already there (cf. package "default-jre-headless") +RUN if ! test -d $JAVA_HOME; then \ + ln -sf $(readlink -f $(dirname $(readlink -f $(which java)))/..) 
$JAVA_HOME; \ + fi + ENV HADOOP_HOME=/opt/hadoop-$HADOOP_VERSION ENV HADOOP_CONF_DIR=/etc/hadoop ENV MULTIHOMED_NETWORK=1 ENV USER=root -ENV PATH $HADOOP_HOME/bin/:$PATH +ENV PATH=$HADOOP_HOME/bin/:$PATH ADD entrypoint.sh /entrypoint.sh diff --git a/datanode/Dockerfile b/datanode/Dockerfile index 55be14a7..743a45bc 100644 --- a/datanode/Dockerfile +++ b/datanode/Dockerfile @@ -1,4 +1,5 @@ -FROM bde2020/hadoop-base:2.0.0-hadoop3.2.1-java8 +ARG HADOOP_BASE_VERSION=2.0.0-hadoop3.3.1-java11 +FROM bde2020/hadoop-base:$HADOOP_BASE_VERSION MAINTAINER Ivan Ermilov diff --git a/docker-compose-v3.yml b/docker-compose-v3.yml index 84587ec9..a6153917 100644 --- a/docker-compose-v3.yml +++ b/docker-compose-v3.yml @@ -2,7 +2,7 @@ version: '3' services: namenode: - image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8 + image: bde2020/hadoop-namenode:2.0.0-hadoop3.3.1-java11 networks: - hbase volumes: @@ -24,7 +24,7 @@ services: traefik.port: 50070 datanode: - image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8 + image: bde2020/hadoop-datanode:2.0.0-hadoop3.3.1-java11 networks: - hbase volumes: @@ -42,7 +42,7 @@ services: traefik.port: 50075 resourcemanager: - image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8 + image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.3.1-java11 networks: - hbase environment: @@ -64,7 +64,7 @@ services: disable: true nodemanager: - image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8 + image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.3.1-java11 networks: - hbase environment: @@ -80,7 +80,7 @@ services: traefik.port: 8042 historyserver: - image: bde2020/hadoop-historyserver:2.0.0-hadoop3.2.1-java8 + image: bde2020/hadoop-historyserver:2.0.0-hadoop3.3.1-java11 networks: - hbase volumes: diff --git a/docker-compose.yml b/docker-compose.yml index ed40dc62..2079ad5d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,12 +2,11 @@ version: "3" services: namenode: - image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8 + image: 
bde2020/hadoop-namenode:2.0.0-hadoop3.3.1-java11 container_name: namenode restart: always ports: - 9870:9870 - - 9000:9000 volumes: - hadoop_namenode:/hadoop/dfs/name environment: @@ -16,7 +15,7 @@ services: - ./hadoop.env datanode: - image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8 + image: bde2020/hadoop-datanode:2.0.0-hadoop3.3.1-java11 container_name: datanode restart: always volumes: @@ -27,16 +26,18 @@ services: - ./hadoop.env resourcemanager: - image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8 + image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.3.1-java11 container_name: resourcemanager restart: always + ports: + - 8088:8088 environment: SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864" env_file: - ./hadoop.env nodemanager1: - image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8 + image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.3.1-java11 container_name: nodemanager restart: always environment: @@ -45,9 +46,11 @@ services: - ./hadoop.env historyserver: - image: bde2020/hadoop-historyserver:2.0.0-hadoop3.2.1-java8 + image: bde2020/hadoop-historyserver:2.0.0-hadoop3.3.1-java11 container_name: historyserver restart: always + ports: + - 8188:8188 environment: SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088" volumes: diff --git a/hadoop.env b/hadoop.env index 95b3d102..147f7f1c 100644 --- a/hadoop.env +++ b/hadoop.env @@ -38,6 +38,6 @@ MAPRED_CONF_mapreduce_map_memory_mb=4096 MAPRED_CONF_mapreduce_reduce_memory_mb=8192 MAPRED_CONF_mapreduce_map_java_opts=-Xmx3072m MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx6144m -MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ -MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ -MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/ +MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop/ +MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop/ 
+MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop/ diff --git a/historyserver/Dockerfile b/historyserver/Dockerfile index 6ad934e2..c53e95fb 100644 --- a/historyserver/Dockerfile +++ b/historyserver/Dockerfile @@ -1,4 +1,5 @@ -FROM bde2020/hadoop-base:2.0.0-hadoop3.2.1-java8 +ARG HADOOP_BASE_VERSION=2.0.0-hadoop3.3.1-java11 +FROM bde2020/hadoop-base:$HADOOP_BASE_VERSION MAINTAINER Ivan Ermilov diff --git a/namenode/Dockerfile b/namenode/Dockerfile index f5725ddf..992dbc82 100644 --- a/namenode/Dockerfile +++ b/namenode/Dockerfile @@ -1,4 +1,5 @@ -FROM bde2020/hadoop-base:2.0.0-hadoop3.2.1-java8 +ARG HADOOP_BASE_VERSION=2.0.0-hadoop3.3.1-java11 +FROM bde2020/hadoop-base:$HADOOP_BASE_VERSION MAINTAINER Ivan Ermilov diff --git a/nodemanager/Dockerfile b/nodemanager/Dockerfile index 966167cf..88dc9b7d 100644 --- a/nodemanager/Dockerfile +++ b/nodemanager/Dockerfile @@ -1,4 +1,5 @@ -FROM bde2020/hadoop-base:2.0.0-hadoop3.2.1-java8 +ARG HADOOP_BASE_VERSION=2.0.0-hadoop3.3.1-java11 +FROM bde2020/hadoop-base:$HADOOP_BASE_VERSION MAINTAINER Ivan Ermilov diff --git a/resourcemanager/Dockerfile b/resourcemanager/Dockerfile index cec9d132..918fdbea 100644 --- a/resourcemanager/Dockerfile +++ b/resourcemanager/Dockerfile @@ -1,4 +1,5 @@ -FROM bde2020/hadoop-base:2.0.0-hadoop3.2.1-java8 +ARG HADOOP_BASE_VERSION=2.0.0-hadoop3.3.1-java11 +FROM bde2020/hadoop-base:$HADOOP_BASE_VERSION MAINTAINER Ivan Ermilov diff --git a/submit/Dockerfile b/submit/Dockerfile index 6eba11d0..574f60db 100644 --- a/submit/Dockerfile +++ b/submit/Dockerfile @@ -1,4 +1,5 @@ -FROM bde2020/hadoop-base:2.0.0-hadoop3.2.1-java8 +ARG HADOOP_BASE_VERSION=2.0.0-hadoop3.3.1-java11 +FROM bde2020/hadoop-base:$HADOOP_BASE_VERSION MAINTAINER Ivan Ermilov