Skip to content

Commit

Permalink
[SEDONA-285] Sedona Spark Jupyterlab docker image (#939)
Browse files Browse the repository at this point in the history
Co-authored-by: Hadiya Kartikey <kartikey.hadiya@esri.in>
Co-authored-by: Kartikey <56991178+kartikeyhadiya@users.noreply.github.com>
Co-authored-by: Kartikey <hadiyakartikey123@gmail.com>
Co-authored-by: yyy1000 <992364620@qq.com>
  • Loading branch information
5 people committed Aug 2, 2023
1 parent 8d5c76d commit bb87200
Show file tree
Hide file tree
Showing 11 changed files with 431 additions and 1 deletion.
53 changes: 53 additions & 0 deletions .github/workflows/docker-build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# CI workflow: builds the Sedona Spark JupyterLab Docker image through
# docker/sedona-spark-jupyterlab/build.sh on Linux and macOS runners.
name: Docker build

on:
  push:
    branches:
      - master
    paths:
      - 'docker/**'
  pull_request:
    branches:
      - '*'

env:
  MAVEN_OPTS: -Dmaven.wagon.httpconnectionManager.ttlSeconds=60

jobs:
  build:
    strategy:
      fail-fast: true
      matrix:
        os: ['ubuntu-latest', 'macos-latest']
        # Spark/Sedona pairs are expressed as two axes minus one excluded pair.
        # Listing them as `include` entries that only carry spark/sedona keys
        # does not create extra jobs: every entry is merged into each `os`
        # combination and later entries overwrite earlier ones, so only the
        # last pair would ever be built.
        # Resulting pairs: 3.4.1/1.4.1, 3.4.1/latest, 3.3.2/latest.
        spark: ['3.4.1', '3.3.2']
        sedona: ['1.4.1', 'latest']
        exclude:
          - spark: '3.3.2'
            sedona: '1.4.1'
    runs-on: ${{ matrix.os }}
    defaults:
      run:
        shell: bash

    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-java@v4
        with:
          # setup-java v2+ requires an explicit distribution; zulu is what v1 installed.
          distribution: 'zulu'
          java-version: '11'
      - name: Cache Maven packages
        uses: actions/cache@v4
        with:
          path: ~/.m2
          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
          restore-keys: ${{ runner.os }}-m2
      - name: Setup docker (missing on MacOS)
        if: runner.os == 'macos'
        run: |
          brew install docker
          colima start
      - name: Build the Docker image
        env:
          SPARK_VERSION: ${{ matrix.spark }}
          SEDONA_VERSION: ${{ matrix.sedona }}
        run: ./docker/sedona-spark-jupyterlab/build.sh "${SPARK_VERSION}" "${SEDONA_VERSION}"
2 changes: 1 addition & 1 deletion binder/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ mkdocs="*"
pytest-cov = "*"

[packages]
pandas="*"
pandas="1.3.5"
shapely="==1.8.4"
geopandas="==0.11.1"
pyspark="==3.3.2"
Expand Down
3 changes: 3 additions & 0 deletions docker/sedona-spark-jupyterlab/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Files to leave out of the Docker build context for the JupyterLab image.
# NOTE(review): Docker reads .dockerignore from the root of the build context
# (or from a "<dockerfile-name>.dockerignore" next to the Dockerfile), while
# build.sh passes the repository root (".") as the context -- confirm that
# this file is actually picked up from docker/sedona-spark-jupyterlab/.
Dockerfile
compose.yml
README.md
6 changes: 6 additions & 0 deletions docker/sedona-spark-jupyterlab/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Files and directories in docker/sedona-spark-jupyterlab/ that Git should not track.
commands.txt
docker-compose-orig.yml
Dockerfile.bak
log.txt
examples
# Checkpoint directories created by Jupyter next to edited notebooks
.ipynb_checkpoints
47 changes: 47 additions & 0 deletions docker/sedona-spark-jupyterlab/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Build the Sedona JupyterLab Docker image.
#
# Usage: build.sh <spark_version> <sedona_version> [build_mode]
#   spark_version   Spark release to install in the image, e.g. 3.4.1
#   sedona_version  Sedona release to install, or "latest" to compile and
#                   package the current source tree
#   build_mode      empty or "local": plain `docker build` for this machine;
#                   anything else: multi-platform `docker buildx build` that
#                   pushes the result to the registry
#
# Must be run from the repository root: Maven and the Docker build context
# both use the current directory.

# Stop at the first failing command so a broken Maven build is never packaged.
set -e

SPARK_VERSION=$1
SEDONA_VERSION=$2
BUILD_MODE=$3

if [ -z "$SPARK_VERSION" ] || [ -z "$SEDONA_VERSION" ]; then
    echo "Usage: $0 <spark_version> <sedona_version> [local|release]" >&2
    exit 1
fi

# Maven's -Dspark flag takes the Sedona Spark profile (3.0 or 3.4), not the
# full Spark release number. Derive it with the same rule docker/sedona.sh
# uses (Spark >= 3.4 -> 3.4, older -> 3.0) unless the caller exported
# SEDONA_SPARK_VERSION explicitly.
if [ -z "$SEDONA_SPARK_VERSION" ]; then
    spark_major=${SPARK_VERSION%%.*}
    spark_rest=${SPARK_VERSION#*.}
    spark_minor=${spark_rest%%.*}
    if [ "$spark_major" -gt 3 ] || { [ "$spark_major" -eq 3 ] && [ "$spark_minor" -ge 4 ]; }; then
        SEDONA_SPARK_VERSION=3.4
    else
        SEDONA_SPARK_VERSION=3.0
    fi
fi

if [ "$SEDONA_VERSION" = "latest" ]; then
    # The compilation must take place outside Docker to avoid unnecessary maven packages
    mvn clean install -DskipTests -Dspark="${SEDONA_SPARK_VERSION}" -Dgeotools -Dscala=2.12
fi

# -- Building the image

if [ -z "$BUILD_MODE" ] || [ "$BUILD_MODE" = "local" ]; then
    # If local, build the image for the local environment
    docker build \
        --build-arg spark_version="${SPARK_VERSION}" \
        --build-arg sedona_version="${SEDONA_VERSION}" \
        -f docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile \
        -t "sedona/sedona-jupyterlab:${SEDONA_VERSION}" .
else
    # If release, build the image for cross-platform
    docker buildx build --platform linux/amd64,linux/arm64 \
        --progress=plain \
        --output type=registry \
        --build-arg spark_version="${SPARK_VERSION}" \
        --build-arg sedona_version="${SEDONA_VERSION}" \
        -f docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile \
        -t "drjiayu/sedona-jupyterlab:${SEDONA_VERSION}" .
fi
12 changes: 12 additions & 0 deletions docker/sedona-spark-jupyterlab/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Python packages installed into the JupyterLab image by
# sedona-jupyterlab.dockerfile (pip3 install -r /opt/requirements.txt).

# Pinned to exact versions
pandas==1.3.5
fiona==1.8.22
geopandas==0.10.2
keplergl==0.3.2
pydeck==0.8.0
# No version constraint: pip resolves these to whatever is current at build time
attrs
matplotlib
descartes
ipywidgets
jupyterlab-widgets
ipykernel
jupyterlab==3.6.4
73 changes: 73 additions & 0 deletions docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Single-container Spark + Sedona + JupyterLab image.
# The build context must be the repository root: the instructions below copy
# docker/, binder/ and (for sedona_version=latest) the locally built jars.
FROM ubuntu:22.04

# Build-time knobs; override with --build-arg.
ARG shared_workspace=/opt/workspace
ARG spark_version=3.3.2
ARG hadoop_version=3
ARG hadoop_s3_version=3.3.4
ARG aws_sdk_version=1.12.402
ARG spark_xml_version=0.16.0
ARG sedona_version=1.4.1
ARG geotools_wrapper_version=1.4.0-28.2

# Set up envs
# SPARK_HOME is declared in its own instruction so that the PYTHONPATH
# assignment further down can expand it.
ENV SHARED_WORKSPACE=${shared_workspace}
ENV SPARK_HOME=/opt/spark
ENV SEDONA_HOME=/opt/sedona
RUN mkdir ${SPARK_HOME} ${SEDONA_HOME}

ENV SPARK_MASTER_HOST=localhost \
    SPARK_MASTER_PORT=7077 \
    PYTHONPATH=${SPARK_HOME}/python \
    PYSPARK_PYTHON=python3 \
    PYSPARK_DRIVER_PYTHON=jupyter

# The whole repository is needed temporarily: the helper scripts live in
# docker/, and sedona.sh reads spark-shaded/target and python/ when
# sedona_version is "latest".
COPY ./ ${SEDONA_HOME}/

# spark.sh installs the OS packages, Spark and its extra jars;
# sedona.sh installs the Sedona jars and the Sedona Python package.
RUN chmod +x ${SEDONA_HOME}/docker/spark.sh ${SEDONA_HOME}/docker/sedona.sh
RUN ${SEDONA_HOME}/docker/spark.sh ${spark_version} ${hadoop_version} ${hadoop_s3_version} ${aws_sdk_version} ${spark_xml_version}
RUN ${SEDONA_HOME}/docker/sedona.sh ${sedona_version} ${geotools_wrapper_version} ${spark_version}

# Install Python dependencies
COPY docker/sedona-spark-jupyterlab/requirements.txt /opt/requirements.txt
RUN pip3 install --no-cache-dir -r /opt/requirements.txt

# Example notebooks, helper modules and sample data
COPY binder/*.ipynb ${SHARED_WORKSPACE}/examples/
COPY binder/*.py ${SHARED_WORKSPACE}/examples/
COPY binder/data ${SHARED_WORKSPACE}/examples/data

# Patch every example notebook in a single pass:
#  - add the master address to the SedonaContext builder call
#  - delete the lines that configure spark.jars.packages and the Sedona /
#    geotools-wrapper Maven coordinates (the jars are already in the image)
RUN find ${SHARED_WORKSPACE}/examples/ -type f -name "*.ipynb" -exec sed -i \
      -e 's/config = SedonaContext.builder()/config = SedonaContext.builder().master(\\"spark:\/\/localhost:7077\\")/' \
      -e '/spark\.jars\.packages/d' \
      -e '/org\.apache\.sedona:sedona-spark-shaded-/d' \
      -e '/org\.datasyslab:geotools-wrapper:/d' \
      {} +

# Removes the source tree from the final filesystem. The data is still stored
# in the earlier COPY layer, so this does not reduce the image size.
RUN rm -rf ${SEDONA_HOME}

EXPOSE 8888
EXPOSE 8080
EXPOSE 8081
EXPOSE 4040

WORKDIR ${SHARED_WORKSPACE}

# Start sshd and the Spark daemons, then replace the shell with JupyterLab
# (exec) so that Jupyter runs as PID 1 and receives the stop signal.
# NOTE(review): --NotebookApp.token= sets an empty token, i.e. no token is
# required to open the notebook server -- confirm this is intended wherever
# port 8888 is published.
CMD ["/bin/sh", "-c", "service ssh start && ${SPARK_HOME}/sbin/start-all.sh && exec jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token="]
45 changes: 45 additions & 0 deletions docker/sedona.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Install Apache Sedona (JVM jars and Python package) into the Spark install.
#
# Usage: sedona.sh <sedona_version> <geotools_wrapper_version> <spark_version>
#   sedona_version            Sedona release, or "latest" to use the jar and
#                             Python sources found under ${SEDONA_HOME}
#   geotools_wrapper_version  geotools-wrapper release (release installs only)
#   spark_version             Spark release; selects the Sedona artifact line
#
# Reads SEDONA_HOME and SPARK_HOME from the environment.

set -e

# Define variables
sedona_version=$1
geotools_wrapper_version=$2
spark_version=$3

maven_repo=https://repo1.maven.org/maven2

# Sedona publishes one shaded artifact for Spark 3.4+ and one for older 3.x.
# The version-sorted minimum of {spark_version, 3.4} is "3.4" exactly when
# spark_version is 3.4 or newer.
lower_version=$(printf '%s\n3.4\n' "$spark_version" | sort -V | head -n1)
if [ "$lower_version" = "3.4" ]; then
    sedona_spark_version=3.4
else
    sedona_spark_version=3.0
fi

if [ "$sedona_version" = "latest" ]; then
    # Code to execute when SEDONA_VERSION is "latest"
    # Use the shaded jar and the Python sources shipped in ${SEDONA_HOME}.
    cp "${SEDONA_HOME}"/spark-shaded/target/sedona-spark-shaded-*.jar "${SPARK_HOME}/jars/"
    cd "${SEDONA_HOME}/python"
    pip3 install shapely==1.8.4
    pip3 install .
else
    # Code to execute when SEDONA_VERSION is not "latest"
    # Download Sedona
    # --fail makes curl exit non-zero on an HTTP error (e.g. a version that
    # does not exist) instead of saving the error page as a .jar file.
    sedona_jar=sedona-spark-shaded-${sedona_spark_version}_2.12-${sedona_version}.jar
    geotools_jar=geotools-wrapper-${geotools_wrapper_version}.jar
    curl --fail "${maven_repo}/org/apache/sedona/sedona-spark-shaded-${sedona_spark_version}_2.12/${sedona_version}/${sedona_jar}" -o "${SPARK_HOME}/jars/${sedona_jar}"
    curl --fail "${maven_repo}/org/datasyslab/geotools-wrapper/${geotools_wrapper_version}/${geotools_jar}" -o "${SPARK_HOME}/jars/${geotools_jar}"

    # Install Sedona Python
    pip3 install shapely==1.8.4
    pip3 install apache-sedona=="${sedona_version}"
fi
60 changes: 60 additions & 0 deletions docker/spark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Install the OS packages, Apache Spark and its extra jars, and configure
# passwordless SSH for Spark's start scripts.
#
# Usage: spark.sh <spark_version> <hadoop_version> <hadoop_s3_version> \
#                 <aws_sdk_version> <spark_xml_version>
#
# Reads SPARK_HOME from the environment; the directory must already exist.

set -e

# Define variables
spark_version=$1
hadoop_version=$2
hadoop_s3_version=$3
aws_sdk_version=$4
spark_xml_version=$5

maven_repo=https://repo1.maven.org/maven2

# apt-get runs without a terminal here, so package configuration must never
# prompt. Exported for this script only; it is not written into the image.
export DEBIAN_FRONTEND=noninteractive

# Set up OS libraries
apt-get update
apt-get install -y openjdk-19-jdk-headless curl python3-pip maven
pip3 install --upgrade pip && pip3 install pipenv

# Download Spark jar and set up PySpark
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# error page, so a wrong version aborts here rather than at tar or at runtime.
spark_dist=spark-${spark_version}-bin-hadoop${hadoop_version}
curl --fail "https://archive.apache.org/dist/spark/spark-${spark_version}/${spark_dist}.tgz" -o spark.tgz
tar -xf spark.tgz && mv "${spark_dist}"/* "${SPARK_HOME}/"
rm spark.tgz && rm -rf "${spark_dist}"
pip3 install pyspark=="${spark_version}"

# Add S3 jars
curl --fail "${maven_repo}/org/apache/hadoop/hadoop-aws/${hadoop_s3_version}/hadoop-aws-${hadoop_s3_version}.jar" -o "${SPARK_HOME}/jars/hadoop-aws-${hadoop_s3_version}.jar"
curl --fail "${maven_repo}/com/amazonaws/aws-java-sdk-bundle/${aws_sdk_version}/aws-java-sdk-bundle-${aws_sdk_version}.jar" -o "${SPARK_HOME}/jars/aws-java-sdk-bundle-${aws_sdk_version}.jar"

# Add spark-xml jar
curl --fail "${maven_repo}/com/databricks/spark-xml_2.12/${spark_xml_version}/spark-xml_2.12-${spark_xml_version}.jar" -o "${SPARK_HOME}/jars/spark-xml_2.12-${spark_xml_version}.jar"

# Set up driver and executor memory
cp "${SPARK_HOME}/conf/spark-defaults.conf.template" "${SPARK_HOME}/conf/spark-defaults.conf"
echo "spark.driver.memory 4g" >> "${SPARK_HOME}/conf/spark-defaults.conf"
echo "spark.executor.memory 4g" >> "${SPARK_HOME}/conf/spark-defaults.conf"

# Install required libraries for GeoPandas on Apple chip mac
apt-get install -y gdal-bin libgdal-dev

# Install OpenSSH for cluster mode
apt-get install -y openssh-client openssh-server
systemctl enable ssh

# Enable nopassword ssh
ssh-keygen -t rsa -f ~/.ssh/id_rsa -N ""
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys

# Drop the apt package lists in the same layer that downloaded them; they are
# not needed once every package above is installed.
rm -rf /var/lib/apt/lists/*

0 comments on commit bb87200

Please sign in to comment.