17 changes: 9 additions & 8 deletions Makefile
@@ -7,10 +7,10 @@ SHELL := /bin/sh

# Set variables if testing locally
ifeq ($(IS_RELEASE_BUILD),)
SPARK_VERSION := 2.4
SPARK_VERSION := 3.0
PROCESSOR := cpu
FRAMEWORK_VERSION := py37
SM_VERSION := 0.1
SM_VERSION := 1.0
USE_CASE := processing
BUILD_CONTEXT := ./spark/${USE_CASE}/${SPARK_VERSION}/py3
AWS_PARTITION := aws
@@ -84,18 +84,19 @@ test-sagemaker: install-sdk build-tests
# History server tests can't run in parallel since they use the same container name.
pytest -s -vv test/integration/history \
--repo=$(DEST_REPO) --tag=$(VERSION) --durations=0 \
--spark-version=$(SPARK_VERSION)
--framework_version=$(FRAMEWORK_VERSION) \
--spark-version=$(SPARK_VERSION) \
--framework-version=$(FRAMEWORK_VERSION) \
--role $(ROLE) \
--image_uri $(IMAGE_URI) \
--region ${REGION} \
--domain ${AWS_DOMAIN}
# OBJC_DISABLE_INITIALIZE_FORK_SAFETY: https://github.com/ansible/ansible/issues/32499#issuecomment-341578864
OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES pytest --workers auto -s -vv test/integration/sagemaker \
--repo=$(DEST_REPO) --tag=$(VERSION) --durations=0 \
--spark-version=$(SPARK_VERSION)
--framework_version=$(FRAMEWORK_VERSION) \
--spark-version=$(SPARK_VERSION) \
--framework-version=$(FRAMEWORK_VERSION) \
--role $(ROLE) \
--account-id ${INTEG_TEST_ACCOUNT} \
--image_uri $(IMAGE_URI) \
--region ${REGION} \
--domain ${AWS_DOMAIN}
@@ -104,8 +105,8 @@ test-sagemaker: install-sdk build-tests
test-prod:
pytest -s -vv test/integration/tag \
--repo=$(DEST_REPO) --tag=$(VERSION) --durations=0 \
--spark-version=$(SPARK_VERSION)
--framework_version=$(FRAMEWORK_VERSION) \
--spark-version=$(SPARK_VERSION) \
--framework-version=$(FRAMEWORK_VERSION) \
--role $(ROLE) \
--image_uri $(IMAGE_URI) \
--region ${REGION} \
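With IS_RELEASE_BUILD unset, the defaults at the top of the Makefile now resolve to Spark 3.0, py37 and SM_VERSION 1.0. A minimal sketch of invoking the SageMaker integration target locally with those defaults; the role ARN, region and domain below are placeholders, and DEST_REPO, VERSION, IMAGE_URI and INTEG_TEST_ACCOUNT are assumed to be provided elsewhere in the Makefile or the environment:

# Sketch only: run the SageMaker integration tests locally with the new defaults
# (SPARK_VERSION=3.0, FRAMEWORK_VERSION=py37, SM_VERSION=1.0).
# The role ARN, region and domain are placeholders.
make test-sagemaker \
    ROLE=arn:aws:iam::123456789012:role/SageMakerRole \
    REGION=us-west-2 \
    AWS_DOMAIN=amazonaws.com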
2 changes: 1 addition & 1 deletion new_images.yml
@@ -1,6 +1,6 @@
---
new_images:
- spark: "2.4.4"
- spark: "3.0.0"
use-case: "processing"
processors: ["cpu"]
python: ["py37"]
2 changes: 1 addition & 1 deletion setup.py
@@ -44,7 +44,7 @@
],
setup_requires=["setuptools", "wheel"],
# Be frugal when adding dependencies. Prefer Python's standard library.
install_requires = install_reqs,
install_requires=install_reqs,

extras_require={
"test": test_install_reqs,
@@ -0,0 +1 @@
echo "Not implemented"
100 changes: 100 additions & 0 deletions spark/processing/3.0/py3/docker/Dockerfile.cpu
@@ -0,0 +1,100 @@
FROM amazonlinux:2
ARG REGION
ENV AWS_REGION ${REGION}
RUN yum clean all
RUN yum update -y
RUN yum install -y awscli bigtop-utils curl gcc gzip unzip python3 python3-setuptools python3-pip python-devel python3-devel python-psutil gunzip tar wget liblapack* libblas* libopencv* libopenblas*

# Install nginx. amazonlinux:2.0.20200304.0 does not provide nginx, so install epel-release first.
RUN wget https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
RUN yum install -y epel-release-latest-7.noarch.rpm
RUN yum install -y nginx

RUN rm -rf /var/cache/yum

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
# http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
ENV PYTHONHASHSEED 0
ENV PYTHONIOENCODING UTF-8
ENV PIP_DISABLE_PIP_VERSION_CHECK 1

# Install EMR Spark/Hadoop
ENV HADOOP_HOME /usr/lib/hadoop
ENV HADOOP_CONF_DIR /usr/lib/hadoop/etc/hadoop
ENV SPARK_HOME /usr/lib/spark

COPY yum/emr-apps.repo /etc/yum.repos.d/emr-apps.repo

# Install hadoop / spark dependencies from EMR's yum repository for Spark optimizations.
# replace placeholder with region in repository URL
RUN sed -i "s/REGION/${AWS_REGION}/g" /etc/yum.repos.d/emr-apps.repo

# These packages are a subset of what EMR installs in a cluster with the
# "hadoop", "spark", and "hive" applications.
# They include EMR-optimized libraries and extras.
RUN yum install -y aws-hm-client \
aws-java-sdk \
aws-sagemaker-spark-sdk \
emr-goodies \
emr-scripts \
emr-s3-select \
emrfs \
hadoop \
hadoop-client \
hadoop-hdfs \
hadoop-hdfs-datanode \
hadoop-hdfs-namenode \
hadoop-httpfs \
hadoop-kms \
hadoop-lzo \
hadoop-yarn \
hadoop-yarn-nodemanager \
hadoop-yarn-proxyserver \
hadoop-yarn-resourcemanager \
hadoop-yarn-timelineserver \
hive \
hive-hcatalog \
hive-hcatalog-server \
hive-jdbc \
hive-server2 \
python37-numpy \
python37-sagemaker_pyspark \
s3-dist-cp \
spark-core \
spark-datanucleus \
spark-external \
spark-history-server \
spark-python


# Point Spark at proper python binary
ENV PYSPARK_PYTHON=/usr/bin/python3

# Setup Spark/Yarn/HDFS user as root
ENV PATH="/usr/bin:/opt/program:${PATH}"
ENV YARN_RESOURCEMANAGER_USER="root"
ENV YARN_NODEMANAGER_USER="root"
ENV HDFS_NAMENODE_USER="root"
ENV HDFS_DATANODE_USER="root"
ENV HDFS_SECONDARYNAMENODE_USER="root"

# Set up bootstrapping program and Spark configuration
COPY *.whl /opt/program/
RUN /usr/bin/python3 -m pip install /opt/program/*.whl
COPY hadoop-config /opt/hadoop-config
COPY nginx-config /opt/nginx-config
COPY aws-config /opt/aws-config

# Setup container bootstrapper
COPY container-bootstrap-config /opt/container-bootstrap-config
RUN chmod +x /opt/container-bootstrap-config/bootstrap.sh
RUN /opt/container-bootstrap-config/bootstrap.sh

# With this config, the Spark history server does not daemonize; otherwise the
# container would have no foreground process and would terminate immediately.
ENV SPARK_NO_DAEMONIZE TRUE

WORKDIR $SPARK_HOME

ENTRYPOINT ["smspark-submit"]
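A hedged sketch of building this image, assuming the Makefile's build context (./spark/processing/3.0/py3) so that the yum/, hadoop-config/, nginx-config/ directories and the smspark wheel referenced by the COPY instructions are present; the region and tag are illustrative:

# Sketch only: build the Spark 3.0 processing image. REGION is substituted into
# /etc/yum.repos.d/emr-apps.repo by the sed step in the Dockerfile above.
docker build \
    --build-arg REGION=us-west-2 \
    -t sagemaker-spark-processing:3.0-cpu-py37-v1.0 \
    -f spark/processing/3.0/py3/docker/Dockerfile.cpu \
    spark/processing/3.0/py3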
26 changes: 26 additions & 0 deletions spark/processing/3.0/py3/hadoop-config/core-site.xml
@@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://nn_uri/</value>
<description>NameNode URI</description>
</property>
<property>
<name>fs.s3a.aws.credentials.provider</name>
<value>com.amazonaws.auth.DefaultAWSCredentialsProviderChain</value>
<description>AWS S3 credential provider</description>
</property>
<property>
<name>fs.s3.impl</name>
<value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
<description>s3a filesystem implementation</description>
</property>
<property>
<name>fs.AbstractFileSystem.s3a.impl</name>
<value>org.apache.hadoop.fs.s3a.S3A</value>
<description>s3a filesystem implementation</description>
</property>
</configuration>
19 changes: 19 additions & 0 deletions spark/processing/3.0/py3/hadoop-config/hdfs-site.xml
@@ -0,0 +1,19 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:///opt/amazon/hadoop/hdfs/datanode</value>
<description>Comma separated list of paths on the local filesystem of a DataNode where it should store its blocks.</description>
</property>

<property>
<name>dfs.namenode.name.dir</name>
<value>file:///opt/amazon/hadoop/hdfs/namenode</value>
<description>Path on the local filesystem where the NameNode stores the namespace and transaction logs persistently.</description>
</property>
</configuration>
6 changes: 6 additions & 0 deletions spark/processing/3.0/py3/hadoop-config/spark-defaults.conf
@@ -0,0 +1,6 @@
spark.driver.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar
spark.driver.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native
spark.executor.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar
spark.executor.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native
spark.driver.host=sd_host
spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2
3 changes: 3 additions & 0 deletions spark/processing/3.0/py3/hadoop-config/spark-env.sh
@@ -0,0 +1,3 @@
# EMPTY FILE TO AVOID OVERRIDING ENV VARS
# Specifically, without copying the empty file, SPARK_HISTORY_OPTS will be overridden,
# spark.history.ui.port defaults to 18082, and spark.eventLog.dir defaults to local fs
34 changes: 34 additions & 0 deletions spark/processing/3.0/py3/hadoop-config/yarn-site.xml
@@ -0,0 +1,34 @@
<?xml version="1.0"?>
<!-- Site specific YARN configuration properties -->
<configuration>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>rm_hostname</value>
<description>The hostname of the RM.</description>
</property>
<property>
<name>yarn.nodemanager.hostname</name>
<value>nm_hostname</value>
<description>The hostname of the NM.</description>
</property>
<property>
<name>yarn.nodemanager.webapp.address</name>
<value>nm_webapp_address</value>
</property>
<property>
<name>yarn.nodemanager.vmem-pmem-ratio</name>
<value>5</value>
<description>Ratio of virtual memory to physical memory.</description>
</property>
<property>
<name>yarn.resourcemanager.am.max-attempts</name>
<value>1</value>
<description>The maximum number of application attempts.</description>
</property>
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,YARN_HOME,AWS_CONTAINER_CREDENTIALS_RELATIVE_URI</value>
<description>Environment variable whitelist</description>
</property>

</configuration>
17 changes: 17 additions & 0 deletions spark/processing/3.0/py3/nginx-config/default.conf
@@ -0,0 +1,17 @@
server {
listen 15050;
server_name localhost;
client_header_buffer_size 128k;
large_client_header_buffers 4 128k;

location ~ ^/history/(.*)/(.*)/jobs/$ {
proxy_pass http://localhost:18080/history/$1/jobs/;
proxy_redirect http://localhost:18080/history/$1/jobs/ $domain_name/proxy/15050/history/$1/jobs/;
expires off;
}

location / {
proxy_pass http://localhost:18080;
expires off;
}
}
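The server block above fronts the Spark history server (port 18080) on port 15050 and rewrites redirects for the /history/.../jobs/ path. A quick way to exercise the proxy, with a placeholder application ID:

# Sketch only: request a history-server page through the nginx proxy.
# application_1600000000000_0001 is a placeholder Spark application ID.
curl -s http://localhost:15050/history/application_1600000000000_0001/1/jobs/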
66 changes: 66 additions & 0 deletions spark/processing/3.0/py3/nginx-config/nginx.conf
@@ -0,0 +1,66 @@
# For more information on configuration, see:
# * Official English Documentation: http://nginx.org/en/docs/
# * Official Russian Documentation: http://nginx.org/ru/docs/

user nginx;
worker_processes auto;
error_log /var/log/nginx/error.log;
pid /run/nginx.pid;

# Load dynamic modules. See /usr/share/doc/nginx/README.dynamic.
include /usr/share/nginx/modules/*.conf;

events {
worker_connections 1024;
}

http {
log_format main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for"';

access_log /var/log/nginx/access.log main;

sendfile on;
tcp_nopush on;
tcp_nodelay on;
keepalive_timeout 65;
types_hash_max_size 2048;

include /etc/nginx/mime.types;
default_type application/octet-stream;

# Load modular configuration files from the /etc/nginx/conf.d directory.
# See http://nginx.org/en/docs/ngx_core_module.html#include
# for more information.
include /etc/nginx/conf.d/*.conf;

server {
listen 80 default_server;
listen [::]:80 default_server;
server_name _;
root /usr/share/nginx/html;

# Load configuration files for the default server block.
include /etc/nginx/default.d/*.conf;

location /proxy/15050 {
proxy_pass http://localhost:15050/;
}

location ~ ^/proxy/15050/(.*) {
proxy_pass http://localhost:15050/$1;
}

location / {
}

error_page 404 /404.html;
location = /40x.html {
}

error_page 500 502 503 504 /50x.html;
location = /50x.html {
}
}
}
Binary file not shown.
7 changes: 7 additions & 0 deletions spark/processing/3.0/py3/yum/emr-apps.repo
@@ -0,0 +1,7 @@
[emr-apps]
name = EMR Application Repository
gpgkey = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.1.0/72a9ec2e-9bf6-4d7d-9244-86a0ab1e50d6/repoPublicKey.txt
enabled = 1
baseurl = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.1.0/72a9ec2e-9bf6-4d7d-9244-86a0ab1e50d6
priority = 5
gpgcheck = 0
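The REGION placeholders above are rewritten at image build time by the sed step in the Dockerfile; for example, with an illustrative region of us-west-2:

# Effect of the Dockerfile's sed step for us-west-2: baseurl becomes
#   https://s3-us-west-2.amazonaws.com/repo.us-west-2.emr.amazonaws.com/apps-repository/emr-6.1.0/72a9ec2e-9bf6-4d7d-9244-86a0ab1e50d6
sed -i "s/REGION/us-west-2/g" /etc/yum.repos.d/emr-apps.repo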