34 changes: 32 additions & 2 deletions DEVELOPMENT.md
@@ -126,14 +126,27 @@ docker push $AWS_ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com/$SPARK_REPOSITORY:$V
make test-sagemaker
```

6. Please run the following commands before you raise a CR:

```
make test-unit
make install-container-library
```


## Push the code
1. You need to create a pull request in order to merge the code. How to create a pull request is described here: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request
2. You need GitHub access to the AWS organization. Please follow the instructions here: https://w.amazon.com/?Open_Source/GitHub
3. Get the permissions specific to your team; an example is here: https://github.com/orgs/aws/teams/sagemakerwrite/members
4. Ask someone to review the code and merge it in. This repo requires at least one code reviewer.
5. The code needs to be signed before pushing. More detail about signing is here: https://docs.github.com/en/authentication/managing-commit-signature-verification/signing-commits.
Locally, you need to set up `git config --global user.signingkey [key id]` and upload your public key to your GitHub account (a sketch follows the commit example below).
6. The email you specified when you created the public key must match the GitHub email in your GitHub settings.

```
$ git commit -S -m "your commit message"
```
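For reference, a minimal sketch of the local signing setup; the key ID is a placeholder, and the `commit.gpgsign` step is optional:

```
# Hypothetical sketch of the local setup for signed commits.
gpg --list-secret-keys --keyid-format=long   # find your key id (the part after the key type, e.g. rsa4096/<KEY_ID>)
git config --global user.signingkey <KEY_ID> # tell git which key to sign with
git config --global commit.gpgsign true      # optional: sign every commit without passing -S
```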


### FAQ

@@ -168,3 +181,20 @@ make: *** [install-container-library] Error 255
```

* You need to update the corresponding package version in smsparkbuild/py39/Pipfile; a sketch of that fix follows.
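A minimal sketch of that fix; the package name and version below are illustrative (taken from the Pipfile in this PR), not a prescribed change:

```
# Hypothetical example: bump the pinned version of the failing package in the Pipfile,
# then rerun the container-library install to verify the build.
sed -i 's/^protobuf = .*/protobuf = "==3.20.1"/' smsparkbuild/py39/Pipfile
make install-container-library
```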

6. The code build may fail because of formatting, for example:
```
2 files would be reformatted, 13 files would be left unchanged.
```

You can fix it by running:

```
black src/smspark/bootstrapper.py
```
See https://www.freecodecamp.org/news/auto-format-your-python-code-with-black/ for details.
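If you are not sure which files need reformatting, a check-only pass lists them first; the `src/` and `test/` paths below are assumptions about the repository layout:

```
# Hypothetical sketch: list the files black would change, then reformat them in place.
black --check --diff src/ test/
black src/ test/
```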

7. Remember to define a module docstring at the start of each Python file (a one-line `"""..."""` summary is enough); otherwise flake8 reports a missing-docstring (D100) error.

See more detail here: https://stackoverflow.com/questions/46192576/how-can-i-fix-flake8-d100-missing-docstring-error-in-atom-editor
2 changes: 1 addition & 1 deletion Makefile
@@ -7,7 +7,7 @@ SHELL := /bin/sh

# Set variables if testing locally
ifeq ($(IS_RELEASE_BUILD),)
SPARK_VERSION := 3.1
SPARK_VERSION := 3.2
PROCESSOR := cpu
FRAMEWORK_VERSION := py39
SM_VERSION := 1.0
41 changes: 41 additions & 0 deletions Pipfile
@@ -0,0 +1,41 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]

[packages]
tenacity = "==8.0.1"
psutil = "==5.9.0"
click = "==8.1.2"
watchdog = "==0.10.3"
waitress = "==2.1.2"
types-waitress = "==2.0.6"
requests = "==2.27.1"
types-requests = "==2.27.16"
rsa = "==4.3"
pyasn1 = "==0.4.8"
boto3 = "==1.21.33"
safety = "==1.10.3"
black = "==22.3.0"
mypy = "==0.942"
flake8 = "==4.0.1"
flake8-docstrings = "==1.5.0"
pytest = "==7.1.1"
pytest-cov = "==2.10.0"
pytest-xdist = "==2.5.0"
docker = "==5.0.3"
docker-compose = "==1.29.2"
cryptography = "==36.0.2"
typing-extensions = "==4.1.1"
sagemaker = "==2.83.0"
smspark = {editable = true, path = "."}
importlib-metadata = "==4.11.3"
pytest-parallel = "==0.1.1"
pytest-rerunfailures = "10.0"
numpy = "==1.22.2"
protobuf = "==3.20.1"

[requires]
python_version = "3.9"
1,052 changes: 1,052 additions & 0 deletions Pipfile.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions new_images.yml
@@ -1,7 +1,7 @@
---
new_images:
- spark: "3.1.1"
- spark: "3.2"
use-case: "processing"
processors: ["cpu"]
python: ["py39"]
sm_version: "1.3"
sm_version: "1.0"
@@ -0,0 +1 @@
echo "Not implemented"
128 changes: 128 additions & 0 deletions spark/processing/3.2/py3/docker/py39/Dockerfile.cpu
@@ -0,0 +1,128 @@
FROM 137112412989.dkr.ecr.us-west-2.amazonaws.com/amazonlinux:2
ARG REGION
ENV AWS_REGION ${REGION}

RUN yum clean all \
&& yum update -y \
&& yum install -y awscli bigtop-utils curl gcc gzip unzip zip gunzip tar wget liblapack* libblas* libopencv* libopenblas*

# Install python 3.9
ARG PYTHON_BASE_VERSION=3.9
ARG PYTHON_WITH_BASE_VERSION=python${PYTHON_BASE_VERSION}
ARG PIP_WITH_BASE_VERSION=pip${PYTHON_BASE_VERSION}
ARG PYTHON_VERSION=${PYTHON_BASE_VERSION}.12
RUN yum -y groupinstall 'Development Tools' \
&& yum -y install openssl-devel bzip2-devel libffi-devel sqlite-devel xz-devel \
&& wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
&& tar xzf Python-${PYTHON_VERSION}.tgz \
&& cd Python-*/ \
&& ./configure --enable-optimizations \
&& make altinstall \
&& echo -e 'alias python3=python3.9\nalias pip3=pip3.9' >> ~/.bashrc \
&& ln -s $(which ${PYTHON_WITH_BASE_VERSION}) /usr/local/bin/python3 \
&& ln -s $(which ${PIP_WITH_BASE_VERSION}) /usr/local/bin/pip3 \
&& cd .. \
&& rm Python-${PYTHON_VERSION}.tgz \
&& rm -rf Python-${PYTHON_VERSION}

# Install nginx. amazonlinux:2.0.20200304.0 does not have nginx, so we need to install epel-release first
RUN wget https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
RUN yum install -y nginx

RUN rm -rf /var/cache/yum

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
# http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
ENV PYTHONHASHSEED 0
ENV PYTHONIOENCODING UTF-8
ENV PIP_DISABLE_PIP_VERSION_CHECK 1

# Install EMR Spark/Hadoop
ENV HADOOP_HOME /usr/lib/hadoop
ENV HADOOP_CONF_DIR /usr/lib/hadoop/etc/hadoop
ENV SPARK_HOME /usr/lib/spark

COPY yum/emr-apps.repo /etc/yum.repos.d/emr-apps.repo

# Install hadoop / spark dependencies from EMR's yum repository for Spark optimizations.
# replace placeholder with region in repository URL
RUN sed -i "s/REGION/${AWS_REGION}/g" /etc/yum.repos.d/emr-apps.repo
RUN adduser -N hadoop

# These packages are a subset of what EMR installs in a cluster with the
# "hadoop", "spark", and "hive" applications.
# They include EMR-optimized libraries and extras.
RUN yum install -y aws-hm-client \
aws-java-sdk \
aws-sagemaker-spark-sdk \
emr-goodies \
emr-ruby \
emr-scripts \
emr-s3-select \
emrfs \
hadoop \
hadoop-client \
hadoop-hdfs \
hadoop-hdfs-datanode \
hadoop-hdfs-namenode \
hadoop-httpfs \
hadoop-kms \
hadoop-lzo \
hadoop-yarn \
hadoop-yarn-nodemanager \
hadoop-yarn-proxyserver \
hadoop-yarn-resourcemanager \
hadoop-yarn-timelineserver \
hive \
hive-hcatalog \
hive-hcatalog-server \
hive-jdbc \
hive-server2 \
s3-dist-cp \
spark-core \
spark-datanucleus \
spark-external \
spark-history-server \
spark-python


# Point Spark at proper python binary
ENV PYSPARK_PYTHON=/usr/local/bin/python3.9

# Setup Spark/Yarn/HDFS user as root
ENV PATH="/usr/bin:/opt/program:${PATH}"
ENV YARN_RESOURCEMANAGER_USER="root"
ENV YARN_NODEMANAGER_USER="root"
ENV HDFS_NAMENODE_USER="root"
ENV HDFS_DATANODE_USER="root"
ENV HDFS_SECONDARYNAMENODE_USER="root"


Contributor:
We are missing a statement similar to this: https://github.com/aws/sagemaker-spark-container/blob/master/spark/processing/3.1/py3/docker/py39/Dockerfile.cpu#L102 in this Dockerfile. Can we make sure it's added?

Contributor (Author):

Hi Ajay, I was under the assumption that you wanted to keep this because this command removes the JndiLookup class from the jar file: log4j might have a vulnerability, so we want to remove the Lookup class from the jar. See here: https://community.bmc.com/s/article/Log4j-CVE-2021-44228-REMEDIATION-Remove-JndiLookup-class-from-log4j-core-2-jar

But if you look into it, there is another approach: we can update the version of log4j, which is done piggybacked on the Hive version upgrade. I removed this line on purpose, because every time you would have to look up the log4j version in the Hive package to determine which jar to update, which is a very cumbersome way to do it.
As stated above, there is no need to add this line. (For context, a rough sketch of the kind of statement being discussed appears right after this thread.)

Contributor:

Got it, thanks for the explanation.
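For context, a minimal, hypothetical sketch of the kind of Dockerfile statement being discussed: the CVE-2021-44228 mitigation that strips the JndiLookup class from the log4j-core jar. The jar path and version pattern below are assumptions for illustration and are not values from this PR:

```
# Hypothetical sketch (not part of this PR); in the Dockerfile this would be a RUN instruction.
# Neutralize Log4Shell (CVE-2021-44228) by deleting the JndiLookup class from the
# log4j-core jar bundled with Hive. The jar location/version are assumed for illustration.
zip -q -d /usr/lib/hive/lib/log4j-core-2.*.jar org/apache/logging/log4j/core/lookup/JndiLookup.class
```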


# Set up bootstrapping program and Spark configuration
COPY hadoop-config /opt/hadoop-config
COPY nginx-config /opt/nginx-config
COPY aws-config /opt/aws-config
COPY Pipfile Pipfile.lock setup.py *.whl /opt/program/
ENV PIPENV_PIPFILE=/opt/program/Pipfile
# Use the --system flag so all packages are installed into the system Python
# rather than into a virtualenv, since Docker containers do not need virtualenvs.
# pipenv > 2022.4.8 fails to build smspark
RUN /usr/local/bin/python3.9 -m pip install pipenv==2022.4.8 \
&& pipenv install --system \
&& /usr/local/bin/python3.9 -m pip install /opt/program/*.whl

# Setup container bootstrapper
COPY container-bootstrap-config /opt/container-bootstrap-config
RUN chmod +x /opt/container-bootstrap-config/bootstrap.sh \
&& /opt/container-bootstrap-config/bootstrap.sh

# With this config, the Spark history server will not run as a daemon; otherwise there
# would be no server process running and the container would terminate immediately
ENV SPARK_NO_DAEMONIZE TRUE

WORKDIR $SPARK_HOME

ENTRYPOINT ["smspark-submit"]
26 changes: 26 additions & 0 deletions spark/processing/3.2/py3/hadoop-config/core-site.xml
@@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://nn_uri/</value>
<description>NameNode URI</description>
</property>
<property>
<name>fs.s3a.aws.credentials.provider</name>
<value>com.amazonaws.auth.DefaultAWSCredentialsProviderChain</value>
<description>AWS S3 credential provider</description>
</property>
<property>
<name>fs.s3.impl</name>
<value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
<description>s3a filesystem implementation</description>
</property>
<property>
<name>fs.AbstractFileSystem.s3a.impl</name>
<value>org.apache.hadoop.fs.s3a.S3A</value>
<description>s3a filesystem implementation</description>
</property>
</configuration>
67 changes: 67 additions & 0 deletions spark/processing/3.2/py3/hadoop-config/hdfs-site.xml
@@ -0,0 +1,67 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:///opt/amazon/hadoop/hdfs/datanode</value>
<description>Comma separated list of paths on the local filesystem of a DataNode where it should store its blocks.</description>
</property>

<property>
<name>dfs.namenode.name.dir</name>
<value>file:///opt/amazon/hadoop/hdfs/namenode</value>
<description>Path on the local filesystem where the NameNode stores the namespace and transaction logs persistently.</description>
</property>

<!-- Fix for "Failed to replace a bad datanode on the existing pipeline due to no more good datanodes being available to try"
From https://community.cloudera.com/t5/Support-Questions/Failed-to-replace-a-bad-datanode-on-the-existing-pipeline/td-p/207711
This issue can be caused by continuous network issues causing repeated packet drops. This especially happens when data is
being written to a DataNode that is in the process of pipelining the data to the next DataNode; any communication
issue may lead to pipeline failure. We only see this issue in small regions. -->
<property>
<name>dfs.client.block.write.replace-datanode-on-failure.enable</name>
<value>true</value>
<description>
If there is a datanode/network failure in the write pipeline,
DFSClient will try to remove the failed datanode from the pipeline
and then continue writing with the remaining datanodes. As a result,
the number of datanodes in the pipeline is decreased. The feature is
to add new datanodes to the pipeline.

This is a site-wide property to enable/disable the feature.

When the cluster size is extremely small, e.g. 3 nodes or less, cluster
administrators may want to set the policy to NEVER in the default
configuration file or disable this feature. Otherwise, users may
experience an unusually high rate of pipeline failures since it is
impossible to find new datanodes for replacement.

See also dfs.client.block.write.replace-datanode-on-failure.policy
</description>
</property>

<property>
<name>dfs.client.block.write.replace-datanode-on-failure.policy</name>
<value>ALWAYS</value>
<description>
This property is used only if the value of
dfs.client.block.write.replace-datanode-on-failure.enable is true.

ALWAYS: always add a new datanode when an existing datanode is
removed.

NEVER: never add a new datanode.

DEFAULT:
Let r be the replication number.
Let n be the number of existing datanodes.
Add a new datanode only if r is greater than or equal to 3 and either
(1) floor(r/2) is greater than or equal to n; or
(2) r is greater than n and the block is hflushed/appended.
</description>
</property>
</configuration>
10 changes: 10 additions & 0 deletions spark/processing/3.2/py3/hadoop-config/spark-defaults.conf
@@ -0,0 +1,10 @@
spark.driver.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar
spark.driver.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native
spark.executor.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar
spark.executor.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native
spark.driver.host=sd_host
spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2

# Fix for "Uncaught exception: org.apache.spark.rpc.RpcTimeoutException: Cannot
# receive any reply from 10.0.109.30:35219 in 120 seconds.""
spark.rpc.askTimeout=300s
3 changes: 3 additions & 0 deletions spark/processing/3.2/py3/hadoop-config/spark-env.sh
@@ -0,0 +1,3 @@
# EMPTY FILE TO AVOID OVERRIDING ENV VARS
# Specifically, without copying the empty file, SPARK_HISTORY_OPTS will be overridden,
# spark.history.ui.port defaults to 18082, and spark.eventLog.dir defaults to the local fs
34 changes: 34 additions & 0 deletions spark/processing/3.2/py3/hadoop-config/yarn-site.xml
@@ -0,0 +1,34 @@
<?xml version="1.0"?>
<!-- Site specific YARN configuration properties -->
<configuration>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>rm_hostname</value>
<description>The hostname of the RM.</description>
</property>
<property>
<name>yarn.nodemanager.hostname</name>
<value>nm_hostname</value>
<description>The hostname of the NM.</description>
</property>
<property>
<name>yarn.nodemanager.webapp.address</name>
<value>nm_webapp_address</value>
</property>
<property>
<name>yarn.nodemanager.vmem-pmem-ratio</name>
<value>5</value>
<description>Ratio between virtual memory to physical memory.</description>
</property>
<property>
<name>yarn.resourcemanager.am.max-attempts</name>
<value>1</value>
<description>The maximum number of application attempts.</description>
</property>
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,YARN_HOME,AWS_CONTAINER_CREDENTIALS_RELATIVE_URI,AWS_REGION</value>
<description>Environment variable whitelist</description>
</property>

</configuration>