79 changes: 42 additions & 37 deletions Dockerfile
@@ -1,70 +1,75 @@
# Multi-stage build for optimal size
FROM public.ecr.aws/lambda/python:3.10 as builder
FROM public.ecr.aws/lambda/python:3.12

# Build arguments - consolidated at top
ARG HADOOP_VERSION=3.2.4
ARG AWS_SDK_VERSION=1.11.901
ARG PYSPARK_VERSION=3.3.0
ARG HADOOP_VERSION=3.3.6
ARG AWS_SDK_VERSION=1.12.261
ARG PYSPARK_VERSION=3.5.0
ARG FRAMEWORK
ARG DELTA_FRAMEWORK_VERSION=2.2.0
ARG HUDI_FRAMEWORK_VERSION=0.12.2
ARG ICEBERG_FRAMEWORK_VERSION=3.3_2.12
ARG ICEBERG_FRAMEWORK_SUB_VERSION=1.0.0
ARG DEEQU_FRAMEWORK_VERSION=2.0.3-spark-3.3
ARG AWS_REGION

# Single consolidated RUN layer for all build operations
ENV AWS_REGION=${AWS_REGION}

# System updates and package installation
COPY download_jars.sh /tmp/
RUN set -ex && \
# System updates and package installation
yum update -y && \
yum install -y java-11-amazon-corretto-headless wget unzip && \
yum clean all && \
rm -rf /var/cache/yum && \
# Python package installation
dnf update -y && \
dnf install -y wget unzip java-11-amazon-corretto-headless python3-setuptools && \
dnf clean all && \
rm -rf /var/cache/dnf && \
pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir setuptools wheel && \
pip install --no-cache-dir pyspark==$PYSPARK_VERSION boto3 && \
# Conditional DEEQU installation
(echo "$FRAMEWORK" | grep -q "DEEQU" && \
pip install --no-cache-dir --no-deps pydeequ && \
pip install --no-cache-dir pandas || \
pip install --no-cache-dir pandas && \
echo "DEEQU found in FRAMEWORK" || \
echo "DEEQU not found in FRAMEWORK") && \
# JAR download and cleanup
chmod +x /tmp/download_jars.sh && \
SPARK_HOME="/var/lang/lib/python3.10/site-packages/pyspark" && \
SPARK_HOME="/var/lang/lib/python3.12/site-packages/pyspark" && \
/tmp/download_jars.sh $FRAMEWORK $SPARK_HOME $HADOOP_VERSION $AWS_SDK_VERSION $DELTA_FRAMEWORK_VERSION $HUDI_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_SUB_VERSION $DEEQU_FRAMEWORK_VERSION && \
rm -rf /tmp/* /var/tmp/*

# Final optimized stage
FROM public.ecr.aws/lambda/python:3.10
# Copy requirements.txt if present and install
COPY requirements.txt ${LAMBDA_TASK_ROOT}/
RUN if [ -f "${LAMBDA_TASK_ROOT}/requirements.txt" ]; then pip install --no-cache-dir -r ${LAMBDA_TASK_ROOT}/requirements.txt; fi

# Single consolidated RUN layer for runtime setup
COPY --from=builder /var/lang/lib/python3.10/site-packages/ /var/lang/lib/python3.10/site-packages/
COPY --from=builder /var/runtime/ /var/runtime/
# Copy application files
COPY libs/glue_functions /home/glue_functions
COPY spark-class /var/lang/lib/python3.10/site-packages/pyspark/bin/
COPY spark-class /var/lang/lib/python3.12/site-packages/pyspark/bin/
COPY sparkLambdaHandler.py ${LAMBDA_TASK_ROOT}
# Optionally copy log4j.properties if present
RUN if [ -f log4j.properties ]; then cp log4j.properties /var/lang/lib/python3.12/site-packages/pyspark/conf/; fi

RUN set -ex && \
# Install runtime Java and cleanup
yum update -y && \
yum install -y java-11-amazon-corretto-headless && \
yum clean all && \
rm -rf /var/cache/yum /tmp/* /var/tmp/* && \
# Set permissions in single operation
chmod -R 755 /home/glue_functions /var/lang/lib/python3.10/site-packages/pyspark
dnf update -y && \
dnf install -y java-11-amazon-corretto-headless && \
dnf clean all && \
rm -rf /var/cache/dnf /tmp/* /var/tmp/* && \
chmod -R 755 /home/glue_functions /var/lang/lib/python3.12/site-packages/pyspark && \
# Diagnostics for spark-class
ls -la /var/lang/lib/python3.12/site-packages/pyspark/bin/ || echo "Spark bin directory not found" && \
if [ -f "/var/lang/lib/python3.12/site-packages/pyspark/bin/spark-class" ]; then echo "Custom spark-class after copying:"; cat /var/lang/lib/python3.12/site-packages/pyspark/bin/spark-class; else echo "Custom spark-class not found"; fi && \
ln -sf /var/lang/lib/python3.12/site-packages/pyspark/bin/spark-class /usr/local/bin/spark-class && \
ls -la /usr/local/bin/spark-class

# Consolidated environment variables
ENV SPARK_HOME="/var/lang/lib/python3.10/site-packages/pyspark" \
SPARK_VERSION=3.3.0 \
ENV SPARK_HOME="/var/lang/lib/python3.12/site-packages/pyspark" \
SPARK_VERSION=3.5.0 \
JAVA_HOME="/usr/lib/jvm/java-11-amazon-corretto" \
PATH="$PATH:/var/lang/lib/python3.10/site-packages/pyspark/bin:/var/lang/lib/python3.10/site-packages/pyspark/sbin:/usr/lib/jvm/java-11-amazon-corretto/bin" \
PYTHONPATH="/var/lang/lib/python3.10/site-packages/pyspark/python:/var/lang/lib/python3.10/site-packages/pyspark/python/lib/py4j-0.10.9-src.zip:/home/glue_functions" \
PATH="$PATH:/var/lang/lib/python3.12/site-packages/pyspark/bin:/var/lang/lib/python3.12/site-packages/pyspark/sbin:/usr/lib/jvm/java-11-amazon-corretto/bin" \
PYTHONPATH="/var/lang/lib/python3.12/site-packages/pyspark/python:/var/lang/lib/python3.12/site-packages/pyspark/python/lib/py4j-0.10.9.7-src.zip:/home/glue_functions" \
INPUT_PATH="" \
OUTPUT_PATH="" \
AWS_ACCESS_KEY_ID="" \
AWS_SECRET_ACCESS_KEY="" \
AWS_REGION="" \
AWS_SESSION_TOKEN="" \
CUSTOM_SQL=""

CMD [ "/var/task/sparkLambdaHandler.lambda_handler" ]
RUN java -version

RUN chmod 755 ${LAMBDA_TASK_ROOT}/sparkLambdaHandler.py

CMD [ "sparkLambdaHandler.lambda_handler" ]
21 changes: 11 additions & 10 deletions aws-ecr-repository-push.sh
100644 → 100755
@@ -4,11 +4,11 @@

echo "Starting the PUSH to AWS ECR...."



if [ $# -eq 0 ]
then
echo "Please provide the image name"
echo "Usage: $0 <image-name>"
exit 1
fi

Dockerimage=$1
@@ -18,13 +18,13 @@ aws_account=$(aws sts get-caller-identity --query Account --output text)

if [ $? -ne 0 ]
then
echo "Failed to get AWS account number. Please check your AWS credentials."
exit 255
fi


# Get the region defined in the current configuration (default to us-west-2 if none defined)
# Get the region defined in the current configuration (default to us-east-1 if none defined)
aws_region=$(aws configure get region)
aws_region=${region:-us-east-1}
aws_region=${aws_region:-us-east-1}
reponame="${aws_account}.dkr.ecr.${aws_region}.amazonaws.com/${Dockerimage}:latest"

# Creates a repo if it does not exist
@@ -36,17 +36,18 @@ then
aws ecr create-repository --repository-name "${Dockerimage}" > /dev/null
fi

# Get the AWS ECr login
aws ecr get-login-password --region "${aws_region}" | docker login --username AWS --password-stdin "${aws_account}".dkr.ecr."${aws_region}".amazonaws.com
# Get the AWS ECR login to pull base image from public ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws

# Build the docker image and push to ECR
echo "Building the docker image"
docker build -t ${Dockerimage} .

docker build -t ${Dockerimage} .

echo "Tagging the Docker image"
docker tag ${Dockerimage} ${reponame}

# Get the AWS ECR login to push the image to private ECR
aws ecr get-login-password --region "${aws_region}" | docker login --username AWS --password-stdin "${aws_account}".dkr.ecr."${aws_region}".amazonaws.com

echo "Pushing the Docket image to AWS ECR"
echo "Pushing the Docker image to AWS ECR"
docker push ${reponame}
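Usage is unchanged by these fixes: the script takes a single image-name argument and relies on already-configured AWS CLI credentials with ECR permissions (the repository name below is illustrative):

# Sketch: build, tag, and push the image to a private ECR repo named sparkonlambda
./aws-ecr-repository-push.sh sparkonlambda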
107 changes: 104 additions & 3 deletions download_jars.sh
@@ -13,11 +13,112 @@ mkdir $SPARK_HOME/conf
echo "SPARK_LOCAL_IP=127.0.0.1" > $SPARK_HOME/conf/spark-env.sh
echo "JAVA_HOME=/usr/lib/jvm/$(ls /usr/lib/jvm |grep java)/jre" >> $SPARK_HOME/conf/spark-env.sh

# Download core S3 filesystem JARs with updated versions
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar -P ${SPARK_HOME}/jars/

# Additional JARs for better S3 compatibility
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/${HADOOP_VERSION}/hadoop-common-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/${HADOOP_VERSION}/hadoop-client-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-api/${HADOOP_VERSION}/hadoop-client-api-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-runtime/${HADOOP_VERSION}/hadoop-client-runtime-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/

# Add Hadoop statistics and fs libraries to fix NoSuchMethodError
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-annotations/${HADOOP_VERSION}/hadoop-annotations-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-auth/${HADOOP_VERSION}/hadoop-auth-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/thirdparty/hadoop-shaded-guava/${HADOOP_VERSION}/hadoop-shaded-guava-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/thirdparty/hadoop-shaded-protobuf_3_7/${HADOOP_VERSION}/hadoop-shaded-protobuf_3_7-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-mapreduce-client-core/${HADOOP_VERSION}/hadoop-mapreduce-client-core-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-mapreduce-client-common/${HADOOP_VERSION}/hadoop-mapreduce-client-common-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-hdfs-client/${HADOOP_VERSION}/hadoop-hdfs-client-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/

# Add additional Hadoop libraries to fix S3A filesystem issues
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-api/${HADOOP_VERSION}/hadoop-client-api-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-runtime/${HADOOP_VERSION}/hadoop-client-runtime-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/

# Fix for IOStatisticsBinding NoSuchMethodError
# Download specific version that contains the required IOStatisticsBinding class
FIXED_VERSION="3.3.4"
echo "Downloading fixed Hadoop libraries version $FIXED_VERSION"
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/$FIXED_VERSION/hadoop-common-$FIXED_VERSION.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/$FIXED_VERSION/hadoop-aws-$FIXED_VERSION.jar -P ${SPARK_HOME}/jars/

# Download specific statistics implementation jars
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/thirdparty/hadoop-shaded-guava/1.1.1/hadoop-shaded-guava-1.1.1.jar -P ${SPARK_HOME}/jars/ || echo "hadoop-shaded-guava not found"

# Download specific fs-statistics JAR that contains IOStatisticsBinding
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/$FIXED_VERSION/hadoop-common-$FIXED_VERSION-tests.jar -P ${SPARK_HOME}/jars/ || echo "hadoop-common-tests not found"

# Download additional S3A implementation classes
wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/$FIXED_VERSION/hadoop-aws-$FIXED_VERSION-tests.jar -P ${SPARK_HOME}/jars/ || echo "hadoop-aws-tests not found"

# Copy the existing log4j.properties file to the Spark conf directory
echo "Copying existing log4j.properties file to Spark conf directory"
cp /opt/spark-on-lambda-handler/log4j.properties ${SPARK_HOME}/conf/

# Create a core-site.xml file with S3A configurations
echo "Creating core-site.xml file"
cat > ${SPARK_HOME}/conf/core-site.xml << EOL
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>fs.s3a.impl</name>
<value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
</property>
<property>
<name>fs.s3a.aws.credentials.provider</name>
<value>com.amazonaws.auth.DefaultAWSCredentialsProviderChain</value>
</property>
<property>
<name>fs.s3a.connection.maximum</name>
<value>100</value>
</property>
<property>
<name>fs.s3a.experimental.input.fadvise</name>
<value>sequential</value>
</property>
<property>
<name>fs.s3a.impl.disable.cache</name>
<value>true</value>
</property>
<property>
<name>fs.s3a.path.style.access</name>
<value>true</value>
</property>
<property>
<name>fs.s3a.committer.name</name>
<value>directory</value>
</property>
<property>
<name>fs.s3a.committer.staging.conflict-mode</name>
<value>append</value>
</property>
<property>
<name>fs.s3a.committer.staging.unique-filenames</name>
<value>true</value>
</property>
<property>
<name>fs.s3a.fast.upload</name>
<value>true</value>
</property>
<property>
<name>mapreduce.fileoutputcommitter.algorithm.version</name>
<value>2</value>
</property>
</configuration>
EOL

# Add AWS SDK v2 components for better S3 compatibility
wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.20.56/s3-2.20.56.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/utils/2.20.56/utils-2.20.56.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/auth/2.20.56/auth-2.20.56.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/http-client-spi/2.20.56/http-client-spi-2.20.56.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/regions/2.20.56/regions-2.20.56.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/sdk-core/2.20.56/sdk-core-2.20.56.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/apache-client/2.20.56/apache-client-2.20.56.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/aws-core/2.20.56/aws-core-2.20.56.jar -P ${SPARK_HOME}/jars/

wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar -P ${SPARK_HOME}/jars/
# jar files needed to connect to Snowflake
#wget -q https://repo1.maven.org/maven2/net/snowflake/spark-snowflake_2.12/2.12.0-spark_3.3/spark-snowflake_2.12-2.12.0-spark_3.3.jar -P ${SPARK_HOME}/jars/
#wget -q https://repo1.maven.org/maven2/net/snowflake/snowflake-jdbc/3.13.33/snowflake-jdbc-3.13.33.jar -P ${SPARK_HOME}/jars/
@@ -61,4 +162,4 @@ echo $fw
echo "Unknown framework: $fw"
;;
esac
done
done
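As a quick sanity check on the new jar set and the generated core-site.xml, the image contents can be inspected after a build (a sketch assuming the illustrative sparkonlambda-delta tag from above; SPARK_HOME is set by the Dockerfile):

# Sketch: list the S3/AWS jars and show the generated S3A configuration inside the image
docker run --rm --entrypoint /bin/sh sparkonlambda-delta -c \
  'ls $SPARK_HOME/jars | grep -E "hadoop-aws|aws-java-sdk|awssdk" ; cat $SPARK_HOME/conf/core-site.xml'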
47 changes: 47 additions & 0 deletions log4j.properties
@@ -0,0 +1,47 @@
# Set root logger level to WARN and its only appender to console
log4j.rootLogger=WARN, console

# Console appender configuration
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Set the default spark-shell log level to WARN. When running the spark-shell, the
# log level for this class is used to overwrite the root logger's log level, so that
# the user can have different defaults for the shell and regular Spark apps.
log4j.logger.org.apache.spark.repl.Main=WARN

# Settings to quiet third party logs that are too verbose
log4j.logger.org.spark_project.jetty=WARN
log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=WARN
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=WARN
log4j.logger.org.apache.parquet=ERROR
log4j.logger.parquet=ERROR

# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR

# Set the log level for your application's package
log4j.logger.your.application.package=DEBUG

# Additional settings to reduce logging
log4j.logger.org.apache.spark=WARN
log4j.logger.org.apache.hadoop=WARN
log4j.logger.org.apache.kafka=WARN
log4j.logger.org.apache.zookeeper=WARN
log4j.logger.org.apache.hive=WARN

# Hadoop metrics configuration to reduce warnings
log4j.logger.org.apache.hadoop.metrics2=ERROR
log4j.logger.org.apache.hadoop.metrics2.impl.MetricsConfig=ERROR
log4j.logger.org.apache.hadoop.metrics2.impl.MetricsSystemImpl=ERROR

# S3 filesystem specific logging
log4j.logger.org.apache.hadoop.fs.s3a=WARN
log4j.logger.org.apache.hadoop.fs.s3a.impl=WARN

# AWS SDK logging
log4j.logger.com.amazonaws=WARN
log4j.logger.com.amazonaws.services.s3=WARN
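For troubleshooting S3A access during development, the committed WARN defaults can be flipped to DEBUG for the relevant loggers before building the image (a sketch using GNU sed; on macOS use sed -i ''):

# Sketch: raise S3A and AWS SDK logging to DEBUG in a local working copy only
sed -i 's/^log4j.logger.org.apache.hadoop.fs.s3a=WARN$/log4j.logger.org.apache.hadoop.fs.s3a=DEBUG/' log4j.properties
sed -i 's/^log4j.logger.com.amazonaws=WARN$/log4j.logger.com.amazonaws=DEBUG/' log4j.properties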
8 changes: 8 additions & 0 deletions requirements.txt
@@ -0,0 +1,8 @@
boto3
scikit-learn
pandas
numpy
seaborn
matplotlib
s3fs
openpyxl