diff --git a/Dockerfile b/Dockerfile
index 82172f9..468e6ba 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,70 +1,75 @@
-# Multi-stage build for optimal size
-FROM public.ecr.aws/lambda/python:3.10 as builder
+FROM public.ecr.aws/lambda/python:3.12
 
 # Build arguments - consolidated at top
-ARG HADOOP_VERSION=3.2.4
-ARG AWS_SDK_VERSION=1.11.901
-ARG PYSPARK_VERSION=3.3.0
+ARG HADOOP_VERSION=3.3.6
+ARG AWS_SDK_VERSION=1.12.261
+ARG PYSPARK_VERSION=3.5.0
 ARG FRAMEWORK
 ARG DELTA_FRAMEWORK_VERSION=2.2.0
 ARG HUDI_FRAMEWORK_VERSION=0.12.2
 ARG ICEBERG_FRAMEWORK_VERSION=3.3_2.12
 ARG ICEBERG_FRAMEWORK_SUB_VERSION=1.0.0
 ARG DEEQU_FRAMEWORK_VERSION=2.0.3-spark-3.3
+ARG AWS_REGION
 
-# Single consolidated RUN layer for all build operations
+ENV AWS_REGION=${AWS_REGION}
+
+# System updates and package installation
 COPY download_jars.sh /tmp/
 RUN set -ex && \
-    # System updates and package installation
-    yum update -y && \
-    yum install -y java-11-amazon-corretto-headless wget unzip && \
-    yum clean all && \
-    rm -rf /var/cache/yum && \
-    # Python package installation
+    dnf update -y && \
+    dnf install -y wget unzip java-11-amazon-corretto-headless python3-setuptools && \
+    dnf clean all && \
+    rm -rf /var/cache/dnf && \
     pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir setuptools wheel && \
     pip install --no-cache-dir pyspark==$PYSPARK_VERSION boto3 && \
     # Conditional DEEQU installation
     (echo "$FRAMEWORK" | grep -q "DEEQU" && \
     pip install --no-cache-dir --no-deps pydeequ && \
-    pip install --no-cache-dir pandas || \
+    pip install --no-cache-dir pandas && \
+    echo "DEEQU found in FRAMEWORK" || \
     echo "DEEQU not found in FRAMEWORK") && \
     # JAR download and cleanup
    chmod +x /tmp/download_jars.sh && \
-    SPARK_HOME="/var/lang/lib/python3.10/site-packages/pyspark" && \
+    SPARK_HOME="/var/lang/lib/python3.12/site-packages/pyspark" && \
     /tmp/download_jars.sh $FRAMEWORK $SPARK_HOME $HADOOP_VERSION $AWS_SDK_VERSION $DELTA_FRAMEWORK_VERSION $HUDI_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_SUB_VERSION $DEEQU_FRAMEWORK_VERSION && \
     rm -rf /tmp/* /var/tmp/*
 
-# Final optimized stage
-FROM public.ecr.aws/lambda/python:3.10
+# Copy requirements.txt if present and install
+COPY requirements.txt ${LAMBDA_TASK_ROOT}/
+RUN if [ -f "${LAMBDA_TASK_ROOT}/requirements.txt" ]; then pip install --no-cache-dir -r ${LAMBDA_TASK_ROOT}/requirements.txt; fi
 
-# Single consolidated RUN layer for runtime setup
-COPY --from=builder /var/lang/lib/python3.10/site-packages/ /var/lang/lib/python3.10/site-packages/
-COPY --from=builder /var/runtime/ /var/runtime/
+# Copy application files
 COPY libs/glue_functions /home/glue_functions
-COPY spark-class /var/lang/lib/python3.10/site-packages/pyspark/bin/
+COPY spark-class /var/lang/lib/python3.12/site-packages/pyspark/bin/
 COPY sparkLambdaHandler.py ${LAMBDA_TASK_ROOT}
+# Optionally copy log4j.properties if present
+RUN if [ -f log4j.properties ]; then cp log4j.properties /var/lang/lib/python3.12/site-packages/pyspark/conf/; fi
 
 RUN set -ex && \
-    # Install runtime Java and cleanup
-    yum update -y && \
-    yum install -y java-11-amazon-corretto-headless && \
-    yum clean all && \
-    rm -rf /var/cache/yum /tmp/* /var/tmp/* && \
-    # Set permissions in single operation
-    chmod -R 755 /home/glue_functions /var/lang/lib/python3.10/site-packages/pyspark
+    dnf update -y && \
+    dnf install -y java-11-amazon-corretto-headless && \
+    dnf clean all && \
+    rm -rf /var/cache/dnf /tmp/* /var/tmp/* && \
+    chmod -R 755 /home/glue_functions /var/lang/lib/python3.12/site-packages/pyspark && \
+    # Diagnostics for spark-class
+    ls -la /var/lang/lib/python3.12/site-packages/pyspark/bin/ || echo "Spark bin directory not found" && \
+    if [ -f "/var/lang/lib/python3.12/site-packages/pyspark/bin/spark-class" ]; then echo "Custom spark-class after copying:"; cat /var/lang/lib/python3.12/site-packages/pyspark/bin/spark-class; else echo "Custom spark-class not found"; fi && \
+    ln -sf /var/lang/lib/python3.12/site-packages/pyspark/bin/spark-class /usr/local/bin/spark-class && \
+    ls -la /usr/local/bin/spark-class
 
-# Consolidated environment variables
-ENV SPARK_HOME="/var/lang/lib/python3.10/site-packages/pyspark" \
-    SPARK_VERSION=3.3.0 \
+ENV SPARK_HOME="/var/lang/lib/python3.12/site-packages/pyspark" \
+    SPARK_VERSION=3.5.0 \
     JAVA_HOME="/usr/lib/jvm/java-11-amazon-corretto" \
-    PATH="$PATH:/var/lang/lib/python3.10/site-packages/pyspark/bin:/var/lang/lib/python3.10/site-packages/pyspark/sbin:/usr/lib/jvm/java-11-amazon-corretto/bin" \
-    PYTHONPATH="/var/lang/lib/python3.10/site-packages/pyspark/python:/var/lang/lib/python3.10/site-packages/pyspark/python/lib/py4j-0.10.9-src.zip:/home/glue_functions" \
+    PATH="$PATH:/var/lang/lib/python3.12/site-packages/pyspark/bin:/var/lang/lib/python3.12/site-packages/pyspark/sbin:/usr/lib/jvm/java-11-amazon-corretto/bin" \
+    PYTHONPATH="/var/lang/lib/python3.12/site-packages/pyspark/python:/var/lang/lib/python3.12/site-packages/pyspark/python/lib/py4j-0.10.9.7-src.zip:/home/glue_functions" \
     INPUT_PATH="" \
     OUTPUT_PATH="" \
-    AWS_ACCESS_KEY_ID="" \
-    AWS_SECRET_ACCESS_KEY="" \
-    AWS_REGION="" \
-    AWS_SESSION_TOKEN="" \
     CUSTOM_SQL=""
 
-CMD [ "/var/task/sparkLambdaHandler.lambda_handler" ]
+RUN java -version
+
+RUN chmod 755 ${LAMBDA_TASK_ROOT}/sparkLambdaHandler.py
+
+CMD [ "sparkLambdaHandler.lambda_handler" ]
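
Note on local testing: the AWS Lambda Python base image bundles the Runtime Interface Emulator, so the rebuilt image can be exercised without deploying. A minimal sketch, assuming an illustrative image tag "sparkonlambda" and an example FRAMEWORK value; the event payload is only a placeholder, since its real shape is defined by sparkLambdaHandler.py:

    # Build with an example framework selection (the FRAMEWORK value is illustrative)
    docker build --build-arg FRAMEWORK=DELTA -t sparkonlambda .

    # Start the container; the bundled Runtime Interface Emulator listens on 8080
    docker run --rm -p 9000:8080 sparkonlambda

    # In a second terminal, send a placeholder event to the local endpoint
    curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{}'
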
echo "Tagging the Docker image" docker tag ${Dockerimage} ${reponame} +# Get the AWS ECR login to push the image to private ECR +aws ecr get-login-password --region "${aws_region}" | docker login --username AWS --password-stdin "${aws_account}".dkr.ecr."${aws_region}".amazonaws.com -echo "Pushing the Docket image to AWS ECR" +echo "Pushing the Docker image to AWS ECR" docker push ${reponame} \ No newline at end of file diff --git a/download_jars.sh b/download_jars.sh index acf0eee..9690c39 100644 --- a/download_jars.sh +++ b/download_jars.sh @@ -13,11 +13,112 @@ mkdir $SPARK_HOME/conf echo "SPARK_LOCAL_IP=127.0.0.1" > $SPARK_HOME/conf/spark-env.sh echo "JAVA_HOME=/usr/lib/jvm/$(ls /usr/lib/jvm |grep java)/jre" >> $SPARK_HOME/conf/spark-env.sh +# Download core S3 filesystem JARs with updated versions +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar -P ${SPARK_HOME}/jars/ +# Additional JARs for better S3 compatibility +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/${HADOOP_VERSION}/hadoop-common-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/${HADOOP_VERSION}/hadoop-client-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-api/${HADOOP_VERSION}/hadoop-client-api-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-runtime/${HADOOP_VERSION}/hadoop-client-runtime-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ +# Add Hadoop statistics and fs libraries to fix NoSuchMethodError +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-annotations/${HADOOP_VERSION}/hadoop-annotations-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-auth/${HADOOP_VERSION}/hadoop-auth-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/thirdparty/hadoop-shaded-guava/${HADOOP_VERSION}/hadoop-shaded-guava-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/thirdparty/hadoop-shaded-protobuf_3_7/${HADOOP_VERSION}/hadoop-shaded-protobuf_3_7-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-mapreduce-client-core/${HADOOP_VERSION}/hadoop-mapreduce-client-core-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-mapreduce-client-common/${HADOOP_VERSION}/hadoop-mapreduce-client-common-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-hdfs-client/${HADOOP_VERSION}/hadoop-hdfs-client-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ + +# Add additional Hadoop libraries to fix S3A filesystem issues +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-api/${HADOOP_VERSION}/hadoop-client-api-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-runtime/${HADOOP_VERSION}/hadoop-client-runtime-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ + +# Fix for IOStatisticsBinding NoSuchMethodError +# Download specific version that contains the required IOStatisticsBinding class 
+FIXED_VERSION="3.3.4" +echo "Downloading fixed Hadoop libraries version $FIXED_VERSION" +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/$FIXED_VERSION/hadoop-common-$FIXED_VERSION.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/$FIXED_VERSION/hadoop-aws-$FIXED_VERSION.jar -P ${SPARK_HOME}/jars/ + +# Download specific statistics implementation jars +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/thirdparty/hadoop-shaded-guava/1.1.1/hadoop-shaded-guava-1.1.1.jar -P ${SPARK_HOME}/jars/ || echo "hadoop-shaded-guava not found" + +# Download specific fs-statistics JAR that contains IOStatisticsBinding +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/$FIXED_VERSION/hadoop-common-$FIXED_VERSION-tests.jar -P ${SPARK_HOME}/jars/ || echo "hadoop-common-tests not found" + +# Download additional S3A implementation classes +wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/$FIXED_VERSION/hadoop-aws-$FIXED_VERSION-tests.jar -P ${SPARK_HOME}/jars/ || echo "hadoop-aws-tests not found" + +# Copy the existing log4j.properties file to the Spark conf directory +echo "Copying existing log4j.properties file to Spark conf directory" +cp /opt/spark-on-lambda-handler/log4j.properties ${SPARK_HOME}/conf/ + +# Create a core-site.xml file with S3A configurations +echo "Creating core-site.xml file" +cat > ${SPARK_HOME}/conf/core-site.xml << EOL + + + + + fs.s3a.impl + org.apache.hadoop.fs.s3a.S3AFileSystem + + + fs.s3a.aws.credentials.provider + com.amazonaws.auth.DefaultAWSCredentialsProviderChain + + + fs.s3a.connection.maximum + 100 + + + fs.s3a.experimental.input.fadvise + sequential + + + fs.s3a.impl.disable.cache + true + + + fs.s3a.path.style.access + true + + + fs.s3a.committer.name + directory + + + fs.s3a.committer.staging.conflict-mode + append + + + fs.s3a.committer.staging.unique-filenames + true + + + fs.s3a.fast.upload + true + + + mapreduce.fileoutputcommitter.algorithm.version + 2 + + +EOL + +# Add AWS SDK v2components for better S3 compatibility +wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.20.56/s3-2.20.56.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/utils/2.20.56/utils-2.20.56.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/auth/2.20.56/auth-2.20.56.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/http-client-spi/2.20.56/http-client-spi-2.20.56.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/regions/2.20.56/regions-2.20.56.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/sdk-core/2.20.56/sdk-core-2.20.56.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/apache-client/2.20.56/apache-client-2.20.56.jar -P ${SPARK_HOME}/jars/ +wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/aws-core/2.20.56/aws-core-2.20.56.jar -P ${SPARK_HOME}/jars/ -wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/ -wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar -P ${SPARK_HOME}/jars/ # jar files needed to conncet to Snowflake #wget -q 
diff --git a/log4j.properties b/log4j.properties
new file mode 100644
index 0000000..1bcb82c
--- /dev/null
+++ b/log4j.properties
@@ -0,0 +1,47 @@
+# Set root logger level to WARN and its only appender to console
+log4j.rootLogger=WARN, console
+
+# Console appender configuration
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# Set the default spark-shell log level to WARN. When running the spark-shell, the
+# log level for this class is used to overwrite the root logger's log level, so that
+# the user can have different defaults for the shell and regular Spark apps.
+log4j.logger.org.apache.spark.repl.Main=WARN
+
+# Settings to quiet third party logs that are too verbose
+log4j.logger.org.spark_project.jetty=WARN
+log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
+log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=WARN
+log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=WARN
+log4j.logger.org.apache.parquet=ERROR
+log4j.logger.parquet=ERROR
+
+# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
+log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
+log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
+
+# Set the log level for your application's package
+log4j.logger.your.application.package=DEBUG
+
+# Additional settings to reduce logging
+log4j.logger.org.apache.spark=WARN
+log4j.logger.org.apache.hadoop=WARN
+log4j.logger.org.apache.kafka=WARN
+log4j.logger.org.apache.zookeeper=WARN
+log4j.logger.org.apache.hive=WARN
+
+# Hadoop metrics configuration to reduce warnings
+log4j.logger.org.apache.hadoop.metrics2=ERROR
+log4j.logger.org.apache.hadoop.metrics2.impl.MetricsConfig=ERROR
+log4j.logger.org.apache.hadoop.metrics2.impl.MetricsSystemImpl=ERROR
+
+# S3 filesystem specific logging
+log4j.logger.org.apache.hadoop.fs.s3a=WARN
+log4j.logger.org.apache.hadoop.fs.s3a.impl=WARN
+
+# AWS SDK logging
+log4j.logger.com.amazonaws=WARN
+log4j.logger.com.amazonaws.services.s3=WARN
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c24c356
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+boto3
+scikit-learn
+pandas
+numpy
+seaborn
+matplotlib
+s3fs
+openpyxl
\ No newline at end of file
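
The extra Python dependencies in requirements.txt (scikit-learn, seaborn, matplotlib, s3fs, openpyxl) add noticeably to image size and cold-start time, so a quick import check inside the built image can catch packaging problems early. A sketch, reusing the illustrative "sparkonlambda" tag; /var/lang/bin/python is an assumption about where the Lambda base image keeps its interpreter:

    # Override the Lambda entrypoint and import every package from requirements.txt
    docker run --rm --entrypoint /var/lang/bin/python sparkonlambda \
      -c "import boto3, sklearn, pandas, numpy, seaborn, matplotlib, s3fs, openpyxl; print('deps OK')"
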
diff --git a/spark-class b/spark-class
index 0da74b3..53cb89a 100644
--- a/spark-class
+++ b/spark-class
@@ -1,6 +1,51 @@
 #!/usr/bin/env bash
+# Debug: Print environment variables
+echo "JAVA_HOME: $JAVA_HOME"
+echo "PATH: $PATH"
 
-java_path=$(ls /usr/lib/jvm |grep java)
-exec /usr/lib/jvm/$java_path/jre/bin/java -cp /var/lang/lib/python3.8/site-packages/pyspark/conf/:/var/lang/lib/python3.8/site-packages/pyspark/jars/* -Xmx1g "$@"
+# Hard-code the Java executable path for maximum reliability
+JAVA_EXECUTABLE="/usr/bin/java"
+
+# Fallback options if the hard-coded path doesn't work
+if [ ! -f "$JAVA_EXECUTABLE" ]; then
+    echo "Primary Java path not found, trying alternatives..."
+
+    # Try using JAVA_HOME
+    if [ -n "$JAVA_HOME" ] && [ -f "$JAVA_HOME/bin/java" ]; then
+        JAVA_EXECUTABLE="$JAVA_HOME/bin/java"
+        echo "Using Java from JAVA_HOME: $JAVA_EXECUTABLE"
+    # Try Java 11 Amazon Corretto path
+    elif [ -f "/usr/lib/jvm/java-11-amazon-corretto/bin/java" ]; then
+        JAVA_EXECUTABLE="/usr/lib/jvm/java-11-amazon-corretto/bin/java"
+        echo "Using Java from Amazon Corretto path: $JAVA_EXECUTABLE"
+    # Try common JRE path (for compatibility)
+    elif [ -f "/usr/lib/jvm/jre/bin/java" ]; then
+        JAVA_EXECUTABLE="/usr/lib/jvm/jre/bin/java"
+        echo "Using Java from JRE path: $JAVA_EXECUTABLE"
+    else
+        # Last resort: search for any java executable
+        echo "Searching for Java executable..."
+        java_path=$(find /usr -name java -type f -executable 2>/dev/null | head -1)
+        if [ -n "$java_path" ]; then
+            JAVA_EXECUTABLE="$java_path"
+            echo "Found Java at: $java_path"
+        else
+            echo "Error: Java executable not found" >&2
+            exit 1
+        fi
+    fi
+fi
+
+# Verify Java executable exists and is executable
+if [ ! -f "$JAVA_EXECUTABLE" ] || [ ! -x "$JAVA_EXECUTABLE" ]; then
+    echo "Error: Java executable not found or not executable at $JAVA_EXECUTABLE" >&2
+    exit 1
+fi
+
+# Print Java version for debugging
+echo "Using Java executable: $JAVA_EXECUTABLE"
+$JAVA_EXECUTABLE -version
+
+# Use Python 3.12 paths for PySpark
+exec "$JAVA_EXECUTABLE" -cp /var/lang/lib/python3.12/site-packages/pyspark/conf/:/var/lang/lib/python3.12/site-packages/pyspark/jars/* -Xmx1g "$@"
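
The rewritten spark-class ultimately just resolves a JVM and execs java with the PySpark conf/ and jars/ directories on the classpath, so it can be smoke-tested on its own before running a full Lambda invocation. A sketch using the /usr/local/bin/spark-class symlink created in the Dockerfile and the illustrative "sparkonlambda" tag; org.apache.spark.deploy.SparkSubmit --version is chosen here only as a harmless class to launch:

    # Should print the resolved Java executable, the Java version, and the Spark version banner
    docker run --rm --entrypoint /bin/sh sparkonlambda \
      -c "/usr/local/bin/spark-class org.apache.spark.deploy.SparkSubmit --version"
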