From 56a6c15d29a250ea58410171c8f4ec6d02c5ecc5 Mon Sep 17 00:00:00 2001 From: Emerson Antony Date: Thu, 19 Oct 2023 21:27:34 +1100 Subject: [PATCH 1/6] Lambda tests --- sparkLambdaHandler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparkLambdaHandler.py b/sparkLambdaHandler.py index 3200331..a1949ee 100644 --- a/sparkLambdaHandler.py +++ b/sparkLambdaHandler.py @@ -41,7 +41,7 @@ def spark_submit(s3_bucket_script: str,input_script: str, event: dict)-> None: # Run the spark-submit command on the local copy of teh script try: logger.info(f'Spark-Submitting the Spark script {input_script} from {s3_bucket_script}') - subprocess.run(["spark-submit", "/tmp/spark_script.py", "--event", json.dumps(event)], check=True) + subprocess.run(["spark-submit", "/tmp/spark_script.py", "--event", json.dumps(event)], check=True, env=event) except Exception as e : logger.error(f'Error Spark-Submit with exception: {e}') raise e From 3cddc264f2ea0da2e530e67441fa11c379561af2 Mon Sep 17 00:00:00 2001 From: Emerson Antony Date: Thu, 19 Oct 2023 21:39:18 +1100 Subject: [PATCH 2/6] Updated fix for events --- sparkLambdaHandler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparkLambdaHandler.py b/sparkLambdaHandler.py index a1949ee..944fbc9 100644 --- a/sparkLambdaHandler.py +++ b/sparkLambdaHandler.py @@ -41,7 +41,7 @@ def spark_submit(s3_bucket_script: str,input_script: str, event: dict)-> None: # Run the spark-submit command on the local copy of teh script try: logger.info(f'Spark-Submitting the Spark script {input_script} from {s3_bucket_script}') - subprocess.run(["spark-submit", "/tmp/spark_script.py", "--event", json.dumps(event)], check=True, env=event) + subprocess.run(["spark-submit", "/tmp/spark_script.py", "--event", json.dumps(event)], check=True, env=event ) except Exception as e : logger.error(f'Error Spark-Submit with exception: {e}') raise e From 3b08469cb4035ddbe93109c1409769607350d843 Mon Sep 17 00:00:00 2001 From: Emerson Antony 
Date: Thu, 19 Oct 2023 21:41:17 +1100 Subject: [PATCH 3/6] other fixes --- cloudformation/sam-imagebuilder.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/cloudformation/sam-imagebuilder.yaml b/cloudformation/sam-imagebuilder.yaml index 9f028f6..c7543c6 100644 --- a/cloudformation/sam-imagebuilder.yaml +++ b/cloudformation/sam-imagebuilder.yaml @@ -133,6 +133,7 @@ Resources: - echo Downloading requires source files from git - git clone https://github.com/aws-samples/spark-on-aws-lambda.git - cd spark-on-aws-lambda + - git checkout fixes - echo Build started on `date` - echo Building the Docker image... - docker build --build-arg FRAMEWORK=$FRAMEWORK -t $IMAGE_REPO_NAME:latest . From 06636d264e3998c93028b26e3448cb698eefd0d8 Mon Sep 17 00:00:00 2001 From: Emerson Antony Date: Thu, 19 Oct 2023 21:56:15 +1100 Subject: [PATCH 4/6] latest tests --- cloudformation/sam-imagebuilder.yaml | 2 +- sparkLambdaHandler.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cloudformation/sam-imagebuilder.yaml b/cloudformation/sam-imagebuilder.yaml index c7543c6..6aaaef1 100644 --- a/cloudformation/sam-imagebuilder.yaml +++ b/cloudformation/sam-imagebuilder.yaml @@ -133,7 +133,7 @@ Resources: - echo Downloading requires source files from git - git clone https://github.com/aws-samples/spark-on-aws-lambda.git - cd spark-on-aws-lambda - - git checkout fixes + - git checkout dockerfilefixes - echo Build started on `date` - echo Building the Docker image... - docker build --build-arg FRAMEWORK=$FRAMEWORK -t $IMAGE_REPO_NAME:latest . 
diff --git a/sparkLambdaHandler.py b/sparkLambdaHandler.py index 944fbc9..76b8b1b 100644 --- a/sparkLambdaHandler.py +++ b/sparkLambdaHandler.py @@ -38,10 +38,12 @@ def spark_submit(s3_bucket_script: str,input_script: str, event: dict)-> None: # Source input and output if available in event input_path = event.get('INPUT_PATH','') output_path = event.get('OUTPUT_PATH', '') + for key,value in event: + os.environ[key] = value # Run the spark-submit command on the local copy of teh script try: logger.info(f'Spark-Submitting the Spark script {input_script} from {s3_bucket_script}') - subprocess.run(["spark-submit", "/tmp/spark_script.py", "--event", json.dumps(event)], check=True, env=event ) + subprocess.run(["spark-submit", "/tmp/spark_script.py", "--event", json.dumps(event)], check=True, env=os.environ) except Exception as e : logger.error(f'Error Spark-Submit with exception: {e}') raise e From 830d9c42a1ca4b7b2c44197d719f150f958bc91c Mon Sep 17 00:00:00 2001 From: Emerson Antony Date: Thu, 19 Oct 2023 22:06:07 +1100 Subject: [PATCH 5/6] event fixes --- sparkLambdaHandler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparkLambdaHandler.py b/sparkLambdaHandler.py index 76b8b1b..3ac958a 100644 --- a/sparkLambdaHandler.py +++ b/sparkLambdaHandler.py @@ -38,7 +38,7 @@ def spark_submit(s3_bucket_script: str,input_script: str, event: dict)-> None: # Source input and output if available in event input_path = event.get('INPUT_PATH','') output_path = event.get('OUTPUT_PATH', '') - for key,value in event: + for key,value in event.items(): os.environ[key] = value # Run the spark-submit command on the local copy of teh script try: From a480489e89fbeef5a0dd1c9c8608a52ca8524ef2 Mon Sep 17 00:00:00 2001 From: Emerson Antony Date: Thu, 19 Oct 2023 22:26:24 +1100 Subject: [PATCH 6/6] Updated latest fixes --- README.md | 2 +- cloudformation/sam-imagebuilder.yaml | 1 - sparkLambdaHandler.py | 9 +++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff 
--git a/README.md b/README.md index f0a8dca..10a47fa 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ Here is a summary of the main steps in the script: 1. The lambda_handler function is the entry point for the Lambda function. It receives an event object and a context object as parameters. 2. The s3_bucket_script and input_script variables are used to specify the Amazon S3 bucket and object key where the Spark script is located. 3. The boto3 module is used to download the Spark script from Amazon S3 to a temporary file on the Lambda function's file system. -4. The os.environ dictionary is used to set the PYSPARK_SUBMIT_ARGS environment variable, which is required by the Spark application to run. +4. The os.environ dictionary is used to store any arguments passed via the lambda event. 5. The subprocess.run method is used to execute the spark-submit command, passing in the path to the temporary file where the Spark script was downloaded.The event payload recieved by the lambda is passed onto the spark application via the event arguement. Overall, this script enables you to execute a Spark script in AWS Lambda by downloading it from an S3 bucket and running it using the spark-submit command. The script can be configured by setting environment variables, such as the PYSPARK_SUBMIT_ARGS variable, to control the behavior of the Spark application.

diff --git a/cloudformation/sam-imagebuilder.yaml b/cloudformation/sam-imagebuilder.yaml index 6aaaef1..9f028f6 100644 --- a/cloudformation/sam-imagebuilder.yaml +++ b/cloudformation/sam-imagebuilder.yaml @@ -133,7 +133,6 @@ Resources: - echo Downloading requires source files from git - git clone https://github.com/aws-samples/spark-on-aws-lambda.git - cd spark-on-aws-lambda - - git checkout dockerfilefixes - echo Build started on `date` - echo Building the Docker image... - docker build --build-arg FRAMEWORK=$FRAMEWORK -t $IMAGE_REPO_NAME:latest . diff --git a/sparkLambdaHandler.py b/sparkLambdaHandler.py index 3ac958a..990b44f 100644 --- a/sparkLambdaHandler.py +++ b/sparkLambdaHandler.py @@ -34,10 +34,11 @@ def spark_submit(s3_bucket_script: str,input_script: str, event: dict)-> None: Submits a local Spark script using spark-submit. """ # Set the environment variables for the Spark application - pyspark_submit_args = event.get('PYSPARK_SUBMIT_ARGS', '') - # Source input and output if available in event - input_path = event.get('INPUT_PATH','') - output_path = event.get('OUTPUT_PATH', '') + # pyspark_submit_args = event.get('PYSPARK_SUBMIT_ARGS', '') + # # Source input and output if available in event + # input_path = event.get('INPUT_PATH','') + # output_path = event.get('OUTPUT_PATH', '') + for key,value in event.items(): os.environ[key] = value # Run the spark-submit command on the local copy of teh script