## SageMaker Create PySparkProcessor Job 
haimtran 30/05/2023
DataScience and Python3 Kernel

In [5]:
# !pip install -U "sagemaker>2.0"

In [4]:
import logging
from time import gmtime, strftime

import sagemaker

## Parameters 

In [11]:
!aws s3 ls

2023-05-28 10:34:50 413175686616-us-east-1-athena-results-bucket-w83sab7ocj
2023-05-27 04:30:33 cdk-hnb659fds-assets-413175686616-us-east-1
2023-05-27 04:25:29 ee-remote-deployment-commons3bucket-eky3ctzi6wog
2023-05-27 04:25:29 ee-remote-deployment-loggings3bucket-10lkfutr3jyug
2023-05-27 05:10:46 sagemaker-us-east-1-413175686616


In [12]:
source_bucket_name = "amazon-reviews-pds"
dest_bucket_name = "sagemaker-us-east-1-413175686616"

## Get SageMaker Session

In [6]:
assertsagemaker_logger = logging.getLogger("sagemaker")
sagemaker_logger.setLevel(logging.INFO)
sagemaker_logger.addHandler(logging.StreamHandler())

sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

## Prepare PySpark Code

In [7]:
!mkdir spark-code

mkdir: cannot create directory ‘spark-code’: File exists


In [8]:
%%writefile ./spark-code/preprocess.py
import argparse

from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType

# create spark session
spark = SparkSession.builder.appName("PySparkApp").getOrCreate()

# create schema
schema = (
    StructType()
    .add("marketplace", StringType(), True)
    .add("customer_id", StringType(), True)
    .add("review_id", StringType(), True)
    .add("product_id", StringType(), True)
    .add("product_parent", IntegerType(), True)
    .add("product_title", StringType(), True)
    .add("product_category", StringType(), True)
    .add("star_rating", IntegerType(), True)
    .add("helpful_vote", IntegerType(), True)
    .add("total_vote", IntegerType(), True)
    .add("vine", StringType(), True)
    .add("verified_purchase", StringType(), True)
    .add("review_headline", StringType(), True)
    .add("review_body", StringType(), True)
    .add("myyear", StringType(), True)
)


def main():
    """
    parse argument
    """
    # define parser
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    parser.add_argument("--source_bucket_name", type=str, help="s3 input bucket")
    parser.add_argument("--dest_bucket_name", type=str, help="output s3 prefix")

    # parse argument
    args = parser.parse_args()
    print(f"{args.source_bucket_name} and {args.dest_bucket_name}")

    # read data from s3
    df_csv = (
        spark.read.format("csv")
        .option("header", True)
        .schema(schema)
        .option("delimiter", "\t")
        .option("quote", '"')
        .load(f"s3://{args.source_bucket_name}/tsv/")
    )

    # transform and feature engineer
    df_clean = df_csv.where("marketplace='US'").select(
        "marketplace", "customer_id", "product_id", "star_rating"
    )

    # write data to s3
    df_clean.write.format("parquet").save(
        f"s3://{args.dest_bucket_name}/amazon-reviews/"
    )


if __name__ == "__main__":
    main()

Overwriting ./spark-code/preprocess.py


## Create PySparkProcessor Job

In [15]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.spark.processing import PySparkProcessor

In [16]:
spark_processor = PySparkProcessor(
    base_job_name="process-amazon-reviews",
    framework_version="3.1",
    role=role,
    instance_count=2,
    instance_type="ml.m5.xlarge",
    max_runtime_in_seconds=1200,
    tags=[{"Key": "tag-key", "Value": "tag-value"}],
)

this take about 5 minutes 

In [17]:
spark_processor.run(
    submit_app="./spark-code/preprocess.py",
    arguments=[
        "--source_bucket_name",
        source_bucket_name,
        "--dest_bucket_name",
        dest_bucket_name,
    ],
    logs=False,
)

Creating processing-job with name process-amazon-reviews-2023-05-29-03-03-22-057
Creating processing-job with name process-amazon-reviews-2023-05-29-03-03-22-057
INFO:sagemaker:Creating processing-job with name process-amazon-reviews-2023-05-29-03-03-22-057


................................................................................................................................!

## Check Output in S3 

In [14]:
# !aws s3 rm s3://sagemaker-us-east-1-413175686616/amazon-reviews/ --recursive

In [19]:
# !aws s3 ls --summarize --human-readable --recursive s3://sagemaker-us-east-1-413175686616/amazon-reviews/