This Sagemaker Notebook will show how to run a Sagemaker Processing job, and a Sagemaker Hyperparameter tuning job while bring your own custom container to Sagemaker

In [1]:
import boto3
import sagemaker
import pandas as pd
from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor

In [2]:
region = sagemaker.Session().boto_region_name
role = "AmazonSageMaker-ExecutionRole-20220323T210063" # Specific to your account

script_processor = ScriptProcessor(
    image_uri="987987637308.dkr.ecr.us-east-2.amazonaws.com/sagemaker-example",
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    command=["python3"],
)

In [3]:
s3 = boto3.client("s3")
s3.download_file(
    "sagemaker-sample-data-{}".format(region),
    "processing/census/census-income.csv",
    "census-income.csv",
)
df = pd.read_csv("census-income.csv")
df.to_csv("dataset.csv")
df.head()

Unnamed: 0,age,class of worker,detailed industry recode,detailed occupation recode,education,wage per hour,enroll in edu inst last wk,marital stat,major industry code,major occupation code,...,country of birth father,country of birth mother,country of birth self,citizenship,own business or self employed,fill inc questionnaire for veteran's admin,veterans benefits,weeks worked in year,year,income
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000.
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000.
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.


In [4]:
script_processor.run(
    code="preprocessing.py",
    # arguments = ["arg1", "arg2"], # Arguments can optionally be specified here
    inputs=[ProcessingInput(source="dataset.csv", destination="/opt/ml/processing/input")],
    outputs=[
        ProcessingOutput(source="/opt/ml/processing/output/train"),
        ProcessingOutput(source="/opt/ml/processing/output/validation"),
        ProcessingOutput(source="/opt/ml/processing/output/test"),
    ],
)

INFO:sagemaker:Creating processing-job with name sagemaker-example-2023-09-19-03-33-21-607


........................Shape of data is: (199523, 43)
[Errno 17] File exists: '/opt/ml/processing/output/train'
Could not make directories
Wrote files successfully
Completed running the processing job



In [5]:
job_name = script_processor.latest_job.job_name

In [6]:
s3_client = boto3.client("s3")
default_bucket = sagemaker.Session().default_bucket()
for i in range(1, 4):
    prefix = s3_client.list_objects(Bucket=default_bucket, Prefix="sagemaker-scikit-learn")[
        "Contents"
    ][-i]["Key"]
    print("s3://" + default_bucket + "/" + prefix)

s3://sagemaker-us-east-2-987987637308/sagemaker-scikit-learn-2023-09-19-02-40-09-319/output/output-3/test.csv
s3://sagemaker-us-east-2-987987637308/sagemaker-scikit-learn-2023-09-19-02-40-09-319/output/output-2/validation.csv
s3://sagemaker-us-east-2-987987637308/sagemaker-scikit-learn-2023-09-19-02-40-09-319/output/output-1/train.csv
