In [None]:
from sagemaker.core.helper.session_helper import Session, get_execution_role

# Create the SageMaker session and resolve the execution role; both names are
# reused by the cells further down.
sess = Session()
role = get_execution_role()

# Echo the resolved environment (role, default bucket, region) one item at a
# time so the cell output records what the rest of the notebook runs against.
print(f"sagemaker role arn: {role}")

default_bucket = sess.default_bucket()
print(f"sagemaker bucket: {default_bucket}")

session_region = sess.boto_region_name
print(f"sagemaker session region: {session_region}")

### Download Data

In [None]:
from datasets import load_dataset

# Request the IMDB "train" and "test" splits in a single call, then unpack the
# two results into the names used by the following cells.
imdb_splits = load_dataset("imdb", split=["train", "test"])
train_dataset, test_dataset = imdb_splits

# Last expression: show both datasets as the cell output.
train_dataset, test_dataset

In [None]:
# Display one record of the training split (index 10) to inspect its fields.
train_dataset[10]

### Use FrameworkProcessor with the PyTorch image

In [None]:
from sagemaker.core.image_uris import get_training_image_uri
from sagemaker.core.processing import FrameworkProcessor

# The same instance type is used for the image lookup and for the processing job.
processing_instance_type = "ml.m5.xlarge"

# Look up the PyTorch 1.13 / py39 training image for the session's region.
image_lookup_args = {
    "region": sess.boto_region_name,
    "framework": "pytorch",
    "framework_version": "1.13",
    "py_version": "py39",
    "instance_type": processing_instance_type,
}
image_uri = get_training_image_uri(**image_lookup_args)

# Single-instance processor that runs user scripts inside the image resolved above.
pytorch_processor = FrameworkProcessor(
    role=role,
    image_uri=image_uri,
    instance_count=1,
    instance_type=processing_instance_type,
)

In [None]:
from sagemaker.core.shapes import ProcessingOutput, ProcessingS3Output
from time import gmtime, strftime
import os

s3_prefix = "huggingface-text-classification"

# Processing job names must be unique within an account and region, so the
# suffix carries the full UTC date and time. A day-of-month-only stamp
# ("%d-%H-%M-%S") can repeat in a later month and collide with an earlier job.
# Length check: 31-char prefix + "-" + 19-char stamp = 51, within the 63-char limit.
processing_job_name = "{}-{}".format(s3_prefix, strftime("%Y-%m-%d-%H-%M-%S", gmtime()))

# All job outputs are written under s3://<default bucket>/<s3_prefix>/.
output_destination = "s3://{}/{}".format(sess.default_bucket(), s3_prefix)

# Submit the preprocessing script. Each split gets its own output channel:
# files the script writes to /opt/ml/processing/<split> are uploaded to
# <output_destination>/<split> using the "EndOfJob" upload mode.
# wait=False returns right after submission instead of blocking on the job.
pytorch_processor.run(
    code="preprocessing.py",
    source_dir=os.path.abspath("scripts/preprocess"),
    job_name=processing_job_name,
    outputs=[
        ProcessingOutput(
            output_name=split,
            s3_output=ProcessingS3Output(
                s3_uri="{}/{}".format(output_destination, split),
                local_path="/opt/ml/processing/{}".format(split),
                s3_upload_mode="EndOfJob",
            ),
        )
        for split in ("train", "test")
    ],
    wait=False,
)


In [None]:
# Show the current status of the most recently submitted processing job.
# The job was started with wait=False, so re-run this cell to poll it.
# NOTE(review): refresh() presumably re-reads the job state from the service
# and returns the job object — confirm against the SDK.
pytorch_processor.latest_job.refresh().processing_job_status