In [1]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput


In [2]:
# AWS region of the default SageMaker session.
region = sagemaker.Session().boto_region_name

# IAM role the processing job will run under.
role = get_execution_role()

# Processor that runs a scikit-learn script on a single ml.m5.large instance.
sklearn_processor = SKLearnProcessor(
    framework_version="0.20.0",
    role=role,
    instance_type="ml.m5.large",
    instance_count=1,
)


In [5]:
# Launch the preprocessing script as a SageMaker Processing job.
sklearn_processor.run(
    code="../src/preprocessing.py",
    inputs=[
        ProcessingInput(
            # S3 URI or local path; a local file is uploaded to S3 first.
            source="../data/house_prices.csv",
            # Location on the processing container's filesystem.
            destination="/opt/ml/processing/input",
        ),
    ],
    outputs=[
        ProcessingOutput(
            # Reference name used to look this output up afterwards.
            output_name="train_data",
            # Location on the processing container's filesystem.
            source="/opt/ml/processing/train",
            # Where to store the result in S3; generated automatically when omitted.
            destination="s3://page2sage/train",
        ),
        ProcessingOutput(
            output_name="validation_data",
            source="/opt/ml/processing/validation",
            destination="s3://page2sage/validation",
        ),
    ],
)

# Describe the most recently launched processing job so its output locations
# can be read back.
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

output_config = preprocessing_job_description["ProcessingOutputConfig"]

# Map every declared output name to the S3 URI it was uploaded to.
output_s3_uris = {
    output["OutputName"]: output["S3Output"]["S3Uri"]
    for output in output_config["Outputs"]
}

preprocessed_training_data = output_s3_uris["train_data"]

# The job's second output is named "validation_data"; there is no "test_data"
# output, so looking that name up would never assign anything.
preprocessed_validation_data = output_s3_uris["validation_data"]

# Alias under the original variable name so cells that read
# `preprocessed_test_data` keep working.
preprocessed_test_data = preprocessed_validation_data




Job Name:  sagemaker-scikit-learn-2021-12-02-21-45-46-736
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-317987917227/sagemaker-scikit-learn-2021-12-02-21-45-46-736/input/input-1/house_prices.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-317987917227/sagemaker-scikit-learn-2021-12-02-21-45-46-736/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://page2sage/train', 'LocalPath': '/opt/ml/processing/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'validation_data', 'AppManaged': Fa

In [9]:
!aws s3 ls page2sage/validation/

2021-12-02 21:49:56      52590 validation.csv
