In [1]:
import sagemaker
from sagemaker import get_execution_role

# SageMaker session object; later cells call it to create the encoder model.
sess = sagemaker.Session()
# Execution role handed to the processor and the model in the cells below.
role = get_execution_role()

In [2]:
# Name of the S3 bucket that the raw-data URI in the next cell is built from.
bucket = "humana-data"

In [3]:
# S3 prefix of the raw test files; passed to the processing job as its data input.
test_input = f's3://{bucket}/rawdata/original_raw_files/test'
test_input

's3://humana-data/rawdata/original_raw_files/test'

In [4]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor

FRAMEWORK_VERSION = "1.0-1"
# Single source of truth for the output name: it is used both when declaring
# the ProcessingOutput and when looking the S3 URI up in the job description.
PREPROCESSED_OUTPUT_NAME = "df_fe"

script_dir = "humana_script"
script_path = "humana_preprocessing.py"
# Local package directory uploaded next to the entry-point script
# (resolves to "humana_script/humana_package/").
script_dependent_dir = script_dir + '/' + 'humana_package/'

sklearn_preprocessor = SKLearnProcessor(
    framework_version=FRAMEWORK_VERSION,
    role=role,
    instance_type="ml.c4.xlarge",
    instance_count=1,
)

# Run the preprocessing script against the raw test data.
# NOTE(review): the script is presumably expected to write its result under
# /opt/ml/processing/test, since that is the declared output source -- confirm
# against humana_preprocessing.py.
sklearn_preprocessor.run(
    code=script_dir + '/' + script_path,
    inputs=[
        ProcessingInput(source=test_input, destination="/opt/ml/processing/input"),
        ProcessingInput(
            source=script_dependent_dir,
            destination="/opt/ml/processing/input/code/humana_package/",
        ),
    ],
    outputs=[
        ProcessingOutput(output_name=PREPROCESSED_OUTPUT_NAME, source="/opt/ml/processing/test"),
    ],
    arguments=['--train_or_valid_or_test', "test"],
)

preprocessing_job_description = sklearn_preprocessor.jobs[-1].describe()
output_config = preprocessing_job_description["ProcessingOutputConfig"]

# Resolve the S3 URI of the declared output. Fail loudly when it is missing:
# a silent miss would leave `preprocessed_test_data` undefined, or worse, still
# holding a stale value from an earlier run in the same kernel.
preprocessed_test_data = next(
    (
        job_output["S3Output"]["S3Uri"]
        for job_output in output_config["Outputs"]
        if job_output["OutputName"] == PREPROCESSED_OUTPUT_NAME
    ),
    None,
)
if preprocessed_test_data is None:
    raise RuntimeError(
        "Processing job description has no output named {!r}".format(PREPROCESSED_OUTPUT_NAME)
    )

preprocessed_test_data


Job Name:  sagemaker-scikit-learn-2022-12-30-03-36-04-592
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://humana-data/rawdata/original_raw_files/test', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-930992672261/sagemaker-scikit-learn-2022-12-30-03-36-04-592/input/input-2', 'LocalPath': '/opt/ml/processing/input/code/humana_package/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-930992672261/sagemaker-scikit-learn-2022-12-30-03-36-04-592/input/code/humana_preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataD

's3://sagemaker-us-east-1-930992672261/sagemaker-scikit-learn-2022-12-30-03-36-04-592/output/df_fe'

In [10]:
from sagemaker.transformer import Transformer

script_file = "humana_encoder.py"
# Training job that produced the fitted sklearn encoder. The same job name is
# part of the S3 key of its uploaded source archive, so it is defined once and
# reused to keep the two from drifting apart.
training_job_name = "sagemaker-scikit-learn-2022-12-29-03-45-32-528"
dependent_files = "s3://sagemaker-us-east-1-930992672261/{}/source/sourcedir.tar.gz".format(
    training_job_name
)
script_dir = "humana_script"

# Register a model from the saved sklearn encoder so it can be applied to the
# preprocessed *test* dataset through batch transform.
sklearn_encoded = sess.create_model_from_job(
    training_job_name=training_job_name,
    name="test-data-encoded-12-29-2022",
    role=role,
    env={
        "SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv",
        "SAGEMAKER_USE_NGINX": "True",
        "SAGEMAKER_WORKER_CLASS_TYPE": "gevent",
        "SAGEMAKER_KEEP_ALIVE_SEC": "60",
        "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
        "SAGEMAKER_ENABLE_CLOUDWATCH_METRICS": "false",
        # Entry-point script; presumably shipped inside the archive referenced
        # by SAGEMAKER_SUBMIT_DIRECTORY below -- confirm.
        "SAGEMAKER_PROGRAM": script_file,
        "SAGEMAKER_REGION": "us-east-1",
        "SAGEMAKER_SUBMIT_DIRECTORY": dependent_files,
    },
)

transformer_encoded = Transformer(
    sklearn_encoded,
    instance_count=1,
    instance_type="ml.c4.xlarge",
    assemble_with="Line",
    accept="text/csv",
)

# Encode the preprocessed test data. The job is started without blocking so
# that the status line below is printed while the job is still running; the
# explicit wait() call then blocks until it finishes.
transformer_encoded.transform(preprocessed_test_data, content_type="text/csv", wait=False)

print("Waiting for transform job: " + transformer_encoded.latest_transform_job.job_name)
transformer_encoded.wait()
encoded_test_data = transformer_encoded.output_path
encoded_test_data

..............................[34m2022-12-30 03:55:23,673 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2022-12-30 03:55:23,676 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2022-12-30 03:55:23,677 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;


[35m169.254.255.130 - - [30/Dec/2022:03:55:32 +0000] "POST /invocations HTTP/1.1" 200 16960245 "-" "Go-http-client/1.1"[0m
[32m2022-12-30T03:55:30.708:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m


's3://sagemaker-us-east-1-930992672261/sagemaker-scikit-learn-2022-12-30-03-50-32-041'

In [11]:
# pandas is not imported in any of the visible earlier cells; import it here so
# this cell also works after Restart Kernel -> Run All.
import pandas as pd

# Load the encoded test data produced by the batch transform job.
# NOTE(review): read_csv treats the first row as a header by default; if the
# transform output has no header row, pass header=None -- confirm.
encoded_df = pd.read_csv(encoded_test_data + '/df_fe_test.csv.out')