In [12]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor

# S3 locations: the bucket holding the project data and the key prefix
# under which model artifacts are written.
bucket = "humana-data"
prefix = "humana-Linearclass-pipeline-condition-credit"

# AWS region of the current boto3 session and the IAM role this notebook runs under.
region = boto3.session.Session().region_name
role = get_execution_role()

In [13]:
import pandas as pd

# Raw training files live under a fixed key inside the project bucket.
train_input = f"s3://{bucket}/rawdata/original_raw_files/train"
train_input

's3://humana-data/rawdata/original_raw_files/train'

In [14]:
# Raw validation files sit next to the training files in the same bucket.
valid_input = f"s3://{bucket}/rawdata/original_raw_files/valid"
valid_input

's3://humana-data/rawdata/original_raw_files/valid'

In [15]:
# S3 prefix for intermediate data in the project bucket.
train_fe_input = f"s3://{bucket}/intermediate/data/"
train_fe_input

's3://humana-data/intermediate/data/'

In [16]:
# Local layout of the processing code: the entry script is addressed as
# `script_dir/script_path`, its helper package as `script_dir/humana_package/`.
FRAMEWORK_VERSION = "1.0-1"
script_dir = "humana_script"
script_path = "humana_preprocessing.py"
script_dependent_dir = f"{script_dir}/humana_package/"

# Single-instance scikit-learn processor for the training-data preprocessing job.
sklearn_train_processor = SKLearnProcessor(
    framework_version=FRAMEWORK_VERSION,
    role=role,
    instance_type="ml.c4.xlarge",
    instance_count=1,
)

#### Train Data preprocessing

In [17]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Inputs for the processing container: the raw training files, plus the local
# helper package, which is placed below the directory that receives the entry script.
train_processing_inputs = [
    ProcessingInput(source=train_input, destination="/opt/ml/processing/input"),
    ProcessingInput(
        source=script_dependent_dir,
        destination="/opt/ml/processing/input/code/humana_package/",
    ),
]

# Whatever the script writes to /opt/ml/processing/train is exported as output "df_fe".
train_processing_outputs = [
    ProcessingOutput(output_name="df_fe", source="/opt/ml/processing/train"),
]

# Run the preprocessing script in "train" mode.
sklearn_train_processor.run(
    code=f"{script_dir}/{script_path}",
    inputs=train_processing_inputs,
    outputs=train_processing_outputs,
    arguments=["--train_or_test", "train"],
)


Job Name:  sagemaker-scikit-learn-2022-12-18-01-22-47-439
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://humana-data/rawdata/original_raw_files/train', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-930992672261/sagemaker-scikit-learn-2022-12-18-01-22-47-439/input/input-2', 'LocalPath': '/opt/ml/processing/input/code/humana_package/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-930992672261/sagemaker-scikit-learn-2022-12-18-01-22-47-439/input/code/humana_preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3Data

In [18]:
# Look up where the most recent training preprocessing job wrote its "df_fe" output.
preprocessing_job_description = sklearn_train_processor.jobs[-1].describe()
output_config = preprocessing_job_description["ProcessingOutputConfig"]

# Collect every S3 URI registered under the output name "df_fe".
df_fe_train_uris = [
    job_output["S3Output"]["S3Uri"]
    for job_output in output_config["Outputs"]
    if job_output["OutputName"] == "df_fe"
]

# Fail loudly instead of leaving `preprocessed_train_data` undefined, or silently
# keeping a stale value from an earlier run, when the job has no "df_fe" output.
if not df_fe_train_uris:
    raise LookupError(
        "Processing job {} has no output named 'df_fe'".format(
            preprocessing_job_description.get("ProcessingJobName", "<unknown>")
        )
    )

# As in a plain loop without `break`, the last matching output wins.
preprocessed_train_data = df_fe_train_uris[-1]
preprocessed_train_data

's3://sagemaker-us-east-1-930992672261/sagemaker-scikit-learn-2022-12-18-01-22-47-439/output/df_fe'

In [19]:
# df = pd.read_csv(preprocessed_train_data + "/df_fe_train.csv", header=None, skiprows=1)
# df.isna().sum().sum()
# df.head()

# df.shape

In [20]:
# Directory of the helper package, relative to the notebook; it is passed to the
# encoder estimator as its source directory.
script_dependent_dir = f"{script_dir}/humana_package/"
script_dependent_dir

'humana_script/humana_package/'

#### Train data fitting

In [21]:
import sagemaker
from sagemaker.sklearn.estimator import SKLearn

sagemaker_session = sagemaker.Session()

FRAMEWORK_VERSION = "1.0-1"
# NOTE(review): this rebinds `script_path`, which the preprocessing cells also use;
# re-running a `processor.run(...)` cell after this one without first re-running its
# setup cell would submit the encoder script instead — confirm the intended run order.
script_path = "humana_encoder.py"
script_dir = "humana_script"

# scikit-learn estimator that fits the feature encoder. `entry_point` is looked up
# inside `source_dir`, so the script that runs is
# `script_dependent_dir/humana_encoder.py`, not `script_dir/humana_encoder.py`.
sklearn_encoder = SKLearn(
    entry_point=script_path,
    source_dir=script_dependent_dir,
    role=role,
    framework_version=FRAMEWORK_VERSION,
    instance_type="ml.c4.xlarge",
    sagemaker_session=sagemaker_session,
)

# Report the path of the entry point that is actually used (relative to the notebook).
print("entry_point", script_dependent_dir.rstrip("/") + "/" + script_path)
print(script_dependent_dir)

# Fit the encoder on the preprocessed training data ("train" channel).
sklearn_encoder.fit({"train": preprocessed_train_data})

entry_point humana_script/humana_encoder.py
humana_script/humana_package/
2022-12-18 01:28:35 Starting - Starting the training job...
2022-12-18 01:29:01 Starting - Preparing the instances for trainingProfilerReport-1671326915: InProgress
............
2022-12-18 01:31:00 Downloading - Downloading input data...
2022-12-18 01:31:28 Training - Training image download completed. Training in progress..[34m2022-12-18 01:31:29,724 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-12-18 01:31:29,727 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-12-18 01:31:29,735 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-12-18 01:31:29,936 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-12-18 01:31:29,946 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-12-18 01:31:2

In [22]:
# Build a batch transformer from the fitted encoder estimator; it accepts and
# emits CSV and joins the output records line by line.
transformer = sklearn_encoder.transformer(
    instance_count=1,
    instance_type="ml.c4.xlarge",
    assemble_with="Line",
    accept="text/csv",
)


In [24]:
# Encode the preprocessed training data with the fitted encoder (batch transform).
transformer.transform(preprocessed_train_data, content_type="text/csv")
print(f"Waiting for transform job: {transformer.latest_transform_job.job_name}")
transformer.wait()

# Keep the transformer's output location; it feeds the downstream training step.
encoded_train = transformer.output_path

...............................[34m2022-12-18 01:37:38,096 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2022-12-18 01:37:38,098 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2022-12-18 01:37:38,099 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;

[32m2022-12-18T01:37:45.005:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m


In [34]:
import boto3
from sagemaker.image_uris import retrieve

# Container image URI of the built-in Linear Learner algorithm for the session's region.
ll_image = retrieve(framework="linear-learner", region=boto3.Session().region_name)

In [None]:
# Where the Linear Learner training job stores its model artifacts.
s3_ll_output_key_prefix = "ll_training_output"
s3_ll_output_location = f"s3://{bucket}/{prefix}/{s3_ll_output_key_prefix}/ll_model"

# Generic estimator wrapping the Linear Learner container image.
ll_estimator = sagemaker.estimator.Estimator(
    image_uri=ll_image,
    role=role,
    instance_count=1,
    instance_type="ml.m4.2xlarge",
    volume_size=20,
    max_run=3600,
    input_mode="File",
    output_path=s3_ll_output_location,
    sagemaker_session=sagemaker_session,
)

# NOTE(review): predictor_type is "regressor" although the S3 prefix mentions
# "Linearclass" — confirm that regression is really intended.
# NOTE(review): feature_dim=10 is hard-coded — confirm it matches the number of
# feature columns in the encoded CSV.
ll_hyperparameters = {
    "feature_dim": 10,
    "predictor_type": "regressor",
    "mini_batch_size": 32,
}
ll_estimator.set_hyperparameters(**ll_hyperparameters)

# Train on the encoded training data, passed as fully replicated CSV on the "train" channel.
ll_train_data = sagemaker.inputs.TrainingInput(
    encoded_train,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)
data_channels = {"train": ll_train_data}

ll_estimator.fit(inputs=data_channels, logs=True)

2022-12-18 02:10:47 Starting - Starting the training job.

In [32]:
# train_model_path = script_dir + "/"+ "train.py"
# train_model_path

# hyperparameters = {
#     "max_depth":5,
#     "eta":0.2,
#     "gamma":4,
#     "min_child_weight":6,
#     "subsample":0.8,
#     "verbosity":0,
#     "objective":"binary:logistic",
#     "num_round":100,}

# # from sagemaker.sklearn.estimator import SKLearn

# # sklearn = SKLearn(
# #     entry_point= train_model_path, framework_version="0.20.0", instance_type="ml.m5.xlarge", role=role
# # )

# from sagemaker.xgboost.estimator import XGBoost

# xgb_estimator = XGBoost(
#     entry_point=train_model_path,
#     hyperparameters=hyperparameters,
#     role=role,
#     instance_count=1,
#     instance_type="ml.m5.2xlarge",
#     framework_version="1.0-1",
# )





# xgb_estimator.fit({"train": encoded_train+ "/df_fe_train.csv.out"})




# training_job_description = sklearn.jobs[-1].describe()
# model_data_s3_uri = "{}{}/{}".format(
#     training_job_description["OutputDataConfig"]["S3OutputPath"],
#     training_job_description["TrainingJobName"],
#     "output/model.tar.gz",
# )


### Valid Dataset

In [17]:
# Point the script variables back at the preprocessing script (the encoder cell
# rebinds `script_path`) and rebuild the helper-package path.
FRAMEWORK_VERSION = "1.0-1"
script_dir = "humana_script"
script_path = "humana_preprocessing.py"
script_dependent_dir = f"{script_dir}/humana_package/"

# Single-instance scikit-learn processor for the validation-data preprocessing job.
sklearn_valid_processor = SKLearnProcessor(
    framework_version=FRAMEWORK_VERSION,
    role=role,
    instance_type="ml.c4.xlarge",
    instance_count=1,
)

In [18]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Inputs for the processing container: the raw validation files, plus the local
# helper package, which is placed below the directory that receives the entry script.
valid_processing_inputs = [
    ProcessingInput(source=valid_input, destination="/opt/ml/processing/input"),
    ProcessingInput(
        source=script_dependent_dir,
        destination="/opt/ml/processing/input/code/humana_package/",
    ),
]

# Whatever the script writes to /opt/ml/processing/test is exported as output "df_fe".
valid_processing_outputs = [
    ProcessingOutput(output_name="df_fe", source="/opt/ml/processing/test"),
]

# Run the preprocessing script in "test" mode on the validation data.
sklearn_valid_processor.run(
    code=f"{script_dir}/{script_path}",
    inputs=valid_processing_inputs,
    outputs=valid_processing_outputs,
    arguments=["--train_or_test", "test"],
)



Job Name:  sagemaker-scikit-learn-2022-12-17-23-01-09-981
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://humana-data/rawdata/original_raw_files/valid', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-930992672261/sagemaker-scikit-learn-2022-12-17-23-01-09-981/input/input-2', 'LocalPath': '/opt/ml/processing/input/code/humana_package/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-930992672261/sagemaker-scikit-learn-2022-12-17-23-01-09-981/input/code/humana_preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3Data

NameError: name 'sklearn_processor' is not defined

In [19]:

# Look up where the most recent validation preprocessing job wrote its "df_fe" output.
preprocessing_job_description = sklearn_valid_processor.jobs[-1].describe()
output_config = preprocessing_job_description["ProcessingOutputConfig"]

# Collect every S3 URI registered under the output name "df_fe".
df_fe_valid_uris = [
    job_output["S3Output"]["S3Uri"]
    for job_output in output_config["Outputs"]
    if job_output["OutputName"] == "df_fe"
]

# Fail loudly instead of leaving `preprocessed_valid_data` undefined, or silently
# keeping a stale value from an earlier run, when the job has no "df_fe" output.
if not df_fe_valid_uris:
    raise LookupError(
        "Processing job {} has no output named 'df_fe'".format(
            preprocessing_job_description.get("ProcessingJobName", "<unknown>")
        )
    )

# As in a plain loop without `break`, the last matching output wins.
preprocessed_valid_data = df_fe_valid_uris[-1]
preprocessed_valid_data

's3://sagemaker-us-east-1-930992672261/sagemaker-scikit-learn-2022-12-17-23-01-09-981/output/df_fe'

In [20]:
# Build a fresh batch transformer from the fitted encoder estimator for the
# validation data; it accepts and emits CSV and joins output records line by line.
transformer = sklearn_encoder.transformer(
    instance_count=1,
    instance_type="ml.c4.xlarge",
    assemble_with="Line",
    accept="text/csv",
)


In [21]:
# Encode the preprocessed validation data with the fitted encoder (batch transform).
transformer.transform(preprocessed_valid_data, content_type="text/csv")
print(f"Waiting for transform job: {transformer.latest_transform_job.job_name}")
transformer.wait()

# Keep the output location of the encoded validation data.
encoded_valid_data = transformer.output_path

...............................[34m2022-12-17 23:14:47,108 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2022-12-17 23:14:47,110 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2022-12-17 23:14:47,111 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;

[35m169.254.255.130 - - [17/Dec/2022:23:15:00 +0000] "POST /invocations HTTP/1.1" 200 16962718 "-" "Go-http-client/1.1"[0m
[32m2022-12-17T23:14:58.818:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m


'humana_script/train.py'