In [6]:
!pip install sagemaker-experiments

Collecting sagemaker-experiments
  Using cached sagemaker_experiments-0.1.42-py3-none-any.whl (42 kB)
Installing collected packages: sagemaker-experiments
Successfully installed sagemaker-experiments-0.1.42
[0m

In [7]:
import sagemaker
import boto3
import pandas as pd
from datetime import datetime 

from sagemaker import image_uris 
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from time import gmtime, strftime
import time
from smexperiments import experiment

region = boto3.Session().region_name

session = sagemaker.Session()
bucket = 'lawsnic-aiml-east2'
prefix = 'kaggle/customerChurn'

role = sagemaker.get_execution_role()

sm = boto3.Session().client(service_name='sagemaker',region_name=region)


# name of experiment
timestep = datetime.now()
timestep = timestep.strftime("%d-%m-%Y-%H-%M-%S")
experiment_name = timestep + "xgboost-cust-churn"

# create experiment
experiment.Experiment.create(
    experiment_name=experiment_name,
    description="Iterative model tuning - altering data engineering steps",
    sagemaker_boto_client=sm,
)

Experiment(sagemaker_boto_client=<botocore.client.SageMaker object at 0x7faf6c526790>,experiment_name='12-01-2023-19-37-54xgboost-cust-churn',description='Iterative model tuning - altering data engineering steps',tags=None,experiment_arn='arn:aws:sagemaker:us-east-2:791580863750:experiment/12-01-2023-19-37-54xgboost-cust-churn',response_metadata={'RequestId': 'ebbe8f36-9ed1-490c-bfc2-3ca12019f292', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'ebbe8f36-9ed1-490c-bfc2-3ca12019f292', 'content-type': 'application/x-amz-json-1.1', 'content-length': '109', 'date': 'Thu, 12 Jan 2023 19:37:54 GMT'}, 'RetryAttempts': 0})

In [23]:
#sm.get_paginator('list_candidates_for_auto_ml_job')
sm.list_candidates_for_auto_ml_job(AutoMLJobName='automl-house-price-13-14-11-36')

{'Candidates': [{'CandidateName': 'automl-house-price-13-14-11-36lG-007-acd71847',
   'FinalAutoMLJobObjectiveMetric': {'MetricName': 'validation:r2',
    'Value': 0.8491299748420715},
   'ObjectiveStatus': 'Succeeded',
   'CandidateSteps': [{'CandidateStepType': 'AWS::SageMaker::ProcessingJob',
     'CandidateStepArn': 'arn:aws:sagemaker:us-east-2:791580863750:processing-job/automl-house-price-13-14-11-36-db-1-428b2cdb4a964a8c8016c769f44',
     'CandidateStepName': 'automl-house-price-13-14-11-36-db-1-428b2cdb4a964a8c8016c769f44'},
    {'CandidateStepType': 'AWS::SageMaker::TrainingJob',
     'CandidateStepArn': 'arn:aws:sagemaker:us-east-2:791580863750:training-job/automl-house-price-13-14-11-36-dpp5-1-27d74c85c34342c48f7fbcee1',
     'CandidateStepName': 'automl-house-price-13-14-11-36-dpp5-1-27d74c85c34342c48f7fbcee1'},
    {'CandidateStepType': 'AWS::SageMaker::TransformJob',
     'CandidateStepArn': 'arn:aws:sagemaker:us-east-2:791580863750:transform-job/automl-house-price-13-14-

In [8]:
# cell 10
container = sagemaker.image_uris.retrieve(region=boto3.Session().region_name, framework='xgboost', version='latest')

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [9]:
# cell 11
training_location = 's3://lawsnic-aiml-east2/kaggle/customerChurn/features/partial/test/CustomerChurnDW-2023-01-06T17-24-14/part-00000-3986a971-e8a5-4844-bfc8-e72e78f341b4-c000.csv'
validation_location = 's3://lawsnic-aiml-east2/kaggle/customerChurn/features/partial/train/CustomerChurnDW-2023-01-06T17-24-14/part-00000-0338ba51-05d0-4737-bd40-d52e24739eb6-c000.csv'

df = pd.read_csv(training_location)
df.to_csv('s3://lawsnic-aiml-east2/kaggle/customerChurn/features/partial/train.csv',header=None,index=False)

df = pd.read_csv(validation_location)
df.to_csv('s3://lawsnic-aiml-east2/kaggle/customerChurn/features/partial/validate.csv',header=None,index=False)

df = 0

s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://lawsnic-aiml-east2/kaggle/customerChurn/features/partial/train.csv', content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://lawsnic-aiml-east2/kaggle/customerChurn/features/partial/validate.csv', content_type='csv')

First we'll need to specify training parameters to the estimator.  This includes:
1. The `xgboost` algorithm container
1. The IAM role to use
1. Training instance type and count
1. S3 location for output data
1. Algorithm hyperparameters

And then a `.fit()` function which specifies:
1. S3 location for output data.  In this case we have both a training and validation set which are passed in.

In [13]:
from smexperiments.trial import Trial

trial = Trial.create(experiment_name=experiment_name, sagemaker_boto_client=sm)
experiment_config = {
    "ExperimentName": experiment_name,
    "TrialName": trial.trial_name,
    "TrialComponentDisplayName": "Data",
}

sess = sagemaker.Session()

xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    instance_count=1, 
                                    instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/manual-output/xgb-hpo'.format(bucket,prefix),
                                    sagemaker_session=sess,
                                    enable_sagemaker_metrics=True)
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        eval_metric='auc',
                        num_round=100)

xgb.fit({'train': s3_input_train, 'validation': s3_input_validation},
    experiment_config=experiment_config)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: xgboost-2023-01-12-20-56-53-140


2023-01-12 20:56:53 Starting - Starting the training job...
2023-01-12 20:57:19 Starting - Preparing the instances for trainingProfilerReport-1673557013: InProgress
......
2023-01-12 20:58:18 Downloading - Downloading input data...
2023-01-12 20:58:38 Training - Downloading the training image...
2023-01-12 20:59:24 Training - Training image download completed. Training in progress....
2023-01-12 20:59:44 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2023-01-12:20:59:40:INFO] Running standalone xgboost training.[0m
[34m[2023-01-12:20:59:40:INFO] File size need to be processed in the node: 1.11mb. Available memory size in the node: 8608.97mb[0m
[34m[2023-01-12:20:59:40:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:59:40] S3DistributionType set as FullyReplicated[0m
[34m[20:59:40] 1407x40 matrix with 56280 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2023-01-12:20:59:40:INFO] Determined del

## HPO

In [29]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
hyperparameter_ranges = {'eta': ContinuousParameter(0, 1),
                            'min_child_weight': ContinuousParameter(1, 10),
                            'alpha': ContinuousParameter(0, 2),
                            'max_depth': IntegerParameter(1, 10)}

objective_metric_name = 'validation:auc'

tuner = HyperparameterTuner(xgb,
                            objective_metric_name,
                            hyperparameter_ranges,
                            max_jobs=20,
                            max_parallel_jobs=3)

In [30]:
tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

......................................................................................................!


In [34]:
boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
HyperParameterTuningJobName=tuner.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']

'Completed'

In [37]:
boto3.client('sagemaker').describe_training_job(TrainingJobName=tuner.best_training_job())['FinalMetricDataList']

[{'MetricName': 'validation:auc',
  'Value': 0.8405889868736267,
  'Timestamp': datetime.datetime(2023, 1, 6, 23, 14, 55, tzinfo=tzlocal())},
 {'MetricName': 'train:auc',
  'Value': 0.8764730095863342,
  'Timestamp': datetime.datetime(2023, 1, 6, 23, 14, 55, tzinfo=tzlocal())},
 {'MetricName': 'ObjectiveMetric',
  'Value': 0.8405889868736267,
  'Timestamp': datetime.datetime(2023, 1, 6, 23, 14, 55, tzinfo=tzlocal())}]

In [None]:
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

In [None]:
import sys
xgb_predictor.serializer = sagemaker.serializers.CSVSerializer()

predictionsT = xgb_predictor.predict(cleaned_test_data.to_numpy()).decode('utf-8')

#print(predictionsT)

predictions = np.fromstring(predictionsT[1:], sep='\r\n')