In [17]:
import sagemaker
import boto3
from sagemaker import image_uris
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
import pandas as pd

region = boto3.Session().region_name

session = sagemaker.Session()
bucket = 'lawsnic-aiml-east2'
prefix = 'kaggle/customerChurn'

role = sagemaker.get_execution_role()

sm = boto3.Session().client(service_name='sagemaker',region_name=region)

In [18]:
# cell 10
container = sagemaker.image_uris.retrieve(region=boto3.Session().region_name, framework='xgboost', version='latest')

In [23]:
# cell 11
training_location = 's3://lawsnic-aiml-east2/kaggle/customerChurn/features/partial/test/CustomerChurnDW-2023-01-06T17-24-14/part-00000-3986a971-e8a5-4844-bfc8-e72e78f341b4-c000.csv'
validation_location = 's3://lawsnic-aiml-east2/kaggle/customerChurn/features/partial/train/CustomerChurnDW-2023-01-06T17-24-14/part-00000-0338ba51-05d0-4737-bd40-d52e24739eb6-c000.csv'

df = pd.read_csv(training_location)
df.to_csv('s3://lawsnic-aiml-east2/kaggle/customerChurn/features/partial/train.csv',header=None,index=False)

df = pd.read_csv(validation_location)
df.to_csv('s3://lawsnic-aiml-east2/kaggle/customerChurn/features/partial/validate.csv',header=None,index=False)

df = 0

s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://lawsnic-aiml-east2/kaggle/customerChurn/features/partial/train.csv', content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://lawsnic-aiml-east2/kaggle/customerChurn/features/partial/validate.csv', content_type='csv')

First we'll need to specify training parameters to the estimator.  This includes:
1. The `xgboost` algorithm container
1. The IAM role to use
1. Training instance type and count
1. S3 location for output data
1. Algorithm hyperparameters

And then a `.fit()` function which specifies:
1. S3 location for output data.  In this case we have both a training and validation set which are passed in.

In [26]:
sess = sagemaker.Session()

xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    instance_count=1, 
                                    instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/manual-output/xgb-hpo'.format(bucket,prefix),
                                    sagemaker_session=sess,
                                    enable_sagemaker_metrics=True)
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        eval_metric='auc',
                        num_round=100)

xgb.fit({'train': s3_input_train, 'validation': s3_input_validation}) 

2023-01-06 22:59:38 Starting - Starting the training job...
2023-01-06 23:00:01 Starting - Preparing the instances for trainingProfilerReport-1673045978: InProgress
............
2023-01-06 23:02:06 Downloading - Downloading input data
2023-01-06 23:02:06 Training - Downloading the training image...
2023-01-06 23:02:37 Uploading - Uploading generated training model.[34mArguments: train[0m
[34m[2023-01-06:23:02:32:INFO] Running standalone xgboost training.[0m
[34m[2023-01-06:23:02:32:INFO] File size need to be processed in the node: 1.11mb. Available memory size in the node: 8640.82mb[0m
[34m[2023-01-06:23:02:32:INFO] Determined delimiter of CSV input is ','[0m
[34m[23:02:32] S3DistributionType set as FullyReplicated[0m
[34m[23:02:32] 1407x40 matrix with 56280 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2023-01-06:23:02:32:INFO] Determined delimiter of CSV input is ','[0m
[34m[23:02:32] S3DistributionType set as FullyReplicate

In [None]:
boto3.client('sagemaker').describe_training_job(TrainingJobName=tuner.best_training_job())

## HPO

In [29]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
hyperparameter_ranges = {'eta': ContinuousParameter(0, 1),
                            'min_child_weight': ContinuousParameter(1, 10),
                            'alpha': ContinuousParameter(0, 2),
                            'max_depth': IntegerParameter(1, 10)}

objective_metric_name = 'validation:auc'

tuner = HyperparameterTuner(xgb,
                            objective_metric_name,
                            hyperparameter_ranges,
                            max_jobs=20,
                            max_parallel_jobs=3)

In [30]:
tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

......................................................................................................!


In [34]:
boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
HyperParameterTuningJobName=tuner.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']

'Completed'

In [37]:
boto3.client('sagemaker').describe_training_job(TrainingJobName=tuner.best_training_job())['FinalMetricDataList']

[{'MetricName': 'validation:auc',
  'Value': 0.8405889868736267,
  'Timestamp': datetime.datetime(2023, 1, 6, 23, 14, 55, tzinfo=tzlocal())},
 {'MetricName': 'train:auc',
  'Value': 0.8764730095863342,
  'Timestamp': datetime.datetime(2023, 1, 6, 23, 14, 55, tzinfo=tzlocal())},
 {'MetricName': 'ObjectiveMetric',
  'Value': 0.8405889868736267,
  'Timestamp': datetime.datetime(2023, 1, 6, 23, 14, 55, tzinfo=tzlocal())}]