In [47]:
import sagemaker
import boto3
from sagemaker import get_execution_role
import pandas as pd

# --- SageMaker session, execution role and AWS clients -----------------------
region = boto3.Session().region_name

session = sagemaker.Session()
bucket = 'lawsnic-aiml-east2'
prefix = 'kaggle/customerChurn'

role = get_execution_role()

sm = boto3.Session().client(service_name='sagemaker', region_name=region)
s3 = boto3.Session().client(service_name='s3', region_name=region)

# --- S3 locations -------------------------------------------------------------
# Every URI is derived from bucket/prefix so the notebook can be pointed at
# another bucket by editing only the two constants above.
base_uri = f's3://{bucket}/{prefix}'

# Raw feature files that this cell reads.
raw_train_data_uri = f'{base_uri}/features/partial/train/CustomerChurnDW-2023-01-06T14-14-13/part-00000-642a0746-49b9-4fda-ad2d-98afe1db11ec-c000.csv'
raw_test_data_uri = f'{base_uri}/features/partial/test/CustomerChurnDW-2023-01-06T14-14-13/part-00000-ccd6e4dd-898c-4fc4-a63a-85d1cfcfc4dc-c000.csv'

# Cleaned files that this cell writes; later cells use these names.
train_data_uri = f'{base_uri}/clarify/train.csv'
test_data_uri_with_target = f'{base_uri}/clarify/test_with_target.csv'
test_data_uri = f'{base_uri}/clarify/test.csv'

# Columns that are cast to int before the data is written back to S3.
INT_FLAG_COLUMNS = [
    "SeniorCitizen",
    "Churn",
    "Partner",
    "Dependents",
    "PhoneService",
    "PaperlessBilling",
    "isFemale",
]


def cast_flag_columns(frame):
    """Return a copy of ``frame`` with every column in INT_FLAG_COLUMNS cast to int.

    Raises KeyError if any of the listed columns is missing from ``frame``.
    """
    return frame.astype({column: int for column in INT_FLAG_COLUMNS})


# --- Load, cast and publish the datasets -------------------------------------
train_data = cast_flag_columns(pd.read_csv(raw_train_data_uri))
test_data = cast_flag_columns(pd.read_csv(raw_test_data_uri))

# Training set keeps its header row and the 'Churn' target.
train_data.to_csv(train_data_uri, index=False)

# Test set including the 'Churn' target, with header.
test_data.to_csv(test_data_uri_with_target, index=False)

# Test set without the target and without a header row.
test_data2 = test_data.drop(["Churn"], axis=1)
test_data2.to_csv(test_data_uri, index=False, header=False)


print(role)

arn:aws:iam::791580863750:role/service-role/AmazonSageMaker-ExecutionRole-20220707T123330


In [2]:
#train_data.head(10)
#test_data.head(10)
#train_data.columns.to_list()
#test_data.columns.to_list()
#train_data.count('columns')
#test_data.count('columns')

In [3]:
# Single training channel: Autopilot reads the CSV at train_data_uri and
# treats the 'Churn' column as the target.
training_channel = {
    'ChannelType': 'training',
    'TargetAttributeName': 'Churn',
    'DataSource': {
        'S3DataSource': {
            'S3DataType': 'S3Prefix',
            'S3Uri': train_data_uri,
        },
    },
}
input_data_config = [training_channel]

# Where Autopilot writes its artifacts.
output_data_config = {
    'S3OutputPath': f's3://{bucket}/{prefix}/manual-output/autopilot-en',
}

# Ensembling mode, capped at 10 candidates.
autoMLJobConfig = {
    'Mode': 'ENSEMBLING',
    'CompletionCriteria': {'MaxCandidates': 10},
}

# Metric the job optimizes.
autoMLJobObjective = {'MetricName': 'Precision'}

# Alias for the header-less, target-less test file.
test_data_s3_path = test_data_uri

### Launching the SageMaker Autopilot Job
You can now launch the Autopilot job by calling the `create_auto_ml_job` API. See the [create-auto-ml-job reference](https://docs.aws.amazon.com/cli/latest/reference/sagemaker/create-auto-ml-job.html) for the available parameters.

In [4]:
import time

# The job name is 'chn' followed by the current epoch second.
auto_ml_job_name = f'chn{int(time.time())}'
print(f'AutoMLJobName: {auto_ml_job_name}')

# Collect the request in one place, then submit it.
create_job_request = {
    'AutoMLJobName': auto_ml_job_name,
    'InputDataConfig': input_data_config,
    'OutputDataConfig': output_data_config,
    'AutoMLJobConfig': autoMLJobConfig,
    'AutoMLJobObjective': autoMLJobObjective,
    'ProblemType': "BinaryClassification",
    'RoleArn': role,
}

# Kept as the last expression so the API response is rendered as the cell output.
sm.create_auto_ml_job(**create_job_request)

AutoMLJobName: chn1677611865


{'AutoMLJobArn': 'arn:aws:sagemaker:us-east-2:791580863750:automl-job/chn1677611865',
 'ResponseMetadata': {'RequestId': '938abb4d-a80d-42f3-9f52-6cc2454a7da0',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '938abb4d-a80d-42f3-9f52-6cc2454a7da0',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '84',
   'date': 'Tue, 28 Feb 2023 19:17:46 GMT'},
  'RetryAttempts': 0}}

## Tracking SageMaker Autopilot job progress<a name="Tracking"></a>
A SageMaker Autopilot job consists of the following high-level steps:
* Analyzing Data, where the dataset is analyzed and Autopilot comes up with a list of ML pipelines that should be tried out on the dataset. The dataset is also split into train and validation sets.
* Feature Engineering, where Autopilot performs feature transformation on individual features of the dataset as well as at an aggregate level.
* Model Tuning, where the top performing pipeline is selected along with the optimal hyperparameters for the training algorithm (the last stage of the pipeline). 

In [6]:
# Poll the AutoML job until it reaches a terminal state, printing one
# "status - secondary status" line per poll.
print('JobStatus - Secondary Status')
print('------------------------------')

AUTOML_TERMINAL_STATUSES = ('Failed', 'Completed', 'Stopped')
AUTOML_POLL_SECONDS = 30

while True:
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response['AutoMLJobStatus']
    print(job_run_status + " - " + describe_response['AutoMLJobSecondaryStatus'])

    # Check before sleeping so the cell returns as soon as the job is done,
    # instead of waiting another interval after the final status is known.
    if job_run_status in AUTOML_TERMINAL_STATUSES:
        break
    time.sleep(AUTOML_POLL_SECONDS)

# The following cells read 'BestCandidate', which is only meaningful after a
# successful run, so surface the reason when the job did not complete.
if job_run_status != 'Completed':
    print('AutoML job did not complete: ' + describe_response.get('FailureReason', 'no failure reason reported'))

JobStatus - Secondary Status
------------------------------
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
Completed - Completed


## Results

Now use the describe_auto_ml_job API to look up the best candidate selected by the SageMaker Autopilot job. 

In [None]:
#sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)

In [7]:
# Fetch the job description and keep the winning candidate for later cells.
automl_job_description = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
best_candidate = automl_job_description['BestCandidate']
best_candidate_name = best_candidate['CandidateName']

# Blank lines, then the candidate name and its objective metric.
print('\n')
print(f"CandidateName: {best_candidate_name}")
objective_metric = best_candidate['FinalAutoMLJobObjectiveMetric']
print(f"FinalAutoMLJobObjectiveMetricName: {objective_metric['MetricName']}")
print(f"FinalAutoMLJobObjectiveMetricValue: {objective_metric['Value']}")



CandidateName: WeightedEnsemble-L2-FULL-t1791580863750chn1677611865
FinalAutoMLJobObjectiveMetricName: Precision
FinalAutoMLJobObjectiveMetricValue: 0.7345971465110779


In [19]:
# Show the full 'BestCandidate' record taken from describe_auto_ml_job
# (bare expression, so the notebook renders it as the cell output).
best_candidate

{'CandidateName': 'WeightedEnsemble-L2-FULL-t1791580863750chn1677611865',
 'FinalAutoMLJobObjectiveMetric': {'Type': 'Maximize',
  'MetricName': 'Precision',
  'Value': 0.7345971465110779,
  'StandardMetricName': 'Precision'},
 'ObjectiveStatus': 'Succeeded',
 'CandidateSteps': [{'CandidateStepType': 'AWS::SageMaker::ProcessingJob',
   'CandidateStepArn': 'arn:aws:sagemaker:us-east-2:791580863750:processing-job/chn1677611865-t1-1-ce6f3883c79a4556932c9cf8258d42a6bba35543204a',
   'CandidateStepName': 'chn1677611865-t1-1-ce6f3883c79a4556932c9cf8258d42a6bba35543204a'}],
 'CandidateStatus': 'Completed',
 'InferenceContainers': [{'Image': '763104351884.dkr.ecr.us-east-2.amazonaws.com/autogluon-inference:0.4.3-cpu-py38-ubuntu20.04',
   'ModelDataUrl': 's3://lawsnic-aiml-east2/kaggle/customerChurn/manual-output/autopilot-en/chn1677611865/sagemaker-automl-candidates/model/WeightedEnsemble-L2-FULL-t1/model.tar.gz',
   'Environment': {'MODEL_NAME': 'WeightedEnsemble-L2-FULL',
    'SAGEMAKER_DEFA

In [64]:
# The model name carries the current epoch second as a suffix.
model_name = f'automl-custChurn-model-{int(time.time())}'

# Create a SageMaker model from the best candidate's inference containers.
model = sm.create_model(
    ModelName=model_name,
    Containers=best_candidate['InferenceContainers'],
    ExecutionRoleArn=role,
)

print(f"Model ARN corresponding to the best candidate is : {model['ModelArn']}")

# `model` is the dict returned by create_model, so it has no `.model_data`
# attribute; index it by key (e.g. model['ModelArn']) instead.

Model ARN corresponding to the best candidate is : arn:aws:sagemaker:us-east-2:791580863750:model/automl-custchurn-model-1677698200


AttributeError: 'dict' object has no attribute 'model_data'

### Register the best model with Model Registry

#### Create model group 

In [10]:
# Create a Model Package Group: https://docs.aws.amazon.com/sagemaker/latest/dg/model-registry-model-group.html
import time
from time import gmtime, strftime

# Name and description of the Model Registry group that holds the churn models.
model_package_group_name = "custChurnDemo"
model_package_group_input_dict = {
    "ModelPackageGroupName": model_package_group_name,
    "ModelPackageGroupDescription": "Model package group for the customer churn Autopilot models",
}

# Create the group, or reuse it when the create call is rejected.
# Both responses contain "ModelPackageGroupArn", which is all the cells below read.
try:
    create_model_pacakge_group_response = sm.create_model_package_group(
        **model_package_group_input_dict
    )
except sm.exceptions.ClientError:
    # NOTE(review): assumes the create call failed because the group already
    # exists (e.g. on a re-run). If the group is missing for any other reason,
    # the describe call below raises and surfaces the real problem.
    create_model_pacakge_group_response = sm.describe_model_package_group(
        ModelPackageGroupName=model_package_group_name
    )

print(
    "ModelPackageGroup Arn : {}".format(create_model_pacakge_group_response["ModelPackageGroupArn"])
)

ModelPackageGroup Arn : arn:aws:sagemaker:us-east-2:791580863750:model-package-group/custchurndemo


In [91]:
# ARN of the model package group that the new version is registered under.
model_package_group_arn = create_model_pacakge_group_response["ModelPackageGroupArn"]

# Container definition of the best Autopilot candidate: image, model artifact
# and environment variables are all copied into the registry entry so the
# model can later be rebuilt from the registry alone.
best_container = best_candidate['InferenceContainers'][0]
model_url = best_container['ModelDataUrl']
model_env = best_container['Environment']

modelpackage_inference_specification = {
    "InferenceSpecification": {
        "Containers": [
            {
                "Image": best_container['Image'],
                "ModelDataUrl": model_url,
                "Environment": model_env,
            }
        ],
        "SupportedContentTypes": ["text/csv"],
        "SupportedResponseMIMETypes": ["text/csv"],
    }
}

# Request for a new, pre-approved version in the group.
create_model_package_input_dict = {
    "ModelPackageGroupName": model_package_group_arn,
    "ModelPackageDescription": "Best SageMaker Autopilot candidate for customer churn binary classification",
    "ModelApprovalStatus": "Approved",
}
create_model_package_input_dict.update(modelpackage_inference_specification)

# Register the model package version and keep its ARN.
create_mode_package_response = sm.create_model_package(**create_model_package_input_dict)
model_package_arn = create_mode_package_response["ModelPackageArn"]

### Retrieve the model back from the Registry to do batch inference

In [98]:
# List the versioned packages of the group, newest first.
pkg_list = sm.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageType='Versioned',
    SortBy='CreationTime',
    SortOrder='Descending',
)

# Fail with a clear message instead of an IndexError when nothing is registered.
package_summaries = pkg_list['ModelPackageSummaryList']
if not package_summaries:
    raise RuntimeError(
        "No model package versions found in group '{}'".format(model_package_group_name)
    )

# The first entry is the most recently created version.
pkg_model_arn = package_summaries[0]['ModelPackageArn']

# Describe the package once and reuse the response for both display and later cells.
pkg_description = sm.describe_model_package(ModelPackageName=pkg_model_arn)
print(pkg_description)

# Create a Model object from the retrieved model package.
pkg_model = sagemaker.ModelPackage(
    model_package_arn=pkg_model_arn,
    role=role,
    sagemaker_session=session,
)

{'ModelPackageGroupName': 'custChurnDemo', 'ModelPackageVersion': 5, 'ModelPackageArn': 'arn:aws:sagemaker:us-east-2:791580863750:model-package/custchurndemo/5', 'ModelPackageDescription': 'Model for regression with the Abalone dataset', 'CreationTime': datetime.datetime(2023, 3, 1, 21, 39, 15, 965000, tzinfo=tzlocal()), 'InferenceSpecification': {'Containers': [{'Image': '763104351884.dkr.ecr.us-east-2.amazonaws.com/autogluon-inference:0.4.3-cpu-py38-ubuntu20.04', 'ImageDigest': 'sha256:f461d5f846f9e2f30db9ee57ff6fa6c8ba2263f8cf343f031e1c178a6b6af864', 'ModelDataUrl': 's3://lawsnic-aiml-east2/kaggle/customerChurn/manual-output/autopilot-en/chn1677611865/sagemaker-automl-candidates/model/WeightedEnsemble-L2-FULL-t1/model.tar.gz', 'Environment': {'MODEL_NAME': 'WeightedEnsemble-L2-FULL', 'SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT': 'text/csv', 'SAGEMAKER_INFERENCE_OUTPUT': 'predicted_label', 'SAGEMAKER_INFERENCE_SUPPORTED': 'predicted_label,probability,probabilities,labels', 'SAGEMAKER_PROGR

In [100]:
# NOTE(review): the name is hard-coded; presumably create_model rejects a name
# that already exists, so it has to be changed before re-running — confirm.
registry_model_name = 'latestCustChurnRegisteredPkg4'

# Rebuild the container definition purely from the registry entry.
registered_container = pkg_description['InferenceSpecification']['Containers'][0]
registry_container_definition = {
    'Image': registered_container['Image'],
    'Mode': 'SingleModel',
    'ModelDataUrl': registered_container['ModelDataUrl'],
    # Attempt 4.1: pass the environment stored with the package rather than
    # the earlier {"MMS_DEFAULT_WORKERS_PER_MODEL": '1'} override.
    'Environment': registered_container['Environment'],
}

testModel = sm.create_model(
    ModelName=registry_model_name,
    Containers=[registry_container_definition],
    ExecutionRoleArn=role,
)

In [101]:
#registry_model_name = 'latestCustChurnRegisteredPkg2'
#print(testModel)

### Perform batch inference using the best candidate

Now that you have successfully completed the SageMaker Autopilot job on the dataset, create a model from any of the candidates by using [Inference Pipelines](https://docs.aws.amazon.com/sagemaker/latest/dg/inference-pipelines.html). 

In [102]:
from time import gmtime, strftime, sleep 
from sagemaker.inputs import BatchDataCaptureConfig
from sagemaker.model_monitor import DataCaptureConfig
# Transform job name: fixed stem plus day-hour-minute-second in UTC.
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())
transform_job_name = f'registry-custchurn-tr-{timestamp_suffix}'

# S3 destinations for the predictions and for the captured data.
inference_output_path = f's3://{bucket}/{prefix}/inference-results'
capture_destination = f's3://{bucket}/{prefix}/model-monitor/batch-results'

# Input: the CSV at test_data_uri, split line by line.
transform_input = {
    'DataSource': {
        'S3DataSource': {
            'S3DataType': 'S3Prefix',
            'S3Uri': test_data_uri,
        },
    },
    'ContentType': 'text/csv',
    'CompressionType': 'None',
    'SplitType': 'Line',
}

# Output: CSV, reassembled line by line.
transform_output = {
    'S3OutputPath': inference_output_path,
    'Accept': 'text/csv',
    'AssembleWith': 'Line',
}

transform_resources = {
    'InstanceType': 'ml.m5.4xlarge',
    'InstanceCount': 1,
}

data_capture_config = {
    'DestinationS3Uri': capture_destination,
    'GenerateInferenceId': True,
}

# Join each prediction with its input record.
data_proc = {'JoinSource': 'Input'}

# History of earlier attempts (all disabled):
#   1. sm.create_transform_job with ModelName=model_name — the model created
#      directly from the Autopilot candidate; the baseline that should always work.
#   2. sagemaker.transformer.Transformer(model_name=pkg_model_arn, ...) —
#      failed because the model name exceeded 63 characters.
#   3. pkg_model.transformer(...) followed by
#      transformer.transform(test_data_uri, content_type='text/csv', split_type='Line').
#   An explicit Environment override on the transform job (SAGEMAKER_PROGRAM,
#   SAGEMAKER_SUBMIT_DIRECTORY, SAGEMAKER_CONTAINER_LOG_LEVEL, SAGEMAKER_REGION)
#   is likewise left disabled.
#
# Attempt 4: use the model created from the describe_model_package output.
# Attempt 4.1: the Environment config is now stored with the registry entry.
transform_job_request = {
    'TransformJobName': transform_job_name,
    'ModelName': registry_model_name,
    'TransformInput': transform_input,
    'TransformOutput': transform_output,
    'TransformResources': transform_resources,
    'DataCaptureConfig': data_capture_config,
    'DataProcessing': data_proc,
}
transformer = sm.create_transform_job(**transform_job_request)

In [103]:
# Poll the batch transform job until it reaches a terminal state, printing the
# status once per poll.
print('JobStatus')
print('----------')

TRANSFORM_TERMINAL_STATUSES = ('Failed', 'Completed', 'Stopped')
TRANSFORM_POLL_SECONDS = 30

while True:
    describe_response = sm.describe_transform_job(TransformJobName=transform_job_name)
    job_run_status = describe_response['TransformJobStatus']
    print(job_run_status)

    # Check before sleeping so the cell returns as soon as the job is done,
    # instead of waiting another interval after the final status is known.
    if job_run_status in TRANSFORM_TERMINAL_STATUSES:
        break
    sleep(TRANSFORM_POLL_SECONDS)

# The next cells download the job output, so surface the reason when there is none.
if job_run_status != 'Completed':
    print('Transform job did not complete: ' + describe_response.get('FailureReason', 'no failure reason reported'))

JobStatus
----------
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


In [None]:
# Object key of the transform output for test.csv, and the local download target.
s3_output_key = f'{prefix}/inference-results/test.csv.out'
local_inference_results_path = 'inference_results.csv'

# From here on `s3` is a boto3 *resource*; the setup cell bound the same name
# to a boto3 *client*.
s3 = boto3.resource('s3')
inference_results_bucket = s3.Bucket(bucket)

print(s3_output_key)

In [None]:
# Download the batch transform output next to the notebook.
inference_results_bucket.download_file(s3_output_key, local_inference_results_path)

# The transform input was written as comma-separated CSV without a header row
# and the job returns 'text/csv', so read the result with the default ','
# separator and header=None. With sep=';' every line would collapse into a
# single column, and the first record would be consumed as the header.
data = pd.read_csv(local_inference_results_path, header=None)
pd.set_option('display.max_rows', 10)         # Keep the output on one page
data