# Autopilot ensembling example notebook for the Kaggle customer churn data set
This example uses the Kaggle data set at https://www.kaggle.com/c/customer-churn-prediction-2020. We assume the data set has already been loaded into an S3 bucket and that no feature engineering has been done.

## Initial setup of the environment

In [2]:
import time

import boto3
import pandas as pd
import sagemaker
from sagemaker import get_execution_role

# --- AWS session, clients and execution role ---------------------------------
# A single boto3 session is shared by both clients so they resolve the same
# region and credentials.
boto_session = boto3.Session()
region = boto_session.region_name

session = sagemaker.Session()
bucket = 'lawsnic-aiml-east2'
prefix = 'kaggle/customerChurn'

role = get_execution_role()

sm = boto_session.client(service_name='sagemaker', region_name=region)
s3 = boto_session.client(service_name='s3', region_name=region)

# --- Data locations ----------------------------------------------------------
# Raw inputs read by this cell.
raw_train_data_uri = 's3://lawsnic-aiml-east2/kaggle/customerChurn/features/partial/train/CustomerChurnDW-2023-01-06T14-14-13/part-00000-642a0746-49b9-4fda-ad2d-98afe1db11ec-c000.csv'
raw_test_data_uri = 's3://lawsnic-aiml-east2/kaggle/customerChurn/features/partial/test/CustomerChurnDW-2023-01-06T14-14-13/part-00000-ccd6e4dd-898c-4fc4-a63a-85d1cfcfc4dc-c000.csv'

# Prepared outputs written by this cell. Later cells use `train_data_uri` and
# `test_data_uri`, so those names always point at the prepared files.
clarify_base_uri = 's3://{}/{}/clarify'.format(bucket, prefix)
train_data_uri = clarify_base_uri + '/train.csv'
test_data_uri_with_target = clarify_base_uri + '/test_with_target.csv'
test_data_uri = clarify_base_uri + '/test.csv'

# Columns that are cast to int in both the train and the test frame.
# NOTE(review): presumably boolean flags in the source CSVs - confirm.
INT_CAST_COLUMNS = [
    "SeniorCitizen",
    "Churn",
    "Partner",
    "Dependents",
    "PhoneService",
    "PaperlessBilling",
]


def cast_columns_to_int(frame, columns=INT_CAST_COLUMNS):
    """Return a copy of ``frame`` with each of ``columns`` cast to ``int``.

    Args:
        frame: DataFrame containing every column named in ``columns``.
        columns: Column names to cast; defaults to ``INT_CAST_COLUMNS``.

    Returns:
        A new DataFrame; ``frame`` itself is not modified.

    Raises:
        KeyError: If any of ``columns`` is missing from ``frame``.
    """
    return frame.astype({column: int for column in columns})


# --- Prepare and upload the training set (header and target column kept) ------
train_data = cast_columns_to_int(pd.read_csv(raw_train_data_uri))
train_data.to_csv(train_data_uri, index=False)

# --- Prepare and upload the test set -----------------------------------------
test_data = cast_columns_to_int(pd.read_csv(raw_test_data_uri))
test_data.to_csv(test_data_uri_with_target, index=False)

# Second copy of the test set without the target column and without a header row.
test_data2 = test_data.drop(["Churn"], axis=1)
test_data2.to_csv(test_data_uri, index=False, header=False)

print(role)

arn:aws:iam::791580863750:role/service-role/AmazonSageMaker-ExecutionRole-20220707T123330


## Configure AutoPilot parameters

In [3]:
# Single training channel: the prepared training CSV, with 'Churn' as the
# column Autopilot should learn to predict.
training_s3_source = {
    'S3DataType': 'S3Prefix',
    'S3Uri': train_data_uri,
}
training_channel = {
    'DataSource': {'S3DataSource': training_s3_source},
    'ChannelType': 'training',
    'TargetAttributeName': 'Churn',
}
input_data_config = [training_channel]

# S3 location under the project bucket/prefix that is passed to the job as its
# output path.
autopilot_output_path = 's3://{}/{}/manual-output/autopilot-en'.format(bucket, prefix)
output_data_config = {'S3OutputPath': autopilot_output_path}

# Metric the job is asked to optimise.
autoMLJobObjective = {'MetricName': 'Precision'}

# Alias for the prepared test-set location.
test_data_s3_path = test_data_uri

### Configure AutoPilot job 
This is split into its own cell because this is where the algorithm selection for the Autopilot job can be changed.

In [4]:
# Algorithms the job is restricted to; edit this list to change the selection.
ensemble_algorithms = ['catboost', 'fastai', 'xgboost']

# Job configuration: ensembling mode, capped at 10 candidates, limited to the
# algorithms listed above.
autoMLJobConfig = {
    'CompletionCriteria': {'MaxCandidates': 10},
    'Mode': 'ENSEMBLING',
    'CandidateGenerationConfig': {
        'AlgorithmsConfig': [
            {'AutoMLAlgorithms': ensemble_algorithms},
        ],
    },
}

## Launching the SageMaker Autopilot Job
You can now launch the Autopilot job by calling the create_auto_ml_job API. https://docs.aws.amazon.com/cli/latest/reference/sagemaker/create-auto-ml-job.html

In [5]:
# `time` is imported in the notebook's top import cell rather than here, so
# every cell that uses it works regardless of whether this cell has been run.

# The epoch-second suffix gives repeated launches distinct job names
# (to one-second resolution).
auto_ml_job_name = 'chn' + str(int(time.time()))
print('AutoMLJobName: ' + auto_ml_job_name)

# Kept as the cell's last expression so the API response is shown as output.
sm.create_auto_ml_job(
    AutoMLJobName=auto_ml_job_name,
    InputDataConfig=input_data_config,
    OutputDataConfig=output_data_config,
    AutoMLJobConfig=autoMLJobConfig,
    AutoMLJobObjective=autoMLJobObjective,
    ProblemType="BinaryClassification",
    RoleArn=role,
)

AutoMLJobName: chn1678306696


{'AutoMLJobArn': 'arn:aws:sagemaker:us-east-2:791580863750:automl-job/chn1678306696',
 'ResponseMetadata': {'RequestId': '427c3e3b-b346-4e2a-92d9-0a562112f8ad',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '427c3e3b-b346-4e2a-92d9-0a562112f8ad',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '84',
   'date': 'Wed, 08 Mar 2023 20:18:17 GMT'},
  'RetryAttempts': 0}}

## Tracking SageMaker Autopilot job progress<a name="Tracking"></a>
Optionally poll SageMaker API to see if job has completed.

**Note:** this polling cell can be started or stopped at any time; doing so does not affect training.

A SageMaker Autopilot job consists of the following high-level steps:
* Analyzing Data, where the dataset is analyzed and Autopilot comes up with a list of ML pipelines that should be tried out on the dataset. The dataset is also split into train and validation sets.
* Feature Engineering, where Autopilot performs feature transformation on individual features of the dataset as well as at an aggregate level.
* Model Tuning, where the top performing pipeline is selected along with the optimal hyperparameters for the training algorithm (the last stage of the pipeline). 

In [6]:
# Poll the Autopilot job until it reaches a terminal status, printing one
# "status - secondary status" line per poll.
print('JobStatus - Secondary Status')
print('------------------------------')

TERMINAL_JOB_STATUSES = ('Failed', 'Completed', 'Stopped')
POLL_INTERVAL_SECONDS = 30

while True:
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response['AutoMLJobStatus']
    print(job_run_status + " - " + describe_response['AutoMLJobSecondaryStatus'])

    # Check before sleeping: no duplicate first poll, and no extra wait once
    # the job has already finished.
    if job_run_status in TERMINAL_JOB_STATUSES:
        break
    time.sleep(POLL_INTERVAL_SECONDS)

# Surface the reason on failure, if the service supplied one.
if job_run_status == 'Failed':
    print('FailureReason: ' + str(describe_response.get('FailureReason', 'not provided')))

JobStatus - Secondary Status
------------------------------
InProgress - Starting
InProgress - Starting
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
InProgress - TrainingModels
Completed - Completed


## Results

Now use the describe_auto_ml_job API to look up the best candidate selected by the SageMaker Autopilot job. 

In [7]:
# Fetch the job's full description and show it as the cell output.
job_details = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
job_details

{'AutoMLJobName': 'chn1678306696',
 'AutoMLJobArn': 'arn:aws:sagemaker:us-east-2:791580863750:automl-job/chn1678306696',
 'InputDataConfig': [{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
     'S3Uri': 's3://lawsnic-aiml-east2/kaggle/customerChurn/clarify/train.csv'}},
   'TargetAttributeName': 'Churn',
   'ContentType': 'text/csv;header=present',
   'ChannelType': 'training'}],
 'OutputDataConfig': {'S3OutputPath': 's3://lawsnic-aiml-east2/kaggle/customerChurn/manual-output/autopilot-en'},
 'RoleArn': 'arn:aws:iam::791580863750:role/service-role/AmazonSageMaker-ExecutionRole-20220707T123330',
 'AutoMLJobObjective': {'MetricName': 'Precision'},
 'ProblemType': 'BinaryClassification',
 'AutoMLJobConfig': {'CompletionCriteria': {'MaxCandidates': 10},
  'CandidateGenerationConfig': {'AlgorithmsConfig': [{'AutoMLAlgorithms': ['catboost',
      'fastai',
      'xgboost']}]},
  'Mode': 'ENSEMBLING'},
 'CreationTime': datetime.datetime(2023, 3, 8, 20, 18, 17, 126000, tzinfo=tzloc

In [7]:
# Look up the job's best candidate and report its name and objective metric.
completed_job = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
best_candidate = completed_job['BestCandidate']
best_candidate_name = best_candidate['CandidateName']
objective_metric = best_candidate['FinalAutoMLJobObjectiveMetric']

print('\n')
print("CandidateName: ", best_candidate_name, sep='')
print("FinalAutoMLJobObjectiveMetricName: ", objective_metric['MetricName'], sep='')
print("FinalAutoMLJobObjectiveMetricValue: ", objective_metric['Value'], sep='')



CandidateName: WeightedEnsemble-L2-FULL-t1791580863750chn1677611865
FinalAutoMLJobObjectiveMetricName: Precision
FinalAutoMLJobObjectiveMetricValue: 0.7345971465110779


In [19]:
# Show the complete 'BestCandidate' record taken from describe_auto_ml_job
# (its name, objective metric, steps and inference containers appear in the output).
best_candidate

{'CandidateName': 'WeightedEnsemble-L2-FULL-t1791580863750chn1677611865',
 'FinalAutoMLJobObjectiveMetric': {'Type': 'Maximize',
  'MetricName': 'Precision',
  'Value': 0.7345971465110779,
  'StandardMetricName': 'Precision'},
 'ObjectiveStatus': 'Succeeded',
 'CandidateSteps': [{'CandidateStepType': 'AWS::SageMaker::ProcessingJob',
   'CandidateStepArn': 'arn:aws:sagemaker:us-east-2:791580863750:processing-job/chn1677611865-t1-1-ce6f3883c79a4556932c9cf8258d42a6bba35543204a',
   'CandidateStepName': 'chn1677611865-t1-1-ce6f3883c79a4556932c9cf8258d42a6bba35543204a'}],
 'CandidateStatus': 'Completed',
 'InferenceContainers': [{'Image': '763104351884.dkr.ecr.us-east-2.amazonaws.com/autogluon-inference:0.4.3-cpu-py38-ubuntu20.04',
   'ModelDataUrl': 's3://lawsnic-aiml-east2/kaggle/customerChurn/manual-output/autopilot-en/chn1677611865/sagemaker-automl-candidates/model/WeightedEnsemble-L2-FULL-t1/model.tar.gz',
   'Environment': {'MODEL_NAME': 'WeightedEnsemble-L2-FULL',
    'SAGEMAKER_DEFA