# Forest Cover Type 2a): SageMaker Autopilot


In [None]:
%load_ext autoreload
%autoreload 2

# External Dependencies:
import boto3
import numpy as np
import pandas as pd
import sagemaker
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

# Local Dependencies:
import util

In [None]:
%store -r bucket_name
%store -r experiment_name
%store -r preproc_trial_component_name

# AWS / SageMaker handles shared by the rest of the notebook:
smsess = sagemaker.session.Session()
smclient = boto3.client("sagemaker")
role = sagemaker.get_execution_role()
# S3 bucket identified by the `bucket_name` value restored via %store above:
bucket = boto3.resource("s3").Bucket(bucket_name)

In [None]:
# Create a new, timestamped Trial for this Autopilot run inside the Experiment named by
# `experiment_name`:
automl_trial = Trial.create(
    sagemaker_boto_client=smclient,
    experiment_name=experiment_name,
    trial_name=util.append_timestamp("tabnet-automl"),
)
# Attach the existing pre-processing trial component to the new Trial:
automl_trial.add_trial_component(preproc_trial_component_name)

In [None]:
# Configure the Autopilot job: multi-class classification of `Cover_Type`, scored on accuracy, with
# results written under the "automl" prefix of our bucket.
autoestimator = sagemaker.automl.automl.AutoML(
    # What to predict and how to score it:
    target_attribute_name="Cover_Type",
    problem_type="MulticlassClassification",
    job_objective={"MetricName": "Accuracy"},
    # Search budget (the two runtime limits below are not passed, so they keep their defaults):
    max_candidates=30,
    # max_runtime_per_training_job_in_seconds=None,
    # total_job_runtime_in_seconds=None,
    generate_candidate_definitions_only=False,
    # Where, and as whom, to run:
    role=role,
    sagemaker_session=smsess,
    output_path=f"s3://{bucket_name}/automl",
    base_job_name="auto-forestcover",
    tags=None,
)

In [None]:
# Start the Autopilot job on the training CSV without blocking this cell:
autoestimator.fit(
    [f"s3://{bucket_name}/data/train.csv"],
    logs=False,  # logs=True only works together with wait=True
    wait=False,
    # The default job name leaves very few free prefix characters, so you might want to set the
    # name explicitly:
    # job_name=util.append_timestamp("auto-frstcv"),
)

# Keep the name of the job that was just started, for use in later cells:
auto_ml_job_name = autoestimator.current_job_name

In [None]:
def is_automl_status_done(status):
    """Report whether an AutoML job description shows that the job has finished successfully.

    Args:
        status: Mapping containing an "AutoMLJobStatus" entry.

    Returns:
        True if the job status is "Completed", False for any other non-failure status.

    Raises:
        ValueError: If the job status is "Failed" or "Stopped".
    """
    job_status = status["AutoMLJobStatus"]
    # Terminal-but-unsuccessful states abort the wait instead of being reported as "done":
    if job_status in ("Failed", "Stopped"):
        raise ValueError(f"Job ended in non-successful state '{status['AutoMLJobStatus']}'\n{status}")
    return job_status == "Completed"

# Wait for the job using the local spinner helper, which is given the SDK's describe call as the
# status source and `is_automl_status_done` as the completion test. The primary and secondary
# job status are formatted for display. (Presumably `poll_secs` is the interval between
# describe calls - see util.spinner.)
util.spinner.wait(
    autoestimator.describe_auto_ml_job,
    is_automl_status_done,
    poll_secs=30,
    spinner_secs=0.4,
    fn_stringify_result=lambda desc: f"{desc['AutoMLJobStatus']} - {desc['AutoMLJobSecondaryStatus']}",
)

## Alternative Boto3 method (vs SageMaker SDK)

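Don't run me! This section shows the equivalent low-level Boto3 calls for reference only: running it in addition to the SageMaker SDK cells above would start a second Autopilot job and overwrite `auto_ml_job_name`.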

In [None]:

# Create the same Autopilot job through the low-level SageMaker API (Boto3) rather than the
# SageMaker SDK. Unlike the SDK route, the job name is chosen explicitly here.
auto_ml_job_name = util.append_timestamp("auto-frstcv")
create_automl_response = smclient.create_auto_ml_job(
    AutoMLJobName=auto_ml_job_name,
    RoleArn=role,
    ProblemType="MulticlassClassification",
    AutoMLJobObjective={"MetricName": "Accuracy"},
    InputDataConfig=[
        {
            "TargetAttributeName": "Cover_Type",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"s3://{bucket_name}/data/train.csv",
                },
            },
        },
    ],
    OutputDataConfig={"S3OutputPath": f"s3://{bucket_name}/automl"},
    AutoMLJobConfig={
        "CompletionCriteria": {
            "MaxCandidates": 30,
            # "MaxRuntimePerTrainingJobInSeconds": 123,
            # "MaxAutoMLJobRuntimeInSeconds": 123,
        },
    },
)

In [None]:
def get_automl_job_status(job_name):
    """Fetch the primary and secondary status of an AutoML job via the Boto3 SageMaker client.

    Args:
        job_name: Name of the AutoML job to describe.

    Returns:
        Tuple `(AutoMLJobStatus, AutoMLJobSecondaryStatus)` taken from the describe response.
    """
    # Describe the job that was asked for. Previously the `job_name` argument was ignored and the
    # notebook-global `auto_ml_job_name` was always used instead.
    response = smclient.describe_auto_ml_job(AutoMLJobName=job_name)
    return response["AutoMLJobStatus"], response["AutoMLJobSecondaryStatus"]

def is_automl_status_done(status):
    """Report whether a (primary, secondary) status pair shows that the job finished successfully.

    Args:
        status: Sequence whose first element is the job's primary status string.

    Returns:
        True if the primary status is "Completed", False for any other non-failure status.

    Raises:
        ValueError: If the primary status is "Failed" or "Stopped".
    """
    primary = status[0]
    # Terminal-but-unsuccessful states abort the wait instead of being reported as "done":
    if primary in ("Failed", "Stopped"):
        raise ValueError(f"Job ended in non-successful state '{status}'")
    return primary == "Completed"

# Wait for the Boto3-created job using the local spinner helper. The status source returns a
# (primary, secondary) tuple, which is displayed joined by " - ".
util.spinner.wait(
    lambda: get_automl_job_status(auto_ml_job_name),
    is_automl_status_done,
    poll_secs=5,
    spinner_secs=0.4,
    fn_stringify_result=" - ".join,
)

In [None]:
# Fetch the full description of the Autopilot job named by `auto_ml_job_name`:
smclient.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)

## Logging in Our Experiment

Autopilot always creates an **Experiment**, with associated Trials and Trial Components, describing the details of the flow it undertook.

For the purposes of **our Experiment** (as created in Notebook 1), whose goal is to compare Autopilot with other methods, the Autopilot run is just one Trial, and we only care about its best/selected results.

In [None]:
# describe_auto_ml_job() doesn't seem to give us anything from which to reconstruct the name of the
# Experiment that Autopilot created, so we assume it is the AutoML job name plus the standard
# "-aws-auto-ml-job" suffix.
# NOTE(review): this naming convention is an assumption - confirm it against the Autopilot docs.
automl_experiment = Experiment.load(f"{auto_ml_job_name}-aws-auto-ml-job")

In [None]:
# TODO: Extract relevant data from the 'best' Trial/Components of AutoML Experiment, and copy the info to a Trial in our Experiment
# For now, just list the Trial Components of the first Trial that the AutoML Experiment returns:
first_trial_summary = list(automl_experiment.list_trials())[0]
first_trial = Trial.load(first_trial_summary.trial_name)
list(first_trial.list_trial_components())

## Deploy

In [None]:
# TODO: Placeholder call - the literal `...` (Ellipsis) is passed as the only argument. Replace it
# with real deployment settings before running this cell.
# NOTE(review): presumably an instance count and instance type are required - confirm against the
# SageMaker SDK documentation for AutoML.deploy().
autoestimator.deploy(...)