## Optimizing an ML Pipeline in Azure

### Initialise workspace

In [16]:
# Attach to the existing Azure ML workspace this notebook runs against.
# The import lives in this cell so that it works on a fresh kernel
# (Restart & Run All); previously `Workspace` was only imported in a later
# cell, so this cell raised NameError when run first.
from azureml.core import Workspace

# Informational only: the display name of the subscription. It is not passed
# to the Workspace constructor below.
subscription_name = 'Udacity CloudLabs Sub - 20'
subscription_id = 'a0a76bad-11a1-4a2d-9887-97a29122c8ed'
resource_group = 'aml-quickstarts-139543'
workspace_name = 'quick-starts-ws-139543'

# Handle to the workspace; every later cell reaches Azure ML through `ws`.
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [17]:
from azureml.core import Workspace, Experiment

# Open the experiment that groups the runs of this project inside the
# workspace referenced by `ws`.
exp = Experiment(workspace=ws, name="udacity-project")

# Report which workspace the notebook is attached to, one field per line.
print('\n'.join((
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
)))

# Start a logging run on the experiment and keep its handle in `run`.
run = exp.start_logging()

Workspace name: quick-starts-ws-139543
Azure region: southcentralus
Subscription id: a0a76bad-11a1-4a2d-9887-97a29122c8ed
Resource group: aml-quickstarts-139543


### Create compute cluster

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###

In [18]:
# Get-or-create the compute cluster used for the HyperDrive sweep.
# Imports are kept in this cell so that both the "found" and the "create"
# path work on a fresh kernel: ComputeTargetException was previously never
# imported, so the except clause itself failed with NameError whenever the
# cluster did not exist yet.
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

cluster_name = "ar-compute"

try:
    # Raises ComputeTargetException when no target with this name exists.
    compute_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute cluster')

except ComputeTargetException:
    print('Creating new compute cluster ...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    # Create the cluster under the same name that was looked up above
    # (the original referenced an undefined `cpu_cluster_name`).
    compute_cluster = ComputeTarget.create(ws, cluster_name, compute_config)

# Block until the cluster is ready, echoing provisioning status.
compute_cluster.wait_for_completion(show_output=True)

Found existing compute cluster

Running


### HyperDrive Config

In [20]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os

# Random search space. The keys are the argument names handed to train.py:
# --C is drawn uniformly from [0.5, 1.0], --max_iter from a fixed set.
ps = RandomParameterSampling(
    {
        "--C": uniform(0.5, 1.0),
        "--max_iter": choice(50, 100, 150, 200),
    }
)

# Bandit policy, evaluated at every interval once the first 5 have passed,
# with a slack factor of 0.1.
policy = BanditPolicy(
    evaluation_interval=1,
    slack_factor=0.1,
    delay_evaluation=5,
)

# Make sure a ./training directory exists next to the notebook.
if "training" not in os.listdir("."):
    os.mkdir("./training")

# SKLearn estimator that runs train.py from the notebook directory on the
# compute cluster.
est = SKLearn(
    source_directory="./",
    entry_script="train.py",
    compute_target=compute_cluster,
    vm_size='STANDARD_D2_V2',
)

# Tie estimator, sampler and policy together: maximise the 'Accuracy' metric
# over at most 20 runs, 4 of them at a time.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4,
)




### Submit HyperDrive run

In [21]:
# Launch the HyperDrive sweep on the experiment.
hyperdrive_run = exp.submit(config=hyperdrive_config)

# Show the run-details widget for the sweep.
RunDetails(hyperdrive_run).show()

# Wait for the sweep to finish, streaming its output into the cell.
hyperdrive_run.wait_for_completion(show_output=True)

# Stop the notebook here if the sweep did not end in the "Completed" state.
assert hyperdrive_run.get_status() == "Completed"



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_0b250445-c62c-4d17-9d59-73c0a29ce537
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_0b250445-c62c-4d17-9d59-73c0a29ce537?wsid=/subscriptions/a0a76bad-11a1-4a2d-9887-97a29122c8ed/resourcegroups/aml-quickstarts-139543/workspaces/quick-starts-ws-139543

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-02-26T21:37:41.641481][API][INFO]Experiment created<END>\n""<START>[2021-02-26T21:37:42.466071][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n"<START>[2021-02-26T21:37:42.9288253Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2021-02-26T21:37:42.693584][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_0b250445-c62c-4d17-9d59-73c0a29ce537
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_0b250445-c62c-4d17-9d59-73c0a29ce537?wsid=/subscriptions/a0a76

### Get Best run

In [22]:
import joblib
# Select the child run that scored best on the primary metric and show the
# arguments it was launched with.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
run_arguments = best_run.get_details()['runDefinition']['arguments']
print(run_arguments)

# Metrics logged by that run, and its identifier.
best_run_metrics = best_run.get_metrics()
print('Best Run Id: ', best_run.id)

# Files the run uploaded.
print(best_run.get_file_names())

print('Accuracy:', best_run_metrics['Accuracy'])

# Register the run's output in the workspace under the name 'best_model'.
# NOTE(review): model_path='./' presumably points at the run's root rather
# than a single model file - confirm this is what should be registered.
model = best_run.register_model(
    model_name='best_model',
    model_path='./',
)

['--C', '0.5171872582703092', '--max_iter', '200']
Best Run Id:  HD_0b250445-c62c-4d17-9d59-73c0a29ce537_0
['azureml-logs/55_azureml-execution-tvmps_e6c8f4cc981faeee70d44c5812eb8178a485a6cb8600400ac6be3ea6cbee0149_d.txt', 'azureml-logs/65_job_prep-tvmps_e6c8f4cc981faeee70d44c5812eb8178a485a6cb8600400ac6be3ea6cbee0149_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_e6c8f4cc981faeee70d44c5812eb8178a485a6cb8600400ac6be3ea6cbee0149_d.txt', 'logs/azureml/92_azureml.log', 'logs/azureml/dataprep/backgroundProcess.log', 'logs/azureml/dataprep/backgroundProcess_Telemetry.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log']
Accuracy: 0.9072837632776934


### Create TabularDataset

In [24]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Build a TabularDataset straight from the public bank-marketing CSV.
# train.py loads the same file in the same way before calling clean_data:
#   ds = TabularDatasetFactory.from_delimited_files(path = csv_path)
#   x, y = clean_data(ds)

# Location of the training data.
csv_path = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'

ds = TabularDatasetFactory.from_delimited_files(path=csv_path)

In [25]:
# Echo the dataset object so its definition is shown as the cell output.
ds

{
  "source": [
    "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
  ],
  "definition": [
    "GetFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ]
}

### Clean Data

In [26]:
from train import clean_data

# Run train.py's clean_data on the dataset; it returns two values, bound
# here to x and y (presumably features and labels - confirm in train.py).
x, y = clean_data(ds)

### Set parameters for AutoMLConfig

In [27]:
from azureml.train.automl import AutoMLConfig

# AutoML settings: a classification task on the 'y' column of `ds`, scored
# by accuracy with 6-fold cross-validation, restricted to ONNX-compatible
# models.
#
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR
# INSTANCE WILL TIME OUT. Running the experiment for longer requires your own
# Azure tenant, which will incur personal costs.
#
# NOTE(review): no compute_target is passed in this configuration - confirm
# that this is intended.
automl_config = AutoMLConfig(
    task='classification',
    primary_metric='accuracy',
    training_data=ds,
    label_column_name='y',
    n_cross_validations=6,
    experiment_timeout_minutes=30,
    enable_onnx_compatible_models=True,
)

In [28]:
# Echo the configuration object as the cell output.
automl_config

<azureml.train.automl.automlconfig.AutoMLConfig at 0x7f0f3c101080>

### Submit AutoML run

In [29]:
# Open a separate experiment for AutoML and submit the configuration to it,
# streaming progress into the cell. `run` now refers to this AutoML run.
experiment = Experiment(workspace=ws, name="automl_experiment")
run = experiment.submit(config=automl_config, show_output=True)

No run_configuration provided, running on local with default configuration
Running on local machine
Parent Run ID: AutoML_b920e6ff-0565-48d9-9d63-6dd79a3206e1

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/Auto

### Retrieve best AutoML model

In [30]:
# Fetch the best AutoML child run together with its fitted model.
best_run, best_model = run.get_output()

# Register the run's ./outputs/ path in the workspace; the returned Model is
# the last expression, so it is shown as the cell output.
best_run.register_model(
    model_name='automl_best_model.pkl',
    model_path='./outputs/',
)

Model(workspace=Workspace.create(name='quick-starts-ws-139543', subscription_id='a0a76bad-11a1-4a2d-9887-97a29122c8ed', resource_group='aml-quickstarts-139543'), name=automl_best_model.pkl, id=automl_best_model.pkl:1, version=1, tags={}, properties={})

In [31]:
# Echo the best AutoML run so its summary is shown as the cell output.
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
automl_experiment,AutoML_b920e6ff-0565-48d9-9d63-6dd79a3206e1_27,,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [32]:
# Echo the fitted model returned by get_output() to inspect its pipeline.
best_model

PipelineWithYTransformations(Pipeline={'memory': None,
                                       'steps': [('datatransformer',
                                                  DataTransformer(enable_dnn=None,
                                                                  enable_feature_sweeping=None,
                                                                  feature_sweeping_config=None,
                                                                  feature_sweeping_timeout=None,
                                                                  featurization_config=None,
                                                                  force_text_dnn=None,
                                                                  is_cross_validation=None,
                                                                  is_onnx_compatible=None,
                                                                  logger=None,
                                                              

### Delete compute cluster

In [None]:
# Delete the compute cluster now that the experiments are done.
compute_cluster.delete()