In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="bima-experiment")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code EDFNF5MZG to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
Workspace name: quick-starts-ws-138651
Azure region: southcentralus
Subscription id: 9e65f93e-bdd8-437b-b1e8-0647cd6098f7
Resource group: aml-quickstarts-138651


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException


# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

cluster_name = "cpu-cluster"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found an existing compute target.')
except ComputeTargetException:
    print('No existing compute target found. Creating a new compute target.')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)

    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

print(compute_target.get_status().serialize())

No existing compute target found. Creating a new compute target.
Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-02-12T01:41:21.696000+00:00', 'errors': None, 'creationTime': '2021-02-12T01:41:19.578347+00:00', 'modifiedTime': '2021-02-12T01:41:34.974303+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [3]:
from azureml.core import ScriptRunConfig, Environment
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, normal, uniform
import os

# Specify parameter sampler
ps = RandomParameterSampling(
     {
        '--C': choice(1,2,4), 
        '--max_iter': choice(10,50,100)
     }
)

# Specify a Policy
policy = BanditPolicy(
           slack_factor = 0.1,
           evaluation_interval = 2
)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create an SKLearn estimator for use with train.py
est = SKLearn(source_directory='./', compute_target=compute_target, entry_script='train.py')

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator = est,
                             hyperparameter_sampling=ps,
                             policy=policy,
                             primary_metric_name="Accuracy",
                             primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                             max_total_runs=40,
                             max_concurrent_runs=4)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.
hyperdrive_run = exp.submit(hyperdrive_config)

from azureml.widgets import RunDetails
RunDetails(hyperdrive_run).show()

hyperdrive_run.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_f673258c-7096-40b8-925a-0849ab351aaa
Web View: https://ml.azure.com/experiments/bima-experiment/runs/HD_f673258c-7096-40b8-925a-0849ab351aaa?wsid=/subscriptions/9e65f93e-bdd8-437b-b1e8-0647cd6098f7/resourcegroups/aml-quickstarts-138651/workspaces/quick-starts-ws-138651

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-02-12T01:41:45.955468][API][INFO]Experiment created<END>\n""<START>[2021-02-12T01:41:46.468717][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-02-12T01:41:46.634934][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-02-12T01:41:47.0694522Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_f673258c-7096-40b8-925a-0849ab351aaa
Web View: https://ml.azure.com/experiments/bima-experiment/runs/HD_f673258c-7096-40b8-925a-0849ab351aaa?wsid=/subscriptions/9e65f

{'runId': 'HD_f673258c-7096-40b8-925a-0849ab351aaa',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-02-12T01:41:45.624813Z',
 'endTimeUtc': '2021-02-12T01:51:38.33761Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '49411f0f-a5d1-4d9f-8f67-539b1cef1172',
  'score': '0.9119932022335518',
  'best_child_run_id': 'HD_f673258c-7096-40b8-925a-0849ab351aaa_0',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg138651.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_f673258c-7096-40b8-925a-0849ab351aaa/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=wjtXZOP2tmr29KVNA6Awgh28WMYj2XpIcLYeTV4YMZ8%3D&st=2021-02-12T01%3A41%3A42Z&se=2021-02-12T09%3A51%3A42Z&sp=r'},
 'submittedBy': 'ODL_User 138651'}

In [5]:
import joblib

# Get the best run and save the model from that run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()

print('Best run:', best_run)
print('Metrics:', best_run_metrics)

# model = best_run.register_model(model_name='hyperdrive_model', model_path='./outputs/model.pkl')

Best run: Run(Experiment: bima-experiment,
Id: HD_f673258c-7096-40b8-925a-0849ab351aaa_0,
Type: azureml.scriptrun,
Status: Completed)
Metrics: {'Regularization Strength:': 2.0, 'Max iterations:': 50, 'Accuracy': 0.9119932022335518}


In [6]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

data_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(data_url)

In [7]:
import pandas as pd
from train import clean_data
from sklearn.model_selection import train_test_split

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

# Join x and y since automl takes a dataset
dataset = x.join(y)

# Get the default datastore to be entered as a parameter in tabular dataset creation
datastore = ws.get_default_datastore()

# Change pandas dataframe into a tabular dataset to be used in automl
training_data = TabularDatasetFactory.register_pandas_dataframe(dataset, datastore, "training_data")



Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/7970cda5-4b34-47ec-a118-905cec5e87ef/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [8]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    compute_target=compute_target,
    experiment_timeout_minutes = 30,
    task = 'classification',
    primary_metric = 'accuracy',
    training_data = training_data,
    label_column_name = 'y',
    n_cross_validations = 4)

In [9]:
# Submit the automl run

experiment = Experiment(ws, "automl_model")
print("Experiment created")

automl_run = experiment.submit(config=automl_config, show_output=True)
RunDetails(automl_run).show()
automl_run.wait_for_completion(show_output=True)

Experiment created
Running on remote.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_70938642-bddc-4546-b5dc-623d53620801

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced dat

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
****************************************************************************************************

 ITERATION   PIPELINE                                       DURATION      METRIC      BEST
         0   MaxAbsScaler LightGBM                          0:00:59       0.9167    0.9167
         1   MaxAbsScaler XGBoostClassifier                 0:01:06       0.9151    0.9167
         2   MaxAbsScaler RandomForest                      0:00:54       0.8933    0.9167
         3   MaxAbsScaler RandomForest                      0:00:59       0.8880    0.9167
         4   MaxAbsScaler RandomForest                      0:01:07       0.8026    0.9167
         5   MaxAbsScaler RandomForest                      0:00:54       0.7382    0.9167
         6   SparseNormalizer XGBoostClassifier             0:01:17       0.9128    0.9167
         7   MaxAbsS

{'runId': 'AutoML_70938642-bddc-4546-b5dc-623d53620801',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-02-12T01:52:30.824231Z',
 'endTimeUtc': '2021-02-12T02:34:25.328902Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '4',
  'target': 'cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"d186d5f6-65aa-49db-a58b-79e3a8ab67c2\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"managed-dataset/7970cda5-4b34-47ec-a118-905cec5e87ef/\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-138651\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"9e65f93e-bdd8-437b-b1e

In [10]:
# Retrieve and save the best automl model
best_run, fitted_model = automl_run.get_output()
print(best_run)
print(fitted_model)
best_run_metrics = best_run.get_metrics()
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)

joblib.dump(value=fitted_model, filename="fitted_automl_model.joblib")

automl_model = best_run.register_model(model_name='automl_model.pkl', 
                                       model_path = './outputs/')

Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


Run(Experiment: automl_model,
Id: AutoML_70938642-bddc-4546-b5dc-623d53620801_0,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('MaxAbsScaler', MaxAbsScaler(copy...
                 LightGBMClassifier(boosting_type='gbdt', class_weight=None,
                                    colsample_bytree=1.0,
                                    importance_type='split', learning_rate=0.1,
                                    max_depth=-1, m

In [11]:
# Delete the cluster instance
AmlCompute.delete(compute_target)

Current provisioning state of AmlCompute is "Deleting"

