In [1]:
# Imports
import os

import joblib
from azureml.core import Workspace, Experiment, ScriptRunConfig, Environment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.widgets import RunDetails
#from azureml.train.sklearn import SKLearn depricated
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice

In [2]:
# Set up the values needed to connect to our Azure environment
subscription_id = 'abace65f-7c1e-4e4f-9ea3-7521dee39a5c'
resource_group = 'Azure_Machine_Learning'
workspace_name = 'CerionML'
cluster_name = 'cerionCompute'
experiment_name = 'hyper-param-experiment'
# See the environment listing further down if unsure which environments exist in this workspace
environment_name = 'AzureML-sklearn-0.24-ubuntu18.04-py37-cpu'

# Grab the workspace and the experiment that will hold the HyperDrive runs
ws = Workspace(subscription_id, resource_group, workspace_name)
exp = Experiment(workspace=ws, name=experiment_name)

print('Workspace name: ' + ws.name,
      'Azure region: ' + ws.location,
      'Subscription id: ' + ws.subscription_id,
      'Resource group: ' + ws.resource_group,
      'Cluster name: ' + cluster_name,
      'Experiment name: ' + experiment_name,
      'Environment name: ' + environment_name, sep='\n')

# NOTE: no `exp.start_logging()` here. It would open an interactive run that
# nothing in this notebook logs to or completes, and whose handle (`run`) is
# overwritten by the AutoML submission later on, leaving an orphaned run in
# the experiment. The HyperDrive and AutoML runs are created via `submit()`.

Workspace name: CerionML
Azure region: westeurope
Subscription id: abace65f-7c1e-4e4f-9ea3-7521dee39a5c
Resource group: Azure_Machine_Learning
Cluster name: cerionCompute
Experiment name: hyper-param-experiment
Environment name: AzureML-sklearn-0.24-ubuntu18.04-py37-cpu


In [3]:
# Print the environments available in this workspace, in case we don't know
# which one to use or need to create one of our own.
envs = Environment.list(workspace=ws)

# The loop variable is `env_name`, not `env`: a later cell binds `env` to the
# Environment object handed to ScriptRunConfig, and re-running this cell
# should not replace that object with whatever was iterated last here.
for env_name in envs:
    print("Name", env_name)


Name AzureML-Triton
Name AzureML-lightgbm-3.2-ubuntu18.04-py37-cpu
Name AzureML-tensorflow-2.4-ubuntu18.04-py37-cuda11-gpu
Name AzureML-pytorch-1.7-ubuntu18.04-py37-cuda11-gpu
Name AzureML-onnxruntime-1.6-ubuntu18.04-py37-cpu-inference
Name AzureML-pytorch-1.6-ubuntu18.04-py37-cpu-inference
Name AzureML-pytorch-1.7-ubuntu18.04-py37-cpu-inference
Name AzureML-tensorflow-1.15-ubuntu18.04-py37-cpu-inference
Name AzureML-minimal-ubuntu18.04-py37-cpu-inference
Name AzureML-tensorflow-2.4-ubuntu18.04-py37-cpu-inference
Name AzureML-sklearn-0.24.1-ubuntu18.04-py37-cpu-inference
Name AzureML-xgboost-0.9-ubuntu18.04-py37-cpu-inference
Name AzureML-tensorflow-2.4-ubuntu18.04-py37-cuda11.0.3-gpu-inference
Name AzureML-PyTorch-1.3-CPU
Name AzureML-sklearn-1.0-ubuntu20.04-py38-cpu
Name AzureML-sklearn-0.24-ubuntu18.04-py37-cpu
Name AzureML-pytorch-1.10-ubuntu18.04-py38-cuda11-gpu
Name AzureML-pytorch-1.9-ubuntu18.04-py37-cuda11-gpu
Name AzureML-pytorch-1.8-ubuntu18.04-py37-cuda11-gpu
Name AzureML-m

In [4]:
# Get the compute target; create it if it does not exist.
try:
    aml_compute = ComputeTarget(workspace=ws, name=cluster_name)
    print('Using existing cluster:', cluster_name)
except ComputeTargetException:
    # Only a failed compute-target lookup should trigger provisioning. A bare
    # `except:` would also swallow authentication/network errors and even
    # KeyboardInterrupt, and then try to create a new (billable) cluster.
    print('Creating new cluster:', cluster_name)
    cluster_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    aml_compute = ComputeTarget.create(ws, cluster_name, cluster_config)

# Block until the cluster reports that provisioning has finished.
aml_compute.wait_for_completion(show_output=True)

Using existing cluster: cerionCompute
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


# Sanity check that training works
We want to be sure that our `train.py` works as expected before we submit it to the cluster.

In [5]:
# Run the training script inside this kernel with fixed hyperparameters as a
# smoke test before handing it to HyperDrive.
%run train.py --C 0.8 --max_iter 100


Attempted to log scalar metric Regularization Strength::
0.8
Attempted to log scalar metric Max iterations::
100
Attempted to log scalar metric Random state::
42
Attempted to log scalar metric Model save path::
outputs/model.joblib
Attempted to log scalar metric Accuracy:
0.9088012139605463


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [6]:
# Specify the parameter sampler: every child run gets a random `--C` drawn
# from uniform(0.05, 0.1) and a `--max_iter` picked from the listed choices.
ps = RandomParameterSampling(
    {
        "--C": uniform(0.05, 0.1),
        "--max_iter": choice(10, 100, 150)
    })

# Specify the early-termination policy
policy = BanditPolicy(slack_factor=0.15, evaluation_interval=1, delay_evaluation=10)

project_folder = "./"
model_save_path = 'outputs/model.joblib'
# Make sure the source directory exists. `exist_ok=True` makes this a no-op
# when the folder is already there (it is the current directory by default);
# a plain `os.mkdir("./")` raises FileExistsError.
os.makedirs(project_folder, exist_ok=True)

env = Environment.get(ws, name=environment_name)

# Describe a single training run: which script, fixed arguments, where it
# runs and in which environment. The sampled hyperparameters are supplied
# per child run on top of the fixed `arguments`.
script_run_config = ScriptRunConfig(source_directory=project_folder,
                                    script='train.py',
                                    arguments=['--model_save_path', model_save_path],
                                    compute_target=aml_compute,
                                    environment=env,
                                    max_run_duration_seconds=60*10)

# Create a HyperDriveConfig from the script run config, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(run_config=script_run_config,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name="Accuracy",
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=4,
                                     max_concurrent_runs=4,
                                     max_duration_minutes=30)

In [7]:
# Submit the HyperDrive configuration to the experiment; the returned run
# object is what the following cells monitor and query.
hyperdrive_run = exp.submit(hyperdrive_config)

In [8]:
# Show the run-details widget for the HyperDrive run.
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [11]:
# Check the current status of the HyperDrive run.
hyperdrive_run.get_status()

'Completed'

In [12]:
# Wait for the HyperDrive run to complete, printing its output in the notebook.
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_6830524f-8c8d-4368-b173-d7cf8ad9132a
Web View: https://ml.azure.com/runs/HD_6830524f-8c8d-4368-b173-d7cf8ad9132a?wsid=/subscriptions/abace65f-7c1e-4e4f-9ea3-7521dee39a5c/resourcegroups/Azure_Machine_Learning/workspaces/CerionML&tid=43d44578-88b7-4970-9034-652e5262953e

Execution Summary
RunId: HD_6830524f-8c8d-4368-b173-d7cf8ad9132a
Web View: https://ml.azure.com/runs/HD_6830524f-8c8d-4368-b173-d7cf8ad9132a?wsid=/subscriptions/abace65f-7c1e-4e4f-9ea3-7521dee39a5c/resourcegroups/Azure_Machine_Learning/workspaces/CerionML&tid=43d44578-88b7-4970-9034-652e5262953e



{'runId': 'HD_6830524f-8c8d-4368-b173-d7cf8ad9132a',
 'target': 'cerionCompute',
 'status': 'Completed',
 'startTimeUtc': '2022-01-07T08:51:53.174341Z',
 'endTimeUtc': '2022-01-07T08:59:27.728849Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '46852c88-981a-4d09-b5aa-83ef4bd091d2',
  'user_agent': 'python/3.6.9 (Linux-5.4.0-1063-azure-x86_64-with-debian-buster-sid) msrest/0.6.21 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.36.0',
  'space_size': 'infinite_space_size',
  'score': '0.9080424886191198',
  'best_child_run_id': 'HD_6830524f-8c8d-4368-b173-d7cf8ad9132a_3',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://cerionml7971117355.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_6830524f-8c8d-4368-b1

In [13]:
# Get the best child run according to the primary metric and read its accuracy.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_run is None:
    # Without this guard the next line fails with an opaque
    # "'NoneType' object has no attribute 'get_metrics'".
    raise RuntimeError(
        "HyperDrive returned no best run: no child run logged the primary metric 'Accuracy'.")

best_run_metrics = best_run.get_metrics()
best_accuracy = best_run_metrics['Accuracy']
print('Best accuracy:', best_accuracy)



Best accuracy: 0.9080424886191198


In [14]:
# Read the command-line arguments of the best run from its run definition
best_run_details = best_run.get_details()
best_args = best_run_details['runDefinition']['arguments']
print('Best arguments found:', best_args)

Best arguments found: ['--model_save_path', 'outputs/model.joblib', '--C', '0.09506819820759513', '--max_iter', '150']


In [15]:
# Register the model file written by the best HyperDrive run, tagging it with
# its accuracy and recording the arguments it was trained with as a property.
hd_model_tags = {'Accuracy': best_accuracy}
hd_model_properties = {'Params': best_args}
best_run.register_model(
    model_name='hyper_drive_regression_model',
    model_path=model_save_path,
    tags=hd_model_tags,
    properties=hd_model_properties,
)

Model(workspace=Workspace.create(name='CerionML', subscription_id='abace65f-7c1e-4e4f-9ea3-7521dee39a5c', resource_group='Azure_Machine_Learning'), name=hyper_drive_regression_model, id=hyper_drive_regression_model:3, version=3, tags={'Accuracy': '0.9080424886191198'}, properties={'Params': "['--model_save_path', 'outputs/model.joblib', '--C', '0.09506819820759513', '--max_iter', '150']"})

# Run AutoML and compare with HyperDrive
Now run the same task with Azure AutoML, which searches for the best model together with its optimal hyperparameters.

Then we will compare the results.

In [16]:
# Create a TabularDataset from the bank-marketing training CSV.
# `TabularDatasetFactory` (imported in the first cell) is used directly:
# `Dataset` is not imported anywhere in this notebook, so `Dataset.Tabular...`
# raises NameError on a freshly started kernel.
path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path=path)


In [18]:
from train import clean_data

# Clean the dataset with the project's `clean_data` helper from train.py.
# It returns two values, unpacked here as `x` and `y`.
# NOTE(review): presumably `x` holds the features and `y` the label column — confirm in train.py.
x, y = clean_data(ds)

In [37]:
# Concatenate features and label into a single frame again, since this is
# what AutoML expects. `assign` returns a new DataFrame, so `x` keeps only
# its own columns; `trainingData = x` followed by `trainingData['y'] = y`
# would alias `x` and silently add the label column to it as well.
trainingData = x.assign(y=y)

In [38]:
# Preview the first rows of the combined frame; `y` should be the last column.
trainingData.head()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,y
0,57,1,0,0,1,5,1,371,1,999,...,0,0,0,0,1,0,0,0,0,0
1,55,1,0,1,0,5,4,285,2,999,...,1,0,0,0,0,0,0,0,1,0
2,33,1,0,0,0,5,5,52,1,999,...,0,0,0,1,0,0,0,0,0,0
3,36,1,0,0,0,6,5,355,4,999,...,1,0,0,0,1,0,0,0,0,0
4,27,1,0,1,0,7,5,189,2,999,...,0,0,0,0,1,0,0,0,0,0


In [39]:
# Preview the last rows of the combined frame as well.
trainingData.tail()

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,y
32945,56,1,0,0,1,7,1,116,1,999,...,0,1,0,0,0,0,0,0,0,0
32946,37,1,0,0,1,7,5,69,7,999,...,0,0,0,0,0,0,0,1,0,0
32947,26,0,0,0,0,5,2,135,4,999,...,0,0,0,0,0,0,0,1,0,0
32948,31,0,0,0,0,4,1,386,1,999,...,0,0,0,1,0,0,0,0,0,0
32949,39,1,0,0,0,8,4,179,1,999,...,0,1,0,0,0,0,0,0,0,0


In [81]:
# Load the dataset created from the CSV (before clean_data) into a pandas DataFrame.
df = ds.to_pandas_dataframe()


In [82]:
# List the column names of the uncleaned data for comparison with the cleaned frame.
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [None]:
# Look up the workspace's default datastore; it is the upload target used
# when the training frame is registered as a dataset.
def_blob_store = ws.get_default_datastore()
print("Default datastore's name: {}".format(def_blob_store.name))

In [49]:
# Upload the combined training frame to the default datastore and register it
# as a TabularDataset. `TabularDatasetFactory` (imported in the first cell) is
# used directly because `Dataset` is not imported anywhere in this notebook.
trainingDataTabular = TabularDatasetFactory.register_pandas_dataframe(
    trainingData, target=def_blob_store, name='bankmarketing_train')

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/ba2289d4-aa26-45e2-9632-667a04c2a3ec/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [50]:
# Display the registered dataset's metadata (source, definition, registration).
trainingDataTabular

{
  "source": [
    "('workspaceblobstore', 'managed-dataset/ba2289d4-aa26-45e2-9632-667a04c2a3ec/')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ReadParquetFile",
    "DropColumns"
  ],
  "registration": {
    "id": "760b323e-6abe-48e0-b136-7c64de9cba45",
    "name": "bankmarketing_train",
    "version": 1,
    "workspace": "Workspace.create(name='CerionML', subscription_id='abace65f-7c1e-4e4f-9ea3-7521dee39a5c', resource_group='Azure_Machine_Learning')"
  }
}

In [53]:
from azureml.train.automl import AutoMLConfig

# Settings for the AutoML classification experiment on the registered dataset.
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_settings = dict(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=trainingDataTabular,
    label_column_name='y',
    n_cross_validations=3,
    test_size=0.2,
    enable_early_stopping=True,
    compute_target=aml_compute,
)
automl_config = AutoMLConfig(**automl_settings)

In [54]:
# Submit the AutoML configuration to its own experiment and stream the
# progress output into the notebook.
experiment = Experiment(workspace=ws, name='BankMarketing_AutoML')
run = experiment.submit(automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on cerionCompute with default configuration
Running on remote compute: cerionCompute


Experiment,Id,Type,Status,Details Page,Docs Page
BankMarketing_AutoML,AutoML_8d28ada4-5134-4df0-830b-8d9e9ca457bf,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+-------------------------

In [56]:
# Show the run-details widget for the AutoML run.
# `RunDetails` is already imported in the first cell, so it is not re-imported here.
RunDetails(run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [57]:
# Retrieve the best AutoML child run together with its fitted model and
# print both. Nothing is saved here; the model is registered further down.
best_run, fitted_model = run.get_output()
print(best_run, fitted_model, sep='\n')

Run(Experiment: BankMarketing_AutoML,
Id: AutoML_8d28ada4-5134-4df0-830b-8d9e9ca457bf_30,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=True, is_onnx_compatible=False, observer=None, task='classification', working_dir='/mnt/batch/tasks/shared/LS_root/mount...
    gpu_training_param_dict={'processing_unit_type': 'cpu'}
), random_state=0, reg_alpha=0.7291666666666667, reg_lambda=2.3958333333333335, subsample=0.8, tree_method='auto'))], verbose=False))], flatten_transform=None, weights=[0.07142857142857142, 0.2857142857142857, 0.2857142857142857, 0.14285714285714285, 0.07142857142857142, 0.07142857142857142, 0.07142857142857142]))],
         verbose=False)


In [65]:
# Exploratory: list the attributes and methods available on the best run object.
# NOTE(review): leftover exploration with a very long output — consider removing before sharing.
dir(best_run)

['DELIM',
 'EXPERIMENT_ID_PATH',
 'EXPERIMENT_NAME_PATH',
 'RUN_PATH',
 'TID_FMT',
 'WORKSPACE_FMT',
 '_DEFAULT_GET_CONTENT_TIMEOUT',
 '_RUNSOURCE_PROPERTY',
 '_WAIT_COMPLETION_POLLING_INTERVAL_MAX',
 '_WAIT_COMPLETION_POLLING_INTERVAL_MIN',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_cleanup',
 '_client',
 '_container',
 '_context_manager',
 '_create',
 '_download_artifact_contents_to_string',
 '_dto_to_run',
 '_experiment',
 '_experiment_url',
 '_formatted_tid',
 '_get_aml_token_auth',
 '_get_base_info_dict',
 '_get_blob_datastore_from_run',
 '_get_last_log_primary_instance',
 '_get_logs',
 '_get_msi_auth',
 '_get_outputs_datapath',
 

In [71]:
# Show every metric recorded for the best AutoML run; 'accuracy' is the value
# to compare against the HyperDrive result.
best_run.get_metrics()

{'recall_score_weighted': 0.9166918838716214,
 'average_precision_score_micro': 0.9816749663846559,
 'recall_score_macro': 0.7477495127463475,
 'precision_score_macro': 0.8023855244649609,
 'matthews_correlation': 0.5466260371485672,
 'accuracy': 0.9166918838716214,
 'weighted_accuracy': 0.9587220619229045,
 'AUC_weighted': 0.9483188423632624,
 'balanced_accuracy': 0.7477495127463475,
 'f1_score_macro': 0.7702579577879564,
 'AUC_micro': 0.9809056298805366,
 'average_precision_score_weighted': 0.9555715046200155,
 'log_loss': 0.25808860797855465,
 'recall_score_micro': 0.9166918838716214,
 'f1_score_weighted': 0.9125362664694308,
 'f1_score_micro': 0.9166918838716214,
 'precision_score_weighted': 0.9108336798496773,
 'average_precision_score_macro': 0.8250193198274748,
 'norm_macro_recall': 0.495499025492695,
 'precision_score_micro': 0.9166918838716214,
 'AUC_macro': 0.9483188423632626,
 'accuracy_table': 'aml://artifactId/ExperimentRun/dcid.AutoML_8d28ada4-5134-4df0-830b-8d9e9ca457bf_

In [66]:
# Show the full details record of the best AutoML run.
best_run.get_details()

{'runId': 'AutoML_8d28ada4-5134-4df0-830b-8d9e9ca457bf_30',
 'target': 'cerionCompute',
 'status': 'Completed',
 'startTimeUtc': '2022-01-07T10:05:10.37208Z',
 'endTimeUtc': '2022-01-07T10:05:57.419057Z',
 'services': {},
 'properties': {'runTemplate': 'automl_child',
  'pipeline_id': '__AutoML_Ensemble__',
  'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'accuracy\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'BankMarketing_AutoML\',\'compute_target\':\'cerionCompute\',\'subscription_id\':\'abace65f-7c1e-4e4f-9ea3-7521dee39a5c\',\'region\':\'westeurope\',\'spark_service\':None}","ensemble_run_id":"AutoML_8d28ada4-5134-4df0-830b-8d9e9ca457bf_30","experiment_name":"BankMarketing_AutoML","workspace_name":"CerionML","subscription_id":"abace65f-7c

In [73]:
# Read the name of the winning algorithm from the run's properties.
best_run.get_details()['properties']['run_algorithm']

'VotingEnsemble'

In [63]:
# Inspect the individual steps of the fitted pipeline returned by AutoML.
fitted_model.steps

[('datatransformer',
  DataTransformer(
      task='classification',
      is_onnx_compatible=False,
      enable_feature_sweeping=True,
      enable_dnn=False,
      force_text_dnn=False,
      feature_sweeping_timeout=86400,
      featurization_config=None,
      is_cross_validation=True,
      feature_sweeping_config={}
  )),
 ('prefittedsoftvotingclassifier',
  PreFittedSoftVotingClassifier(
      estimators=[('1', Pipeline(
          memory=None,
          steps=[('maxabsscaler', MaxAbsScaler(
              copy=True
          )), ('xgboostclassifier', XGBoostClassifier(
              random_state=0,
              n_jobs=1,
              problem_info=ProblemInfo(
                  gpu_training_param_dict={'processing_unit_type': 'cpu'}
              ),
              tree_method='auto'
          ))],
          verbose=False
      )), ('14', Pipeline(
          memory=None,
          steps=[('standardscalerwrapper', StandardScalerWrapper(
              copy=True,
              with_

In [77]:
# View the files created for the best model run; the model path passed to
# register_model below must be one of these.
best_run.get_file_names()

['accuracy_table',
 'automl_driver.py',
 'confusion_matrix',
 'explanation/63cd6075/classes.interpret.json',
 'explanation/63cd6075/eval_data_viz.interpret.json',
 'explanation/63cd6075/expected_values.interpret.json',
 'explanation/63cd6075/features.interpret.json',
 'explanation/63cd6075/global_names/0.interpret.json',
 'explanation/63cd6075/global_rank/0.interpret.json',
 'explanation/63cd6075/global_values/0.interpret.json',
 'explanation/63cd6075/local_importance_values.interpret.json',
 'explanation/63cd6075/per_class_names/0.interpret.json',
 'explanation/63cd6075/per_class_rank/0.interpret.json',
 'explanation/63cd6075/per_class_values/0.interpret.json',
 'explanation/63cd6075/rich_metadata.interpret.json',
 'explanation/63cd6075/true_ys_viz.interpret.json',
 'explanation/63cd6075/visualization_dict.interpret.json',
 'explanation/63cd6075/ys_pred_proba_viz.interpret.json',
 'explanation/63cd6075/ys_pred_viz.interpret.json',
 'explanation/64bf6de3/classes.interpret.json',
 'expl

In [76]:
# Register the best AutoML run's model file in the workspace under its own
# name, separate from the HyperDrive model registered earlier.
best_run.register_model(model_name='BankMarketing_AutoML_model', model_path='outputs/model.pkl')

Model(workspace=Workspace.create(name='CerionML', subscription_id='abace65f-7c1e-4e4f-9ea3-7521dee39a5c', resource_group='Azure_Machine_Learning'), name=BankMarketing_AutoML_model, id=BankMarketing_AutoML_model:1, version=1, tags={}, properties={})