In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

Workspace name: ai-tci-ml
Azure region: westeurope
Subscription id: 7c03dd83-6b95-43b1-9f53-23dfd07e8803
Resource group: azp-078-rg


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "aml-und-project1"

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.
try:
    compute_cluster = ComputeTarget(workspace=ws,name=cluster_name)
    print("cluster exists")
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size="Standard_D2_v2",max_nodes=4)
    compute_cluster = ComputeTarget.create(workspace=ws,name=cluster_name,provisioning_configuration=compute_config)
    compute_cluster.wait_for_completion(show_output=True)

print(compute_cluster.get_status().serialize())

cluster exists
{'currentNodeCount': 4, 'targetNodeCount': 3, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 3, 'unusableNodeCount': 0, 'leavingNodeCount': 1, 'preemptedNodeCount': 0}, 'allocationState': 'Resizing', 'allocationStateTransitionTime': '2023-08-27T18:06:51.397000+00:00', 'errors': None, 'creationTime': '2023-08-27T14:53:15.615918+00:00', 'modifiedTime': '2023-08-27T14:53:25.989110+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT1800S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling, BayesianParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Specify parameter sampler
ps = BayesianParameterSampling({
    "--C" : uniform(0.01, 2.0),
    "--max_iter" : choice(100, 200)
    })

# Specify a Policy
policy = BanditPolicy(slack_factor=0.15, evaluation_interval=1)


if "training" not in os.listdir():
    os.mkdir("./training")

# Setup environment for your training run
env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# Create a ScriptRunConfig Object to specify the configuration details of your training job
src = ScriptRunConfig(source_directory='./training',script='train.py',compute_target=compute_cluster,environment=env)

# Create a HyperDriveConfig using the src object, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(hyperparameter_sampling=ps,
                                    policy=policy,
                                    primary_metric_name='Accuracy',
                                    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                    max_total_runs=20,
                                    run_config=src,
                                    max_concurrent_runs=4)

For best results with Bayesian Sampling we recommend using a maximum number of runs greater than or equal to 20 times the number of hyperparameters being tuned. Recommendend value:40.


In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
experiment = Experiment(workspace=ws, name='aml-und-project1-hyperdrive')
submitted_run = experiment.submit(config=hyperdrive_config)
RunDetails(submitted_run).show()
submitted_run.wait_for_completion(show_output=True)

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_204a7ced-3feb-48a1-9166-8b3363c392d1
Web View: https://ml.azure.com/runs/HD_204a7ced-3feb-48a1-9166-8b3363c392d1?wsid=/subscriptions/7c03dd83-6b95-43b1-9f53-23dfd07e8803/resourcegroups/azp-078-rg/workspaces/ai-tci-ml&tid=eb70b763-b6d7-4486-8555-8831709a784e

Streaming azureml-logs/hyperdrive.txt

[2023-08-27T18:11:35.030466][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space
[2023-08-27T18:11:35.6791895Z][SCHEDULER][INFO]Scheduling job, id='HD_204a7ced-3feb-48a1-9166-8b3363c392d1_0' 
[2023-08-27T18:11:35.7489106Z][SCHEDULER][INFO]Scheduling job, id='HD_204a7ced-3feb-48a1-9166-8b3363c392d1_1' 
[2023-08-27T18:11:35.8352837Z][SCHEDULER][INFO]Scheduling job, id='HD_204a7ced-3feb-48a1-9166-8b3363c392d1_2' 
[2023-08-27T18:11:35.9465023Z][SCHEDULER][INFO]Scheduling job, id='HD_204a7ced-3feb-48a1-9166-8b3363c392d1_3' 
[2023-08-27T18:11:35.899068][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.
[2023-08-27T18:11:

{'runId': 'HD_204a7ced-3feb-48a1-9166-8b3363c392d1',
 'target': 'aml-und-project1',
 'status': 'Completed',
 'startTimeUtc': '2023-08-27T18:11:34.040012Z',
 'endTimeUtc': '2023-08-27T18:22:40.843789Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name":"Accuracy","goal":"maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '2731b569-28a0-4ec2-bb3f-7e8ede1c4089',
  'user_agent': 'python/3.8.16 (Windows-10-10.0.19045-SP0) msrest/0.7.1 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.53.0',
  'space_size': 'infinite_space_size',
  'score': '0.9139354212187424',
  'best_child_run_id': 'HD_204a7ced-3feb-48a1-9166-8b3363c392d1_12',
  'best_metric_status': 'Succeeded',
  'best_data_container_id': 'dcid.HD_204a7ced-3feb-48a1-9166-8b3363c392d1_12'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'configuration': None,
  'attribution': None,
  'telemetryValues': {'amlCl

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Performing interactive authentication. Please follow the instructions on the terminal.


In [5]:
import joblib
# Get your best run and save the model from that run.

best_run = submitted_run.get_best_run_by_primary_metric()
best_accuracy = best_run.get_metrics()['Accuracy']
best_parameters = best_run.get_details()['runDefinition']['arguments']
print('Best Accuracy:', best_accuracy)
print('Best Parameters:', best_parameters)

Best Accuracy: 0.9139354212187424
Best Parameters: ['--C', '1.9456524612308663', '--max_iter', '200']


In [6]:
from azureml.core.model import Model
best_run.register_model(model_path='outputs/LogisticRegression_model.pkl',
                        model_name='aml-und-project1-HyperDrive',
                        tags={"Method" : "HyperDrive"},
                        properties={'Accuracy':best_accuracy})

best_run.download_file(name='outputs/LogisticRegression_model.pkl',output_file_path='outputs/aml-und-project1-HyperDrive.pkl')

In [7]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

data = TabularDatasetFactory.from_delimited_files("https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")

In [8]:
from training.train import clean_data
from sklearn.model_selection import train_test_split
import pandas as pd
from azureml.core import Dataset, Datastore

# Use the clean_data function to clean your data.
x, y = clean_data(data)
data = pd.concat([x,y], axis=1)

train_data, test_data = train_test_split(data, test_size=0.25, shuffle=True)

train_data.to_csv('training/aml-und-project1-automl-training_df.csv')
test_data.to_csv('training/aml-und-project1-automl-testing_df.csv')

ds_automl= ws.get_default_datastore()
ds_automl.upload(src_dir='training', target_path='.')

train_ds = Dataset.Tabular.from_delimited_files(path=[(ds_automl, ('aml-und-project1-automl-training_df.csv'))])
test_ds = Dataset.Tabular.from_delimited_files(path=[(ds_automl, ('aml-und-project1-automl-testing_df.csv'))])

"Datastore.upload" is deprecated after version 1.0.69. Please use "Dataset.File.upload_directory" to upload your files             from a local directory and create FileDataset in single method call. See Dataset API change notice at https://aka.ms/dataset-deprecation.


Uploading an estimated of 4 files
Target already exists. Skipping upload for .\aml-und-project1-automl-testing_df.csv
Target already exists. Skipping upload for .\aml-und-project1-automl-training_df.csv
Target already exists. Skipping upload for .\train.py
Target already exists. Skipping upload for .\__pycache__\train.cpython-38.pyc
Uploaded 0 files


In [10]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    name='aml-und-project1-automl',
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=train_ds,
    validation_data=test_ds,
    label_column_name='y',
    max_concurrent_iterations=4,
    iterations=20,
    n_cross_validations=5,
    compute_target=compute_cluster,
    enable_early_stopping = True,
    )

In [None]:
# Submit your automl run

### YOUR CODE HERE ###
automl_experiment = Experiment(workspace=ws,name='aml-und-project1-automl')
automl_submitted_run = automl_experiment.submit(automl_config)
RunDetails(automl_submitted_run).show()
automl_submitted_run.wait_for_completion(show_output=True)

Submitting remote run.
This run may take 30 minutes longer than usual because it is building a training environment.


Experiment,Id,Type,Status,Details Page,Docs Page
aml-und-project1-automl,AutoML_eede5e9f-8a64-4a17-8438-0682ce8ff917,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

Experiment,Id,Type,Status,Details Page,Docs Page
aml-und-project1-automl,AutoML_eede5e9f-8a64-4a17-8438-0682ce8ff917,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+------------------------------+--------------------------------+--------------------------------------+
|Size of the smallest class    |Name/Label of the smallest class|Number of samples in th

{'runId': 'AutoML_eede5e9f-8a64-4a17-8438-0682ce8ff917',
 'target': 'aml-und-project1',
 'status': 'Completed',
 'startTimeUtc': '2023-08-27T18:26:23.622103Z',
 'endTimeUtc': '2023-08-27T18:34:39.094611Z',
 'services': {},
 'properties': {'num_iterations': '20',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'aml-und-project1',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"5cf1f961-7acb-49c5-b14a-19336b3cb82e\\"}, \\"validation_data\\": {\\"datasetId\\": \\"73b22696-8075-40fc-8a5c-fd95274e26d9\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': 'False',
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-dataprep-native": "38.0.0", "azureml-dataprep": "4.12.1", "azureml-dataprep-rslex": "2.19.1", "azureml-automl-runtime": "1.53.0.post1",

Current provisioning state of AmlCompute is "Deleting"

Performing interactive authentication. Please follow the instructions on the terminal.
Performing interactive authentication. Please follow the instructions on the terminal.
Performing interactive authentication. Please follow the instructions on the terminal.
Performing interactive authentication. Please follow the instructions on the terminal.
Performing interactive authentication. Please follow the instructions on the terminal.
Performing interactive authentication. Please follow the instructions on the terminal.
Performing interactive authentication. Please follow the instructions on the terminal.
Performing interactive authentication. Please follow the instructions on the terminal.
Performing interactive authentication. Please follow the instructions on the terminal.
Performing interactive authentication. Please follow the instructions on the terminal.
Performing interactive authentication. Please follow the instructions on t

In [12]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###

automl_best_run = automl_submitted_run.get_best_child()

automl_best_run_details = automl_best_run.get_details()
best_algorithm = automl_best_run_details['properties']['run_algorithm']

automl_best_run_metrics = automl_best_run.get_metrics()
best_accuracy = automl_best_run_metrics['accuracy']

print('Best Algorithm:', best_algorithm)
print('Best Accuracy:', best_accuracy)

Best Algorithm: VotingEnsemble
Best Accuracy: 0.9207331876669095


In [13]:
print("Best algorithm and it's run's details:")
automl_best_run_details

Best algorithm and it's run's details:


{'runId': 'AutoML_eede5e9f-8a64-4a17-8438-0682ce8ff917_18',
 'target': 'aml-und-project1',
 'status': 'Completed',
 'startTimeUtc': '2023-08-27T18:33:50.636288Z',
 'endTimeUtc': '2023-08-27T18:34:29.937941Z',
 'services': {},
 'properties': {'runTemplate': 'automl_child',
  'pipeline_id': '__AutoML_Ensemble__',
  'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'accuracy\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'aml-und-project1-automl\',\'compute_target\':\'aml-und-project1\',\'subscription_id\':\'7c03dd83-6b95-43b1-9f53-23dfd07e8803\',\'region\':\'westeurope\',\'spark_service\':None}","ensemble_run_id":"AutoML_eede5e9f-8a64-4a17-8438-0682ce8ff917_18","experiment_name":"aml-und-project1-automl","workspace_name":"ai-tci-ml","subscription_id

In [14]:
automl_best_run.register_model(model_path='outputs/model.pkl',
                            model_name='aml-und-project1-AutoML',
                            tags={"Method" : "AutoML"},
                            properties={'Accuracy':best_accuracy})

automl_best_run.download_file(name='outputs/model.pkl',output_file_path='outputs/aml-und-project1-AutoML.pkl')

In [17]:
# delete compute cluster
compute_cluster.delete()