# Configure environment
* Workspace
* Experiment
* Cluster

In [1]:
import azureml.core
import pandas as pd
import numpy as np
import logging

print("Azure ML SDK Version:",azureml.core.VERSION)

Azure ML SDK Version: 1.19.0


In [2]:
from azureml.core import Workspace, Experiment

#use json config file to access remote vscode
ws=Workspace.from_config()
#create experiment
exp = Experiment(ws, "udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')
run = exp.start_logging()

Note, we have launched a browser for you to login. For old experience with device code, use "az login --use-device-code"
Performing interactive authentication. Please follow the instructions on the terminal.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
Workspace name: quick-starts-ws-133565
Azure region: southcentralus
Subscription id: 510b94ba-e453-4417-988b-fbdc37b55ca7
Resource group: aml-quickstarts-133565


In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
amlcompute_cluster_name = "cpu-cluster"

provisioning_config=AmlCompute.provisioning_configuration(vm_size="STANDARD_D2_V2",max_nodes=4)

compute_target=ComputeTarget.create(ws,amlcompute_cluster_name,provisioning_config)

compute_target.wait_for_completion(show_output=True,min_node_count=None,timeout_in_minutes=20)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


# Optimize Logistic Regression Hyperparameters
* -C        = Inverse of regularization strength. Smaller values cause stronger regularization
* --max_iter= Maximum number of iterations to converge

Use Random Sampling

[Documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters) for hyperparameters configuration

[Documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-scikit-learn) to train scikit-learn models at azure

In [4]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os
from azureml.core import Environment
from azureml.core import ScriptRunConfig


## Scikit-learn Environment and Run Configuration

In [5]:
%%writefile conda_dependencies.yml
dependencies:
- python=3.6.2
- scikit-learn
- pip:
  - azureml-defaults

Overwriting conda_dependencies.yml


In [7]:
from azureml.core import Environment
from azureml.core import ScriptRunConfig
sklearn_env = Environment.from_conda_specification(name = 'sklearn-env', file_path = 'conda_dependencies.yml')
src = ScriptRunConfig(source_directory='.',
                      script='train.py',
                      compute_target=compute_target,
                      environment=sklearn_env)

## HyperDrive Configuration

In [8]:
# Specify parameter sampler
ps = RandomParameterSampling( {
       "C": choice(0.01,0.05,0.2,1,5,10,25),
       "max_iter": choice(100, 150, 200, 250)    
    }
)


# Specify a Policy
# (using early termination policy the Bayesian Sampling is not supported)
policy = BanditPolicy(slack_factor = 0.1, evaluation_interval=1)


if "training" not in os.listdir():
    os.mkdir("./training")


# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.


hyperdrive_config = HyperDriveConfig(run_config=src,
                                    hyperparameter_sampling=ps,
                                    policy=policy,
                                    primary_metric_name="accuracy",
                                    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                    max_total_runs=20,
                                    max_concurrent_runs=4)

##HyperDrive run

In [9]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
#run experiment
hyperdrive_run = exp.submit(hyperdrive_config)
#visualize experiment

RunDetails(hyperdrive_run).show()

hyperdrive_run.wait_for_completion(show_output=True)


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_4dbe41a2-caaf-4a36-b259-bffd55ba60de
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_4dbe41a2-caaf-4a36-b259-bffd55ba60de?wsid=/subscriptions/510b94ba-e453-4417-988b-fbdc37b55ca7/resourcegroups/aml-quickstarts-133565/workspaces/quick-starts-ws-133565

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-05T21:31:35.860003][API][INFO]Experiment created<END>\n""<START>[2021-01-05T21:31:36.355001][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-01-05T21:31:36.527238][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-01-05T21:31:37.1572941Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_4dbe41a2-caaf-4a36-b259-bffd55ba60de
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_4dbe41a2-caaf-4a36-b259-bffd55ba60de?wsid=/subscriptions/510b9

{'runId': 'HD_4dbe41a2-caaf-4a36-b259-bffd55ba60de',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-01-05T21:31:35.645169Z',
 'endTimeUtc': '2021-01-05T21:50:45.268194Z',
 'properties': {'primary_metric_config': '{"name": "accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '56bd67e1-7d19-4aa7-a3a7-d0bae4b76d03',
  'score': '0.9130500758725342',
  'best_child_run_id': 'HD_4dbe41a2-caaf-4a36-b259-bffd55ba60de_13',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg133565.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_4dbe41a2-caaf-4a36-b259-bffd55ba60de/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=xBUYuoTxBUYCL482Da9yX7O3wsb%2B4wIFomLXnno3zyg%3D&st=2021-01-05T21%3A41%3A15Z&se=2021-01-06T05%3A51%3A15Z&sp=r'}}

## Get the Best Model 

In [10]:
# Get your best run and save the model from that run.
### YOUR CODE HERE ###
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
print('Best Run Id: ', best_run.id)
print('Best Run info:',best_run_metrics)

Best Run Id:  HD_4dbe41a2-caaf-4a36-b259-bffd55ba60de_13
Best Run info: {'Regularization Strength:': 5.0, 'accuracy': 0.9130500758725342, 'Max iterations:': 250}


In [16]:


#parameter_values = best_run_metrics.get_details()['runDefinition']['Arguments']
#print(parameter_values)

#print('\n Accuracy:', best_run_metrics['accuracy'])

#print('\n Max Iterations:', best_run_metrics['Max Iterations'])

Best Run Id:  HD_58c0f94d-3647-40ab-afdb-7b37809e5d95_10


## Saving the Best Model as joblib
To save the Run as a model. For more details see the [documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.run(class)?view=azure-ml-py)

In [11]:
import joblib
print(best_run.get_file_names())
hyperdrive_model=best_run.register_model(model_name = 'hyperdrivemodel', model_path = 'outputs/model.joblib')

['azureml-logs/55_azureml-execution-tvmps_e3a9071979372c81b6e02fef6d0529467d666b55ee07b31fc2203036c551b111_d.txt', 'azureml-logs/65_job_prep-tvmps_e3a9071979372c81b6e02fef6d0529467d666b55ee07b31fc2203036c551b111_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_e3a9071979372c81b6e02fef6d0529467d666b55ee07b31fc2203036c551b111_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/100_azureml.log', 'logs/azureml/dataprep/backgroundProcess.log', 'logs/azureml/dataprep/backgroundProcess_Telemetry.log', 'logs/azureml/dataprep/engine_spans_l_c54e2392-0d13-4e36-a09b-98e3ab6b9764.jsonl', 'logs/azureml/dataprep/python_span_00ed3a90-e255-4cc8-ae55-28f220045c6d.jsonl', 'logs/azureml/dataprep/python_span_034b4c67-c9ed-4f2a-93e9-9b77c2335a4e.jsonl', 'logs/azureml/dataprep/python_span_0421c138-70c8-43ff-90e2-e13302f683aa.jsonl', 'logs/azureml/dataprep/python_span_06593dff-0068-4ee4-a829-d814d37ff826.jsonl', 'logs/azureml/dataprep/python_sp

# AutoML Run
Create a AutoML to compare with the Logistic Regression model with hyperparameters tuned with Hyperdrive

## Load the data
First we load the data again just for the AutoML Run

In [12]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###

ds = TabularDatasetFactory.from_delimited_files(path="https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")


## Split the training/validation  and test data
The AutoML will perform the training and validation, we can test the model afterwards with the remaining test data.

In [13]:

train_validation_data, test_data = ds.random_split(percentage=0.8, seed=1)


## AutoML Config
We need to configurate as a classification task
For more details see the [AutoMLConfig documentation](https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py).
There is no need to split the data as the AutoML will take care of this

In [14]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    compute_target = compute_target,
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=train_validation_data,
    label_column_name='y',
    n_cross_validations=5)

We will now run the experiment, using the same experiment defined previously as udacity project  

In [15]:
# Submit your automl run

### YOUR CODE HERE ###
automl_run = exp.submit(automl_config, show_output = True)


Running on remote.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_ac11b830-a542-45b6-bb02-1b43156666d2

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input

In [16]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###

best_run, fitted_model = automl_run.get_output()


In [18]:
model_name = best_run.properties['model_name']


In [19]:
description = 'AutoML Model trained on bank marketing data to predict if a client will subscribe to a term deposit'
tags = None
model = automl_run.register_model(model_name = model_name, description = description, tags = tags)

print(automl_run.model_id) # This will be written to the script file later in the notebook.

AutoMLac11b830a23


In [20]:
print(best_run.properties)

{'runTemplate': 'automl_child', 'pipeline_id': '__AutoML_Ensemble__', 'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'accuracy\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'udacity-project\',\'compute_target\':\'cpu-cluster\',\'subscription_id\':\'510b94ba-e453-4417-988b-fbdc37b55ca7\',\'region\':\'southcentralus\',\'spark_service\':None}","ensemble_run_id":"AutoML_ac11b830-a542-45b6-bb02-1b43156666d2_23","experiment_name":"udacity-project","workspace_name":"quick-starts-ws-133565","subscription_id":"510b94ba-e453-4417-988b-fbdc37b55ca7","resource_group_name":"aml-quickstarts-133565"}}]}', 'training_percent': '100', 'predicted_cost': None, 'iteration': '23', '_aml_system_scenario_identification': 'Remote.Child', '_azureml.ComputeTargetType': 

## Save AutoML Model 
The autoML best model is saved as joblib

In [15]:
best_automl_run, fitted_automl_model = automl_run.get_output()
print(best_automl_run)
print(fitted_automl_model)

import joblib

joblib.dump(fitted_automl_model, 'outputs/model.joblib')

Run(Experiment: udacity-project,
Id: AutoML_cf630e37-c9b9-4734-9100-4ee4592ece9b_24,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                    min_samples_leaf=0.01,
                                                                                                    min_samples_split=0.01,
                

['outputs/model.joblib']

## Clean-up Cluster resources

In [17]:
#deletes the compute cluster
compute_target.delete()