# Introduction
This notebook solves a classification problem with Logistic Regression. It shows two ways of optimizing the solution:

1. Use Hyperdrive to tune the hyperparameters
2. Use AutoML to search for the best model

# Create the Experiment and the Workspace

In [31]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Experiment

# Name under which the runs submitted from this notebook are grouped.
experiment_name = "udacity-project-1"

# Connect to the workspace and open (or create) the experiment inside it.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name=experiment_name)

# Print one "label: value" line per workspace attribute.
workspace_summary = (
    ("Workspace name: ", ws.name),
    ("Azure region: ", ws.location),
    ("Subscription id: ", ws.subscription_id),
    ("Resource group: ", ws.resource_group),
)
print("\n".join(label + value for label, value in workspace_summary))

# Start an interactive logging run on the experiment.
run = exp.start_logging()

Workspace name: quick-starts-ws-142852
Azure region: southcentralus
Subscription id: 610d6e37-4747-4a20-80eb-3aad70a55f43
Resource group: aml-quickstarts-142852


# Create the cluster if it does not exist

In [32]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

# Name of the cluster; later cells pass this string as the run's compute target.
compute_target = "cpu-cluster"

# Reuse the cluster when the workspace already has one with this name;
# the lookup raises ComputeTargetException otherwise, and only then do we provision.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=compute_target)
    print("Found existing cluster, use it")
except ComputeTargetException:
    # Provision with the VM size and node cap stated in the requirements above
    # (the previous value, STANDARD_DS3_V2, contradicted them).
    compute_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D2_V2", max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, compute_target, compute_config)

# Block until the cluster reports its state, streaming progress to the cell output.
cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it

Running


# Use Hyperdrive to tune the parameters
Note that we need to:

1. Specify the parameter sampler with the hyperparameters that we can find in the train.py script, i.e., C and max_iter
2. Specify an early termination policy
3. Create a SKLearn estimator for use with train.py

In [33]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
from azureml.train.hyperdrive import choice
import os


# Define the environment
To define the Azure ML Environment that encapsulates our training script's dependencies, we will use a custom environment.

https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-scikit-learn
## Create a custom Environment


In [34]:
%%writefile conda_dependencies.yml
# Conda specification for the training environment; the next cell loads this
# file through Environment.from_conda_specification.
dependencies:
  # The interpreter is pinned to an exact version.
  - python=3.6.2
  # NOTE(review): scikit-learn and numpy carry no version pin, so the resolved
  # versions may differ between environment builds - consider pinning.
  - scikit-learn
  - numpy
  - pip:
    # Installed through pip rather than conda.
    - azureml-defaults

Overwriting conda_dependencies.yml


In [35]:
from azureml.core import Environment
from azureml.core import ScriptRunConfig

# Build the training environment from the conda file written by the previous cell.
sklearn_env = Environment.from_conda_specification(
    name="sklearn_env",
    file_path="conda_dependencies.yml",
)

# Run configuration for train.py (taken from the current directory) on the
# cluster named by `compute_target`, inside the environment defined above.
# Despite its name, `est` is a ScriptRunConfig, not an estimator object.
est = ScriptRunConfig(
    source_directory=".",
    script="train.py",
    compute_target=compute_target,
    environment=sklearn_env,
)


# Hyperdrive 
We need to specify the search space for both hyperparameters, C and max_iter, where
- C is the inverse of the regularization strength; that is, higher values result in weaker regularization
- max_iter is the maximum number of iterations allowed for the solver to converge

We also use a Bandit early-termination policy.

In [36]:
# Discrete candidate values for each hyperparameter.
search_space = {
    "C": choice(0.01, 0.1, 1, 10, 20, 40),
    "max_iter": choice(100, 150, 200, 250),
}

# Random sampling over the search space defined above.
ps = RandomParameterSampling(search_space)

# Bandit early-termination policy: evaluation interval of 2, slack factor of 0.1.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)


In [37]:
# Collect the HyperDrive settings in one place, then build the config from them.
# The run configuration, sampler and policy come from the cells above.
hyperdrive_settings = dict(
    run_config=est,
    hyperparameter_sampling=ps,
    policy=policy,
    # Metric to maximize; presumably the name train.py logs - confirm against the script.
    primary_metric_name="accuracy",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    # At most 20 runs in total, at most 4 of them at a time.
    max_total_runs=20,
    max_concurrent_runs=4,
)
hyperdrive_config = HyperDriveConfig(**hyperdrive_settings)

In [38]:
# Launch the HyperDrive sweep on the experiment and keep a handle to the parent run.
hdr = exp.submit(config=hyperdrive_config)

# Block until the sweep finishes, streaming its log to the cell output.
hdr.wait_for_completion(show_output=True)

RunId: HD_49594788-0e31-4ed3-a95f-8deeea92618a
Web View: https://ml.azure.com/runs/HD_49594788-0e31-4ed3-a95f-8deeea92618a?wsid=/subscriptions/610d6e37-4747-4a20-80eb-3aad70a55f43/resourcegroups/aml-quickstarts-142852/workspaces/quick-starts-ws-142852&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-04-16T16:13:26.138590][API][INFO]Experiment created<END>\n""<START>[2021-04-16T16:13:26.827900][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n"<START>[2021-04-16T16:13:27.0379441Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2021-04-16T16:13:27.000320][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_49594788-0e31-4ed3-a95f-8deeea92618a
Web View: https://ml.azure.com/runs/HD_49594788-0e31-4ed3-a95f-8deeea92618a?wsid=/subscriptions/610d6e37-4747-4a20-8

{'runId': 'HD_49594788-0e31-4ed3-a95f-8deeea92618a',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-04-16T16:13:25.850659Z',
 'endTimeUtc': '2021-04-16T16:24:47.466959Z',
 'properties': {'primary_metric_config': '{"name": "accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '85875807-2343-411a-a5a2-2d26744148c7',
  'score': '0.9132018209408195',
  'best_child_run_id': 'HD_49594788-0e31-4ed3-a95f-8deeea92618a_8',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg142852.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_49594788-0e31-4ed3-a95f-8deeea92618a/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=kX4C0wtXPQmtSvBf4Wxf5MBw9oT6NbL%2FUeXfF6tWBWk%3D&st=2021-04-16T16%3A14%3A58Z&se=2021-04-17T00%3A24%3A58Z&sp=r'},
 'submittedBy': 'ODL_User 142852'}

In [40]:
# Display the run-details widget for the HyperDrive parent run.
hyperdrive_widget = RunDetails(hdr)
hyperdrive_widget.show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [None]:
import joblib
# Get your best run and save the model from that run.

### YOUR CODE HERE ###

In [None]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###

In [None]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(### YOUR DATA OBJECT HERE ###)

In [None]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
#
# NOTE(review): this cell is still the unfilled template. Only the 30-minute timeout has
# a value; `task`, `primary_metric`, `training_data`, `label_column_name` and
# `n_cross_validations` are empty, so the cell is a SyntaxError until each keyword is
# given one. TODO: fill these in before running the notebook end to end.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task=,
    primary_metric=,
    training_data=,
    label_column_name=,
    n_cross_validations=)

In [None]:
# Submit your automl run

### YOUR CODE HERE ###

In [None]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###