## Azure Machine Learning Engineer
### Project 1 - Optimising an ML Pipeline in Azure

#### Step 1 Create a workspace
In this step we connect to the Azure ML workspace and set up an experiment.

In [2]:
# Create a new workspace and define an experiment.

from azureml.core import Workspace, Experiment

# Obtain the workspace handle via Workspace.from_config().
# (Alternative kept from the original: Workspace.get(name="udacity-project").)
ws = Workspace.from_config()
ws.get_details()

# Name of the experiment, and the experiment handle itself.
experiment_name = 'udacity-project'
exp = Experiment(workspace=ws, name=experiment_name)

# Report the workspace attributes, one per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

# Start an interactive logging run on the experiment.
run = exp.start_logging()

Workspace name: quick-starts-ws-141920
Azure region: southcentralus
Subscription id: 2c48c51c-bd47-40d4-abbe-fb8eabd19c8c
Resource group: aml-quickstarts-141920


### Setup Compute
Create a new compute cluster, or reuse an existing one if it is already present.

In [3]:
# Create a compute cluster to provision VM Resources.

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

# Name under which the cluster is looked up (and created, if missing).
cpu_cluster_name = 'cpu-cluster-01'

# Reuse the cluster when the lookup succeeds; otherwise provision a new one.
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it')

# Block until the compute target reports completion, echoing progress.
compute_target.wait_for_completion(show_output=True)

Creating...
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [56]:
# Setup Hyperparameter Tuning with Hyperdrive.

from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform,choice
import os
import shutil
from azureml.core import ScriptRunConfig

# Define the hyperparameter search space and the sampling method.
# Each child run receives one value per argument, drawn at random from the
# listed choices; the keys are the command-line flags passed to train.py.
# Per the original notes, train.py declares --C as float and --max_iter as int.
ps = RandomParameterSampling({
    '--C': choice(1.0, 0.1, 0.05),
    '--max_iter': choice(50, 100, 150)})

# Early-termination policy (median stopping is an alternative; Bandit is kept
# for simplicity). slack_factor=0.1 is the allowed slack relative to the best
# run; the policy is applied at every interval (evaluation_interval=1) but only
# after the first 5 intervals (delay_evaluation=5).
early_termination_policy = BanditPolicy(slack_factor=0.1,
                                        evaluation_interval=1,
                                        delay_evaluation=5)

# Stage train.py in a dedicated source directory for the estimator.
# makedirs(exist_ok=True) already covers both the "missing" and "present" cases,
# so no separate os.listdir()/os.mkdir() check is needed.
script_folder = './training'
os.makedirs(script_folder, exist_ok=True)
shutil.copy('./train.py', script_folder)

# SKLearn estimator that runs train.py on the compute cluster.
# vm_size uses the same size the cluster is provisioned with; the previous
# value 'Standard_d2_v' was a truncated, invalid VM size name.
est = SKLearn(source_directory=script_folder,
              entry_script='train.py',
              compute_target=compute_target,
              vm_size='STANDARD_D2_V2')

# HyperDrive sweep: maximise the 'Accuracy' metric, at most 100 child runs in
# total, at most 4 running concurrently.
hyperdrive_config = HyperDriveConfig(estimator=est,
                                     hyperparameter_sampling=ps,
                                     policy=early_termination_policy,
                                     primary_metric_name='Accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=100,
                                     max_concurrent_runs=4)



In [57]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

from azureml.core.experiment import Experiment

# Submit through `exp`, the experiment handle created in the workspace-setup
# cell. The name `experiment` is not defined at this point on a fresh kernel,
# so using it raised NameError under Restart & Run All.
hyperdrive_run = exp.submit(hyperdrive_config, show_output=True)



In [58]:
from azureml.widgets import RunDetails
# Show the run-details widget for the HyperDrive run.
run_details_widget = RunDetails(hyperdrive_run)
run_details_widget.show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [None]:
import joblib
# Get the best HyperDrive child run, download its model file and register it.
import os

best_hyperdrive_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_hyperdrive_run is None:
    # No child run logged the primary metric, so there is nothing to register.
    raise RuntimeError("HyperDrive produced no run with the primary metric logged.")

best_hyperdrive_run_metrics = best_hyperdrive_run.get_metrics()
print("Best Run Metrics :", best_hyperdrive_run_metrics)

# Pick the serialized model explicitly instead of "whatever file is listed
# last": the run's file list also contains logs, so get_file_names()[-1] is not
# guaranteed to be the model.
model_files = [name for name in best_hyperdrive_run.get_file_names()
               if name.endswith(".joblib")]
if not model_files:
    raise FileNotFoundError(
        "The best run uploaded no .joblib file; make sure train.py saves the "
        "model under ./outputs/ so that it is uploaded with the run."
    )
run_model_path = model_files[0]

# Keep a local copy; download_file is given a full file path, not a directory.
os.makedirs("./outputs", exist_ok=True)
local_model_path = os.path.join("./outputs", os.path.basename(run_model_path))
best_hyperdrive_run.download_file(run_model_path, output_file_path=local_model_path)

# model_path is the path of the file inside the run (as listed by
# get_file_names()), not a local path. Tag values are converted to strings.
best_hyperdrive_model = best_hyperdrive_run.register_model(
    model_name="best_hyperdrive_model",
    model_path=run_model_path,
    tags={name: str(value) for name, value in best_hyperdrive_run_metrics.items()}
)

In [63]:
import joblib
# NOTE(review): every statement in this cell is commented out, so running it does
# nothing beyond the import above. The recorded output below appears to come from
# an earlier execution in which these lines were live — confirm, then either
# restore the lines or delete the cell.
#Get your best run and save the model from that run.

#best_run = hyperdrive_run.get_best_run_by_primary_metric()
#best_run_metrics = best_run.get_metrics()

#print(best_run.get_details()['runDefinition']['arguments'])

#print(best_run.get_file_names())
#print("Best Run Metrics :", best_run.get_metrics())
#print("Best Run Accuracy : ",best_run_metrics['Accuracy'])


['--C', '0.05', '--max_iter', '150']
['azureml-logs/55_azureml-execution-tvmps_bffa2edad86269e39f3a16292c66d0afdcd244e7bdd67b2a53ce5e54c1da526b_d.txt', 'azureml-logs/65_job_prep-tvmps_bffa2edad86269e39f3a16292c66d0afdcd244e7bdd67b2a53ce5e54c1da526b_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_bffa2edad86269e39f3a16292c66d0afdcd244e7bdd67b2a53ce5e54c1da526b_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/106_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log']
Best Run Metrics : {'Regularization Strength:': 0.05, 'Max iterations:': 150, 'Accuracy': 0.9097622660596864}
Best Run Accuracy :  0.9097622660596864


In [64]:
# NOTE(review): this cell is fully commented out. The recorded traceback below
# reports that 'outputs/model.joblib' was not among the files uploaded to the
# run (only log files are listed), i.e. the training run did not upload a model
# at that path.
#from azureml.core import Model
#if "outputs" not in os.listdir():
    #os.mkdir("./outputs")

#model = best_run.register_model(model_name ='hyperdrive',
                                #model_path = './outputs/model.joblib')

ModelPathNotFoundException: ModelPathNotFoundException:
	Message: Could not locate the provided model_path outputs/model.joblib in the set of files uploaded to the run: ['azureml-logs/55_azureml-execution-tvmps_bffa2edad86269e39f3a16292c66d0afdcd244e7bdd67b2a53ce5e54c1da526b_d.txt', 'azureml-logs/65_job_prep-tvmps_bffa2edad86269e39f3a16292c66d0afdcd244e7bdd67b2a53ce5e54c1da526b_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_bffa2edad86269e39f3a16292c66d0afdcd244e7bdd67b2a53ce5e54c1da526b_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/106_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log']
                See https://aka.ms/run-logging for more details.
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Could not locate the provided model_path outputs/model.joblib in the set of files uploaded to the run: ['azureml-logs/55_azureml-execution-tvmps_bffa2edad86269e39f3a16292c66d0afdcd244e7bdd67b2a53ce5e54c1da526b_d.txt', 'azureml-logs/65_job_prep-tvmps_bffa2edad86269e39f3a16292c66d0afdcd244e7bdd67b2a53ce5e54c1da526b_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_bffa2edad86269e39f3a16292c66d0afdcd244e7bdd67b2a53ce5e54c1da526b_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/106_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log']\n                See https://aka.ms/run-logging for more details."
    }
}

### Now compare against AutoML

In [35]:
# Separate experiment for the AutoML comparison.
# Note: this rebinds both `experiment_name` and `exp`.
experiment_name = 'AutoML-udacity-project'
exp = Experiment(ws, experiment_name)

In [36]:
from azureml.data.dataset_factory import TabularDatasetFactory
import pandas as pd

# Build the dataset with TabularDatasetFactory from the public
# bank-marketing training CSV.
url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

# The factory is handed the remote CSV location; the result is bound to `ds`.
ds = TabularDatasetFactory.from_delimited_files(path=url)

In [47]:
from train import clean_data
from sklearn.model_selection import train_test_split

# Clean the dataset with train.py's clean_data helper, which returns the
# features and the label separately.
x, y = clean_data(ds)

# AutoML takes one frame plus the name of the label column, so place the
# features and the label side by side, column-wise.
datafinal = pd.concat([x, y], axis="columns")

#x_train, x_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [68]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
#
# The argument list previously lacked a comma after `compute_target`, which made
# the whole cell a SyntaxError.
# NOTE(review): `datafinal` is built with pd.concat, i.e. a pandas DataFrame.
# Confirm that a remote compute_target accepts a DataFrame as training_data, and
# that the label column produced by clean_data is actually named 'y'.
automl_config = AutoMLConfig(
    task='classification',
    primary_metric='accuracy',
    experiment_timeout_minutes=30,
    compute_target=compute_target,
    training_data=datafinal,
    label_column_name='y',
    n_cross_validations=5,
    enable_onnx_compatible_models=True)

In [69]:
# Submit your automl run
# The workspace handle in this notebook is `ws`; the previous `workspace` name
# is not defined anywhere and raised NameError on a fresh kernel.
experiment = Experiment(ws, "udacity_automl")

# The run itself is still submitted through `exp` (the 'AutoML-udacity-project'
# experiment), exactly as before, so its destination is unchanged.
# NOTE(review): `experiment` above is not used for the submission — decide which
# of the two experiments the AutoML run should belong to and drop the other.
automl_run = exp.submit(config=automl_config, show_output=True)

No run_configuration provided, running on local with default configuration


ConfigException: ConfigException:
	Message: Module 'azureml.train.automl.runtime' is required in the current environment for running Remote or Local (in-process) runs. Please install this dependency (e.g. `pip install azureml.train.automl.runtime`) or provide a RunConfiguration.
	InnerException: ModuleNotFoundError: No module named 'azureml.train.automl.runtime'
	ErrorResponse 
{
    "error": {
        "code": "UserError",
        "message": "Module 'azureml.train.automl.runtime' is required in the current environment for running Remote or Local (in-process) runs. Please install this dependency (e.g. `pip install azureml.train.automl.runtime`) or provide a RunConfiguration.",
        "target": "compute_target",
        "inner_error": {
            "code": "NotSupported",
            "inner_error": {
                "code": "IncompatibleOrMissingDependency"
            }
        }
    }
}

## Clean Up Resources
To ensure we don't continue to accrue costs for the resources, delete them once the experiments are finished.

#### Delete Compute Instance and Cluster
1. In the Microsoft Azure Machine Learning Portal, select Compute on the far left.

2. From the list, select the compute instances and/or compute clusters you created.

3. Select Delete.

In [None]:
# Delete the compute cluster so it stops accruing cost.
compute_target.delete()

# Wait for the deletion itself: is_delete_operation=True tells
# wait_for_completion that the pending operation is a delete rather than a
# provisioning operation.
compute_target.wait_for_completion(show_output=True, is_delete_operation=True)