# Azure Machine Learning Engineer
## Project 3 - Capstone

## Create a workspace
In this step we connect to the existing Azure ML workspace (loaded from the local configuration) and set up an experiment.

In [1]:
# Connect to the Azure ML workspace and open the experiment used for HyperDrive.

from azureml.core import Workspace, Experiment

# Alternative lookup by workspace name, kept for reference:
#ws = Workspace.get(name="udacity-project")
ws = Workspace.from_config()
ws.get_details()

# Name under which the HyperDrive runs are tracked
experiment_name = 'udacity-project-hyperdrive'
experiment = Experiment(name=experiment_name, workspace=ws)

# Report the workspace properties, one per line
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

# Open an interactive logging run in the experiment
run = experiment.start_logging()

Workspace name: quick-starts-ws-146358
Azure region: southcentralus
Subscription id: f9d5a085-54dc-4215-9ba6-dad5d86e60a0
Resource group: aml-quickstarts-146358


### Setup Compute
Create a new compute cluster, or reuse the existing one if it is already present.

In [2]:
# Provision (or reuse) the AmlCompute cluster that supplies the VM resources.

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Project constraints for the cluster:
#   - vm_size must be "Standard_D2_V2"
#   - max_nodes must not exceed 4

# Name of the cluster to look up or create
cpu_cluster_name = 'cpu-cluster-01'

# Reuse the cluster when it already exists; otherwise provision a new one
try:
    compute_target = ComputeTarget(name=cpu_cluster_name, workspace=ws)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        max_nodes=4,
        vm_size='STANDARD_D2_V2',
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cpu_cluster_name,
        provisioning_configuration=compute_config,
    )
else:
    print('Found existing cluster, use it')

# Block until the cluster is ready, streaming provisioning status
compute_target.wait_for_completion(show_output=True)

Creating.........
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Dataset

In [8]:
# Load the registered 'Adult' dataset from the workspace into a pandas DataFrame.
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset

# Take the identifiers from the workspace handle `ws` (loaded with
# Workspace.from_config() in the first cell) instead of hard-coding lab-specific
# values, so the notebook keeps working when the lab workspace changes.
subscription_id = ws.subscription_id
resource_group = ws.resource_group
workspace_name = ws.name

# Reuse the already-authenticated workspace rather than constructing a second one
workspace = ws

dataset = Dataset.get_by_name(workspace, name='Adult')

ds = dataset.to_pandas_dataframe()

## Hyperdrive Configuration

In [39]:
# Set up hyperparameter tuning with HyperDrive.

from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform,choice
import os
import shutil
from azureml.core import ScriptRunConfig

# Parameter search space: random sampling over small discrete sets.
# choice(a, b) picks one of the listed values (e.g. 2 or 5), it is not a range.
# NOTE(review): the sampled names are handed to train.py as script arguments;
# confirm train.py actually accepts `max_depth` and `learning_rate`.
ps = RandomParameterSampling({
    'max_depth' : choice(2,5), #limited for simplicity
    'learning_rate' : choice (1,10) #limited for simplicity
})

# Early termination policy (median stopping is an alternative; Bandit kept for simplicity).
# The policy is applied at every interval (evaluation_interval=1) once the first
# 5 intervals have passed (delay_evaluation=5); a run is stopped when its primary
# metric is outside the 10% slack (slack_factor=0.1) of the best run so far.
early_termination_policy = BanditPolicy(slack_factor = 0.1, evaluation_interval=1, delay_evaluation=5)

# Create the training directory (no-op if it already exists) and stage the script in it.
script_folder = './training'
os.makedirs(script_folder, exist_ok = True)
shutil.copy('./train.py',script_folder)

# SKLearn estimator that runs train.py on the existing cluster.
# The former vm_size='Standard_d2_v' argument was dropped: it is not a valid VM
# size name and the run already targets the provisioned `compute_target`.
est = SKLearn(source_directory = script_folder,
              entry_script ='train.py',
              compute_target = compute_target)

hyperdrive_config = HyperDriveConfig(estimator = est,
                                     hyperparameter_sampling = ps,
                                     policy = early_termination_policy,
                                     primary_metric_name = 'Accuracy', #Accuracy for classification
                                     primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs = 50,
                                     max_concurrent_runs = 4)



## Run Details

In [40]:
# Submit the HyperDrive run to the experiment, show run details with the widget,
# and wait for the run to finish.
from azureml.widgets import RunDetails
from azureml.core.experiment import Experiment

#experiment = Experiment(ws, experiment_name)
hyperdrive_run = experiment.submit(hyperdrive_config, show_output = True)

RunDetails(hyperdrive_run).show()

# Block until all child runs are done; without this, a "Run All" execution reaches
# the best-model cell before any metrics exist and the best-run lookup fails.
hyperdrive_run.wait_for_completion(show_output = True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

## Best Model

In [11]:
import os

import joblib
# Get the best HyperDrive run, download its model file and register the model.

best_hyperdrive_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_hyperdrive_run is None:
    # Happens when no child run has logged the primary metric (e.g. run not finished)
    raise RuntimeError("HyperDrive returned no best run; has the run completed and logged 'Accuracy'?")

# Fetch the metrics once and reuse them for printing and for the model tags
best_hyperdrive_run_metrics = best_hyperdrive_run.get_metrics()

print("Best Run Metrics :", best_hyperdrive_run_metrics)

# Path of the model file inside the run's artifacts (relative cloud path).
# Addressing the file by name avoids depending on the order of get_file_names().
model_cloud_path = "outputs/best_hyperdrive_model.joblib"

# Keep a local copy under ./outputs, mirroring the cloud layout
os.makedirs("outputs", exist_ok=True)
best_hyperdrive_run.download_file(
    name=model_cloud_path,
    output_file_path=model_cloud_path
)
best_hyperdrive_model = best_hyperdrive_run.register_model(
    model_name="best_hyperdrive_model",
    model_path=model_cloud_path,
    tags=best_hyperdrive_run_metrics
)

Best Run Metrics : {'Regularization Strength:': 0.05, 'Max iterations:': 50, 'Accuracy': 0.9097622660596864}


### Now compare against AutoML

In [12]:
from azureml.data.dataset_factory import TabularDatasetFactory
import pandas as pd

# Build a TabularDataset directly from the public UCI "adult" data file.

# Remote location of the delimited source file
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# The factory yields a TabularDataset (not a pandas DataFrame). Note that this
# rebinds `ds`, which held a DataFrame in the Dataset section above.
# NOTE(review): default header handling is used here - confirm the file's first
# row really is a header row.
ds = TabularDatasetFactory.from_delimited_files(path=url)

In [13]:
from train import clean_data
from sklearn.model_selection import train_test_split

# Clean the dataset with train.py's helper; it returns the features and the label.
x, y = clean_data(ds)

# AutoML settings accept a single table plus the name of the target column,
# so keep features and label together in one frame as well.
datafinal = pd.concat([x,y], axis = 1)

# Hold out 30% of the rows for testing; fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Preview the combined frame. This must be the last expression of the cell to be
# rendered; in the middle of the cell its result was silently discarded.
datafinal.head()

In [15]:
from azureml.train.automl import AutoMLConfig

# AutoML classification settings.
# NOTE: DO NOT CHANGE experiment_timeout_minutes, OR YOUR INSTANCE WILL TIME OUT.
# Running the experiment for longer requires your own Azure tenant, which will
# incur personal costs.
# NOTE(review): training_data is the raw `ds`, not the cleaned `datafinal` built
# in the previous cell - confirm this is intended and that a 'y' column exists.
automl_config = AutoMLConfig(
    task='classification',
    primary_metric='accuracy',
    training_data=ds,
    label_column_name='y',
    n_cross_validations=3,  # 5 is more typical; 3 keeps the run shorter
    compute_target=compute_target,
    experiment_timeout_minutes=30,
    enable_onnx_compatible_models=True,
)

In [16]:
# Launch the AutoML experiment and stream its progress into the cell output.
# Note: `experiment` is rebound here and no longer refers to the HyperDrive experiment.
experiment = Experiment(workspace=ws, name='udacity_automl')
automl_run = experiment.submit(automl_config, show_output=True)

Running on remote.
No run_configuration provided, running on cpu-cluster-01 with default configuration
Running on remote compute: cpu-cluster-01
Parent Run ID: AutoML_573ad1d1-40a4-457f-a797-cdccc223a339

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the

In [19]:
pip install azureml-train-automl-runtime

Collecting azureml-train-automl-runtime
  Downloading azureml_train_automl_runtime-1.18.0.post1-py3-none-any.whl (119 kB)
[K     |████████████████████████████████| 119 kB 8.6 MB/s eta 0:00:01
[?25hCollecting keras2onnx<=1.6.0,>=1.4.0
  Downloading keras2onnx-1.6.0-py3-none-any.whl (219 kB)
[K     |████████████████████████████████| 219 kB 10.9 MB/s eta 0:00:01
Collecting azureml-dataset-runtime[fuse,pandas]~=1.18.0
  Downloading azureml_dataset_runtime-1.18.0-py3-none-any.whl (3.4 kB)
Collecting skl2onnx==1.4.9
  Downloading skl2onnx-1.4.9-py2.py3-none-any.whl (114 kB)
[K     |████████████████████████████████| 114 kB 12.9 MB/s eta 0:00:01
Collecting onnxconverter-common<=1.6.0,>=1.4.2
  Downloading onnxconverter_common-1.6.0-py2.py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 1.1 MB/s  eta 0:00:01
[?25hCollecting onnxmltools==1.4.1
  Downloading onnxmltools-1.4.1-py2.py3-none-any.whl (371 kB)
[K     |████████████████████████████████| 371 kB 12.4 MB/s eta 

Collecting nimbusml<=1.8.0,>=1.7.1
  Downloading nimbusml-1.8.0-cp38-none-manylinux1_x86_64.whl (114.1 MB)
[K     |████████████████████████████████| 114.1 MB 67 kB/s s eta 0:00:01
Collecting pyopenssl<20.0.0
  Downloading pyOpenSSL-19.1.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.3 MB/s  eta 0:00:01
Collecting interpret-community==0.15.*
  Downloading interpret_community-0.15.4-py3-none-any.whl (5.9 MB)
[K     |████████████████████████████████| 5.9 MB 103.4 MB/s eta 0:00:01
Collecting azureml-dataprep-rslex<1.3.0a,>=1.2.0dev0
  Downloading azureml_dataprep_rslex-1.2.3-cp38-cp38-manylinux2010_x86_64.whl (7.9 MB)
[K     |████████████████████████████████| 7.9 MB 52.3 MB/s eta 0:00:01
[?25hCollecting azureml-dataprep-native<25.0.0,>=24.0.0
  Downloading azureml_dataprep_native-24.0.0-cp38-cp38-manylinux1_x86_64.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 54.2 MB/s eta 0:00:01


[31mERROR: azureml-widgets 1.24.0 has requirement azureml-core~=1.24.0, but you'll have azureml-core 1.18.0.post4 which is incompatible.[0m
[31mERROR: azureml-widgets 1.24.0 has requirement azureml-telemetry~=1.24.0, but you'll have azureml-telemetry 1.18.0.post1 which is incompatible.[0m
[31mERROR: azureml-train-core 1.24.0 has requirement azureml-core~=1.24.0, but you'll have azureml-core 1.18.0.post4 which is incompatible.[0m
[31mERROR: azureml-train-core 1.24.0 has requirement azureml-telemetry~=1.24.0, but you'll have azureml-telemetry 1.18.0.post1 which is incompatible.[0m
[31mERROR: azureml-tensorboard 1.24.0 has requirement azureml-core~=1.24.0, but you'll have azureml-core 1.18.0.post4 which is incompatible.[0m
[31mERROR: azureml-sdk 1.24.0 has requirement azureml-core~=1.24.0, but you'll have azureml-core 1.18.0.post4 which is incompatible.[0m
[31mERROR: azureml-sdk 1.24.0 has requirement azureml-dataset-runtime[fuse]~=1.24.0, but you'll have azureml-dataset-runt

  Attempting uninstall: azureml-dataprep-rslex
    Found existing installation: azureml-dataprep-rslex 1.9.0
    Uninstalling azureml-dataprep-rslex-1.9.0:
      Successfully uninstalled azureml-dataprep-rslex-1.9.0
  Attempting uninstall: azureml-dataprep-native
    Found existing installation: azureml-dataprep-native 30.0.0
    Uninstalling azureml-dataprep-native-30.0.0:
      Successfully uninstalled azureml-dataprep-native-30.0.0
  Attempting uninstall: azureml-dataprep
    Found existing installation: azureml-dataprep 2.11.1
    Uninstalling azureml-dataprep-2.11.1:
      Successfully uninstalled azureml-dataprep-2.11.1
  Attempting uninstall: azureml-dataset-runtime
    Found existing installation: azureml-dataset-runtime 1.24.0
    Uninstalling azureml-dataset-runtime-1.24.0:
      Successfully uninstalled azureml-dataset-runtime-1.24.0
  Attempting uninstall: pyopenssl
    Found existing installation: pyOpenSSL 20.0.1
    Uninstalling pyOpenSSL-20.0.1:
      Successfully unins

In [20]:
import os

import azureml.train.automl

import azureml.automl.core
from azureml.automl.runtime.onnx_convert import OnnxConverter

# Retrieve the best AutoML child run together with its ONNX model
best_automl_run, best_automl_onnx_model = automl_run.get_output(return_onnx_model = True)

# Make sure the local target folder exists (it may not on a fresh kernel)
# before writing the ONNX file into it.
os.makedirs('outputs', exist_ok = True)
OnnxConverter.save_onnx_model(best_automl_onnx_model, file_path = 'outputs/best_automl_model.onnx')

## Model Deployment

Remember that you only have to deploy one of the two models you trained. Perform the steps in the rest of this notebook only if you wish to deploy this model.