## Azure Machine Learning Engineer
### Project 1 - Optimising an ML Pipeline in Azure

#### Step 1 Create a workspace
In this step we load the Azure ML workspace from its configuration file and set up an experiment.

In [14]:
# Load the workspace described by the local config file and open the experiment.

from azureml.core import Workspace, Experiment

# Alternative lookup by name: Workspace.get(name="udacity-project")
ws = Workspace.from_config()
ws.get_details()

# Name under which the runs of this notebook are grouped
experiment_name = 'udacity-project'
exp = Experiment(workspace=ws, name=experiment_name)

# Summarise the workspace, one field per line
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

# Open an interactive run in the experiment
run = exp.start_logging()

Workspace name: quick-starts-ws-141858
Azure region: southcentralus
Subscription id: d4ad7261-832d-46b2-b093-22156001df5b
Resource group: aml-quickstarts-141858


### Setup Compute
Create a new compute cluster, or reuse an existing one if it is already present.

In [15]:
# Provision (or reuse) the AmlCompute cluster that supplies the VM resources.

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Project constraints: vm_size "Standard_D2_V2", max_nodes no greater than 4.

# Name of the cluster to look up or create
cpu_cluster_name = 'cpu-cluster-01'

# Look the cluster up by name; a failed lookup means it has to be created.
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it')

compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [45]:
# Configure hyperparameter tuning of train.py with HyperDrive.

from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform,choice
from azureml.core import ScriptRunConfig
import os

# Search space: each child run gets one randomly drawn value per argument.
# The notes in this notebook say train.py treats --C as a float and
# --max_iter as an integer.
ps = RandomParameterSampling(
    {
        '--C': choice(1.0, 0.1, 0.05),
        '--max_iter': choice(50, 100, 150),
    }
)

# Early termination: Bandit policy with a slack factor of 0.1, checked at
# every evaluation interval, with the first check delayed by 5 intervals.
# (A median-stopping policy would be the alternative; Bandit is kept for simplicity.)
early_termination_policy = BanditPolicy(
    slack_factor=0.1,
    evaluation_interval=1,
    delay_evaluation=5,
)

# Folder that holds train.py; it is used as the source directory of each run.
script_folder = './training'

# Run configuration that executes train.py on the compute cluster
src = ScriptRunConfig(
    source_directory=script_folder,
    script='train.py',
    compute_target=compute_target,
)

# Combine run configuration, sampler and policy into the HyperDrive sweep:
# maximise 'accuracy' over at most 50 runs, 4 of them at a time.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    policy=early_termination_policy,
    primary_metric_name='accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=50,
    max_concurrent_runs=4,
)

In [46]:
# Launch the HyperDrive sweep in the project experiment.

from azureml.core.experiment import Experiment

experiment = Experiment(workspace=ws, name=experiment_name)
hyperdrive_run = experiment.submit(config=hyperdrive_config, show_output=True)

In [47]:
# Show the progress of the HyperDrive run in the notebook widget.
from azureml.widgets import RunDetails

hyperdrive_widget = RunDetails(hyperdrive_run)
hyperdrive_widget.show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/scikit-learn/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-deploy-with-sklearn.ipynb

In [None]:
import joblib

# Block until the whole sweep has finished. Without this the "best" run is
# chosen from whichever child runs happen to be complete when the cell executes.
hyperdrive_run.wait_for_completion(show_output=True)

# Get your best run and save the model from that run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_run is None:
    # No child run reported the primary metric; fail with a clear message
    # instead of an AttributeError on None further down.
    raise RuntimeError('No HyperDrive child run logged the primary metric.')

# Arguments (sampled hyperparameters) the best run was started with
print(best_run.get_details()['runDefinition']['arguments'])

# Files the best run produced
print(best_run.get_file_names())

# NOTE(review): assumes train.py writes the model to outputs/model.joblib - confirm.
model = best_run.register_model(model_name='hyperdrive', model_path='outputs/model.joblib')

### Now compare against AutoML

In [20]:
# Point `experiment_name` and `exp` at a separate experiment for the AutoML comparison.
experiment_name = 'AutoML-udacity-project'
exp = Experiment(ws, experiment_name)

In [21]:
from azureml.data.dataset_factory import TabularDatasetFactory
import pandas as pd

# Location of the bank-marketing training data (remote CSV file)
url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

# Build a TabularDataset straight from the remote delimited file
ds = TabularDatasetFactory.from_delimited_files(path=url)

In [36]:
# NOTE(review): assumes train.py is importable from the notebook's working
# directory and that clean_data accepts a pandas DataFrame - confirm.
from train import clean_data
from sklearn.model_selection import train_test_split

# Materialise the TabularDataset as a pandas DataFrame
ds_x = ds.to_pandas_dataframe()

# Use the clean_data function to clean your data.
x, y = clean_data(ds_x)

# AutoML can alternatively be given one combined frame plus the name of the
# target column; the features/label pair is kept separate here.

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
# train_test_split returns four pieces when given both x and y.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [31]:
# Preview the first five rows of the raw data
ds_x.head(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,57,technician,married,high.school,no,no,yes,cellular,may,mon,...,1,999,1,failure,-1.8,92.893,-46.2,1.299,5099.1,no
1,55,unknown,married,unknown,unknown,yes,no,telephone,may,thu,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0,no
2,33,blue-collar,married,basic.9y,no,no,no,cellular,may,fri,...,1,999,1,failure,-1.8,92.893,-46.2,1.313,5099.1,no
3,36,admin.,married,high.school,no,no,no,telephone,jun,fri,...,4,999,0,nonexistent,1.4,94.465,-41.8,4.967,5228.1,no
4,27,housemaid,married,high.school,no,yes,no,cellular,jul,fri,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,no


In [23]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.

# Shared AutoML settings. The primary metric name is the lowercase 'accuracy'.
automl_settings = {
    'experiment_timeout_minutes': 30,
    'primary_metric': 'accuracy',
    'n_cross_validations': 5,
    'enable_onnx_compatible_models': True,
}

# Pass the TabularDataset and the name of its label column directly, so this
# cell depends only on `ds` from the data-loading cell.
automl_config = AutoMLConfig(task='classification',
                             training_data=ds,
                             label_column_name='y',
                             **automl_settings)

In [2]:
# Submit your automl run
from azureml.core.experiment import Experiment

# Dedicated experiment for the AutoML run
experiment = Experiment(ws, "automl_udacity_project")

# Submit through the experiment created above and keep the returned run
# handle: the best model is later read from `local_run`.
local_run = experiment.submit(config=automl_config, show_output=True)

In [None]:
# Fetch the best AutoML child run together with its fitted model and print both.
# Nothing is written to disk in this cell.
automl_output = local_run.get_output()
best_run, fitted_model = automl_output
print(best_run, fitted_model, sep='\n')

## Clean Up Resources
To ensure we don't continue to accrue costs for these resources, delete them.

#### Delete Compute Instance and Cluster
1. In the Microsoft Azure Machine Learning Portal, select Compute on the far left.

2. From the list, select the compute instances and/or compute clusters you created.

3. Select Delete.