In [1]:
from azureml.core import Workspace, Experiment
#from azureml.core import Workspace

ws = Workspace.from_config()
ws.write_config(path='.azureml')
experiment_name = 'udacity_project1'
exp = Experiment(workspace=ws, name=experiment_name)


# Old setup:
#ws = Workspace.get(name="project1_wsp")
#exp = Experiment(workspace=ws, name="project1_exp")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: project1_wsp
Azure region: westus2
Subscription id: 0c66ad45-500d-48af-80d3-0039ebf1975e
Resource group: rsr_grp


In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

cpu_cluster_name = 'cmp'

try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Existing compute target.')

except:
    print('Creating compute target.')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

#compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
print(compute_target.get_status())

Existing compute target.
{
  "errors": [],
  "creationTime": "2021-01-09T16:42:23.518080+00:00",
  "createdBy": {
    "userObjectId": "49e75006-b9ac-415c-9176-f83c59d4bf26",
    "userTenantId": "d689239e-c492-40c6-b391-2c5951d31d14",
    "userName": "Mikhaylov, Dmitry"
  },
  "modifiedTime": "2021-01-09T16:44:40.497653+00:00",
  "state": "Running",
  "vmSize": "STANDARD_DS2_V2"
}


In [5]:
from azureml.core import ScriptRunConfig, Environment
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice, normal
import os
'''
# Specify parameter sampler
ps = RandomParameterSampling({'--C': choice(0.5, 0.7, 1), '-max_iter': choice(10, 11)})

# Specify a Policy

policy = BanditPolicy(slack_factor = 0.1, evaluation_interval=1, delay_evaluation=5)
if "training" not in os.listdir():
    os.mkdir("./training")


# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory = './',
entry_script = 'train.py', 
compute_target = cpu_cluster_name)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.

primary_metric_name = 'Accuracy'
primary_metric_goal = PrimaryMetricGoal.MAXIMIZE

hyperdrive_config = HyperDriveConfig(hyperparameter_sampling=ps, 
primary_metric_name=primary_metric_name,
max_total_runs = 4,
max_concurrent_runs = 2,
primary_metric_goal = primary_metric_goal,
policy=policy,
estimator=est)
'''
###


# Specify parameter sampler
ps = RandomParameterSampling({
    "C": uniform(0.0, 1.0), 
    "max_iter": choice(50, 100, 150, 200, 250)
})

# Specify a Policy
policy = BanditPolicy(
    slack_factor=0.1,
    evaluation_interval=1,
    delay_evaluation=5
)
if "training" not in os.listdir():
    os.mkdir("./training")
# Create a SKLearn estimator for use with train.py

script_folder = './training'
os.makedirs(script_folder, exist_ok=True)
import shutil
shutil.copy('./train.py', script_folder)

est = SKLearn(
    source_directory= script_folder,
    compute_target=compute_target,
    entry_script="train.py"
)
# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name="Accuracy",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_concurrent_runs=1,
    max_total_runs=40
)




'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [6]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

#from azureml.core.experiment import Experiment
#from azureml.widgets import RunDetails

hyperdrive_run = exp.submit(hyperdrive_config)
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)
#hyperdrive_run.wait_for_completion(show_output=True)




_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_85c4e81a-0394-4e70-bf9e-50939ec15cd5
Web View: https://ml.azure.com/experiments/udacity_project1/runs/HD_85c4e81a-0394-4e70-bf9e-50939ec15cd5?wsid=/subscriptions/0c66ad45-500d-48af-80d3-0039ebf1975e/resourcegroups/rsr_grp/workspaces/project1_wsp

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-09T18:05:05.621642][API][INFO]Experiment created<END>\n""<START>[2021-01-09T18:05:06.270714][GENERATOR][INFO]Trying to sample '1' jobs from the hyperparameter space<END>\n""<START>[2021-01-09T18:05:06.484236][GENERATOR][INFO]Successfully sampled '1' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-01-09T18:05:07.0644307Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>


In [5]:
model = best_run.register_model(model_name='model', model_path='azureml-logs/model.joblib')

RunId: HD_e88a1548-abe1-452a-8636-c6748d6b1839
Web View: https://ml.azure.com/experiments/project1_exp/runs/HD_e88a1548-abe1-452a-8636-c6748d6b1839?wsid=/subscriptions/0c66ad45-500d-48af-80d3-0039ebf1975e/resourcegroups/rsr_grp/workspaces/project1_wsp

Streaming azureml-logs/hyperdrive.txt


Execution Summary
RunId: HD_e88a1548-abe1-452a-8636-c6748d6b1839
Web View: https://ml.azure.com/experiments/project1_exp/runs/HD_e88a1548-abe1-452a-8636-c6748d6b1839?wsid=/subscriptions/0c66ad45-500d-48af-80d3-0039ebf1975e/resourcegroups/rsr_grp/workspaces/project1_wsp



{'runId': 'HD_e88a1548-abe1-452a-8636-c6748d6b1839',
 'target': 'cmp',
 'status': 'Canceled',
 'startTimeUtc': '2021-01-09T17:46:51.165527Z',
 'endTimeUtc': '2021-01-09T17:49:23.933465Z',
 'error': {'error': {'code': 'UserError',
   'message': 'User errors were found in at least one of the child runs.',
   'messageParameters': {},
   'details': []},
  'time': '0001-01-01T00:00:00.000Z'},
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'abc31b98-0732-4bd8-b6e0-82412f99ca2c'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://project1wsp4834081590.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_e88a1548-abe1-452a-8636-c6748d6b1839/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=NqDC5Zyv%2F7cZHO0qtiB6kfUDeN4eA9LcBIhqqgIDmu8%3D&st=2021-01-09T17%3A39%3A26Z&se=2021-

In [28]:
import joblib
# Get your best run and save the model from that run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()

# Try:
# best_run = run.get_best_run_by_primary_metric()
# best_run.register_model(model_path='outputs/model.joblib', model_name='BankMarketingModel'


print('Best run: ', best_run)
print('Metrics: ', best_run_metrics)


AttributeError: 'NoneType' object has no attribute 'get_metrics'

In [None]:
model = best_run.register_model(model_name='hyperdrive_model', model_path='./outputs/model.pkl')

#os.makedirs('outputs', exist_ok=True)
#joblib.dump(value=best_run, filename='outputs/model.pkl')

In [9]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path)
### YOUR CODE HERE ###

In [10]:
from train import clean_data 
from sklearn.model_selection import train_test_split
import pandas as pd

# Use the clean_data function to clean your data.
x, y = clean_data(ds) ### YOUR DATA OBJECT HERE ###)
x_train, x_test, y_train, y_test = train_test_split(x, y)

#df_train = pd.concat([x_train, y_train], axis=1)
dataset = x_train.join(y_train)

# Get the default datastore to be entered as a parameter in tabular dataset creation
datastore = ws.get_default_datastore()

# Change pandas dataframe into a tabular dataset to be used in automl
training_data = TabularDatasetFactory.register_pandas_dataframe(dataset, datastore, "training_data")



Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/4c89a487-aed1-4b3b-b44e-d5a3509a0cd2/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [11]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(compute_target=cpu_cluster_name,
experiment_timeout_minutes=30,
task='classification',
primary_metric='accuracy',
training_data=training_data,
label_column_name='y',
n_cross_validations=4) 

In [12]:
print(type(training_data))

<class 'azureml.data.tabular_dataset.TabularDataset'>


In [13]:
# Submit your automl run
#from azureml.core.experiment import Experiment

experiment = Experiment(ws, 'automl_model')
print("Experiment created")

run = experiment.submit(config=automl_config, show_output=True)
RunDetails(run).show()
run.wait_for_completion(show_output=True)

Experiment created
Running on remote.
No run_configuration provided, running on project1-cmp with default configuration
Running on remote compute: project1-cmp
Parent Run ID: AutoML_447ca5c5-a63d-407c-a351-f145d5c3689f

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+----

In [14]:
# Retrieve and save your best automl model.
best_run, fitted_model = run.get_output()
#print(best_run)
print(fitted_model)
best_run_metrics = best_run.get_metrics()
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)

Run(Experiment: automl_model,
Id: AutoML_447ca5c5-a63d-407c-a351-f145d5c3689f_27,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                  min_samples_split=0.10368421052631578,
                                                                                                  min_weight_fraction_leaf=0.0,
 

In [15]:
print(best_run)

Run(Experiment: automl_model,
Id: AutoML_447ca5c5-a63d-407c-a351-f145d5c3689f_27,
Type: azureml.scriptrun,
Status: Completed)


In [16]:
joblib.dump(value=fitted_model, filename="fitted_automl_model.joblib")

automl_model = best_run.register_model(model_name='automl_model.pkl', model_path = './outputs/')

In [None]:
# Delete the cluster instance
AmlCompute.delete(compute_target)