In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
ws.write_config(path='.azureml')
experiment_name = 'udacity_project1'
exp = Experiment(workspace=ws, name=experiment_name)

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code ADGRA7BQN to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
Workspace name: prj1-wsp
Azure region: westus2
Subscription id: 0c66ad45-500d-48af-80d3-0039ebf1975e
Resource group: prj1-rgp


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cpu_cluster_name = 'cmp'

try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Existing compute target.')

except:
    print('Creating compute target.')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

print(compute_target.get_status())

Existing compute target.
{
  "errors": [],
  "creationTime": "2021-01-10T19:11:37.779689+00:00",
  "createdBy": {
    "userObjectId": "49e75006-b9ac-415c-9176-f83c59d4bf26",
    "userTenantId": "d689239e-c492-40c6-b391-2c5951d31d14",
    "userName": "Mikhaylov, Dmitry"
  },
  "modifiedTime": "2021-01-10T19:13:24.594299+00:00",
  "state": "Running",
  "vmSize": "STANDARD_DS3_V2"
}


In [3]:
from azureml.core import ScriptRunConfig, Environment
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice, normal
import os

# Specify parameter sampler
ps = RandomParameterSampling({
    "C": uniform(0.0, 1.0), 
    "max_iter": choice(50, 100, 150, 200, 250)
})

# Specify a Policy
policy = BanditPolicy(
    slack_factor=0.1,
    evaluation_interval=1,
    delay_evaluation=5
)
if "training" not in os.listdir():
    os.mkdir("./training")
# Create a SKLearn estimator for use with train.py

script_folder = './training'
os.makedirs(script_folder, exist_ok=True)
import shutil
shutil.copy('./train.py', script_folder)

est = SKLearn(
    source_directory= script_folder,
    compute_target=compute_target,
    entry_script="train.py"
)
# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name="Accuracy",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_concurrent_runs=1,
    max_total_runs=40
)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

hyperdrive_run = exp.submit(hyperdrive_config)
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_31be76b3-dd82-45ca-9b0e-1817835b14c7
Web View: https://ml.azure.com/experiments/udacity_project1/runs/HD_31be76b3-dd82-45ca-9b0e-1817835b14c7?wsid=/subscriptions/0c66ad45-500d-48af-80d3-0039ebf1975e/resourcegroups/prg1-rsg/workspaces/prj1-wsp

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-09T22:09:55.555762][API][INFO]Experiment created<END>\n""<START>[2021-01-09T22:09:56.137011][GENERATOR][INFO]Trying to sample '1' jobs from the hyperparameter space<END>\n""<START>[2021-01-09T22:09:56.407792][GENERATOR][INFO]Successfully sampled '1' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-01-09T22:09:56.7365791Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>


In [6]:
model = best_run.register_model(model_name='model', model_path='azureml-logs/model.joblib')

NameError: name 'best_run' is not defined

In [7]:
import joblib
# Get your best run and save the model from that run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()

print('Best run: ', best_run)
print('Metrics: ', best_run_metrics)

Best run:  Run(Experiment: udacity_project1,
Id: HD_31be76b3-dd82-45ca-9b0e-1817835b14c7_20,
Type: azureml.scriptrun,
Status: Completed)
Metrics:  {'Regularization Strength:': 0.5057501115630281, 'Max iterations:': 100, 'Accuracy': 0.9155134741442098}


In [10]:

#model = best_run.register_model(model_name='hyperdrive_model', model_path='./outputs/model.pkl')
#os.makedirs('outputs', exist_ok=True)
#joblib.dump(value=best_run, filename='outputs/model.pkl')

In [4]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path)

In [5]:
from train import clean_data 
from sklearn.model_selection import train_test_split
import pandas as pd

# Use the clean_data function to clean your data.
x, y = clean_data(ds) 
x_train, x_test, y_train, y_test = train_test_split(x, y)
dataset = x_train.join(y_train)

# Get the default datastore to be entered as a parameter in tabular dataset creation
datastore = ws.get_default_datastore()

# Change pandas dataframe into a tabular dataset to be used in automl
training_data = TabularDatasetFactory.register_pandas_dataframe(dataset, datastore, "training_data")

Method register_pandas_dataframe: This is an experimental method, and may change at any time.<br/>For more information, see https://aka.ms/azuremlexperimental.


Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/30f35065-e260-415b-8404-d77ff8d51e53/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [6]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig

automl_config = AutoMLConfig(compute_target=cpu_cluster_name,
experiment_timeout_minutes=30,
task='classification',
primary_metric='accuracy',
training_data=training_data,
label_column_name='y',
n_cross_validations=4) 

In [14]:
print(type(training_data))

<class 'azureml.data.tabular_dataset.TabularDataset'>


In [7]:
# Submit your automl run

experiment = Experiment(ws, 'automl_model')
print("Experiment created")
run = experiment.submit(config=automl_config, show_output=True)
RunDetails(run).show()
run.wait_for_completion(show_output=True)

Experiment created
Running on remote.
No run_configuration provided, running on cmp with default configuration
Running on remote compute: cmp
Parent Run ID: AutoML_afe891b4-1d1c-4a66-ab6c-e9486ee9a2e4

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+----------------------

In [8]:
# Retrieve and save your best automl model.
best_run, fitted_model = run.get_output()
print(fitted_model)
best_run_metrics = best_run.get_metrics()
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                fit_intercept=True,
                                                                                                intercept_scaling=1,
                                                                                                l1_ratio=None,
                                                

In [9]:
print(best_run)

Run(Experiment: automl_model,
Id: AutoML_afe891b4-1d1c-4a66-ab6c-e9486ee9a2e4_28,
Type: azureml.scriptrun,
Status: Completed)


In [11]:
# Register the model produced by AutoML
#joblib.dump(value=fitted_model, filename="fitted_automl_model.joblib")
#automl_model = best_run.register_model(model_name='automl_model.pkl', model_path = './outputs/')

In [22]:
# Delete the cluster instance
AmlCompute.delete(compute_target)