In [6]:
from azureml.core import Workspace, Experiment

# Load the workspace via Workspace.from_config() and bind the experiment
# named "udacity-project" in it.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

# Summarise which workspace the notebook is attached to, one field per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print("\n".join(workspace_summary))

# Start an interactive logging run on the experiment and keep its handle.
run = exp.start_logging()

Workspace name: quick-starts-ws-269070
Azure region: westeurope
Subscription id: 510b94ba-e453-4417-988b-fbdc37b55ca7
Resource group: aml-quickstarts-269070


In [7]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "udacity-project"

# Compute cluster requirements for this project:
#   - vm_size must be "Standard_D2_V2"
#   - max_nodes must be no greater than 4

try:
    # Look the cluster up by name first so an existing one is reused.
    compute_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print(f"Found existing cluster: {cluster_name}, using it.")
except ComputeTargetException:
    # The lookup failed, so provision a new cluster. The VM size follows the
    # stated requirement ("Standard_D2_V2"); the previous value
    # "Standard_DS3_V2" did not match it.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="Standard_D2_V2",
        max_nodes=4,
    )
    compute_cluster = ComputeTarget.create(ws, cluster_name, compute_config)

    # Block until provisioning has finished, streaming progress to the cell.
    compute_cluster.wait_for_completion(show_output=True)

Found existing cluster: udacity-project, using it.


In [8]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Hyperparameter search space: both parameters are drawn from discrete sets.
search_space = {
    'C': choice(0.01, 0.1, 1, 10, 100),
    'max_iter': choice(100, 200, 300),
}
ps = RandomParameterSampling(search_space)

# Early-termination policy for the sweep (Bandit policy).
policy = BanditPolicy(
    evaluation_interval=1,
    slack_factor=0.1,
    delay_evaluation=5,
)

# Create a ./training directory in the working directory if none is listed.
if "training" not in os.listdir():
    os.mkdir("./training")

# Environment for the training run, built from the conda specification file.
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='conda_dependencies.yml',
)

# Script run configuration: train.py from the current directory, executed on
# the compute cluster inside the environment defined above.
src = ScriptRunConfig(
    source_directory='.',
    script='train.py',
    compute_target=compute_cluster,
    environment=sklearn_env,
)

# HyperDrive configuration tying together the script run, the sampler and the
# policy; the sweep maximizes the metric logged under the name 'Accuracy'.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4,
)

In [5]:
# Submit the HyperDrive configuration to the experiment.
hyperdrive_run = exp.submit(hyperdrive_config)

# Render the run-details widget. `show` must be *called*: the bare attribute
# access `RunDetails(...).show` only evaluates to the bound method and
# displays nothing.
RunDetails(hyperdrive_run).show()

# Wait for the sweep to finish while streaming its log output; as the last
# expression of the cell, the returned value is displayed as the cell output.
hyperdrive_run.wait_for_completion(show_output=True)

2024-10-25 10:11:04.909688: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-25 10:11:04.931671: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-25 10:11:04.938260: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-25 10:11:04.954903: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


RunId: HD_9bda6508-79d2-4a4c-b179-1c776e77e141
Web View: https://ml.azure.com/runs/HD_9bda6508-79d2-4a4c-b179-1c776e77e141?wsid=/subscriptions/510b94ba-e453-4417-988b-fbdc37b55ca7/resourcegroups/aml-quickstarts-269070/workspaces/quick-starts-ws-269070&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Streaming azureml-logs/hyperdrive.txt

[2024-10-25T10:11:02.1702315Z][GENERATOR][DEBUG]Sampled 4 jobs from search space 
[2024-10-25T10:11:02.6774517Z][SCHEDULER][INFO]Scheduling job, id='HD_9bda6508-79d2-4a4c-b179-1c776e77e141_0' 
[2024-10-25T10:11:02.9815833Z][SCHEDULER][INFO]Scheduling job, id='HD_9bda6508-79d2-4a4c-b179-1c776e77e141_2' 
[2024-10-25T10:11:02.9826889Z][SCHEDULER][INFO]Scheduling job, id='HD_9bda6508-79d2-4a4c-b179-1c776e77e141_1' 
[2024-10-25T10:11:03.3783771Z][SCHEDULER][INFO]Scheduling job, id='HD_9bda6508-79d2-4a4c-b179-1c776e77e141_3' 
[2024-10-25T10:11:03.6582268Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_9bda6508-79d2-4a4c-b179-1c776e77e141_2' 
[2024-10-25T10:1

{'runId': 'HD_9bda6508-79d2-4a4c-b179-1c776e77e141',
 'target': 'udacity-project',
 'status': 'Completed',
 'startTimeUtc': '2024-10-25T10:11:00.874511Z',
 'endTimeUtc': '2024-10-25T10:16:06.30202Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name":"Accuracy","goal":"maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '73b318b5-ab48-45e9-866d-6447045cad16',
  'user_agent': 'python/3.10.11 (Linux-5.15.0-1073-azure-x86_64-with-glibc2.31) msrest/0.7.1 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.57.0',
  'best_child_run_id': 'HD_9bda6508-79d2-4a4c-b179-1c776e77e141_5',
  'score': '0.9088012139605464',
  'best_metric_status': 'Succeeded',
  'best_data_container_id': 'dcid.HD_9bda6508-79d2-4a4c-b179-1c776e77e141_5'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'configuration': None,
  'attribution': None,
  'telemetryValues': {'amlClientType': 'azureml-sd

In [6]:
# Materialise the child runs of the HyperDrive run into a list so they can be
# counted here and iterated again in later cells.
child_runs = [*hyperdrive_run.get_children()]
child_run_count = len(child_runs)
print(f"Number of child runs: {child_run_count}")

Number of child runs: 15


In [7]:
# Ask the HyperDrive run for its best child run according to the primary
# metric, then print that run's string form.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print(str(best_run))

Run(Experiment: udacity-project,
Id: HD_9bda6508-79d2-4a4c-b179-1c776e77e141_5,
Type: azureml.scriptrun,
Status: Completed)


In [7]:
# Print the logged metrics of every child run.
# The loop variable is deliberately not called `run`: that name already holds
# the interactive run handle returned by exp.start_logging(), and reusing it
# here would silently overwrite that handle.
for child_run in child_runs:
    metrics = child_run.get_metrics()
    print(f"Run ID: {child_run.id}, Metrics: {metrics}")

Run ID: HD_b94a789e-01bf-4c08-89a9-3308cd3aff68_14, Metrics: {'Regularization Strength:': 0.01, 'Max iterations:': 100, 'Accuracy': 0.9084977238239758}
Run ID: HD_b94a789e-01bf-4c08-89a9-3308cd3aff68_13, Metrics: {'Regularization Strength:': 1.0, 'Max iterations:': 200, 'Accuracy': 0.9071320182094081}
Run ID: HD_b94a789e-01bf-4c08-89a9-3308cd3aff68_12, Metrics: {'Regularization Strength:': 0.1, 'Max iterations:': 300, 'Accuracy': 0.9072837632776934}
Run ID: HD_b94a789e-01bf-4c08-89a9-3308cd3aff68_11, Metrics: {'Regularization Strength:': 10.0, 'Max iterations:': 100, 'Accuracy': 0.908649468892261}
Run ID: HD_b94a789e-01bf-4c08-89a9-3308cd3aff68_10, Metrics: {'Regularization Strength:': 100.0, 'Max iterations:': 300, 'Accuracy': 0.9080424886191198}
Run ID: HD_b94a789e-01bf-4c08-89a9-3308cd3aff68_9, Metrics: {'Max iterations:': 200, 'Regularization Strength:': 0.1, 'Accuracy': 0.9071320182094081}
Run ID: HD_b94a789e-01bf-4c08-89a9-3308cd3aff68_8, Metrics: {'Regularization Strength:': 1.0

In [8]:
# List the file names recorded on the best run (the model artifact is
# downloaded from this run in a later cell).
best_run_files = best_run.get_file_names()
print(best_run_files)

['logs/azureml/dataprep/0/backgroundProcess.log', 'logs/azureml/dataprep/0/backgroundProcess_Telemetry.log', 'logs/azureml/dataprep/0/rslex.log.2024-10-25-10', 'outputs/model.pkl', 'system_logs/cs_capability/cs-capability.log', 'system_logs/hosttools_capability/hosttools-capability.log', 'system_logs/lifecycler/execution-wrapper.log', 'system_logs/lifecycler/lifecycler.log', 'system_logs/lifecycler/vm-bootstrapper.log', 'system_logs/metrics_capability/metrics-capability.log', 'system_logs/snapshot_capability/snapshot-capability.log', 'user_logs/std_log.txt']


In [11]:
import joblib
# Fetch the best HyperDrive run, download its model artifact and re-save the
# model locally with joblib.

# Where the model lives inside the run, and where to put the local copy.
model_file_path = "outputs/model.pkl"
local_model_path = "best_model_local.pkl"

best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()

best_run.download_file(
    name=model_file_path,
    output_file_path=local_model_path,
)

# NOTE(review): the captured output of this cell warns that the estimator was
# pickled with a different scikit-learn version than the one loading it;
# confirm the reloaded model behaves as expected before relying on it.
model = joblib.load(local_model_path)
joblib.dump(model, 'best_logistic_regression_model.joblib')

print(f"Best run ID: {best_run.id}")

Best run ID: HD_9bda6508-79d2-4a4c-b179-1c776e77e141_5


Trying to unpickle estimator LogisticRegression from version 0.24.2 when using version 1.5.1. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## AutoML experiment

In [1]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Build a TabularDataset from the bank-marketing training CSV at the URL below.
data_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
dataset = TabularDatasetFactory.from_delimited_files(path=data_url)

# Convert to a pandas DataFrame and preview the first five rows.
df = dataset.to_pandas_dataframe()
df.head(5)

{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,57,technician,married,high.school,no,no,yes,cellular,may,mon,...,1,999,1,failure,-1.8,92.893,-46.2,1.299,5099.1,no
1,55,unknown,married,unknown,unknown,yes,no,telephone,may,thu,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0,no
2,33,blue-collar,married,basic.9y,no,no,no,cellular,may,fri,...,1,999,1,failure,-1.8,92.893,-46.2,1.313,5099.1,no
3,36,admin.,married,high.school,no,no,no,telephone,jun,fri,...,4,999,0,nonexistent,1.4,94.465,-41.8,4.967,5228.1,no
4,27,housemaid,married,high.school,no,yes,no,cellular,jul,fri,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,no


In [2]:
from train import clean_data

# Run the project's clean_data helper on the dataset and unpack its two
# results into x and y.
# NOTE(review): x and y are not referenced by the AutoMLConfig cell visible in
# this notebook, which is given `dataset` directly - confirm that is intended.
cleaned = clean_data(dataset)
x, y = cleaned

{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}


In [4]:
from azureml.train.automl import AutoMLConfig

# Settings for the AutoML experiment.
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes VALUE OR YOUR INSTANCE WILL TIME OUT.
# Running the experiment for longer requires running this notebook in your own
# Azure tenant, which will incur personal costs.
automl_settings = {
    "experiment_timeout_minutes": 30,
    "task": 'classification',
    "primary_metric": 'accuracy',
    "training_data": dataset,
    "label_column_name": 'y',
    "n_cross_validations": 5,
}
automl_config = AutoMLConfig(**automl_settings)

In [9]:
# Submit the AutoML run.
from azureml.core.experiment import Experiment

# Keep the AutoML experiment under its own name. Rebinding `exp` here would
# overwrite the "udacity-project" experiment handle that the HyperDrive
# submission cell uses, so re-running that cell afterwards would submit to
# the wrong experiment.
automl_exp = Experiment(ws, "automl_experiment")

automl_run = automl_exp.submit(config=automl_config, show_output=True)

# Show the run details with the widget.
RunDetails(automl_run).show()

# Wait for the run to complete; as the last expression of the cell, the
# returned value is displayed as the cell output.
automl_run.wait_for_completion(show_output=True)

No run_configuration provided, running on local with default configuration
Running in the active local environment.


2024-10-25 11:22:12.073053: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-25 11:22:12.094986: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-25 11:22:12.101546: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Experiment,Id,Type,Status,Details Page,Docs Page
automl_experiment,AutoML_4434ef4f-7489-4628-813b-b092b9e02baa,automl,Preparing,Link to Azure Machine Learning studio,Link to Documentation


Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one cl



_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

Experiment,Id,Type,Status,Details Page,Docs Page
automl_experiment,AutoML_4434ef4f-7489-4628-813b-b092b9e02baa,automl,Running,Link to Azure Machine Learning studio,Link to Documentation




********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+------------------------------+--------------------------------+--------------------------------------+
|Size of the smallest class    |Name/Label of the smallest class|Number of samples in the training data|
|3692                          |yes                             |32950                                 |
+------------------------------+--------------------------------+--------------------------------------+

********************************************************************

{'runId': 'AutoML_4434ef4f-7489-4628-813b-b092b9e02baa',
 'target': 'local',
 'status': 'Canceled',
 'startTimeUtc': '2024-10-25T11:22:31.32426Z',
 'endTimeUtc': '2024-10-25T11:50:32.833411Z',
 'services': {},
   'message': 'The run was terminated due to an interruption while being executed.'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'local',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"78687534-c001-4ad0-9dce-f9cd964c58b0\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-contrib-notebook": "1.57.0", "azureml-inference-server-http": "1.2.2", "azureml-contrib-server": "1.57.0", "azureml-training-tabular": "1.57.0", "azureml-openda

In [11]:
# Retrieve the best AutoML run together with its model, then write the model
# to disk with joblib.
import joblib

# Destination file for the serialized model.
model_path = 'best_automl_model.joblib'

# get_output() is unpacked into the best run and the corresponding model.
best_run, best_model = automl_run.get_output()
joblib.dump(best_model, model_path)

print(f"Best AutoML model saved to {model_path}")

Best AutoML model saved to best_automl_model.joblib


In [12]:
# Print the metrics logged on `best_run`.
# `best_run` is assigned in several cells of this notebook (the HyperDrive
# best-run cells and the AutoML get_output() cell), so the result depends on
# which of them ran most recently.
metrics = best_run.get_metrics()
print(str(metrics))

{'AUC_macro': 0.9492716947466121, 'f1_score_micro': 0.915113808801214, 'weighted_accuracy': 0.9557291961613894, 'average_precision_score_micro': 0.9819938406559887, 'precision_score_micro': 0.915113808801214, 'AUC_weighted': 0.9492716947466123, 'recall_score_weighted': 0.915113808801214, 'accuracy': 0.915113808801214, 'precision_score_macro': 0.7942279733207941, 'recall_score_macro': 0.7515810660689499, 'AUC_micro': 0.9811814332195054, 'recall_score_micro': 0.915113808801214, 'f1_score_weighted': 0.911845324880639, 'norm_macro_recall': 0.5031621321379, 'precision_score_weighted': 0.9098514819787763, 'average_precision_score_macro': 0.8276864332531633, 'matthews_correlation': 0.544034395652931, 'balanced_accuracy': 0.7515810660689499, 'log_loss': 0.17206143799365556, 'f1_score_macro': 0.7704146144552512, 'average_precision_score_weighted': 0.956331169650299, 'confusion_matrix': 'aml://artifactId/ExperimentRun/dcid.AutoML_4434ef4f-7489-4628-813b-b092b9e02baa_0/confusion_matrix', 'accurac

In [13]:
# Delete the compute cluster once all experiments are finished.
cluster_name = "udacity-project"

# Look the cluster up again by name so this cell does not depend on the
# `compute_cluster` variable still being bound from the provisioning cell.
compute_cluster = ComputeTarget(workspace=ws, name=cluster_name)
compute_cluster.delete()

# Tell the wait that it is tracking a delete operation rather than a
# provisioning one.
compute_cluster.wait_for_completion(show_output=True, is_delete_operation=True)


Deleting