# Optimize Machine Learning - Bank Marketing Predictor

## Training Setup

### Initialize Workspace

In [1]:
from azureml.core import Workspace

# Obtain the workspace handle via Workspace.from_config() and echo its identity.
ws = Workspace.from_config()

# Assemble the banner line by line; the trailing empty entry keeps the blank
# line that follows the subscription ID.
workspace_summary = "\n".join([
    "Azure Machine Learning Workspace Loaded Successfully",
    "-----------------------------------------------------------",
    f"Workspace Name     : {ws.name}",
    f"Resource Group     : {ws.resource_group}",
    f"Location           : {ws.location}",
    f"Subscription ID    : {ws.subscription_id}",
    "",
])
print(workspace_summary)

Azure Machine Learning Workspace Loaded Successfully
-----------------------------------------------------------
Workspace Name     : optimize-machine-learning-ws
Resource Group     : optimize-machine-learning
Location           : norwayeast
Subscription ID    : 48778e11-0fc7-4fc8-a16c-304a430e61a4



### Initialize Experiment

In [2]:
from azureml.core import Experiment

# Experiment under which the pipeline run of this notebook is submitted.
experiment_name = "bank-deposit-predictor"
experiment = Experiment(workspace=ws, name=experiment_name)

# Folder later handed to AutoMLConfig as its `path` argument.
project_folder = "./bank-deposit-predictor"

# Report what was resolved, reading the values back from the experiment object.
experiment_banner = "\n".join([
    "Azure ML Experiment Initialized Successfully",
    "--------------------------------------------------",
    f"Experiment Name    : {experiment.name}",
    f"Workspace          : {experiment.workspace.name}",
    f"Project Folder     : {project_folder}",
    "",
])
print(experiment_banner)


Azure ML Experiment Initialized Successfully
--------------------------------------------------
Experiment Name    : bank-deposit-predictor
Workspace          : optimize-machine-learning-ws
Project Folder     : ./bank-deposit-predictor



### Handling the Compute Cluster

In [3]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# -------------------------------
# Compute Cluster Settings
# -------------------------------
compute_name = "training-cluster"
vm_size = "Standard_D3_v2"
compute_min_nodes, compute_max_nodes = 0, 5

# Provisioning request; it is only consumed when the cluster has to be created.
compute_config = AmlCompute.provisioning_configuration(
    vm_size=vm_size,
    min_nodes=compute_min_nodes,
    max_nodes=compute_max_nodes,
    idle_seconds_before_scaledown=600,  # 600 s idle = 10 minutes before scale-down
)

# -------------------------------
# Attach to the Cluster, or Create It
# -------------------------------
try:
    # Looking the target up by name raises ComputeTargetException when it is absent.
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
    print(f"✔️ Compute cluster '{compute_name}' found in the workspace.")
    print(f"   - VM Size         : {compute_target.vm_size}")
    scale = compute_target.scale_settings
    print(f"   - Minimum Nodes   : {scale.minimum_node_count}")
    print(f"   - Maximum Nodes   : {scale.maximum_node_count}")
    # Stored in seconds; shown in whole minutes.
    idle_minutes = scale.idle_seconds_before_scaledown // 60
    print(f"   - Auto-Scale Down : {idle_minutes} minutes")
except ComputeTargetException:
    # Not found: provision a new cluster and wait until provisioning completes.
    print(f"🚀 Creating a new compute cluster: '{compute_name}'")
    compute_target = ComputeTarget.create(ws, compute_name, compute_config)
    compute_target.wait_for_completion(show_output=True)
    print(f"✅ Compute cluster '{compute_name}' has been successfully created.")

print(f"🔹 Compute cluster '{compute_name}' is ready for use.")



✔️ Compute cluster 'training-cluster' found in the workspace.
   - VM Size         : Standard_D3_v2
   - Minimum Nodes   : 0
   - Maximum Nodes   : 5
   - Auto-Scale Down : 10 minutes
🔹 Compute cluster 'training-cluster' is ready for use.


### Importing the Dataset

In [4]:
from azureml.core import Dataset

# -------------------------------
# Dataset Settings
# -------------------------------
dataset_name = "bank-marketing-dataset"
dataset_description = "Bank Marketing Dataset for predictive modeling."
dataset_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

# -------------------------------
# Register the Dataset Unless It Already Exists
# -------------------------------
if dataset_name not in ws.datasets:
    print(f"📂 Dataset '{dataset_name}' not found. Registering a new dataset...")

    # Build a tabular dataset from the remote CSV ...
    dataset = Dataset.Tabular.from_delimited_files(dataset_url)

    # ... and register it in the workspace under `dataset_name`.
    dataset = dataset.register(
        workspace=ws,
        name=dataset_name,
        description=dataset_description,
    )

    print(f"✅ Dataset '{dataset_name}' has been successfully registered.")
else:
    dataset = ws.datasets[dataset_name]
    print(f"✔️ Found existing dataset: '{dataset_name}' in the workspace.")

# -------------------------------
# Materialise as a pandas DataFrame
# -------------------------------
df = dataset.to_pandas_dataframe()
row_count, column_count = df.shape
print(f"📊 Dataset '{dataset_name}' loaded successfully with {row_count} rows and {column_count} columns.")
df.head()


✔️ Found existing dataset: 'bank-marketing-dataset' in the workspace.
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}
📊 Dataset 'bank-marketing-dataset' loaded successfully with 32950 rows and 21 columns.


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,57,technician,married,high.school,no,no,yes,cellular,may,mon,...,1,999,1,failure,-1.8,92.893,-46.2,1.299,5099.1,no
1,55,unknown,married,unknown,unknown,yes,no,telephone,may,thu,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0,no
2,33,blue-collar,married,basic.9y,no,no,no,cellular,may,fri,...,1,999,1,failure,-1.8,92.893,-46.2,1.313,5099.1,no
3,36,admin.,married,high.school,no,no,no,telephone,jun,fri,...,4,999,0,nonexistent,1.4,94.465,-41.8,4.967,5228.1,no
4,27,housemaid,married,high.school,no,yes,no,cellular,jul,fri,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,no


### Retrieving Summary Statistics on Data  

In [5]:
# Statistics kept from describe(); the `count` entry is intentionally left out.
stat_columns = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']

# One row per described column, one column per statistic, rounded to two decimals.
summary_stats = df.describe().T.loc[:, stat_columns].round(2)

# Plain-text rendering of the full table under a title and an 80-character rule.
print("\n📊 Summary Statistics Table")
print("=" * 80)
print(summary_stats.to_string())




📊 Summary Statistics Table
                   mean     std      min      25%      50%      75%      max
age               40.04   10.43    17.00    32.00    38.00    47.00    98.00
duration         257.34  257.33     0.00   102.00   179.00   318.00  4918.00
campaign           2.56    2.76     1.00     1.00     2.00     3.00    56.00
pdays            962.17  187.65     0.00   999.00   999.00   999.00   999.00
previous           0.17    0.50     0.00     0.00     0.00     0.00     7.00
emp.var.rate       0.08    1.57    -3.40    -1.80     1.10     1.40     1.40
cons.price.idx    93.57    0.58    92.20    93.08    93.75    93.99    94.77
cons.conf.idx    -40.52    4.62   -50.80   -42.70   -41.80   -36.40   -26.90
euribor3m          3.62    1.74     0.63     1.34     4.86     4.96     5.04
nr.employed     5166.86   72.21  4963.60  5099.10  5191.00  5228.10  5228.10


### Setting up Training Configuration

In [6]:
from azureml.train.automl import AutoMLConfig

# Run limits and evaluation settings, unpacked into AutoMLConfig below.
# NOTE(review): accuracy can look good on a skewed label distribution —
# confirm it is the intended optimisation target for the `y` column.
automl_settings = dict(
    experiment_timeout_minutes=20,  # overall time budget, in minutes
    max_concurrent_iterations=5,    # number of concurrent iterations
    primary_metric="accuracy",      # metric handed to AutoML as the primary metric
    n_cross_validations=5,          # number of cross-validations
)

# Classification task on the registered dataset with `y` as the label column,
# executed on the compute cluster. No explainability argument is passed
# explicitly here, so that behaviour is whatever AutoMLConfig defaults to.
classification_config = AutoMLConfig(
    task="classification",
    training_data=dataset,
    label_column_name="y",
    compute_target=compute_target,
    path=project_folder,
    featurization="auto",
    enable_early_stopping=True,
    debug_log="automl_errors.log",
    **automl_settings,
)


### Setting Up Information Containers

In [7]:
from azureml.pipeline.core import PipelineData, TrainingOutput

# Datastore that backs both pipeline outputs.
default_datastore = ws.get_default_datastore()

# Names under which the outputs are later fetched from the finished run.
METRICS_OUTPUT_NAME = "metrics_output"
MODEL_OUTPUT_NAME = "best_model_output"


def _training_output_data(name, output_name, output_type):
    """Create a PipelineData on the default datastore tied to a training output of `output_type`."""
    return PipelineData(
        name=name,
        datastore=default_datastore,
        pipeline_output_name=output_name,
        training_output=TrainingOutput(type=output_type),
    )


# Training metrics and the trained model, each exposed as a named pipeline output.
metrics_data = _training_output_data("metrics_data", METRICS_OUTPUT_NAME, "Metrics")
model_data = _training_output_data("model_data", MODEL_OUTPUT_NAME, "Model")

# Confirmation lines
for status_line in (
    "✅ Pipeline outputs initialized successfully.",
    f"📊 Metrics Output: {METRICS_OUTPUT_NAME}",
    f"📦 Model Output: {MODEL_OUTPUT_NAME}",
):
    print(status_line)


✅ Pipeline outputs initialized successfully.
📊 Metrics Output: metrics_output
📦 Model Output: best_model_output


### Creating the AutoML Pipeline

In [8]:
from azureml.pipeline.steps import AutoMLStep
from azureml.pipeline.core import Pipeline

# Wrap the AutoML configuration in a pipeline step. The two PipelineData
# objects receive the training metrics and the model produced by the step.
automl_step = AutoMLStep(
    name="AutoML_Classification",
    automl_config=classification_config,
    outputs=[metrics_data, model_data],  # Capture training metrics and model output
    allow_reuse=True  # Enables step reuse to optimize execution time
)

# Single-step pipeline bound to the workspace.
# The description is user-visible metadata; its spelling was corrected
# ("classiying" -> "classifying").
pipeline = Pipeline(
    workspace=ws,
    steps=[automl_step],
    description="Automated ML pipeline for classifying banking conversion rates"
)

# Confirmation message
print("✅ AutoML pipeline successfully created.")


✅ AutoML pipeline successfully created.


## Training the Model

### Submit the AutoML Job

In [9]:
!sudo -i pip install azureml-widgets
!sudo -i jupyter labextension install @jupyter-widgets/jupyterlab-manager
!jupyter labextension list

Traceback (most recent call last):
  File "/usr/local/bin/jupyter-labextension", line 5, in <module>
    from jupyterlab.labextensions import main
  File "/usr/local/lib/python3.8/dist-packages/jupyterlab/__init__.py", line 8, in <module>
    from .handlers.announcements import (
  File "/usr/local/lib/python3.8/dist-packages/jupyterlab/handlers/announcements.py", line 14, in <module>
    from jupyter_server.base.handlers import APIHandler
  File "/usr/local/lib/python3.8/dist-packages/jupyter_server/base/handlers.py", line 22, in <module>
    from jinja2 import TemplateNotFound
  File "/usr/lib/python3/dist-packages/jinja2/__init__.py", line 33, in <module>
    from jinja2.environment import Environment, Template
  File "/usr/lib/python3/dist-packages/jinja2/environment.py", line 15, in <module>
    from jinja2 import nodes
  File "/usr/lib/python3/dist-packages/jinja2/nodes.py", line 23, in <module>
    from jinja2.utils import Markup
  File "/usr/lib/python3/dist-packages/jinja2/uti

In [10]:
from azureml.widgets import RunDetails

# Submit the pipeline to the experiment, show the live widget, and block until
# the run finishes. `run` is the handle the result cells below read from.
try:
    run = experiment.submit(pipeline)
    print(f"✅ Experiment \"{experiment_name}\" successfully submitted. Run ID: {run.id}")

    # Show live RunDetails widget
    RunDetails(run).show()

    # Monitor run progress with live logs
    print("🔄 Waiting for pipeline execution to complete...")
    run.wait_for_completion(show_output=True)

except Exception as e:
    # Report the failure, then re-raise: swallowing it would let the following
    # cells continue against a missing or unfinished `run` and fail with
    # unrelated, harder-to-read errors.
    print(f"⚠️ Error: {str(e)}")
    raise

Created step AutoML_Classification [b51661e4][8a657e52-d718-4591-8bd0-ec21d24eb377], (This step will run and generate new outputs)
Submitted PipelineRun 29f08177-7f35-4bc5-b42e-e4d879ee79d6
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/29f08177-7f35-4bc5-b42e-e4d879ee79d6?wsid=/subscriptions/48778e11-0fc7-4fc8-a16c-304a430e61a4/resourcegroups/optimize-machine-learning/workspaces/optimize-machine-learning-ws&tid=f804f881-90d8-4e7d-8309-7fb565eaaf2c
✅ Experiment "bank-deposit-predictor" successfully submitted. Run ID: 29f08177-7f35-4bc5-b42e-e4d879ee79d6


2025-03-14 16:28:20.781590: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-14 16:28:20.806627: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-14 16:28:20.813814: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-14 16:28:20.833226: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

🔄 Waiting for pipeline execution to complete...
PipelineRunId: 29f08177-7f35-4bc5-b42e-e4d879ee79d6
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/29f08177-7f35-4bc5-b42e-e4d879ee79d6?wsid=/subscriptions/48778e11-0fc7-4fc8-a16c-304a430e61a4/resourcegroups/optimize-machine-learning/workspaces/optimize-machine-learning-ws&tid=f804f881-90d8-4e7d-8309-7fb565eaaf2c
PipelineRun Status: Running


StepRunId: 8ca913fe-66d9-4e75-8f6c-01c13c24ddf7
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/8ca913fe-66d9-4e75-8f6c-01c13c24ddf7?wsid=/subscriptions/48778e11-0fc7-4fc8-a16c-304a430e61a4/resourcegroups/optimize-machine-learning/workspaces/optimize-machine-learning-ws&tid=f804f881-90d8-4e7d-8309-7fb565eaaf2c
StepRun( AutoML_Classification ) Status: NotStarted


ERROR:azureml.data._dataset_client:[NOT_SUPPORTED_API_USE_ATTEMPT] The [_DatasetClient.get] API has been deprecated and is no longer supported
ERROR:azureml.data._dataset_client:[NOT_SUPPORTED_API_USE_ATTEMPT] The [_DatasetClient.get] API has been deprecated and is no longer supported
ERROR:azureml.data._dataset_client:[NOT_SUPPORTED_API_USE_ATTEMPT] The [_DatasetClient.get] API has been deprecated and is no longer supported
ERROR:azureml.data._dataset_client:[NOT_SUPPORTED_API_USE_ATTEMPT] The [_DatasetClient.get] API has been deprecated and is no longer supported


## Viewing the Results

### Retrieving the Metrics of All Child Runs

In [None]:
import json
import pandas as pd

# Download the metrics output of the pipeline run and load it into a DataFrame.
# The result is stored in `metrics_df` rather than `df`, so the raw dataset
# frame loaded earlier in the notebook is not overwritten and earlier cells
# stay re-runnable.
try:
    metrics_output = run.get_pipeline_output(METRICS_OUTPUT_NAME)
    num_files_downloaded = metrics_output.download('.', show_progress=True)

    print(f"✅ Successfully downloaded {num_files_downloaded} file(s) from pipeline output.")

    # Load and deserialize metrics data.
    # NOTE(review): `_path_on_datastore` is a private attribute; the code relies on
    # the download to '.' reproducing that relative path locally — confirm.
    with open(metrics_output._path_on_datastore, "r", encoding="utf-8") as f:
        metrics_output_result = f.read()

    deserialized_metrics_output = json.loads(metrics_output_result)
    metrics_df = pd.DataFrame(deserialized_metrics_output)

    # Display the DataFrame
    print(metrics_df)

except Exception as e:
    # Best-effort: report the problem without aborting the notebook.
    print(f"⚠️ Error retrieving pipeline output: {str(e)}")


### Retrieve the Best Model

In [None]:
import pickle

# Download the model output of the pipeline run, unpickle it, and list its steps.
# Any failure along the way is reported by the handler at the bottom.
try:
    best_model_output = run.get_pipeline_output(MODEL_OUTPUT_NAME)
    num_files_downloaded = best_model_output.download('.', show_progress=True)

    print(f"✅ Successfully downloaded {num_files_downloaded} file(s) from the pipeline output.")

    # Path of the model file as recorded on the output object (private attribute).
    model_path = best_model_output._path_on_datastore

    if not model_path:
        raise ValueError("⚠️ Error: Model output file path is empty or invalid.")

    # NOTE(review): unpickling executes code embedded in the file; only do this
    # for artifacts produced by a run you trust.
    with open(model_path, "rb") as model_file:
        best_model = pickle.load(model_file)

    print("✅ Model successfully loaded.")

    # Show the steps when the loaded object exposes them; warn otherwise.
    if not hasattr(best_model, "steps"):
        print("⚠️ Warning: The loaded model does not have a `steps` attribute.")
    else:
        print("🔹 Model Steps:")
        for pipeline_step in best_model.steps:
            print(f"  - {pipeline_step}")

except Exception as err:
    print(f"⚠️ Error retrieving or loading the model: {str(err)}")


## Setting Up the REST API for the Pipeline

### Publish the Pipeline

In [None]:
# Metadata attached to the published pipeline.
publish_metadata = {
    "name": "Bank Marketing Predictor Pipeline",
    "description": "This pipeline is used to train an ML algorithm to recommend a bank savings account.",
    "version": "1.0",
}

# Publish the pipeline; the returned object exposes the ID printed below.
published_pipeline = pipeline.publish(**publish_metadata)

print(f"✅ Pipeline successfully published. Pipeline ID: {published_pipeline.id}")

### Authenticate and Retrieve the Authorization Header

In [None]:
from azureml.core.authentication import InteractiveLoginAuthentication

# Create an interactive-login authentication object and ask it for the HTTP
# header(s) used to authorize the REST request sent in the next step.
# NOTE(review): the header presumably carries a bearer token — avoid printing
# or committing `auth_header` in notebook output.
interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()


### Trigger the Pipeline Execution via REST API

In [None]:
import requests

# REST endpoint of the published pipeline.
rest_endpoint = published_pipeline.endpoint

# Trigger a run under the "pipeline-rest-endpoint" experiment.
# A timeout is set because `requests` otherwise waits indefinitely for a
# response, which would hang the notebook cell on a stalled connection.
response = requests.post(
    rest_endpoint,
    headers=auth_header,
    json={"ExperimentName": "pipeline-rest-endpoint"},
    timeout=60,  # seconds
)


### Handle API Response and Extract Run ID

In [None]:
# Fail loudly on an HTTP error status, including the request context in the message.
try:
    response.raise_for_status()
except Exception as err:
    # Chain the original error so its details stay in the traceback.
    raise Exception(
        "Received an error response from the endpoint:\n"
        f"🔹 Endpoint: {rest_endpoint}\n"
        f"🔹 Response Code: {response.status_code}\n"
        f"🔹 Headers: {response.headers}\n"
        f"🔹 Content: {response.content}"
    ) from err

# The run ID is read from the 'Id' field of the JSON body. A missing or empty
# value is treated as an error instead of being reported as a successful
# submission with "Run ID: None".
run_id = response.json().get('Id')
if not run_id:
    raise ValueError(
        f"Endpoint response did not contain a run 'Id'. Content: {response.content}"
    )
print(f"✅ Pipeline run successfully submitted. Run ID: {run_id}")

### Monitor the Pipeline Execution

In [None]:
from azureml.pipeline.core.run import PipelineRun
from azureml.widgets import RunDetails

# Look up the experiment named in the REST request, then attach to the run by its ID.
rest_experiment = ws.experiments["pipeline-rest-endpoint"]
published_pipeline_run = PipelineRun(rest_experiment, run_id)

# Render the monitoring widget for the REST-triggered run.
RunDetails(published_pipeline_run).show()
