# Bank Marketing Efficiency

## Set up Workspace / Compute / Environment

In [1]:
from azureml.core import Workspace, Experiment

# Connect to the workspace described by the local config file and open the
# experiment that the HyperDrive runs are submitted to.
ws = Workspace.from_config()
exp = Experiment(name="bankmarketing_hyperdrive", workspace=ws)

# One summary line per workspace attribute, printed on separate lines.
workspace_summary = (
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
)
print(*workspace_summary, sep='\n')

# Interactive logging via exp.start_logging() is left disabled in this notebook.

Workspace name: quick-starts-ws-270850
Azure region: eastus2
Subscription id: a0a76bad-11a1-4a2d-9887-97a29122c8ed
Resource group: aml-quickstarts-270850


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "anph-aml-compute"
cluster_sku = "Standard_D2_V2"
cluster_max_nodes = 4

# Reuse the compute cluster when it already exists in the workspace; otherwise
# provision it. Constraints for this lab: vm_size must be "Standard_D2_V2" and
# max_nodes must be no greater than 4.
try:
    aml_compute = ComputeTarget(workspace=ws, name=cluster_name)
    print(f"{cluster_name} exists, using it...")

except ComputeTargetException:
    # Only a failed lookup of the compute target should trigger provisioning.
    # A bare `except:` would also swallow unrelated failures (and even
    # KeyboardInterrupt) and then try to create a cluster.
    print(f"{cluster_name} doesn't exists, creating it...")

    compute_config = AmlCompute.provisioning_configuration(vm_size=cluster_sku, max_nodes=cluster_max_nodes)

    aml_compute = ComputeTarget.create(workspace=ws, name=cluster_name, provisioning_configuration=compute_config)
    # Block until provisioning finishes so later cells can submit to the cluster.
    aml_compute.wait_for_completion(show_output=True)


anph-aml-compute doesn't exists, creating it...
InProgress..
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
from azureml.core import Environment

# Get-or-register the training environment.
#
# If an environment named `env_name` is already registered in the workspace it
# is fetched from the workspace; otherwise it is built from the local conda
# specification file and registered.

env_name = 'sklearn-env'

# Names of the environments known to the workspace.
envs = list(ws.environments)

if env_name in envs:
    print("Using existing environment")
    # Fetch the registered definition. `Environment(name=env_name)` would only
    # create a new, empty local object without the registered dependencies.
    sklearn_env = Environment.get(workspace=ws, name=env_name)

else:
    print("Registering new environment")
    sklearn_env = Environment.from_conda_specification(name=env_name, file_path='conda_dependencies.yml')
    # Keep the object returned by register() so the printed environment
    # reflects what was registered (e.g. its version) rather than the local draft.
    sklearn_env = sklearn_env.register(workspace=ws)

print(sklearn_env)

Registering new environment
Environment(Name: sklearn-env,
Version: None)


## Using HyperDrive

In [4]:
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive import HyperDriveRun
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import ScriptRunConfig
import os

# Search space: random sampling over discrete choices for the two
# hyperparameters exposed by train.py.
search_space = {
    "C": choice(0.01, 0.1, 1, 10, 100),
    "max_iter": choice(100, 200, 300, 400, 500),
}
ps = RandomParameterSampling(search_space)

# Early-termination policy.
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

# Training job definition: script, base arguments, compute target and environment.
src = ScriptRunConfig(
    source_directory='./',
    script='train.py',
    arguments=['--C', 1.0, '--max_iter', 100],
    compute_target=aml_compute,
    environment=sklearn_env,
)

# HyperDrive sweep: maximise 'accuracy' over at most 10 runs, all of which may
# be scheduled concurrently.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=10,
    max_concurrent_runs=10,
)

In [5]:
from azureml.widgets import RunDetails

# Launch the HyperDrive sweep in the experiment, attach the progress widget,
# then block until the sweep has finished while streaming its output.
hd_run = exp.submit(config=hyperdrive_config)

run_widget = RunDetails(hd_run)
run_widget.show()

hd_run.wait_for_completion(show_output=True)

2024-12-20 21:15:48.712308: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-20 21:15:50.369975: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-20 21:15:50.857515: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-20 21:15:54.790975: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_cad81c08-e34c-42ca-96e5-fc4d490db662
Web View: https://ml.azure.com/runs/HD_cad81c08-e34c-42ca-96e5-fc4d490db662?wsid=/subscriptions/a0a76bad-11a1-4a2d-9887-97a29122c8ed/resourcegroups/aml-quickstarts-270850/workspaces/quick-starts-ws-270850&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Streaming azureml-logs/hyperdrive.txt

[2024-12-20T21:14:48.3974585Z][GENERATOR][DEBUG]Sampled 10 jobs from search space 
[2024-12-20T21:14:48.9901519Z][SCHEDULER][INFO]Scheduling job, id='HD_cad81c08-e34c-42ca-96e5-fc4d490db662_2' 
[2024-12-20T21:14:48.8698793Z][SCHEDULER][INFO]Scheduling job, id='HD_cad81c08-e34c-42ca-96e5-fc4d490db662_0' 
[2024-12-20T21:14:48.9897089Z][SCHEDULER][INFO]Scheduling job, id='HD_cad81c08-e34c-42ca-96e5-fc4d490db662_5' 
[2024-12-20T21:14:48.9921608Z][SCHEDULER][INFO]Scheduling job, id='HD_cad81c08-e34c-42ca-96e5-fc4d490db662_4' 
[2024-12-20T21:14:48.8689701Z][SCHEDULER][INFO]Scheduling job, id='HD_cad81c08-e34c-42ca-96e5-fc4d490db662_1' 
[2024-12-20T21:14:48.9911674Z

In [7]:
# Plain-text summary of the HyperDrive run, useful when the RunDetails widget
# is not rendered (NOTE(review): the widget may only be available in a Jupyter
# front end - unconfirmed).

print(f"Display Name: {hd_run.display_name}")
print(f"Id: {hd_run.id}")
# Label fixed: this line reports the run status, not a second id.
print(f"Status: {hd_run.status}")

Display Name: kind_steelpan_7z2xydy7
Id: HD_cad81c08-e34c-42ca-96e5-fc4d490db662
Id: Completed


In [12]:
import pandas as pd
from azureml.core import Run

# Metrics of the sweep, one entry per child run.
run_metrics = hd_run.get_metrics()

# One row per child run, best accuracy first, with a fresh 0..n-1 index.
pdf = (
    pd.DataFrame.from_dict(run_metrics, orient='index')
    .rename_axis('Job Id')
    .reset_index()
    .sort_values(by='accuracy', ascending=False)
    .reset_index(drop=True)
)

# The first row after sorting is this table's best run; look up its display name.
pdf_best_run_id = pdf['Job Id'].iloc[0]
top_ranked_run = Run(experiment=exp, run_id=pdf_best_run_id)
best_run_display_name = top_ranked_run.display_name

display(pdf)
print(f"\nBest Run: {best_run_display_name} ({pdf_best_run_id})")

Unnamed: 0,Job Id,Regularization Strength,Max iterations,accuracy
0,HD_cad81c08-e34c-42ca-96e5-fc4d490db662_5,0.1,100,0.908801
1,HD_cad81c08-e34c-42ca-96e5-fc4d490db662_9,1.0,300,0.908801
2,HD_cad81c08-e34c-42ca-96e5-fc4d490db662_6,0.01,100,0.908649
3,HD_cad81c08-e34c-42ca-96e5-fc4d490db662_8,100.0,500,0.908194
4,HD_cad81c08-e34c-42ca-96e5-fc4d490db662_3,100.0,300,0.908194
5,HD_cad81c08-e34c-42ca-96e5-fc4d490db662_2,10.0,400,0.908042
6,HD_cad81c08-e34c-42ca-96e5-fc4d490db662_4,1.0,500,0.908042
7,HD_cad81c08-e34c-42ca-96e5-fc4d490db662_7,100.0,400,0.907739
8,HD_cad81c08-e34c-42ca-96e5-fc4d490db662_0,0.01,200,0.907436
9,HD_cad81c08-e34c-42ca-96e5-fc4d490db662_1,0.01,400,0.907132



Best Run: ivory_yogurt_0b3yt75c (HD_cad81c08-e34c-42ca-96e5-fc4d490db662_5)


In [14]:
# Ask HyperDrive for the child run that scored best on the primary metric and
# inspect its logged metrics and the files it produced.
best_run = hd_run.get_best_run_by_primary_metric()

best_run_metrics, best_run_files = best_run.get_metrics(), best_run.get_file_names()

print(best_run.display_name, best_run_metrics, sep="\n")

# A check that this run is also the first row of the pandas ranking is left
# disabled; the recorded outputs show the two selections can differ.
#assert pdf_best_run_id == best_run.id

print(f"\nFiles from best run: \n{best_run_files}")

calm_evening_dw7n0jr3
{'Regularization Strength': 1.0, 'Max iterations': 300, 'accuracy': 0.9088012139605463}

Files from best run: 
['azureml-logs/20_image_build_log.txt', 'logs/azureml/dataprep/0/backgroundProcess.log', 'logs/azureml/dataprep/0/backgroundProcess_Telemetry.log', 'logs/azureml/dataprep/0/rslex.log.2024-12-20-21', 'outputs/model.pkl', 'system_logs/cs_capability/cs-capability.log', 'system_logs/hosttools_capability/hosttools-capability.log', 'system_logs/lifecycler/execution-wrapper.log', 'system_logs/lifecycler/lifecycler.log', 'system_logs/metrics_capability/metrics-capability.log', 'system_logs/snapshot_capability/snapshot-capability.log', 'user_logs/std_log.txt']


In [15]:
# Register the best HyperDrive model, recording its hyperparameters and
# accuracy as model properties.
hd_model_tags = {'project': 'classification'}

hd_model_properties = {
    'framework': 'scikit-learn',
    'regularization strength': best_run_metrics['Regularization Strength'],
    'max_iter': best_run_metrics['Max iterations'],
    'accuracy': best_run_metrics['accuracy'],
}

best_run.register_model(
    model_name='bank-marketing-effectiveness',
    model_path='outputs/model.pkl',
    tags=hd_model_tags,
    properties=hd_model_properties,
)

Model(workspace=Workspace.create(name='quick-starts-ws-270850', subscription_id='a0a76bad-11a1-4a2d-9887-97a29122c8ed', resource_group='aml-quickstarts-270850'), name=bank-marketing-effectiveness, id=bank-marketing-effectiveness:1, version=1, tags={'project': 'classification'}, properties={'framework': 'scikit-learn', 'regularization strength': '1.0', 'max_iter': '300', 'accuracy': '0.9088012139605463'})

## Using AutoML

In [17]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Location of the bank-marketing training CSV.
bankmarketing_data_uri = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

# Build a TabularDataset from the delimited file at that URI.
ds = TabularDatasetFactory.from_delimited_files(bankmarketing_data_uri)
    

In [18]:
from train import clean_data
from sklearn.model_selection import train_test_split

# Clean the dataset with the clean_data helper imported from train.py. It
# returns two objects, unpacked here as x and y; y is later attached to x as
# the 'y' column that AutoML uses as its label.
x, y = clean_data(ds)

# A manual train/test split is left disabled here:
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}


In [19]:
# Recombine the features and the label into one frame. `assign` returns a new
# DataFrame, so `x` itself is not modified (a plain `cleaned_pdf = x` would
# alias `x` and add the 'y' column to it as well).
cleaned_pdf = x.assign(y=y)

# Rows whose label is 1. NOTE(review): these are positive-class rows rather
# than model "true positives"; the variable name is kept unchanged.
true_positives_pdf = cleaned_pdf[cleaned_pdf['y'] == 1]

display(true_positives_pdf)

# Sanity check: the cleaned data contains some positives, but not only positives.
# Written as a chained comparison; the previous `a > b & b > 0` form used the
# bitwise operator, which binds tighter than the comparisons.
assert 0 < len(true_positives_pdf) < len(cleaned_pdf)

Unnamed: 0,age,marital,default,housing,loan,month,day_of_week,duration,campaign,pdays,...,contact_telephone,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,y
12,43,0,0,1,0,7,4,982,1,999,...,0,0,0,1,0,0,0,0,0,1
13,56,1,0,0,0,8,3,552,3,999,...,0,0,0,0,0,0,0,1,0,1
19,46,1,0,1,0,8,4,399,1,999,...,0,0,0,0,0,0,0,1,0,1
20,59,1,0,0,0,7,5,1330,3,999,...,0,0,0,1,0,0,0,0,0,1
22,26,0,0,1,1,6,3,524,15,999,...,1,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32902,26,0,0,1,0,7,5,806,3,6,...,0,0,0,0,0,0,0,1,0,1
32923,36,1,0,0,1,6,1,284,1,999,...,0,0,0,0,0,0,0,1,0,1
32928,43,1,0,0,0,9,2,293,1,999,...,0,0,0,0,0,0,1,0,0,1
32936,46,1,0,1,0,4,4,231,1,999,...,0,0,0,0,1,0,0,0,0,1


In [20]:
from azureml.core import Datastore
from azureml.core import Dataset

# Default datastore of the workspace; the cleaned frame is uploaded there.
datastore = Datastore.get_default(ws)

# Replace '.' in column names (e.g. 'education_basic.4y') with '_' before
# registering. regex=False makes the '.' a literal character on every pandas
# version; with regex matching enabled '.' would match any character.
cleaned_pdf.columns = cleaned_pdf.columns.str.replace('.', '_', regex=False)

# Upload the frame and register it as a tabular dataset for AutoML.
registered_dataset = Dataset.Tabular.register_pandas_dataframe(cleaned_pdf, name="bankmarketing-cleaned", target=datastore)

Validating arguments.
Arguments validated.
Validating arguments.
Arguments validated.
'overwrite' is set to True. Any file already present in the target will be overwritten.
Uploading files from '/tmp/tmp9j0ucsvy' to 'managed-dataset/f552c94a-7152-4b7e-8729-38f96692ea8f/'
Copying 1 files with concurrency set to 1
Copied /tmp/tmp9j0ucsvy/dataframe.parquet, file 1 out of 1. Destination path: https://mlstrg270850.blob.core.windows.net/azureml-blobstore-12acd1e3-b7fb-4192-a3da-99293ff3a6f8/managed-dataset/f552c94a-7152-4b7e-8729-38f96692ea8f/dataframe.parquet
Files copied=1, skipped=0, failed=0
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [23]:
from azureml.train.automl import AutoMLConfig

# Make sure the AutoML project folder exists; this is a no-op when the
# directory is already present.
os.makedirs("./training", exist_ok=True)

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    iterations=20,
    primary_metric='accuracy',
    compute_target=aml_compute,
    # Registered, cleaned dataset; the 'y' column is the label.
    training_data=registered_dataset,
    label_column_name='y',
    n_cross_validations=2,
    test_size=0.2,
    path='./training'
    )

In [24]:
# Run AutoML under its own experiment and stream progress while it executes.
automl_exp = Experiment(name="bankmarketing_automl", workspace=ws)
automl_run = automl_exp.submit(config=automl_config, show_output=True)


Submitting remote run.
No run_configuration provided, running on anph-aml-compute with default configuration
Running on remote compute: anph-aml-compute


Experiment,Id,Type,Status,Details Page,Docs Page
bankmarketing_automl,AutoML_aa557fb2-c6eb-45da-a52a-bdb1767f2fb2,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+------------------------------+--------------------------------+-------------------------------------

In [40]:
# Fetch the best AutoML child run together with its fitted model, then report
# the run id, the files it produced and its metrics.
automl_best_run, automl_fitted_model = automl_run.get_output()

automl_best_run_metrics = automl_best_run.get_metrics()

automl_best_run_id = automl_best_run.id
print(f"Best run Id: {automl_best_run_id}")

automl_best_run_files = automl_best_run.get_file_names()
print(f"Best run files: \n{automl_best_run_files}\n\n")

print(f"Best run metrics: \n{automl_best_run_metrics}")

Best run Id: AutoML_aa557fb2-c6eb-45da-a52a-bdb1767f2fb2_18
Best run files: 
['accuracy_table', 'automl_driver.py', 'confusion_matrix', 'explanation/8463160f/classes.interpret.json', 'explanation/8463160f/eval_data_viz.interpret.json', 'explanation/8463160f/expected_values.interpret.json', 'explanation/8463160f/features.interpret.json', 'explanation/8463160f/global_names/0.interpret.json', 'explanation/8463160f/global_rank/0.interpret.json', 'explanation/8463160f/global_values/0.interpret.json', 'explanation/8463160f/local_importance_values.interpret.json', 'explanation/8463160f/per_class_names/0.interpret.json', 'explanation/8463160f/per_class_rank/0.interpret.json', 'explanation/8463160f/per_class_values/0.interpret.json', 'explanation/8463160f/rich_metadata.interpret.json', 'explanation/8463160f/true_ys_viz.interpret.json', 'explanation/8463160f/visualization_dict.interpret.json', 'explanation/8463160f/ys_pred_proba_viz.interpret.json', 'explanation/8463160f/ys_pred_viz.interpret.js

In [None]:
# Display the fitted model returned by automl_run.get_output() (this cell has no recorded execution).
automl_fitted_model

In [42]:
# Register the best AutoML model, recording its accuracy and the algorithm
# name reported in the run details as model properties.
automl_model_tags = {'project': 'classification'}

automl_model_properties = {
    'framework': 'scikit-learn',
    'accuracy': automl_best_run_metrics['accuracy'],
}
automl_run_properties = automl_best_run.get_details().get('properties')
automl_model_properties['algorithm'] = automl_run_properties['run_algorithm']

automl_best_run.register_model(
    model_name='bank-marketing-effectiveness-automl',
    model_path='outputs/model.pkl',
    tags=automl_model_tags,
    properties=automl_model_properties,
)

Model(workspace=Workspace.create(name='quick-starts-ws-270850', subscription_id='a0a76bad-11a1-4a2d-9887-97a29122c8ed', resource_group='aml-quickstarts-270850'), name=bank-marketing-effectiveness-automl, id=bank-marketing-effectiveness-automl:1, version=1, tags={'project': 'classification'}, properties={'framework': 'scikit-learn', 'accuracy': '0.9162367223065251', 'algorithm': 'VotingEnsemble'})