# MSA 2023 Phase 2 - Run Training Experiment

## 1. Load and connect to workspace

In [1]:
import azureml.core
from azureml.core import Workspace

# Connect to the Azure ML workspace described by the saved config file
ws = Workspace.from_config()

# Report which SDK version and which workspace this notebook is using
sdk_version, workspace_name = azureml.core.VERSION, ws.name
print('Ready to use Azure ML {} to work with {}'.format(sdk_version, workspace_name))

Ready to use Azure ML 1.52.0 to work with MSA-Phase2-Azure


## 2. Create a parameterised training script

In [2]:
import os, shutil

# Folder that holds everything uploaded with the experiment (scripts + data)
training_folder = 'training'
os.makedirs(training_folder, exist_ok=True)

# Copy both preprocessed datasets next to the training scripts.
# The interaction dataset is copied first and the plain one last, so the
# final copied path (displayed below) is the plain dataset's.
preprocessed_dir = '../1. Analysis and Preprocessing/preprocessed_datasets'
for csv_name in ("market_segmentation_interaction.csv", "market_segmentation.csv"):
    copied_path = shutil.copy(
        preprocessed_dir + '/' + csv_name,
        os.path.join(training_folder, csv_name)
    )

# Show the destination of the last copy as the cell output
copied_path

'training\\market_segmentation.csv'

In [4]:
%%writefile $training_folder/lg-training.py
# Import libraries
from azureml.core import Run
import pandas as pd
import numpy as np
import joblib
import os
import argparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import LabelBinarizer

# Get the experiment run context
run = Run.get_context()

# Set regularization hyperparameter
parser = argparse.ArgumentParser()
parser.add_argument('--reg_rate', type=float, dest='reg', default=0.01)
args = parser.parse_args()
reg = args.reg

# load the market segmentation dataset
print("Loading Data...")
market_segmentation = pd.read_csv('market_segmentation_interaction.csv')

# Separate features and labels
X, y = market_segmentation.drop(columns="Segmentation"), market_segmentation.Segmentation

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg).fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_scores = model.predict_proba(X_test)
for class_of_interest in ["A", "B", "C", "D"]:
    class_id = np.flatnonzero(label_binarizer.classes_ == class_of_interest)[0]
    auc = roc_auc_score(y_onehot_test[:,class_id],y_scores[:,class_id])
    print(f'AUC {class_of_interest} vs rest: ' + str(auc))
    run.log(f'AUC {class_of_interest} vs rest', np.float(auc))

# Save the trained model in the outputs folder
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/lg-model.pkl')

run.complete()

Writing training/lg-training.py


In [9]:
%%writefile $training_folder/rf-training.py
# Import libraries
from azureml.core import Run
import pandas as pd
import numpy as np
import joblib
import os
import argparse
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import LabelBinarizer

# Get the experiment run context
run = Run.get_context()

# Set hyperparameters
parser = argparse.ArgumentParser()
parser.add_argument('--max_features', type=int, dest='max_features', default=4)
parser.add_argument('--max_depth', type=int, dest='max_depth', default=None)
parser.add_argument('--min_samples_split', type=int, dest='min_samples_split', default=2)
parser.add_argument('--min_samples_leaf', type=int, dest='min_samples_leaf', default=1)
args = parser.parse_args()
max_features = args.max_features
max_depth = args.max_depth
min_samples_split = args.min_samples_split
min_samples_leaf = args.min_samples_leaf

# load the market segmentation dataset
print("Loading Data...")
market_segmentation = pd.read_csv('market_segmentation.csv')

# Separate features and labels
X, y = market_segmentation.drop(columns="Segmentation"), market_segmentation.Segmentation

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a random forest model
print(
    'Training a random forest model with max features of', max_features,
    'max depth of', max_depth,
    'min samples split of', min_samples_split,
    'and min_samples_leaf of', min_samples_leaf
)
run.log('Maximum depth of tree', max_depth)
run.log('Maximum features per split', max_features)
run.log('Minimum samples requierd for split', min_samples_split)
run.log('Minimum samples per leaf', min_samples_leaf)
model = RandomForestClassifier(
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf
).fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_scores = model.predict_proba(X_test)
for class_of_interest in ["A", "B", "C", "D"]:
    class_id = np.flatnonzero(label_binarizer.classes_ == class_of_interest)[0]
    auc = roc_auc_score(y_onehot_test[:,class_id],y_scores[:,class_id])
    print(f'AUC {class_of_interest} vs rest: ' + str(auc))
    run.log(f'AUC {class_of_interest} vs rest', np.float(auc))

# Save the trained model in the outputs folder
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/rf-model.pkl')

run.complete()

Overwriting training/rf-training.py


## 3. Create a compute cluster

In [5]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the compute cluster the training runs are submitted to
cluster_name = "aguo921"

try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Report the failure, then stop: the later cells submit runs to
        # `cluster_name`, so continuing without a cluster only moves the
        # error to a less obvious place.
        print(ex)
        raise

Found existing cluster, use it.


## 4. Run the script with arguments

In [6]:
%%writefile $training_folder/environment.yml
# Conda specification written into the training folder; the notebook builds
# the run environment from this file with Environment.from_conda_specification.
name: batch_environment
dependencies:
# NOTE(review): Python 3.6 is end-of-life - consider upgrading after confirming
# the training scripts run on the newer NumPy / scikit-learn that would be resolved.
- python=3.6.2
# Conda packages; versions are not pinned, so the resolved versions may
# differ between environment builds.
- scikit-learn
- pandas
- numpy
- pip
- pip:
  # Installed through pip rather than conda
  - azureml-defaults


Writing training/environment.yml


In [11]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.runconfig import DockerConfiguration

# Build the run environment from the conda specification in the training folder
environment_file = training_folder + "/environment.yml"
env = Environment.from_conda_specification("experiment_env", environment_file)

# Describe the run: which script, with which arguments, on which compute
lg_arguments = ['--reg_rate', 0.1]
docker_config = DockerConfiguration(use_docker=True)
script_config = ScriptRunConfig(
    source_directory=training_folder,
    script='lg-training.py',
    arguments=lg_arguments,
    compute_target=cluster_name,
    environment=env,
    docker_runtime_config=docker_config
)

# Submit the logistic regression run under its own experiment
experiment_name = 'train-segmentation-lg'
experiment = Experiment(workspace=ws, name=experiment_name)
run_lg = experiment.submit(config=script_config)

# Wait here until the run finishes; the returned run details are the cell output
run_lg.wait_for_completion()

{'runId': 'train-segmentation-lg_1690932545_36eca8b1',
 'target': 'aguo921',
 'status': 'Finalizing',
 'startTimeUtc': '2023-08-01T23:29:19.464458Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'amlcdsi',
  'ContentSnapshotId': 'f2af527f-0515-42d8-8110-28527c2c6b32',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json',
  'azureml.git.repository_uri': 'https://github.com/aguo921/2023-Phase-2.git',
  'mlflow.source.git.repoURL': 'https://github.com/aguo921/2023-Phase-2.git',
  'azureml.git.branch': 'main',
  'mlflow.source.git.branch': 'main',
  'azureml.git.commit': '7713ca5b70f1a9b878d98a95edff6282e85729c0',
  'mlflow.source.git.commit': '7713ca5b70f1a9b878d98a95edff6282e85729c0',
  'azureml.git.dirty': 'True'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'script': 'lg-training.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--reg_rate', '0.1'],
  'sourceDirectoryDataS

In [12]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.runconfig import DockerConfiguration

# Build the run environment from the conda specification in the training folder
environment_file = training_folder + "/environment.yml"
env = Environment.from_conda_specification("experiment_env", environment_file)

# Describe the run: which script, with which hyperparameters, on which compute
rf_arguments = [
    '--max_depth', 50,
    '--max_features', 5,
    '--min_samples_split', 4,
    '--min_samples_leaf', 4,
]
docker_config = DockerConfiguration(use_docker=True)
script_config = ScriptRunConfig(
    source_directory=training_folder,
    script='rf-training.py',
    arguments=rf_arguments,
    compute_target=cluster_name,
    environment=env,
    docker_runtime_config=docker_config
)

# Submit the random forest run under its own experiment
experiment_name = 'train-segmentation-rf'
experiment = Experiment(workspace=ws, name=experiment_name)
run_rf = experiment.submit(config=script_config)

# Wait here until the run finishes; the returned run details are the cell output
run_rf.wait_for_completion()

{'runId': 'train-segmentation-rf_1690932661_3eb821b7',
 'target': 'aguo921',
 'status': 'Finalizing',
 'startTimeUtc': '2023-08-01T23:31:09.796301Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'amlcdsi',
  'ContentSnapshotId': 'f2af527f-0515-42d8-8110-28527c2c6b32',
  'azureml.git.repository_uri': 'https://github.com/aguo921/2023-Phase-2.git',
  'mlflow.source.git.repoURL': 'https://github.com/aguo921/2023-Phase-2.git',
  'azureml.git.branch': 'main',
  'mlflow.source.git.branch': 'main',
  'azureml.git.commit': '7713ca5b70f1a9b878d98a95edff6282e85729c0',
  'mlflow.source.git.commit': '7713ca5b70f1a9b878d98a95edff6282e85729c0',
  'azureml.git.dirty': 'True',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'script': 'rf-training.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--max_depth',
   '50',
   '--max_features',


In [13]:
# Print each metric logged by the logistic regression run, then the files it produced
metrics = run_lg.get_metrics()
for metric_name, metric_value in metrics.items():
    print(metric_name, ":", metric_value)
print('\n')
for file_name in run_lg.get_file_names():
    print(file_name)

Regularization Rate : 0.1
Accuracy : 0.4965453707968678
AUC A vs rest : 0.6942899716985923
AUC C vs rest : 0.7599926507739654
AUC B vs rest : 0.6838742009488384
AUC D vs rest : 0.8749635705529935


outputs/lg-model.pkl
system_logs/cs_capability/cs-capability.log
system_logs/hosttools_capability/hosttools-capability.log
system_logs/lifecycler/execution-wrapper.log
system_logs/lifecycler/lifecycler.log
system_logs/lifecycler/vm-bootstrapper.log
system_logs/metrics_capability/metrics-capability.log
system_logs/snapshot_capability/snapshot-capability.log
user_logs/std_log.txt


In [14]:
# Print each metric logged by the random forest run, then the files it produced
metrics = run_rf.get_metrics()
for metric_name, metric_value in metrics.items():
    print(metric_name, ":", metric_value)
print('\n')
for file_name in run_rf.get_file_names():
    print(file_name)

Maximum depth of tree : 50
Maximum features per split : 5
Minimum samples requierd for split : 4
Minimum samples per leaf : 4
Accuracy : 0.49746660525103636
AUC A vs rest : 0.7171151955013227
AUC B vs rest : 0.6807604071541529
AUC C vs rest : 0.7722985852739883
AUC D vs rest : 0.8757733610085896


outputs/rf-model.pkl
system_logs/cs_capability/cs-capability.log
system_logs/hosttools_capability/hosttools-capability.log
system_logs/lifecycler/execution-wrapper.log
system_logs/lifecycler/lifecycler.log
system_logs/lifecycler/vm-bootstrapper.log
system_logs/metrics_capability/metrics-capability.log
system_logs/snapshot_capability/snapshot-capability.log
user_logs/std_log.txt


## 5. Register the model

In [16]:
from azureml.core import Model

# Register the logistic regression model
# Fetch the logged metrics once instead of once per property, so all
# properties come from the same snapshot and only one request is made.
lg_metrics = run_lg.get_metrics()
lg_property_names = (
    'AUC A vs rest',
    'AUC B vs rest',
    'AUC C vs rest',
    'AUC D vs rest',
    'Accuracy',
)

# The registered model (the cell output) carries the run's metrics as properties
run_lg.register_model(
    model_path='outputs/lg-model.pkl', model_name='segmentation-lg',
    tags={'Training context':'Script'},
    properties={name: lg_metrics[name] for name in lg_property_names}
)

Model(workspace=Workspace.create(name='MSA-Phase2-Azure', subscription_id='b5ba4903-ea86-4021-ae33-60b2d6e5d120', resource_group='MSA-Phase2-Azure'), name=segmentation-lg, id=segmentation-lg:2, version=2, tags={'Training context': 'Script'}, properties={'AUC A vs rest': '0.6942899716985923', 'AUC B vs rest': '0.6838742009488384', 'AUC C vs rest': '0.7599926507739654', 'AUC D vs rest': '0.8749635705529935', 'Accuracy': '0.4965453707968678'})

In [17]:
# Register the random forest model
# Fetch the logged metrics once instead of once per property, so all
# properties come from the same snapshot and only one request is made.
rf_metrics = run_rf.get_metrics()
rf_property_names = (
    'AUC A vs rest',
    'AUC B vs rest',
    'AUC C vs rest',
    'AUC D vs rest',
    'Accuracy',
)

# The registered model (the cell output) carries the run's metrics as properties
run_rf.register_model(
    model_path='outputs/rf-model.pkl', model_name='segmentation-rf',
    tags={'Training context':'Script'},
    properties={name: rf_metrics[name] for name in rf_property_names}
)

Model(workspace=Workspace.create(name='MSA-Phase2-Azure', subscription_id='b5ba4903-ea86-4021-ae33-60b2d6e5d120', resource_group='MSA-Phase2-Azure'), name=segmentation-rf, id=segmentation-rf:1, version=1, tags={'Training context': 'Script'}, properties={'AUC A vs rest': '0.7171151955013227', 'AUC B vs rest': '0.6807604071541529', 'AUC C vs rest': '0.7722985852739883', 'AUC D vs rest': '0.8757733610085896', 'Accuracy': '0.49746660525103636'})

In [18]:
# Print every model registered in the workspace with its tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    # Tags are printed first, then properties, each as an indented "name : value" line
    for details in (model.tags, model.properties):
        for detail_name, detail_value in details.items():
            print ('\t',detail_name, ':', detail_value)
    print('\n')

segmentation-rf version: 1
	 Training context : Script
	 AUC A vs rest : 0.7171151955013227
	 AUC B vs rest : 0.6807604071541529
	 AUC C vs rest : 0.7722985852739883
	 AUC D vs rest : 0.8757733610085896
	 Accuracy : 0.49746660525103636


segmentation-lg version: 2
	 Training context : Script
	 AUC A vs rest : 0.6942899716985923
	 AUC B vs rest : 0.6838742009488384
	 AUC C vs rest : 0.7599926507739654
	 AUC D vs rest : 0.8749635705529935
	 Accuracy : 0.4965453707968678


segmentation-lg version: 1
	 Training context : Script
	 AUC A vs rest : 0.6942899716985923
	 AUC B vs rest : 0.6838742009488384
	 AUC C vs rest : 0.7599926507739654
	 AUC D vs rest : 0.8749635705529935
	 Accuracy : 0.4965453707968678


segmentation-logistic-regression version: 4


segmentation-logistic-regression version: 3


segmentation-model version: 5
	 Training context : Hyperdrive
	 AUC A vs rest : 0.7173467464996174
	 AUC B vs rest : 0.6771250706291696
	 AUC C vs rest : 0.7548471920284576
	 AUC D vs rest : 0.866