# **Azure Machine Learning Nanodegree (Udacity)**

## Project 1: Optimizing an ML Pipeline in Azure

## Author: Andre Salerno

In [1]:
# Report the installed Azure ML SDK version.
# `from azureml.core import VERSION` binds only the name VERSION in this
# namespace; the package name `azureml` itself is never bound, so the value
# must be referenced as VERSION (not azureml.core.VERSION) to work on a fresh kernel.
from azureml.core import VERSION

print("You are currently using version", VERSION, "of the Azure ML SDK")

You are currently using version 1.19.0 of the Azure ML SDK


# **1) Setup**

In [2]:
#setup
from azureml.core                                    import Dataset, Datastore, Experiment, ScriptRunConfig, Workspace 

from azureml.core.compute                            import AmlCompute, ComputeTarget 
from azureml.core.compute_target                     import ComputeTargetException

from azureml.core.run                                import Run

from azureml.data.dataset_factory                    import TabularDatasetFactory

from azureml.interpret                               import ExplanationClient

from azureml.pipeline.core                           import Pipeline, PipelineData
from azureml.pipeline.steps                          import PythonScriptStep

from azureml.train.automl                            import AutoMLConfig
from azureml.train.sklearn                           import SKLearn

from azureml.train.hyperdrive.parameter_expressions  import uniform, quniform
from azureml.train.hyperdrive.policy                 import BanditPolicy
from azureml.train.hyperdrive.run                    import PrimaryMetricGoal
from azureml.train.hyperdrive.runconfig              import HyperDriveConfig
from azureml.train.hyperdrive.sampling               import RandomParameterSampling

from azureml.widgets                                 import RunDetails

from sklearn.linear_model                            import LogisticRegression
from sklearn.metrics                                 import mean_squared_error
from sklearn.model_selection                         import train_test_split
from sklearn.preprocessing                           import OneHotEncoder

import argparse
import joblib
import numpy as np
import os
import pandas as pd

#load_dotenv()

# **2) Creating an Azure ML Workspace, connecting to it and creating an experiment**

## **2.1) Hyperdrive**

In [3]:
# Connect to the Azure ML workspace described by the local configuration file
# and display it as the cell output.
ws = Workspace.from_config() # this automatically looks for a directory .azureml
ws

Workspace.create(name='quick-starts-ws-134041', subscription_id='cdbe0b43-92a0-4715-838a-f2648cc7ad21', resource_group='aml-quickstarts-134041')

In [6]:
# Alternative way to obtain the workspace: look it up by name with Workspace.get.
# os.getenv takes the *name* of an environment variable; the original passed the
# subscription id itself as that name, which always yields None.
# Read the id from AZURE_SUBSCRIPTION_ID instead and, when that variable is not
# set, fall back to the id of the workspace already loaded via from_config().
subscription_id = os.getenv("AZURE_SUBSCRIPTION_ID", ws.subscription_id)
ws = Workspace.get(name="quick-starts-ws-134041", subscription_id=subscription_id)
ws

Workspace.create(name='quick-starts-ws-134041', subscription_id='cdbe0b43-92a0-4715-838a-f2648cc7ad21', resource_group='aml-quickstarts-134041')

In [7]:
# Experiment under which the HyperDrive runs of this notebook are submitted.
exp_hyperdrive = Experiment(
    name="bankmkt-experiment-hyperdrive",
    workspace=ws,
)
exp_hyperdrive

Name,Workspace,Report Page,Docs Page
bankmkt-experiment-hyperdrive,quick-starts-ws-134041,Link to Azure Machine Learning studio,Link to Documentation


In [8]:
# Name of the CPU compute target used for training.
cpu_cluster_name = "computer-instance"

# Reuse the compute target if the workspace already has one with this name;
# a ComputeTargetException from the lookup means it must be provisioned first.
try:
    cpu_cluster = ComputeTarget(name=cpu_cluster_name, workspace=ws)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS2_v2',
        max_nodes=4,
        idle_seconds_before_scaledown=2400,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

Found existing cluster, use it.


In [9]:
# Wait until the compute target's pending operation has completed, printing
# its status while waiting (show_output=True).
cpu_cluster.wait_for_completion(show_output=True)


Running


In [10]:
# Summarise the workspace and experiment settings as a small transposed table.
# NOTE(review): -1 for display.max_colwidth is deprecated in newer pandas
# releases (None is the replacement) — confirm against the installed version.
pd.set_option('display.max_colwidth', -1)
output = {
    'Subscription ID': ws.subscription_id,
    'Workspace': ws.name,
    'Resource Group': ws.resource_group,
    'Location': ws.location,
    'Experiment Name': exp_hyperdrive.name,
}
outputDf = pd.DataFrame(data=output, index=[''])
outputDf.T

Unnamed: 0,Unnamed: 1
Subscription ID,cdbe0b43-92a0-4715-838a-f2648cc7ad21
Workspace,quick-starts-ws-134041
Resource Group,aml-quickstarts-134041
Location,southcentralus
Experiment Name,bankmkt-experiment-hyperdrive


### **2.1.1) Logistic Regression Model**

In [11]:
# Random search space for the two command-line arguments handed to train.py:
#   --C         sampled with uniform(0.1, 1)
#   --max_iter  sampled with quniform(100, 1500, 100)
ps = RandomParameterSampling(
    {
        '--C': uniform(0.1, 1),
        '--max_iter': quniform(100, 1500, 100),
    }
)

ps

<azureml.train.hyperdrive.sampling.RandomParameterSampling at 0x7f439a6ee9e8>

In [12]:
# Early-termination policy for the HyperDrive sweep, configured with an
# evaluation interval of 2 and a slack factor of 0.1.
# NOTE(review): per the Azure ML docs, BanditPolicy cancels runs whose primary
# metric falls outside the slack of the best run — confirm this is the intent.
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

policy

<azureml.train.hyperdrive.policy.BanditPolicy at 0x7f42e660c780>

In [21]:
# Create a local "training" folder when the working directory has no entry
# with that name.
# NOTE(review): the estimator below reads from ./scripts, not ./training —
# confirm whether the ./training folder is still needed.
if "training" not in os.listdir():
    os.mkdir("./training")

# SKLearn estimator that runs train.py from ./scripts on the CPU compute target.
est = SKLearn(
    "./scripts",
    entry_script="train.py",
    compute_target=cpu_cluster,
)

est

FileNotFoundError: [Errno 2] No such file or directory: './train.py'

In [14]:
# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
# The sweep maximizes the 'Accuracy' metric over at most 25 runs, 4 at a time.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=25,
    max_concurrent_runs=4,
)

# Submit your hyperdrive run to the experiment and show run details with the widget.
run_hyperdrive = exp_hyperdrive.submit(config=hyperdrive_config)

RunDetails(run_hyperdrive).show()

# Block until the sweep has finished. Without this, a "Restart & Run All"
# moves straight on to the next cell and asks for the best run while the
# child runs are still in progress.
run_hyperdrive.wait_for_completion(show_output=False)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [16]:
# Imports needed only by this evaluation cell (TabularDatasetFactory and joblib
# are already imported in the setup cell).
from scripts.train import clean_data
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Single source of truth for the model artifact path: it is both the path of
# the file inside the run and the location the download below writes to.
model_file = 'outputs/bankmarketing-logit-model.joblib'

# Get your best run and save the model from that run.
best_run = run_hyperdrive.get_best_run_by_primary_metric()
if best_run is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError(
        "HyperDrive returned no best run; make sure the sweep has completed "
        "and that the child runs logged the primary metric."
    )
print(best_run.get_details()['runDefinition']['arguments'])

model = best_run.register_model(model_name='bankmarketing-logit', model_path=model_file)
model.download(target_dir="outputs", exist_ok=True)

# Evaluation of model perf on our holdout-set.
factory = TabularDatasetFactory()
test_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_test.csv"
test_ds = factory.from_delimited_files(test_data_path)
X_test, y_test = clean_data(test_ds)

# NOTE(review): joblib.load unpickles the file; only load artifacts produced by
# your own runs. The recorded output also shows a scikit-learn version mismatch
# warning (0.20.3 vs 0.22.2.post1) when loading this model.
logit_model = joblib.load(model_file)

# Predict once and reuse the predictions for both reports.
y_pred = logit_model.predict(X_test)

print(logit_model.score(X_test, y_test))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

['--C', '0.9489883314264986', '--max_iter', '1300']
0.9094660194174757
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      3636
           1       0.70      0.41      0.51       484

    accuracy                           0.91      4120
   macro avg       0.81      0.69      0.73      4120
weighted avg       0.90      0.91      0.90      4120

[[3550   86]
 [ 287  197]]


The sklearn.linear_model.logistic module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.linear_model. Anything that cannot be imported from sklearn.linear_model is now part of the private API.
Trying to unpickle estimator LogisticRegression from version 0.20.3 when using version 0.22.2.post1. This might lead to breaking code or invalid results. Use at your own risk.
