<a href="https://colab.research.google.com/github/andresalerno/udacity_nd_azure_ml/blob/main/C%C3%B3pia_de_salernos_project2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Azure Machine Learning Nanodegree (Udacity)**

## Project 1: Optimizing an ML Pipeline in Azure

## Author: Andre Salerno

In [None]:
# Report the installed Azure ML SDK version.
from azureml.core import VERSION

# `from azureml.core import VERSION` binds only VERSION; the bare name `azureml`
# is never bound, so `azureml.core.VERSION` raises NameError on a fresh kernel.
print("You are currently using version", VERSION, "of the Azure ML SDK")

You are currently using version 1.19.0 of the Azure ML SDK


# **1) Setup**

In [None]:
#setup
from azureml.core                                    import Dataset, Datastore, Experiment, ScriptRunConfig, Workspace 

from azureml.core.compute                            import AmlCompute, ComputeTarget 
from azureml.core.compute_target                     import ComputeTargetException

from azureml.core.run                                import Run

from azureml.data.dataset_factory                    import TabularDatasetFactory

from azureml.interpret                               import ExplanationClient

from azureml.pipeline.core                           import Pipeline, PipelineData
from azureml.pipeline.steps                          import PythonScriptStep

from azureml.train.automl                            import AutoMLConfig
from azureml.train.sklearn                           import SKLearn

from azureml.train.hyperdrive.parameter_expressions  import uniform, quniform
from azureml.train.hyperdrive.policy                 import BanditPolicy
from azureml.train.hyperdrive.run                    import PrimaryMetricGoal
from azureml.train.hyperdrive.runconfig              import HyperDriveConfig
from azureml.train.hyperdrive.sampling               import RandomParameterSampling

from azureml.widgets                                 import RunDetails

from sklearn.linear_model                            import LogisticRegression
from sklearn.metrics                                 import mean_squared_error
from sklearn.model_selection                         import train_test_split
from sklearn.preprocessing                           import OneHotEncoder

import argparse
import joblib
import numpy as np
import os
import pandas as pd

#load_dotenv()

# **2) Creating an Azure ML Workspace, connecting to it and creating an experiment**

## **2.1) Hyperdrive**

In [None]:
# Load the workspace handle from the local Azure ML configuration
# (no compute is created in this cell).
ws = Workspace.from_config() # this automatically looks for a directory .azureml
ws

Workspace.create(name='quick-starts-ws-134041', subscription_id='cdbe0b43-92a0-4715-838a-f2648cc7ad21', resource_group='aml-quickstarts-134041')

In [None]:
# Alternative to Workspace.from_config(): look the workspace up explicitly by name.
# os.getenv() expects the NAME of an environment variable. Passing the subscription
# id itself as the name always yields None, so read the id from
# AZURE_SUBSCRIPTION_ID and fall back to the workspace loaded by the previous cell.
ws = Workspace.get(name="quick-starts-ws-134041",
                   subscription_id=os.getenv("AZURE_SUBSCRIPTION_ID", ws.subscription_id))
ws

Workspace.create(name='quick-starts-ws-134041', subscription_id='cdbe0b43-92a0-4715-838a-f2648cc7ad21', resource_group='aml-quickstarts-134041')

In [None]:
# Experiment handle under which the HyperDrive runs are submitted later on.
exp_hyperdrive = Experiment(workspace=ws, name="bankmkt-experiment-hyperdrive")
exp_hyperdrive

Name,Workspace,Report Page,Docs Page
bankmkt-experiment-hyperdrive,quick-starts-ws-134041,Link to Azure Machine Learning studio,Link to Documentation


In [None]:
# Name of the CPU compute target used by both experiments
cpu_cluster_name = "computer-instance"

# Reuse the compute target when the workspace already has one with this name;
# otherwise provision a new one with the configuration below.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS2_v2',
        max_nodes=4,
        idle_seconds_before_scaledown=2400,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

Found existing cluster, use it.


In [None]:
# Block until the compute target reports completion, printing its status meanwhile.
cpu_cluster.wait_for_completion(show_output=True)


Running


In [None]:
# Summarise the workspace / experiment context in a small two-column table.
output = {
    'Subscription ID': ws.subscription_id,
    'Workspace': ws.name,
    'Resource Group': ws.resource_group,
    'Location': ws.location,
    'Experiment Name': exp_hyperdrive.name,
}

# Disable column-width truncation. `None` is the supported spelling since
# pandas 1.0; the legacy sentinel -1 is deprecated there and rejected by newer
# releases, while pandas < 1.0 only accepts integers, hence the fallback.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

outputDf = pd.DataFrame(data=output, index=[''])
outputDf.T

Unnamed: 0,Unnamed: 1
Subscription ID,cdbe0b43-92a0-4715-838a-f2648cc7ad21
Workspace,quick-starts-ws-134041
Resource Group,aml-quickstarts-134041
Location,southcentralus
Experiment Name,bankmkt-experiment-hyperdrive


### **2.1.1) Logistic Regression Model**

In [None]:
# Specify parameter sampler
# Random search over two hyperparameters, keyed by the argument names
# '--C' and '--max_iter' (presumably forwarded to train.py as command-line
# arguments; confirm against the script's argparse setup).
ps = RandomParameterSampling({'--C': uniform(0.1, 1),
                              '--max_iter': quniform(100, 1500, 100),})

ps

<azureml.train.hyperdrive.sampling.RandomParameterSampling at 0x7f1b98d47240>

In [None]:
# Specify a Policy
# Bandit policy with a slack factor of 0.1, applied at every 2nd evaluation
# interval; it is handed to HyperDriveConfig as the `policy` argument below.
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

policy

<azureml.train.hyperdrive.policy.BanditPolicy at 0x7f1b98d47e10>

In [None]:
# Ensure the local "training" directory exists. makedirs(exist_ok=True) does the
# check and the creation in one call, instead of scanning os.listdir() first.
os.makedirs("./training", exist_ok=True)

# Create a SKLearn estimator for use with train.py
# The first positional argument is the source directory that holds the entry script.
est = SKLearn("./scripts",
              compute_target=cpu_cluster,
              entry_script="train.py")

est

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


<azureml.train.sklearn.SKLearn at 0x7f1b98d472b0>

In [None]:
# Bundle the estimator, the sampler and the early-termination policy into one
# HyperDrive configuration that maximises the 'Accuracy' metric.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=25,
    max_concurrent_runs=4,
)

# Launch the sweep under the HyperDrive experiment ...
run_hyperdrive = exp_hyperdrive.submit(config=hyperdrive_config)

# ... and follow its progress in the notebook widget.
RunDetails(run_hyperdrive).show()



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [None]:
# Get your best run and save the model from that run.

# The RunDetails widget does not block, so under "Run All" this cell can execute
# while the sweep is still in progress. Wait for it to finish first.
run_hyperdrive.wait_for_completion(show_output=False)

best_run = run_hyperdrive.get_best_run_by_primary_metric()
if best_run is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise RuntimeError("HyperDrive finished without any child run reporting the primary metric")
print(best_run.get_details()['runDefinition']['arguments'])

model = best_run.register_model(model_name='bankmarketing-logit', model_path='outputs/bankmarketing-logit-model.joblib')
model.download(target_dir="outputs", exist_ok=True)

# Evaluation of model perf on our holdout-set.

from scripts.train import clean_data
from azureml.data.dataset_factory import TabularDatasetFactory
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


factory = TabularDatasetFactory()
test_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_test.csv"
test_ds = factory.from_delimited_files(test_data_path)
X_test, y_test = clean_data(test_ds)

logit_model = joblib.load('outputs/bankmarketing-logit-model.joblib')

# Predict once and reuse the result for both reports.
y_pred = logit_model.predict(X_test)

print(logit_model.score(X_test, y_test))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

## **2.2) AutoML**

In [None]:
# Separate experiment that groups the AutoML runs.
exp_automl = Experiment(workspace=ws, name="bankmkt-experiment-automl")

# One line per workspace attribute, printed as a single newline-joined block.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

In [None]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Public CSV locations of the bank-marketing train / validation / test splits.
data_path_train = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
data_path_valid = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv"
data_path_test = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_test.csv"

# Default datastore of the workspace (fetched here, not used by this cell).
datastore = ws.get_default_datastore()
factory = TabularDatasetFactory()

# Build one tabular dataset per split, in train / validation / test order.
ds_train, ds_valid, ds_test = (
    factory.from_delimited_files(split_url)
    for split_url in (data_path_train, data_path_valid, data_path_test)
)

In [None]:
import logging
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.

# Name of the target column in the training / validation datasets.
label = "y"

automl_settings = {
    "enable_early_stopping": True,
    "iteration_timeout_minutes": 5,
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1,
    "primary_metric": 'accuracy',
    "featurization": 'auto',
    "verbosity": logging.INFO,
}

# Fixes relative to the previous version of this cell:
#  * compute_target now points at `cpu_cluster`, the compute target created
#    earlier in this notebook; `aml_compute` is never defined anywhere.
#  * a missing comma made Python parse `n_cross_validations=5 **automl_settings`
#    as a power expression (int ** dict -> TypeError).
#  * n_cross_validations is omitted because an explicit validation_data set is
#    supplied; AutoML treats the two as alternative validation strategies.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=60,
    task='classification',
    debug_log='automl_errors.log',
    compute_target=cpu_cluster,
    experiment_exit_score=0.9984,
    blocked_models=['KNN', 'LinearSVM'],
    enable_onnx_compatible_models=True,
    training_data=ds_train,
    label_column_name=label,
    validation_data=ds_valid,
    **automl_settings,
)

In [None]:
# Submit your automl run
# show_output=False keeps the submission quiet; progress is followed through
# the RunDetails widget in the next cell.

remote_run = exp_automl.submit(automl_config, show_output = False)

In [None]:
# Follow the AutoML run in the notebook widget.
RunDetails(remote_run).show()

In [None]:
# Retrieve and save your best automl model, evaluate locally on hold out set
# The run was submitted with show_output=False and the widget does not block,
# so wait for the experiment to finish before asking for its best child run.
remote_run.wait_for_completion(show_output=False)

best_run_aml, fitted_model_aml = remote_run.get_output()
model_name = best_run_aml.properties['model_name']

In [None]:
# (name inside the run, local destination) for each artifact fetched from the
# best AutoML run.
artifact_targets = (
    ('outputs/model.pkl', 'outputs/bankmarketing-aml-model.pkl'),
    ('outputs/scoring_file_v_1_0_0.py', 'outputs/score_aml.py'),
    ('automl_driver.py', 'outputs/automl_driver.py'),
)
for remote_name, local_path in artifact_targets:
    best_run_aml.download_file(remote_name, local_path)

In [None]:
import pickle

# Load the downloaded AutoML model. The context manager closes the file even
# when unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that come from a run you trust.
with open("outputs/bankmarketing-aml-model.pkl", 'rb') as file:
    aml_model = pickle.load(file)

In [None]:
# clean_data comes from scripts/train.py, the entry script used by the estimator
# and the module the HyperDrive evaluation cell imports it from. The previous
# `scripts.logit_train` path was inconsistent with both.
from scripts.train import clean_data
from azureml.data.dataset_factory import TabularDatasetFactory
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


factory = TabularDatasetFactory()
test_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_test.csv"
# The raw CSV is read with pandas here (no clean_data step), so `test_ds` is a
# DataFrame and `y_test` a single-column DataFrame holding the label column.
test_ds = pd.read_csv(test_data_path)
y_test = test_ds[['y']]


In [None]:
# Score the AutoML model once and reuse the predictions for every metric,
# instead of calling predict() three times on the same features.
X_test_aml = test_ds.drop(columns=['y'])
y_pred_aml = aml_model.predict(X_test_aml)

# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# printed value is unchanged, but the order now matches the two calls below.
print(accuracy_score(y_test, y_pred_aml))
print(classification_report(y_test, y_pred_aml))
print(confusion_matrix(y_test, y_pred_aml))

# **3) Clean up**

In [None]:
# Delete the compute target created earlier in this notebook.
# `aml_compute` is never defined here; the handle is `cpu_cluster`. The
# resulting NameError would not be caught by the ComputeTargetException
# handler below.
try:
    cpu_cluster.delete()
    print('Computetarget deleted')
except ComputeTargetException:
    print('Computetarget not found')