# Automated ML

In [2]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

# Report which version of the Azure ML core SDK this kernel is using
print(f"SDK version: {azureml.core.VERSION}")

SDK version: 1.19.0


## Dataset

### Overview

The dataset is a combination of 2 datasets that contain the measures taken to combat covid-19 spread and the weekly infection cases per country.  
The first dataset contains the start and end dates of the measures.  
The second dataset is a timeseries per week of the number of cases.  

To combine the two sets, I transformed the first one into a time series with the measures as boolean features, which resulted in a one-hot encoded feature set per week. I then joined the datasets and omitted some columns that were not informative (country_code, population, cumulative_count, etc.). Finally, I saved the result in the workspace datastore and manually registered it as a dataset in Azure Machine Learning Studio.


In [3]:
# Names used by the rest of the notebook.
experiment_name = 'Capstone'
project_folder = './pipeline-project'

# Connect to the workspace described by the local config file and open the
# experiment that will own the pipeline run.
# (The original cell also had a bare `experiment` expression in the middle of
# the cell; only a cell's last expression is rendered, so it was dead code.)
ws = Workspace.from_config()
experiment = Experiment(workspace=ws, name=experiment_name)

# Fetch the dataset registered under this name and load it into pandas.
dataset = Dataset.get_by_name(workspace=ws, name='covid-19-measures-cases-weekly')
df = dataset.to_pandas_dataframe()

### Check the compute cluster and get a handle for it

In [53]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# NOTE: set this to the name of the CPU cluster that exists in your workspace
amlcompute_cluster_name = "worker"

# Reuse the cluster when it can be found; otherwise provision a new one
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until at least one node is available, waiting at most 10 minutes
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=1,
    timeout_in_minutes=10,
)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## ForecastingParameters Configuration

We are going to create a forecasting model.  
The Azure ML SDK already has a class that groups all the parameters needed for a forecasting task: `ForecastingParameters`.  

In our case the dataset timestamp column is called `week` so we set `time_column_name='week'`.  
We would like to have an accurate forecast for at least 10 weeks, so we set `forecast_horizon` to 10.  
We have the same time series repeated per country - to capture that we set the `time_series_id_column_names` to `['country']`.   
The time series frequency is per week and we can express that with the parameter `freq='W'`.  
`target_lags` is set to `'auto'` so that AutoML chooses the lags itself, because we don't know in advance which features are dependent and which are not.  
`target_rolling_window_size=10` means that we take into account 10 past records in order to perform the forecasting.  

## AutoML Configuration
We are going to create a forecasting model so we set `task='forecasting'`   
To make the cross-validation of the model more thorough, we set `n_cross_validations=30`  
We need to specify the column that we want to forecast so we set `label_column_name='rate_14_day'`  
The forecasting task needs to optimize for normalized_root_mean_squared_error because it is a better metric for values that do not differ by orders of magnitude from each other, thus we set `primary_metric` to 'normalized_root_mean_squared_error'.  
Running the experiment for too long is not our goal, so we set a timeout of roughly 20 minutes (0.3 hours is 18 minutes): `experiment_timeout_hours=0.3`  
We will block the following models:
 - 'ExtremeRandomTrees' because it performs poorly when there is a high number of noisy features (in high-dimensional datasets).
 - 'AutoArima' - because our dataset is a multivariate time series - it depends not only on one variable but on 64.
 - 'Prophet' because it works best with time series that have strong seasonal effects, but in our case we don't know that and don't want to assume it either.  
 
To do that we set `blocked_models` to `['ExtremeRandomTrees', 'AutoArima', 'Prophet']`.   
We have already limited the run duration of our experiment to about 20 minutes, so we will disable early stopping like this: `enable_early_stopping=False`

In [22]:
from azureml.automl.core.forecasting_parameters import ForecastingParameters

# Time-series settings: weekly data, one series per country, 10-step horizon
forecasting_parameters = ForecastingParameters(
    time_column_name='week',
    time_series_id_column_names=["country"],
    freq='W',
    forecast_horizon=10,
    target_lags='auto',
    target_rolling_window_size=10,
)

# Training settings collected in one place so they are easy to scan and tune
automl_settings = {
    'primary_metric': 'normalized_root_mean_squared_error',
    'blocked_models': ['ExtremeRandomTrees', 'AutoArima', 'Prophet'],
    'experiment_timeout_hours': 0.3,  # 0.3 h = 18 minutes
    'enable_early_stopping': False,
    'n_cross_validations': 30,
    'verbosity': logging.INFO,
}

automl_config = AutoMLConfig(
    task='forecasting',
    label_column_name='rate_14_day',
    training_data=dataset,
    compute_target=compute_target,
    forecasting_parameters=forecasting_parameters,
    **automl_settings,
)

In [28]:
from azureml.pipeline.core import PipelineData, TrainingOutput

# Names under which the pipeline run exposes the two AutoML outputs
metrics_output_name = 'metrics_output'
best_model_output_name = 'best_model_output'

# Both outputs are written to the workspace's default datastore
ds = ws.get_default_datastore()

# Metrics output of the AutoML training
metrics_data = PipelineData(
    name='metrics_data',
    datastore=ds,
    pipeline_output_name=metrics_output_name,
    training_output=TrainingOutput(type='Metrics'),
)

# Model output of the AutoML training
model_data = PipelineData(
    name='model_data',
    datastore=ds,
    pipeline_output_name=best_model_output_name,
    training_output=TrainingOutput(type='Model'),
)

In [29]:
# The step publishes both the metrics and the model output declared earlier
step_outputs = [metrics_data, model_data]

# Wrap the AutoML configuration as a pipeline step
automl_step = AutoMLStep(
    name='automl_module',
    automl_config=automl_config,
    outputs=step_outputs,
    allow_reuse=True,
)

In [30]:
from azureml.pipeline.core import Pipeline

# A one-step pipeline that runs only the AutoML training step
pipeline = Pipeline(
    workspace=ws,
    steps=[automl_step],
    description="Covid-19-aftermath-3",
)

In [31]:
# Submit the pipeline to the experiment; the returned run handle is used by
# all following cells to monitor the run and fetch its outputs.
remote_run = experiment.submit(pipeline)

Created step automl_module [71a44c37][57056e7b-1a93-4e36-8840-e92af472a768], (This step will run and generate new outputs)
Submitted PipelineRun dd13962c-4b1f-46a2-82bb-d5f242a022ac
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/Capstone/runs/dd13962c-4b1f-46a2-82bb-d5f242a022ac?wsid=/subscriptions/f08c5f25-28be-4c21-993c-ad64d5c84d3a/resourcegroups/ML/workspaces/capstone


## Run Details

In [32]:
from azureml.widgets import RunDetails
# Show the run-monitoring widget for the submitted pipeline run
RunDetails(remote_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

In [33]:
# Block until the pipeline run has finished; the outputs fetched in the
# following cells are only available after the run has completed.
remote_run.wait_for_completion()

PipelineRunId: dd13962c-4b1f-46a2-82bb-d5f242a022ac
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/Capstone/runs/dd13962c-4b1f-46a2-82bb-d5f242a022ac?wsid=/subscriptions/f08c5f25-28be-4c21-993c-ad64d5c84d3a/resourcegroups/ML/workspaces/capstone
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 9213a29f-a51b-415d-ba54-7b54b2318e94
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/Capstone/runs/9213a29f-a51b-415d-ba54-7b54b2318e94?wsid=/subscriptions/f08c5f25-28be-4c21-993c-ad64d5c84d3a/resourcegroups/ML/workspaces/capstone
StepRun( automl_module ) Status: NotStarted
StepRun( automl_module ) Status: Running

StepRun(automl_module) Execution Summary
StepRun( automl_module ) Status: Finished



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': 'dd13962c-4b1f-46a2-82bb-d5f242a022ac', 'status': 'Completed', 'startTimeUtc': '2021-02-05T13:00:10.830966Z', 'endTimeUtc': '2021-02-05T13:30:54.218771Z', 'proper

'Finished'

In [34]:
import json

# Download the metrics output of the pipeline run into the current directory
metrics_output = remote_run.get_pipeline_output(metrics_output_name)
num_file_downloaded = metrics_output.download('.', show_progress=True)

# Read the downloaded file; it is opened under the output's datastore path,
# relative to the download directory
with open(metrics_output._path_on_datastore) as metrics_file:
    metrics_output_result = metrics_file.read()

# The file is JSON; parse it and show it as a table
deserialized_metrics_output = json.loads(metrics_output_result)
metrics_df = pd.DataFrame(deserialized_metrics_output)
metrics_df

Downloading azureml/9213a29f-a51b-415d-ba54-7b54b2318e94/metrics_data
Downloaded azureml/9213a29f-a51b-415d-ba54-7b54b2318e94/metrics_data, 1 files out of an estimated total of 1


Unnamed: 0,9213a29f-a51b-415d-ba54-7b54b2318e94_6,9213a29f-a51b-415d-ba54-7b54b2318e94_8,9213a29f-a51b-415d-ba54-7b54b2318e94_10,9213a29f-a51b-415d-ba54-7b54b2318e94_9,9213a29f-a51b-415d-ba54-7b54b2318e94_7,9213a29f-a51b-415d-ba54-7b54b2318e94_11
normalized_root_mean_squared_error,[0.4680279827952882],[0.4490204899805414],[0.4790149774442213],[0.43521980962558265],[0.4408959680415582],[0.4349341379582462]
mean_absolute_error,[152.3679943631077],[148.61930025733452],[150.47646846367346],[147.29828646461573],[150.26607197951472],[144.165063046842]
normalized_root_mean_squared_log_error,[0.2728431635645],[0.27966411656853574],[0.26554625064888987],[0.273284203885111],[0.2842519823684439],[0.2570696576809009]
median_absolute_error,[93.89215155429046],[87.8283681156991],[91.86853709882207],[87.59991070890315],[88.93284321669617],[83.96516516111504]
root_mean_squared_error,[232.60081278546295],[230.0817093148482],[230.42321317911075],[228.75078242438607],[232.883090495313],[224.86961026692944]
normalized_median_absolute_error,[0.36258641696222066],[0.3343667381640037],[0.3758935795910562],[0.3244624058547206],[0.32389650886928484],[0.3302735319105047]
spearman_correlation,[0.21030703526716932],[0.21478605599010808],[0.23202390957736024],[0.2577974297593894],[0.23272735955162924],[0.2755307169096398]
root_mean_squared_log_error,[1.4735435542337256],[1.599578416237949],[1.387665583074417],[1.5780451878831039],[1.6379344374493587],[1.3655980367955074]
explained_variance,[-0.3137022895856728],[-0.4801404788458008],[-0.41827105291255806],[-0.521645565742935],[-0.6721372591744349],[-0.3143578724325526]
mean_absolute_percentage_error,[363.5327745745606],[346.2007318330028],[399.433113842998],[301.85693396947005],[325.40890969545217],[315.2093480431469]


## Best Model

In [35]:
# Retrieve best model from Pipeline Run
# Fetch the handle of the pipeline's best-model output and download it into
# the current directory; the next cell opens the downloaded file.
best_model_output = remote_run.get_pipeline_output(best_model_output_name)
num_file_downloaded = best_model_output.download('.', show_progress=True)

Downloading azureml/9213a29f-a51b-415d-ba54-7b54b2318e94/model_data
Downloaded azureml/9213a29f-a51b-415d-ba54-7b54b2318e94/model_data, 1 files out of an estimated total of 1


In [37]:
import pickle

# Deserialize the model file downloaded by the previous cell.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts produced by your own, trusted runs.
# NOTE(review): unpickling presumably requires the same package versions that
# were used for training - confirm against the run's conda environment.
with open(best_model_output._path_on_datastore, "rb" ) as f:
    best_model = pickle.load(f)
# Display the loaded model object
best_model

ForecastingPipelineWrapper(pipeline=Pipeline(memory=None,
                                             steps=[('timeseriestransformer',
                                                     TimeSeriesTransformer(featurization_config=None,
                                                                           pipeline_type=<TimeSeriesPipelineType.FULL: 1>)),
                                                    ('prefittedsoftvotingregressor',
                                                     PreFittedSoftVotingRegressor(estimators=[('9',
                                                                                               Pipeline(memory=None,
                                                                                                        steps=[('robustscaler',
                                                                                                                RobustScaler(copy=True,
                                                                       

## Register the best model 

In [42]:
# Path under which the model file is stored in the run's artifacts
model_artifact_path = "outputs/my_model.pickle"

# Attach the downloaded model file to the pipeline run so that it can be
# registered from that run
remote_run.upload_file(model_artifact_path, best_model_output._path_on_datastore)

automl_model = remote_run.register_model(
    model_name='Covid-19-cases-forecaster-2',
    model_path=model_artifact_path,
    description='Forecast the number of cases per 100.000 of infections with covid-19.',
)

print(f"{automl_model.name}\t{automl_model.version}")


Covid-19-cases-forecaster-2	1


## Model Deployment


In [43]:
# First child of the pipeline run, then the first child of that run.
# NOTE(review): "first child" is presumably not guaranteed to be the
# best-scoring AutoML iteration - confirm which run this selects.
step_run = next(remote_run.get_children())
model_run = next(step_run.get_children())

### Download run outputs

In [263]:
# Download the run's files under the 'outputs' prefix into a local 'outputs'
# directory. The following cells read them from 'outputs/outputs/...'.
model_run.download_files(prefix='outputs', output_directory='outputs')

### Correct the scoring_file

The scoring file that we downloaded doesn't work well for our case. To fix it, replace this part:
```python
        y_query = None
        if 'y_query' in data.columns:
            y_query = data.pop('y_query').values
        result = model.forecast(data, y_query)
```
with this part:
```python
        y_query = None
        if 'y_query' in data.columns:
            y_query = data.pop('y_query').values
        else:
            y_query = np.full(len(data.index), np.NaN)
            
        result = model.forecast(data, y_query, ignore_data_errors=True)
```
I had to do it 2 times to get it right, hence the name of the script: `scoring_file_v_1_0_3.py`

In [44]:
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig

# Build the serving environment from the conda specification downloaded
# with the run outputs
myenv = Environment.from_conda_specification(
    name="covid-19",
    file_path="outputs/outputs/conda_env_v_1_0_0.yml",
)

# Use the manually corrected scoring script as the service entry point
inference_config = InferenceConfig(
    entry_script='outputs/outputs/scoring_file_v_1_0_3.py',
    environment=myenv,
)

Possible deployment configuration parameters:
```Python
deploy_configuration(
    cpu_cores=None, 
    memory_gb=None, 
    tags=None, 
    properties=None, 
    description=None, 
    location=None, 
    auth_enabled=None, 
    ssl_enabled=None, 
    enable_app_insights=None, 
    ssl_cert_pem_file=None, 
    ssl_key_pem_file=None, 
    ssl_cname=None, 
    dns_name_label=None, 
    primary_key=None, 
    secondary_key=None, 
    collect_model_data=None, 
    cmk_vault_base_url=None, 
    cmk_key_name=None, 
    cmk_key_version=None, 
    vnet_name=None, 
    subnet_name=None)
```

In [47]:
from azureml.core.webservice import AciWebservice, AksWebservice, LocalWebservice

# Tags attached to the deployed service
service_tags = {'category':'training', 'trait':'ML'}

# ACI deployment: 1 CPU core and 1 GB of memory, with key-based
# authentication and Application Insights enabled
deployment_config = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    auth_enabled=True,
    enable_app_insights=True,
    tags=service_tags,
)

In [48]:
from azureml.core.model import Model

# The model was registered as 'Covid-19-cases-forecaster-2' (capital "C"),
# but this cell used to look it up in all lowercase. Use the exact registered
# name so the lookup does not depend on the registry ignoring case.
registered_model_name = "Covid-19-cases-forecaster-2"

# Name of the web service, unchanged from the original cell.
# NOTE(review): service names are presumably restricted to lowercase - confirm
# before changing this to match the model name.
service_name = "covid-19-cases-forecaster-2"

# Without an explicit version this presumably resolves to the latest
# registered version of the model - confirm if several versions exist.
automl_model = Model(ws, name=registered_model_name)

# Deploy the model with the inference and deployment configurations defined
# in the previous cells, then wait for the deployment and report its state.
service = Model.deploy(ws, service_name, [automl_model], inference_config, deployment_config)
service.wait_for_deployment(show_output = True)
print(service.state)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running....................................................
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


Consuming the service:

In [50]:
import requests
import json

# Measure columns of a scoring record, in the order they are sent.
# Every record carries "country", "week" and one flag per measure below.
MEASURE_COLUMNS = [
    "AdaptationOfWorkplace",
    "AdaptationOfWorkplacePartial",
    "BanOnAllEvents",
    "BanOnAllEventsPartial",
    "ClosDaycare",
    "ClosDaycarePartial",
    "ClosHigh",
    "ClosHighPartial",
    "ClosPrim",
    "ClosPrimPartial",
    "ClosPubAny",
    "ClosPubAnyPartial",
    "ClosSec",
    "ClosSecPartial",
    "ClosureOfPublicTransport",
    "ClosureOfPublicTransportPartial",
    "EntertainmentVenues",
    "EntertainmentVenuesPartial",
    "GymsSportsCentres",
    "GymsSportsCentresPartial",
    "HotelsOtherAccommodation",
    "HotelsOtherAccommodationPartial",
    "IndoorOver100",
    "IndoorOver1000",
    "IndoorOver50",
    "IndoorOver500",
    "MasksMandatoryAllSpaces",
    "MasksMandatoryAllSpacesPartial",
    "MasksMandatoryClosedSpaces",
    "MasksMandatoryClosedSpacesPartial",
    "MasksVoluntaryAllSpaces",
    "MasksVoluntaryAllSpacesPartial",
    "MasksVoluntaryClosedSpaces",
    "MasksVoluntaryClosedSpacesPartial",
    "MassGather50",
    "MassGather50Partial",
    "MassGatherAll",
    "MassGatherAllPartial",
    "NonEssentialShops",
    "NonEssentialShopsPartial",
    "OutdoorOver100",
    "OutdoorOver1000",
    "OutdoorOver50",
    "OutdoorOver500",
    "PlaceOfWorship",
    "PlaceOfWorshipPartial",
    "PrivateGatheringRestrictions",
    "PrivateGatheringRestrictionsPartial",
    "RegionalStayHomeOrder",
    "RegionalStayHomeOrderPartial",
    "RestaurantsCafes",
    "RestaurantsCafesPartial",
    "SocialCircle",
    "SocialCirclePartial",
    "StayHomeGen",
    "StayHomeGenPartial",
    "StayHomeOrder",
    "StayHomeOrderPartial",
    "StayHomeRiskG",
    "StayHomeRiskGPartial",
    "Teleworking",
    "TeleworkingPartial",
    "WorkplaceClosures",
    "WorkplaceClosuresPartial",
]


def build_scoring_record(country, week, overrides=None):
    """Build one scoring record for the deployed forecasting service.

    Args:
        country: Value of the "country" field.
        week: Value of the "week" field, e.g. "2021-02-07T00:00:00".
        overrides: Optional mapping of measure column name to the value to
            send instead of the default "false".

    Returns:
        A dict with "country", "week" and one entry per name in
        MEASURE_COLUMNS, in that order.

    Raises:
        KeyError: If `overrides` contains a name not in MEASURE_COLUMNS.
    """
    overrides = dict(overrides or {})
    unknown = sorted(set(overrides) - set(MEASURE_COLUMNS))
    if unknown:
        raise KeyError(f"Unknown measure column(s): {unknown}")

    record = {"country": country, "week": week}
    for measure in MEASURE_COLUMNS:
        # Flags are sent as the string "false", as in the original payload
        record[measure] = overrides.get(measure, "false")
    return record


# Two consecutive weeks for the Netherlands with every measure switched off
data = {
    "data": [
        build_scoring_record("Netherlands", week)
        for week in ("2021-02-07T00:00:00", "2021-02-14T00:00:00")
    ]
}

input_data = json.dumps(data)
# Score the JSON payload through the SDK's service handle and display the response
service.run(input_data=input_data)

'{"forecast": [516.0555766729951, 518.8032188317503], "index": [{"week": 1612656000000, "country": "Netherlands", "origin": 1612051200000}, {"week": 1613260800000, "country": "Netherlands", "origin": 1612051200000}]}'

Consume the service using a webclient

In [52]:
scoring_uri = service.scoring_uri

# Fetch the authentication key from the service at run time instead of
# hard-coding it in the notebook, where it is readable by anyone who can see
# the file. A key that was previously committed here should be regenerated.
key, _secondary_key = service.get_keys()

headers = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {key}',
}

# Post the same JSON payload that was sent through the SDK handle
resp = requests.post(scoring_uri, input_data, headers=headers)
# Fail loudly on an HTTP error instead of trying to parse an error body
resp.raise_for_status()
print(resp.json())

{"forecast": [516.0555766729951, 518.8032188317503], "index": [{"week": 1612656000000, "country": "Netherlands", "origin": 1612051200000}, {"week": 1613260800000, "country": "Netherlands", "origin": 1612051200000}]}


Getting the logs from the service:

In [51]:
# Fetch the service logs and print them one line at a time
logs = service.get_logs()
print(*logs.split('\n'), sep='\n')

2021-02-05T13:37:01,525690100+00:00 - gunicorn/run 
2021-02-05T13:37:01,551440100+00:00 - rsyslog/run 
2021-02-05T13:37:01,549068700+00:00 - nginx/run 
2021-02-05T13:37:01,573898300+00:00 - iot-server/run 
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_9c539d20199ae6be65c41c0382029684/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
rsyslogd

Remove the service:

In [6]:
# Delete the deployed web service now that it is no longer needed
service.delete()