# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [3]:
import logging
import os
import joblib
import math
import pandas as pd
import datetime
import azureml.core
import azureml.automl

from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

from azureml.widgets import RunDetails

from azureml.core.environment import Environment
from azureml.automl.core.shared import constants

from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core.model import Model

from inference.utils import align_outputs

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.42.0


## Dataset

### Overview
TODO: In this markdown cell, give an overview of the dataset you are using. Also mention the task you will be performing.


TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

In [4]:
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

# choose a name for experiment
experiment_name = 'automl-forecast'

experiment=Experiment(ws, experiment_name)

quick-starts-ws-199690
aml-quickstarts-199690
southcentralus
3d1a56d2-7c81-4118-9790-f85d1acf0c77


In [5]:
# Create or attach a compute cluster
cluster_name = "canina-cluster"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('This compute target already exists.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size="Standard_D2_V2",
                                                          max_nodes=4)
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    print('A new compute target has been created.')
    
compute_target.wait_for_completion(show_output=True, min_node_count=0, timeout_in_minutes=10)

A new compute target has been created.
InProgress.
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [6]:
DATA_DIR = os.path.join(os.path.abspath(os.getcwd()), "ojdata")
LAST_WEEK = 138
NUM_WEEK_TEST = 3
time_column_name = "week_start"
grain_column_names = ["store", "brand"]
target = "move"

# The start datetime of the first week in the record
FIRST_WEEK_START = pd.to_datetime("1989-09-14 00:00:00")

df = pd.read_csv(os.path.join(DATA_DIR, "yx.csv"))
df = df.loc[df.week <= LAST_WEEK]

In [7]:
# Convert logarithm of the unit sales to unit sales
df["move"] = df["logmove"].apply(lambda x: round(math.exp(x)))

# Add timestamp column
df["week_start"] = df["week"].apply(lambda x: FIRST_WEEK_START + datetime.timedelta(days=(x - 1) * 7))

In [8]:
# Split data into training and test sets

def split_last_n_by_grain(df, n):
    """Group df by grain and split on last n rows for each group."""
    df_grouped = df.sort_values(time_column_name).groupby(
        grain_column_names, group_keys=False
    )
    df_head = df_grouped.apply(lambda dfg: dfg.iloc[:-n])
    df_tail = df_grouped.apply(lambda dfg: dfg.iloc[-n:])
    return df_head, df_tail


train_df, test_df = split_last_n_by_grain(df, NUM_WEEK_TEST)
train_df.drop("logmove", axis=1, inplace=True)
train_df.reset_index(drop=True)
test_df.reset_index(drop=True)

local_data_paths = [
    os.path.join(DATA_DIR, "train_automl.csv"),
    os.path.join(DATA_DIR, "test_automl.csv")
]

train_df.to_csv(local_data_paths[0], index=None, header=True)
test_df.to_csv(local_data_paths[1], index=None, header=True)

In [9]:
# Upload data to datastore

ds = ws.get_default_datastore()
ds.upload_files(
    files=local_data_paths, 
    target_path="dataset/", 
    overwrite=True, 
    show_progress=False)

"datastore.upload_files" is deprecated after version 1.0.69. Please use "FileDatasetFactory.upload_directory" instead. See Dataset API change notice at https://aka.ms/dataset-deprecation.


$AZUREML_DATAREFERENCE_632e3f4bd8e6464aa470496000fe0f24

In [10]:
train_dataset = Dataset.Tabular.from_delimited_files(path=ds.path("dataset/train_automl.csv"))

In [11]:
# Visualize the first five lines of the training dataset

train_dataset.to_pandas_dataframe().head(5)

Unnamed: 0,store,brand,week,constant,price1,price2,price3,price4,price5,price6,price7,price8,price9,price10,price11,deal,feat,profit,move,week_start
0,2,1,40,1,0.060469,0.060497,0.042031,0.029531,0.049531,0.053021,0.038906,0.041406,0.028906,0.024844,0.038984,1,0.0,37.992326,8256,1990-06-14
1,2,1,46,1,0.060469,0.060312,0.045156,0.046719,0.049531,0.047813,0.045781,0.027969,0.042969,0.042031,0.038984,0,0.0,30.126667,6144,1990-07-26
2,2,1,47,1,0.060469,0.060312,0.045156,0.046719,0.037344,0.053021,0.045781,0.041406,0.048125,0.032656,0.038984,0,0.0,30.0,3840,1990-08-02
3,2,1,48,1,0.060469,0.060312,0.049844,0.037344,0.049531,0.053021,0.045781,0.041406,0.042344,0.032656,0.038984,0,0.0,29.95,8000,1990-08-09
4,2,1,50,1,0.060469,0.060312,0.043594,0.031094,0.049531,0.053021,0.046648,0.041406,0.042344,0.032656,0.038203,0,0.0,29.92,8896,1990-08-23


## AutoML Configuration

TODO: Explain why you chose the automl settings and cofiguration you used below.

In [12]:
# Forecast settings
forecast_settings = {
    "time_column_name": time_column_name,
    "grain_column_names": grain_column_names,
    "max_horizon": NUM_WEEK_TEST
}
# Automl settings 
automl_settings = {
    "experiment_timeout_hours" : 1.2,
    "primary_metric" : "normalized_mean_absolute_error",
    "n_cross_validations" : 3
}

# Automl config 
automl_config = AutoMLConfig(
    task="forecasting",
    debug_log="automl_errors.log",
    training_data=train_dataset,
    label_column_name=target,
    compute_target=compute_target,
    enable_early_stopping=True,
    verbosity=logging.INFO,
    **automl_settings,
    **forecast_settings
)

In [13]:
# Submit the experiment
remote_run = experiment.submit(automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on canina-cluster with default configuration
Running on remote compute: canina-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
automl-forecast,AutoML_fad18826-43dd-4bbc-b521-accc8cb32cb6,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: DatasetFeaturization. Beginning to featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Time Series ID detection
STATUS:       PASSED
DESCRIPTION:  The data set was analyzed, and no duplicate time index were detected.
              Learn more about time-series f

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [14]:
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [15]:
remote_run.wait_for_completion(show_output=False)

{'runId': 'AutoML_fad18826-43dd-4bbc-b521-accc8cb32cb6',
 'target': 'canina-cluster',
 'status': 'Completed',
 'startTimeUtc': '2022-06-28T13:33:38.297342Z',
 'endTimeUtc': '2022-06-28T15:01:08.714985Z',
 'services': {},
   'message': 'Experiment timeout reached, hence experiment stopped. Current experiment timeout: 1 hour(s) 12 minute(s)'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'normalized_mean_absolute_error',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '3',
  'target': 'canina-cluster',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"e09edf01-29d1-4350-a2a5-a238b286c56a\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': 'False',
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'forecasting',
  'dependencies_versions': '{"azureml-widgets": "1.42.0", "azureml-training-tabular": "1.42.0", "azureml-train": "1.42.

## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [16]:
# Get the best model
best_run_automl, best_automl_model = remote_run.get_output()

In [17]:
# List all files to see the saved model as "model.pkl"
best_run_automl.get_file_names()

['automl_driver.py',
 'explanation/e007a503/expected_values.interpret.json',
 'explanation/e007a503/features.interpret.json',
 'explanation/e007a503/global_names/0.interpret.json',
 'explanation/e007a503/global_rank/0.interpret.json',
 'explanation/e007a503/global_values/0.interpret.json',
 'explanation/e007a503/local_importance_values.interpret.json',
 'explanation/e007a503/rich_metadata.interpret.json',
 'explanation/e007a503/visualization_dict.interpret.json',
 'explanation/e007a503/ys_pred_viz.interpret.json',
 'explanation/e9485e95/eval_data_viz.interpret.json',
 'explanation/e9485e95/expected_values.interpret.json',
 'explanation/e9485e95/features.interpret.json',
 'explanation/e9485e95/global_names/0.interpret.json',
 'explanation/e9485e95/global_rank/0.interpret.json',
 'explanation/e9485e95/global_values/0.interpret.json',
 'explanation/e9485e95/local_importance_values.interpret.json',
 'explanation/e9485e95/rich_metadata.interpret.json',
 'explanation/e9485e95/visualization_d

In [23]:
if "outputs" not in os.listdir():
    os.mkdir("./outputs")

In [24]:
# Download the best AutoML model
id_file = 30
best_run_automl.download_file(best_run_automl.get_file_names()[id_file],
                                 output_file_path = './outputs/')

In [25]:
print("Model ID", remote_run.model_id)

Model ID None


In [26]:
run_metrics = best_run_automl.get_metrics()
for m, val in run_metrics.items():
    print(f"{m}: {val}")

normalized_root_mean_squared_log_error: 0.1945550504526623
normalized_mean_absolute_error: 0.10960286633862626
explained_variance: 0.08800651717195869
normalized_root_mean_squared_error: 0.14071916098297987
median_absolute_error: 2263.019157560532
spearman_correlation: 0.7565418928644784
normalized_median_absolute_error: 0.07499737912954695
mean_absolute_error: 9682.832564720467
mean_absolute_percentage_error: 95.58172663345624
root_mean_squared_error: 31927.000301509164
root_mean_squared_log_error: 0.8695950502106419
r2_score: 0.07548594845059264
residuals: aml://artifactId/ExperimentRun/dcid.AutoML_fad18826-43dd-4bbc-b521-accc8cb32cb6_20/residuals
predicted_true: aml://artifactId/ExperimentRun/dcid.AutoML_fad18826-43dd-4bbc-b521-accc8cb32cb6_20/predicted_true
forecast_table: aml://artifactId/ExperimentRun/dcid.AutoML_fad18826-43dd-4bbc-b521-accc8cb32cb6_20/forecast_table


In [27]:
best_run_automl.get_tags()

{'_aml_system_azureml.automlComponent': 'AutoML',
 '_aml_system_ComputeTargetStatus': '{"AllocationState":"steady","PreparingNodeCount":0,"RunningNodeCount":1,"CurrentNodeCount":1}',
 'mlflow.source.type': 'JOB',
 'mlflow.source.name': 'automl_driver.py',
 '_aml_system_codegen': 'completed',
 '_aml_system_automl_is_child_run_end_telemetry_event_logged': 'True',
 'model_explain_run_id': 'AutoML_fad18826-43dd-4bbc-b521-accc8cb32cb6_ModelExplain',
 'model_explanation': 'True'}

In [28]:
print(best_automl_model.steps)

[('timeseriestransformer', TimeSeriesTransformer(country_or_region=None, drop_column_names=[], featurization_config=FeaturizationConfig(blocked_transformers=None, column_purposes=None, dataset_language=None, prediction_transform_type=None, transformer_params=None), force_time_index_features=None, freq='W-THU', grain_column_names=['store', 'brand'], group=None, lookback_features_removed=False, max_horizon=3, origin_time_colname='origin', pipeline=Pipeline(memory=None, steps=[('make_numeric_na_dummies', MissingDummiesTransformer(numerical_columns=['week', 'constant', 'price1', 'price2', 'price3', 'price4', 'price5', 'price6', 'price7', 'price8', 'price9', 'price10', 'price11', 'deal', 'feat', 'profit'])), ('impute_na_numeric_datetime', TimeSeriesImputer(end=None, freq='W-THU', impute_by_horizon=False, input_column=['week', 'constant', 'price1', 'price2', 'price3', 'price4', 'price5', 'price6', 'price7', 'price8', 'price9', 'price10', 'price11', 'deal', 'feat', 'profit'], limit=None, limi

## Model Deployment

Remember you have to deploy only one of the two models you trained but you still need to register both the models. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [74]:
# Register the best AutoML model
automl_model_name = best_run_automl.properties["model_name"]
model_reg = remote_run.register_model(model_name = automl_model_name,
                                  description = "AutoML model for forecasting.",
                                  tags = None)

In [79]:
automl_model_name

'AutoMLfad18826420'

In [75]:
model_reg

Model(workspace=Workspace.create(name='quick-starts-ws-199690', subscription_id='3d1a56d2-7c81-4118-9790-f85d1acf0c77', resource_group='aml-quickstarts-199690'), name=AutoMLfad18826420, id=AutoMLfad18826420:3, version=3, tags={}, properties={})

In [76]:
model_reg.download(target_dir='outputs_automl', exist_ok=True)

'outputs_automl/model.pkl'

In [77]:
print("Model ID", remote_run.model_id)

Model ID AutoMLfad18826420


In [104]:
model = Model(ws,automl_model_name, version=1)

In [32]:
# Create a file containing the environment details
best_run_automl.download_file(constants.CONDA_ENV_FILE_PATH, 'myenv.yml')
myenv = Environment.from_conda_specification(name="myenv", file_path="myenv.yml")

In [105]:
inference_config = InferenceConfig(entry_script="score.py", environment=myenv)
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    description="AutoML model to forecast Orange Juice data"
)

service = Model.deploy(workspace=ws,
             name="automl", 
             models=[model], 
             inference_config=inference_config, 
             deployment_config=aciconfig)

In [106]:
service.wait_for_deployment(True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2022-06-28 19:15:23+00:00 Creating Container Registry if not exists.
2022-06-28 19:15:23+00:00 Registering the environment.
2022-06-28 19:15:24+00:00 Generating deployment configuration..
2022-06-28 19:19:51+00:00 Submitting deployment to compute..
2022-06-28 19:20:19+00:00 Checking the status of deployment automl..
2022-06-28 19:25:10+00:00 Checking the status of inference endpoint automl.
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [107]:
print(service.state)

Healthy


TODO: In the cell below, send a request to the web service you deployed to test it.

In [36]:
# Prepare data to test it on the web service
X = test_df
ground_truth = test_df.pop(target).values

In [40]:
# Visualize the first five lines of the test data
X.head(5)

Unnamed: 0,store,brand,week,logmove,constant,price1,price2,price3,price4,price5,price6,price7,price8,price9,price10,price11,deal,feat,profit,week_start
85,2,1,136,8.59,1,0.05,0.05,0.05,0.05,0.04,0.05,0.03,0.04,0.03,0.02,0.03,0,0.0,33.54,1992-04-16
86,2,1,137,9.19,1,0.04,0.05,0.05,0.04,0.03,0.04,0.03,0.04,0.04,0.02,0.03,0,0.0,20.43,1992-04-23
87,2,1,138,9.74,1,0.04,0.04,0.05,0.04,0.04,0.05,0.04,0.04,0.04,0.03,0.03,1,1.0,11.29,1992-04-30
195,2,2,136,9.14,1,0.05,0.05,0.05,0.05,0.04,0.05,0.03,0.04,0.03,0.02,0.03,1,0.0,27.13,1992-04-16
196,2,2,137,8.74,1,0.04,0.05,0.05,0.04,0.03,0.04,0.03,0.04,0.04,0.02,0.03,0,0.0,33.3,1992-04-23


In [111]:
test_X = X.iloc[:1,:]

In [112]:
# Save the test data "X" as a json file "data.json"
raw_data = './data.json'
test_X.to_json('./data.json')

In [113]:
forecasts, X_future = service.run(input_data=raw_data)

ERROR:azureml.core.webservice.aci:Received bad response from service. More information can be found by calling `.get_logs()` on the webservice object.
Response Code: 502
Headers: {'Connection': 'keep-alive', 'Content-Length': '24', 'Content-Type': 'text/html; charset=utf-8', 'Date': 'Tue, 28 Jun 2022 19:50:10 GMT', 'Server': 'nginx/1.18.0 (Ubuntu)', 'X-Ms-Request-Id': 'c6ff49bb-e4a6-4c91-abd2-1013defadc16', 'X-Ms-Run-Function-Failed': 'True'}
Content: b'Expected object or value'



WebserviceException: WebserviceException:
	Message: Received bad response from service. More information can be found by calling `.get_logs()` on the webservice object.
Response Code: 502
Headers: {'Connection': 'keep-alive', 'Content-Length': '24', 'Content-Type': 'text/html; charset=utf-8', 'Date': 'Tue, 28 Jun 2022 19:50:10 GMT', 'Server': 'nginx/1.18.0 (Ubuntu)', 'X-Ms-Request-Id': 'c6ff49bb-e4a6-4c91-abd2-1013defadc16', 'X-Ms-Run-Function-Failed': 'True'}
Content: b'Expected object or value'
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Received bad response from service. More information can be found by calling `.get_logs()` on the webservice object.\nResponse Code: 502\nHeaders: {'Connection': 'keep-alive', 'Content-Length': '24', 'Content-Type': 'text/html; charset=utf-8', 'Date': 'Tue, 28 Jun 2022 19:50:10 GMT', 'Server': 'nginx/1.18.0 (Ubuntu)', 'X-Ms-Request-Id': 'c6ff49bb-e4a6-4c91-abd2-1013defadc16', 'X-Ms-Run-Function-Failed': 'True'}\nContent: b'Expected object or value'"
    }
}

In [None]:
df_result = align_outputs(forecasts, X_future, X, ground_truth, target)

In [None]:
df_result.head(5)

TODO: In the cell below, print the logs of the web service and delete the service

In [None]:
# Enable application insights
service.update(enable_app_insights=True)

In [114]:
service.get_logs()

'/bin/bash: /azureml-envs/azureml_7becf92bd9d8786204e7278bb441885c/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n/bin/bash: /azureml-envs/azureml_7becf92bd9d8786204e7278bb441885c/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n/bin/bash: /azureml-envs/azureml_7becf92bd9d8786204e7278bb441885c/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n/bin/bash: /azureml-envs/azureml_7becf92bd9d8786204e7278bb441885c/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n2022-06-28T19:22:27,658227000+00:00 - gunicorn/run \n2022-06-28T19:22:27,659829400+00:00 - iot-server/run \n2022-06-28T19:22:27,651695200+00:00 - rsyslog/run \nbash: /azureml-envs/azureml_7becf92bd9d8786204e7278bb441885c/lib/libtinfo.so.6: no version information available (required by bash)\n2022-06-28T19:22:27,669996900+00:00 | gunicorn/run | \n2022-06-28T19:22:27,676337500+00:00 - nginx/run \n2022-06-28T19:22:27,685858100

In [115]:
# Print the logs of the Web service
logs = service.get_logs()
for line in logs.split('\n'):
    print(line)

/bin/bash: /azureml-envs/azureml_7becf92bd9d8786204e7278bb441885c/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/bin/bash: /azureml-envs/azureml_7becf92bd9d8786204e7278bb441885c/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/bin/bash: /azureml-envs/azureml_7becf92bd9d8786204e7278bb441885c/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/bin/bash: /azureml-envs/azureml_7becf92bd9d8786204e7278bb441885c/lib/libtinfo.so.6: no version information available (required by /bin/bash)
2022-06-28T19:22:27,658227000+00:00 - gunicorn/run 
2022-06-28T19:22:27,659829400+00:00 - iot-server/run 
2022-06-28T19:22:27,651695200+00:00 - rsyslog/run 
bash: /azureml-envs/azureml_7becf92bd9d8786204e7278bb441885c/lib/libtinfo.so.6: no version information available (required by bash)
2022-06-28T19:22:27,669996900+00:00 | gunicorn/run | 
2022-06-28T19:22:27,676337500+00:00 - nginx/run 
2022-06-28T19:22:27,685858100+00:00 | gu

In [None]:
# Remove the web service
service.delete()

# Remove the compute cluster
compute_target.delete()

**Submission Checklist**
- I have registered the model.
- I have deployed the model with the best accuracy as a webservice.
- I have tested the webservice by sending a request to the model endpoint.
- I have deleted the webservice and shutdown all the computes that I have used.
- I have taken a screenshot showing the model endpoint as active.
- The project includes a file containing the environment details.
