# Automated ML



Import Dependencies.

In [3]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.20.0


In [4]:
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

quick-starts-ws-138243
aml-quickstarts-138243
southcentralus
f5091c60-1c3c-430f-8d81-d802f6bf2414


In [5]:
experiment_name = 'ml-experiment-1'
project_folder = './pipeline-project'

experiment = Experiment(ws, experiment_name)
experiment

Name,Workspace,Report Page,Docs Page
ml-experiment-1,quick-starts-ws-138243,Link to Azure Machine Learning studio,Link to Documentation


In [6]:
from azureml.core.compute import ComputeTarget,AmlCompute
from azureml.core.compute_target import ComputeTargetException

# NOTE: update the cluster name to match the existing cluster
# Choose a name for your CPU cluster
amlcompute_cluster_name = "nueva"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',# for GPU, use "STANDARD_NC6"
                                                           #vm_priority = 'lowpriority', # optional
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True, min_node_count = 1, timeout_in_minutes = 10)

Creating
Succeeded................................................................................................................
AmlCompute wait for completion finished

Wait timeout has been reached
Current provisioning state of AmlCompute is "Succeeded" and current node count is "0"


## Dataset

### Overview


The dataset was download from Kaggle: https://www.kaggle.com/thegurusteam/spanish-high-speed-rail-system-ticket-pricing
Then I have to create some new variables for example day of the week, delete empty rows and create a sample dataset with fewer obs. The reason why I had to perform a sample is that azure needs more than one hour to train an automl model when there are 30.000.000 obs.
Then I uploaded the dataset:


In [7]:
found = False
key = "datalite"
description_text = "dataset"

if key in ws.datasets.keys(): 
        found = True
        dataset = ws.datasets[key] 

if not found:
        # Create AML Dataset and register it into Workspace
        print("error")


df = dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,duration,departureDay,departureMonth,ALICANTE,BARCELONA,CADIZ,CASTELLO,CASTELLON,CIUDAD REAL,CORDOBA,...,4,5,6,earlyMorning,lateNight,midday,midmorning,morning,night,y
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2.914604,16.0444,5.5842,0.0058,0.1298,0.0006,0.0008,0.0042,0.0014,0.0274,...,0.1448,0.1104,0.1308,0.0074,0.1824,0.2352,0.1778,0.1972,0.047,58.024988
std,1.601784,8.725987,2.273575,0.075944,0.336117,0.02449,0.028276,0.064678,0.037394,0.163262,...,0.351934,0.313419,0.337215,0.085713,0.386212,0.424166,0.382383,0.397924,0.21166,25.032202
min,0.38,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.45
25%,1.87,9.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.7
50%,2.53,16.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.7
75%,3.13,24.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,75.4
max,11.52,31.0,12.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,183.5


## AutoML Configuration



I have choosen regression as task because I need to predic a continue variable. I choose normalized_root_mean_squared_error as primary metric because it is one of the metric that a regression model try to minimize.

In [9]:


automl_settings = {
    "experiment_timeout_minutes": 20,
    "max_concurrent_iterations": 5,
    "primary_metric" : 'normalized_root_mean_squared_error',
    "task": "regression",
    "training_data":dataset,
    "label_column_name":"y",   
    "path" : project_folder,
    "enable_early_stopping": True,
                             "featurization": 'auto',
                             "debug_log":"automl_errors.log",
}

# TODO: Put your automl config here
automl_config = AutoMLConfig(
    compute_target=compute_target,
    **automl_settings)

## Run Details

In [10]:
# use the `RunDetails` widget to show the different experiments.

In [11]:
pipeline_run = experiment.submit(automl_config)

Running on remote.


In [13]:
from azureml.widgets import RunDetails
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…


Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Cross validation
STATUS:       DONE
DESCRIPTION:  Each iteration of the trained model was validated through cross-validation.
              
DETAILS:      
+---------------------------------+
|Number of folds                  |
|3                                |
+---------------------------------+

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS: 

{'runId': 'AutoML_3687d5b9-ac3f-4b63-aeb7-53bdd3637de9',
 'target': 'nueva',
 'status': 'Completed',
 'startTimeUtc': '2021-02-09T08:33:59.386819Z',
 'endTimeUtc': '2021-02-09T09:01:05.079715Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'normalized_root_mean_squared_error',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'nueva',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"0d05fbbc-1330-4e17-83cf-7654307e62dc\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"UI/02-09-2021_081917_UTC/dataLite.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-138243\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"f5091c60-1c3c-430f-8d81

## Best Model

In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [14]:
best_run, best_model = pipeline_run.get_output()

Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


In [15]:
print(best_model)

RegressionPipeline(pipeline=Pipeline(memory=None,
                                     steps=[('datatransformer',
                                             DataTransformer(enable_dnn=None,
                                                             enable_feature_sweeping=None,
                                                             feature_sweeping_config=None,
                                                             feature_sweeping_timeout=None,
                                                             featurization_config=None,
                                                             force_text_dnn=None,
                                                             is_cross_validation=None,
                                                             is_onnx_compatible=None,
                                                             logger=None,
                                                             observer=None,
                                         

In [16]:
print(best_model._final_estimator)

PreFittedSoftVotingRegressor(estimators=[('0',
                                          Pipeline(memory=None,
                                                   steps=[('maxabsscaler',
                                                           MaxAbsScaler(copy=True)),
                                                          ('lightgbmregressor',
                                                           LightGBMRegressor(boosting_type='gbdt',
                                                                             class_weight=None,
                                                                             colsample_bytree=1.0,
                                                                             importance_type='split',
                                                                             learning_rate=0.1,
                                                                             max_depth=-1,
                                                                  

In [17]:
best_model.steps

[('datatransformer',
  DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                  feature_sweeping_config=None, feature_sweeping_timeout=None,
                  featurization_config=None, force_text_dnn=None,
                  is_cross_validation=None, is_onnx_compatible=None, logger=None,
                  observer=None, task=None, working_dir=None)),
 ('prefittedsoftvotingregressor',
  PreFittedSoftVotingRegressor(estimators=[('0',
                                            Pipeline(memory=None,
                                                     steps=[('maxabsscaler',
                                                             MaxAbsScaler(copy=True)),
                                                            ('lightgbmregressor',
                                                             LightGBMRegressor(boosting_type='gbdt',
                                                                               class_weight=None,
                             

In [18]:
print(best_run)

Run(Experiment: ml-experiment-1,
Id: AutoML_3687d5b9-ac3f-4b63-aeb7-53bdd3637de9_38,
Type: azureml.scriptrun,
Status: Completed)


## Model Deployment

Remember you have to deploy only one of the two models you trained.. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [19]:
import joblib
joblib.dump(value=best_model, filename='bestautomlmodel.pkl')

['bestautomlmodel.pkl']

In [20]:
from azureml.core.model import Model

In [21]:
model = Model.register(workspace=ws,model_name = "bestautomlmodel",
                       model_path = 'bestautomlmodel.pkl')

Registering model bestautomlmodel


In [22]:
from azureml.automl.core.shared import constants
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig
best_run.download_file(constants.CONDA_ENV_FILE_PATH, 'condaEnv.yml')
env = Environment.from_conda_specification('condaEnv', 'condaEnv.yml')

inference_config = InferenceConfig(entry_script='score.py', environment=env)


In [23]:
from azureml.core.webservice import LocalWebservice, AciWebservice
aci_config=AciWebservice.deploy_configuration(cpu_cores=1,memory_gb=1)




aci_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1, 
                                                auth_enabled=True)
service = model.deploy(workspace=ws,name="aml",
                       models=[model],inference_config=inference_config,
                       deployment_config=aci_config)



In [24]:
service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running.......................................................................................................................................................................................
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [25]:
service.update(enable_app_insights=True)

In [26]:
print(service.state)

Healthy


In [27]:
print(service.scoring_uri)

http://910f4960-da0f-46bd-b9cc-174d4b9e1963.southcentralus.azurecontainer.io/score


# save enviroment

In [33]:
env.register(ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/intelmpi2018.3-ubuntu16.04:20210104.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "AZUREML_ENTRY_SCRIPT": "score.py",
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "condaEnv",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "

In [38]:
env.save_to_directory('envi', overwrite=True)

# load environment

In [39]:
newenv = Environment.load_from_directory(path="envi")


# test endpoint

In [40]:
import urllib.request
import json
import os
import ssl

def allowSelfSignedHttps(allowed):
    # bypass the server certificate verification on client side
    if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
        ssl._create_default_https_context = ssl._create_unverified_context

allowSelfSignedHttps(True) # this line is needed if you use self-signed certificate in your scoring service.

data = {
    "data":
    [
        {
            '1': "0",
            '2': "0",
            '3': "0",
            '4': "0",
            '5': "0",
            '6': "0",
            'duration': "0",
            'departureDay': "0",
            'departureMonth': "0",
            'ALICANTE': "0",
            'BARCELONA': "0",
            'CADIZ': "0",
            'CASTELLO': "0",
            'CASTELLON': "0",
            'CIUDAD REAL': "0",
            'CORDOBA': "0",
            'CUENCA': "0",
            'CÓRDOBA': "0",
            'GIRONA': "0",
            'GRANADA': "0",
            'GUADALAJARA': "0",
            'HUESCA': "0",
            'LEON': "0",
            'LEÓN': "0",
            'LLEIDA': "0",
            'MADRID': "0",
            'MALAGA': "0",
            'MÁLAGA': "0",
            'PALENCIA': "0",
            'PONFERRADA': "0",
            'SEGOVIA': "0",
            'SEVILLA': "0",
            'TARRAGONA': "0",
            'TOLEDO': "0",
            'VALENCIA': "0",
            'VALLADOLID': "0",
            'ZAMORA': "0",
            'ZARAGOZA': "0",
            'ALICANTE_1': "0",
            'BARCELONA_2': "example_value",
            'CADIZ_3': "example_value",
            'CASTELLO_4': "example_value",
            'CASTELLON_5': "example_value",
            'CIUDAD REAL_6': "example_value",
            'CORDOBA_7': "example_value",
            'CUENCA_8': "example_value",
            'CÓRDOBA_9': "example_value",
            'GIRONA_10': "example_value",
            'GRANADA_11': "example_value",
            'GUADALAJARA_12': "example_value",
            'HUESCA_13': "example_value",
            'LEON_14': "example_value",
            'LEÓN_15': "example_value",
            'LLEIDA_16': "example_value",
            'MADRID_17': "example_value",
            'MALAGA_18': "example_value",
            'MÁLAGA_19': "example_value",
            'PALENCIA_20': "example_value",
            'PONFERRADA_21': "example_value",
            'SEGOVIA_22': "example_value",
            'SEVILLA_23': "example_value",
            'TARRAGONA_24': "example_value",
            'TOLEDO_25': "example_value",
            'VALENCIA_26': "example_value",
            'VALLADOLID_27': "example_value",
            'ZAMORA_28': "example_value",
            'ZARAGOZA_29': "example_value",
            'ALVIA': "0",
            'AV City': "0",
            'AVANT': "0",
            'AVANT-AVE': "0",
            'AVE': "0",
            'AVE-AVANT': "0",
            'AVE-AVE': "0",
            'AVE-LD': "0",
            'AVE-MD': "0",
            'AVE-TGV': "0",
            'AVLO': "0",
            'EUROMED': "0",
            'INTERCITY': "0",
            'Intercity': "0",
            'LD': "0",
            'LD-AVANT': "0",
            'LD-AVE': "0",
            'LD-MD': "0",
            'MD': "0",
            'MD-AVANT': "0",
            'MD-AVE': "0",
            'MD-LD': "0",
            'R. EXPRES': "0",
            'REG.EXP.': "0",
            'REGIONAL': "0",
            'TORRE ORO': "0",
            'TRENHOTEL': "0",
            'earlyMorning': "0",
            'lateNight': "0",
            'midday': "0",
            'midmorning': "0",
            'morning': "0",
            'night': "0",
        },
    ],
}

body = str.encode(json.dumps(data))

url = 'http://910f4960-da0f-46bd-b9cc-174d4b9e1963.southcentralus.azurecontainer.io/score'
api_key = 'EetMUputzoyjgmaIt8FLc5uP8bwVmWNp' # Replace this with the API key for the web service
headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)}

req = urllib.request.Request(url, body, headers)

try:
    response = urllib.request.urlopen(req)

    result = response.read()
    print(result)
except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the requert ID and the timestamp, which are useful for debugging the failure
    print(error.info())
    print(json.loads(error.read().decode("utf8", 'ignore')))

    
from datetime import datetime
datetime.now().strftime('%H:%M:%S')

b'[73.19211735441476]'


'09:56:31'