# Automated ML


In [6]:
import pandas as pd
import numpy as np
import json
import joblib
import json
import requests
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import OneHotEncoder

In [7]:
from azureml.train.automl import AutoMLConfig
from azureml.core.run import Run
from azureml.widgets import RunDetails
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core import Dataset
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.webservice import Webservice
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.model import InferenceConfig

In [11]:
ws = Workspace.from_config()
experiment_name = 'Bankrupty_Automl'
experiment = Experiment(ws, experiment_name)

run = experiment.start_logging()

In [10]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your CPU cluster
cpu_cluster_name = "cpu-cluster-4"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=6)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)
#compute_target.wait_for_completion(show_output=True, min_node_count = 1, timeout_in_minutes = 10)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Dataset

### Overview
* The data were collected from the Taiwan Economic Journal for the years 1999 to 2009. Company bankruptcy was defined based on the business regulations of the Taiwan Stock Exchange. This dataset has 6219 example for training. The dataset of 95 different financial ratios as features. This dataset is avaialable at kaggel.  

* This is a binary classification task where our target column is Bankrupt which may have the value 1 in case the company goes bankrupt 0 in case the company does not goes bankrupt.


In [12]:
from azureml.core import Dataset
ds = Dataset.get_by_name(ws, name = "bankruptcy_dataset")
df = ds.to_pandas_dataframe()
df.head(3)

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,operating gross margin,realized sales gross margin,operating profit rate,tax Pre-net interest rate,after-tax net interest rate,non-industry income and expenditure/revenue,...,net income to total assets,total assets to GNP price,No-credit interval,Gross profit to Sales,Net income to stockholder's Equity,liability to equity,Degree of financial leverage (DFL),Interest coverage ratio( Interest expense to EBIT ),one if net income was negative for the last two year zero otherwise,equity to liability
0,1,0.370594257300249,0.424389446140427,0.40574977247176,0.601457213277793,0.601457213277793,0.998969203197885,0.796887145860514,0.808809360876843,0.302646433889668,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464290937454297,0.53821412996075,0.516730017666899,0.610235085544617,0.610235085544617,0.998945978205482,0.797380191277827,0.809300725667939,0.303556430290771,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071271876371,0.499018752725687,0.472295090743616,0.601450006486113,0.601363524985947,0.998857353483229,0.796403369254357,0.808387521469543,0.302035177342951,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474



## AutoML Configuration

AutoML is a powerful tool that enables us to find the best model quickly. For my automl run I've used the following settings & configurations to find best combination of algorithms & hyperparameters:
* The primary metric is set to *accuracy*.
* The task is set to *classification* because we aim to get a binary result either 1 or 0, death or no death.
* We use the traing data we got from the dataset & we define the target column.
* Logs have been generated for debugging reasons. 
* Auto featurization is enabled, featurization includes automated feature engineering and scaling and normalization, which then impacts the selected algorithm and its hyperparameter values.
* Early stopping is enabled to save computational power. 
* Number of cross validation is set to 3.



In [13]:
# automl settings
automl_settings = {
    "experiment_timeout_minutes": 30,
    "max_concurrent_iterations": 5,
    "primary_metric" : 'average_precision_score_weighted'}

# automl config
automl_config = AutoMLConfig(compute_target = compute_target,
                             task = "classification",
                             training_data=ds,
                             label_column_name="Bankrupt?",   
                             featurization= 'auto',
                             n_cross_validations = 4,
                            enable_early_stopping= True,
                             debug_log = "automl_logs.log",
                             **automl_settings)

In [14]:
# Submitting the experiment
remote_run = experiment.submit(automl_config, show_output=True)

Running on remote.
No run_configuration provided, running on cpu-cluster-4 with default configuration
Running on remote compute: cpu-cluster-4
Parent Run ID: AutoML_9fc44e25-e1ba-4374-a61c-1596b51212d6

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a

## Run Details

In [15]:
RunDetails(remote_run).show()
remote_run.wait_for_completion(show_output=True)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…



****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|220                              |1                                |6819                                  |
+---------------------------------+---------------------------------+--------------------------------------+

********************************************

{'runId': 'AutoML_9fc44e25-e1ba-4374-a61c-1596b51212d6',
 'target': 'cpu-cluster-4',
 'status': 'Completed',
 'startTimeUtc': '2021-02-03T14:20:30.868171Z',
 'endTimeUtc': '2021-02-03T14:51:27.968111Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'average_precision_score_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '4',
  'target': 'cpu-cluster-4',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"14a2e359-79c4-4f22-97b3-970894dec757\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"UI/02-03-2021_021536_UTC/data.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-137092\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"6971f5ac-8af1-

## Best Model

In [16]:
best_run, model = remote_run.get_output()

#Printing the best run
print(model) 
print('\nBest Run Id: ', best_run.id)

#Printing the metric details of the best run
best_run_metrics = best_run.get_metrics()
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)

Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                  min_samples_split=0.15052631578947367,
                                                                                                  min_weight_fraction_leaf=0.0,
                                                                                                  n_estimators=25,
            

In [17]:
#Saving the best model
joblib.dump(model,'outputs/automl_model.pkl')

['outputs/automl_model.pkl']

## Model Deployment

* The AutoML model outperformed the model tuned by HyperDrive so we'll start deploying the best automl model. 

In [18]:
from azureml.core.model import Model
model = Model.register(workspace = ws,
                        model_path ="outputs/automl_model.pkl",
                        model_name = "automl_model")

Registering model automl_model


In [19]:
%%writefile score_automl.py
from azureml.core.model import Model
import numpy as np
import pandas as pd
import joblib
import json
import pickle
import os

def init():
    global model
    model_path = Model.get_model_path("automl_model")
    model = joblib.load(model_path)

def run(raw_data):
    try:
        data = json.loads(raw_data)['data']
        data = pd.DataFrame.from_dict(data)
        result = model.predict(data)
        return result.tolist()
    
    except Exception as ex:
        error = str(ex)
        return error

Writing score_automl.py


In [20]:
env = Environment.get(workspace=ws, name = "AzureML-AutoML")

In [None]:
from azureml.core.model import Model
from azureml.core.webservice import Webservice 
from azureml.core.webservice import  AciWebservice
from azureml.core.conda_dependencies import CondaDependencies
config_aci = AciWebservice.deploy_configuration(cpu_cores=1, 
                                               memory_gb=1, 
                                               enable_app_insights=True, 
                                               auth_enabled=True)



ws = Workspace.from_config()
model = Model(ws, 'automl_model')

inference_config = InferenceConfig(entry_script="score_automl.py", environment=env)

service_name = 'automl1'
service = Model.deploy(workspace=ws, 
                       name=service_name, 
                       models=[model], 
                       inference_config=inference_config, 
                       deployment_config= config_aci)

service.wait_for_deployment(show_output=True)

print(service.state)
print(service.get_logs())

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running...................................

In [None]:
print(service.get_logs())

In [26]:
print("scoring URI: " + service.scoring_uri)

print("Swagger URI: " + service.swagger_uri)

print("Authetication Key: " + service.get_keys()[0])

scoring URI: http://23dfe88f-2b73-49a3-a2e4-9e5526966070.southcentralus.azurecontainer.io/score
Swagger URI: http://23dfe88f-2b73-49a3-a2e4-9e5526966070.southcentralus.azurecontainer.io/swagger.json
Authetication Key: 1oezrbV7Boh9KjvalcYa7hZw3G7hk88p


In [46]:
primary_key, secondary_key = service.get_keys()
print(primary_key,'\n',secondary_key)

DheElutv21WKWW1C9VYbr2l6wzcLkVy4 
 DXCVaeM79fZU2oFeO6AZbbTjDzKtpQ9K


TODO: In the cell below, send a request to the web service you deployed to test it.

In [47]:
key = primary_key
scoringuri = service.scoring_uri

data= { "data":
       [
           {
               'age': 60,
               'anaemia': 245,
               'creatinine_phosphokinase': 0,
               'diabetes': 0,
               'ejection_fraction': 38,
               'high_blood_pressure': 1,
               'platelets': 163000,
               'serum_creatinine': 50,
               'serum_sodium':100,
               'sex':1,
               'smoking':1,
               'time':7
               
               
           }
       ]
    }
input_data = json.dumps(data)

headers = {'Content-Type': 'application/json'}
headers['Authorization'] = f'Bearer {key}'

response = requests.post(scoringuri, input_data, headers = headers)
print(response.text)

[1]


TODO: In the cell below, print the logs of the web service and delete the service

In [49]:
ws = Workspace.from_config()

service = Webservice(name="aml-service", workspace=ws)
service.update(enable_app_insights=True)
logs = service.get_logs()
print(logs)

2021-01-12T02:44:58,371356973+00:00 - iot-server/run 
2021-01-12T02:44:58,371770998+00:00 - gunicorn/run 
2021-01-12T02:44:58,372986871+00:00 - rsyslog/run 
2021-01-12T02:44:58,377098918+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
rsyslogd

In [50]:
#Deleting the service
service.delete()
print(service)

No service with name aml-service found to delete.
AciWebservice(workspace=Workspace.create(name='quick-starts-ws-134413', subscription_id='6b4af8be-9931-443e-90f6-c4c34a1f9737', resource_group='aml-quickstarts-134413'), name=aml-service, image_id=None, compute_type=None, state=ACI, scoring_uri=Failed, tags=None, properties={}, created_by={'hasInferenceSchema': 'False', 'hasHttps': 'False'})


In [51]:
#Deleting the compute cluster once the training process is complete
compute_target.delete()