# Automated ML

In [1]:
# import all the dependencies
import json
import logging

import requests
from azureml.core import Environment
from azureml.core import Workspace, Experiment, Dataset
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.model import InferenceConfig
from azureml.core.model import Model
from azureml.core.webservice import AciWebservice, LocalWebservice
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.utilities import get_primary_metrics
from azureml.widgets import RunDetails

# Workspace handle used by every later cell (compute, datastore, experiment,
# model registration and deployment); built from the workspace configuration.
ws = Workspace.from_config()

In [2]:
# Provision the training compute cluster, reusing it when it already exists.
# Sizing: vm_size "STANDARD_D2_V2", max_nodes no greater than 4.
# source: https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.compute.amlcompute(class)?view=azure-ml-py

cpu_cluster_name = "cpu-cluster"

try:
    # Look the cluster up by name in the workspace
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # Lookup failed: create the cluster from a fresh provisioning configuration
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# In both cases, wait for the cluster and show the provisioning output
cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Dataset

### Overview

This project uses the data from a DrivenData competition - [Pump it Up: Data Mining the Water Table](https://www.drivendata.org/competitions/7/pump-it-up-data-mining-the-water-table/).

The training data is divided into two files: one with the target variable (labels) and one with the other variables (values). The target variable describes the functioning status of each pump (*functional*, *functional needs repair* and *non functional*). Descriptive variables include waterpoint location, its funder, water quality and quantity, waterpoint type, etc.

Because one needs to be logged in to DrivenData in order to access the data, it cannot be downloaded via direct links and was stored as .csv files in the *data* folder. The original data is uploaded to the Azure datastore, merged into a single data set and registered as a dataset.

In [3]:
# Paths of the raw training files; the same relative paths are used locally
# and, after the upload below, inside the datastore.
path_labels = "data/train_labels.csv"
path_values = "data/train_values.csv"

# Upload the local "data" folder to the "data" path of the default datastore
datastore = ws.get_default_datastore()
datastore.upload(src_dir='data', target_path='data', overwrite=True)

# Create one tabular dataset per CSV, each referencing its datastore location
labels_location = (datastore, path_labels)
values_location = (datastore, path_values)
ds_labels = Dataset.Tabular.from_delimited_files(path=[labels_location])
ds_values = Dataset.Tabular.from_delimited_files(path=[values_location])

Uploading an estimated of 2 files
Uploading data/train_labels.csv
Uploaded data/train_labels.csv, 1 files out of an estimated total of 2
Uploading data/train_values.csv
Uploaded data/train_values.csv, 2 files out of an estimated total of 2
Uploaded 2 files


In [4]:
# Preview the first three records of the labels dataset as a pandas DataFrame
ds_labels.take(3).to_pandas_dataframe()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional


In [5]:
# Preview the first three records of the values dataset as a pandas DataFrame
ds_values.take(3).to_pandas_dataframe()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe


In [6]:
# Load both datasets into pandas
df_values = ds_values.to_pandas_dataframe()
df_labels = ds_labels.to_pandas_dataframe()

# Attach the label columns to the feature rows: the labels are indexed by
# 'id' and looked up through the 'id' column of the values frame.
labels_by_id = df_labels.set_index('id')
df_joined = df_values.join(labels_by_id, on='id')

In [7]:
# Inspect the first rows of the joined frame (features plus status_group)
df_joined.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


Basic exploratory data analysis (EDA) was completed by profiling the data in Azure. As a result, some variables are excluded. Deeper EDA and subsequent data wrangling are highly recommended but are omitted for now, as the goal of the project is different.

In [8]:
# Remove the columns excluded after profiling the data.
# errors='ignore' keeps this cell re-runnable: df_joined is rebound to the
# trimmed frame, so on a second execution the columns are already gone and a
# strict drop would raise KeyError.
exclude_vars = ['id', 'recorded_by']
df_joined = df_joined.drop(columns=exclude_vars, errors='ignore')

# store the merged data locally (without the pandas index column)
path_df_joined = "data/train_pump.csv"
df_joined.to_csv(path_df_joined, index=False)

# upload the local files from src_dir to the target_path in datastore
datastore.upload(src_dir='data', target_path='data', overwrite=True)
ds_joined = Dataset.Tabular.from_delimited_files(path=[(datastore, path_df_joined)])

# register dataset; create_new_version=True registers a new version when the
# name is already taken instead of failing
# source: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-create-register-datasets
ds_joined = ds_joined.register(workspace=ws,
                               name='train_pump',
                               description='Training data for the Pump it Up project',
                               create_new_version=True)

Uploading an estimated of 3 files
Uploading data/train_labels.csv
Uploaded data/train_labels.csv, 1 files out of an estimated total of 3
Uploading data/train_values.csv
Uploaded data/train_values.csv, 2 files out of an estimated total of 3
Uploading data/train_pump.csv
Uploaded data/train_pump.csv, 3 files out of an estimated total of 3
Uploaded 3 files


In [9]:
# Experiment under which the AutoML run is submitted
experiment_name = 'pump_up'
experiment = Experiment(workspace=ws, name=experiment_name)

## AutoML Configuration

The problem at hand is a multiclass classification. Because the dataset is imbalanced, the commonly used accuracy metric is not a good choice. Among the performance metrics available for AutoML classification (see below), the weighted AUC was chosen.

Choosing a cloud compute target makes it possible to benefit from greater compute capacity. Enabling early stopping saves computation time on unpromising child runs.

In [10]:
# List the primary metrics available for the 'classification' task;
# one of them is chosen as "primary_metric" in the AutoML settings.
get_primary_metrics('classification')

['AUC_weighted',
 'norm_macro_recall',
 'accuracy',
 'average_precision_score_weighted',
 'precision_score_weighted']

In [11]:
# source: https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-auto-train-models
# AutoML settings.
# "verbosity" takes a level constant from the standard-library `logging`
# module, which therefore has to be imported together with the other
# dependencies (otherwise this cell fails with NameError on a fresh kernel).
automl_settings = {
    "n_cross_validations": 3,
    "primary_metric": 'AUC_weighted',      # weighted AUC, chosen over accuracy
    "enable_early_stopping": True,
    "experiment_timeout_hours": 1.0,
    "max_concurrent_iterations": 4,        # matches max_nodes of the cluster
    "max_cores_per_iteration": -1,         # -1: all available cores, per the AutoMLConfig docs
    "verbosity": logging.INFO,
}

# AutoML configuration: multiclass classification of "status_group" on the
# registered training dataset, executed on the remote compute cluster.
automl_config = AutoMLConfig(
    task='classification',
    compute_target=cpu_cluster,
    training_data=ds_joined,
    label_column_name="status_group",
    **automl_settings
)

In [12]:
# Submit the AutoML configuration to the experiment; the returned run handle
# is used below to monitor the run and to retrieve the best model.
remote_run = experiment.submit(automl_config)

Running on remote.


## Run Details

In [13]:
# Show the run-details widget for the submitted AutoML run
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Best Model

This section shows the best-performing model, then registers and downloads it.

In [29]:
# Retrieve the best automl model
# source: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train

# Block until the remote AutoML run has finished. The run-details widget does
# not wait, so without this the cell only works when executed by hand after
# the run is done, and not under "Restart & Run All".
remote_run.wait_for_completion(show_output=False)

# get_output() returns the best child run together with its fitted model
best_automl_run, automl_model = remote_run.get_output()
automl_model_name = best_automl_run.properties['model_name']
print('Best AutoML model name: ' + automl_model_name,
      'Best AutoML model run: ' + str(best_automl_run),
      'Best AutoML model specification: ' + str(automl_model), sep = '\n\n')

Best AutoML model name: AutoML8095cb95348

Best AutoML model run: Run(Experiment: pump_up,
Id: AutoML_8095cb95-3f69-4245-b118-045f8f840148_48,
Type: azureml.scriptrun,
Status: Completed)

Best AutoML model specification: Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                intercept_scaling=1,
                                                      

In [47]:
# Register the model produced by the AutoML run under a fixed name
automl_model_registered = remote_run.register_model(
    model_name='pump_it_up_automl_model',
)

# Download the registered model into "outputs_automl"; as the last expression
# of the cell, the returned local path is displayed.
automl_model_registered.download(
    target_dir="outputs_automl",
    exist_ok=True,
)

'outputs_automl/model.pkl'

## Model Deployment

The AutoML model showed better performance and is therefore deployed as a web service.

In [95]:
# source: https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-deploy-models-with-aml?view=azure-ml-py

# Registered model to serve
model = Model(ws, 'pump_it_up_automl_model')

# Inference setup: the scoring script run inside the 'AzureML-AutoML' environment
env_deploy = Environment.get(workspace=ws, name='AzureML-AutoML')
inference_config = InferenceConfig(entry_script="score.py", environment=env_deploy)

# Container sizing for the ACI web service, with Application Insights enabled
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=4,
    enable_app_insights=True,
    description='Predict Pump It Up',
)

# Start the deployment; the returned service handle is used by the cells below
service = Model.deploy(
    workspace=ws,
    name='pump-it-up-automl',
    models=[model],
    inference_config=inference_config,
    deployment_config=aciconfig,
)

In [96]:
# Wait for the deployment to finish, showing its progress output
service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running....................................
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [97]:
# Print the scoring endpoint and the swagger document URL, one per line
for service_uri in (service.scoring_uri, service.swagger_uri):
    print(service_uri)

http://c913e7a4-71e0-4b6c-8eb3-ee0063c263ed.eastus2.azurecontainer.io/score
http://c913e7a4-71e0-4b6c-8eb3-ee0063c263ed.eastus2.azurecontainer.io/swagger.json


The deployed endpoint is tested by sending input data to it.

In [99]:
# scoring endpoint
scoring_uri = service.scoring_uri

# One sample record holding the feature columns of the training data
# (i.e. without 'id', 'recorded_by' and the 'status_group' label).
data = {
  "data": [
    {
      "amount_tsh": 0,
      "date_recorded": "2013-02-04 00:00:00,000000",
      "funder": "Dmdd",
      "gps_height": 1996,
      "installer": "DMDD",
      "longitude": 35.2907992,
      "latitude": -4.05969643,
      "wpt_name": "Dinamu Secondary School",
      "num_private": 0,
      "basin": "Internal",
      "subvillage": "Magoma",
      "region": "Manyara",
      "region_code": 21,
      "district_code": 3,
      "lga": "Mbulu",
      "ward": "Bashay",
      "population": 321,
      "public_meeting": "True",
      "scheme_management": "Parastatal",
      "scheme_name": "",
      "permit": "True",
      "construction_year": 2012,
      "extraction_type": "other",
      "extraction_type_group": "other",
      "extraction_type_class": "other",
      "management": "parastatal",
      "management_group": "parastatal",
      "payment": "never pay",
      "payment_type": "never pay",
      "water_quality": "soft",
      "quality_group": "good",
      "quantity": "seasonal",
      "quantity_group": "seasonal",
      "source": "rainwater harvesting",
      "source_type": "rainwater harvesting",
      "source_class": "surface",
      "waterpoint_type": "other",
      "waterpoint_type_group": "other"
    }
  ]
}
# Convert to JSON string
input_data = json.dumps(data)

# Set the content type
headers = {'Content-Type': 'application/json'}

# Make the request. The timeout keeps the cell from hanging forever on an
# unresponsive endpoint, and raise_for_status() turns an HTTP error (e.g. a
# failing scoring script) into an explicit exception instead of a confusing
# JSON decoding error or a silently printed error payload.
resp = requests.post(scoring_uri, data=input_data, headers=headers, timeout=60)
resp.raise_for_status()

# Display the response
print(resp.json())

['functional']


Access the logs of the web service and clean up resources (web service and compute cluster).

In [100]:
# Print the logs of the deployed web service
print(service.get_logs())

2020-12-28T20:18:29,756042900+00:00 - iot-server/run 
2020-12-28T20:18:29,759879400+00:00 - nginx/run 
2020-12-28T20:18:29,766060900+00:00 - gunicorn/run 
2020-12-28T20:18:29,765251500+00:00 - rsyslog/run 
/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_8eff28b157f42edcd2424a5aae6c8074/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
rsyslogd

In [101]:
# Clean up: delete the deployed web service
service.delete()

In [None]:
# Clean up: delete the compute cluster used for training
cpu_cluster.delete()