Author: Kevin ALBERT  

Created: March 2021

### Import open-source packages

In [1]:
# environment packages
import platform
import psutil
import os

# other packages
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# warnings raised while the modules imported further down are loaded.
warnings.filterwarnings("ignore")
# pandas display options: they only affect how frames are rendered in cell outputs
pd.set_option('display.max_colwidth', 100) # default 50, the maximum width in characters of a column
pd.set_option('display.max_columns', 40)   # default 20, the maximum amount of columns in view
pd.set_option('display.max_rows', 60)      # default 60, the maximum amount of rows in view
import logging
import json
import requests
import joblib

### Import azure machine learning SDK packages

In [2]:
from azureml.core import Workspace, Dataset, Datastore, Run
from azureml.core.experiment import Experiment
from azureml.data.datapath import DataPath
from azureml.core.compute import ComputeTarget, AmlCompute, AksCompute
from azureml.core.model import Model, InferenceConfig
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.run import AutoMLRun
from azureml.widgets import RunDetails
from azureml.core.webservice import Webservice, AciWebservice, AksWebservice
from azureml.exceptions import WebserviceException
from azureml.core.environment import Environment
from azureml.train.estimator import Estimator
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.sampling import RandomParameterSampling, GridParameterSampling
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.parameter_expressions import choice
from azureml.core.runconfig import RunConfiguration
from azureml.pipeline.core import PipelineData, Pipeline
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.pipeline.core.run import PipelineRun
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.interpret import ExplanationClient
import azureml.core
print("azureml.core version:", azureml.core.__version__)

azureml.core version: 1.24.0


### Workspace

In [3]:
# load the workspace
# from_config() is called without arguments, so the connection details come from
# configuration stored outside this notebook (presumably a local config.json; TODO confirm).
ws = Workspace.from_config()

### Experiment

In [60]:
# experiment handle in the workspace; the AutoML run below is submitted through it
experiment = Experiment(workspace=ws, name='automl-binary-classification-SDSHackathon')

In [103]:
# Inputs used by earlier iterations of this notebook, kept for reference:
# dataset = pd.read_parquet("../../../data/platinum/dataset.parquet")
# dataset = pd.read_parquet("../../../data/silver/ultraPlusSpotify.parquet")
# Current input. The path is relative to the notebook's working directory.
# NOTE(review): judging by the file name this extract adds a further ("PDF") source to
# the Spotify-enriched chart data; confirm with the data owner.
dataset = pd.read_parquet("../../../data/silver/ultraPlusSpotifyplusPDF.parquet")

In [62]:
# dataset.columns

In [45]:
# dataset.columns

Index(['#', 'Artiste', 'Titre', 'fileDate', 'freq', 'True_Target',
       'CleanedTitle', 'id', 'popularity', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms'],
      dtype='object')

In [104]:
# column inventory of the loaded frame, used to decide which columns to drop further down
dataset.columns

Index(['index', '#_x', 'Artiste', 'Titre', 'fileDate_x', 'freq', 'True_Target',
       'CleanedTitle', 'id', 'popularity', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', '#_y', 'VW', 'W',
       'Artiest', 'Titel', 'Label', 'Distributeur', 'HP', 'fileDate_y'],
      dtype='object')

In [22]:
# dataset.sample(3)

Unnamed: 0,True_Target,length,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo
50,True,228983,1,0.15,0.56,0.6,0.0,0.07,-6.16,0.06,104.76
41,True,278719,5,0.05,0.45,0.58,0.0,0.08,-6.76,0.03,139.63
13,True,219800,5,0.01,0.47,0.94,0.0,0.24,-3.68,0.05,75.05


In [46]:
# dataset.iloc[[50,41,13]]

Unnamed: 0,#,Artiste,Titre,fileDate,freq,True_Target,CleanedTitle,id,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
50,36,Aya Nakamura,Jolie nana,20-11-21,2,False,Jolie nana,65RWVU6N81CeH65nu52K1U,72,0.81,0.64,2,-5.91,1,0.27,0.27,0.0,0.13,0.49,92.99,147076
41,42,Julien Doré,La fièvre,20-11-14,1,False,La fièvre,79jeTvkW3gvUdldDkQpYOf,62,0.8,0.65,2,-7.93,0,0.07,0.6,0.01,0.1,0.41,89.98,241547
13,14,Ariana Grande,Positions,20-11-14,17,True,positions,35mvY5S1H3J2QZyna3TFe0,93,0.74,0.8,0,-4.77,1,0.09,0.47,0.0,0.09,0.68,144.01,172325


In [105]:
# spot-check three fixed row positions; the same positions are inspected again after the column drop
dataset.iloc[[50,41,13]]

Unnamed: 0,index,#_x,Artiste,Titre,fileDate_x,freq,True_Target,CleanedTitle,id,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,#_y,VW,W,Artiest,Titel,Label,Distributeur,HP,fileDate_y
50,50,36,Aya Nakamura,Jolie nana,20-11-21,2,False,Jolie nana,65RWVU6N81CeH65nu52K1U,72,0.81,0.64,2,-5.91,1,0.27,0.27,0.0,0.13,0.49,92.99,147076,49.0,37.0,18.0,Aya Nakamura,Jolie nana,REC. 118,WARNER,8.0,28_11_2020
41,41,42,Julien Doré,La fièvre,20-11-14,1,False,La fièvre,79jeTvkW3gvUdldDkQpYOf,62,0.8,0.65,2,-7.93,0,0.07,0.6,0.01,0.1,0.41,89.98,241547,,,,,,,,,
13,13,14,Ariana Grande,Positions,20-11-14,17,True,positions,35mvY5S1H3J2QZyna3TFe0,93,0.74,0.8,0,-4.77,1,0.09,0.47,0.0,0.09,0.68,144.01,172325,42.0,36.0,14.0,Ariana Grande,Positions,REPUBLIC,UNIVERSAL,14.0,30_01_2021


In [106]:
# Remove everything that is not the label, a Spotify audio feature or "VW":
# row counters, dates, artist/title text, identifiers and the remaining chart columns.
columns_to_drop = [
    "index", "#_x", "#_y", "fileDate_x", "fileDate_y",
    "Artiste", "Titre", "freq", "CleanedTitle", "id",
    "Artiest", "Titel", "Label", "Distributeur", "HP", "W",
]
dataset = dataset.drop(columns=columns_to_drop)

In [130]:
# we propose to use these features:
# NOTE(review): this cell's execution count (130) is higher than that of the "VW"
# clean-up cells that follow it (111-123), and its output already shows integer VW
# values (50 for row 41, which had no VW value before), so the displayed output
# reflects the frame after that clean-up, not the state at this point of the notebook.
dataset.iloc[[50,41,13]]

Unnamed: 0,True_Target,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,VW
50,False,72,0.81,0.64,2,-5.91,1,0.27,0.27,0.0,0.13,0.49,92.99,147076,37
41,False,62,0.8,0.65,2,-7.93,0,0.07,0.6,0.01,0.1,0.41,89.98,241547,50
13,True,93,0.74,0.8,0,-4.77,1,0.09,0.47,0.0,0.09,0.68,144.01,172325,36


In [126]:
# distinct "VW" values
# NOTE(review): executed (count 126) after the clean-up cells placed below it
# (counts 111-123), which is why the output contains integers only.
dataset["VW"].unique()

array([ 2,  7,  6, 37, 28, 50, 45, 10, 17, 16, 36, 23, 20, 30, 25, 11, 24,
       34, 43, 44, 38, 27, 33, 35, 47,  4, 29, 40,  9, 12, 21,  8, 26,  1,
        5, 19, 22, 46])

In [111]:
# rows without a "VW" value get the placeholder '50' (a string; the column is cast to int in a later cell)
dataset['VW'] = dataset['VW'].fillna('50')

In [114]:
# the marker 'NEW' is mapped to the same placeholder '50' used for missing values
dataset['VW'] = dataset['VW'].replace({'NEW': '50'})

In [123]:
# with the placeholders in place, "VW" can be converted to an integer column
dataset = dataset.astype(dtype={'VW': int})

In [25]:
# dataset["True_Target"].sum()

43

In [67]:
# number of positive examples, i.e. rows where True_Target is True
dataset["True_Target"].sum()

89

In [68]:
# (rows, columns) of the frame handed to AutoML: the label column plus the features
dataset.shape

(118, 15)

In [7]:
# we are not going to split for now... use the whole dataset

## Train

In [127]:
# Search budget and evaluation settings; forwarded to AutoMLConfig as keyword arguments.
automl_settings = dict(
    enable_early_stopping=True,
    experiment_timeout_hours=0.25,
    iterations=25,  # number of runs
    iteration_timeout_minutes=5,
    max_concurrent_iterations=1,
    max_cores_per_iteration=-1,
    # experiment_exit_score=0.9920,
    model_explainability=True,
    n_cross_validations=5,
    primary_metric='precision_score_weighted',  # we could use recall_score_weighted, AUC_weighted
    featurization='auto',
    verbosity=logging.INFO,  # {INFO, DEBUG, CRITICAL, ERROR, WARNING} -- debug_log=<*.log>
)

# Classification on the in-memory pandas frame, executed on the local machine.
automl_config = AutoMLConfig(
    task='classification',
    debug_log='automl_errors.log',
    compute_target='local',  # {training_cluster or 'local'}
    # blacklist_models=['KNN','LinearSVM'],
    enable_onnx_compatible_models=False,
    training_data=dataset,
    label_column_name="True_Target",  # the name of the target variable
    **automl_settings,
)
# outputs "model.pkl" and "automl_errors.log"

In [128]:
# start the AutoML search and stream its progress into the cell output
automl_run = experiment.submit(config=automl_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_073d7624-35cd-471a-8b3f-6b19ee9f129a

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing f

### explore the best pipeline

In [131]:
# show the run-monitoring widget for the AutoML parent run
RunDetails(automl_run).show()
# wait for the run to finish; as the last expression of the cell, the returned
# run-details dictionary (status, timings, properties) is displayed
automl_run.wait_for_completion() # get more parameter info

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

{'runId': 'AutoML_073d7624-35cd-471a-8b3f-6b19ee9f129a',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2021-03-14T09:12:38.463082Z',
 'endTimeUtc': '2021-03-14T09:30:04.77002Z',
 'properties': {'num_iterations': '25',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'precision_score_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'local',
  'DataPrepJsonString': None,
  'EnableSubsampling': 'False',
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.24.0", "azureml-train-restclients-hyperdrive": "1.24.0", "azureml-train-core": "1.24.0", "azureml-train-automl": "1.24.0", "azureml-train-automl-runtime": "1.24.0", "azureml-train-automl-client": "1.24.0", "azureml-telemetry": "1.24.0", "azureml-pipeline-steps": "1.24.0", "azureml-pipeline-core": "1.24.0", "azureml-model-management-

### select best pipeline 

In [132]:
# retrieve the selected child run together with its fitted model from the AutoML parent run
best_run, fitted_model = automl_run.get_output()

### inspect model properties

In [133]:
# names of the steps that make up the fitted pipeline, one per line
for step_name in fitted_model.named_steps.keys():
    print(step_name)

datatransformer
prefittedsoftvotingclassifier


In [134]:
# model properties: mapping of step name -> fitted step object (featurizer and ensemble classifier)
fitted_model.named_steps

{'datatransformer': DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                 feature_sweeping_config=None, feature_sweeping_timeout=None,
                 featurization_config=None, force_text_dnn=None,
                 is_cross_validation=None, is_onnx_compatible=None, logger=None,
                 observer=None, task=None, working_dir=None),
 'prefittedsoftvotingclassifier': PreFittedSoftVotingClassifier(classification_labels=None,
                               estimators=[('2',
                                            Pipeline(memory=None,
                                                     steps=[('minmaxscaler',
                                                             MinMaxScaler(copy=True,
                                                                          feature_range=(0,
                                                                                         1))),
                                                            ('randomforestc

In [135]:
# show all metrics logged for the best run (includes the primary metric precision_score_weighted)
best_run.get_metrics()

{'recall_score_weighted': 0.8054347826086957,
 'matthews_correlation': 0.41223535469692785,
 'f1_score_weighted': 0.7513797054836291,
 'AUC_weighted': 0.706155093616394,
 'AUC_macro': 0.706155093616394,
 'f1_score_macro': 0.6182596265523095,
 'average_precision_score_micro': 0.8248899140704612,
 'accuracy': 0.8054347826086957,
 'precision_score_macro': 0.8978260869565217,
 'log_loss': 0.547422004452099,
 'norm_macro_recall': 0.22047619047619044,
 'average_precision_score_macro': 0.7186658248330582,
 'f1_score_micro': 0.8054347826086957,
 'weighted_accuracy': 0.9176962510558806,
 'AUC_micro': 0.8309198435202688,
 'recall_score_macro': 0.6102380952380952,
 'precision_score_micro': 0.8054347826086957,
 'precision_score_weighted': 0.8483695652173913,
 'average_precision_score_weighted': 0.7943991765175218,
 'recall_score_micro': 0.8054347826086957,
 'balanced_accuracy': 0.6102380952380952,
 'accuracy_table': 'aml://artifactId/ExperimentRun/dcid.AutoML_073d7624-35cd-471a-8b3f-6b19ee9f129a_2

### quick feature importance checks

In [136]:
# Download the explanation stored with the best run (raw=False, matching the
# engineered feature names such as "<column>_MeanImputer" in the output).
client = ExplanationClient.from_run(best_run)
engineered_explanations = client.download_model_explanation(raw=False)

# feature name -> importance value for the whole model
feature_importance = engineered_explanations.get_feature_importance_dict()

# one table row per feature
columns = ["modelFeatureImportance_name", "modelFeatureImportance_value"]
pd.DataFrame(
    data=[(name, value) for name, value in feature_importance.items()],
    columns=columns,
)

Unnamed: 0,modelFeatureImportance_name,modelFeatureImportance_value
0,acousticness_MeanImputer,0.34
1,speechiness_MeanImputer,0.34
2,duration_ms_MeanImputer,0.31
3,VW_MeanImputer,0.27
4,liveness_MeanImputer,0.19
5,instrumentalness_MeanImputer,0.13
6,valence_MeanImputer,0.13
7,loudness_MeanImputer,0.12
8,tempo_MeanImputer,0.12
9,popularity_MeanImputer,0.11


## Register

####  prepare the scoring script, environment file and model

In [137]:
# name under which the model is registered; the generated score.py looks the model up by this name
model_name = best_run.properties['model_name']

# local destinations for the artifacts of the best run
script_file_name = 'inference/score.py'
conda_env_file_name = 'inference/env.yml'
model_pickle_file_name = 'inference/model.pkl'
# model_onnx_file_name = 'inference/model.onnx'

# copy the scoring script, the environment file and the model from the run outputs
for run_artifact, local_path in (
    ('outputs/scoring_file_v_1_0_0.py', script_file_name),
    ('outputs/conda_env_v_1_0_0.yml', conda_env_file_name),
    ('outputs/model.pkl', model_pickle_file_name),
    # ('outputs/model.onnx', model_onnx_file_name),
):
    best_run.download_file(run_artifact, local_path)

In [138]:
# print the downloaded conda environment specification (shell escape, needs a `cat` command)
! cat inference/env.yml

# Conda environment specification. The dependencies defined in this file will
# be automatically provisioned for runs with userManagedDependencies=False.

# Details about the Conda environment file format:
# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually

name: project_environment
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2

- pip:
  - azureml-train-automl-runtime==1.24.0
  - inference-schema
  - azureml-interpret==1.24.0
  - azureml-defaults==1.24.0
- numpy>=1.16.0,<1.19.0
- pandas==0.25.1
- scikit-learn==0.22.1
- py-xgboost<=0.90
- fbprophet==0.5
- holidays==0.9.11
- psutil>=5.2.2,<6.0.0
channels:
- anaconda
- conda-forge


In [139]:
# print the downloaded scoring script (shell escape, needs a `cat` command)
! cat inference/score.py

# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import json
import logging
import os
import pickle
import numpy as np
import pandas as pd
import joblib

import azureml.automl.core
from azureml.automl.core.shared import logging_utilities, log_server
from azureml.telemetry import INSTRUMENTATION_KEY

from inference_schema.schema_decorators import input_schema, output_schema
from inference_schema.parameter_types.numpy_parameter_type import NumpyParameterType
from inference_schema.parameter_types.pandas_parameter_type import PandasParameterType


input_sample = pd.DataFrame({"VW": pd.Series([0], dtype="int64"), "acousticness": pd.Series([0.0], dtype="float64"), "danceability": pd.Series([0.0], dtype="float64"), "duration_ms": pd.Series([0], dtype="int64"), "energy": pd.Series([0.0], dtype="float64"), "instrumentalness": pd.Series([0.0], dty

### Register the model 

#### we use the local /path/model folder with Model.register()

In [140]:
# Register the downloaded AutoML model file in the workspace.
# The metrics are fetched once and reused for both properties instead of calling
# get_metrics() twice.
best_run_metrics = best_run.get_metrics()

model = Model.register(workspace=ws,
                       model_name=model_name, # registered model name used in scoring script init()
                       model_framework=Model.Framework.SCIKITLEARN, # {TensorFlow, ScikitLearn, Onnx, Custom}
                       model_framework_version='0.22.1', # same version as scikit-learn==0.22.1 pinned in inference/env.yml
                       model_path='inference/model.pkl', # local file {'model.pkl', 'model.onnx'}
                       tags={'Training context': 'autoML Training'},
                       properties={'AUC': best_run_metrics['AUC_weighted'],
                                   'Accuracy': best_run_metrics['accuracy']},
                       # describes what this notebook trains (label column True_Target)
                       description="Binary classification model to predict whether a song is in the Top 50")

Registering model AutoML073d7624320


### Deploy model as webservice (ACI)

#### this deployment can take 15-20 min... (using the smallest compute: 1 CPU, 1 GB memory)

In [141]:
%%time
# Configure the scoring environment
service_name = "automl-projname-service" # only lowercase letters, numbers, or dashes

# Remove any existing service under the same name
try:
    Webservice(ws, service_name).delete()
except WebserviceException:
    print('"' + service_name + '" does not exist, creating the webservice...')

myenv = Environment.from_conda_specification(name="myenv", file_path=conda_env_file_name)
inference_config = InferenceConfig(entry_script=script_file_name, environment=myenv)

deployment_config = AciWebservice.deploy_configuration(cpu_cores=1,
                                                       memory_gb=1)

# build container from environment, start webservice ACI and deploy inference scrips 
service = Model.deploy(ws, service_name, [model], inference_config, deployment_config)
service.wait_for_deployment(show_output=True)

"automl-projname-service" does not exist, creating the webservice...
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-03-14 10:40:37+01:00 Creating Container Registry if not exists.
2021-03-14 10:40:37+01:00 Registering the environment.
2021-03-14 10:40:39+01:00 Use the existing image.
2021-03-14 10:40:39+01:00 Generating deployment configuration.
2021-03-14 10:40:39+01:00 Submitting deployment to compute..
2021-03-14 10:40:43+01:00 Checking the status of deployment automl-projname-service..
2021-03-14 10:44:26+01:00 Checking the status of inference endpoint automl-projname-service.
Succeeded
ACI service creation operation finished, operation "Succeeded"
CPU times: user 1min 15s, sys: 32.2 s, total: 1min 47s
Wall time: 4min 15s


In [142]:
# get webservice logs
# print() is used on purpose: the logs come back as one multi-line string, which
# would otherwise be displayed as a quoted repr with escaped newlines
print(service.get_logs())

2021-03-14T09:44:15,139824400+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_fe9df1de9dcfc7f534c43ede471eccd6/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_fe9df1de9dcfc7f534c43ede471eccd6/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_fe9df1de9dcfc7f534c43ede471eccd6/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_fe9df1de9dcfc7f534c43ede471eccd6/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_fe9df1de9dcfc7f534c43ede471eccd6/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
2021-03-14T09:44:15,155938100+00:00 - gunicorn/run 
2021-03-14T09:44:15,163717900+00:00 - iot-server/run 
2021-03-14T09:44:15,165133700+00:00 - rsyslog/run 
rsyslogd

Send an HTTP POST request with test data to the deployed model to obtain a prediction.  
In this example we test whether a song is in the Top 50 (`true`) or not (`false`).  
The data must be a list of rows, each holding the 14 feature values in the column order of the scoring script's `input_sample`.  

In [None]:
# input_sample = pd.DataFrame({"VW": pd.Series([0], dtype="int64"), "acousticness": pd.Series([0.0], dtype="float64"), "danceability": pd.Series([0.0], dtype="float64"), "duration_ms": pd.Series([0], dtype="int64"), "energy": pd.Series([0.0], dtype="float64"), "instrumentalness": pd.Series([0.0], dtype="float64"), "key": pd.Series([0], dtype="int64"), "liveness": pd.Series([0.0], dtype="float64"), "loudness": pd.Series([0.0], dtype="float64"), "mode": pd.Series([0], dtype="int64"), "popularity": pd.Series([0], dtype="int64"), "speechiness": pd.Series([0.0], dtype="float64"), "tempo": pd.Series([0.0], dtype="float64"), "valence": pd.Series([0.0], dtype="float64")})
# output_sample = np.array([0])

In [145]:
# scoring endpoint of the deployed web service
endpoint = service.scoring_uri

# One test row. The service expects bare values, so the names only document which
# value is which; the order follows the columns of input_sample in inference/score.py.
feature_values = {
    "VW": 50,
    "acousticness": 2.55e-05,
    "danceability": 0.502,
    "duration_ms": 301920,
    "energy": 0.912,
    "instrumentalness": 0.000173,
    "key": 1,
    "liveness": 0.106,
    "loudness": -4.556,
    "mode": 1,
    "popularity": 80,
    "speechiness": 0.0564,
    "tempo": 116.761,
    "valence": 0.72,
}
rawdata = [list(feature_values.values())]

# show what is going to be sent: the payload serialized as a JSON string
request_body = json.dumps({"data": rawdata})
print("URI: " + endpoint)
print("Body: " + request_body)

URI: http://2202f4f2-bb98-4850-93bc-59b5d66723b2.northeurope.azurecontainer.io/score
Body: {"data": [[50, 10, 0.502, 301920, 0.912, 0.000173, 1, 0.106, -4.556, 1, 80, 0.0564, 116.761, 0.72]]}


In [146]:
# POST the test row to the scoring endpoint.
# requests applies no timeout by default, so an unreachable container would hang
# the cell; raise_for_status() turns an HTTP error response into an exception
# instead of an unrelated JSON-decoding failure on the next line.
response = requests.post(endpoint, json={"data": rawdata}, timeout=60)
response.raise_for_status()
# The body decodes to a JSON-encoded *string* (see the quoted output), so
# json.loads(response.json()) is needed to obtain the result as a dict.
response.json()

'{"result": [true]}'

In [129]:
# service.delete()