Author: Kevin ALBERT  

Created: March 2021

### Import open-source packages

In [1]:
# environment packages (not referenced in the cells visible in this part of the notebook)
import platform
import psutil
import os

# other packages
import pandas as pd
import numpy as np
import warnings
# NOTE: suppresses every warning raised through the `warnings` module from this point on,
# including any emitted while the imports further down are loaded.
warnings.filterwarnings("ignore")
# pandas display limits applied to every DataFrame rendered in this notebook
pd.set_option('display.max_colwidth', 100) # default 50, the maximum width in characters of a column
pd.set_option('display.max_columns', 40)   # default 20, the maximum number of columns in view
pd.set_option('display.max_rows', 60)      # default 60, the maximum number of rows in view
import logging
import json
import requests
import joblib

### Import azure machine learning SDK packages

In [2]:
from azureml.core import Workspace, Dataset, Datastore, Run
from azureml.core.experiment import Experiment
from azureml.data.datapath import DataPath
from azureml.core.compute import ComputeTarget, AmlCompute, AksCompute
from azureml.core.model import Model, InferenceConfig
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.run import AutoMLRun
from azureml.widgets import RunDetails
from azureml.core.webservice import Webservice, AciWebservice, AksWebservice
from azureml.exceptions import WebserviceException
from azureml.core.environment import Environment
from azureml.train.estimator import Estimator
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.sampling import RandomParameterSampling, GridParameterSampling
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.parameter_expressions import choice
from azureml.core.runconfig import RunConfiguration
from azureml.pipeline.core import PipelineData, Pipeline
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.pipeline.core.run import PipelineRun
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.interpret import ExplanationClient
import azureml.core
# report the installed Azure ML SDK core version so the run can be reproduced
print("azureml.core version: {}".format(azureml.core.__version__))

azureml.core version: 1.24.0


### Workspace

In [3]:
# Load the Azure ML workspace handle used by the rest of the notebook.
# NOTE(review): from_config() presumably reads a local workspace config file and, as the
# captured output shows, can trigger interactive device-login authentication — confirm
# that the config file is available before running on a fresh machine.
ws = Workspace.from_config()

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code C98TGF9MF to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.


### Experiment

In [4]:
# Choose an experiment name; the AutoML run below is submitted under this experiment
# in workspace `ws`.
experiment = Experiment(ws, 'automl-binary-classification-SDSHackathon')

In [5]:
# Load the training table from a parquet file (path is relative to the notebook's
# working directory). It holds the feature columns plus the `isInTop50` column that
# is used as the label when configuring AutoML.
dataset = pd.read_parquet("../../../data/platinum/dataset_fake_target.parquet")

In [8]:
# List the column names of the loaded dataset (features plus the `isInTop50` target).
# Fixed typo: the attribute is `columns`; `columnsmns` raises AttributeError on re-run.
dataset.columns

Index(['name', 'album', 'artist', 'release_date', 'length', 'popularity',
       'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature',
       'isInTop50'],
      dtype='object')

In [6]:
# (rows, columns) of the loaded dataset
dataset.shape

(6217, 16)

In [7]:
# No train/test split for now: the whole dataset is passed to AutoML, which is configured with cross-validation (n_cross_validations) below.

## Train

In [9]:
# Tuning options for the AutoML sweep; unpacked as keyword arguments into AutoMLConfig.
automl_settings = dict(
    enable_early_stopping=True,
    experiment_timeout_hours=0.25,
    iterations=10,  # number of runs
    iteration_timeout_minutes=5,
    max_concurrent_iterations=1,
    max_cores_per_iteration=-1,
    # experiment_exit_score=0.9920,
    model_explainability=True,
    n_cross_validations=5,
    primary_metric='AUC_weighted',  # we could use AUC_micro
    featurization='auto',
    verbosity=logging.INFO,  # {INFO, DEBUG, CRITICAL, ERROR, WARNING} -- debug_log=<*.log>
)

# Assemble the AutoML classification job. The tuning options collected in
# `automl_settings` are unpacked as additional keyword arguments.
automl_config = AutoMLConfig(
    task='classification',
    debug_log='automl_errors.log',
    compute_target='local',  # either 'local' or a training cluster
    # blacklist_models=['KNN','LinearSVM'],
    enable_onnx_compatible_models=False,
    training_data=dataset,
    label_column_name="isInTop50",  # the name of the target variable
    **automl_settings
)
# outputs "model.pkl" and "automl_errors.log"

In [10]:
# Submit the AutoML configuration under `experiment`; show_output=True prints the
# run's progress (status lines, data guardrails, per-iteration scores) in the cell output.
automl_run = experiment.submit(automl_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_78355921-ea00-42c0-9af3-6de918bebb7a

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely p

Run.fail() is deprecated, use Run.tag() to mark run as failed or use Run.cancel() to mark the run as canceled
Run.fail() will be removed shortly


                                               0:05:04          nan       nan
ERROR: Iteration timeout reached, skipping execution of the child run. Consider increasing iteration_timeout_minutes.
         1   MaxAbsScaler XGBoostClassifier                 0:02:11       0.4988    0.4988
         2   MaxAbsScaler RandomForest                      0:00:37       0.5311    0.5311
         3   MaxAbsScaler RandomForest                      0:00:36       0.4489    0.5311
         4   MaxAbsScaler RandomForest                      0:00:35       0.4701    0.5311
         5   MaxAbsScaler ExtremeRandomTrees                0:00:36       0.4453    0.5311
         6   MaxAbsScaler ExtremeRandomTrees                0:00:39       0.5281    0.5311
         7   MaxAbsScaler ExtremeRandomTrees                0:00:41       0.5412    0.5412
         8   VotingEnsemble                                 0:00:38       0.5487    0.5487
         9   StackEnsemble                                  0:02:12       0.

### Explore the best pipeline

In [11]:
# Show the run-monitoring widget for the AutoML parent run.
RunDetails(automl_run).show()
# Wait for the run to finish; the returned run-details dict (run id, status, timings,
# properties) is displayed as the cell output.
automl_run.wait_for_completion()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

{'runId': 'AutoML_78355921-ea00-42c0-9af3-6de918bebb7a',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2021-03-12T23:58:52.397691Z',
 'endTimeUtc': '2021-03-13T00:13:34.977479Z',
 'properties': {'num_iterations': '10',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'local',
  'DataPrepJsonString': None,
  'EnableSubsampling': 'False',
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.24.0", "azureml-train-restclients-hyperdrive": "1.24.0", "azureml-train-core": "1.24.0", "azureml-train-automl": "1.24.0", "azureml-train-automl-runtime": "1.24.0", "azureml-train-automl-client": "1.24.0", "azureml-telemetry": "1.24.0", "azureml-pipeline-steps": "1.24.0", "azureml-pipeline-core": "1.24.0", "azureml-model-management-sdk": "1.0.