Author: Kevin ALBERT  

Created: March 2021

### Import open-source packages

In [1]:
# environment packages
import platform
import psutil
import os

# other packages
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', 100) # default 50, the maximum width in characters of a column
pd.set_option('display.max_columns', 40)   # default 20, the maximum amount of columns in view 
pd.set_option('display.max_rows', 60)      # default 60, the maximum amount of rows in view
import logging
import json
import requests
import joblib

### Import azure machine learning SDK packages

In [2]:
from azureml.core import Workspace, Dataset, Datastore, Run
from azureml.core.experiment import Experiment
from azureml.data.datapath import DataPath
from azureml.core.compute import ComputeTarget, AmlCompute, AksCompute
from azureml.core.model import Model, InferenceConfig
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.run import AutoMLRun
from azureml.widgets import RunDetails
from azureml.core.webservice import Webservice, AciWebservice, AksWebservice
from azureml.exceptions import WebserviceException
from azureml.core.environment import Environment
from azureml.train.estimator import Estimator
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.sampling import RandomParameterSampling, GridParameterSampling
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.parameter_expressions import choice
from azureml.core.runconfig import RunConfiguration
from azureml.pipeline.core import PipelineData, Pipeline
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.pipeline.core.run import PipelineRun
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.interpret import ExplanationClient
import azureml.core
print("azureml.core version:", azureml.core.__version__)

azureml.core version: 1.24.0


### Workspace

In [3]:
# load the workspace
# from_config() is called without arguments, so the SDK's default config lookup applies
ws = Workspace.from_config()

### Experiment

In [4]:
# Bind an experiment, identified by its name, to the workspace loaded above.
experiment = Experiment(
    workspace=ws,
    name='automl-binary-classification-SDSHackathon',
)

In [5]:
# Read the parquet file into a DataFrame; the relative path is resolved
# against the kernel's current working directory.
dataset = pd.read_parquet(path="../../../data/platinum/dataset.parquet")

In [6]:
# show the column labels of `dataset`
dataset.columns

Index(['#', 'Artiste_x', 'Titre', 'fileDate', 'freq', 'True_Target', 'album',
       'Artiste_y', 'release_date', 'length', 'popularity', 'acousticness',
       'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness',
       'speechiness', 'tempo', 'time_signature'],
      dtype='object')

In [22]:
# Preview three random rows. A fixed random_state makes the preview
# reproducible across "Restart Kernel -> Run All".
dataset.sample(3, random_state=42)

Unnamed: 0,True_Target,length,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo
50,True,228983,1,0.15,0.56,0.6,0.0,0.07,-6.16,0.06,104.76
41,True,278719,5,0.05,0.45,0.58,0.0,0.08,-6.76,0.03,139.63
13,True,219800,5,0.01,0.47,0.94,0.0,0.24,-3.68,0.05,75.05


In [14]:
# Remove the identifier/metadata columns so that only the target and the
# numeric audio features remain in the frame passed to AutoML as training data.
# Rebinding `dataset` (instead of inplace=True) together with errors="ignore"
# makes this cell idempotent: re-running it once the columns are already gone
# no longer raises KeyError.
dataset = dataset.drop(
    columns=[
        "#",
        "Artiste_x",
        "Titre",
        "fileDate",
        "freq",
        "album",
        "Artiste_y",
        "release_date",
        "time_signature",
    ],
    errors="ignore",
)

In [25]:
# sum of the True_Target column; presumably boolean, so this counts the positive rows -- TODO confirm dtype
dataset["True_Target"].sum()

43

In [23]:
# (number of rows, number of columns) of `dataset`
dataset.shape

(58, 11)

In [7]:
# No train/test split for now: the whole dataset is used.

## Train

In [20]:
# Keyword arguments that are unpacked into the AutoMLConfig constructor.
automl_settings = dict(
    enable_early_stopping=True,
    experiment_timeout_hours=0.25,
    iterations=10,  # number of runs
    iteration_timeout_minutes=5,
    max_concurrent_iterations=1,
    max_cores_per_iteration=-1,
    # experiment_exit_score=0.9920,
    model_explainability=True,
    n_cross_validations=5,
    primary_metric='AUC_weighted',  # we could use AUC_micro
    featurization='auto',
    verbosity=logging.INFO,  # {INFO, DEBUG, CRITICAL, ERROR, WARNING} -- debug_log=<*.log>
)

# Assemble the AutoML classification configuration: explicit arguments first,
# then the shared settings dictionary unpacked as further keyword arguments.
automl_config = AutoMLConfig(
    task='classification',
    training_data=dataset,
    label_column_name="True_Target",  # the name of the target variable
    compute_target='local',  # {training_cluster or 'local'}
    enable_onnx_compatible_models=False,
    debug_log='automl_errors.log',
    # blacklist_models=['KNN','LinearSVM'],
    **automl_settings
)
# outputs "model.pkl" and "automl_errors.log"

In [21]:
# submit the AutoML configuration to the experiment; show_output=True is passed,
# and the run's status messages appear in the cell output
automl_run = experiment.submit(automl_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_c72950d2-d712-4d0d-8938-aeb4cece943e

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing f

### Explore the best pipeline

In [11]:
# render the run-details widget for the AutoML run
RunDetails(automl_run).show()
# last expression of the cell: the value returned by wait_for_completion()
# (run id, status, timings, properties) is displayed as the cell output
automl_run.wait_for_completion()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

{'runId': 'AutoML_78355921-ea00-42c0-9af3-6de918bebb7a',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2021-03-12T23:58:52.397691Z',
 'endTimeUtc': '2021-03-13T00:13:34.977479Z',
 'properties': {'num_iterations': '10',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'local',
  'DataPrepJsonString': None,
  'EnableSubsampling': 'False',
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.24.0", "azureml-train-restclients-hyperdrive": "1.24.0", "azureml-train-core": "1.24.0", "azureml-train-automl": "1.24.0", "azureml-train-automl-runtime": "1.24.0", "azureml-train-automl-client": "1.24.0", "azureml-telemetry": "1.24.0", "azureml-pipeline-steps": "1.24.0", "azureml-pipeline-core": "1.24.0", "azureml-model-management-sdk": "1.0.