Author: Kevin ALBERT

Created: Nov 2020

In [None]:
! pip install seaborn

In [None]:
! pip install cryptography==3.1.1

In [None]:
! pip install zipp==3.3.1

In [1]:
# import logging
import os
import logging
import pandas as pd
import numpy as np
import json
import requests
import joblib

In [2]:
from azureml.core import Workspace, Dataset, Datastore, Run
from azureml.core.experiment import Experiment
from azureml.data.datapath import DataPath
from azureml.core.compute import ComputeTarget, AmlCompute, AksCompute
from azureml.core.model import Model, InferenceConfig
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.run import AutoMLRun
from azureml.widgets import RunDetails
from azureml.core.webservice import Webservice, AciWebservice, AksWebservice
from azureml.exceptions import WebserviceException
from azureml.core.environment import Environment
from azureml.train.estimator import Estimator
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.sampling import RandomParameterSampling, GridParameterSampling
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.parameter_expressions import choice
from azureml.core.runconfig import RunConfiguration
from azureml.pipeline.core import PipelineData, Pipeline
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.pipeline.core.run import PipelineRun
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.explain.model._internal.explanation_client import ExplanationClient
import azureml.core
# report the version of the installed azureml.core package
print("azureml.core version:", azureml.core.__version__)

azureml.core version: 1.17.0


The azureml-explain-model package is deprecated and will be removed in a future release of the AzureML SDK. Please use the azureml-interpret and interpret-community packages which support the functionality azureml-explain-model used to provide.


In [3]:
# Notebook-wide pandas display limits.
# (pd.describe_option('display') shows all pandas options, but its output can slow down the notebook.)
_DISPLAY_OPTIONS = {
    'display.max_colwidth': 100,  # default 50, the maximum width in characters of a column
    'display.max_columns': 40,    # default 20, the maximum amount of columns in view
    'display.max_rows': 60,       # default 60, the maximum amount of rows in view
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

In [4]:
# list the installed packages whose name contains "azureml" (case-insensitive), with their versions
!pip list |grep -i azureml

azureml-accel-models                  1.17.0
azureml-automl-core                   1.17.0
azureml-automl-runtime                1.17.0
azureml-cli-common                    1.17.0
azureml-contrib-dataset               1.17.0
azureml-contrib-fairness              1.17.0
azureml-contrib-gbdt                  1.17.0
azureml-contrib-interpret             1.17.0
azureml-contrib-notebook              1.17.0
azureml-contrib-pipeline-steps        1.17.0
azureml-contrib-reinforcementlearning 1.17.0
azureml-contrib-server                1.17.0
azureml-contrib-services              1.17.0
azureml-core                          1.17.0
azureml-datadrift                     1.17.0
azureml-dataprep                      2.4.2
azureml-dataprep-native               24.0.0
azureml-dataprep-rslex                1.2.2
azureml-dataset-runtime               1.17.0
azureml-defaults                      1.17.0
azureml-explain-model                 1.17.0
azureml-interpret                   

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
#import eli5
#from eli5.sklearn import PermutationImportance
# import scikitplot as skplt
from sklearn.decomposition import PCA

# plotting and display defaults
sns.set_style('whitegrid')
# NOTE: this overrides the display.max_columns value of 40 set in an earlier cell
pd.set_option('display.max_columns', None) # display all columns

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas and deprecation warnings raised by later cells
warnings.filterwarnings('ignore')

In [6]:
# Prerequisite: set up the ML workspace and manually copy its config.json file

In [7]:
# load the workspace (presumably from the manually copied config.json mentioned above — confirm the lookup location)
ws = Workspace.from_config()

In [8]:
# choose an experiment name: the AutoML run on the synthetic health data is submitted under it
experiment = Experiment(ws, 'automl-classification-synthetic-health-data')

In [9]:
# Prerequisite: upload the dataset to the Data Lake Gen2 cloud storage manually first

In [10]:
# Register the blob container that holds the datasets as a workspace datastore.
# The storage account key is a secret and must not be stored in the notebook:
# it is read from the AZURE_STORAGE_ACCOUNT_KEY environment variable, which has
# to be set before the Jupyter kernel is started.
# NOTE(review): the key that used to be hard-coded in this cell has been exposed
# and should be rotated in the storage account.
account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")
if not account_key:
    raise RuntimeError(
        "Set the AZURE_STORAGE_ACCOUNT_KEY environment variable before registering the datastore."
    )

ds = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name="datalakestoragegen2",
    container_name="datalake",
    account_name="datalake27112020",
    account_key=account_key,
    create_if_not_exists=False)
# list available datastores
ws.datastores

{'datalakestoragegen2': {
   "name": "datalakestoragegen2",
   "container_name": "datalake",
   "account_name": "datalake27112020",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'workspacefilestore': {
   "name": "workspacefilestore",
   "container_name": "azureml-filestore-9835fb79-8b03-46ca-ba4b-b8cd0d3e846a",
   "account_name": "machinelstorage9af0d08f1",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'workspaceblobstore': {
   "name": "workspaceblobstore",
   "container_name": "azureml-blobstore-9835fb79-8b03-46ca-ba4b-b8cd0d3e846a",
   "account_name": "machinelstorage9af0d08f1",
   "protocol": "https",
   "endpoint": "core.windows.net"
 }}

In [11]:
# setup parquet file(s) into a tabular dataset:
# points at the processed synthetic data under 'silver/' in the datastore `ds`
ds_path = [DataPath(ds, 'silver/synthetic_data_processed.parquet')] # {path/*.parquet}
dataset = Dataset.Tabular.from_parquet_files(path=ds_path)
# show dataset settings
dataset

{
  "source": [
    "('datalakestoragegen2', 'silver/synthetic_data_processed.parquet')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ReadParquetFile",
    "DropColumns"
  ]
}

In [12]:
# load the tabular dataset into pandas, then separate the label from the features
df = dataset.to_pandas_dataframe()
print(df.shape)
y = df['readmitted']               # target column
X = df.drop(columns='readmitted')  # every other column is a feature

(78441, 45)


In [13]:
# 80/20 train/test split, stratified on the target, with a fixed random state.
# NOTE(review): training_data / validation_data are not passed to AutoMLConfig
# in this notebook (it trains on `dataset`; the arguments using them are commented out).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=101,
)
# re-attach the label column to each split
training_data = pd.concat([X_train, y_train], axis='columns')
validation_data = pd.concat([X_test, y_test], axis='columns')
print(training_data.shape, validation_data.shape, sep="\n")

(62752, 45)
(15689, 45)


### create training environment

In [14]:
# myenv = Environment("training_environment")
# myenv.docker.enabled = True
# myenv.python.user_managed_dependencies = False
# conda_packages = ['scikit-learn', 'joblib', 'python==3.6.2']
# pip_packages = ['azureml-defaults', 'azureml-dataprep[pandas,fuse]', 'pyarrow', 'fastparquet']
# myenv.python.conda_dependencies = CondaDependencies.create(conda_packages=conda_packages, pip_packages=pip_packages)
# myenv.register(ws)

In [15]:
# load the docker environment
# training_env = Environment.get(ws, 'training_environment')

In [16]:
# AutoML sweep settings, forwarded to AutoMLConfig as keyword arguments
automl_settings = dict(
    enable_early_stopping=True,
    experiment_timeout_hours=4,
    iterations=1000,               # number of runs ex: 20
    iteration_timeout_minutes=15,
    max_concurrent_iterations=1,
    max_cores_per_iteration=-1,
    # experiment_exit_score=0.9920,
    model_explainability=True,
    n_cross_validations=5,         # make 10 if small dataset, else 5
    primary_metric='AUC_weighted',
    featurization='auto',
    verbosity=logging.INFO,        # {INFO, DEBUG, CRITICAL, ERROR, WARNING} -- debug_log=<*.log>
    enable_dnn=False,
)

# classification on the `readmitted` label, run locally on the tabular `dataset`
automl_config = AutoMLConfig(
    task='classification',
    training_data=dataset,
    label_column_name="readmitted",
    compute_target='local',  # {training_cluster or 'local'}
    debug_log='automl_errors.log',
    enable_onnx_compatible_models=True,
    # alternatives kept for reference:
    #   blacklist_models=['KNN','LinearSVM'],
    #   environment_definition=training_env,
    #   training_data=training_data, validation_data=validation_data,
    **automl_settings
)
# outputs "model.pkl" and "automl_errors.log"

In [17]:
# submit the AutoML configuration to the experiment; show_output=True prints the progress log below
automl_run = experiment.submit(automl_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_f1f3cd1f-573e-4fe7-96b2-f742bcc575a6

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         High card

### optional: retrieve specific run

In [None]:
# runId = 'AutoML_891419fd-d69c-4a91-b536-f008adcb800c'
# automl_run = AutoMLRun(experiment, run_id=runId)

### results: explore the best pipeline

In [18]:
# show the AutoML run widget, then wait for the run to finish;
# the value returned by wait_for_completion() is the run summary displayed in the cell output
RunDetails(automl_run).show()
automl_run.wait_for_completion() # get more parameter info

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

{'runId': 'AutoML_f1f3cd1f-573e-4fe7-96b2-f742bcc575a6',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-11-29T00:55:35.135001Z',
 'endTimeUtc': '2020-11-29T01:31:41.431476Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'local',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"37cde55a-bedd-4de8-b9c0-6a659935a4aa\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"datalakestoragegen2\\\\\\", \\\\\\"path\\\\\\": \\\\\\"silver/synthetic_data_processed.parquet\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"myResourceGroup02\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"43c1f93a-903d-4b23-a4bf-92bd7a150627\\\\\\", \\\

In [19]:
# retrieve the best child run of the AutoML run together with its fitted pipeline
best_run, fitted_model = automl_run.get_output()

In [20]:
# pipeline steps: print the name of each step of the fitted pipeline, one per line
for step in fitted_model.named_steps:
    print(step)

datatransformer
prefittedsoftvotingclassifier


In [21]:
# model properties: display every pipeline step together with its parameters
fitted_model.named_steps

{'datatransformer': DataTransformer(allow_chargram=None, enable_dnn=None,
                 enable_feature_sweeping=None, feature_sweeping_config=None,
                 feature_sweeping_timeout=None, featurization_config=None,
                 force_text_dnn=None, is_cross_validation=None,
                 is_onnx_compatible=None, logger=None, observer=None, task=None,
                 working_dir=None),
 'prefittedsoftvotingclassifier': PreFittedSoftVotingClassifier(classification_labels=None,
                               estimators=[('0',
                                            Pipeline(memory=None,
                                                     steps=[('maxabsscaler',
                                                             MaxAbsScaler(copy=True)),
                                                            ('lightgbmclassifier',
                                                             LightGBMClassifier(boosting_type='gbdt',
                                     

In [22]:
# show all metrics recorded on the best run (returned as a dict keyed by metric name)
best_run.get_metrics()

{'f1_score_macro': 0.7238536225559721,
 'balanced_accuracy': 0.7063753749222992,
 'f1_score_micro': 0.7771191153102166,
 'average_precision_score_macro': 0.7992318675728961,
 'recall_score_micro': 0.7771191153102166,
 'f1_score_weighted': 0.773601698864318,
 'precision_score_weighted': 0.7736630353074163,
 'recall_score_weighted': 0.7771191153102166,
 'AUC_weighted': 0.8957500396581913,
 'average_precision_score_weighted': 0.8495791932566241,
 'precision_score_micro': 0.7771191153102166,
 'AUC_micro': 0.9196929635011244,
 'accuracy': 0.7771191153102166,
 'average_precision_score_micro': 0.858297753821336,
 'recall_score_macro': 0.7063753749222992,
 'log_loss': 0.5496707624632544,
 'AUC_macro': 0.9005075296352704,
 'matthews_correlation': 0.6073417968740799,
 'precision_score_macro': 0.749605388133183,
 'norm_macro_recall': 0.5595630623834488,
 'weighted_accuracy': 0.8115070730233832,
 'confusion_matrix': 'aml://artifactId/ExperimentRun/dcid.AutoML_f1f3cd1f-573e-4fe7-96b2-f742bcc575a6_3

In [23]:
# best_run.download_file('residuals')

In [24]:
# best_run.download_file('predicted_true')

In [25]:
# show the details of the best run (run id, status, start/end time, properties)
best_run.get_details()

{'runId': 'AutoML_f1f3cd1f-573e-4fe7-96b2-f742bcc575a6_30',
 'status': 'Completed',
 'startTimeUtc': '2020-11-29T01:30:26.348081Z',
 'endTimeUtc': '2020-11-29T01:31:39.205241Z',
 'properties': {'runTemplate': 'automl_child',
  'pipeline_id': '__AutoML_Ensemble__',
  'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'AUC_weighted\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'automl-classification-synthetic-health-data\',\'compute_target\':\'local\',\'subscription_id\':\'43c1f93a-903d-4b23-a4bf-92bd7a150627\',\'region\':\'westeurope\',\'spark_service\':None}","ensemble_run_id":"AutoML_f1f3cd1f-573e-4fe7-96b2-f742bcc575a6_30","experiment_name":null,"workspace_name":"machine_learning_workspace02","subscription_id":"43c1f93a-903d-4b23-a4bf-92bd7a1506

In [26]:
# Download the model explanation stored with the best run (raw=False: engineered features,
# as the variable name indicates) and read the feature importances as a dict.
# NOTE(review): ExplanationClient is imported from the azureml-explain-model package, which the
# import cell's output reports as deprecated in favour of azureml-interpret.
client = ExplanationClient.from_run(best_run)
engineered_explanations = client.download_model_explanation(raw=False)
feature_importance = engineered_explanations.get_feature_importance_dict() # get model feature importance values
feature_importance

{'number_inpatient_CharGramCountVectorizer_0': 0.7957287121691267,
 'change_ModeCatImputer_LabelEncoder': 0.38994392496155733,
 '_diag_1_CharGramCountVectorizer_Diseases of the circulatory system': 0.2669470586616131,
 'time_in_hospital_severitylvl_CharGramCountVectorizer_Normal': 0.20430982642727513,
 '_diag_3_CharGramCountVectorizer_Diabetes mellitus': 0.18891590265452263,
 'age_CharGramCountVectorizer_80-90': 0.17701126627060051,
 '_diag_2_CharGramCountVectorizer_Diabetes mellitus': 0.1414473620327482,
 'number_diagnoses_CharGramCountVectorizer_9': 0.13540643067758204,
 '_diag_1_CharGramCountVectorizer_Diseases of the respiratory system': 0.13466272599947468,
 'num_medications_MeanImputer': 0.1208103414075009,
 'number_inpatient_CharGramCountVectorizer_2': 0.10850877678416578,
 'insulin_CharGramCountVectorizer_No': 0.10494269912990022,
 '_diag_1_CharGramCountVectorizer_Injury and poisoning': 0.10166701302536857,
 'number_outpatient_CharGramCountVectorizer_0': 0.09861132926481743,
 '

In [27]:
# tabulate the importance dict: one row per engineered feature (name, value)
columns = ["modelFeatureImportance_name", "modelFeatureImportance_value"]
importance_pairs = list(feature_importance.items())
fi = pd.DataFrame(importance_pairs, columns=columns)
fi

Unnamed: 0,modelFeatureImportance_name,modelFeatureImportance_value
0,number_inpatient_CharGramCountVectorizer_0,0.80
1,change_ModeCatImputer_LabelEncoder,0.39
2,_diag_1_CharGramCountVectorizer_Diseases of the circulatory system,0.27
3,time_in_hospital_severitylvl_CharGramCountVectorizer_Normal,0.20
4,_diag_3_CharGramCountVectorizer_Diabetes mellitus,0.19
...,...,...
95,_diag_3_CharGramCountVectorizer_Diseases of the musculoskeletal system and connective tissue,0.00
96,metformin_CharGramCountVectorizer_No,0.00
97,_diag_1_CharGramCountVectorizer_Diseases of the musculoskeletal system and connective tissue,0.00
98,"_diag_2_CharGramCountVectorizer_Complications of pregnancy, childbirth, and the puerperium",0.00


In [28]:
# fi.iloc[:,1][95]

In [29]:
# fi.iloc[:,2][95]

In [30]:
# keep only the rows whose importance value (second column) is at least 0.01,
# i.e. remove small or zero values
is_relevant = fi.iloc[:, 1] >= 0.01
fi = fi[is_relevant]
fi

Unnamed: 0,modelFeatureImportance_name,modelFeatureImportance_value
0,number_inpatient_CharGramCountVectorizer_0,0.8
1,change_ModeCatImputer_LabelEncoder,0.39
2,_diag_1_CharGramCountVectorizer_Diseases of the circulatory system,0.27
3,time_in_hospital_severitylvl_CharGramCountVectorizer_Normal,0.2
4,_diag_3_CharGramCountVectorizer_Diabetes mellitus,0.19
5,age_CharGramCountVectorizer_80-90,0.18
6,_diag_2_CharGramCountVectorizer_Diabetes mellitus,0.14
7,number_diagnoses_CharGramCountVectorizer_9,0.14
8,_diag_1_CharGramCountVectorizer_Diseases of the respiratory system,0.13
9,num_medications_MeanImputer,0.12


In [31]:
# Normalise each importance value (second column) by the total of the retained
# features, so that the relative weights add up to 1.
fi_sum = fi.iloc[:, 1].sum()
# `fi` is a filtered slice of the full importance frame: build a new frame with
# assign() instead of writing a column into the slice, which is chained assignment
# and triggers pandas' SettingWithCopyWarning. The division is vectorised.
fi = fi.assign(modelFeatureImportance_relativeWeight=fi.iloc[:, 1] / fi_sum)
fi

Unnamed: 0,modelFeatureImportance_name,modelFeatureImportance_value,modelFeatureImportance_relativeWeight
0,number_inpatient_CharGramCountVectorizer_0,0.8,0.16
1,change_ModeCatImputer_LabelEncoder,0.39,0.08
2,_diag_1_CharGramCountVectorizer_Diseases of the circulatory system,0.27,0.06
3,time_in_hospital_severitylvl_CharGramCountVectorizer_Normal,0.2,0.04
4,_diag_3_CharGramCountVectorizer_Diabetes mellitus,0.19,0.04
5,age_CharGramCountVectorizer_80-90,0.18,0.04
6,_diag_2_CharGramCountVectorizer_Diabetes mellitus,0.14,0.03
7,number_diagnoses_CharGramCountVectorizer_9,0.14,0.03
8,_diag_1_CharGramCountVectorizer_Diseases of the respiratory system,0.13,0.03
9,num_medications_MeanImputer,0.12,0.02


# Run the model on the real dataset

In [10]:
# setup parquet file(s) into a tabular dataset:
# points at the processed real data under 'silver/' in the datastore `ds`
# NOTE(review): this rebinds `dataset`, replacing the synthetic dataset loaded earlier
ds_path = [DataPath(ds, 'silver/real_data_processed.parquet')] # {path/*.parquet}
dataset = Dataset.Tabular.from_parquet_files(path=ds_path)
# show dataset settings
dataset

{
  "source": [
    "('datalakestoragegen2', 'silver/real_data_processed.parquet')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ReadParquetFile",
    "DropColumns"
  ]
}

In [11]:
# choose an experiment name: the AutoML run on the real health data is submitted under it
# NOTE(review): this rebinds `experiment`, replacing the synthetic-data experiment
experiment = Experiment(ws, 'automl-classification-real-health-data')

In [12]:
# AutoML sweep settings for the real dataset, forwarded to AutoMLConfig as keyword arguments
automl_settings = dict(
    enable_early_stopping=True,
    experiment_timeout_hours=0.5,
    iterations=20,                 # number of runs ex: 20
    iteration_timeout_minutes=5,
    max_concurrent_iterations=1,
    max_cores_per_iteration=-1,
    # experiment_exit_score=0.9920,
    model_explainability=True,
    n_cross_validations=5,         # make 10 if small dataset, else 5
    primary_metric='AUC_weighted',
    featurization='auto',
    verbosity=logging.INFO,        # {INFO, DEBUG, CRITICAL, ERROR, WARNING} -- debug_log=<*.log>
    enable_dnn=False,
)

# classification on the `readmitted` label, run locally on the tabular `dataset`
automl_config = AutoMLConfig(
    task='classification',
    training_data=dataset,
    label_column_name="readmitted",
    compute_target='local',  # {training_cluster or 'local'}
    debug_log='automl_errors.log',
    enable_onnx_compatible_models=True,
    # alternatives kept for reference:
    #   blacklist_models=['KNN','LinearSVM'],
    #   environment_definition=training_env,
    #   training_data=training_data, validation_data=validation_data,
    **automl_settings
)
# outputs "model.pkl" and "automl_errors.log"

In [13]:
# submit the AutoML configuration to the experiment; show_output=True prints the progress log below
automl_run = experiment.submit(automl_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_18c0f58e-30a7-4e63-bf58-c1067b9e91ec

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         High card

In [35]:
# show the AutoML run widget, then wait for the run to finish
# NOTE(review): the saved output of this cell is stale. Its execution count (35) is out of
# sequence, and the printed runId (AutoML_46766335-...) and num_iterations ('10') do not match
# the Parent Run ID submitted above (AutoML_18c0f58e-...) or the configured 20 iterations.
# Restart the kernel and run all cells before sharing the results.
RunDetails(automl_run).show()
automl_run.wait_for_completion() # get more parameter info

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

{'runId': 'AutoML_46766335-e00c-4431-adae-31680e3e2adf',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-11-28T16:16:32.050678Z',
 'endTimeUtc': '2020-11-28T16:23:40.254571Z',
 'properties': {'num_iterations': '10',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'local',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"abd3833b-ff31-4028-b5fa-9cdee8d1a204\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"datalakestoragegen2\\\\\\", \\\\\\"path\\\\\\": \\\\\\"silver/real_data_processed.parquet\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"myResourceGroup02\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"43c1f93a-903d-4b23-a4bf-92bd7a150627\\\\\\", \\\\\\"wor

In [14]:
# retrieve the best child run of the AutoML run together with its fitted pipeline
best_run, fitted_model = automl_run.get_output()

In [15]:
# pipeline steps: print the name of each step of the fitted pipeline, one per line
for step in fitted_model.named_steps:
    print(step)

datatransformer
MaxAbsScaler
LightGBMClassifier


In [16]:
# model properties: display every pipeline step together with its parameters
fitted_model.named_steps

{'datatransformer': DataTransformer(allow_chargram=None, enable_dnn=None,
                 enable_feature_sweeping=None, feature_sweeping_config=None,
                 feature_sweeping_timeout=None, featurization_config=None,
                 force_text_dnn=None, is_cross_validation=None,
                 is_onnx_compatible=None, logger=None, observer=None, task=None,
                 working_dir=None),
 'MaxAbsScaler': MaxAbsScaler(copy=True),
 'LightGBMClassifier': LightGBMClassifier(boosting_type='gbdt', class_weight=None,
                    colsample_bytree=1.0, importance_type='split',
                    learning_rate=0.1, max_depth=-1, min_child_samples=20,
                    min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
                    n_jobs=-1, num_leaves=31, objective=None, random_state=None,
                    reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
                    subsample_for_bin=200000, subsample_freq=0, verbose=-10)}

In [17]:
# show all metrics recorded on the best run (returned as a dict keyed by metric name)
best_run.get_metrics()

{'AUC_weighted': 0.6540198549547457,
 'accuracy': 0.5754899511407496,
 'precision_score_macro': 0.5170074013167871,
 'recall_score_macro': 0.4011196461638723,
 'precision_score_weighted': 0.5475618364095322,
 'average_precision_score_micro': 0.5986236081274043,
 'balanced_accuracy': 0.4011196461638723,
 'f1_score_weighted': 0.5216643286380028,
 'norm_macro_recall': 0.10167946924580847,
 'log_loss': 0.8973036612800973,
 'recall_score_micro': 0.5754899511407496,
 'AUC_micro': 0.7641147613467902,
 'recall_score_weighted': 0.5754899511407496,
 'AUC_macro': 0.6476366745070036,
 'matthews_correlation': 0.18708773523544217,
 'f1_score_macro': 0.37790990068850216,
 'weighted_accuracy': 0.6740387120134201,
 'average_precision_score_macro': 0.44949881154362126,
 'f1_score_micro': 0.5754899511407496,
 'precision_score_micro': 0.5754899511407496,
 'average_precision_score_weighted': 0.5522691632768659,
 'confusion_matrix': 'aml://artifactId/ExperimentRun/dcid.AutoML_18c0f58e-30a7-4e63-bf58-c1067b9

In [18]:
# show the details of the best run (run id, status, start/end time, properties)
best_run.get_details()

{'runId': 'AutoML_18c0f58e-30a7-4e63-bf58-c1067b9e91ec_0',
 'status': 'Completed',
 'startTimeUtc': '2020-11-28T22:39:56.580418Z',
 'endTimeUtc': '2020-11-28T22:40:30.600913Z',
 'properties': {'runTemplate': 'automl_child',
  'pipeline_id': '5dfac790c5c209f98a1da2dc1c7fb76f0397324f',
  'pipeline_spec': '{"objects":[{"spec_class":"preproc","class_name":"MaxAbsScaler","module":"sklearn.preprocessing","param_args":[],"param_kwargs":{},"prepared_kwargs":{}},{"spec_class":"sklearn","class_name":"LightGBMClassifier","module":"automl.client.core.common.model_wrappers","param_args":[],"param_kwargs":{"min_data_in_leaf":20},"prepared_kwargs":{}}],"pipeline_id":"5dfac790c5c209f98a1da2dc1c7fb76f0397324f","module":"sklearn.pipeline","class_name":"Pipeline"}',
  'training_percent': '100',
  'predicted_cost': None,
  'iteration': '0',
  '_azureml.ComputeTargetType': 'local',
  '_aml_system_scenario_identification': 'Local.Child',
  'run_template': 'automl_child',
  'run_preprocessor': 'MaxAbsScaler'

In [19]:
# Download the model explanation stored with the best run (raw=False: engineered features,
# as the variable name indicates) and read the feature importances as a dict.
# NOTE(review): ExplanationClient is imported from the azureml-explain-model package, which the
# import cell's output reports as deprecated in favour of azureml-interpret.
client = ExplanationClient.from_run(best_run)
engineered_explanations = client.download_model_explanation(raw=False)
feature_importance = engineered_explanations.get_feature_importance_dict() # get model feature importance values
feature_importance

{'_diag_1_CharGramCountVectorizer_Diseases of the nervous system': 3.7914758943895728,
 'age_CharGramCountVectorizer_50-60': 2.9666241390502246,
 'num_medications_perday_MeanImputer': 2.6330764853612787,
 'number_inpatient_CharGramCountVectorizer_11': 2.1404416799116337,
 'number_inpatient_CharGramCountVectorizer_0': 1.952983739241292,
 'time_in_hospital_CharGramCountVectorizer_9': 1.4103933051519244,
 'num_lab_procedures_MeanImputer': 1.361226433504038,
 'num_medications_MeanImputer': 0.9007092812665515,
 'repaglinide_CharGramCountVectorizer_Steady': 0.8518861288243009,
 '_diag_2_CharGramCountVectorizer_Neoplasms': 0.6329274083258766,
 '_diag_1_CharGramCountVectorizer_Endocrine, nutritional, and metabolic diseases and immunity disorders, without diabetes': 0.546763044372514,
 '_diag_2_CharGramCountVectorizer_Diseases of the circulatory system': 0.458015061662098,
 '_diag_3_CharGramCountVectorizer_Supplemental classification': 0.45589868345464873,
 'number_inpatient_CharGramCountVector

In [20]:
# tabulate the importance dict: one row per engineered feature (name, value)
columns = ["modelFeatureImportance_name", "modelFeatureImportance_value"]
importance_pairs = list(feature_importance.items())
fi = pd.DataFrame(importance_pairs, columns=columns)
fi

Unnamed: 0,modelFeatureImportance_name,modelFeatureImportance_value
0,_diag_1_CharGramCountVectorizer_Diseases of the nervous system,3.79
1,age_CharGramCountVectorizer_50-60,2.97
2,num_medications_perday_MeanImputer,2.63
3,number_inpatient_CharGramCountVectorizer_11,2.14
4,number_inpatient_CharGramCountVectorizer_0,1.95
...,...,...
95,_diag_3_CharGramCountVectorizer_Diseases of the genitourinary system,0.01
96,time_in_hospital_severitylvl_CharGramCountVectorizer_Severe,0.01
97,_diag_1_CharGramCountVectorizer_Diseases of the blood and blood-forming organs,0.01
98,number_inpatient_CharGramCountVectorizer_12,0.01


In [21]:
# keep only the rows whose importance value (second column) is at least 0.01,
# i.e. remove small or zero values
is_relevant = fi.iloc[:, 1] >= 0.01
fi = fi[is_relevant]
fi

Unnamed: 0,modelFeatureImportance_name,modelFeatureImportance_value
0,_diag_1_CharGramCountVectorizer_Diseases of the nervous system,3.79
1,age_CharGramCountVectorizer_50-60,2.97
2,num_medications_perday_MeanImputer,2.63
3,number_inpatient_CharGramCountVectorizer_11,2.14
4,number_inpatient_CharGramCountVectorizer_0,1.95
...,...,...
83,number_diagnoses_CharGramCountVectorizer_7,0.01
84,number_inpatient_CharGramCountVectorizer_9,0.01
85,number_inpatient_CharGramCountVectorizer_10,0.01
86,age_CharGramCountVectorizer_40-50,0.01


In [22]:
# Normalise each importance value (second column) by the total of the retained
# features, so that the relative weights add up to 1.
fi_sum = fi.iloc[:, 1].sum()
# `fi` is a filtered slice of the full importance frame: build a new frame with
# assign() instead of writing a column into the slice, which is chained assignment
# and triggers pandas' SettingWithCopyWarning. The division is vectorised.
fi = fi.assign(modelFeatureImportance_relativeWeight=fi.iloc[:, 1] / fi_sum)
fi

Unnamed: 0,modelFeatureImportance_name,modelFeatureImportance_value,modelFeatureImportance_relativeWeight
0,_diag_1_CharGramCountVectorizer_Diseases of the nervous system,3.79,0.15
1,age_CharGramCountVectorizer_50-60,2.97,0.12
2,num_medications_perday_MeanImputer,2.63,0.10
3,number_inpatient_CharGramCountVectorizer_11,2.14,0.08
4,number_inpatient_CharGramCountVectorizer_0,1.95,0.08
...,...,...,...
83,number_diagnoses_CharGramCountVectorizer_7,0.01,0.00
84,number_inpatient_CharGramCountVectorizer_9,0.01,0.00
85,number_inpatient_CharGramCountVectorizer_10,0.01,0.00
86,age_CharGramCountVectorizer_40-50,0.01,0.00


In [23]:
# display the first 20 rows of the feature-importance table
fi.head(20)

Unnamed: 0,modelFeatureImportance_name,modelFeatureImportance_value,modelFeatureImportance_relativeWeight
0,_diag_1_CharGramCountVectorizer_Diseases of the nervous system,3.79,0.15
1,age_CharGramCountVectorizer_50-60,2.97,0.12
2,num_medications_perday_MeanImputer,2.63,0.1
3,number_inpatient_CharGramCountVectorizer_11,2.14,0.08
4,number_inpatient_CharGramCountVectorizer_0,1.95,0.08
5,time_in_hospital_CharGramCountVectorizer_9,1.41,0.05
6,num_lab_procedures_MeanImputer,1.36,0.05
7,num_medications_MeanImputer,0.9,0.04
8,repaglinide_CharGramCountVectorizer_Steady,0.85,0.03
9,_diag_2_CharGramCountVectorizer_Neoplasms,0.63,0.02
