Author: Kevin ALBERT

Created: Nov 2020

In [1]:
# import logging
import os
import logging
import pandas as pd
import numpy as np
import json
import requests
import joblib

In [2]:
from azureml.core import Workspace, Dataset, Datastore, Run
from azureml.core.experiment import Experiment
from azureml.data.datapath import DataPath
from azureml.core.compute import ComputeTarget, AmlCompute, AksCompute
from azureml.core.model import Model, InferenceConfig
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.run import AutoMLRun
from azureml.widgets import RunDetails
from azureml.core.webservice import Webservice, AciWebservice, AksWebservice
from azureml.exceptions import WebserviceException
from azureml.core.environment import Environment
from azureml.train.estimator import Estimator
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.sampling import RandomParameterSampling, GridParameterSampling
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.parameter_expressions import choice
from azureml.core.runconfig import RunConfiguration
from azureml.pipeline.core import PipelineData, Pipeline
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.pipeline.core.run import PipelineRun
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.explain.model._internal.explanation_client import ExplanationClient
import azureml.core
# Report the installed azureml.core SDK version so the run environment is recorded with the notebook.
print("azureml.core version:", azureml.core.__version__)

azureml.core version: 1.17.0


The azureml-explain-model package is deprecated and will be removed in a future release of the AzureML SDK. Please use the azureml-interpret and interpret-community packages which support the functionality azureml-explain-model used to provide.


In [3]:
# Notebook-wide pandas display limits.
# (pd.describe_option('display') lists every available option, but rendering it can slow the notebook.)
display_limits = {
    'display.max_colwidth': 100,  # default 50, the maximum width in characters of a column
    'display.max_columns': 40,    # default 20, the maximum amount of columns in view
    'display.max_rows': 60,       # default 60, the maximum amount of rows in view
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

In [4]:
# List the installed packages whose name contains "azureml" (case-insensitive), with their versions.
# Runs in a shell, so it needs `grep` to be available.
!pip list |grep -i azureml

azureml-accel-models                  1.17.0
azureml-automl-core                   1.17.0
azureml-automl-runtime                1.17.0
azureml-cli-common                    1.17.0
azureml-contrib-dataset               1.17.0
azureml-contrib-fairness              1.17.0
azureml-contrib-gbdt                  1.17.0
azureml-contrib-interpret             1.17.0
azureml-contrib-notebook              1.17.0
azureml-contrib-pipeline-steps        1.17.0
azureml-contrib-reinforcementlearning 1.17.0
azureml-contrib-server                1.17.0
azureml-contrib-services              1.17.0
azureml-core                          1.17.0
azureml-datadrift                     1.17.0
azureml-dataprep                      2.4.2
azureml-dataprep-native               24.0.0
azureml-dataprep-rslex                1.2.2
azureml-dataset-runtime               1.17.0
azureml-defaults                      1.17.0
azureml-explain-model                 1.17.0
azureml-interpret                   

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
#import eli5
#from eli5.sklearn import PermutationImportance
# import scikitplot as skplt
from sklearn.decomposition import PCA

# Use seaborn's white grid theme for all plots.
sns.set_style('whitegrid')
# NOTE: this replaces the display.max_columns limit of 40 that an earlier cell configured.
pd.set_option('display.max_columns', None) # display all columns

import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation notices from the SDK and pandas copy warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [6]:
# Load the Azure ML workspace described by the local workspace configuration file.
ws = Workspace.from_config()

In [7]:
# Bind to the named experiment in the workspace; runs submitted below are grouped under this name.
experiment = Experiment(ws, 'automl-classification-synthetic-health-data')

In [8]:
# Register the existing "datalake" blob container as a datastore in the workspace.
# The storage account key is a secret and must never be committed with the notebook:
# it is read from the AZURE_STORAGE_ACCOUNT_KEY environment variable instead
# (a KeyError is raised if the variable is not set).
# NOTE: the key that was previously embedded in this cell should be rotated.
ds = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name="datalakestoragegen2",
    container_name="datalake",
    account_name="datalake27112020",
    account_key=os.environ["AZURE_STORAGE_ACCOUNT_KEY"],
    create_if_not_exists=False)  # the container must already exist
# list available datastores
ws.datastores

{'datalakestoragegen2': {
   "name": "datalakestoragegen2",
   "container_name": "datalake",
   "account_name": "datalake27112020",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'workspacefilestore': {
   "name": "workspacefilestore",
   "container_name": "azureml-filestore-9835fb79-8b03-46ca-ba4b-b8cd0d3e846a",
   "account_name": "machinelstorage9af0d08f1",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'workspaceblobstore': {
   "name": "workspaceblobstore",
   "container_name": "azureml-blobstore-9835fb79-8b03-46ca-ba4b-b8cd0d3e846a",
   "account_name": "machinelstorage9af0d08f1",
   "protocol": "https",
   "endpoint": "core.windows.net"
 }}

In [9]:
# Build a tabular dataset from parquet file(s) stored on the registered datastore `ds`.
# The path is given relative to the datastore; the trailing note shows the pattern form {path/*.parquet}.
ds_path = [DataPath(ds, 'silver/synthetic_data_processed_remove_hba1c.parquet')] # {path/*.parquet}
dataset = Dataset.Tabular.from_parquet_files(path=ds_path)
# show dataset settings (source and definition steps)
dataset

{
  "source": [
    "('datalakestoragegen2', 'silver/synthetic_data_processed_remove_hba1c.parquet')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ReadParquetFile",
    "DropColumns"
  ]
}

In [11]:
# dataset.to_pandas_dataframe()["A1Cresult"]

In [12]:
# Sweep settings, unpacked as keyword arguments into AutoMLConfig below.
automl_settings = dict(
    enable_early_stopping=True,
    experiment_timeout_hours=0.5,
    iterations=10,                  # number of runs, e.g. 20
    iteration_timeout_minutes=5,
    max_concurrent_iterations=1,
    max_cores_per_iteration=-1,
    # experiment_exit_score=0.9920,
    model_explainability=True,
    n_cross_validations=5,          # make 10 if small dataset, else 5
    primary_metric='AUC_weighted',
    featurization='auto',
    verbosity=logging.INFO,         # {INFO, DEBUG, CRITICAL, ERROR, WARNING} -- written to debug_log=<*.log>
    enable_dnn=False,
)

# Classification task trained locally on `dataset`, predicting the "readmitted" column.
automl_config = AutoMLConfig(
    task='classification',
    debug_log='automl_errors.log',
    compute_target='local',         # {training_cluster or 'local'}
    # blacklist_models=['KNN','LinearSVM'],
    # environment_definition=training_env,
    enable_onnx_compatible_models=True,
    training_data=dataset,
    label_column_name="readmitted",
    # alternative: explicit split via training_data=training_data, validation_data=validation_data
    **automl_settings
)
# outputs "model.pkl" and "automl_errors.log"

In [13]:
# Submit the AutoML configuration to the experiment; show_output=True streams progress into the cell output.
automl_run = experiment.submit(automl_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_1766ab23-4500-4668-95f2-c4cf355d7b19

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         High card

### optional: retrieve specific run

In [None]:
# runId = 'AutoML_891419fd-d69c-4a91-b536-f008adcb800c'
# automl_run = AutoMLRun(experiment, run_id=runId)

### results: explore the best pipeline

In [14]:
# Render the interactive run-monitoring widget for the AutoML run.
RunDetails(automl_run).show()
# Block until the run has finished; the returned run details (status, timestamps, properties) are the cell output.
automl_run.wait_for_completion()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

{'runId': 'AutoML_1766ab23-4500-4668-95f2-c4cf355d7b19',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-11-28T20:51:21.027136Z',
 'endTimeUtc': '2020-11-28T20:55:53.430654Z',
 'properties': {'num_iterations': '10',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'local',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"b3d7ce4e-900f-49c6-8070-b9382b46f3c4\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"datalakestoragegen2\\\\\\", \\\\\\"path\\\\\\": \\\\\\"silver/synthetic_data_processed_remove_hba1c.parquet\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"myResourceGroup02\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"43c1f93a-903d-4b23-a4bf-92bd7a150627\

In [15]:
# Retrieve the best run of the AutoML experiment together with its fitted pipeline.
best_run, fitted_model = automl_run.get_output()
# Print the name of each step of the fitted pipeline, one per line.
for step_name in fitted_model.named_steps.keys():
    print(step_name)

datatransformer
prefittedsoftvotingclassifier


In [16]:
# model properties: each pipeline step name mapped to its fitted transformer/estimator and parameters
fitted_model.named_steps

{'datatransformer': DataTransformer(allow_chargram=None, enable_dnn=None,
                 enable_feature_sweeping=None, feature_sweeping_config=None,
                 feature_sweeping_timeout=None, featurization_config=None,
                 force_text_dnn=None, is_cross_validation=None,
                 is_onnx_compatible=None, logger=None, observer=None, task=None,
                 working_dir=None),
 'prefittedsoftvotingclassifier': PreFittedSoftVotingClassifier(classification_labels=None,
                               estimators=[('0',
                                            Pipeline(memory=None,
                                                     steps=[('maxabsscaler',
                                                             MaxAbsScaler(copy=True)),
                                                            ('lightgbmclassifier',
                                                             LightGBMClassifier(boosting_type='gbdt',
                                     

In [17]:
# show all metrics recorded for the best run (metric name -> value)
best_run.get_metrics()

{'f1_score_weighted': 0.7447772502096816,
 'f1_score_macro': 0.6865905114847157,
 'matthews_correlation': 0.5617864259969967,
 'recall_score_micro': 0.7516767720959348,
 'average_precision_score_macro': 0.7695858933483806,
 'precision_score_micro': 0.7516767720959348,
 'AUC_weighted': 0.8775572510294355,
 'balanced_accuracy': 0.6647108348052492,
 'precision_score_macro': 0.7294028566975033,
 'recall_score_macro': 0.6647108348052492,
 'precision_score_weighted': 0.7479405114928387,
 'norm_macro_recall': 0.4970662522078738,
 'f1_score_micro': 0.7516767720959348,
 'accuracy': 0.7516767720959348,
 'AUC_macro': 0.8829318852482348,
 'average_precision_score_micro': 0.8314633741843664,
 'average_precision_score_weighted': 0.8240378775437387,
 'AUC_micro': 0.9043359985156428,
 'weighted_accuracy': 0.7947083791333165,
 'log_loss': 0.5947824462488391,
 'recall_score_weighted': 0.7516767720959348,
 'accuracy_table': 'aml://artifactId/ExperimentRun/dcid.AutoML_1766ab23-4500-4668-95f2-c4cf355d7b19_

In [18]:
# best_run.download_file('residuals')

In [19]:
# best_run.download_file('predicted_true')

In [20]:
# show the full details record of the best run (run id, status, timestamps, properties)
best_run.get_details()

{'runId': 'AutoML_1766ab23-4500-4668-95f2-c4cf355d7b19_9',
 'status': 'Completed',
 'startTimeUtc': '2020-11-28T20:55:29.28262Z',
 'endTimeUtc': '2020-11-28T20:55:52.670628Z',
 'properties': {'runTemplate': 'automl_child',
  'pipeline_id': '__AutoML_Ensemble__',
  'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'AUC_weighted\',\'verbosity\':20,\'ensemble_iterations\':10,\'is_timeseries\':False,\'name\':\'automl-classification-synthetic-health-data\',\'compute_target\':\'local\',\'subscription_id\':\'43c1f93a-903d-4b23-a4bf-92bd7a150627\',\'region\':\'westeurope\',\'spark_service\':None}","ensemble_run_id":"AutoML_1766ab23-4500-4668-95f2-c4cf355d7b19_9","experiment_name":null,"workspace_name":"machine_learning_workspace02","subscription_id":"43c1f93a-903d-4b23-a4bf-92bd7a150627"

In [21]:
# Download the model explanation stored with the best run.
client = ExplanationClient.from_run(best_run)
# raw=False requests the explanation on the engineered (featurized) features rather than the raw columns.
engineered_explanations = client.download_model_explanation(raw=False)
# Mapping of engineered feature name -> importance value.
feature_importance = engineered_explanations.get_feature_importance_dict() # get model feature importance values
feature_importance

{'number_inpatient_CharGramCountVectorizer_0': 0.7772008069962925,
 'change_ModeCatImputer_LabelEncoder': 0.3801518351807926,
 '_diag_1_CharGramCountVectorizer_Diseases of the circulatory system': 0.27387390988417126,
 'age_CharGramCountVectorizer_80-90': 0.19182798967702477,
 '_diag_1_CharGramCountVectorizer_Diseases of the respiratory system': 0.1500073796332051,
 '_diag_3_CharGramCountVectorizer_Diabetes mellitus': 0.14128871524204825,
 'number_diagnoses_CharGramCountVectorizer_9': 0.13275789346747544,
 '_diag_3_CharGramCountVectorizer_Diseases of the circulatory system': 0.1259235152860604,
 '_diag_2_CharGramCountVectorizer_Diabetes mellitus': 0.1181136603027101,
 'time_in_hospital_severitylvl_CharGramCountVectorizer_Normal': 0.11593947177044918,
 'number_inpatient_CharGramCountVectorizer_2': 0.11537379265322602,
 '_diag_2_CharGramCountVectorizer_Diseases of the circulatory system': 0.11496547044462507,
 '_diag_2_CharGramCountVectorizer_Endocrine, nutritional, and metabolic disease

In [22]:
# Tabulate the importance mapping: one row per feature, name in the first column, value in the second.
columns = ["modelFeatureImportance_name", "modelFeatureImportance_value"]
importance_rows = list(feature_importance.items())  # [(name, value), ...]
fi = pd.DataFrame(importance_rows, columns=columns)
fi

Unnamed: 0,modelFeatureImportance_name,modelFeatureImportance_value
0,number_inpatient_CharGramCountVectorizer_0,0.78
1,change_ModeCatImputer_LabelEncoder,0.38
2,_diag_1_CharGramCountVectorizer_Diseases of the circulatory system,0.27
3,age_CharGramCountVectorizer_80-90,0.19
4,_diag_1_CharGramCountVectorizer_Diseases of the respiratory system,0.15
...,...,...
95,_diag_1_CharGramCountVectorizer_Diseases of the skin and subcutaneous tissue,0.00
96,insulin_CharGramCountVectorizer_Steady,0.00
97,_diag_3_CharGramCountVectorizer_Infectious and parasitic diseases,0.00
98,repaglinide_CharGramCountVectorizer_Steady,0.00


In [23]:
# Keep only the features whose importance value (second column) is at least 0.01.
# .copy() detaches the result from the unfiltered frame, so adding a column to `fi`
# afterwards writes to this frame itself rather than to a slice of another one
# (the situation pandas reports as SettingWithCopyWarning).
fi = fi[fi.iloc[:, 1] >= 0.01].copy()
fi

Unnamed: 0,modelFeatureImportance_name,modelFeatureImportance_value
0,number_inpatient_CharGramCountVectorizer_0,0.78
1,change_ModeCatImputer_LabelEncoder,0.38
2,_diag_1_CharGramCountVectorizer_Diseases of the circulatory system,0.27
3,age_CharGramCountVectorizer_80-90,0.19
4,_diag_1_CharGramCountVectorizer_Diseases of the respiratory system,0.15
...,...,...
67,number_diagnoses_CharGramCountVectorizer_4,0.01
68,"_diag_2_CharGramCountVectorizer_Other symptoms, signs, and ill-defined conditions",0.01
69,time_in_hospital_CharGramCountVectorizer_6,0.01
70,max_glu_serum_CharGramCountVectorizer_None,0.01


In [24]:
# Total of the importance values (second column) of the remaining features.
fi_sum = fi.iloc[:, 1].sum()
# Relative weight = value / total, so the weights of the listed features sum to 1.
# The vectorised division replaces the per-row lambda, and assign() returns a new frame
# instead of writing a column into what may be a filtered slice.
fi = fi.assign(modelFeatureImportance_relativeWeight=fi.iloc[:, 1] / fi_sum)
fi

Unnamed: 0,modelFeatureImportance_name,modelFeatureImportance_value,modelFeatureImportance_relativeWeight
0,number_inpatient_CharGramCountVectorizer_0,0.78,0.16
1,change_ModeCatImputer_LabelEncoder,0.38,0.08
2,_diag_1_CharGramCountVectorizer_Diseases of the circulatory system,0.27,0.06
3,age_CharGramCountVectorizer_80-90,0.19,0.04
4,_diag_1_CharGramCountVectorizer_Diseases of the respiratory system,0.15,0.03
...,...,...,...
67,number_diagnoses_CharGramCountVectorizer_4,0.01,0.00
68,"_diag_2_CharGramCountVectorizer_Other symptoms, signs, and ill-defined conditions",0.01,0.00
69,time_in_hospital_CharGramCountVectorizer_6,0.01,0.00
70,max_glu_serum_CharGramCountVectorizer_None,0.01,0.00


# Second run: the ">30" dataset (`synthetic_data_processed_morethan30.parquet`)

In [None]:
# Build the tabular dataset for the second run from parquet file(s) on the registered datastore `ds`.
# This rebinds `dataset`, so the cells below train on this file instead of the first run's data.
ds_path = [DataPath(ds, 'silver/synthetic_data_processed_morethan30.parquet')] # {path/*.parquet}
dataset = Dataset.Tabular.from_parquet_files(path=ds_path)
# show dataset settings
dataset

In [None]:
# Bind to the experiment used for the "change" label; runs submitted below are grouped under this name.
experiment = Experiment(ws, 'automl-classification-synthetic-change')

In [None]:
# Sweep settings, unpacked as keyword arguments into AutoMLConfig below.
automl_settings = dict(
    enable_early_stopping=True,
    experiment_timeout_hours=0.5,
    iterations=10,                  # number of runs, e.g. 20
    iteration_timeout_minutes=5,
    max_concurrent_iterations=1,
    max_cores_per_iteration=-1,
    # experiment_exit_score=0.9920,
    model_explainability=True,
    n_cross_validations=5,          # make 10 if small dataset, else 5
    primary_metric='AUC_weighted',
    featurization='auto',
    verbosity=logging.INFO,         # {INFO, DEBUG, CRITICAL, ERROR, WARNING} -- written to debug_log=<*.log>
    enable_dnn=False,
)

# Classification task trained locally on `dataset`, predicting the "change" column.
automl_config = AutoMLConfig(
    task='classification',
    debug_log='automl_errors.log',
    compute_target='local',         # {training_cluster or 'local'}
    # blacklist_models=['KNN','LinearSVM'],
    # environment_definition=training_env,
    enable_onnx_compatible_models=True,
    training_data=dataset,
    label_column_name="change",
    # alternative: explicit split via training_data=training_data, validation_data=validation_data
    # (the first run used label_column_name="readmitted")
    **automl_settings
)
# outputs "model.pkl" and "automl_errors.log"

In [None]:
# Submit the AutoML configuration to the experiment; show_output=True streams progress into the cell output.
automl_run = experiment.submit(automl_config, show_output=True)

In [None]:
# Render the interactive run-monitoring widget for the AutoML run.
RunDetails(automl_run).show()
# Block until the run has finished; the returned run details are the cell output.
automl_run.wait_for_completion()

In [None]:
# Retrieve the best run of the AutoML experiment together with its fitted pipeline.
best_run, fitted_model = automl_run.get_output()

In [None]:
# Print the name of each step of the fitted pipeline, one per line.
for step_name in fitted_model.named_steps.keys():
    print(step_name)

In [None]:
# model properties: each pipeline step name mapped to its fitted transformer/estimator and parameters
fitted_model.named_steps

In [None]:
# show all metrics recorded for the best run (metric name -> value)
best_run.get_metrics()

In [None]:
# show the full details record of the best run
best_run.get_details()

In [None]:
# Download the model explanation stored with the best run.
client = ExplanationClient.from_run(best_run)
# raw=False requests the explanation on the engineered (featurized) features rather than the raw columns.
engineered_explanations = client.download_model_explanation(raw=False)
# Mapping of engineered feature name -> importance value.
feature_importance = engineered_explanations.get_feature_importance_dict() # get model feature importance values
feature_importance

In [None]:
# Tabulate the importance mapping: one row per feature, name in the first column, value in the second.
columns = ["modelFeatureImportance_name", "modelFeatureImportance_value"]
importance_rows = list(feature_importance.items())  # [(name, value), ...]
fi = pd.DataFrame(importance_rows, columns=columns)
fi

In [None]:
# Keep only the features whose importance value (second column) is at least 0.01.
# .copy() detaches the result from the unfiltered frame, so adding a column to `fi`
# afterwards writes to this frame itself rather than to a slice of another one
# (the situation pandas reports as SettingWithCopyWarning).
fi = fi[fi.iloc[:, 1] >= 0.01].copy()
fi

In [None]:
# Total of the importance values (second column) of the remaining features.
fi_sum = fi.iloc[:, 1].sum()
# Relative weight = value / total, so the weights of the listed features sum to 1.
# The vectorised division replaces the per-row lambda, and assign() returns a new frame
# instead of writing a column into what may be a filtered slice.
fi = fi.assign(modelFeatureImportance_relativeWeight=fi.iloc[:, 1] / fi_sum)
fi

# Third run: the "No" dataset (`synthetic_data_processed_withNo.parquet`)

In [None]:
# Build the tabular dataset for the third run from parquet file(s) on the registered datastore `ds`.
# This rebinds `dataset`, so the cells below train on this file instead of the previous run's data.
ds_path = [DataPath(ds, 'silver/synthetic_data_processed_withNo.parquet')] # {path/*.parquet}
dataset = Dataset.Tabular.from_parquet_files(path=ds_path)
# show dataset settings
dataset

In [None]:
# Materialise the whole dataset as a pandas DataFrame just to read its (rows, columns) shape.
dataset.to_pandas_dataframe().shape

In [None]:
# Bind to the experiment under which the runs submitted below are grouped.
# NOTE(review): this is the same experiment name as the second run, so both runs are
# recorded under one experiment; confirm that is intended.
experiment = Experiment(ws, 'automl-classification-synthetic-change')

In [None]:
# Sweep settings, unpacked as keyword arguments into AutoMLConfig below.
automl_settings = dict(
    enable_early_stopping=True,
    experiment_timeout_hours=0.5,
    iterations=10,                  # number of runs, e.g. 20
    iteration_timeout_minutes=5,
    max_concurrent_iterations=1,
    max_cores_per_iteration=-1,
    # experiment_exit_score=0.9920,
    model_explainability=True,
    n_cross_validations=5,          # make 10 if small dataset, else 5
    primary_metric='AUC_weighted',
    featurization='auto',
    verbosity=logging.INFO,         # {INFO, DEBUG, CRITICAL, ERROR, WARNING} -- written to debug_log=<*.log>
    enable_dnn=False,
)

# Classification task trained locally on `dataset`, predicting the "change" column.
automl_config = AutoMLConfig(
    task='classification',
    debug_log='automl_errors.log',
    compute_target='local',         # {training_cluster or 'local'}
    # blacklist_models=['KNN','LinearSVM'],
    # environment_definition=training_env,
    enable_onnx_compatible_models=True,
    training_data=dataset,
    label_column_name="change",
    # alternative: explicit split via training_data=training_data, validation_data=validation_data
    # (the first run used label_column_name="readmitted")
    **automl_settings
)
# outputs "model.pkl" and "automl_errors.log"

In [None]:
# Submit the AutoML configuration to the experiment; show_output=True streams progress into the cell output.
automl_run = experiment.submit(automl_config, show_output=True)

In [None]:
# Render the interactive run-monitoring widget for the AutoML run.
RunDetails(automl_run).show()
# Block until the run has finished; the returned run details are the cell output.
automl_run.wait_for_completion()

In [None]:
# Retrieve the best run of the AutoML experiment together with its fitted pipeline.
best_run, fitted_model = automl_run.get_output()

In [None]:
# Print the name of each step of the fitted pipeline, one per line.
for step_name in fitted_model.named_steps.keys():
    print(step_name)

In [None]:
# model properties: each pipeline step name mapped to its fitted transformer/estimator and parameters
fitted_model.named_steps

In [None]:
# show all metrics recorded for the best run (metric name -> value)
best_run.get_metrics()

In [None]:
# show the full details record of the best run
best_run.get_details()

In [None]:
# Download the model explanation stored with the best run.
client = ExplanationClient.from_run(best_run)
# raw=False requests the explanation on the engineered (featurized) features rather than the raw columns.
engineered_explanations = client.download_model_explanation(raw=False)
# Mapping of engineered feature name -> importance value.
feature_importance = engineered_explanations.get_feature_importance_dict() # get model feature importance values
feature_importance

In [None]:
# Tabulate the importance mapping: one row per feature, name in the first column, value in the second.
columns = ["modelFeatureImportance_name", "modelFeatureImportance_value"]
importance_rows = list(feature_importance.items())  # [(name, value), ...]
fi = pd.DataFrame(importance_rows, columns=columns)
fi

In [None]:
# Keep only the features whose importance value (second column) is at least 0.01.
# .copy() detaches the result from the unfiltered frame, so adding a column to `fi`
# afterwards writes to this frame itself rather than to a slice of another one
# (the situation pandas reports as SettingWithCopyWarning).
fi = fi[fi.iloc[:, 1] >= 0.01].copy()
fi

In [None]:
# Total of the importance values (second column) of the remaining features.
fi_sum = fi.iloc[:, 1].sum()
# Relative weight = value / total, so the weights of the listed features sum to 1.
# The vectorised division replaces the per-row lambda, and assign() returns a new frame
# instead of writing a column into what may be a filtered slice.
fi = fi.assign(modelFeatureImportance_relativeWeight=fi.iloc[:, 1] / fi_sum)
fi