## Using Azure AutoML to predict COVID-19 outomes from EHR Data

In [3]:
import argparse
import azureml
import os
import sklearn
import pandas as pd 
import numpy as np
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from azureml.core import Run, Dataset
from sklearn.preprocessing import LabelEncoder
from azureml.core import Workspace, Experiment, Run, RunConfiguration
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import ScriptRunConfig, Environment
from azureml.core import  Workspace
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

In [4]:
workspace = "cdh-azml-dev-mlw"
resource_group= 'CSELS-CDH-DEV'
subscription_id= "320d8d57-c87c-4434-827f-59ee7d86687a"

ws = Workspace.get(name=workspace,
                    resource_group=resource_group,
                    subscription_id=subscription_id)

In [5]:
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

Workspace name: cdh-azml-dev-mlw
Azure region: eastus
Subscription id: 320d8d57-c87c-4434-827f-59ee7d86687a
Resource group: CSELS-CDH-DEV


In [6]:
# current working directory
pwd = os.getcwd()
print("Current Directory:", pwd)
  
# parent directory
parent = os.path.join(pwd, os.pardir)
  
# prints parent directory
print("\nParent Directory:", os.path.abspath(parent))

premier_path = os.path.abspath(parent)

Current Directory: c:\Users\wsn8\Code\premier_analysis\azure_ml

Parent Directory: c:\Users\wsn8\Code\premier_analysis


In [7]:
clustername = 'StandardD13v2'
is_new_cluster = False
try:
    aml_compute_cpu = ComputeTarget(workspace = ws,name= clustername)
    print("Find the existing cluster")
except ComputeTargetException:
    print("Cluster not find - Creating cluster.....")
    is_new_cluster = True
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS13_V2',
                                                           max_nodes=2)
    aml_compute_cpu = ComputeTarget.create(ws, clustername, compute_config)

aml_compute_cpu.wait_for_completion(show_output=True)

Find the existing cluster
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [8]:
premier_train_baseline_env = Environment.from_conda_specification(name='premier_train_baseline_env', file_path='conda_dependencies_baseline.yml')
# Specify a CPU base image
premier_train_baseline_env.docker.enabled = True
premier_train_baseline_env.docker.base_image = DEFAULT_CPU_IMAGE
premier_train_baseline_env.register(workspace=ws)

'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


{
    "assetId": "azureml://locations/eastus/workspaces/d5539876-73f2-429b-9d16-cd4969e1602d/environments/premier_train_baseline_env/versions/4",
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20220915.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "buildContext": null,
        "enabled": true,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "premier_trai

In [9]:
from azureml.core import Workspace, Dataset
from azureml.core import Run
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.pipeline.core import PipelineParameter
data_store = ws.get_default_datastore()

##########Loading the data from datastore



In [25]:

%%writefile ./automl/baseline_preprocessing.py

import os
import time
from importlib import reload
import pandas as pd
import argparse
import numpy as np
from azureml.core import Model,Dataset
import joblib
import pickle as pkl
from azureml.core import Workspace, Experiment, Run, RunConfiguration
from scipy.sparse import lil_matrix
from sklearn.model_selection import train_test_split

def flatten(l):
    return [item for sublist in l for item in sublist]

def baseline_preprocessing(OUTCOME):

    DAY_ONE_ONLY = True
    USE_DEMOG = True
    TEST_SPLIT = 0.1
    VAL_SPLIT = 0.2
    RAND = 42

    run = Run.get_context()
    print("run name:",run.display_name)
    print("run details:",run.get_details())
    
    ws = run.experiment.workspace

    data_store = ws.get_default_datastore()

    print("Creating dataset from Datastore")
    inputs = Dataset.File.from_files(path=data_store.path('output/pkl/trimmed_seqs.pkl'))  
    vocab = Dataset.File.from_files(path=data_store.path('output/pkl/all_ftrs_dict.pkl'))
    demog_dict = Dataset.File.from_files(path=data_store.path('output/pkl/demog_dict.pkl'))
    cohort = Dataset.Tabular.from_delimited_files(path=data_store.path('output/cohort/cohort.csv'))
    
    pwd = os.path.dirname(__file__)
    output_dir = os.path.abspath(os.path.join(pwd,"output"))
    pkl_dir = os.path.join(output_dir, "pkl")
    csv_dir = os.path.join(output_dir, "csv")

    os.makedirs(pkl_dir, exist_ok=True)
    os.makedirs(csv_dir, exist_ok=True)

    print("Dowloading data from Datastore...")

    inputs.download(target_path=pkl_dir,overwrite=True,ignore_not_found=True)
    vocab.download(target_path=pkl_dir,overwrite=True,ignore_not_found=True)
    demog_dict.download(target_path=pkl_dir,overwrite=True,ignore_not_found=True)
    cohort.to_pandas_dataframe().to_csv(os.path.join(csv_dir,'cohort.csv'))

    print("Loading var...")
    with open(os.path.join(pkl_dir, "trimmed_seqs.pkl"), "rb") as f:
        inputs = pkl.load(f)

    with open(os.path.join(pkl_dir, "all_ftrs_dict.pkl"), "rb") as f:
        vocab = pkl.load(f)

    with open(os.path.join(pkl_dir, "demog_dict.pkl"), "rb") as f:
        demog_dict = pkl.load(f)
        demog_dict = {k: v for v, k in demog_dict.items()}

    
    # Separating the inputs and labels
    features = [t[0] for t in inputs]
    demog = [t[1] for t in inputs]
    cohort = pd.read_csv(os.path.join(csv_dir, 'cohort.csv'))
    labels = cohort[OUTCOME]

    # Counts to use for loops and stuff
    n_patients = len(features)
    n_features = np.max(list(vocab.keys()))
    n_classes = len(np.unique(labels))
    binary = n_classes <= 2

        # Converting the labels to an array
    y = np.array(labels, dtype=np.uint8)

    # Optionally limiting the features to only those from the first day
    # of the actual COVID visit
    if DAY_ONE_ONLY:
        features = [l[-1] for l in features]
    else:
        features = [flatten(l) for l in features]

    new_demog = [[i + n_features for i in l] for l in demog]
    features = [features[i] + new_demog[i] for i in range(n_patients)]
    demog_vocab = {k + n_features: v for k, v in demog_dict.items()}
    vocab.update(demog_vocab)
    n_features = np.max([np.max(l) for l in features])
    # all_feats.update({v: v for k, v in demog_dict.items()})

    # Converting the features to a sparse matrix
    mat = lil_matrix((n_patients, n_features + 1))
    for row, cols in enumerate(features):
        mat[row, cols] = 1

    # Converting to csr because the internet said it would be faster
    print("Converting to csr..")
    X = mat.tocsr()

    # Splitting the data; 'all' will produce the same test sample
    # for every outcome (kinda nice)

    STRATIFY = None

    strat_var = y
    train, test = train_test_split(range(n_patients),
                                    test_size=TEST_SPLIT,
                                    stratify=strat_var,
                                    random_state=RAND)

    # Doing a validation split for threshold-picking on binary problems
    train, val = train_test_split(train,
                                    test_size=VAL_SPLIT,
                                    stratify=strat_var[train],
                                    random_state=RAND)

    return  X[train],y[train],X[test],y[test]


if __name__ == '__main__':
    parser = argparse.ArgumentParser("feature")
    parser.add_argument("--outcome",type=str)

    args = parser.parse_args()

    OUTCOME = args.outcome

    x_train,y_train,x_test,y_test = baseline_preprocessing(OUTCOME=OUTCOME)

    x_train_df = pd.DataFrame.sparse.from_spmatrix(x_train)
    x_test_df = pd.DataFrame.sparse.from_spmatrix(x_test)

    train_data = pd.concat([x_train_df,pd.DataFrame(y_train)],axis =1)
    print("train shape:",train_data.shape)
    
    run = Run.get_context()
    ws = run.experiment.workspace

    data_store = ws.get_default_datastore()

    dataset_name = f"automl-train-baseline-{OUTCOME}"
    ds_train = Dataset.Tabular.register_pandas_dataframe(train_data,target=data_store,name=dataset_name,show_progress=True)


Overwriting ./automl/baseline_preprocessing.py


In [26]:
OUTCOME = 'icu'
source_directory ='./automl'
job_feature_processing = ScriptRunConfig(
                         script="baseline_preprocessing.py", 
                         arguments=["--outcome",OUTCOME],
                         compute_target=aml_compute_cpu, 
                         environment=premier_train_baseline_env,
                         source_directory=source_directory)
print("job_feature_processing created")

job_feature_processing created


In [27]:
exp_name = f"Job-feature-preprocess-automl-baseline"
print("Submit Experiment:",exp_name)
# Create experiment
experiment = Experiment(workspace=ws, name = exp_name)
run = experiment.submit(job_feature_processing)

Submit Experiment: Job-feature-preprocess-automl-baseline


#### Auto ML configuration

In [42]:
from azureml.train.automl import AutoMLConfig

# Get the batch dataset for input
dataset_name = f"automl-train-baseline-{OUTCOME}"
batch_data_set = ws.datasets['premier_features']

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=15,
    task='classification',
    primary_metric="accuracy",
    compute_target = aml_compute_cpu,
    training_data=train_data,
    label_column_name='y',
    n_cross_validations=5)

In [44]:
# Submit your automl run
exp = Experiment(workspace=ws, name="automl-train-baseline-icu")
automl_run = exp.submit(automl_config, show_output=True)

ConfigException: ConfigException:
	Message: Input of type '<class 'pandas.core.frame.DataFrame'>' is not supported. Supported types: [azureml.data.tabular_dataset.TabularDataset]Please refer to documentation for converting to Supported types: https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.dataset.dataset?view=azure-ml-py
	InnerException: None
	ErrorResponse 
{
    "error": {
        "code": "UserError",
        "message": "Input of type '<class 'pandas.core.frame.DataFrame'>' is not supported. Supported types: [azureml.data.tabular_dataset.TabularDataset]Please refer to documentation for converting to Supported types: https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.dataset.dataset?view=azure-ml-py",
        "details_uri": "https://aka.ms/AutoMLConfig",
        "target": "training_data",
        "inner_error": {
            "code": "BadArgument",
            "inner_error": {
                "code": "ArgumentInvalid",
                "inner_error": {
                    "code": "InvalidInputDatatype"
                }
            }
        }
    }
}