# Build Deep Learning Pipeline - Premier Analysis on Azure Machine Learning
### LSTM and DAN 

In [10]:
import argparse
import azureml
import os
import sklearn
import pandas as pd 
import numpy as np
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from azureml.core import Run, Dataset
from sklearn.preprocessing import LabelEncoder
from azureml.core import Workspace, Experiment, Run, RunConfiguration
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import ScriptRunConfig, Environment
from azureml.widgets import RunDetails
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.core import PipelineParameter
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import DEFAULT_CPU_IMAGE, DEFAULT_GPU_IMAGE

# Record the library versions this notebook was executed with.
sdk_version = azureml.core.VERSION
print("Azure ML SDK Version: ", sdk_version)
print(sklearn.__version__)

Azure ML SDK Version:  1.24.0
1.0.2


In [2]:
from azureml.core import Workspace
from azureml.core.authentication import InteractiveLoginAuthentication

# Interactive login pinned to a specific Azure AD tenant.
interactive_auth = InteractiveLoginAuthentication(tenant_id="9ce70869-60db-44fd-abe8-d2767077fc8f")

# Load the workspace from the local workspace configuration using the
# tenant-scoped credentials. The auth object used to be created but never
# handed to from_config(), so the tenant pin had no effect on the connection.
ws = Workspace.from_config(auth=interactive_auth)
print('Workspace name: ' + ws.name,
      'Azure region: ' + ws.location,
      'Subscription id: ' + ws.subscription_id,
      'Resource group: ' + ws.resource_group, sep = '\n')

Workspace name: cdh-azml-dev-mlw
Azure region: eastus
Subscription id: 320d8d57-c87c-4434-827f-59ee7d86687a
Resource group: CSELS-CDH-DEV


In [3]:
# Resolve the notebook's working directory and its parent directory.
path = os.getcwd()
parent = os.path.join(path, os.pardir)   # un-normalised "<cwd>/<pardir>" form
premier_path = os.path.abspath(parent)   # normalised absolute parent directory

print("Current Directory:", path)
print("\nParent Directory:", premier_path)

Current Directory: c:\Users\wsn8\Code\premier_analysis\azure_ml

Parent Directory: c:\Users\wsn8\Code\premier_analysis


### Create CPU Compute

In [5]:
# Attach to the CPU cluster if the workspace already has one with this name;
# otherwise provision it (at most 2 nodes).
clustername = 'StandardD13v2'
is_new_cluster = False
try:
    aml_compute_cpu = ComputeTarget(workspace=ws, name=clustername)
except ComputeTargetException:
    # Lookup failed: create the cluster from scratch.
    print("Cluster not find - Creating cluster.....")
    is_new_cluster = True
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS13_V2',
        max_nodes=2,
    )
    aml_compute_cpu = ComputeTarget.create(ws, clustername, compute_config)
else:
    print("Find the existing cluster")

# Block until the cluster reports its provisioning state, echoing progress.
aml_compute_cpu.wait_for_completion(show_output=True)

Find the existing cluster
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


### Create GPU Compute

In [6]:
# Attach to the GPU cluster if the workspace already has one with this name;
# otherwise provision it (at most 2 nodes).
clustername = 'StandardNC6'
is_new_cluster = False
try:
    aml_cluster_gpu = ComputeTarget(workspace = ws,name= clustername)
    print("Find the existing cluster")
except ComputeTargetException:
    print("Cluster not find - Creating cluster.....")
    is_new_cluster = True
    # The VM size must be a real Azure SKU name. 'StandardNC6' (no underscore)
    # is only the cluster's display name; the NC6 SKU is 'STANDARD_NC6'.
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',
                                                           max_nodes=2)
    aml_cluster_gpu = ComputeTarget.create(ws, clustername, compute_config)

# Block until the cluster reports its provisioning state, echoing progress.
aml_cluster_gpu.wait_for_completion(show_output=True)

Find the existing cluster
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [172]:
%%writefile conda_dependencies_features.yml

# Conda specification for the environment used by the feature-engineering
# pipeline steps (loaded later via Environment.from_conda_specification).
channels:
- anaconda
# conda's built-in channel alias is "defaults" (plural).
- defaults
dependencies:
- python=3.8
# List pip itself so conda installs it before processing the pip section.
- pip
- pip:
  - azureml-defaults
  - matplotlib
  - pandas==1.1.5
  # NOTE(review): argparse ships with the Python standard library; this entry is likely unnecessary.
  - argparse
  - joblib
  - scikit-learn
  - azureml-sdk
  - openpyxl

Overwriting conda_dependencies_features.yml


In [173]:
%%writefile conda_dependencies_model.yml
# Conda specification for the environment used by the model training and
# registration pipeline steps (loaded later via Environment.from_conda_specification).
channels:
- anaconda
# conda's built-in channel alias is "defaults" (plural).
- defaults
dependencies:
- python=3.8
# List pip itself so conda installs it before processing the pip section.
- pip
- pip:
  - azureml-defaults
  - matplotlib
  - pandas
  # NOTE(review): argparse ships with the Python standard library; this entry is likely unnecessary.
  - argparse
  - joblib
  - scikit-learn
  - azureml-sdk
  - openpyxl
  - tensorflow
  - keras-tuner

Overwriting conda_dependencies_model.yml


In [7]:
# Build the feature-engineering environment from its conda specification file.
premier_feature_env = Environment.from_conda_specification(
    name='premier_feature_env',
    file_path='conda_dependencies_features.yml',
)
# Feature steps run on CPU compute, so use the stock CPU base image.
premier_feature_env.docker.base_image = DEFAULT_CPU_IMAGE
premier_feature_env.register(workspace=ws)

# Run configuration handed to every pipeline step that uses this environment.
run_config_feature = RunConfiguration()
run_config_feature.environment = premier_feature_env

In [8]:
# Build the model-training environment from its conda specification file.
premier_train_model_env = Environment.from_conda_specification(name='premier_train_model_env', file_path='conda_dependencies_model.yml')
# Specify a GPU base image: the steps that use run_config_train are scheduled
# on the GPU cluster. The code previously assigned DEFAULT_CPU_IMAGE here,
# contradicting this comment and leaving the imported DEFAULT_GPU_IMAGE unused.
premier_train_model_env.docker.base_image = DEFAULT_GPU_IMAGE
premier_train_model_env.register(workspace=ws)

# Run configuration handed to every pipeline step that uses this environment.
run_config_train = RunConfiguration()
run_config_train.environment = premier_train_model_env

In [176]:
from azureml.core.datastore import Datastore
from azureml.data.data_reference import DataReference

# Registered datastore and the folder inside it that holds the Premier data.
datastore_name = 'edav_dev_ds'
cdh_path = 'exploratory/databricks_ml/mitre_premier/data/'

ds = Datastore.get(workspace=ws, datastore_name=datastore_name)
print("Datastore's name: {}".format(ds.name))

# Reference to the raw-data folder; consumed as an input by the first pipeline step.
premier_data_ref = DataReference(
    datastore=ds,
    data_reference_name='premier_data',
    path_on_datastore=cdh_path,
)
print("DataReference object created")

Datastore's name: edav_dev_ds
DataReference object created


In [188]:
# All intermediate pipeline outputs live on the workspace's default datastore.
data_store = ws.get_default_datastore()


def _intermediate_dataset(output_name):
    """Create a PipelineData called `output_name` on `data_store` and expose it as a dataset."""
    return PipelineData(output_name, datastore=data_store).as_dataset()


# Outputs of the feature extraction / tokenization steps.
flat_features = _intermediate_dataset("flat_features_data")
feature_lookup = _intermediate_dataset("feature_lookup_data")
trimmed_seq = _intermediate_dataset("trimmed_seq")
pat_data = _intermediate_dataset("pat_data")
demog_dict = _intermediate_dataset("demog_dict_data")
all_ftrs_dict = _intermediate_dataset("all_ftrs_dict_data")
int_seqs = _intermediate_dataset("int_seqs_data")

# Outputs of the sequence trimming and training steps.
trimmed_seq_pkl = _intermediate_dataset("trimmed_seq_pkl_data")
cohort = _intermediate_dataset("cohort")
model_file = _intermediate_dataset("model_probs")
stats_file = _intermediate_dataset("analysis_data")
preds_file = _intermediate_dataset("prediction_data")
probs_file = _intermediate_dataset("probs_data")

In [12]:
# Pipeline-level parameters; the defaults apply unless overridden at submission.
model_name = PipelineParameter("model_name", default_value="dan")
outcome = PipelineParameter("outcome", default_value="misa_pt")
n_epochs = PipelineParameter("n_epochs", default_value=10)


### Feature Extraction

In [189]:
# Step 1: run feature_extraction.py on the CPU cluster. It receives the raw
# data reference as a download and produces the flat features and feature lookup.
source_directory = './training'

feature_extraction_args = [
    "--flat_features", flat_features,
    "--feature_lookup", feature_lookup,
]

step1 = PythonScriptStep(
    name="feature_extraction",
    script_name="feature_extraction.py",
    source_directory=source_directory,
    arguments=feature_extraction_args,
    inputs=[premier_data_ref.as_download()],
    outputs=[flat_features, feature_lookup],
    compute_target=aml_compute_cpu,
    runconfig=run_config_feature,
    allow_reuse=True,
)
print("Step1 feature_extraction created")

Step1 feature_extraction created


### Feature Tokenization

In [190]:
# Step 2: run feature_tokenization.py on the CPU cluster. It consumes the flat
# features and produces the sequence, patient, and dictionary outputs below.
source_directory = './training'

feature_tokenization_args = [
    "--flat_features", flat_features,
    "--trimmed_seq_file", trimmed_seq,
    "--pat_data_file", pat_data,
    "--demog_dict_file", demog_dict,
    "--all_ftrs_dict_file", all_ftrs_dict,
    "--int_seqs_file", int_seqs,
]

step2 = PythonScriptStep(
    name="feature_tokenization",
    script_name="feature_tokenization.py",
    source_directory=source_directory,
    arguments=feature_tokenization_args,
    inputs=[flat_features],
    outputs=[trimmed_seq, demog_dict, pat_data, all_ftrs_dict, int_seqs],
    compute_target=aml_compute_cpu,
    runconfig=run_config_feature,
    allow_reuse=True,
)
print("Step2 feature_tokenization created")

Step2 feature_tokenization created


### Sequence Trimming

In [191]:
# Step 3: run sequence_trimming.py on the CPU cluster. It consumes the patient
# data, feature dictionaries, integer sequences, and feature lookup, and produces
# the pickled trimmed sequences and the cohort.
source_directory = './training'

sequence_trimming_args = [
    "--trimmed_seq_pkl_file", trimmed_seq_pkl,
    "--pat_data_file", pat_data,
    "--feature_lookup", feature_lookup,
    "--all_ftrs_dict_file", all_ftrs_dict,
    "--int_seqs_file", int_seqs,
    "--cohort", cohort,
]

step3 = PythonScriptStep(
    name="sequence_trimming",
    script_name="sequence_trimming.py",
    source_directory=source_directory,
    arguments=sequence_trimming_args,
    inputs=[pat_data, all_ftrs_dict, int_seqs, feature_lookup],
    outputs=[trimmed_seq_pkl, cohort],
    compute_target=aml_compute_cpu,
    runconfig=run_config_feature,
    allow_reuse=True,
)
print("Step3 sequence_trimming created")

Step3 sequence_trimming created


### Training Step

In [192]:
# Step 4: run train_model_step.py on the GPU cluster with the training
# environment. Model, outcome, and epoch count come from pipeline parameters.
source_directory = './training'

train_model_args = [
    # Tunable pipeline parameters.
    "--model", model_name,
    "--outcome", outcome,
    "--epochs", n_epochs,
    # Outputs written by the training script.
    "--model_file", model_file,
    "--stats_file", stats_file,
    "--preds_file", preds_file,
    "--probs_file", probs_file,
    # Inputs produced by the earlier steps.
    "--demog_dict_file", demog_dict,
    "--trimmed_seq_pkl_file", trimmed_seq_pkl,
    "--feature_lookup", feature_lookup,
    "--all_ftrs_dict_file", all_ftrs_dict,
    "--cohort", cohort,
]

step4 = PythonScriptStep(
    name="train_model",
    script_name="train_model_step.py",
    source_directory=source_directory,
    arguments=train_model_args,
    inputs=[demog_dict, trimmed_seq_pkl, all_ftrs_dict, feature_lookup, cohort],
    outputs=[model_file, stats_file, preds_file, probs_file],
    compute_target=aml_cluster_gpu,
    runconfig=run_config_train,
    allow_reuse=True,
)
print("Step4 train_model created")

Step4 train_model created


### Register model

In [197]:
# Step 5: run register_model.py against the model output of the training step.
# NOTE(review): this step is scheduled on the GPU cluster; registration
# presumably does not need a GPU - confirm whether the CPU cluster would do.
source_directory = './training/register'

register_model_args = [
    "--model_file", model_file,
    "--model", model_name,
    "--outcome", outcome,
]

step5 = PythonScriptStep(
    name="register_model",
    script_name="register_model.py",
    source_directory=source_directory,
    arguments=register_model_args,
    inputs=[model_file],
    compute_target=aml_cluster_gpu,
    runconfig=run_config_train,
    allow_reuse=True,
)
print("Step5 register_model created")

Step5 register_model created


In [198]:
# Assemble the five steps defined above into a single pipeline.
steps = [
    step1,
    step2,
    step3,
    step4,
    step5,
]
pipeline1 = Pipeline(
    workspace=ws,
    steps=steps,
    default_datastore=data_store,
)

In [199]:
# Experiment under which the pipeline runs are recorded.
run_exp = Experiment(ws, "Premier-Feature-Pipeline-3")

In [200]:
# Submit the pipeline, overriding the default pipeline parameters for this run.
# Kept as the cell's last expression so the returned run is displayed.
run_exp.submit(
    pipeline1,
    pipeline_parameters={
        "model_name": "lstm",
        "outcome": 'icu',
        "n_epochs": 1,
    },
)

Created step feature_extraction [ab154d49][86dc60af-ff47-4d8a-9a35-f3a117095b5c], (This step will run and generate new outputs)
Created step feature_tokenization [453b55d5][a3db32d6-8886-4ebb-9e1b-09ad9579aa28], (This step will run and generate new outputs)Created step sequence_trimming [fa18507f][80dd786d-ed24-4592-ae74-82396a13638b], (This step will run and generate new outputs)

Created step train_model [c67c4afd][2fb4f67d-3753-4787-a342-7208cf79e488], (This step will run and generate new outputs)
Created step register_model [7e735cd1][a3c6a8e0-aa63-43f1-bc3a-9c546d96f09e], (This step will run and generate new outputs)
Using data reference premier_data for StepId [9bc0a31f][b2d198f2-4469-4899-9ebe-e6a54440d08a], (Consumers of this data are eligible to reuse prior runs.)
Submitted PipelineRun 7ced846d-95c3-4569-8e03-244aeca3f226
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/7ced846d-95c3-4569-8e03-244aeca3f226?wsid=/subscriptions/320d8d57-c87c-4434-827f-59ee7d86687

Experiment,Id,Type,Status,Details Page,Docs Page
Premier-Feature-Pipeline-3,7ced846d-95c3-4569-8e03-244aeca3f226,azureml.PipelineRun,Running,Link to Azure Machine Learning studio,Link to Documentation
