## Data preparation for Premier Analysis on Azure Machine Learning

In [8]:
import argparse
import os
import sklearn
import pandas as pd 
import numpy as np
from azureml.data import OutputFileDatasetConfig
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from azureml.core import Workspace, Experiment, Run, RunConfiguration
from azureml.core import Run, Dataset, Environment,Experiment,ScriptRunConfig
from sklearn.preprocessing import LabelEncoder
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

In [9]:
from azureml.core import  Workspace
from azureml.core.authentication import InteractiveLoginAuthentication

# Connect to the Azure ML workspace via Workspace.from_config(); later cells
# pass `ws` when looking up compute, datastores and experiments.
ws = Workspace.from_config()

In [10]:
# Show which workspace the notebook is attached to, one field per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

Workspace name: cdh-azml-dev-mlw
Azure region: eastus
Subscription id: 320d8d57-c87c-4434-827f-59ee7d86687a
Resource group: CSELS-CDH-DEV


In [11]:
# Working directory the notebook kernel was started in.
path = os.getcwd()
print("Current Directory:", path)

# Step up one level with os.pardir and normalise once; the absolute form is
# both displayed and kept as premier_path.
parent = os.path.join(path, os.pardir)
premier_path = os.path.abspath(parent)

print("\nParent Directory:", premier_path)

Current Directory: c:\Users\wsn8\Code\premier_analysis\azure_ml

Parent Directory: c:\Users\wsn8\Code\premier_analysis


#### Create Compute

In [12]:
# Attach to the named compute target if the lookup succeeds; otherwise
# provision a new cluster under the same name.
clustername = 'StandardD13v2'
is_new_cluster = False
try:
    aml_compute_cpu = ComputeTarget(workspace=ws, name=clustername)
except ComputeTargetException:
    # Lookup failed: create the cluster with at most two nodes.
    print("Cluster not find - Creating cluster.....")
    is_new_cluster = True
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS13_V2',
        max_nodes=2,
    )
    aml_compute_cpu = ComputeTarget.create(ws, clustername, compute_config)
else:
    print("Find the existing cluster")

# Runs for both the existing and the newly created target.
aml_compute_cpu.wait_for_completion(show_output=True)

Find the existing cluster
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [13]:
# Build the feature-engineering environment from its conda specification file.
premier_feature_env = Environment.from_conda_specification(
    name='premier_feature_env',
    file_path='./environments/conda_dependencies_features.yml',
)

# Use the SDK-provided default CPU image as the Docker base image.
premier_feature_env.docker.base_image = DEFAULT_CPU_IMAGE

# Kept as the last expression so the notebook displays register()'s return value.
premier_feature_env.register(workspace=ws)


{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/intelmpi2018.3-ubuntu16.04:20210220.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "premier_feature_env",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "defau

In [14]:
from azureml.core.datastore import Datastore
from azureml.data.data_reference import DataReference


# Look up the datastore by name in the workspace.
datastore_name = 'edav_dev_ds'
cdh_path = 'exploratory/databricks_ml/mitre_premier/data/'
ds = Datastore.get(ws, datastore_name)
print(f"Datastore's name: {ds.name}")

# Reference the data folder on that datastore.
premier_data_ref = DataReference(
    datastore=ds,
    path_on_datastore=cdh_path,
    data_reference_name='premier_data',
)
print("DataReference object created")

# Print the reference as returned by as_mount().
print(premier_data_ref.as_mount())

Datastore's name: edav_dev_ds
DataReference object created
$AZUREML_DATAREFERENCE_premier_data


#### Feature Extraction

In [15]:

# Outputs of the feature-extraction job, both placed on the workspace's
# default datastore: one destination for parquet, one for pickle files.
data_store = ws.get_default_datastore()
flat_features, feature_lookup = (
    OutputFileDatasetConfig(destination=(data_store, folder))
    for folder in ('output/parquet', 'output/pkl')
)

In [16]:
# Configure the feature-extraction script run: the script lives in
# ./training and receives the two output locations as command-line arguments.
source_directory = './training'
extraction_arguments = [
    "--flat_features", flat_features,
    "--feature_lookup", feature_lookup,
]
job_feature_extraction = ScriptRunConfig(
    source_directory=source_directory,
    script="feature_extraction.py",
    arguments=extraction_arguments,
    compute_target=aml_compute_cpu,
    environment=premier_feature_env,
)
           



In [17]:
# Experiment under which the feature-extraction run is recorded.
# (Plain string: the name has no placeholders, so no f-prefix is needed.)
experiment_extraction = Experiment(workspace=ws, name="Job-feature-extraction")

print("Submit Experiment")
# Submit the configured job, then wait for the run to finish without
# streaming its logs into the notebook.
run_extraction = experiment_extraction.submit(job_feature_extraction)
run_extraction.wait_for_completion(show_output=False)

Submit Experiment


TypeError: Object of type DataReference is not JSON serializable

#### Feature Tokenization

In [15]:
# Five separate output configs for the tokenization job; each is its own
# object but all share the same 'output/pkl' destination on data_store.
trimmed_seq, pat_data, demog_dict, all_ftrs_dict, int_seqs = (
    OutputFileDatasetConfig(destination=(data_store, 'output/pkl'))
    for _ in range(5)
)

In [16]:
# Configure the feature-tokenization script run.
# NOTE(review): flat_features is the OutputFileDatasetConfig created for the
# extraction job's output. Confirm that handing the same object to a separate
# run exposes the previously written files to the script; it may need to be
# consumed as an input dataset instead. TODO verify.
source_directory = './training'
tokenization_arguments = [
    "--flat_features", flat_features,
    "--trimmed_seq_file", trimmed_seq,
    "--pat_data_file", pat_data,
    "--demog_dict_file", demog_dict,
    "--all_ftrs_dict_file", all_ftrs_dict,
    "--int_seqs_file", int_seqs,
]
job_feature_tokenization = ScriptRunConfig(
    source_directory=source_directory,
    script="feature_tokenization.py",
    arguments=tokenization_arguments,
    compute_target=aml_compute_cpu,
    environment=premier_feature_env,
)


In [17]:
# Experiment under which the feature-tokenization run is recorded.
# (Plain string: the name has no placeholders, so no f-prefix is needed.)
experiment_tokenization = Experiment(workspace=ws, name="Job-feature-tokenization")

print("Submit Experiment")
# Submit the configured job, then wait for the run to finish without
# streaming its logs into the notebook.
run_tokenization = experiment_tokenization.submit(job_feature_tokenization)
run_tokenization.wait_for_completion(show_output=False)

Submit Experiment


#### Sequence Trimming

In [21]:
# Output configs for the sequence-trimming job: one under 'output/pkl' and
# one under 'output/cohort', both on data_store.
trimmed_seq_pkl, cohort = (
    OutputFileDatasetConfig(destination=(data_store, folder))
    for folder in ('output/pkl', 'output/cohort')
)

In [22]:
# Configure the sequence-trimming script run.
# NOTE(review): pat_data, feature_lookup, all_ftrs_dict and int_seqs are the
# OutputFileDatasetConfig objects created for earlier jobs. Confirm that
# passing them to this separate run gives the script access to the files
# those jobs wrote. TODO verify.
source_directory = './training'
trimming_arguments = [
    "--trimmed_seq_pkl_file", trimmed_seq_pkl,
    "--pat_data_file", pat_data,
    "--feature_lookup", feature_lookup,
    "--all_ftrs_dict_file", all_ftrs_dict,
    "--int_seqs_file", int_seqs,
    "--cohort", cohort,
]
job_sequence_trimming = ScriptRunConfig(
    source_directory=source_directory,
    script="sequence_trimming.py",
    arguments=trimming_arguments,
    compute_target=aml_compute_cpu,
    environment=premier_feature_env,
)
print(" sequence_trimming created")
print(" sequence_trimming created")

 sequence_trimming created


In [23]:
# Experiment under which the sequence-trimming run is recorded.
# (Plain string: the name has no placeholders, so no f-prefix is needed.)
experiment_trimming = Experiment(workspace=ws, name="Job-feature-sequence-trimming")

print("Submit Experiment")
# Submit the configured job, then wait for the run to finish without
# streaming its logs into the notebook.
run_trimming = experiment_trimming.submit(job_sequence_trimming)
run_trimming.wait_for_completion(show_output=False)

Submit Experiment
