## Import Required packages

In [1]:
import azureml.core
from azureml.core import Workspace
import os, shutil
from azureml.core import Workspace, Experiment, Datastore, Environment, Dataset
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.pipeline.core import Pipeline, PipelineParameter, PipelineData, PublishedPipeline
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import PipelineParameter, PipelineData, PipelineEndpoint
from azureml.data.output_dataset_config import OutputTabularDatasetConfig, OutputDatasetConfig, OutputFileDatasetConfig
from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule

# Connect to the Azure ML workspace described by the saved config file
ws = Workspace.from_config()
print(
    'Ready to use Azure ML {} to work with {}'.format(
        azureml.core.VERSION,
        ws.name,
    )
)

Ready to use Azure ML 1.36.0 to work with mm-aml


In [2]:
import os, shutil
# Folder (under the current working directory) that receives files generated by this notebook
folder_name = 'batch-inferencing-silly'
script_folder = os.path.join(os.getcwd(), folder_name)
print(script_folder)
# exist_ok=True: do not fail when the folder is already there
os.makedirs(script_folder, exist_ok=True)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz3/code/Users/memasanz/email-classification/batch-inferencing-silly


## Connect to AML Workspace

In [3]:
from azureml.core.experiment import Experiment
# Experiment handle in the workspace.
# NOTE(review): the pipeline submission cell builds its own Experiment with a
# different name, so this handle appears unused in this notebook — confirm.
experiment = Experiment(ws, 'email-batch-inferencing-pipeline-silly')

# Default datastore of the workspace; used below for dataset paths
default_ds = ws.get_default_datastore()

## Create Cluster

In [4]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.exceptions import ComputeTargetException

# Name of the compute cluster to look up (or create) in the workspace
compute_name =  "email-cluster3"
print(compute_name)

# checks to see if compute target already exists in workspace, else create it
try:
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
except ComputeTargetException:
    # Lookup failed: provision a new cluster with min_nodes=2 and max_nodes=10.
    # NOTE(review): min_nodes=2 presumably keeps two nodes allocated while idle — confirm the cost is acceptable
    config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D13",
                                                   min_nodes=2, 
                                                   max_nodes=10)

    compute_target = ComputeTarget.create(workspace=ws, name=compute_name, provisioning_configuration=config)
    # Wait for provisioning to finish (timeout: 120 minutes), showing progress output
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=120)

email-cluster3


In [5]:
%%writefile $script_folder/email_classification_inference.yml
name: email_classification_inference
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
  # Conda packages. NOTE(review): no versions are pinned below — confirm this is intended.
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
  # Packages installed through pip
- pip:
  - azureml-defaults
  - pyarrow

Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz3/code/Users/memasanz/email-classification/batch-inferencing-silly/email_classification_inference.yml


In [6]:
from azureml.core import Environment
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

# Create an Environment for the experiment from the conda spec file
# email_classification_inference.yml located in script_folder
batch_env = Environment.from_conda_specification("email_classification_inference", script_folder + "/email_classification_inference.yml")
# Use the SDK's DEFAULT_CPU_IMAGE as the Docker base image
batch_env.docker.base_image = DEFAULT_CPU_IMAGE
print('Configuration ready.')

Configuration ready.


# Define Output Datasets

Below we define the configuration for datasets that will be passed between steps in our pipeline. In each case we specify the datastore that should hold the dataset and request that it be registered when the producing step completes. Registration is optional: to disable it, remove the `register_on_complete()` call.


In [7]:
# Output dataset configs written under the default datastore; each is read as
# delimited files and registered on completion under the given name.
# `{run-id}` in the destination path is presumably substituted per run — TODO confirm.
# NOTE(review): neither config is attached to a pipeline step in this notebook
# (the ParallelRunStep below uses its own `output_dir`).
inferencing_dataset = OutputFileDatasetConfig(name='email_inferencing_dataset', destination=(default_ds, 'inferencing_dataset/{run-id}')).read_delimited_files().register_on_complete(name='inferencing_data')
scored_dataset      = OutputFileDatasetConfig(name='email_scored_dataset', destination=(default_ds, 'scored_dataset/{run-id}')).read_delimited_files().register_on_complete(name='scored_data')




# Register a dataset for the input data
# batch_data_set = Dataset.File.from_files(path=(default_ds, 'spam-data-inferencing/'), validate=False)
# try:
#     batch_data_set = batch_data_set.register(workspace=ws, 
#                                              name='batch-data',
#                                              description='batch data',
#                                              create_new_version=True)
# except Exception as ex:
#     print(ex)

# print("Done!")

# Define Pipeline Parameters

`PipelineParameter` objects serve as variable inputs to an Azure ML pipeline and can be specified at runtime. Below we specify a pipeline parameter object, `model_name`, which will be used to reference the locally trained model that was uploaded and registered within the Azure ML workspace. Multiple pipeline parameters can be created and used. Sample pipeline parameters named `get_data_param_*` highlight how parameters can be passed into and consumed by various pipeline steps; note that the cell below defines only `model_name`.

In [8]:
# Runtime-overridable pipeline parameter; defaults to 'email_classifier'.
# NOTE(review): it is not passed to any step below (the ParallelRunStep uses arguments=[]) — confirm whether it should be.
model_name = PipelineParameter(name='model_name', default_value='email_classifier')

# Define Pipeline Steps

The pipeline below consists of steps to gather and register data from a remote source, a scoring step where the registered model is used to make predictions on the loaded data, and a data publish step where scored data can be exported to a remote data source. Each `PythonScriptStep` has a corresponding `*.py` file which is referenced in the step arguments. Also, any `PipelineParameter` objects defined above can be passed to and consumed within these steps.


In [9]:
import os, shutil
# NOTE: rebinds folder_name / script_folder to 'batch-inferencing', a different
# folder from the 'batch-inferencing-silly' one created earlier in this notebook.
folder_name = 'batch-inferencing'
script_folder = os.path.join(os.getcwd(), folder_name)
print(script_folder)
# exist_ok=True: do not fail when the folder is already there
os.makedirs(script_folder, exist_ok=True)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz3/code/Users/memasanz/email-classification/batch-inferencing


In [31]:
%%writefile $script_folder/batch_inferencing_data_silly.py

import os
import numpy as np
from azureml.core import Model
import joblib
import time


def init():
    """Load the registered model when the pipeline step is initialized.

    The path of the model named 'email_classifier' is resolved with
    ``Model.get_model_path`` and the file is deserialized with ``joblib``.
    The result is stored in the module-level ``model`` global.
    """
    global model

    # The banner is emitted before the load itself runs
    print('****loaded model**********')
    model = joblib.load(Model.get_model_path('email_classifier'))


def run(mini_batch):
    """Produce one placeholder result line per item of ``mini_batch``.

    Every item is printed, processing pauses for 1.2 seconds, and a
    ``"<basename>: ham"`` string is collected. The loaded model is not
    consulted here; each item is labelled 'ham'.

    Args:
        mini_batch: iterable of path-like strings handed to this call.

    Returns:
        list of str, one entry per item of ``mini_batch``, in input order.
    """
    print(f'run method start: {__file__}, run({mini_batch})')
    print('type of mini batch')
    print(str(type(mini_batch)))

    labelled = []
    for item in mini_batch:
        print('****working on mini_batch**********')
        print(item)

        # Fixed pause per item, bracketed by two progress messages
        print("Printed immediately.")
        time.sleep(1.2)
        print("Printed after 1.2 seconds.")

        # Real scoring is disabled; it would read the file and call the model:
        #   data = np.genfromtxt(item, delimiter=',')
        #   prediction = model.predict(data.reshape(1, -1))
        item_name = os.path.basename(item)
        labelled.append("{}: {}".format(item_name, 'ham'))
    return labelled

Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz3/code/Users/memasanz/email-classification/batch-inferencing/batch_inferencing_data_silly.py


You're going to use a pipeline to run the batch prediction script, generate predictions from the input data, and save the results as a text file in the output folder. To do this, you can use a **ParallelRunStep**, which enables the batch data to be processed in parallel and the results collated in a single output file named *parallel_run_step.txt*.

In [32]:
# Register a dataset for the input data
# File dataset over the 'spam-data-inferencing/' path of the default datastore
batch_data_set = Dataset.File.from_files(path=(default_ds, 'spam-data-inferencing/'), validate=False)
try:
    # Register it in the workspace, creating a new version under the same name
    batch_data_set = batch_data_set.register(workspace=ws, 
                                             name='spam-batch-data-inference',
                                             description='inference batch data',
                                             create_new_version=True)
except Exception as ex:
    # Best effort: on failure the message is printed and batch_data_set keeps the unregistered dataset
    print(ex)

# NOTE(review): printed whether or not registration succeeded
print("Done!")

Done!


In [33]:
script_folder

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz3/code/Users/memasanz/email-classification/batch-inferencing'

In [34]:
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.data import OutputFileDatasetConfig
from azureml.core.runconfig import DockerConfiguration

# Output location for the step; retrieved later by the name 'inferences'
output_dir = OutputFileDatasetConfig(name='inferences')

# How the entry script is run on the cluster.
# NOTE(review): for a file dataset, mini_batch_size="50" presumably means up to
# 50 files per run() call, and error_threshold=10 the number of tolerated
# failures — confirm against the ParallelRunConfig reference.
parallel_run_config = ParallelRunConfig(
    source_directory=script_folder,
    entry_script="batch_inferencing_data_silly.py",
    mini_batch_size="50",
    error_threshold=10,
    output_action="append_row",
    environment=batch_env,
    compute_target=compute_target,
    node_count=2)

# Single parallel step over the registered input dataset.
# NOTE(review): the step name mentions 'diabetes' although this notebook scores
# email data — looks like a leftover name; confirm before renaming.
# allow_reuse=False: the step is configured not to reuse earlier results.
parallelrun_step = ParallelRunStep(
    name='batch-score-diabetes',
    parallel_run_config=parallel_run_config,
    inputs=[batch_data_set.as_named_input('email_batch')],
    output=output_dir,
    arguments=[],
    allow_reuse=False
)

print('Steps defined')

Steps defined


In [35]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Build a single-step pipeline from the parallel run step
pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])
# Submit under an experiment created inline.
# NOTE(review): 'classifcation' looks like a typo for 'classification'; left as-is — confirm before renaming.
pipeline_run = Experiment(ws, 'email-classifcation-batch-silly').submit(pipeline)
# Wait for the run to finish, showing its output
pipeline_run.wait_for_completion(show_output=True)

Created step batch-score-diabetes [8898fbf6][bf929ef4-2c9f-42a3-ab55-96d7d4abb741], (This step will run and generate new outputs)
Submitted PipelineRun 5e4065cb-1aa3-4408-b9ec-33080f40c34f
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/5e4065cb-1aa3-4408-b9ec-33080f40c34f?wsid=/subscriptions/5da07161-3770-4a4b-aa43-418cbbb627cf/resourcegroups/mm-aml-rg/workspaces/mm-aml&tid=72f988bf-86f1-41af-91ab-2d7cd011db47
PipelineRunId: 5e4065cb-1aa3-4408-b9ec-33080f40c34f
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/5e4065cb-1aa3-4408-b9ec-33080f40c34f?wsid=/subscriptions/5da07161-3770-4a4b-aa43-418cbbb627cf/resourcegroups/mm-aml-rg/workspaces/mm-aml&tid=72f988bf-86f1-41af-91ab-2d7cd011db47
PipelineRun Status: Running


StepRunId: f01d6188-951d-40c5-97e7-8aa81551eda1
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/f01d6188-951d-40c5-97e7-8aa81551eda1?wsid=/subscriptions/5da07161-3770-4a4b-aa43-418cbbb627cf/resourcegroups/mm-aml-rg/workspaces/


Streaming azureml-logs/70_driver_log.txt
2022/01/14 16:00:18 Didn't get JobInfoJson from env, now read from file
2022/01/14 16:00:18 Starting App Insight Logger for task:  runTaskLet
2022/01/14 16:00:18 Version: 3.0.01830.0002 Branch: 2022-01-05 Commit: 3618b7d
2022/01/14 16:00:18 Attempt 1 of http call to http://10.0.0.4:16384/sendlogstoartifacts/info
2022/01/14 16:00:18 Send process info logs to master server succeeded
2022/01/14 16:00:18 Attempt 1 of http call to http://10.0.0.4:16384/sendlogstoartifacts/status
2022/01/14 16:00:18 Send process info logs to master server succeeded
[2022-01-14T16:00:18.944861] Entering context manager injector.
[2022-01-14T16:00:19.551154] context_manager_injector.py Command line Options: Namespace(inject=['ProjectPythonPath:context_managers.ProjectPythonPath', 'Dataset:context_managers.Datasets', 'RunHistory:context_managers.RunHistory', 'TrackUserError:context_managers.TrackUserError', 'UserExceptions:context_managers.UserExceptions'], invocation=[



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '5e4065cb-1aa3-4408-b9ec-33080f40c34f', 'status': 'Completed', 'startTimeUtc': '2022-01-14T15:59:24.165751Z', 'endTimeUtc': '2022-01-14T16:05:19.926078Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}', 'azureml.continue_on_step_failure': 'False', 'azureml.pipelineComponent': 'pipelinerun'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://mmaml7489685591.blob.core.windows.net/azureml/ExperimentRun/dcid.5e4065cb-1aa3-4408-b9ec-33080f40c34f/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=7Ty0vcT%2FVq1UFwy0K5tNTIlzeKIE6UFayuvVHn%2FMZBM%3D&skoid=df057cdb-33ee-4949-b0a0-f29dd30edb46&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2022-01-14T14%3A38%3A27Z&ske=2022-01-15T22%3A48%3A27Z&sks=b&skv=2019-07-07&st=2022-01-14T15%3A55%3A21Z&se=2022-01-15T00%3A05%3A21Z&sp=r', 'logs/azu

'Finished'

## Publish the Pipeline

In [36]:
# Publish the pipeline built above; the returned object is kept in
# `published_pipeline`. continue_on_step_failure is set to False.
published_pipeline = pipeline.publish(name = 'Email Batch Prediction Pipeline Silly',
                                     description = 'Pipeline that generates batch predictions using a registered trained model.',
                                     continue_on_step_failure = False)

In [37]:
published_pipeline

Name,Id,Status,Endpoint
Email Batch Prediction Pipeline Silly,2c8fc5ae-1508-4bf9-9dda-24c21fb2e8aa,Active,REST Endpoint


In [43]:
# Schedule the published pipeline to run on a timer.
# The id is read from `published_pipeline` instead of being pasted in as a
# literal: every publish yields a new id, so a hard-coded value would silently
# schedule a stale pipeline after the notebook is re-run.
pipeline_id = published_pipeline.id
experiment_name = 'scheduled_silly_email'
# NOTE(review): the recorded pipeline run in this notebook took about six
# minutes, so a 5-minute recurrence may start a run before the previous one
# has finished — confirm this is intended.
recurrence = ScheduleRecurrence(frequency="Minute", interval=5)
recurring_schedule = Schedule.create(ws, name="MyRecurringSchedule",
                            description="Based on time",
                            pipeline_id=pipeline_id,
                            experiment_name=experiment_name,
                            recurrence=recurrence)

# Manage Pipeline Runs

## Get published pipeline Info

In [42]:
experiments = Experiment.list(ws)
# for experiment in experiments:
#     print(experiment.name)

# Print name and id of every published pipeline returned for the workspace.
# The loop variable is deliberately NOT called `published_pipeline`: that name
# holds the object returned by `pipeline.publish(...)`, and reusing it here
# would leave it pointing at whichever pipeline happens to be listed last.
published_pipelines = PublishedPipeline.list(ws)
for listed_pipeline in published_pipelines:
    print(f"{listed_pipeline.name},'{listed_pipeline.id}'")

Email Batch Prediction Pipeline Silly,'2c8fc5ae-1508-4bf9-9dda-24c21fb2e8aa'


In [17]:
# Print every schedule returned by Schedule.list for the workspace,
# followed by a separator line
ss = Schedule.list(ws)
for s in ss:
    print(s)
    print('****************')

In [None]:
def stop_by_schedule_id(ws, schedule_id):
    """Disable the schedule whose id equals ``schedule_id`` and return it.

    Args:
        ws: Workspace passed to ``Schedule.list``.
        schedule_id: Id compared against each listed schedule's ``id``.

    Returns:
        The matching schedule object, after ``disable()`` has been called on it.

    Raises:
        ValueError: if no schedule returned by ``Schedule.list(ws)`` has that id.
    """
    found = next(
        (candidate for candidate in Schedule.list(ws) if candidate.id == schedule_id),
        None,
    )
    if found is None:
        # A bare next() would leak an uninformative StopIteration to the caller
        raise ValueError(f"No schedule with id {schedule_id!r} was found in the workspace")
    found.disable()
    return found

#stop_by_schedule_id(ws, '60166fcd-5276-4557-9a5b-c5a0ce3ec84e')

In [20]:
# Disable a previously published pipeline, looked up by its id.
# It is bound to its own name so that `pipeline` keeps referring to the
# Pipeline object built earlier; overwriting it with a PublishedPipeline would
# break re-running the publish cell, which calls `pipeline.publish(...)`.
published_to_disable = PublishedPipeline.get(ws, id = '898c1939-7278-4ce8-976f-106b71bbb678')
published_to_disable.disable()

# for published_pipeline in  published_pipelines:
#     pipeline = PublishedPipeline.get(ws, id = published_pipeline.id)
#     pipeline.disable()

## Set Schedule for Pipeline

In [None]:
# Create a recurring schedule for the pipeline held in `published_pipeline`.
# The attribute is lowercase `id` (as used when listing published pipelines);
# `Id` is only the column header of the rich display and raises AttributeError.
pipeline_id = published_pipeline.id
experiment_name = 'silly_scheduled_email'
# Run every 5 minutes
recurrence = ScheduleRecurrence(frequency="Minute", interval=5)
# NOTE(review): the schedule name "MyRecurringSchedule" is also used by an
# earlier cell of this notebook — confirm a second schedule is intended.
recurring_schedule = Schedule.create(ws, name="MyRecurringSchedule",
                            description="Based on time",
                            pipeline_id=pipeline_id,
                            experiment_name=experiment_name,
                            recurrence=recurrence)

In [None]:
import pandas as pd
import shutil

# Remove the local results folder if left over from a previous run
# shutil.rmtree('diabetes-results', ignore_errors=True)

# Get the run for the first step and download its output
prediction_run = next(pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path='diabetes-results')

# Traverse the folder hierarchy and find the results file.
# result_file starts as None so a missing file is reported clearly below
# instead of surfacing as a NameError at read_csv. As before, the last
# matching file wins if several are present.
result_file = None
for root, dirs, files in os.walk('diabetes-results'):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root, file)

if result_file is None:
    raise FileNotFoundError("No 'parallel_run_step.txt' found under 'diabetes-results'")

# cleanup output format
# Rows look like "<file>: <label>"; skipinitialspace drops the blank after the
# colon so Prediction holds 'ham' rather than ' ham'.
df = pd.read_csv(result_file, delimiter=":", header=None, skipinitialspace=True)
df.columns = ["File", "Prediction"]

# Display the first 20 results
df.head(20)

In [None]:
from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule