## Import Required packages

In [1]:
import azureml.core
from azureml.core import Workspace
import os, shutil
from azureml.core import Workspace, Experiment, Datastore, Environment, Dataset
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.pipeline.core import Pipeline, PipelineParameter, PipelineData
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import PipelineParameter, PipelineData, PipelineEndpoint
from azureml.data.output_dataset_config import OutputTabularDatasetConfig, OutputDatasetConfig, OutputFileDatasetConfig
from azureml.core.experiment import Experiment

# Connect to the workspace described by the saved config file.
ws = Workspace.from_config()

# Report the SDK version and the workspace name as a quick sanity check.
ready_message = 'Ready to use Azure ML {} to work with {}'
print(ready_message.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.36.0 to work with mm-aml


In [2]:
import os, shutil
# Name of the sub-folder (under the current working directory) that holds
# the files written for the batch step.
folder_name = 'batch-inferencing'
working_dir = os.getcwd()
script_folder = os.path.join(working_dir, folder_name)
print(script_folder)

# exist_ok=True keeps the cell re-runnable when the folder is already there.
os.makedirs(script_folder, exist_ok=True)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz3/code/Users/memasanz/email-classification/batch-inferencing


In [3]:
# Get the workspace's default datastore; the batch input dataset created
# later in this notebook is read from a folder on it.
default_ds = ws.get_default_datastore()

## Create Cluster

In [4]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.exceptions import ComputeTargetException

compute_name = "email-cluster"
print(compute_name)

# Look the cluster up by name first; a new one is provisioned only when the
# lookup raises ComputeTargetException.
try:
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
except ComputeTargetException:
    # Provisioning settings for the new cluster: 0 to 5 STANDARD_D13 nodes.
    cluster_settings = {"vm_size": "STANDARD_D13", "min_nodes": 0, "max_nodes": 5}
    config = AmlCompute.provisioning_configuration(**cluster_settings)

    compute_target = ComputeTarget.create(
        workspace=ws,
        name=compute_name,
        provisioning_configuration=config,
    )
    # Block until provisioning finishes (120 minute timeout), showing progress output.
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=120,
    )

email-cluster


In [5]:
%%writefile $script_folder/email_classification_inference.yml
# Conda environment specification used to build the batch-inference environment.
name: email_classification_inference
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
# NOTE(review): the packages below are unpinned; presumably scikit-learn should
# match the version that produced the serialized model - confirm.
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
# Packages installed through pip rather than conda.
- pip:
  - azureml-defaults
  - pyarrow

Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz3/code/Users/memasanz/email-classification/batch-inferencing/email_classification_inference.yml


In [6]:
from azureml.core import Environment
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

# Build the experiment environment from the conda specification file in the
# script folder, then set the default CPU image as its Docker base image.
conda_spec_path = script_folder + "/email_classification_inference.yml"
batch_env = Environment.from_conda_specification(
    name="email_classification_inference",
    file_path=conda_spec_path,
)
batch_env.docker.base_image = DEFAULT_CPU_IMAGE
print('Configuration ready.')

Configuration ready.


# Define Pipeline Parameters

PipelineParameter objects serve as variable inputs to an Azure ML pipeline and can be specified at runtime. Below we specify a pipeline parameter object, `model_name`, which is intended to reference the locally trained model that was uploaded and registered within the Azure ML workspace. Multiple pipeline parameters can be created, passed into, and consumed by various pipeline steps; note that the sample `get_data_param_*` parameters are not defined in this notebook, which declares only `model_name`.

In [7]:
# Pipeline parameter for the model name, defaulting to 'email_classifier'.
# NOTE(review): this parameter is not passed to the ParallelRunStep defined
# later (its arguments list is empty) and the entry script hard-codes
# 'email_classifier', so changing the value currently has no effect.
model_name = PipelineParameter(name='model_name', default_value='email_classifier')

# Define Pipeline Steps

The pipeline below consists of steps to gather and register data from a remote source, a scoring step where the registered model is used to make predictions on the loaded data, and a data publish step where scored data can be exported to a remote data source. All of the PythonScriptSteps have a corresponding *.py file which is referenced in the step arguments. Also, any PipelineParameters defined above can be passed to and consumed within these steps. In this notebook, only the scoring step is implemented, as a single ParallelRunStep.


In [8]:
import os, shutil
# Same folder as computed earlier in the notebook: '<cwd>/batch-inferencing'.
folder_name = 'batch-inferencing'
script_folder = os.path.join(os.getcwd(), folder_name)
print(script_folder)
# Creating the folder is a no-op when it already exists.
os.makedirs(name=script_folder, exist_ok=True)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz3/code/Users/memasanz/email-classification/batch-inferencing


In [45]:
%%writefile $script_folder/batch_inferencing_data.py

import os
import numpy as np
from azureml.core import Model
import joblib
import pandas as pd

def init():
    """Load the classifier and expose it through the module-level ``model`` global.

    Runs when the pipeline step is initialized; ``run`` reads the ``model``
    global for every mini-batch it processes.
    """
    global model

    # Resolve the path of the 'email_classifier' model and deserialize it.
    model_path = Model.get_model_path('email_classifier')
    model = joblib.load(model_path)
    # Log only after the load succeeded, so the message never appears for a
    # load that actually failed.
    print('****loaded model**********')


def run(mini_batch):
    """Classify every file in a mini-batch.

    Args:
        mini_batch: iterable of paths to text files. Each file is read in
            full and passed to ``model.predict`` as a single document.

    Returns:
        list of str: one ``"<file name>: <prediction>"`` entry per input
        file, in input order.
    """
    # This runs for each batch
    print(f'run method start: {__file__}, run({mini_batch})')
    resultList = []
    print('type of mini batch')
    print(str(type(mini_batch)))
    # process each file in the batch
    for f in mini_batch:
        print('****working on mini_batch**********')
        print(f)
        # The context manager closes the file even when read() raises.
        with open(f, "r") as text_file:
            data = text_file.read()
        result = model.predict([data])
        # Log the size only: dumping whole email bodies bloats the job logs
        # and exposes message contents.
        print('read {} characters'.format(len(data)))
        resultList.append("{}: {}".format(os.path.basename(f), result[0]))
    return resultList

Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz3/code/Users/memasanz/email-classification/batch-inferencing/batch_inferencing_data.py


You're going to use a pipeline to run the batch prediction script, generate predictions from the input data, and save the results as a text file in the output folder. To do this, you can use a **ParallelRunStep**, which enables the batch data to be processed in parallel and the results collated in a single output file named *parallel_run_step.txt*.

In [46]:
# Create a file dataset over the 'spam-data-inferencing/' folder of the default datastore.
batch_data_set = Dataset.File.from_files(path=(default_ds, 'spam-data-inferencing/'), validate=False)

# Registration is best-effort: when it raises, batch_data_set keeps referring
# to the unregistered dataset created above and the notebook continues.
try:
    batch_data_set = batch_data_set.register(workspace=ws, 
                                             name='spam-batch-data-inference',
                                             description='inference batch data',
                                             create_new_version=True)
except Exception as ex:
    # Report the failure explicitly instead of letting a success message follow it.
    print('Dataset registration failed; continuing with the unregistered dataset: {}'.format(ex))
else:
    print("Done!")

Done!


In [47]:
# Echo the script folder path as the cell output.
script_folder

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz3/code/Users/memasanz/email-classification/batch-inferencing'

In [48]:
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.data import OutputFileDatasetConfig
from azureml.core.runconfig import DockerConfiguration

# Named output ('inferences') that receives the step's results.
output_dir = OutputFileDatasetConfig(name='inferences')

# Settings that control how the entry script is run across the cluster.
parallel_run_settings = {
    "source_directory": script_folder,
    "entry_script": "batch_inferencing_data.py",
    "mini_batch_size": "50",  # presumably files per run() call for a file dataset - confirm
    "error_threshold": 10,
    "output_action": "append_row",
    "environment": batch_env,
    "compute_target": compute_target,
    "node_count": 2,
}
parallel_run_config = ParallelRunConfig(**parallel_run_settings)

# The registered email dataset is handed to the step under the name 'email_batch'.
email_batch_input = batch_data_set.as_named_input('email_batch')
parallelrun_step = ParallelRunStep(
    name='batch-score-diabetes',
    parallel_run_config=parallel_run_config,
    inputs=[email_batch_input],
    output=output_dir,
    arguments=[],
    allow_reuse=True,
)

print('Steps defined')

Steps defined


In [49]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Wrap the single parallel-run step in a pipeline, submit it under the
# experiment, and wait for the run to complete while showing its output.
pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])
experiment = Experiment(ws, '02-email-classifcation-batch')
pipeline_run = experiment.submit(pipeline)
pipeline_run.wait_for_completion(show_output=True)

Created step batch-score-diabetes [8844fa41][96ee9f02-dfed-42ab-9689-cda89f580553], (This step will run and generate new outputs)
Submitted PipelineRun be0db82b-07ad-40a4-ac5f-d0d476e8036d
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/be0db82b-07ad-40a4-ac5f-d0d476e8036d?wsid=/subscriptions/5da07161-3770-4a4b-aa43-418cbbb627cf/resourcegroups/mm-aml-rg/workspaces/mm-aml&tid=72f988bf-86f1-41af-91ab-2d7cd011db47
PipelineRunId: be0db82b-07ad-40a4-ac5f-d0d476e8036d
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/be0db82b-07ad-40a4-ac5f-d0d476e8036d?wsid=/subscriptions/5da07161-3770-4a4b-aa43-418cbbb627cf/resourcegroups/mm-aml-rg/workspaces/mm-aml&tid=72f988bf-86f1-41af-91ab-2d7cd011db47
PipelineRun Status: Running


StepRunId: 4bef8128-e5c7-41cc-82f6-e280200fbef1
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/4bef8128-e5c7-41cc-82f6-e280200fbef1?wsid=/subscriptions/5da07161-3770-4a4b-aa43-418cbbb627cf/resourcegroups/mm-aml-rg/workspaces/


Streaming azureml-logs/65_job_prep-tvmps_0f15ffa1ca4291e5beadc630cc87be44b0ab1533d9e628a284cd4c5cc6d6d8bb_d.txt
[2022-02-09T02:37:50.032204] Entering job preparation.
[2022-02-09T02:37:50.706774] Starting job preparation.
[2022-02-09T02:37:50.706821] Extracting the control code.
[2022-02-09T02:37:50.707167] Starting extract_project.
[2022-02-09T02:37:50.707238] Starting to extract zip file.
[2022-02-09T02:37:50.725728] Finished extracting zip file.
[2022-02-09T02:37:50.729532] Using urllib.request Python 3.0 or later
[2022-02-09T02:37:50.729596] Start fetching snapshots.
[2022-02-09T02:37:50.729630] Start fetching snapshot.
Starting the daemon thread to refresh tokens in background for process with pid = 59
[2022-02-09T02:37:51.120457] Finished fetching snapshot.
[2022-02-09T02:37:51.120499] Start fetching snapshot.
[2022-02-09T02:37:58.964514] Finished fetching snapshot.
[2022-02-09T02:37:58.964561] Finished fetching snapshots.
[2022-02-09T02:37:58.964570] Finished extract_project.
[



[2022-02-09T02:39:03.271492] The experiment completed successfully. Finalizing run...
Cleaning up all outstanding Run operations, waiting 900.0 seconds
3 items cleaning up...
Cleanup took 0.22355246543884277 seconds
[2022-02-09T02:39:03.643429] Finished context manager injector.
2022/02/09 02:39:04 Attempt 1 of http call to http://10.0.0.5:16384/sendlogstoartifacts/status
2022/02/09 02:39:04 Send process info logs to master server succeeded
2022/02/09 02:39:04 Not exporting to RunHistory as the exporter is either stopped or there is no data.
Stopped: false
OriginalData: 2
FilteredData: 0.
2022/02/09 02:39:04 Process Exiting with Code:  0
2022/02/09 02:39:05 All App Insights Logs was sent successfully or the close timeout of 10 was reached

Streaming azureml-logs/75_job_post-tvmps_0f15ffa1ca4291e5beadc630cc87be44b0ab1533d9e628a284cd4c5cc6d6d8bb_d.txt
[2022-02-09T02:39:07.775533] Entering job release
Failure while loading azureml_run_type_providers. Failed to load entrypoint azureml.sc



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': 'be0db82b-07ad-40a4-ac5f-d0d476e8036d', 'status': 'Completed', 'startTimeUtc': '2022-02-09T02:37:27.316049Z', 'endTimeUtc': '2022-02-09T02:39:26.320671Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}', 'azureml.continue_on_step_failure': 'False', 'azureml.pipelineComponent': 'pipelinerun'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://mmaml7489685591.blob.core.windows.net/azureml/ExperimentRun/dcid.be0db82b-07ad-40a4-ac5f-d0d476e8036d/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=Z%2Flp6b%2F4vV1Z08sbgkLfoeG7aygSvH9f8XGcrj1djhQ%3D&skoid=df057cdb-33ee-4949-b0a0-f29dd30edb46&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2022-02-09T01%3A10%3A27Z&ske=2022-02-10T09%3A20%3A27Z&sks=b&skv=2019-07-07&st=2022-02-09T02%3A28%3A16Z&se=2022-02-09T10%3A38%3A16Z&sp=r', 'logs/azu

'Finished'

## Retrieve results

In [50]:
import pandas as pd
import shutil

# Local folder that receives the downloaded step output.
results_dir = 'diabetes-results'

# Remove the local results folder if left over from a previous run.
# ignore_errors=True already tolerates a missing folder, so no try/except is needed.
shutil.rmtree(results_dir, ignore_errors=True)

# Get the run for the first step and download its output
prediction_run = next(pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path=results_dir)

# Traverse the folder hierarchy and find the results file
result_file = None
for root, dirs, files in os.walk(results_dir):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root, file)

# Fail with a clear message instead of a NameError when no results file was downloaded.
if result_file is None:
    raise FileNotFoundError(
        "No 'parallel_run_step.txt' found under '{}'".format(results_dir))

# Each row has the form "<file name>: <prediction>"; skipinitialspace drops the
# blank that follows the colon so predictions carry no leading space.
df = pd.read_csv(result_file, delimiter=":", header=None, skipinitialspace=True)
df.columns = ["File", "Prediction"]

# Display the first 20 results
df.head(20)

Unnamed: 0,File,Prediction
0,3000.csv,spam
1,3001.csv,ham
2,3002.csv,ham
3,3003.csv,spam
4,3004.csv,ham
5,3005.csv,ham
6,3006.csv,ham
7,3007.csv,spam
8,3008.csv,ham
9,3009.csv,ham


In [None]:
# Disabled example: create a recurring schedule that runs the pipeline every 5 minutes.
# NOTE(review): ScheduleRecurrence and Schedule are not imported in the cells shown, and
# Schedule.create presumably expects the id of a *published* pipeline rather than
# `pipeline.Id` - confirm before enabling.
# pipeline_id = pipeline.Id
# experiment_name = 'silly_scheduled_email'
# recurrence = ScheduleRecurrence(frequency="Minute", interval=5)
# recurring_schedule = Schedule.create(ws, name="MyRecurringSchedule", 
#                             description="Based on time",
#                             pipeline_id=pipeline_id, 
#                             experiment_name=experiment_name, 
#                             recurrence=recurrence)