In [None]:
from azureml.core import Workspace, Model, Dataset, Datastore, Experiment, Environment, ScriptRunConfig, RunConfiguration
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.environment import CondaDependencies
import joblib
import os
import pandas as pd
from datetime import datetime
import logging
import requests
from azure.identity import ChainedTokenCredential,ManagedIdentityCredential
from azure.storage.filedatalake import DataLakeServiceClient
import azureml.core
from time import sleep
import azure.functions as func
import json 
# Show which Azure ML SDK version the kernel has imported.
print('SDK version:', azureml.core.VERSION)

In [None]:
### connect to the ML workspace described by the local config file
ws = Workspace.from_config()

### look up the compute target, provisioning it when it is missing
cpu_cluster_name = "cpu-cluster"

try:
    # Constructing a ComputeTarget raises ComputeTargetException when no
    # cluster with this name exists in the workspace.
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS11_V2',
        min_nodes=0,
        max_nodes=1,
    )
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Runs for both the existing and the freshly created cluster.
compute_target.wait_for_completion(show_output=True)

In [None]:
# Helper functions to read and write files in the storage account. They authenticate with a managed identity, which must be granted the required role permissions on the storage account.

def initialize_storage_account(storage_account_name):
    """Create the Data Lake service client used by the storage helper functions.

    The client authenticates through a managed identity and is stored in the
    module-level global ``service_client``, which ``writeFile`` and ``getFile``
    read.

    Args:
        storage_account_name: Name of the storage account; it is inserted into
            the ``https://<name>.dfs.core.windows.net`` account URL.

    Returns:
        The newly created ``DataLakeServiceClient`` (also bound to the global
        ``service_client``).

    Raises:
        Exception: Any error raised while building the credential or the
            client is printed and then re-raised. Swallowing it would leave
            ``service_client`` undefined and make the next helper call fail
            with an unrelated ``NameError``.
    """
    global service_client
    account_url = "{}://{}.dfs.core.windows.net".format("https", storage_account_name)
    try:
        MSI_credential = ManagedIdentityCredential()
        credential_chain = ChainedTokenCredential(MSI_credential)
        service_client = DataLakeServiceClient(account_url=account_url, credential=credential_chain)
    except Exception as e:
        print(e)
        raise
    return service_client

def writeFile(container, filepath, filename, data_in):
    """Create ``filename`` inside ``filepath`` of ``container`` and upload ``data_in`` to it.

    Uses the global ``service_client``. Returns an ``HttpResponse`` with status
    code 200 once the upload call has returned.
    """
    target_file = (
        service_client
        .get_file_system_client(file_system=container)
        .get_directory_client(filepath)
        .create_file(filename)
    )
    target_file.upload_data(data=data_in, overwrite=True)
    return func.HttpResponse(body="Successfully saved JSON", status_code=200)

def getFile(container, filepath, filename):
    """Download ``filename`` from ``filepath`` in ``container`` and return what ``readall()`` yields.

    Uses the global ``service_client``.
    """
    source_file = (
        service_client
        .get_file_system_client(file_system=container)
        .get_directory_client(filepath)
        .get_file_client(filename)
    )
    return source_file.download_file().readall()

In [None]:
# Initialize the storage client, then download the CSV that is to be scored (returned as raw bytes).
initialize_storage_account(storage_account_name="name of your storage account")
payload = getFile(
    container="container name",
    filepath="folder name",
    filename="your aml file.csv",
)

In [None]:
# Decode the bytes read from storage as UTF-8 and parse them as CSV into a pandas DataFrame.
from io import StringIO
df = pd.read_csv(StringIO(payload.decode('utf-8')))

# Optional: remove the column that is to be predicted
# df.drop(df.columns[[0]], axis=1, inplace=True)

# Optional: modify datatypes so they match the training data set
# data1_bool = df.copy()                                     # Create copy of pandas DataFrame
# data1_bool['bruises'] = data1_bool['bruises'].map({'t': True, 'f': False})      # Replace string by boolean

df.head()

In [None]:
### make sure the folder that will hold the scoring script exists (no error if it is already there)
os.makedirs(name='./scripts', exist_ok=True)

In [None]:
%%writefile scripts/batch_score.py

from azureml.core import Workspace, Model, Dataset, Datastore, Run
from azureml.core.authentication import ServicePrincipalAuthentication #
import joblib
import pandas as pd
from datetime import datetime

subscription_id = 'your subscription id'
resource_group = 'your resource group name'
workspace_name = 'your workspace name'
ws = Workspace(subscription_id, resource_group, workspace_name)

### Load model for scoring, overwrite during download if it already exists
model = Model(workspace = ws, name='your model name')
model.download(exist_ok=True)
loaded_model = joblib.load("model.pkl")

### Score new data
results = loaded_model.predict(data1_bool)
df['prediction'] = results

# write csv to blob store. this can be improved to write the whole file back and not only predictions + improve formatting
writeFile("output container name", "output folder name", "predictions.csv", results)

In [None]:
### defining run environment
myenv = Environment(name="myenv")
conda_dep = CondaDependencies()
conda_dep.add_pip_package("joblib")
conda_dep.add_pip_package("pandas")
conda_dep.add_pip_package("sklearn")
conda_dep.add_pip_package("azureml-sdk")
myenv.python.conda_dependencies=conda_dep

In [None]:
### build the run configuration from the conda dependencies defined for the scoring script
rc = RunConfiguration(
    conda_dependencies=conda_dep,
    script='./scripts/batch_score.py',
)

In [None]:
### define the single pipeline step that runs the scoring script on the compute target
score_step = PythonScriptStep(
    name='Score step',
    script_name='batch_score.py',
    source_directory='./scripts',
    compute_target=compute_target,
    runconfig=rc,
    allow_reuse=False,
)

In [None]:
### assemble the pipeline from its steps, then validate it
steps = [score_step]
pipeline = Pipeline(steps=steps, workspace=ws)
pipeline.validate()

In [None]:
# Submit the pipeline as a run of the 'Batch-Scoring-Remote' experiment.
experiment = Experiment(workspace=ws, name='Batch-Scoring-Remote')
run = experiment.submit(pipeline)

In [None]:
### publish the pipeline under the given name
# The call is the cell's last expression, so its return value is shown as the cell output.
pipeline.publish(name='your pipeline name')