**NOTE:** This template notebook assumes you have succcessfully ran through Challenge 2.  
You should already have a train.py, driver_training.py, and a parameters.json in an experiment folder. These resources will be used as the first step in the Machine Learning pipeline created and run later in this notebook.

In [1]:
# Set the folder for the experiment files used in Challenge 2
training_folder = 'driver-training'

In [2]:
%%writefile $training_folder/train_drivers.py
# Import libraries
from azureml.core import Run
import argparse
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import lightgbm
from sklearn import metrics

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument('--output_folder', type=str, dest='output_folder', default="diabetes_model", help='output folder')
args = parser.parse_args()
output_folder = args.output_folder

from azureml.core import Workspace
from azureml.core import Dataset
ws = Workspace.get(name='mlopsdev',
           subscription_id='c46a9435-c957-4e6c-a0f4-b9a597984773',
           resource_group='mlops'
)

#ws = Workspace.from_config()

# Get the experiment run context
run = Run.get_context()

# load the safe driver prediction dataset
#train_df = pd.read_csv('porto_seguro_safe_driver_prediction_input.csv')
#train_df = run.input_datasets['driversdataset'].to_pandas_dataframe()
dataset = Dataset.get_by_name(ws, name='driversdataset')
data_df = dataset.to_pandas_dataframe()

# Load the parameters for training the model from the file
#with open("parameters.json") as f:
#    pars = json.load(f)
#    parameters = pars["training"]
    
parameters = {
    'learning_rate': 0.02,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'sub_feature': 0.7,
    'num_leaves': 60,
    'min_data': 100,
    'min_hessian': 1,
    'verbose': 2
}

# Log each of the parameters to the run
for param_name, param_value in parameters.items():
    run.log(param_name, param_value)

features = data_df.drop(['target', 'id'], axis=1)
labels = np.array(data_df['target'])
(features_train, features_valid, labels_train, labels_valid) = train_test_split(features, labels, test_size=0.2, random_state=0)

train_data = lightgbm.Dataset(features_train, label=labels_train)
valid_data = lightgbm.Dataset(features_valid, label=labels_valid, free_raw_data=False)
    
model = lightgbm.train(parameters, train_data, valid_sets=valid_data, num_boost_round=500, early_stopping_rounds=20)
    
#model = train_model(train_data, valid_data, parameters)
#predictions = get_model_metrics(model, valid_data)

predictions = model.predict(valid_data.data)
fpr, tpr, thresholds = metrics.roc_curve(valid_data.label, predictions)
model_metrics = {"auc": (metrics.auc(fpr, tpr))}
print(model_metrics)

run.log('Accuracy', model_metrics)
run.log('ModelType', 'LightGbm')

# Save the trained model to the output folder
os.makedirs(output_folder, exist_ok=True)
output_path = output_folder + "/driver_model.pkl"
joblib.dump(value=model, filename=output_path)

print(output_path)
print(model)

# Save the trained model
#os.makedirs(output_folder, exist_ok=True)
#output_path = output_folder + "/model.pkl"
#joblib.dump(value=model, filename=output_path)

run.complete()

Writing driver-training/train_drivers.py


## register_model.py
This script loads the model from where it was saved, and then registers it in the workspace. This will be the second step in the pipeline. The script is written to the experiment folder from this notebook for convenience.

In [3]:
%%writefile $training_folder/register_model.py
# Import libraries
import argparse
import joblib
from azureml.core import Workspace, Model, Run

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument('--model_folder', type=str, dest='model_folder', default="driver_model", help='model location')
args = parser.parse_args()
model_folder = args.model_folder

# Get the experiment run context
run = Run.get_context()

# load the model
print("Loading model from " + model_folder)
model_name = 'driver_model'
model_file = model_folder + "/" + model_name + ".pkl"

metrics=run.get_metrics(recursive=True)

# Load the model
print("Loading model from " + model_folder)
model_file = model_folder + "/driver_model.pkl"
model = joblib.load(model_file)

#run.upload_file('driver_model.pkl',model_file)
run.upload_file(model_name, model_file)

#run.register_model(model_path = model_file,
#                   model_name = 'driver_model.pkl', 
#                   tags=metrics)
run.register_model(model_path = model_name,
                   model_name = model_name, 
                   tags=metrics)

run.complete()

Writing driver-training/register_model.py


## Create an Azure Machine Learning Pipeline to Run the Scripts as a Pipeline

See [this tutorial](https://github.com/MicrosoftDocs/mslearn-aml-labs/blob/master/05-Creating_a_Pipeline.ipynb) for a starting point

Use the scikit-learn and lightgbm conda packages

In [4]:
import azureml.core
from azureml.core import Workspace

# Load the workspace
ws = Workspace.from_config()

In [5]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.compute import ComputeTarget, AmlCompute

compute_name = "cpu-cluster1"
vm_size = "STANDARD_D14_V2"
if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('Found compute target: ' + compute_name)
else:
    print('Creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,  # STANDARD_NC6 is GPU-enabled
                                                                min_nodes=0,
                                                                max_nodes=4)
    # create the compute target
    compute_target = ComputeTarget.create(
        ws, compute_name, provisioning_config)

    # Can poll for a minimum number of nodes and for a specific timeout.
    # If no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current cluster status, use the 'status' property
    print(compute_target.status.serialize())

Found compute target: cpu-cluster1


In [6]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

# Create a Python environment for the experiment
# Let Azure ML manage dependencies by setting user_managed_dependencies to False
# Use docker containers by setting docker.enabled to True 
## TODO

# Create a Python environment for the experiment
drivers_env = Environment("drivers-pipeline-env")
drivers_env.python.user_managed_dependencies = False # Let Azure ML manage dependencies
drivers_env.docker.enabled = True # Use a docker container

# Create a set of package dependencies
drivers_packages = CondaDependencies.create(conda_packages=['scikit-learn','pandas', 'lightgbm'],
                                             pip_packages=['azureml-defaults','azureml-dataprep[pandas]'])

# Add the dependencies to the environment
drivers_env.python.conda_dependencies = drivers_packages

# Register the environment (just in case you want to use it again)
drivers_env.register(workspace=ws)
registered_env = Environment.get(ws, 'drivers-pipeline-env')

# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Use the compute you created above. 
pipeline_run_config.target = compute_target

# Assign the environment to the run configuration
pipeline_run_config.environment = registered_env

print ("Run configuration created.")

Run configuration created.


In [7]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.train.estimator import Estimator

from azureml.train.estimator import Estimator
from azureml.core import Experiment

from azureml.train.estimator import Estimator
from azureml.core import Experiment

# Create a PipelineData (temporary Data Reference) for the model folder
# Create a folder for the experiment files
training_folder = 'driver-training'
os.makedirs(training_folder, exist_ok=True)

#model_folder = "models"

# Get the training dataset
drivers_ds = ws.datasets.get("driversdataset")

# Create Step 1, which runs the estimator to train the model
# Create an estimator
#model_folder = PipelineData("model_folder", datastore=ws.get_default_datastore())
model_folder = PipelineData("model_folder", datastore=ws.get_default_datastore())

#model_folder = "models"

estimator = Estimator(source_directory=training_folder,
                        compute_target = compute_target,
                        environment_definition=pipeline_run_config.environment,
                        entry_script='train_drivers.py')


# Create Step 2, which runs the model registration script
## TODO
# Step 1, run the estimator to train the model
train_step = EstimatorStep(name = "Train Model",
                           estimator=estimator, 
                           estimator_entry_script_arguments=['--output_folder', model_folder],
                           inputs=[drivers_ds.as_named_input('driversdataset')],
                           outputs=[model_folder],
                           compute_target = compute_target,
                           allow_reuse = True)

# Step 2, run the model registration script
register_step = PythonScriptStep(name = "Register Model",
                                source_directory = training_folder,
                                script_name = "register_model.py",
                                arguments = ['--model_folder', model_folder],
                                inputs=[model_folder],
                                compute_target = compute_target,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)


print("Pipeline steps defined")

'Estimator' is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or an Azure ML curated environment.


Pipeline steps defined


In [8]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Construct the pipeline, which contains Step 1 & 2
pipeline_steps = [train_step, register_step]
pipeline = Pipeline(workspace = ws, steps=pipeline_steps)
print("Pipeline is built.")

# Create an experiment and run the pipeline
experiment = Experiment(workspace = ws, name = 'driver-training-pipeline')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")

# Run the experiment based on the estimator
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion()

Pipeline is built.
Created step Train Model [74ff141f][23bd3cb7-de8c-44a6-bf59-e94917378231], (This step will run and generate new outputs)
Created step Register Model [a76527d9][7963a442-8156-44f6-bacc-4b6e0b644928], (This step will run and generate new outputs)
Submitted PipelineRun ec8f41a2-5f6c-4b76-bf59-23985951e381
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/driver-training-pipeline/runs/ec8f41a2-5f6c-4b76-bf59-23985951e381?wsid=/subscriptions/c46a9435-c957-4e6c-a0f4-b9a597984773/resourcegroups/mlops/workspaces/mlopsdev
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: ec8f41a2-5f6c-4b76-bf59-23985951e381
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/driver-training-pipeline/runs/ec8f41a2-5f6c-4b76-bf59-23985951e381?wsid=/subscriptions/c46a9435-c957-4e6c-a0f4-b9a597984773/resourcegroups/mlops/workspaces/mlopsdev
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 8608c8e0-c7ca-4451-b358-6b3cd38851f4
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/driver-training-pipeline/runs/8608c8e0-c7ca-4451-b358-6b3cd38851f4?wsid=/subscriptions/c46a9435-c957-4e6c-a0f4-b9a597984773/resourcegroups/mlops/workspaces/mlopsdev
StepRun( Train Model ) Status: NotStarted
StepRun( Train Model ) Status: Running

Streaming azureml-logs/20_image_build_log.txt
2021/01/24 17:07:15 Downloading source code...
2021/01/24 17:07:16 Finished downloading source code
2021/01/24 17:07:17 Creating Docker network: acb_default_network, driver: 'bridge'
2021/01/24 17:07:17 Successfully set up Docker network

'Finished'

In [9]:
# Print the model name, version, tag, and properties
from azureml.core import Model

for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')

driver_model version: 3


driver_model.pkl version: 9


driver_model.pkl version: 8
	 Accuracy : {'auc': 0.6377511613946426, 'f1score': 1.0}
	 learningrate : 0.02
	 f1score : 1.0
	 auc : 0.6377511613946426


NYCGreenTaxiModel version: 1


driver_model version: 2


driver_model.pkl version: 7
	 Accuracy : {'auc': 0.6377511613946426, 'f1score': 1.0}
	 learningrate : 0.02
	 f1score : 1.0
	 auc : 0.6377511613946426


gpt-2 version: 1
	 title : GPT-2 model card
	 datasheet_description : 
Last updated: November 2019

Inspired by [Model Cards for Model Reporting (Mitchell et al.)](https://arxiv.org/abs/1810.03993), we’re providing some accompanying information about the GPT-2 family of models we're releasing.


	 details : This model was developed by researchers at OpenAI to help us understand how the capabilities of language model capabilities scale as a function of the size of the models (by parameter count) combined with very large internet-scale datasets (WebText).
	 date : February 2019,

In [26]:
published_pipeline = pipeline.publish(name="drivers_Training_Pipeline",
                                      description="Trains drivers model",
                                      version="1.0")
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)

https://centralus.api.azureml.ms/pipelines/v1.0/subscriptions/c46a9435-c957-4e6c-a0f4-b9a597984773/resourceGroups/mlops/providers/Microsoft.MachineLearningServices/workspaces/mlopsdev/PipelineRuns/PipelineSubmit/37ccd06c-3f50-400b-a434-699869c6f352


In [29]:
from azureml.core.authentication import InteractiveLoginAuthentication

interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()

In [None]:
auth_header

In [30]:
import requests
experiment_name = 'Run-drivers-pipeline'

response = requests.post(rest_endpoint, 
                         headers=auth_header, 
                         json={"ExperimentName": experiment_name})
run_id = response.json()["Id"]
run_id

'f73c7d37-8b24-4f8f-aa1d-b0e136b6abe9'

In [None]:
response.json()

In [32]:
from azureml.pipeline.core.run import PipelineRun
from azureml.widgets import RunDetails

published_pipeline_run = PipelineRun(ws.experiments[experiment_name], run_id)
RunDetails(published_pipeline_run).show()
pipeline_run.wait_for_completion()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: ec8f41a2-5f6c-4b76-bf59-23985951e381
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/driver-training-pipeline/runs/ec8f41a2-5f6c-4b76-bf59-23985951e381?wsid=/subscriptions/c46a9435-c957-4e6c-a0f4-b9a597984773/resourcegroups/mlops/workspaces/mlopsdev

PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': 'ec8f41a2-5f6c-4b76-bf59-23985951e381', 'status': 'Completed', 'startTimeUtc': '2021-01-24T17:06:54.635608Z', 'endTimeUtc': '2021-01-24T17:25:40.471615Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://mlopsdev3695286978.blob.core.windows.net/azureml/ExperimentRun/dcid.ec8f41a2-5f6c-4b76-bf59-23985951e381/logs/azureml/executionlogs.txt?sv=2019-02-02&sr=b&sig=UyMuP8M6mWEmakCQr8BeqMIgQ43yRPgYl1kBeRW%2F6so%3D&st=2021-01-24T16%3A57%3A16Z&se=2021-01-25T01%3A07%3A1

'Finished'

In [17]:
import joblib

#model_path = Model.get_model_path(model_name="driver_model.pkl")
model_path= "./driver-training/driver_model.pkl"
LGBM_MODEL = joblib.load(model_path)

In [18]:
TEST_ROW = '{"data":[[0,1,8,1,0,0,1,0,0,0,0,0,0,0,12,1,0,0,0.5,0.3,0.610327781,7,1,-1,0,-1,1,1,1,2,1,65,1,0.316227766,0.669556409,0.352136337,3.464101615,0.1,0.8,0.6,1,1,6,3,6,2,9,1,1,1,12,0,1,1,0,0,1],[4,2,5,1,0,0,0,0,1,0,0,0,0,0,5,1,0,0,0.9,0.5,0.771362431,4,1,-1,0,0,11,1,1,0,1,103,1,0.316227766,0.60632002,0.358329457,2.828427125,0.4,0.5,0.4,3,3,8,4,10,2,7,2,0,3,10,0,0,1,1,0,1]]}' 

In [19]:
import json
import numpy

data = json.loads(TEST_ROW)["data"]
data = numpy.array(data)
result = LGBM_MODEL.predict(data)

In [20]:
result

array([0.02731105, 0.02612313])