## Contents
1. [Workspace](#Workspace)
1. [Import](#Import)
1. [Introduction](#Introduction)
1. [Setup](#Setup)
1. [Compute](#Compute)
1. [Data](#Data)
1. [Train](#Train)
1. [Featurization](#Featurization)
1. [Evaluate](#Evaluate)

## Import open source Python packages

In [None]:
# import logging
# import os
# import random
# import re
# import lightgbm
# import pandas as pd
# import numpy as np
# import json
# import csv
# from matplotlib import pyplot as plt
# from matplotlib.pyplot import imshow
# from sklearn import datasets
# from shutil import copy2
# import seaborn as sns
# sns.set(color_codes='True')

## Import Azure Machine Learning Python SDK

In [None]:
# import azureml.core
# from azureml.core import Workspace
# from azureml.core.experiment import Experiment
# from azureml.core.workspace import Workspace
# from azureml.core.compute import AksCompute, ComputeTarget
# from azureml.core.compute import ComputeTarget, AmlCompute
# from azureml.core.compute_target import ComputeTargetException
# from azureml.core.webservice import Webservice, AksWebservice
# from azureml.core.image import Image
# from azureml.core.model import Model
# from azureml.train.automl import AutoMLConfig
# from azureml.train.automl.run import AutoMLRun
# from azureml.widgets import RunDetails

## Workspace

In [1]:
from azureml.core import Workspace

In [3]:
# download config.json from machine learning portal
ws = Workspace.from_config()

## Dataset

## Experiment & Run  
### Interactive inline method

In [None]:
from azureml.core import Experiment

In [None]:
# create an experiment variable
experiment = Experiment(workspace=ws, name="experiment_01")
# start the experiment
run = experiment.start_logging()
# experiment code goes here
# log          Record a single named value
# log_list     Record a named list of values
# log_row      Record a row with multiple columns
# log_table    Record a dictionary as a table
# log_image    Record an image file or a plot
run.log('Accuracy', 0.50)
run.log('Accuracy', 0.55)
run.log('Accuracy', 0.60)
run.log('Accuracy', 0.65)
run.log('Accuracy', 0.77)
# end the experiment
run.complete()
# only for this specific Run we can get the log data:
run.get_metrics()

## View progress

In [None]:
from azureml.widgets import RunDetails

In [None]:
# notebook widget to view the progress of model training
RunDetails(run).show()

## Experiment & Run  
### Script method

In [None]:
# creating a script "experiment.py"

In [None]:
%%writefile experiment.py
from azureml.core import Run
import pandas as pd
import os

# Get the experiment run context
run = Run.get_context()

# load the diabetes dataset
data = pd.read_csv('data.csv')

# Count the rows and log the result
row_count = (len(data))
run.log('observations', row_count)

# Save a sample of the data
os.makedirs('outputs', exist_ok=True)
data.head(2).to_csv("outputs/sample.csv", index=False, header=True)

# Complete the run
run.complete()

In [None]:
# creating testdata
import pandas as pd
df = pd.DataFrame({"firstName":["bart","koen","karel"],
                   "lastName":["Vermeers","Aerts","Venbelsteren"]})
df.to_csv("data.csv")

In [1]:
# RunConfiguration = python environment setup
# ScriptRunConfig  = script + environment setup
from azureml.core import Experiment, RunConfiguration, ScriptRunConfig

In [None]:
# create a new RunConfig object
# Represents configuration for experiment runs targeting different compute targets in Azure Machine Learning
experiment_run_config = RunConfiguration()

In [None]:
# Create a ScriptRunConfig object
# Represents configuration information for submitting a training run in Azure Machine Learning
script_config = ScriptRunConfig(source_directory='.',
                                script='experiment.py',
                                run_config=experiment_run_config) 

In [None]:
# submit the experiment
experiment = Experiment(workspace=ws, name='experiment_02')
run = experiment.submit(config=script_config)
run.wait_for_completion(show_output=True)

# Experiment & Run
## Estimator (generic)

In [2]:
from azureml.train.estimator import Estimator
from azureml.core import Experiment

In [3]:
# Create an estimator
estimator = Estimator(source_directory='.',
                      entry_script='experiment.py',
                      compute_target='local',
                      conda_packages=['scikit-learn']
                      )

In [8]:
# Create and run an experiment
experiment = Experiment(workspace=ws, name='experiment_03')
run = experiment.submit(config=estimator)

In [11]:
#run.wait_for_completion(show_output=True)

In [12]:
# encapsulates a 'Run Configuration' and a 'Script Run Configuration' in a single object !

# Experiment & Run
## passing arguments

In [27]:
%%writefile titanic.csv
,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,1,0,3,22.0,1,0,7.2500,1.0,0.0,1.0
1,2,1,1,38.0,1,0,71.2833,0.0,0.0,0.0
2,3,1,3,26.0,0,0,7.9250,0.0,0.0,1.0
3,4,1,1,35.0,1,0,53.1000,0.0,0.0,1.0
4,5,0,3,35.0,0,0,8.0500,1.0,0.0,1.0

Writing titanic.csv


In [30]:
import pandas as pd
titan = pd.read_csv("titanic.csv", )
titan

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,0,1,0,3,22.0,1,0,7.25,1.0,0.0,1.0
1,1,2,1,1,38.0,1,0,71.2833,0.0,0.0,0.0
2,2,3,1,3,26.0,0,0,7.925,0.0,0.0,1.0
3,3,4,1,1,35.0,1,0,53.1,0.0,0.0,1.0
4,4,5,0,3,35.0,0,0,8.05,1.0,0.0,1.0


In [37]:
titan.columns

Index(['Unnamed: 0', 'PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp',
       'Parch', 'Fare', 'male', 'Q', 'S'],
      dtype='object')

In [38]:
%%writefile experiment_argparse.py
from azureml.core import Run
import argparse
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Get the experiment run context
run = Run.get_context()

# Set regularization hyperparameter
parser = argparse.ArgumentParser()
parser.add_argument('--reg_rate', type=float, dest='reg', default=0.01)
args = parser.parse_args()
reg = args.reg

# Prepare the dataset
data = pd.read_csv('data.csv')
titanic = pd.read_csv('titanic.csv')
X, y = titanic[['PassengerId','Pclass','Age','SibSp','Parch','Fare','male','Q','S']].values, titanic['Survived'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Train a logistic regression model
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# Count the rows and log the result and save the argument value
row_count = (len(data))
run.log('observations', row_count)
run.log("the given 'reg_rate' parameter:", reg) # <------------

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
run.log('Accuracy', np.float(acc))

# Save a sample of the data
os.makedirs('outputs', exist_ok=True)
data.head(2).to_csv("outputs/sample.csv", index=False, header=True)

# Save the trained model
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/titanic_model.pkl')

# Complete the run
run.complete()

Overwriting experiment_argparse.py


### use script_params = {'--reg_rate': 0.1}

In [39]:
from azureml.train.estimator import Estimator
from azureml.core import Experiment

In [40]:
# Create an estimator
estimator = Estimator(source_directory='.',
                      entry_script='experiment_argparse.py',
                      script_params = {'--reg_rate': 0.1}, # <-------------
                      compute_target='local',
                      conda_packages=['scikit-learn', 'joblib'])

In [41]:
# Create and run an experiment
experiment = Experiment(workspace=ws, name='experiment_04')
run = experiment.submit(config=estimator)

In [42]:
run.wait_for_completion(show_output=True)

RunId: experiment_04_1585219254_ef64c469
Web View: https://ml.azure.com/experiments/experiment_04/runs/experiment_04_1585219254_ef64c469?wsid=/subscriptions/43c1f93a-903d-4b23-a4bf-92bd7a150627/resourcegroups/myResourceGroup/workspaces/machine_learning_workspace

Streaming azureml-logs/60_control_log.txt

Streaming log file azureml-logs/60_control_log.txt
Starting the daemon thread to refresh tokens in background for process with pid = 22426
Running: ['/bin/bash', '/tmp/azureml_runs/experiment_04_1585219254_ef64c469/azureml-environment-setup/docker_env_checker.sh']

Found materialized image on target: azureml/azureml_586a3ed27470f038ee8054b84967c621


Logging experiment running status in history service.
Running: ['sudo', 'docker', 'run', '--name', 'experiment_04_1585219254_ef64c469', '--rm', '-v', '/tmp/azureml_runs/experiment_04_1585219254_ef64c469:/azureml-run', '--shm-size', '2g', '-e', 'EXAMPLE_ENV_VAR=EXAMPLE_VALUE', '-e', 'AZUREML_CONTEXT_MANAGER_TRACKUSERERROR=eyJTa2lwSGlzdG9ye

{'runId': 'experiment_04_1585219254_ef64c469',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-03-26T10:40:57.797123Z',
 'endTimeUtc': '2020-03-26T10:41:06.256736Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '2769b7e8-81cb-4cd4-abf2-3c7eddf947b3',
  'azureml.git.repository_uri': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'mlflow.source.git.repoURL': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': '52678fcbd4c5cd3218a03f1cfd3043f4bdf6d765',
  'mlflow.source.git.commit': '52678fcbd4c5cd3218a03f1cfd3043f4bdf6d765',
  'azureml.git.dirty': 'True'},
 'inputDatasets': [],
 'runDefinition': {'script': 'experiment_argparse.py',
  'useAbsolutePath': False,
  'arguments': ['--reg_rate', '0.1'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferenc

## Retrieving files

In [43]:
# "run" is a reference to a completed experiment run
# List the files generated by the experiment
for file in run.get_file_names():
    print(file)

azureml-logs/60_control_log.txt
azureml-logs/70_driver_log.txt
logs/azureml/8_azureml.log
outputs/sample.csv
outputs/titanic_model.pkl


In [44]:
# Download a named file
#run.download_file(name='outputs/model.pkl', output_file_path='model.pkl')
run.download_file(name='outputs/sample.csv', output_file_path='sample.csv')

In [45]:
!ls -l sample.csv

-rw-rw-r-- 1 ubuntu ubuntu 59 Mar 26 11:44 sample.csv


# Register a model

### option A
Fails, because it first need to download the model file

In [59]:
# register a model from a local file, you can use the register method of the Model object
# from azureml.core import Model

# model = Model.register(workspace=ws,
#                        model_name='titanic_classification_model',
#                        model_path='outputs/titanic_model.pkl', # local path
#                        description='A classification model Titanic',
#                        tags={'testmodel': 'titanic'},
#                        model_framework=Model.Framework.SCIKITLEARN,
#                        model_framework_version='0.20.3')

### option B
this is better because it grabs the model file from the run !

In [47]:
# Register a model using reference to the Run use its register_model method
run.register_model(model_name='titanic_classification_model',
                   model_path='outputs/titanic_model.pkl', # run outputs path
                   description='A classification model Titanic',
                   tags={'testmodel': 'titanic'},
                   model_framework=Model.Framework.SCIKITLEARN,
                   model_framework_version='0.20.3')

Model(workspace=Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup'), name=titanic_classification_model, id=titanic_classification_model:1, version=1, tags={'testmodel': 'titanic'}, properties={})

In [48]:
# view registered models with
Model.list(ws)

[Model(workspace=Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup'), name=titanic_classification_model, id=titanic_classification_model:1, version=1, tags={'testmodel': 'titanic'}, properties={})]

# Working with Data

In [51]:
from azureml.core import Datastore

In [53]:
# list all datastores (already registered a few manually)
ws.datastores

{'data_lake_gen2': <azureml.data.azure_data_lake_datastore.AzureDataLakeGen2Datastore at 0x7efdd6ef9cc0>,
 'workspacefilestore': <azureml.data.azure_storage_datastore.AzureFileDatastore at 0x7efdd6ef9be0>,
 'workspaceblobstore': <azureml.data.azure_storage_datastore.AzureBlobDatastore at 0x7efdd6e8dc88>}

In [54]:
# get a reference to ex: data_lake_gen2
blob_store = Datastore.get(ws, datastore_name='workspaceblobstore')
data_lake_gen2 = Datastore.get(ws, datastore_name='data_lake_gen2')

In [57]:
print(type(blob_store))
print(type(data_lake_gen2))

<class 'azureml.data.azure_storage_datastore.AzureBlobDatastore'>
<class 'azureml.data.azure_data_lake_datastore.AzureDataLakeGen2Datastore'>


In [58]:
# copy titanic.csv to datalake in /datalake/gold/

### option1: Download

### option2: Upload

### option3: Mount (preferred) - not possible on local compute

In [None]:
# you must pass "script_params" parameter to an experiment script
# ex:   script_params = {'--data_folder': data_ref}

In [None]:
%%writefile experiment_argparse.py
from azureml.core import Run
import argparse
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Get the experiment run context
run = Run.get_context()

# Set regularization hyperparameter
parser = argparse.ArgumentParser()
parser.add_argument('--reg_rate', type=float, dest='reg', default=0.01)
args = parser.parse_args()
reg = args.reg

# set datastore local reference path
parser = argparse.ArgumentParser()
parser.add_argument('--data_folder', type=str, dest='data_folder')
args = parser.parse_args()
data_files = os.listdir(args.data_folder)

# Prepare the dataset
data = pd.read_csv('data.csv')
titanic = pd.read_csv('titanic.csv')
X, y = titanic[['PassengerId','Pclass','Age','SibSp','Parch','Fare','male','Q','S']].values, titanic['Survived'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Train a logistic regression model
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# Count the rows and log the result and save the argument value
row_count = (len(data))
run.log('observations', row_count)
run.log("the given 'reg_rate' parameter:", reg) # <------------

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
run.log('Accuracy', np.float(acc))

# Save a sample of the data
os.makedirs('outputs', exist_ok=True)
data.head(2).to_csv("outputs/sample.csv", index=False, header=True)

# Save the trained model
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/titanic_model.pkl')

# Complete the run
run.complete()

In [69]:
data_ref = blob_store.path("data/files").as_download(path_on_compute='training_data')

In [70]:
import os
os.listdir(data_ref)

TypeError: listdir: path should be string, bytes, os.PathLike, integer or None, not DataReference

In [None]:
data_ref = blob_ds.path('gold/').as_download(path_on_compute='training_data')
estimator = Estimator(source_directory='.',
                      entry_script='experiment_argparse.py',
                      script_params = {'--reg_rate': 0.1}, # <-------------
                      compute_target='local',
                      conda_packages=['scikit-learn', 'joblib'],
                      pip_packages=['azureml-sdk'],
                      script_params = {'--data_folder': data_ref})

## Datasets
### Retrieving a registered dataset
https://nbviewer.jupyter.org/github/MicrosoftDocs/mslearn-aml-labs/blob/master/03-Working_with_Data.ipynb

In [77]:
# we manually created and registered a dataset from the datalake

In [78]:
# show a list of available datasets
ws.datasets

{'datalake': DatasetRegistration(id='a2af81a9-8e27-429d-8845-489bd371e9ca', name='datalake', version=1, description='', tags={})}

In [79]:
# Get a dataset from the workspace datasets collection
#ds1 = ws.datasets['datalake']

In [80]:
from azureml.core import Dataset

In [81]:
# Get a dataset by name from the datasets class
ds2 = Dataset.get_by_name(ws, 'datalake')

In [83]:
# list all files in the datalake (incl. directories)
ds2.to_path()

['/bronze/db_v2_csv/_committed_1187318739692831567',
 '/bronze/db_v2_csv/_started_1187318739692831567',
 '/bronze/db_v2_csv/part-00000-tid-1187318739692831567-fae6394b-2577-421a-ab58-50e75b7b6889-9-1-c000.csv',
 '/bronze/db_v2_csv/part-00001-tid-1187318739692831567-fae6394b-2577-421a-ab58-50e75b7b6889-10-1-c000.csv',
 '/bronze/db_v2_csv/part-00002-tid-1187318739692831567-fae6394b-2577-421a-ab58-50e75b7b6889-11-1-c000.csv',
 '/bronze/docph/DB_V2.parquet',
 '/bronze/pharma_ref.xlsx',
 '/bronze/pharma_ref_csv/_committed_3971660126738673139',
 '/bronze/pharma_ref_csv/_started_3971660126738673139',
 '/bronze/pharma_ref_csv/part-00000-tid-3971660126738673139-e84ee614-8706-4e8b-afc5-89e3bd88a7a4-4-1-c000.csv',
 '/bronze/pharma_ref_csv/part-00001-tid-3971660126738673139-e84ee614-8706-4e8b-afc5-89e3bd88a7a4-5-1-c000.csv',
 '/bronze/pharma_ref_csv/part-00002-tid-3971660126738673139-e84ee614-8706-4e8b-afc5-89e3bd88a7a4-6-1-c000.csv',
 '/bronze/pharma_ref_csv/part-00003-tid-3971660126738673139-e84

### passing a file dataset, you must specify the access mode

In [None]:
# the script will need to work with a Dataset object, you must include pip packages:

#estimator = Estimator(pip_packages=['azureml-sdk'])

In [None]:
estimator = Estimator(source_directory='.',
                      entry_script='experiment_argparse.py'
                      compute_target='local',
                      inputs=[img_ds.as_named_input('img_data').as_download(path_on_compute='data')],
                      pip_packages=['azureml-dataprep[pandas]')

In [None]:
estimator = SKLearn( source_directory='experiment_folder',
                     entry_script='training_script.py',
                     compute_target='local',
                     inputs=[tab_ds.as_named_input('csv_data')],
                     pip_packages=['azureml-dataprep[pandas]')

In [None]:
data_ref = blob_ds.path('gold/').as_download(path_on_compute='training_data')
estimator = Estimator(source_directory='.',
                      entry_script='experiment_argparse.py',
                      script_params = {'--reg_rate': 0.1}, # <-------------
                      compute_target='local',
                      conda_packages=['scikit-learn', 'joblib'],
                      pip_packages=['azureml-sdk'],
                      script_params = {'--data_folder': data_ref})

# Train a Model from a File Dataset (mount mode)

In [56]:
# put 2 files in the ADSL Gen2 data lake
# they are in container "datalake" and this is registered in ml workspace as "datalake" Datastore
# in turn the container "datalake" is registered as a Dataset in azure ML Workspace
# 2 specific files of intrest: 
# /gold/diabetes.csv
# /gold/diabetes2.csv
# the goal is to mount these into a run script send to compute nodes to train model on

In [57]:
# the dataset input passed to the script represents a mount point containing file paths

In [59]:
# create A folder named diabetes_training_from_file_dataset here locally
import os

# Create a folder for the experiment files
experiment_folder = 'diabetes_training_from_file_dataset'
os.makedirs(experiment_folder, exist_ok=True)
print(experiment_folder, 'folder created')

diabetes_training_from_file_dataset folder created


In [60]:
# create a script that trains a classification model by using a file dataset that is passed to it as an input

In [61]:
print("diabetes" + "/*.csv")

diabetes/*.csv


In [118]:
%%writefile $experiment_folder/diabetes_training.py
# Import libraries
import argparse
from azureml.core import Workspace, Dataset, Experiment, Run
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import glob

# Set regularization hyperparameter (passed as an argument to the script)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
args = parser.parse_args()
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# load the diabetes dataset
print("Loading Data...")
data_path = run.input_datasets['diabetes']  # Get the training data from the estimator input
print("data_path: " + str(data_path))         # diabetes_path
all_files = glob.glob(data_path + "/*")
print("data_path + '/*.csv': " + str(data_path + '/*.csv')) # diabetes_path/*.csv
print([file for file in all_files])   # ['diabetes_path/diabetes.csv', 'diabetes_path/diabetes2.csv']
print("type(all_files): " + str(type(all_files)))
print(type(all_files[0]))
diabetes = pd.concat((pd.read_csv(f) for f in all_files))
print("number of records: " + str(len(diabetes)))

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

Overwriting diabetes_training_from_file_dataset/diabetes_training.py


Next we need to change the way we pass the dataset to the estimator - it needs to define a mount point from which the script can read the files. For large volumes of data, you'd generally use the **as_mount** method to stream the files directly from the dataset source; but when running on local compute (as we are in this example), you need to use the **as_download** option to download the dataset files to a local folder.

Also, since the **Dataset** class is defined in the **azureml-dataprep** package, we need to include that in the experiment environment.

In [64]:
from azureml.train.sklearn import SKLearn
from azureml.core import Experiment
from azureml.core import Dataset
from azureml.widgets import RunDetails

In [65]:
# Set the script parameters
script_params = {
    '--regularization': 0.1
}

In [66]:
ws.datasets

{'diabetes1': DatasetRegistration(id='ebc21ed2-3f94-494b-8072-2c71d2190200', name='diabetes1', version=1, description='', tags={}), 'datalakegold': DatasetRegistration(id='ad71d877-e111-4bd4-bf8a-8a602709dffd', name='datalakegold', version=1, description='', tags={}), 'datalake': DatasetRegistration(id='a2af81a9-8e27-429d-8845-489bd371e9ca', name='datalake', version=1, description='', tags={})}

In [67]:
# Get the training dataset
diabetes_ds = ws.datasets.get("datalakegold")

# Get a dataset from the workspace datasets collection
#ds1 = ws.datasets['datalakegold']
#-or-
# Get a dataset by name from the datasets class
#ds2 = Dataset.get_by_name(ws, 'datalakegold')

In [68]:
diabetes_ds

{
  "source": [
    "('data_lake_gen2', 'gold/**')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "ad71d877-e111-4bd4-bf8a-8a602709dffd",
    "name": "datalakegold",
    "version": 1,
    "workspace": "Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup')"
  }
}

In [69]:
[diabetes_ds]

[{
   "source": [
     "('data_lake_gen2', 'gold/**')"
   ],
   "definition": [
     "GetDatastoreFiles"
   ],
   "registration": {
     "id": "ad71d877-e111-4bd4-bf8a-8a602709dffd",
     "name": "datalakegold",
     "version": 1,
     "workspace": "Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup')"
   }
 }]

In [70]:
#diabetes_ds.as_named_input('diabetes').as_download(path_on_compute='diabetes_data')

In [72]:
# Create an estimator
estimator = SKLearn(source_directory=experiment_folder,
                    entry_script='diabetes_training.py',
                    script_params=script_params,
                    compute_target = 'local',
                    inputs=[diabetes_ds.as_named_input('diabetes').as_download(path_on_compute='diabetes_path')], # Pass the Dataset object as an input
                    pip_packages=['azureml-dataprep[pandas]'] # so we need the dataprep package
                   )

In [104]:
diabetes_ds.as_named_input('diabetes').as_download(path_on_compute='diabetes_path').__dict__
# notice the mode if set to Download or Mount then 
# Run.input_datasets will return the base path of the delivered data
# ex: Run.input_datasets['diabetes']   ---> "diabetes_path" string value
# this is probably the path_on_compute where the data should be put...
# files are available on  ['diabetes_path/diabetes.csv', 'diabetes_path/diabetes2.csv']

{'dataset': {
   "source": [
     "('data_lake_gen2', 'gold/**')"
   ],
   "definition": [
     "GetDatastoreFiles"
   ],
   "registration": {
     "id": "ad71d877-e111-4bd4-bf8a-8a602709dffd",
     "name": "datalakegold",
     "version": 1,
     "workspace": "Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup')"
   }
 },
 'name': 'diabetes',
 'mode': 'download',
 'path_on_compute': 'diabetes_path'}

In [119]:
# Create an experiment
experiment_name = 'diabetes-training'
experiment = Experiment(workspace = ws, name = experiment_name)
# Run the experiment
run = experiment.submit(config=estimator)

In [120]:
# Show the run details while running
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'diabetes-training_1585308364_a95f99c6',
 'target': 'local',
 'status': 'Finalizing',
 'startTimeUtc': '2020-03-27T11:26:06.544218Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '761f2bce-8d30-4c56-9dd5-0be0c7d55ccb',
  'azureml.git.repository_uri': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'mlflow.source.git.repoURL': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': '8189bb0610764a3d5583f351f729bdc6cb32fe0e',
  'mlflow.source.git.commit': '8189bb0610764a3d5583f351f729bdc6cb32fe0e',
  'azureml.git.dirty': 'True'},
 'inputDatasets': [{'dataset': {'id': 'ad71d877-e111-4bd4-bf8a-8a602709dffd'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'diabetes', 'mechanism': 'Download', 'pathOnCompute': 'diabetes_path'}}],
 'runDefinition': {'script': 'diabetes_training.py',
  'useAbsolutePath': False,
  'arguments': ['

When the experiment has completed, in the widget, view the **azureml-logs/70_driver_log.txt** output log to verify that the file dataset was processed and the data files downloaded.

# Datastore method (mount mode)

In [129]:
from azureml.core import Workspace

In [130]:
ws = Workspace.from_config()

In [131]:
from azureml.core import Datastore, Dataset

In [132]:
# available datastore names
ws.datastores

{'data_lake_gen2': <azureml.data.azure_data_lake_datastore.AzureDataLakeGen2Datastore at 0x7f2bd1730668>,
 'workspacefilestore': <azureml.data.azure_storage_datastore.AzureFileDatastore at 0x7f2bd17302b0>,
 'workspaceblobstore': <azureml.data.azure_storage_datastore.AzureBlobDatastore at 0x7f2bc226ac50>}

In [158]:
ds = Datastore.get(workspace=ws, datastore_name="data_lake_gen2")

In [134]:
ds.__dict__

{'_workspace': Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup'),
 '_name': 'data_lake_gen2',
 '_datastore_type': 'AzureDataLakeGen2',
 'tenant_id': '73b49191-8db3-45ab-87b3-b8f956ac123b',
 'client_id': '38c02221-4a41-4ec8-b8da-a81f16c38e82',
 'client_secret': 'l]ABG6@Z/9r/hX7EK0zavK5Nx[MA-J1V',
 'resource_url': 'https://storage.azure.com',
 'authority_url': 'https://login.microsoftonline.com',
 'container_name': 'datalake',
 'account_name': 'datalake21032020',
 'protocol': 'https',
 'endpoint': 'core.windows.net'}

In [135]:
from azureml.core import Dataset

In [136]:
# available dataset names
ws.datasets

{'diabetes1': DatasetRegistration(id='ebc21ed2-3f94-494b-8072-2c71d2190200', name='diabetes1', version=1, description='', tags={}), 'datalakegold': DatasetRegistration(id='ad71d877-e111-4bd4-bf8a-8a602709dffd', name='datalakegold', version=1, description='', tags={}), 'datalake': DatasetRegistration(id='a2af81a9-8e27-429d-8845-489bd371e9ca', name='datalake', version=1, description='', tags={})}

In [137]:
from azureml.data.datapath import DataPath

In [138]:
# Creating and registering file datasets
#blob_ds = Dataset.get_by_name(workspace=ws, name="datalakegold")
datastore = Datastore.get(workspace=ws, datastore_name="data_lake_gen2")

In [139]:
datastore_path = [
    DataPath(datastore, 'platinum/diabetes.csv'),
    DataPath(datastore, 'platinum/folder/*.csv')
]

In [140]:
datastore_path

[<azureml.data.datapath.DataPath at 0x7f2bc225aeb8>,
 <azureml.data.datapath.DataPath at 0x7f2bc225afd0>]

In [141]:
# select the files that you need
file_dataset = Dataset.File.from_files(path=datastore_path)

In [142]:
file_dataset

{
  "source": [
    "('data_lake_gen2', 'platinum/diabetes.csv')",
    "('data_lake_gen2', 'platinum/folder/*.csv')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ]
}

In [143]:
# registering these files
file_ds = file_dataset.register(workspace=ws, name='diabetes1')

In [162]:
# get the registered dataset by name from the datasets class
ds = Dataset.get_by_name(workspace=ws, name="diabetes2")

In [145]:
ds

{
  "source": [
    "('data_lake_gen2', 'platinum/diabetes.csv')",
    "('data_lake_gen2', 'platinum/folder/*.csv')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "ebc21ed2-3f94-494b-8072-2c71d2190200",
    "name": "diabetes1",
    "version": 1,
    "workspace": "Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup')"
  }
}

In [146]:
# list all files
ds.to_path()

['/data_lake_gen2/platinum/diabetes.csv',
 '/data_lake_gen2/platinum/folder/diabetes2.csv']

In [147]:
# Passing a dataset to an experiment script

In [148]:
# create A folder named diabetes_training_from_file_dataset here locally
import os

# Create a folder for the experiment files
experiment_folder = 'diabetes_training_from_file_dataset'
os.makedirs(experiment_folder, exist_ok=True)
print(experiment_folder, 'folder created')

diabetes_training_from_file_dataset folder created


In [149]:
# we have created a global variable "experiment_folder"

In [180]:
%%writefile $experiment_folder/diabetes_training.py
# Import libraries
import argparse
from azureml.core import Workspace, Dataset, Experiment, Run
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import glob

# Set regularization hyperparameter (passed as an argument to the script)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
args = parser.parse_args()
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# load the diabetes dataset
print("Loading Data...")
data_path = run.input_datasets['diabetes2'] # Get the training data from the estimator input
print("data_path: " + str(data_path))
all_files = glob.glob(data_path + "/folder/*.csv")
#print("data_path + '/*.csv': " + str(data_path + '/*.csv')) # diabetes_path/*.csv
print([file for file in all_files])   # ['diabetes_path/diabetes.csv', 'diabetes_path/diabetes2.csv']
print("type(all_files): " + str(type(all_files)))
print(type(all_files[0]))
diabetes = pd.concat((pd.read_csv(f) for f in all_files))
print("number of records: " + str(len(diabetes)))

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

Overwriting diabetes_training_from_file_dataset/diabetes_training.py


In [151]:
from azureml.train.sklearn import SKLearn
from azureml.core import Experiment
from azureml.core import Dataset
from azureml.widgets import RunDetails

In [166]:
# get the registered dataset by name from the datasets class
ds = Dataset.get_by_name(workspace=ws, name="diabetes2")

In [167]:
# Set the script parameters
script_params = {
    '--regularization': 0.1
}

In [169]:
ds.as_named_input('diabetes2').__dict__

{'dataset': {
   "source": [
     "('data_lake_gen2', 'platinum/**')"
   ],
   "definition": [
     "GetDatastoreFiles"
   ],
   "registration": {
     "id": "2c81c692-c43c-4f03-9952-45124c0da47c",
     "name": "diabetes2",
     "version": 1,
     "workspace": "Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup')"
   }
 },
 'name': 'diabetes2',
 'mode': 'direct',
 'path_on_compute': None}

In [170]:
ds.as_named_input('diabetes2').as_download(path_on_compute='diabetes_path')

<azureml.data.dataset_consumption_config.DatasetConsumptionConfig at 0x7f2bc2209438>

In [183]:
# list of all the files
ds.to_path()

['/diabetes.csv', '/folder/diabetes2.csv']

In [171]:
# Create an estimator
estimator = SKLearn(source_directory=experiment_folder,
                    entry_script='diabetes_training.py',
                    script_params=script_params,
                    compute_target = 'local',
                    inputs=[ds.as_named_input('diabetes2').as_download(path_on_compute='diabetes_path')], # Pass the Dataset object as an input
                    pip_packages=['azureml-dataprep[pandas]'] # so we need the dataprep package
                   )

In [181]:
# Create an experiment
experiment_name = 'diabetes-training'
experiment = Experiment(workspace = ws, name = experiment_name)
# Run the experiment
run = experiment.submit(config=estimator)

In [182]:
# Show the run details while running
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'diabetes-training_1585314547_c0946781',
 'target': 'local',
 'status': 'Finalizing',
 'startTimeUtc': '2020-03-27T13:09:09.090884Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': 'a3adee05-57f2-4bdc-903f-8941cc1e936b',
  'azureml.git.repository_uri': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'mlflow.source.git.repoURL': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': '8189bb0610764a3d5583f351f729bdc6cb32fe0e',
  'mlflow.source.git.commit': '8189bb0610764a3d5583f351f729bdc6cb32fe0e',
  'azureml.git.dirty': 'True'},
 'inputDatasets': [{'dataset': {'id': '2c81c692-c43c-4f03-9952-45124c0da47c'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'diabetes2', 'mechanism': 'Download', 'pathOnCompute': 'diabetes_path'}}],
 'runDefinition': {'script': 'diabetes_training.py',
  'useAbsolutePath': False,
  'arguments': [

# let's try again with Mount

In [3]:
from azureml.core import Workspace
from azureml.core import Datastore
from azureml.core import Dataset

#from azureml.train.sklearn import SKLearn
from azureml.core import Experiment
from azureml.widgets import RunDetails

In [5]:
ws = Workspace.from_config()

In [7]:
ws.datasets

{'diabetes2': DatasetRegistration(id='2c81c692-c43c-4f03-9952-45124c0da47c', name='diabetes2', version=1, description='', tags={}), 'diabetes1': DatasetRegistration(id='ebc21ed2-3f94-494b-8072-2c71d2190200', name='diabetes1', version=1, description='', tags={}), 'datalakegold': DatasetRegistration(id='ad71d877-e111-4bd4-bf8a-8a602709dffd', name='datalakegold', version=1, description='', tags={}), 'datalake': DatasetRegistration(id='a2af81a9-8e27-429d-8845-489bd371e9ca', name='datalake', version=1, description='', tags={})}

In [8]:
ds = Dataset.get_by_name(workspace=ws, name="diabetes2")

In [9]:
ds.to_path()

['/diabetes.csv', '/folder/diabetes2.csv']

In [10]:
# create A folder named diabetes_training_from_file_dataset here locally
import os

# Create a folder for the experiment files
experiment_folder = 'diabetes_training_from_file_dataset'
os.makedirs(experiment_folder, exist_ok=True)
print(experiment_folder, 'folder created')

diabetes_training_from_file_dataset folder created


In [15]:
# Set the script parameters
script_params = {
    '--regularization': 0.1
}

In [11]:
%%writefile $experiment_folder/diabetes_training.py
# Import libraries
import argparse
from azureml.core import Workspace, Dataset, Experiment, Run
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import glob

# Set regularization hyperparameter (passed as an argument to the script)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
args = parser.parse_args()
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# load the diabetes dataset
print("Loading Data...")
data_path = run.input_datasets['diabetes2'] # Get the training data from the estimator input
print("data_path: " + str(data_path))
all_files = glob.glob(data_path + "/**/*.csv", recursive=True)
print("data_path + '/**/*.csv': " + str(data_path + '/**/*.csv')) # diabetes_path/*.csv
print([file for file in all_files])   # ['diabetes_path/diabetes.csv', 'diabetes_path/diabetes2.csv']
print("type(all_files): " + str(type(all_files)))
print(type(all_files[0]))
diabetes = pd.concat((pd.read_csv(f) for f in all_files))
print("number of records: " + str(len(diabetes)))
print("writing outputs/diabetes.parquet:")
diabetes.to_parquet("outputs/diabetes.parquet")
print("writing logs/out.csv:")
diabetes.to_csv("logs/out.csv", index=False)
print("crap upload")
#diabetes.to_csv("diabetes_path/diabetes.csv", index=False)
#run.output_datasets['diabetes2']
# read-only filesystem !!

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

Overwriting diabetes_training_from_file_dataset/diabetes_training.py


In [12]:
# mode: download
#ds.as_named_input('diabetes2').as_download(path_on_compute='diabetes_path').__dict__
# mode: mount
ds.as_named_input('diabetes2').as_mount(path_on_compute='diabetes_path').__dict__

{'dataset': {
   "source": [
     "('data_lake_gen2', 'platinum/**')"
   ],
   "definition": [
     "GetDatastoreFiles"
   ],
   "registration": {
     "id": "2c81c692-c43c-4f03-9952-45124c0da47c",
     "name": "diabetes2",
     "version": 1,
     "workspace": "Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup')"
   }
 },
 'name': 'diabetes2',
 'mode': 'mount',
 'path_on_compute': 'diabetes_path'}

In [13]:
# Create an estimator
# estimator = SKLearn(source_directory=experiment_folder,
#                     entry_script='diabetes_training.py',
#                     script_params=script_params,
#                     compute_target = 'local',
#                     inputs=[ds.as_named_input('diabetes2').as_download(path_on_compute='diabetes_path')], # Pass the Dataset object as an input
#                     pip_packages=['azureml-dataprep[pandas]'] # so we need the dataprep package
#                    )

In [14]:
# Create an estimator
# estimator = Estim(source_directory=experiment_folder,
#                     entry_script='diabetes_training.py',
#                     script_params=script_params,
#                     compute_target = 'local',
#                     inputs=[ds.as_named_input('diabetes2').as_mount(path_on_compute='diabetes_path')], # Pass the Dataset object as an input
#                     pip_packages=['azureml-dataprep[pandas]', 'azureml-dataprep[fuse]', 'pyarrow', 'fastparquet'] # so we need the dataprep package
#                    )

In [16]:
from azureml.train.estimator import Estimator
# Create an estimator
#data_ref = blob_ds.path('gold/').as_download(path_on_compute='training_data')
estimator = Estimator(source_directory=experiment_folder,
                      entry_script='diabetes_training.py',
                      script_params=script_params,
                      compute_target = 'local',
                      inputs=[ds.as_named_input('diabetes2').as_mount(path_on_compute='diabetes_path')],
                      conda_packages=['scikit-learn', 'joblib'],
                      pip_packages=['azureml-dataprep[pandas]', 'azureml-dataprep[fuse]', 'pyarrow', 'fastparquet']
                     )

In [17]:
# Create an experiment
experiment_name = 'diabetes-training'
experiment = Experiment(workspace = ws, name = experiment_name)
# Run the experiment
run = experiment.submit(config=estimator)

In [18]:
# Show the run details while running
RunDetails(run).show()
run.wait_for_completion()
#run.wait_for_completion(show_output=True)

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'diabetes-training_1585387694_4b338672',
 'target': 'local',
 'status': 'Finalizing',
 'startTimeUtc': '2020-03-28T09:28:18.865017Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '1df106e5-ab5c-43de-8c4b-ee4ed3c1ad43',
  'azureml.git.repository_uri': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'mlflow.source.git.repoURL': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': 'f0ca49dc4562d42ef37c283476bd7532e5beaac6',
  'mlflow.source.git.commit': 'f0ca49dc4562d42ef37c283476bd7532e5beaac6',
  'azureml.git.dirty': 'True'},
 'inputDatasets': [{'dataset': {'id': '2c81c692-c43c-4f03-9952-45124c0da47c'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'diabetes2', 'mechanism': 'Mount', 'pathOnCompute': 'diabetes_path'}}],
 'runDefinition': {'script': 'diabetes_training.py',
  'useAbsolutePath': False,
  'arguments': ['--

# Compute contexts

In [30]:
from azureml.core import Workspace

In [23]:
from azureml.core import Workspace

ws = Workspace.from_config()

In [24]:
from azureml.core import Environment

In [68]:
%%writefile conda.yml
name: py_env
dependencies:
  - numpy
  - pandas
  - scikit-learn
  - joblib
  - pip:
    - azureml-defaults
    - azureml-dataprep[pandas]
    - azureml-dataprep[fuse]
    - pyarrow
    - fastparquet

Overwriting conda.yml


In [69]:
# create environment from a file
env = Environment.from_conda_specification(name='training_environment',
                                           file_path='./conda.yml')



In [70]:
env_names = Environment.list(workspace=ws)
for env_name in env_names:
    print('Name:',env_name)

Name: training_environment
Name: AzureML-Tutorial
Name: AzureML-Minimal
Name: AzureML-Chainer-5.1.0-GPU
Name: AzureML-PyTorch-1.2-CPU
Name: AzureML-TensorFlow-1.12-CPU
Name: AzureML-TensorFlow-1.13-CPU
Name: AzureML-PyTorch-1.1-CPU
Name: AzureML-TensorFlow-1.10-CPU
Name: AzureML-PyTorch-1.0-GPU
Name: AzureML-TensorFlow-1.12-GPU
Name: AzureML-TensorFlow-1.13-GPU
Name: AzureML-Chainer-5.1.0-CPU
Name: AzureML-PyTorch-1.0-CPU
Name: AzureML-Scikit-learn-0.20.3
Name: AzureML-PyTorch-1.2-GPU
Name: AzureML-PyTorch-1.1-GPU
Name: AzureML-TensorFlow-1.10-GPU
Name: AzureML-PyTorch-1.3-GPU
Name: AzureML-TensorFlow-2.0-CPU
Name: AzureML-PyTorch-1.3-CPU
Name: AzureML-TensorFlow-2.0-GPU
Name: AzureML-PySpark-MmlSpark-0.15
Name: AzureML-AutoML
Name: AzureML-PyTorch-1.4-GPU
Name: AzureML-PyTorch-1.4-CPU
Name: AzureML-VowpalWabbit-8.8.0
Name: AzureML-Hyperdrive-ForecastDNN
Name: AzureML-AutoML-GPU
Name: AzureML-AutoML-DNN-GPU
Name: AzureML-AutoML-DNN
Name: AzureML-Designer-R
Name: AzureML-Designer-Recomm

In [71]:
#create environment from the DSVM itself
# env = Environment.from_existing_conda_environment(name='training_environment',
#                                                   conda_environment_name='azureml_py36_automl')

In [72]:
env.register(workspace=ws)

{
    "name": "training_environment",
    "version": "5",
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "python": {
        "userManagedDependencies": false,
        "interpreterPath": "python",
        "condaDependenciesFile": null,
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "dependencies": [
                "numpy",
                "pandas",
                "scikit-learn",
                "joblib",
                {
                    "pip": [
                        "azureml-defaults",
                        "azureml-dataprep[pandas]",
                        "azureml-dataprep[fuse]",
                        "pyarrow",
                        "fastparquet"
                    ]
                },
                "python=3.6.2"
            ],
            "name": "azureml_5920f805fa659293f97bedc85ff62dbe"
        }
    },
    "docker": {
        "enabled": false,
        "baseImage": "mcr.microsoft.com/azure

In [50]:
env_names = Environment.list(workspace=ws)
for env_name in env_names:
    print('Name:',env_name)

Name: training_environment
Name: AzureML-Tutorial
Name: AzureML-Minimal
Name: AzureML-Chainer-5.1.0-GPU
Name: AzureML-PyTorch-1.2-CPU
Name: AzureML-TensorFlow-1.12-CPU
Name: AzureML-TensorFlow-1.13-CPU
Name: AzureML-PyTorch-1.1-CPU
Name: AzureML-TensorFlow-1.10-CPU
Name: AzureML-PyTorch-1.0-GPU
Name: AzureML-TensorFlow-1.12-GPU
Name: AzureML-TensorFlow-1.13-GPU
Name: AzureML-Chainer-5.1.0-CPU
Name: AzureML-PyTorch-1.0-CPU
Name: AzureML-Scikit-learn-0.20.3
Name: AzureML-PyTorch-1.2-GPU
Name: AzureML-PyTorch-1.1-GPU
Name: AzureML-TensorFlow-1.10-GPU
Name: AzureML-PyTorch-1.3-GPU
Name: AzureML-TensorFlow-2.0-CPU
Name: AzureML-PyTorch-1.3-CPU
Name: AzureML-TensorFlow-2.0-GPU
Name: AzureML-PySpark-MmlSpark-0.15
Name: AzureML-AutoML
Name: AzureML-PyTorch-1.4-GPU
Name: AzureML-PyTorch-1.4-CPU
Name: AzureML-VowpalWabbit-8.8.0
Name: AzureML-Hyperdrive-ForecastDNN
Name: AzureML-AutoML-GPU
Name: AzureML-AutoML-DNN-GPU
Name: AzureML-AutoML-DNN
Name: AzureML-Designer-R
Name: AzureML-Designer-Recomm

In [35]:
# retrieve any from the list
training_env = Environment.get(workspace=ws, name='training_environment')

In [None]:
# later you put it in "environment_definition=...":

# estimator = Estimator(source_directory='experiment_folder'
#                       entry_script='training_script.py',
#                       compute_target='local',
#                       environment_definition=training_env)

#### Create compute targets

In [36]:
from azureml.core.compute import ComputeTarget, AmlCompute

In [37]:
# Specify a name for the compute (unique within the workspace)
compute_name = 'aml-cluster'

In [39]:
# Define compute configuration
# compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2',
#                                                        min_nodes=0, max_nodes=4,
#                                                        vm_priority='dedicated')
compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2',
                                                       min_nodes=0, max_nodes=4,
                                                       vm_priority='lowpriority')

In [40]:
# Create the compute
aml_cluster = ComputeTarget.create(ws, compute_name, compute_config)
aml_cluster.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


#### Using attached compute (databricks or DSVM)

In [41]:
# using "unmanaged" compute target

In [42]:
# I prefer to do this manually in the portal

In [43]:
# later you put compute in Estimator, "compute_target=...":
# estimator = Estimator(source_directory='experiment_folder',
#                       entry_script='training_script.py',
#                       environment_definition=training_env,
#                       compute_target='alm-cluster')

In [73]:
from azureml.core.compute import ComputeTarget

In [74]:
training_cluster = ComputeTarget(workspace=ws, name='aml-cluster')

In [75]:
# retrieve your prepped compute environment from the list
training_env = Environment.get(workspace=ws, name='training_environment')

In [80]:
training_env

{
    "name": "training_environment",
    "version": "5",
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "python": {
        "userManagedDependencies": false,
        "interpreterPath": "python",
        "condaDependenciesFile": null,
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "dependencies": [
                "numpy",
                "pandas",
                "scikit-learn",
                "joblib",
                {
                    "pip": [
                        "azureml-defaults",
                        "azureml-dataprep[pandas]",
                        "azureml-dataprep[fuse]",
                        "pyarrow",
                        "fastparquet"
                    ]
                },
                "python=3.6.2"
            ],
            "name": "azureml_5920f805fa659293f97bedc85ff62dbe"
        }
    },
    "docker": {
        "enabled": true,
        "baseImage": "mcr.microsoft.com/azurem

In [76]:
# estimator = Estimator(source_directory='experiment_folder',
#                       entry_script='training_script.py',
#                       environment_definition=training_env,
#                       compute_target=training_cluster)

In [77]:
estimator = Estimator(source_directory=experiment_folder,
                      entry_script='diabetes_training.py',
                      script_params=script_params,
                      environment_definition=training_env,
                      compute_target=training_cluster,
                      inputs=[ds.as_named_input('diabetes2').as_mount(path_on_compute='diabetes_path')]
                     )



In [78]:
# Create an experiment
experiment_name = 'diabetes-training'
experiment = Experiment(workspace = ws, name = experiment_name)
# Run the experiment
run = experiment.submit(config=estimator)

In [79]:
# Show the run details while running
RunDetails(run).show()
run.wait_for_completion()
#run.wait_for_completion(show_output=True)

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'diabetes-training_1585395706_505c3720',
 'target': 'aml-cluster',
 'status': 'Finalizing',
 'startTimeUtc': '2020-03-28T11:49:38.057787Z',
 'error': {'error': {'code': 'ServiceError',
   'message': 'Dataset initialization failed: Missing required package "azureml-dataprep[fuse]", which can be installed by running: "/azureml-envs/azureml_5920f805fa659293f97bedc85ff62dbe/bin/python" -m pip install azureml-dataprep[fuse] --upgrade.',
   'details': [],
   'debugInfo': {'type': 'ImportError',
    'message': 'Missing required package "azureml-dataprep[fuse]", which can be installed by running: "/azureml-envs/azureml_5920f805fa659293f97bedc85ff62dbe/bin/python" -m pip install azureml-dataprep[fuse] --upgrade.',
    'stackTrace': '  File "/mnt/batch/tasks/shared/LS_root/jobs/machine_learning_workspace/azureml/diabetes-training_1585395706_505c3720/mounts/workspaceblobstore/azureml/diabetes-training_1585395706_505c3720/azureml-setup/context_manager_injector.py", line 44, in __enter__\

https://nbviewer.jupyter.org/github/MicrosoftDocs/mslearn-aml-labs/blob/master/04-Working_with_Compute.ipynb  
https://nbviewer.jupyter.org/github/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/training/using-environments/using-environments.ipynb  
https://docs.microsoft.com/en-gb/learn/modules/use-compute-contexts-in-aml/2-environments  

# let's try again

In [1]:
from azureml.train.estimator import Estimator
from azureml.core import Workspace
from azureml.core import Experiment
from azureml.core import Datastore
from azureml.core import Dataset
from azureml.core import Environment
from azureml.widgets import RunDetails
from azureml.core.compute import ComputeTarget
from azureml.core.conda_dependencies import CondaDependencies

Failure while loading azureml_run_type_providers. Failed to load entrypoint hyperdrive = azureml.train.hyperdrive:HyperDriveRun._from_run_dto with exception cannot import name '_DistributedTraining'.


In [2]:
ws = Workspace.from_config()

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


In [3]:
# create A folder named diabetes_training_from_file_dataset here locally
import os

# Create a folder for the experiment files
experiment_folder = 'diabetes_training_from_file_dataset'
os.makedirs(experiment_folder, exist_ok=True)
print(experiment_folder, 'folder created')

diabetes_training_from_file_dataset folder created


In [4]:
%%writefile $experiment_folder/diabetes_training.py
# Import libraries
print("start custom script...")
import argparse
print("argparse loaded")
from azureml.core import Workspace, Dataset, Experiment, Run
print("azureml.core loaded")
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import glob
print("all imports loaded")

# Set regularization hyperparameter (passed as an argument to the script)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
args = parser.parse_args()
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# load the diabetes dataset
print("Loading Data...")
data_path = run.input_datasets['diabetes2'] # Get the training data from the estimator input
print("data_path: " + str(data_path))
all_files = glob.glob(data_path + "/**/*.csv", recursive=True)
print("data_path + '/**/*.csv': " + str(data_path + '/**/*.csv')) # diabetes_path/*.csv
print([file for file in all_files])   # ['diabetes_path/diabetes.csv', 'diabetes_path/diabetes2.csv']
print("type(all_files): " + str(type(all_files)))
print(type(all_files[0]))
diabetes = pd.concat((pd.read_csv(f) for f in all_files))
print("number of records: " + str(len(diabetes)))
print("writing outputs/diabetes.parquet:")
diabetes.to_parquet("outputs/diabetes.parquet")
print("writing logs/out.csv:")
diabetes.to_csv("logs/out.csv", index=False)
print("crap upload")
#diabetes.to_csv("diabetes_path/diabetes.csv", index=False)
#run.output_datasets['diabetes2']
# read-only filesystem !!

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

Overwriting diabetes_training_from_file_dataset/diabetes_training.py


In [5]:
%%writefile conda.yml
name: py_env
dependencies:
  - numpy
  - pandas
  - scikit-learn
  - joblib
  - pip:
    - azureml-defaults
    - azureml-dataprep[pandas]
    - azureml-dataprep[fuse]
    - pyarrow
    - fastparquet

Overwriting conda.yml


In [6]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies


# Create a Python environment for the experiment
env = Environment("training_environment")
env.python.user_managed_dependencies = False # Let Azure ML manage dependencies
env.docker.enabled = True # Use a docker container

# Create a set of package dependencies (conda or pip as required)
env_packages = CondaDependencies.create(conda_packages=['scikit-learn', 'joblib'],
                                        pip_packages=['azureml-defaults',
                                                      'azureml-dataprep[pandas]',
                                                      'azureml-dataprep[fuse]',
                                                      'pyarrow',
                                                      'fastparquet'])

# Add the dependencies to the environment
env.python.conda_dependencies = env_packages

print(env.name, 'defined.')

training_environment defined.


In [7]:
# create environment from a file
#env = Environment.from_conda_specification(name='training_environment', file_path='./conda.yml')

In [8]:
env.register(workspace=ws)

{
    "name": "training_environment",
    "version": "13",
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "python": {
        "userManagedDependencies": false,
        "interpreterPath": "python",
        "condaDependenciesFile": null,
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "conda-forge"
            ],
            "dependencies": [
                "python=3.6.2",
                {
                    "pip": [
                        "azureml-defaults==1.0.85.*",
                        "azureml-dataprep[fuse]",
                        "pyarrow",
                        "fastparquet"
                    ]
                },
                "scikit-learn",
                "joblib"
            ],
            "name": "azureml_cec3c0c2eda4dee5bf29ecf1761c4111"
        }
    },
    "docker": {
        "enabled": true,
        "baseImage": "mcr.microsoft.com/azureml/base:intelmpi2018.3

In [9]:
# Set the script parameters
script_params = {'--regularization': 0.1}

ds = Dataset.get_by_name(workspace=ws, name="diabetes2")

# Get the environment
# retrieve your prepped compute environment from the list
training_env = Environment.get(workspace=ws, name='training_environment')

training_cluster = ComputeTarget(workspace=ws, name='aml-cluster')

estimator = Estimator(source_directory=experiment_folder,
                      entry_script='diabetes_training.py',
                      script_params=script_params,
                      environment_definition=training_env,
                      compute_target=training_cluster,
                      inputs=[ds.as_named_input('diabetes2').as_mount(path_on_compute='diabetes_path')]
                     )

# Create an experiment
experiment_name = 'diabetes-training'
experiment = Experiment(workspace = ws, name = experiment_name)
# Run the experiment
run = experiment.submit(config=estimator)

# Show the run details while running
RunDetails(run).show()
run.wait_for_completion()
#run.wait_for_completion(show_output=True)

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'diabetes-training_1585418882_2a8f68d2',
 'target': 'aml-cluster',
 'status': 'Finalizing',
 'startTimeUtc': '2020-03-28T18:20:22.04869Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'c7dcadc3-228b-40ed-88d5-40fdf5749982',
  'azureml.git.repository_uri': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'mlflow.source.git.repoURL': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': 'c61c81ddc6083d0223952071edde983513310a58',
  'mlflow.source.git.commit': 'c61c81ddc6083d0223952071edde983513310a58',
  'azureml.git.dirty': 'True',
  'AzureML.DerivedImageName': 'azureml/azureml_07d072b2a196016f8e79e803fe25ad26',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': '2c81c692-c43c-4f03-9952-45124c0da47c'}, 'consumptionDetails': 

In [None]:
# it builds the docker container on this local DSVM,
# then pushes it onto ML workspace image register (~500MB/image - 10GB capacity)
# - image:
#     registry: machinelearn48d206af.azurecr.io
#     repository: azureml/azureml_07d072b2a196016f8e79e803fe25ad26
#     tag: latest
#     digest: sha256:e4b841deabe07ab49b7f142da5e052be741281aaed902c017a0e96ce5925abc1
# runtime-dependency:
#     registry: mcr.microsoft.com
#     repository: azureml/base
#     tag: intelmpi2018.3-ubuntu16.04
#     digest: sha256:a1b514f3ba884b9a7695cbba5638933ddaf222e8ce3e8c81e8cdf861679abb05

# then it provisions from your compute an instance (~batch service)
# resizing can take 2min and renting costs start...
# downloading docker image and then starting and mounting datalake etc and running and then closing off
# 120 sec for the server to shutdown again 

## final extra test about conda.yml - would be more elegant...
I think we can do it if we use False in that auto dependecy thing...
Let's try again below

In [14]:
from azureml.train.estimator import Estimator
from azureml.core import Workspace
from azureml.core import Experiment
from azureml.core import Datastore
from azureml.core import Dataset
from azureml.core import Environment
from azureml.widgets import RunDetails
from azureml.core.compute import ComputeTarget
from azureml.core.conda_dependencies import CondaDependencies

In [15]:
ws = Workspace.from_config()

In [16]:
# create A folder named diabetes_training_from_file_dataset here locally
import os

# Create a folder for the experiment files
experiment_folder = 'diabetes_training_from_file_dataset'
os.makedirs(experiment_folder, exist_ok=True)
print(experiment_folder, 'folder created')

diabetes_training_from_file_dataset folder created


In [17]:
%%writefile $experiment_folder/diabetes_training.py
# Import libraries
print("start custom script...")
import argparse
print("argparse loaded")
from azureml.core import Workspace, Dataset, Experiment, Run
print("azureml.core loaded")
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import glob
print("all imports loaded")

# Set regularization hyperparameter (passed as an argument to the script)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
args = parser.parse_args()
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# load the diabetes dataset
print("Loading Data...")
data_path = run.input_datasets['diabetes2'] # Get the training data from the estimator input
print("data_path: " + str(data_path))
all_files = glob.glob(data_path + "/**/*.csv", recursive=True)
print("data_path + '/**/*.csv': " + str(data_path + '/**/*.csv')) # diabetes_path/*.csv
print([file for file in all_files])   # ['diabetes_path/diabetes.csv', 'diabetes_path/diabetes2.csv']
print("type(all_files): " + str(type(all_files)))
print(type(all_files[0]))
diabetes = pd.concat((pd.read_csv(f) for f in all_files))
print("number of records: " + str(len(diabetes)))
print("writing outputs/diabetes.parquet:")
diabetes.to_parquet("outputs/diabetes.parquet")
print("writing logs/out.csv:")
diabetes.to_csv("logs/out.csv", index=False)
print("crap upload")
#diabetes.to_csv("diabetes_path/diabetes.csv", index=False)
#run.output_datasets['diabetes2']
# read-only filesystem !!

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

Overwriting diabetes_training_from_file_dataset/diabetes_training.py


In [18]:
%%writefile conda.yml
name: py_env
channels:
 - conda-forge
dependencies:
 - python=3.8.2
 - scikit-learn
 - joblib
 - pip:
    - azureml-defaults
    - pyarrow
    - fastparquet

Overwriting conda.yml


In [19]:
from azureml.core import Environment
#from azureml.core.conda_dependencies import CondaDependencies

# create environment from a file
env = Environment.from_conda_specification(name='training_environment', file_path='./conda.yml')
#env = Environment("training_environment")  # this would create a new env 
env.python.user_managed_dependencies = False # Let Azure ML manage dependencies
env.docker.enabled = True # Use a docker container

In [20]:
env.register(workspace=ws)

{
    "name": "training_environment",
    "version": "18",
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "python": {
        "userManagedDependencies": false,
        "interpreterPath": "python",
        "condaDependenciesFile": null,
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "conda-forge"
            ],
            "dependencies": [
                "python=3.8.2",
                "scikit-learn",
                "joblib",
                {
                    "pip": [
                        "azureml-defaults",
                        "pyarrow",
                        "fastparquet"
                    ]
                }
            ],
            "name": "azureml_ed6ad1ace64132edbe11676fea09b7be"
        }
    },
    "docker": {
        "enabled": true,
        "baseImage": "mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04",
        "baseDockerfile": null,
        "shar

In [21]:
# Set the script parameters
script_params = {'--regularization': 0.1}

ds = Dataset.get_by_name(workspace=ws, name="diabetes2")

# Get the environment
# retrieve your prepped compute environment from the list
training_env = Environment.get(workspace=ws, name='training_environment')

training_cluster = ComputeTarget(workspace=ws, name='aml-cluster')

estimator = Estimator(source_directory=experiment_folder,
                      entry_script='diabetes_training.py',
                      script_params=script_params,
                      environment_definition=training_env,
                      compute_target=training_cluster,
                      inputs=[ds.as_named_input('diabetes2').as_mount(path_on_compute='diabetes_path')]
                     )

# Create an experiment
experiment_name = 'diabetes-training'
experiment = Experiment(workspace = ws, name = experiment_name)
# Run the experiment
run = experiment.submit(config=estimator)

# Show the run details while running
RunDetails(run).show()
run.wait_for_completion()
#run.wait_for_completion(show_output=True)

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'diabetes-training_1585430064_9b4f0ac2',
 'target': 'aml-cluster',
 'status': 'Finalizing',
 'startTimeUtc': '2020-03-28T21:23:41.133957Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'c7dcadc3-228b-40ed-88d5-40fdf5749982',
  'azureml.git.repository_uri': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'mlflow.source.git.repoURL': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': '0e7102f49f3ae90e172c3b504d5bebc9f50b6446',
  'mlflow.source.git.commit': '0e7102f49f3ae90e172c3b504d5bebc9f50b6446',
  'azureml.git.dirty': 'True',
  'AzureML.DerivedImageName': 'azureml/azureml_7a273b62f76221b8f2e7d811a9b5a0ed',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': '2c81c692-c43c-4f03-9952-45124c0da47c'}, 'consumptionDetails':

# AutoML

### 1. algorithms
### 2. preprocessing

In [1]:
from azureml.train.automl import AutoMLConfig

In [3]:
#automl_run_config = RunConfiguration(framework='python')

In [None]:
# automl_config = AutoMLConfig(name='Automated ML Experiment',
#                              task='classification',
#                              primary_metric = 'AUC_weighted',
#                              compute_target=aml_compute,
#                              training_data = train_dataset,
#                              validation_data = test_dataset,
#                              label_column_name='Label',
#                              featurization='auto',
#                              iterations=12,
#                              max_concurrent_iterations=4)

In [4]:
from azureml.train.automl.utilities import get_primary_metrics

get_primary_metrics('classification')

['norm_macro_recall',
 'AUC_weighted',
 'average_precision_score_weighted',
 'precision_score_weighted',
 'accuracy']

In [5]:
from azureml.train.automl.utilities import get_primary_metrics

get_primary_metrics('regression')

['spearman_correlation',
 'normalized_root_mean_squared_error',
 'r2_score',
 'normalized_mean_absolute_error']

In [None]:
# from azureml.core.experiment import Experiment

# automl_experiment = Experiment(ws, 'automl_experiment')
# automl_run = automl_experiment.submit(automl_config)

In [None]:
# # Show the run details while running
# RunDetails(run).show()
# run.wait_for_completion()
# #run.wait_for_completion(show_output=True)

In [None]:
# identify the best run
# best_run, fitted_model = automl_run.get_output()
# best_run_metrics = best_run.get_metrics()
# for metric_name in best_run_metrics:
#     metric = best_run_metrics[metric_name]
#     print(metric_name, metric)

In [None]:
# view preprocessing steps
# for step_ in fitted_model.named_steps:
#     print(step)

# Let's try to autoML make it Work ! (using a local dataset)

In [None]:
# https://nbviewer.jupyter.org/github/MicrosoftDocs/mslearn-aml-labs/blob/master/07-Automated_ML.ipynb
# https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning#samples

In [None]:
# try many combinations of algorithms and preprocessing transformations
# automate the comparison of models trained using different algorithms and preprocessing options

In [2]:
# make sure you install this on the "py_37default" evironment !
# this is when we compute 'local' it uses this default environment !
! pip install --upgrade azureml-sdk[explain,automl]
! pip install --upgrade azureml-widgets

Collecting azureml-sdk[automl,explain]
  Downloading azureml_sdk-1.2.0-py3-none-any.whl (4.6 kB)
Collecting azureml-train-automl-client~=1.2.0
  Using cached azureml_train_automl_client-1.2.0-py3-none-any.whl (78 kB)
Collecting azureml-dataprep[fuse]<1.4.0a,>=1.3.5
  Using cached azureml_dataprep-1.3.5-py3-none-any.whl (26.6 MB)
Collecting azureml-pipeline~=1.2.0
  Downloading azureml_pipeline-1.2.0-py3-none-any.whl (3.7 kB)
Collecting azureml-core~=1.2.0
  Using cached azureml_core-1.2.0.post1-py3-none-any.whl (1.2 MB)
Collecting azureml-train~=1.2.0
  Downloading azureml_train-1.2.0-py3-none-any.whl (3.2 kB)
Collecting azureml-train-automl~=1.2.0; extra == "automl"
  Downloading azureml_train_automl-1.2.0-py3-none-any.whl (3.4 kB)
Collecting azureml-explain-model~=1.2.0; extra == "explain"
  Using cached azureml_explain_model-1.2.0-py3-none-any.whl (22 kB)
Collecting azureml-automl-core~=1.2.0
  Using cached azureml_automl_core-1.2.0-py3-none-any.whl (113 kB)
Collecting azureml-telem

Collecting websocket-client>=0.32.0
  Using cached websocket_client-0.57.0-py2.py3-none-any.whl (200 kB)
Collecting flake8<=3.7.9,>=3.1.0; python_version >= "3.6"
  Downloading flake8-3.7.9-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 3.8 MB/s  eta 0:00:01
[?25hCollecting azureml-train-restclients-hyperdrive~=1.2.0
  Downloading azureml_train_restclients_hyperdrive-1.2.0-py3-none-any.whl (18 kB)
Collecting pandas<=0.23.4,>=0.21.0
  Downloading pandas-0.23.4-cp37-cp37m-manylinux1_x86_64.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 39.2 MB/s eta 0:00:01
[?25hCollecting onnxconverter-common<=1.6.0,>=1.4.2
  Using cached onnxconverter_common-1.6.0-py2.py3-none-any.whl (43 kB)
Collecting pmdarima==1.1.1
  Downloading pmdarima-1.1.1-cp37-cp37m-manylinux1_x86_64.whl (696 kB)
[K     |████████████████████████████████| 696 kB 41.5 MB/s eta 0:00:01
[?25hCollecting sklearn-pandas<=1.7.0,>=1.4.0
  Downloading sklearn_pandas-1.7.0-py2.py3-none-a

Collecting packaging
  Downloading packaging-20.3-py2.py3-none-any.whl (37 kB)
Collecting shap<=0.34.0,>=0.20.0
  Downloading shap-0.34.0.tar.gz (264 kB)
[K     |████████████████████████████████| 264 kB 47.6 MB/s eta 0:00:01
[?25hCollecting interpret-core[required]==0.1.20
  Using cached interpret_core-0.1.20-py3-none-any.whl (7.9 MB)
Collecting PyYAML
  Downloading PyYAML-5.3.1.tar.gz (269 kB)
[K     |████████████████████████████████| 269 kB 44.0 MB/s eta 0:00:01
Collecting liac-arff>=2.1.1
  Using cached liac-arff-2.4.0.tar.gz (15 kB)
Collecting itsdangerous>=0.24
  Using cached itsdangerous-1.1.0-py2.py3-none-any.whl (16 kB)
Building wheels for collected packages: fusepy, py-cpuinfo, psutil, dill, JsonSir, JsonForm, json-logging-py, shap, PyYAML, liac-arff
  Building wheel for fusepy (setup.py) ... [?25ldone
[?25h  Created wheel for fusepy: filename=fusepy-3.0.1-py3-none-any.whl size=10502 sha256=651f70c09ed429bcc645dc74582d39e85acae1018962fb98f73c81582e311b95
  Stored in direc

  Building wheel for JsonForm (setup.py) ... [?25ldone
[?25h  Created wheel for JsonForm: filename=JsonForm-0.0.2-py3-none-any.whl size=3326 sha256=3569bb90a24a22ebafc27ea808a1a00779b42e6734c97c179cf818d0e1a7cef4
  Stored in directory: /home/ubuntu/.cache/pip/wheels/cb/e2/4e/2e3c9500e5e695f31fa97ad873d5565bbd985cc484cba4a265
  Building wheel for json-logging-py (setup.py) ... [?25ldone
[?25h  Created wheel for json-logging-py: filename=json_logging_py-0.2-py3-none-any.whl size=3924 sha256=85dc098889f1ff86a61778c782180accd3ae7dda50d17c25c6cb0e72c01cadae
  Stored in directory: /home/ubuntu/.cache/pip/wheels/2b/2c/0b/56aba27cc60071c52f66346a1abc22ee9db8c7376549aa4910
  Building wheel for shap (setup.py) ... [?25ldone
[?25h  Created wheel for shap: filename=shap-0.34.0-cp37-cp37m-linux_x86_64.whl size=388179 sha256=eda9e70a07b8f229381bc9b0fe0ad4c3f51cfd8766f3f0a7b44998730197c90c
  Stored in directory: /home/ubuntu/.cache/pip/wheels/05/86/23/2c22a86fb2ba700382f20e1dbe536e211b3b1578aec

In [7]:
#! pip install --upgrade azureml-widgets

Collecting azureml-widgets
  Downloading azureml_widgets-1.2.0-py3-none-any.whl (14.3 MB)
[K     |████████████████████████████████| 14.3 MB 12.5 MB/s eta 0:00:01




Installing collected packages: azureml-widgets
Successfully installed azureml-widgets-1.2.0


In [11]:
# here you can use the "azureml_py36_automl" kernel again...
from azureml.train.estimator import Estimator
from azureml.core import Workspace
from azureml.core import Experiment
from azureml.core import Datastore
from azureml.core import Dataset
from azureml.core import Environment
from azureml.widgets import RunDetails
from azureml.core.compute import ComputeTarget
from azureml.core.conda_dependencies import CondaDependencies

import pandas as pd
from azureml.train.automl import AutoMLConfig

In [12]:
# Connect to Your Workspace
ws = Workspace.from_config()

In [3]:
# Configure Automated Machine Learning

In [4]:
# Load the data
train_data = pd.read_csv('data/diabetes.csv')

In [5]:
train_data.sample(3)

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
4118,1638800,6,172,60,41,63,28.26,0.29,30,1
5185,1236992,0,138,80,45,71,37.23,0.28,22,0
6773,1786417,3,106,85,43,33,40.28,0.69,53,1


In [6]:
train_data.columns

Index(['PatientID', 'Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure',
       'TricepsThickness', 'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age',
       'Diabetic'],
      dtype='object')

In [7]:
automl_config = AutoMLConfig(name='Automated ML Experiment',
                             task='classification',
                             compute_target='local',
                             training_data = train_data,
                             n_cross_validations = 2,
                             label_column_name = 'Diabetic',
                             iterations=6,
                             primary_metric = 'AUC_weighted',
                             max_concurrent_iterations=3,
                             featurization='auto'
                             )

In [8]:
# Run an Automated Machine Learning Experiment

In [9]:
# !pip install -U azureml-train-automl-runtime

In [10]:
automl_experiment = Experiment(ws, 'diabetes_automl')
automl_run = automl_experiment.submit(automl_config)
RunDetails(automl_run).show()
automl_run.wait_for_completion()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

{'runId': 'AutoML_735a99da-69ae-4cba-a0ea-039fcd8a112c',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-03-30T14:30:12.659269Z',
 'endTimeUtc': '2020-03-30T14:32:07.766982Z',
 'properties': {'num_iterations': '6',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '2',
  'target': 'local',
  'DataPrepJsonString': None,
  'EnableSubsampling': 'False',
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.0.85", "azureml-train": "1.0.85", "azureml-train-restclients-hyperdrive": "1.0.85", "azureml-train-core": "1.0.85", "azureml-train-automl": "1.0.85", "azureml-train-automl-runtime": "1.2.0", "azureml-train-automl-client": "1.2.0", "azureml-tensorboard": "1.0.85", "azureml-telemetry": "1.2.0", "azureml-sdk": "1.0.85", "azureml-pipeline": "1.0.85

In [13]:
# Determine the Best Performing Model
# Notice: it uses sklearn transformation pipelines !
best_run, fitted_model = automl_run.get_output()
print(best_run)
print(fitted_model)
best_run_metrics = best_run.get_metrics()
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)

Run(Experiment: diabetes_automl,
Id: AutoML_735a99da-69ae-4cba-a0ea-039fcd8a112c_0,
Type: None,
Status: Completed)
Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
        feature_sweeping_config=None, feature_sweeping_timeout=None,
        featurization_config=None, force_text_dnn=None,
        is_cross_validation=None, is_onnx_compatible=None, logger=None,
        obser...    silent=True, subsample=1.0, subsample_for_bin=200000,
          subsample_freq=0, verbose=-10))])
recall_score_micro 0.9480999999999999
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_735a99da-69ae-4cba-a0ea-039fcd8a112c_0/accuracy_table
log_loss 0.12762078769027035
f1_score_macro 0.9414766222109543
precision_score_weighted 0.9479516256433123
precision_score_micro 0.9480999999999999
average_precision_score_micro 0.9902351771273721
norm_macro_recall 0.8794903988906442
average_precision_score_macro 0.9868730917793174
AUC_weighted 0.98873

In [14]:
automl_run

Experiment,Id,Type,Status,Details Page,Docs Page
diabetes_automl,AutoML_735a99da-69ae-4cba-a0ea-039fcd8a112c,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [17]:
for step in fitted_model.named_steps:
    print(step)

datatransformer
MaxAbsScaler
LightGBMClassifier


In [16]:
automl_run.get_output()

(Run(Experiment: diabetes_automl,
 Id: AutoML_735a99da-69ae-4cba-a0ea-039fcd8a112c_0,
 Type: None,
 Status: Completed),
 Pipeline(memory=None,
      steps=[('datatransformer', DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
         feature_sweeping_config=None, feature_sweeping_timeout=None,
         featurization_config=None, force_text_dnn=None,
         is_cross_validation=None, is_onnx_compatible=None, logger=None,
         obser...    silent=True, subsample=1.0, subsample_for_bin=200000,
           subsample_freq=0, verbose=-10))]))

In [15]:
automl_run.__dict__

{'_jasmine_client': <azureml._restclient.jasmine_client.JasmineClient at 0x7f81ceadae48>,
 '_experiment': Experiment(Name: diabetes_automl,
 Workspace: machine_learning_workspace),
 '_run_id': 'AutoML_735a99da-69ae-4cba-a0ea-039fcd8a112c',
 '_identity': 'AutoMLRun#AutoML_735a99da-69ae-4cba-a0ea-039fcd8a112c',
 '_portal_url': 'https://ml.azure.com',
 '_workspace_url': 'https://ml.azure.com?wsid=/subscriptions/43c1f93a-903d-4b23-a4bf-92bd7a150627/resourcegroups/myResourceGroup/workspaces/machine_learning_workspace',
 '_experiment_url': 'https://ml.azure.com/experiments/diabetes_automl?wsid=/subscriptions/43c1f93a-903d-4b23-a4bf-92bd7a150627/resourcegroups/myResourceGroup/workspaces/machine_learning_workspace',
 '_run_details_url': 'https://ml.azure.com/experiments/diabetes_automl/runs/AutoML_735a99da-69ae-4cba-a0ea-039fcd8a112c?wsid=/subscriptions/43c1f93a-903d-4b23-a4bf-92bd7a150627/resourcegroups/myResourceGroup/workspaces/machine_learning_workspace',
 '_client': <azureml._run_impl.run

## autoML using remote compute (with a local dataset)

In [5]:
from azureml.core import Workspace

In [6]:
ws = Workspace.from_config()

In [None]:
# from azureml.core.compute import ComputeTarget, AmlCompute
# from azureml.core.compute_target import ComputeTargetException

# # Choose a name for your CPU cluster
# cpu_cluster_name = "cpu-cluster"

# # Verify that cluster does not exist already
# try:
#     compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
#     print('Found existing cluster, use it.')
# except ComputeTargetException:
#     compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
#                                                            max_nodes=4)
#     compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

# compute_target.wait_for_completion(show_output=True)

In [3]:
# we already have a compute instance, or we can make it manually in the portal

In [8]:
from azureml.core.compute import ComputeTarget
compute_target = ComputeTarget(workspace=ws, name='aml-cluster')

In [10]:
from azureml.core.experiment import Experiment
# choose a name for experiment
experiment_name = 'automl-classification-bmarketing-all'

experiment=Experiment(ws, experiment_name)

In [16]:
import pandas as pd
data = pd.read_csv("data/diabetes.csv")
data.sample(2)

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
1152,1669671,0,67,55,10,38,21.928172,0.177577,34,0
830,1245444,1,116,91,43,140,48.002186,0.087448,32,0


In [61]:
data.to_parquet("data/diabetes.parquet")

In [19]:
from azureml.core import Datastore

In [62]:
# This fails not compatible with data lake gen2 or something...
datastore = Datastore.get(ws, 'data_lake_gen2')
datastore.upload_files(files=['data/diabetes.csv', 'data/diabetes.parquet'], target_path="target_path/tabular/", overwrite=True, show_progress=True)

AttributeError: 'AzureDataLakeGen2Datastore' object has no attribute 'upload_files'

In [63]:
# if we use default blob storage, it works fine
datastore = Datastore.get(ws, 'workspaceblobstore')
datastore.upload_files(files=['data/diabetes.csv', 'data/diabetes.parquet'], target_path="target_path/tabular/", overwrite=True, show_progress=True)

Uploading an estimated of 2 files
Uploading data/diabetes.csv
Uploading data/diabetes.parquet
Uploaded data/diabetes.csv, 1 files out of an estimated total of 2
Uploaded data/diabetes.parquet, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_aa7f65b44dde4c2a85eec7d3fb8dd469

In [48]:
# what is needs is a dataset...
from azureml.core import Dataset

In [49]:
Datastore.get(ws, 'workspaceblobstore')

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-8ffd38a4-d688-44f6-9fc7-862df920c646",
  "account_name": "machinelstorage071578f15",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

In [55]:
# can it work from blob storage ?
train_dataset = Dataset.Tabular.from_delimited_files(path=[(Datastore.get(ws, 'workspaceblobstore'), 'target_path/tabular/diabetes.csv')])
train_dataset

{
  "source": [
    "('workspaceblobstore', 'target_path/tabular/diabetes.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ]
}

In [56]:
# can it do it from data lake gen 2 ?
train_dataset_2 = Dataset.Tabular.from_delimited_files(path=[(Datastore.get(ws, 'data_lake_gen2'), 'platinum/diabetes.csv')])
train_dataset_2

{
  "source": [
    "('data_lake_gen2', 'platinum/diabetes.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ]
}

In [13]:
# you normally split the data up in training, validation and testing datasets

In [57]:
# define autoML settings:
import logging

automl_settings = {
    "experiment_timeout_hours" : 0.3,
    "enable_early_stopping" : True,
    "iteration_timeout_minutes": 5,
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1,
    #"n_cross_validations": 2,
    "primary_metric": 'AUC_weighted',
    "featurization": 'auto',
    "verbosity": logging.INFO,
}

from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(task = 'classification',
                             debug_log = 'automl_errors.log',
                             compute_target=compute_target,
                             #experiment_exit_score = 0.9984,
                             blacklist_models = ['KNN','LinearSVM'],
                             enable_onnx_compatible_models=True,
                             training_data = train_dataset,
                             label_column_name = "Diabetic",
                             #validation_data = validation_dataset,
                             **automl_settings
                            )

In [58]:
# Call the submit method on the experiment object and pass the run configuration. Execution of local runs is synchronous.
# Depending on the data and the number of iterations this can run for a while.
remote_run = experiment.submit(automl_config, show_output=False)

In [59]:
# define autoML settings:
import logging

automl_settings = {
    "experiment_timeout_hours" : 0.3,
    "enable_early_stopping" : True,
    "iteration_timeout_minutes": 5,
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1,
    #"n_cross_validations": 2,
    "primary_metric": 'AUC_weighted',
    "featurization": 'auto',
    "verbosity": logging.INFO,
}

from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(task = 'classification',
                             debug_log = 'automl_errors.log',
                             compute_target=compute_target,
                             #experiment_exit_score = 0.9984,
                             blacklist_models = ['KNN','LinearSVM'],
                             enable_onnx_compatible_models=True,
                             training_data = train_dataset_2,
                             label_column_name = "Diabetic",
                             #validation_data = validation_dataset,
                             **automl_settings
                            )

In [60]:
# Call the submit method on the experiment object and pass the run configuration. Execution of local runs is synchronous.
# Depending on the data and the number of iterations this can run for a while.
remote_run = experiment.submit(automl_config, show_output=False)

## testing some new parquet data loading from data lake 

https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.dataset_factory.tabulardatasetfactory?view=azure-ml-py#from-parquet-files-path--validate-true--include-path-false--set-column-types-none-

https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/work-with-data/dataset-api-change-notice.md

In [64]:
from azureml.core import Dataset, Datastore
from azureml.data.datapath import DataPath

### this is the RIGHT way to register data !!!!  1/2

In [67]:
# register first
from azureml.core import Datastore
ds = Datastore.register_azure_blob_container(workspace=ws,
                                             datastore_name="datalakestoragegen2",
                                             container_name="datalake",
                                             account_name="datalake21032020",
                                             account_key="Ck/4hMq3Zrzq5toZ96zE6cDncjbw2VdkR9ny1xXA3GLBwQXIv7V1ycSc/KpqyNRcoPWKtzKljjpcZVqjWOu+3Q==",
                                             create_if_not_exists=False)

In [68]:
import pandas as pd
#data = 

In [69]:
ws.datastores

{'datalakestoragegen2': {
   "name": "datalakestoragegen2",
   "container_name": "datalake",
   "account_name": "datalake21032020",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'data_lake_gen2': <azureml.data.azure_data_lake_datastore.AzureDataLakeGen2Datastore at 0x7fbbeab5f5f8>,
 'workspacefilestore': {
   "name": "workspacefilestore",
   "container_name": "azureml-filestore-8ffd38a4-d688-44f6-9fc7-862df920c646",
   "account_name": "machinelstorage071578f15",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'workspaceblobstore': {
   "name": "workspaceblobstore",
   "container_name": "azureml-blobstore-8ffd38a4-d688-44f6-9fc7-862df920c646",
   "account_name": "machinelstorage071578f15",
   "protocol": "https",
   "endpoint": "core.windows.net"
 }}

# registering parquet files now work !!

In [2]:
from azureml.core import Workspace
ws = Workspace.from_config()

In [18]:
# from azureml.core.compute import ComputeTarget
# compute_target = ComputeTarget(workspace=ws, name='aml-cluster-fast')

In [5]:
from azureml.core.experiment import Experiment
# choose a name for experiment
experiment_name = 'automl-classification-2'
experiment = Experiment(ws, experiment_name)

In [6]:
# manual registration works on portal 

In [7]:
# see below if you want to do it code based manually

In [11]:
from azureml.core import Dataset, Datastore
from azureml.data.datapath import DataPath

### this is the right way to register dataset !!!! 2/2

In [14]:
# create tabular dataset from Parquet files in datastore
datastore = Datastore.get(ws, 'data_lake_gen2')
datastore_path = [DataPath(datastore, 'platinum/diabetes.parquet')]
tabular = Dataset.Tabular.from_parquet_files(path=datastore_path)

In [15]:
tabular

{
  "source": [
    "('data_lake_gen2', 'platinum/diabetes.parquet')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ReadParquetFile",
    "DropColumns"
  ]
}

In [16]:
# !! now, run autoML using the "tabular" dataset (loaded from parquet file from data lake gen2)

In [17]:
# from azureml.core.compute import ComputeTarget, AmlCompute
# from azureml.core.compute_target import ComputeTargetException

# # Choose a name for your CPU cluster
# cpu_cluster_name = "aml-cluster-fast"

# # Verify that cluster does not exist already
# try:
#     compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
#     print('Found existing cluster, use it.')
# except ComputeTargetException:
#     compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D1',
#                                                            max_nodes=4)
#     compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

# compute_target.wait_for_completion(show_output=True)

In [19]:
from azureml.core.compute import ComputeTarget
compute_target = ComputeTarget(workspace=ws, name='aml-cluster-fast')

In [22]:
# define autoML settings:
import logging

automl_settings = {
    "experiment_timeout_hours" : 0.3,
    "enable_early_stopping" : True,
    "iteration_timeout_minutes": 5,
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1,
    #"n_cross_validations": 2,
    "primary_metric": 'AUC_weighted',
    "featurization": 'auto',
    "verbosity": logging.INFO,
}

from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(task = 'classification',
                             debug_log = 'automl_errors.log',
                             compute_target=compute_target,
                             #experiment_exit_score = 0.9984,
                             blacklist_models = ['KNN','LinearSVM'],
                             enable_onnx_compatible_models=True,
                             training_data = tabular,
                             label_column_name = "Diabetic",
                             #validation_data = validation_dataset,
                             **automl_settings
                            )

In [23]:
# Call the submit method on the experiment object and pass the run configuration. Execution of local runs is synchronous.
# Depending on the data and the number of iterations this can run for a while.
remote_run = experiment.submit(automl_config, show_output=False)

In [82]:
# or you can do it from the DSVM local compute:

In [83]:
# define autoML settings:
import logging

automl_settings = {
    "experiment_timeout_hours" : 0.3,
    "enable_early_stopping" : True,
    "iteration_timeout_minutes": 5,
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1,
    #"n_cross_validations": 2,
    "primary_metric": 'AUC_weighted',
    "featurization": 'auto',
    "verbosity": logging.INFO,
}

from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(task = 'classification',
                             debug_log = 'automl_errors.log',
                             compute_target='local',
                             #experiment_exit_score = 0.9984,
                             blacklist_models = ['KNN','LinearSVM'],
                             enable_onnx_compatible_models=True,
                             training_data = tabular,
                             label_column_name = "Diabetic",
                             #validation_data = validation_dataset,
                             **automl_settings
                            )

In [84]:
# Call the submit method on the experiment object and pass the run configuration. Execution of local runs is synchronous.
# Depending on the data and the number of iterations this can run for a while.
remote_run = experiment.submit(automl_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_bda3cdce-2a48-47e2-8630-0b54114e0c1a

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Cross validation
STATUS:       DONE
DESCRIPTION:  Each iteration of the trained model was validated through cross-validation.
PARAMETERS:   Number of folds : 3
              
TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Classes are balanced in the training data.

TYPE:         High cardinality feature detection
STATUS:     

In [None]:
# It both worked, it took 1h for the nodes to run, not sure why it is not using 4 nodes...

# putting model into production

In [14]:
from azureml.core import Workspace

In [16]:
ws = Workspace.from_config()

In [21]:
from azureml.core import Experiment
from azureml.core import Model
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Create an Azure ML experiment in your workspace
experiment = Experiment(workspace=ws, name="diabetes-training")
run = experiment.start_logging()
print("Starting experiment:", experiment.name)

# load the diabetes dataset
print("Loading Data...")
diabetes = pd.read_csv('data/diabetes.csv')

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a decision tree model
print('Training a decision tree model')
model = DecisionTreeClassifier().fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

# Save the trained model
model_file = 'diabetes_model.pkl'
joblib.dump(value=model, filename=model_file)
run.upload_file(name='outputs/' + model_file,
                path_or_stream='./' + model_file)

# Complete the run
run.complete()

Starting experiment: diabetes-training
Loading Data...
Training a decision tree model
Accuracy: 0.8893333333333333
AUC: 0.8780635852529977


Register the model

In [22]:
run.register_model(model_path='outputs/diabetes_model.pkl',
                   model_name='diabetes_model',
                   tags={'Training context':'Inline Training'},
                   properties={'AUC': run.get_metrics()['AUC'],
                               'Accuracy': run.get_metrics()['Accuracy']})

print('Model trained and registered.')

Model trained and registered.


In [23]:
# see all registered models:
from azureml.core import Model
Model.list(ws)

[Model(workspace=Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup'), name=diabetes_model, id=diabetes_model:1, version=1, tags={'Training context': 'Inline Training'}, properties={'AUC': '0.8780635852529977', 'Accuracy': '0.8893333333333333'}),
 Model(workspace=Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup'), name=titanic_classification_model, id=titanic_classification_model:1, version=1, tags={'testmodel': 'titanic'}, properties={})]

In [24]:
# load the model
model = ws.models['diabetes_model']

In [25]:
import os

folder_name = 'diabetes_service'

# Create a folder for the web service files
experiment_folder = './' + folder_name
os.makedirs(folder_name, exist_ok=True)

print(folder_name, 'folder created.')

diabetes_service folder created.


In [None]:
# - init() is called when the service is loaded, to load a registered model
# - run(raw_data) is called when a prediction request is received, to predict on new data

In [26]:
%%writefile $folder_name/score_diabetes.py
import json
import joblib
import numpy as np
from azureml.core.model import Model

# Called when the service is loaded
def init():
    global model
    # Get the path to the deployed model file and load it
    model_path = Model.get_model_path('diabetes_model')
    model = joblib.load(model_path)

# Called when a request is received
def run(raw_data):
    # Get the input data as a numpy array
    data = np.array(json.loads(raw_data)['data'])
    # Get a prediction from the model
    predictions = model.predict(data)
    # Get the corresponding classname for each prediction (0 or 1)
    classnames = ['not-diabetic', 'diabetic']
    predicted_classes = []
    for prediction in predictions:
        predicted_classes.append(classnames[prediction])
    # Return the predictions as JSON
    return json.dumps(predicted_classes)

Writing diabetes_service/score_diabetes.py


In [27]:
from azureml.core.conda_dependencies import CondaDependencies 

# Add the dependencies for our model (AzureML defaults is already included)
myenv = CondaDependencies()
myenv.add_conda_package("scikit-learn")

# Save the environment config as a .yml file
env_file = folder_name + "/diabetes_env.yml"
with open(env_file,"w") as f:
    f.write(myenv.serialize_to_string())
print("Saved dependency info in", env_file)

# Print the .yml file
with open(env_file,"r") as f:
    print(f.read())

Saved dependency info in diabetes_service/diabetes_env.yml
# Conda environment specification. The dependencies defined in this file will
# be automatically provisioned for runs with userManagedDependencies=False.

# Details about the Conda environment file format:
# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually

name: project_environment
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2

- pip:
    # Required packages for AzureML execution, history, and data preparation.
  - azureml-defaults

- scikit-learn
channels:
- anaconda
- conda-forge



In [28]:
from azureml.core.webservice import AciWebservice
from azureml.core.model import InferenceConfig

# Configure the scoring environment
inference_config = InferenceConfig(runtime= "python",
                                   source_directory = folder_name,
                                   entry_script="score_diabetes.py",
                                   conda_file="diabetes_env.yml")

deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1)

service_name = "diabetes-service"

service = Model.deploy(ws, service_name, [model], inference_config, deployment_config)

service.wait_for_deployment(True)
print(service.state)

Running............................................................................................................
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


In [29]:
print(service.state)
print(service.get_logs())

Healthy
2020-04-01T12:54:26,548604474+00:00 - iot-server/run 
2020-04-01T12:54:26,549049671+00:00 - rsyslog/run 
2020-04-01T12:54:26,552403047+00:00 - gunicorn/run 
2020-04-01T12:54:26,555699824+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_4b824bcb98517d791c41923f24d65461/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_4b824bcb98517d791c41923f24d65461/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_4b824bcb98517d791c41923f24d65461/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_4b824bcb98517d791c41923f24d65461/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_4b824bcb98517d791c41923f24d65461/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)


In [32]:
# list webservices
ws.webservices

{'diabetes-service': AciWebservice(workspace=Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup'), name=diabetes-service, image_id=None, compute_type=None, state=ACI, scoring_uri=None, tags=http://e611bf6a-1020-495d-8275-acfe095ecbdd.westeurope.azurecontainer.io/score, properties={}, created_by={'azureml.git.repository_uri': 'https://github.com/albert-kevin/azuremachinelearning.git', 'mlflow.source.git.repoURL': 'https://github.com/albert-kevin/azuremachinelearning.git', 'azureml.git.branch': 'master', 'mlflow.source.git.branch': 'master', 'azureml.git.commit': '591e0ae11867b48aaf248645cebf031e33dd9790', 'mlflow.source.git.commit': '591e0ae11867b48aaf248645cebf031e33dd9790', 'azureml.git.dirty': 'True'})}

Use the Web Service

In [34]:
# HTTP requests to the web service
# determine the URL to which these applications must submit their requests
endpoint = service.scoring_uri
print(endpoint)

http://e611bf6a-1020-495d-8275-acfe095ecbdd.westeurope.azurecontainer.io/score


In [35]:
# sending the patient data in JSON (or binary) format, and receive back the predicted class(es)
import requests
import json

x_new = [[2,180,74,24,21,23.9091702,1.488172308,22],
         [0,148,58,11,179,39.19207553,0.160829008,45]]

# Convert the array to a serializable list in a JSON document
input_json = json.dumps({"data": x_new})

# Set the content type
headers = { 'Content-Type':'application/json' }

predictions = requests.post(endpoint, input_json, headers = headers)
predicted_classes = json.loads(predictions.json())

for i in range(len(x_new)):
    print ("Patient {}".format(x_new[i]), predicted_classes[i] )

Patient [2, 180, 74, 24, 21, 23.9091702, 1.488172308, 22] diabetic
Patient [0, 148, 58, 11, 179, 39.19207553, 0.160829008, 45] not-diabetic


In [None]:
# delete
#service.delete()