## Contents
1. [Workspace](#Workspace)
1. [Import](#Import)
1. [Introduction](#Introduction)
1. [Setup](#Setup)
1. [Compute](#Compute)
1. [Data](#Data)
1. [Train](#Train)
1. [Featurization](#Featurization)
1. [Evaluate](#Evaluate)

## Import open source Python packages

In [None]:
# import logging
# import os
# import random
# import re
# import lightgbm
# import pandas as pd
# import numpy as np
# import json
# import csv
# from matplotlib import pyplot as plt
# from matplotlib.pyplot import imshow
# from sklearn import datasets
# from shutil import copy2
# import seaborn as sns
# sns.set(color_codes='True')

## Import Azure Machine Learning Python SDK

In [None]:
# import azureml.core
# from azureml.core import Workspace
# from azureml.core.experiment import Experiment
# from azureml.core.workspace import Workspace
# from azureml.core.compute import AksCompute, ComputeTarget
# from azureml.core.compute import ComputeTarget, AmlCompute
# from azureml.core.compute_target import ComputeTargetException
# from azureml.core.webservice import Webservice, AksWebservice
# from azureml.core.image import Image
# from azureml.core.model import Model
# from azureml.train.automl import AutoMLConfig
# from azureml.train.automl.run import AutoMLRun
# from azureml.widgets import RunDetails

## Workspace

In [1]:
from azureml.core import Workspace

In [3]:
# download config.json from machine learning portal
ws = Workspace.from_config()

## Dataset

## Experiment & Run  
### Interactive inline method

In [None]:
from azureml.core import Experiment

In [None]:
# create an experiment variable
experiment = Experiment(workspace=ws, name="experiment_01")
# start the experiment
run = experiment.start_logging()
# experiment code goes here
# log          Record a single named value
# log_list     Record a named list of values
# log_row      Record a row with multiple columns
# log_table    Record a dictionary as a table
# log_image    Record an image file or a plot
run.log('Accuracy', 0.50)
run.log('Accuracy', 0.55)
run.log('Accuracy', 0.60)
run.log('Accuracy', 0.65)
run.log('Accuracy', 0.77)
# end the experiment
run.complete()
# only for this specific Run we can get the log data:
run.get_metrics()

## View progress

In [None]:
from azureml.widgets import RunDetails

In [None]:
# notebook widget to view the progress of model training
RunDetails(run).show()

## Experiment & Run  
### Script method

In [None]:
# creating a script "experiment.py"

In [None]:
%%writefile experiment.py
from azureml.core import Run
import pandas as pd
import os

# Get the experiment run context
run = Run.get_context()

# load the diabetes dataset
data = pd.read_csv('data.csv')

# Count the rows and log the result
row_count = (len(data))
run.log('observations', row_count)

# Save a sample of the data
os.makedirs('outputs', exist_ok=True)
data.head(2).to_csv("outputs/sample.csv", index=False, header=True)

# Complete the run
run.complete()

In [None]:
# creating testdata
import pandas as pd
df = pd.DataFrame({"firstName":["bart","koen","karel"],
                   "lastName":["Vermeers","Aerts","Venbelsteren"]})
df.to_csv("data.csv")

In [1]:
# RunConfiguration = python environment setup
# ScriptRunConfig  = script + environment setup
from azureml.core import Experiment, RunConfiguration, ScriptRunConfig

In [None]:
# create a new RunConfig object
# Represents configuration for experiment runs targeting different compute targets in Azure Machine Learning
experiment_run_config = RunConfiguration()

In [None]:
# Create a ScriptRunConfig object
# Represents configuration information for submitting a training run in Azure Machine Learning
script_config = ScriptRunConfig(source_directory='.',
                                script='experiment.py',
                                run_config=experiment_run_config) 

In [None]:
# submit the experiment
experiment = Experiment(workspace=ws, name='experiment_02')
run = experiment.submit(config=script_config)
run.wait_for_completion(show_output=True)

# Experiment & Run
## Estimator (generic)

In [2]:
from azureml.train.estimator import Estimator
from azureml.core import Experiment

In [3]:
# Create an estimator
estimator = Estimator(source_directory='.',
                      entry_script='experiment.py',
                      compute_target='local',
                      conda_packages=['scikit-learn']
                      )

In [8]:
# Create and run an experiment
experiment = Experiment(workspace=ws, name='experiment_03')
run = experiment.submit(config=estimator)

In [11]:
#run.wait_for_completion(show_output=True)

In [12]:
# encapsulates a 'Run Configuration' and a 'Script Run Configuration' in a single object !

# Experiment & Run
## passing arguments

In [27]:
%%writefile titanic.csv
,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,1,0,3,22.0,1,0,7.2500,1.0,0.0,1.0
1,2,1,1,38.0,1,0,71.2833,0.0,0.0,0.0
2,3,1,3,26.0,0,0,7.9250,0.0,0.0,1.0
3,4,1,1,35.0,1,0,53.1000,0.0,0.0,1.0
4,5,0,3,35.0,0,0,8.0500,1.0,0.0,1.0

Writing titanic.csv


In [30]:
import pandas as pd
titan = pd.read_csv("titanic.csv", )
titan

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,0,1,0,3,22.0,1,0,7.25,1.0,0.0,1.0
1,1,2,1,1,38.0,1,0,71.2833,0.0,0.0,0.0
2,2,3,1,3,26.0,0,0,7.925,0.0,0.0,1.0
3,3,4,1,1,35.0,1,0,53.1,0.0,0.0,1.0
4,4,5,0,3,35.0,0,0,8.05,1.0,0.0,1.0


In [37]:
titan.columns

Index(['Unnamed: 0', 'PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp',
       'Parch', 'Fare', 'male', 'Q', 'S'],
      dtype='object')

In [38]:
%%writefile experiment_argparse.py
from azureml.core import Run
import argparse
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Get the experiment run context
run = Run.get_context()

# Set regularization hyperparameter
parser = argparse.ArgumentParser()
parser.add_argument('--reg_rate', type=float, dest='reg', default=0.01)
args = parser.parse_args()
reg = args.reg

# Prepare the dataset
data = pd.read_csv('data.csv')
titanic = pd.read_csv('titanic.csv')
X, y = titanic[['PassengerId','Pclass','Age','SibSp','Parch','Fare','male','Q','S']].values, titanic['Survived'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Train a logistic regression model
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# Count the rows and log the result and save the argument value
row_count = (len(data))
run.log('observations', row_count)
run.log("the given 'reg_rate' parameter:", reg) # <------------

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
run.log('Accuracy', np.float(acc))

# Save a sample of the data
os.makedirs('outputs', exist_ok=True)
data.head(2).to_csv("outputs/sample.csv", index=False, header=True)

# Save the trained model
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/titanic_model.pkl')

# Complete the run
run.complete()

Overwriting experiment_argparse.py


### use script_params = {'--reg_rate': 0.1}

In [39]:
from azureml.train.estimator import Estimator
from azureml.core import Experiment

In [40]:
# Create an estimator
estimator = Estimator(source_directory='.',
                      entry_script='experiment_argparse.py',
                      script_params = {'--reg_rate': 0.1}, # <-------------
                      compute_target='local',
                      conda_packages=['scikit-learn', 'joblib'])

In [41]:
# Create and run an experiment
experiment = Experiment(workspace=ws, name='experiment_04')
run = experiment.submit(config=estimator)

In [42]:
run.wait_for_completion(show_output=True)

RunId: experiment_04_1585219254_ef64c469
Web View: https://ml.azure.com/experiments/experiment_04/runs/experiment_04_1585219254_ef64c469?wsid=/subscriptions/43c1f93a-903d-4b23-a4bf-92bd7a150627/resourcegroups/myResourceGroup/workspaces/machine_learning_workspace

Streaming azureml-logs/60_control_log.txt

Streaming log file azureml-logs/60_control_log.txt
Starting the daemon thread to refresh tokens in background for process with pid = 22426
Running: ['/bin/bash', '/tmp/azureml_runs/experiment_04_1585219254_ef64c469/azureml-environment-setup/docker_env_checker.sh']

Found materialized image on target: azureml/azureml_586a3ed27470f038ee8054b84967c621


Logging experiment running status in history service.
Running: ['sudo', 'docker', 'run', '--name', 'experiment_04_1585219254_ef64c469', '--rm', '-v', '/tmp/azureml_runs/experiment_04_1585219254_ef64c469:/azureml-run', '--shm-size', '2g', '-e', 'EXAMPLE_ENV_VAR=EXAMPLE_VALUE', '-e', 'AZUREML_CONTEXT_MANAGER_TRACKUSERERROR=eyJTa2lwSGlzdG9ye

{'runId': 'experiment_04_1585219254_ef64c469',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-03-26T10:40:57.797123Z',
 'endTimeUtc': '2020-03-26T10:41:06.256736Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '2769b7e8-81cb-4cd4-abf2-3c7eddf947b3',
  'azureml.git.repository_uri': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'mlflow.source.git.repoURL': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': '52678fcbd4c5cd3218a03f1cfd3043f4bdf6d765',
  'mlflow.source.git.commit': '52678fcbd4c5cd3218a03f1cfd3043f4bdf6d765',
  'azureml.git.dirty': 'True'},
 'inputDatasets': [],
 'runDefinition': {'script': 'experiment_argparse.py',
  'useAbsolutePath': False,
  'arguments': ['--reg_rate', '0.1'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferenc

## Retrieving files

In [43]:
# "run" is a reference to a completed experiment run
# List the files generated by the experiment
for file in run.get_file_names():
    print(file)

azureml-logs/60_control_log.txt
azureml-logs/70_driver_log.txt
logs/azureml/8_azureml.log
outputs/sample.csv
outputs/titanic_model.pkl


In [44]:
# Download a named file
#run.download_file(name='outputs/model.pkl', output_file_path='model.pkl')
run.download_file(name='outputs/sample.csv', output_file_path='sample.csv')

In [45]:
!ls -l sample.csv

-rw-rw-r-- 1 ubuntu ubuntu 59 Mar 26 11:44 sample.csv


# Register a model

### option A
Fails, because it first need to download the model file

In [59]:
# register a model from a local file, you can use the register method of the Model object
# from azureml.core import Model

# model = Model.register(workspace=ws,
#                        model_name='titanic_classification_model',
#                        model_path='outputs/titanic_model.pkl', # local path
#                        description='A classification model Titanic',
#                        tags={'testmodel': 'titanic'},
#                        model_framework=Model.Framework.SCIKITLEARN,
#                        model_framework_version='0.20.3')

### option B
this is better because it grabs the model file from the run !

In [47]:
# Register a model using reference to the Run use its register_model method
run.register_model(model_name='titanic_classification_model',
                   model_path='outputs/titanic_model.pkl', # run outputs path
                   description='A classification model Titanic',
                   tags={'testmodel': 'titanic'},
                   model_framework=Model.Framework.SCIKITLEARN,
                   model_framework_version='0.20.3')

Model(workspace=Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup'), name=titanic_classification_model, id=titanic_classification_model:1, version=1, tags={'testmodel': 'titanic'}, properties={})

In [48]:
# view registered models with
Model.list(ws)

[Model(workspace=Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup'), name=titanic_classification_model, id=titanic_classification_model:1, version=1, tags={'testmodel': 'titanic'}, properties={})]

# Working with Data

In [51]:
from azureml.core import Datastore

In [53]:
# list all datastores (already registered a few manually)
ws.datastores

{'data_lake_gen2': <azureml.data.azure_data_lake_datastore.AzureDataLakeGen2Datastore at 0x7efdd6ef9cc0>,
 'workspacefilestore': <azureml.data.azure_storage_datastore.AzureFileDatastore at 0x7efdd6ef9be0>,
 'workspaceblobstore': <azureml.data.azure_storage_datastore.AzureBlobDatastore at 0x7efdd6e8dc88>}

In [54]:
# get a reference to ex: data_lake_gen2
blob_store = Datastore.get(ws, datastore_name='workspaceblobstore')
data_lake_gen2 = Datastore.get(ws, datastore_name='data_lake_gen2')

In [57]:
print(type(blob_store))
print(type(data_lake_gen2))

<class 'azureml.data.azure_storage_datastore.AzureBlobDatastore'>
<class 'azureml.data.azure_data_lake_datastore.AzureDataLakeGen2Datastore'>


In [58]:
# copy titanic.csv to datalake in /datalake/gold/

### option1: Download

### option2: Upload

### option3: Mount (preferred) - not possible on local compute

In [None]:
# you must pass "script_params" parameter to an experiment script
# ex:   script_params = {'--data_folder': data_ref}

In [None]:
%%writefile experiment_argparse.py
from azureml.core import Run
import argparse
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Get the experiment run context
run = Run.get_context()

# Set regularization hyperparameter
parser = argparse.ArgumentParser()
parser.add_argument('--reg_rate', type=float, dest='reg', default=0.01)
args = parser.parse_args()
reg = args.reg

# set datastore local reference path
parser = argparse.ArgumentParser()
parser.add_argument('--data_folder', type=str, dest='data_folder')
args = parser.parse_args()
data_files = os.listdir(args.data_folder)

# Prepare the dataset
data = pd.read_csv('data.csv')
titanic = pd.read_csv('titanic.csv')
X, y = titanic[['PassengerId','Pclass','Age','SibSp','Parch','Fare','male','Q','S']].values, titanic['Survived'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Train a logistic regression model
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# Count the rows and log the result and save the argument value
row_count = (len(data))
run.log('observations', row_count)
run.log("the given 'reg_rate' parameter:", reg) # <------------

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
run.log('Accuracy', np.float(acc))

# Save a sample of the data
os.makedirs('outputs', exist_ok=True)
data.head(2).to_csv("outputs/sample.csv", index=False, header=True)

# Save the trained model
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/titanic_model.pkl')

# Complete the run
run.complete()

In [69]:
data_ref = blob_store.path("data/files").as_download(path_on_compute='training_data')

In [70]:
import os
os.listdir(data_ref)

TypeError: listdir: path should be string, bytes, os.PathLike, integer or None, not DataReference

In [None]:
data_ref = blob_ds.path('gold/').as_download(path_on_compute='training_data')
estimator = Estimator(source_directory='.',
                      entry_script='experiment_argparse.py',
                      script_params = {'--reg_rate': 0.1}, # <-------------
                      compute_target='local',
                      conda_packages=['scikit-learn', 'joblib'],
                      pip_packages=['azureml-sdk'],
                      script_params = {'--data_folder': data_ref})

## Datasets
### Retrieving a registered dataset
https://nbviewer.jupyter.org/github/MicrosoftDocs/mslearn-aml-labs/blob/master/03-Working_with_Data.ipynb

In [77]:
# we manually created and registered a dataset from the datalake

In [78]:
# show a list of available datasets
ws.datasets

{'datalake': DatasetRegistration(id='a2af81a9-8e27-429d-8845-489bd371e9ca', name='datalake', version=1, description='', tags={})}

In [79]:
# Get a dataset from the workspace datasets collection
#ds1 = ws.datasets['datalake']

In [80]:
from azureml.core import Dataset

In [81]:
# Get a dataset by name from the datasets class
ds2 = Dataset.get_by_name(ws, 'datalake')

In [83]:
# list all files in the datalake (incl. directories)
ds2.to_path()

['/bronze/db_v2_csv/_committed_1187318739692831567',
 '/bronze/db_v2_csv/_started_1187318739692831567',
 '/bronze/db_v2_csv/part-00000-tid-1187318739692831567-fae6394b-2577-421a-ab58-50e75b7b6889-9-1-c000.csv',
 '/bronze/db_v2_csv/part-00001-tid-1187318739692831567-fae6394b-2577-421a-ab58-50e75b7b6889-10-1-c000.csv',
 '/bronze/db_v2_csv/part-00002-tid-1187318739692831567-fae6394b-2577-421a-ab58-50e75b7b6889-11-1-c000.csv',
 '/bronze/docph/DB_V2.parquet',
 '/bronze/pharma_ref.xlsx',
 '/bronze/pharma_ref_csv/_committed_3971660126738673139',
 '/bronze/pharma_ref_csv/_started_3971660126738673139',
 '/bronze/pharma_ref_csv/part-00000-tid-3971660126738673139-e84ee614-8706-4e8b-afc5-89e3bd88a7a4-4-1-c000.csv',
 '/bronze/pharma_ref_csv/part-00001-tid-3971660126738673139-e84ee614-8706-4e8b-afc5-89e3bd88a7a4-5-1-c000.csv',
 '/bronze/pharma_ref_csv/part-00002-tid-3971660126738673139-e84ee614-8706-4e8b-afc5-89e3bd88a7a4-6-1-c000.csv',
 '/bronze/pharma_ref_csv/part-00003-tid-3971660126738673139-e84

### passing a file dataset, you must specify the access mode

In [None]:
# the script will need to work with a Dataset object, you must include pip packages:

#estimator = Estimator(pip_packages=['azureml-sdk'])

In [None]:
estimator = Estimator(source_directory='.',
                      entry_script='experiment_argparse.py'
                      compute_target='local',
                      inputs=[img_ds.as_named_input('img_data').as_download(path_on_compute='data')],
                      pip_packages=['azureml-dataprep[pandas]')

In [None]:
estimator = SKLearn( source_directory='experiment_folder',
                     entry_script='training_script.py',
                     compute_target='local',
                     inputs=[tab_ds.as_named_input('csv_data')],
                     pip_packages=['azureml-dataprep[pandas]')

In [None]:
data_ref = blob_ds.path('gold/').as_download(path_on_compute='training_data')
estimator = Estimator(source_directory='.',
                      entry_script='experiment_argparse.py',
                      script_params = {'--reg_rate': 0.1}, # <-------------
                      compute_target='local',
                      conda_packages=['scikit-learn', 'joblib'],
                      pip_packages=['azureml-sdk'],
                      script_params = {'--data_folder': data_ref})

# Train a Model from a File Dataset (mount mode)

In [56]:
# put 2 files in the ADSL Gen2 data lake
# they are in container "datalake" and this is registered in ml workspace as "datalake" Datastore
# in turn the container "datalake" is registered as a Dataset in azure ML Workspace
# 2 specific files of intrest: 
# /gold/diabetes.csv
# /gold/diabetes2.csv
# the goal is to mount these into a run script send to compute nodes to train model on

In [57]:
# the dataset input passed to the script represents a mount point containing file paths

In [59]:
# create A folder named diabetes_training_from_file_dataset here locally
import os

# Create a folder for the experiment files
experiment_folder = 'diabetes_training_from_file_dataset'
os.makedirs(experiment_folder, exist_ok=True)
print(experiment_folder, 'folder created')

diabetes_training_from_file_dataset folder created


In [60]:
# create a script that trains a classification model by using a file dataset that is passed to it as an input

In [61]:
print("diabetes" + "/*.csv")

diabetes/*.csv


In [118]:
%%writefile $experiment_folder/diabetes_training.py
# Import libraries
import argparse
from azureml.core import Workspace, Dataset, Experiment, Run
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import glob

# Set regularization hyperparameter (passed as an argument to the script)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
args = parser.parse_args()
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# load the diabetes dataset
print("Loading Data...")
data_path = run.input_datasets['diabetes']  # Get the training data from the estimator input
print("data_path: " + str(data_path))         # diabetes_path
all_files = glob.glob(data_path + "/*")
print("data_path + '/*.csv': " + str(data_path + '/*.csv')) # diabetes_path/*.csv
print([file for file in all_files])   # ['diabetes_path/diabetes.csv', 'diabetes_path/diabetes2.csv']
print("type(all_files): " + str(type(all_files)))
print(type(all_files[0]))
diabetes = pd.concat((pd.read_csv(f) for f in all_files))
print("number of records: " + str(len(diabetes)))

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

Overwriting diabetes_training_from_file_dataset/diabetes_training.py


Next we need to change the way we pass the dataset to the estimator - it needs to define a mount point from which the script can read the files. For large volumes of data, you'd generally use the **as_mount** method to stream the files directly from the dataset source; but when running on local compute (as we are in this example), you need to use the **as_download** option to download the dataset files to a local folder.

Also, since the **Dataset** class is defined in the **azureml-dataprep** package, we need to include that in the experiment environment.

In [64]:
from azureml.train.sklearn import SKLearn
from azureml.core import Experiment
from azureml.core import Dataset
from azureml.widgets import RunDetails

In [65]:
# Set the script parameters
script_params = {
    '--regularization': 0.1
}

In [66]:
ws.datasets

{'diabetes1': DatasetRegistration(id='ebc21ed2-3f94-494b-8072-2c71d2190200', name='diabetes1', version=1, description='', tags={}), 'datalakegold': DatasetRegistration(id='ad71d877-e111-4bd4-bf8a-8a602709dffd', name='datalakegold', version=1, description='', tags={}), 'datalake': DatasetRegistration(id='a2af81a9-8e27-429d-8845-489bd371e9ca', name='datalake', version=1, description='', tags={})}

In [67]:
# Get the training dataset
diabetes_ds = ws.datasets.get("datalakegold")

# Get a dataset from the workspace datasets collection
#ds1 = ws.datasets['datalakegold']
#-or-
# Get a dataset by name from the datasets class
#ds2 = Dataset.get_by_name(ws, 'datalakegold')

In [68]:
diabetes_ds

{
  "source": [
    "('data_lake_gen2', 'gold/**')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "ad71d877-e111-4bd4-bf8a-8a602709dffd",
    "name": "datalakegold",
    "version": 1,
    "workspace": "Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup')"
  }
}

In [69]:
[diabetes_ds]

[{
   "source": [
     "('data_lake_gen2', 'gold/**')"
   ],
   "definition": [
     "GetDatastoreFiles"
   ],
   "registration": {
     "id": "ad71d877-e111-4bd4-bf8a-8a602709dffd",
     "name": "datalakegold",
     "version": 1,
     "workspace": "Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup')"
   }
 }]

In [70]:
#diabetes_ds.as_named_input('diabetes').as_download(path_on_compute='diabetes_data')

In [72]:
# Create an estimator
estimator = SKLearn(source_directory=experiment_folder,
                    entry_script='diabetes_training.py',
                    script_params=script_params,
                    compute_target = 'local',
                    inputs=[diabetes_ds.as_named_input('diabetes').as_download(path_on_compute='diabetes_path')], # Pass the Dataset object as an input
                    pip_packages=['azureml-dataprep[pandas]'] # so we need the dataprep package
                   )

In [104]:
diabetes_ds.as_named_input('diabetes').as_download(path_on_compute='diabetes_path').__dict__
# notice the mode if set to Download or Mount then 
# Run.input_datasets will return the base path of the delivered data
# ex: Run.input_datasets['diabetes']   ---> "diabetes_path" string value
# this is probably the path_on_compute where the data should be put...
# files are available on  ['diabetes_path/diabetes.csv', 'diabetes_path/diabetes2.csv']

{'dataset': {
   "source": [
     "('data_lake_gen2', 'gold/**')"
   ],
   "definition": [
     "GetDatastoreFiles"
   ],
   "registration": {
     "id": "ad71d877-e111-4bd4-bf8a-8a602709dffd",
     "name": "datalakegold",
     "version": 1,
     "workspace": "Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup')"
   }
 },
 'name': 'diabetes',
 'mode': 'download',
 'path_on_compute': 'diabetes_path'}

In [119]:
# Create an experiment
experiment_name = 'diabetes-training'
experiment = Experiment(workspace = ws, name = experiment_name)
# Run the experiment
run = experiment.submit(config=estimator)

In [120]:
# Show the run details while running
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'diabetes-training_1585308364_a95f99c6',
 'target': 'local',
 'status': 'Finalizing',
 'startTimeUtc': '2020-03-27T11:26:06.544218Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '761f2bce-8d30-4c56-9dd5-0be0c7d55ccb',
  'azureml.git.repository_uri': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'mlflow.source.git.repoURL': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': '8189bb0610764a3d5583f351f729bdc6cb32fe0e',
  'mlflow.source.git.commit': '8189bb0610764a3d5583f351f729bdc6cb32fe0e',
  'azureml.git.dirty': 'True'},
 'inputDatasets': [{'dataset': {'id': 'ad71d877-e111-4bd4-bf8a-8a602709dffd'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'diabetes', 'mechanism': 'Download', 'pathOnCompute': 'diabetes_path'}}],
 'runDefinition': {'script': 'diabetes_training.py',
  'useAbsolutePath': False,
  'arguments': ['

When the experiment has completed, in the widget, view the **azureml-logs/70_driver_log.txt** output log to verify that the file dataset was processed and the data files downloaded.

# Datastore method (mount mode)

In [129]:
from azureml.core import Workspace

In [130]:
ws = Workspace.from_config()

In [131]:
from azureml.core import Datastore, Dataset

In [132]:
# available datastore names
ws.datastores

{'data_lake_gen2': <azureml.data.azure_data_lake_datastore.AzureDataLakeGen2Datastore at 0x7f2bd1730668>,
 'workspacefilestore': <azureml.data.azure_storage_datastore.AzureFileDatastore at 0x7f2bd17302b0>,
 'workspaceblobstore': <azureml.data.azure_storage_datastore.AzureBlobDatastore at 0x7f2bc226ac50>}

In [158]:
ds = Datastore.get(workspace=ws, datastore_name="data_lake_gen2")

In [134]:
ds.__dict__

{'_workspace': Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup'),
 '_name': 'data_lake_gen2',
 '_datastore_type': 'AzureDataLakeGen2',
 'tenant_id': '73b49191-8db3-45ab-87b3-b8f956ac123b',
 'client_id': '38c02221-4a41-4ec8-b8da-a81f16c38e82',
 'client_secret': 'l]ABG6@Z/9r/hX7EK0zavK5Nx[MA-J1V',
 'resource_url': 'https://storage.azure.com',
 'authority_url': 'https://login.microsoftonline.com',
 'container_name': 'datalake',
 'account_name': 'datalake21032020',
 'protocol': 'https',
 'endpoint': 'core.windows.net'}

In [135]:
from azureml.core import Dataset

In [136]:
# available dataset names
ws.datasets

{'diabetes1': DatasetRegistration(id='ebc21ed2-3f94-494b-8072-2c71d2190200', name='diabetes1', version=1, description='', tags={}), 'datalakegold': DatasetRegistration(id='ad71d877-e111-4bd4-bf8a-8a602709dffd', name='datalakegold', version=1, description='', tags={}), 'datalake': DatasetRegistration(id='a2af81a9-8e27-429d-8845-489bd371e9ca', name='datalake', version=1, description='', tags={})}

In [137]:
from azureml.data.datapath import DataPath

In [138]:
# Creating and registering file datasets
#blob_ds = Dataset.get_by_name(workspace=ws, name="datalakegold")
datastore = Datastore.get(workspace=ws, datastore_name="data_lake_gen2")

In [139]:
datastore_path = [
    DataPath(datastore, 'platinum/diabetes.csv'),
    DataPath(datastore, 'platinum/folder/*.csv')
]

In [140]:
datastore_path

[<azureml.data.datapath.DataPath at 0x7f2bc225aeb8>,
 <azureml.data.datapath.DataPath at 0x7f2bc225afd0>]

In [141]:
# select the files that you need
file_dataset = Dataset.File.from_files(path=datastore_path)

In [142]:
file_dataset

{
  "source": [
    "('data_lake_gen2', 'platinum/diabetes.csv')",
    "('data_lake_gen2', 'platinum/folder/*.csv')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ]
}

In [143]:
# registering these files
file_ds = file_dataset.register(workspace=ws, name='diabetes1')

In [162]:
# get the registered dataset by name from the datasets class
ds = Dataset.get_by_name(workspace=ws, name="diabetes2")

In [145]:
ds

{
  "source": [
    "('data_lake_gen2', 'platinum/diabetes.csv')",
    "('data_lake_gen2', 'platinum/folder/*.csv')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "ebc21ed2-3f94-494b-8072-2c71d2190200",
    "name": "diabetes1",
    "version": 1,
    "workspace": "Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup')"
  }
}

In [146]:
# list all files
ds.to_path()

['/data_lake_gen2/platinum/diabetes.csv',
 '/data_lake_gen2/platinum/folder/diabetes2.csv']

In [147]:
# Passing a dataset to an experiment script

In [148]:
# create A folder named diabetes_training_from_file_dataset here locally
import os

# Create a folder for the experiment files
experiment_folder = 'diabetes_training_from_file_dataset'
os.makedirs(experiment_folder, exist_ok=True)
print(experiment_folder, 'folder created')

diabetes_training_from_file_dataset folder created


In [149]:
# we have created a global variable "experiment_folder"

In [180]:
%%writefile $experiment_folder/diabetes_training.py
# Import libraries
import argparse
from azureml.core import Workspace, Dataset, Experiment, Run
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import glob

# Set regularization hyperparameter (passed as an argument to the script)
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
args = parser.parse_args()
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# load the diabetes dataset
print("Loading Data...")
data_path = run.input_datasets['diabetes2'] # Get the training data from the estimator input
print("data_path: " + str(data_path))
all_files = glob.glob(data_path + "/folder/*.csv")
#print("data_path + '/*.csv': " + str(data_path + '/*.csv')) # diabetes_path/*.csv
print([file for file in all_files])   # ['diabetes_path/diabetes.csv', 'diabetes_path/diabetes2.csv']
print("type(all_files): " + str(type(all_files)))
print(type(all_files[0]))
diabetes = pd.concat((pd.read_csv(f) for f in all_files))
print("number of records: " + str(len(diabetes)))

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

Overwriting diabetes_training_from_file_dataset/diabetes_training.py


In [151]:
from azureml.train.sklearn import SKLearn
from azureml.core import Experiment
from azureml.core import Dataset
from azureml.widgets import RunDetails

In [166]:
# get the registered dataset by name from the datasets class
ds = Dataset.get_by_name(workspace=ws, name="diabetes2")

In [167]:
# Set the script parameters
script_params = {
    '--regularization': 0.1
}

In [169]:
ds.as_named_input('diabetes2').__dict__

{'dataset': {
   "source": [
     "('data_lake_gen2', 'platinum/**')"
   ],
   "definition": [
     "GetDatastoreFiles"
   ],
   "registration": {
     "id": "2c81c692-c43c-4f03-9952-45124c0da47c",
     "name": "diabetes2",
     "version": 1,
     "workspace": "Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup')"
   }
 },
 'name': 'diabetes2',
 'mode': 'direct',
 'path_on_compute': None}

In [170]:
ds.as_named_input('diabetes2').as_download(path_on_compute='diabetes_path')

<azureml.data.dataset_consumption_config.DatasetConsumptionConfig at 0x7f2bc2209438>

In [171]:
# Create an estimator
estimator = SKLearn(source_directory=experiment_folder,
                    entry_script='diabetes_training.py',
                    script_params=script_params,
                    compute_target = 'local',
                    inputs=[ds.as_named_input('diabetes2').as_download(path_on_compute='diabetes_path')], # Pass the Dataset object as an input
                    pip_packages=['azureml-dataprep[pandas]'] # so we need the dataprep package
                   )

In [181]:
# Create an experiment
experiment_name = 'diabetes-training'
experiment = Experiment(workspace = ws, name = experiment_name)
# Run the experiment
run = experiment.submit(config=estimator)

In [182]:
# Show the run details while running
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'diabetes-training_1585314547_c0946781',
 'target': 'local',
 'status': 'Finalizing',
 'startTimeUtc': '2020-03-27T13:09:09.090884Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': 'a3adee05-57f2-4bdc-903f-8941cc1e936b',
  'azureml.git.repository_uri': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'mlflow.source.git.repoURL': 'https://github.com/albert-kevin/azuremachinelearning.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': '8189bb0610764a3d5583f351f729bdc6cb32fe0e',
  'mlflow.source.git.commit': '8189bb0610764a3d5583f351f729bdc6cb32fe0e',
  'azureml.git.dirty': 'True'},
 'inputDatasets': [{'dataset': {'id': '2c81c692-c43c-4f03-9952-45124c0da47c'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'diabetes2', 'mechanism': 'Download', 'pathOnCompute': 'diabetes_path'}}],
 'runDefinition': {'script': 'diabetes_training.py',
  'useAbsolutePath': False,
  'arguments': [

In [28]:
ds = Dataset.get_by_name(workspace=ws, name="datalakegold")

In [29]:
# list all files
ds.to_path()

['/diabetes.csv', '/diabetes2.csv']

In [30]:
csv_paths = [(ds, '/diabetes*.csv')]

In [31]:
csv_paths

[({
    "source": [
      "('data_lake_gen2', 'gold/**')"
    ],
    "definition": [
      "GetDatastoreFiles"
    ],
    "registration": {
      "id": "ad71d877-e111-4bd4-bf8a-8a602709dffd",
      "name": "datalakegold",
      "version": 1,
      "workspace": "Workspace.create(name='machine_learning_workspace', subscription_id='43c1f93a-903d-4b23-a4bf-92bd7a150627', resource_group='myResourceGroup')"
    }
  },
  '/diabetes*.csv')]

In [None]:
tab_ds = Dataset.Tabular.from_delimited_files(path=csv_paths)
tab_ds = tab_ds.register(workspace=ws, name='csv_table')