## Set up the experiment folder

In [1]:
import os, shutil

# Folder that holds everything the experiment needs (scripts and data)
training_folder = 'driver-training'
os.makedirs(training_folder, exist_ok=True)

# Put a copy of the training data next to the scripts in that folder
source_csv = 'data/porto_seguro_safe_driver_prediction_train.csv'
target_csv = os.path.join(training_folder, "porto_seguro_safe_driver_prediction_train.csv")
shutil.copy(source_csv, target_csv)


'driver-training/porto_seguro_safe_driver_prediction_train.csv'

## train.py
This file defines the key functions required to train the model.  
The file can be invoked with `python train.py` for development purposes.

In [2]:
%%writefile $training_folder/train.py
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import lightgbm


def split_data(data_df):
    """Split the raw frame into LightGBM training and validation datasets.

    The 'target' column supplies the labels; 'target' and 'id' are removed
    from the feature matrix. 20% of the rows are held out for validation,
    using a fixed random_state of 0.

    Returns a (train_data, valid_data) tuple of lightgbm.Dataset objects.
    """
    features = data_df.drop(['target', 'id'], axis=1)
    labels = np.array(data_df['target'])

    # train_test_split hands back the pieces in the order
    # (train features, validation features, train labels, validation labels).
    x_train, x_valid, y_train, y_valid = train_test_split(
        features, labels, test_size=0.2, random_state=0)

    # The validation dataset keeps its raw data (free_raw_data=False);
    # get_model_metrics reads it back through data[1].data.
    return (
        lightgbm.Dataset(x_train, label=y_train),
        lightgbm.Dataset(x_valid, label=y_valid, free_raw_data=False),
    )


def train_model(data, parameters):
    """Train a LightGBM model with early stopping on the validation set.

    Args:
        data: (train_data, valid_data) pair of lightgbm.Dataset objects, as
            returned by split_data.
        parameters: dict of LightGBM training parameters.

    Returns:
        The trained model returned by lightgbm.train.
    """
    train_data, valid_data = data[0], data[1]
    # Early stopping is requested through a callback rather than the
    # `early_stopping_rounds` keyword: that keyword was removed from
    # lightgbm.train() in LightGBM 4.0, while the callback is accepted by
    # both older and newer releases.
    model = lightgbm.train(parameters,
                           train_data,
                           valid_sets=[valid_data],
                           num_boost_round=500,
                           callbacks=[lightgbm.early_stopping(20)])
    return model


def get_model_metrics(model, data):
    """Score the model on the validation set and report its ROC AUC.

    `data` is the (train_data, valid_data) pair; only the validation dataset
    (index 1) is used. The metrics dict is printed and then returned in the
    form {"auc": <value>}.
    """
    valid_data = data[1]
    scores = model.predict(valid_data.data)
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(valid_data.label, scores)
    auc_value = metrics.auc(false_pos_rate, true_pos_rate)
    model_metrics = {"auc": auc_value}
    print(model_metrics)
    return model_metrics


def main():
    """Train and evaluate the model locally with a fixed set of parameters.

    Reads the training CSV from the current working directory, splits it
    into training and validation sets, trains the model and prints the
    validation AUC.
    """
    data_df = pd.read_csv('porto_seguro_safe_driver_prediction_train.csv')
    parameters = {
        'learning_rate': 0.02,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'sub_feature': 0.7,
        'num_leaves': 60,
        'min_data': 100,
        'min_hessian': 1,
        'verbose': 2
    }
    data = split_data(data_df)
    model = train_model(data, parameters)
    get_model_metrics(model, data)


# The guard must sit at module level. Nested inside main() it never fires
# when the file is run as a script, and calling main() from a script would
# recurse without end.
if __name__ == '__main__':
    main()


Overwriting driver-training/train.py


## parameters.json
This file will specify the parameters used to train the model.

In [24]:
%%writefile $training_folder/parameters.json
{
    "training":
    {
        "learning_rate": 0.04,
        "boosting_type": "gbdt",
        "objective": "binary",
        "metric": "auc",
        "sub_feature": 0.7,
        "num_leaves": 60,
        "min_data": 100,
        "min_hessian": 1,
        "verbose": 0
    }
}


Overwriting driver-training/parameters.json


## driver_training.py
This file will be the entry script when running in an Azure ML context.  
It calls the functions defined in train.py for data preparation and training, but reads parameters from a file, and logs output to the Azure ML context.  
The file can be invoked with `python driver_training.py` for development purposes.

In [25]:
%%writefile $training_folder/driver_training.py
# Import libraries
import argparse
from azureml.core import Run
import joblib
import json
import os
import pandas as pd

# Import functions from train.py
from train import split_data, train_model, get_model_metrics

# Get the output folder for the model from the '--output_folder' parameter
# (falls back to "outputs" when the argument is not given)
parser = argparse.ArgumentParser()
parser.add_argument(
    '--output_folder',
    type=str,
    dest='output_folder',
    default="outputs")
args = parser.parse_args()
output_folder = args.output_folder

# Get the experiment run context
run = Run.get_context()

# Load the safe driver prediction dataset
train_df = pd.read_csv('porto_seguro_safe_driver_prediction_train.csv')

# Load the parameters for training the model from the "training" section
# of the parameters file
with open("parameters.json") as f:
    pars = json.load(f)
    parameters = pars["training"]

# Log the parameters
for k, v in parameters.items():
    run.log(k, v)

# Split the data, train the model and evaluate it on the validation set
data = split_data(train_df)
model = train_model(data, parameters)
model_metrics = get_model_metrics(model, data)

# Log the validation AUC alongside the parameters
run.log('auc', model_metrics['auc'])

# Save the trained model to the output folder
os.makedirs(output_folder, exist_ok=True)
# Build the path with os.path.join rather than string concatenation so it
# stays correct whatever separator style the folder argument uses.
output_path = os.path.join(output_folder, "driver_model.pkl")
joblib.dump(value=model, filename=output_path)

run.complete()

Overwriting driver-training/driver_training.py


In [26]:
import azureml.core
from azureml.core import Workspace

# Load the workspace via Workspace.from_config(); `ws` is passed to
# Experiment further down when the experiment is created
ws = Workspace.from_config()

## Use an Estimator to Run the Script as an Experiment

See [this tutorial](https://github.com/MicrosoftDocs/mslearn-aml-labs/blob/master/02-Training_Models.ipynb) for a starting point

Use the scikit-learn and lightgbm conda packages

In [27]:
from azureml.train.estimator import Estimator
from azureml.core import Experiment

# Create the experiment that will own the run
experiment_name = 'driver-training'
experiment = Experiment(workspace=ws, name=experiment_name)

# Describe how to run the entry script: from the training folder, on the
# local compute target, with the conda packages listed below
estimator = Estimator(
    source_directory=training_folder,
    entry_script='driver_training.py',
    compute_target='local',
    conda_packages=['scikit-learn', 'lightgbm'],
)

# Submit the estimator and stream the run output until it completes
run = experiment.submit(config=estimator)
run.wait_for_completion(show_output=True)



RunId: driver-training_1590531244_e6526452
Web View: https://ml.azure.com/experiments/driver-training/runs/driver-training_1590531244_e6526452?wsid=/subscriptions/b4f30574-19b5-4753-926d-877888e82fc4/resourcegroups/oh-dsdata-data/workspaces/team5ws

Streaming azureml-logs/70_driver_log.txt

Entering context manager injector. Current time:2020-05-26T22:14:13.976826
Starting the daemon thread to refresh tokens in background for process with pid = 8
Entering Run History Context Manager.
Preparing to call script [ driver_training.py ] with arguments: []
After variable expansion, calling script [ driver_training.py ] with arguments: []

This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[1]	valid_0's auc: 0.595844
Training until validation scores don't improve for 20 rounds
[2]	valid_0's auc: 0.608461
[3]	valid_0's auc: 0.614317
[4]	valid_0's auc: 0.618862
[5]	valid_0's au

{'runId': 'driver-training_1590531244_e6526452',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2020-05-26T22:14:13.144412Z',
 'endTimeUtc': '2020-05-26T22:14:34.532386Z',
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '22d32282-5a10-4924-81d5-40764f9bb593'},
 'inputDatasets': [],
 'runDefinition': {'script': 'driver_training.py',
  'useAbsolutePath': False,
  'arguments': [],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferences': {},
  'data': {},
  'jobName': None,
  'maxRunDurationSeconds': None,
  'nodeCount': 1,
  'environment': {'name': 'Experiment driver-training Environment',
   'version': 'Autosave_2020-05-26T17:43:28Z_e5477e2b',
   'python': {'interpreterPath': 'python',
    'userManagedDependencies': False,
    'condaDependencies': {'channels': ['anaconda', 'conda-forge'],
     'dependencies': ['python=3.6.2',
      {'pip': ['azureml-defaults']},
      'scik

In [28]:
# Fetch the metrics logged for the run (recursive=True) and print one line
# per entry; the captured output shows entries keyed by run id
metrics = run.get_metrics(recursive=True)
for run_id, logged in metrics.items():
    print(run_id, logged)

driver-training_1590531244_e6526452 {'learning_rate': 0.04, 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'sub_feature': 0.7, 'num_leaves': 60, 'min_data': 100, 'verbose': 0, 'min_hessian': 1, 'auc': 0.6380025131414137}


In [20]:
# Show the raw metrics dictionary; relies on `metrics` having been assigned
# by the run.get_metrics(...) cell above
print(metrics)

SyntaxError: invalid syntax (<ipython-input-20-c5c402ee7418>, line 1)

In [29]:
# Register the model file produced by the run, attaching the logged metrics
# (as a string) in a 'metrics' tag
run.register_model(
    model_path='outputs/driver_model.pkl',
    model_name='driver_model.pkl',
    tags={'metrics': str(metrics)},
)

Model(workspace=Workspace.create(name='team5ws', subscription_id='b4f30574-19b5-4753-926d-877888e82fc4', resource_group='oh-dsdata-data'), name=driver_model.pkl, id=driver_model.pkl:10, version=10, tags={'metrics': "{'driver-training_1590531244_e6526452': {'learning_rate': 0.04, 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'sub_feature': 0.7, 'num_leaves': 60, 'min_data': 100, 'verbose': 0, 'min_hessian': 1, 'auc': 0.6380025131414137}}"}, properties={})

In [2]:
# Tag the run with the AUC it actually logged rather than a hard-coded
# number, so the tag cannot drift from the run's real result
run.tag("auc", str(run.get_metrics()["auc"]))

NameError: name 'run' is not defined

In [30]:
# Run the unit tests that live in the training folder
!pytest driver-training

platform linux -- Python 3.6.9, pytest-5.4.1, py-1.8.0, pluggy-0.13.0
rootdir: /mnt/batch/tasks/shared/LS_root/mounts/clusters/devclusterteam5/code
plugins: arraydiff-0.3, openfiles-0.4.0, doctestplus-0.4.0, remotedata-0.3.2
collected 3 items                                                              [0m[1m

driver-training/test_train.py [32m.[0m[32m.[0m[32m.[0m[33m                                        [100%][0m

/anaconda/envs/azureml_py36/lib/python3.6/site-packages/nose/importer.py:12
    from imp import find_module, load_module, acquire_lock, release_lock

