## Loading the required libraries

In [1]:
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, quniform
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep

import os
import joblib
import pandas as pd
from pathlib import Path

## Configuring the Azure Workspace

In [5]:
from azureml.core import Workspace, Experiment

# Obtain the workspace handle via Workspace.from_config() and print its identifying fields.
ws = Workspace.from_config()
ws.get_details()
workspace_summary = (
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
)
print(*workspace_summary, sep='\n')

Workspace name: quick-starts-ws-136269
Azure region: southcentralus
Subscription id: 9b72f9e6-56c5-4c16-991b-19c652994860
Resource group: aml-quickstarts-136269


## Creating the compute cluster

In [6]:
# Name of the CPU cluster to reuse or create.
amlcompute_cluster_name = "cpu-cluster"

# Look the cluster up by name first; provision a new one only when the lookup
# raises ComputeTargetException.
try:
    aml_compute = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    aml_compute = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait on the compute target in both the reuse and the create path.
aml_compute.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Configuring the experiment

In [7]:
# Experiment that will hold the HyperDrive runs.
exp = Experiment(workspace=ws, name="udacity-hyperdrive")

# Echo the workspace the experiment is attached to.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Resource group: ' + ws.resource_group,
]))

Workspace name: quick-starts-ws-136269
Azure region: southcentralus
Resource group: aml-quickstarts-136269


## Configuring the parameters for the HyperDrive experiment

In [8]:
# Random search space for the two arguments passed to train.py:
#   --C         sampled with uniform(0.1, 1)
#   --max_iter  sampled with quniform(100, 1500, 100)
ps = RandomParameterSampling({
    '--C': uniform(0.1, 1),
    '--max_iter': quniform(100, 1500, 100),
})

# Early-termination policy applied to the sweep.
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

# Make sure ./training exists. makedirs(exist_ok=True) replaces the previous
# "listdir() then mkdir()" check with a single call that is safe to re-run.
os.makedirs("./training", exist_ok=True)

# SKLearn estimator that runs train.py from the current directory on the cluster.
# NOTE(review): the SDK warns that the 'SKLearn' estimator is deprecated and
# recommends ScriptRunConfig with an explicit environment; migrate when the
# training environment is known.
est = SKLearn(".",
              compute_target=aml_compute,
              entry_script="train.py")

# HyperDrive sweep: maximise the 'Accuracy' metric over at most 25 runs,
# 4 of them concurrently.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=25,
    max_concurrent_runs=4,
)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [9]:
# Launch the HyperDrive sweep inside the experiment and keep a handle on the parent run.
run_hyperdrive = exp.submit(config=hyperdrive_config)



In [10]:
# Render the run-details widget for the HyperDrive parent run.
hyperdrive_widget = RunDetails(run_hyperdrive)
hyperdrive_widget.show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [13]:
# Pick the child run that scored best on the primary metric, show the
# arguments it was launched with, then list the files attached to it.
best_run = run_hyperdrive.get_best_run_by_primary_metric()
best_run_details = best_run.get_details()
print(best_run_details['runDefinition']['arguments'])
best_run.get_file_names()

['--C', '0.10679530048273365', '--max_iter', '1100']


['azureml-logs/55_azureml-execution-tvmps_f615dbca32e9bcf1081ef3d1eeb03971e13a20998c788ecc81ded120309bb99d_d.txt',
 'azureml-logs/65_job_prep-tvmps_f615dbca32e9bcf1081ef3d1eeb03971e13a20998c788ecc81ded120309bb99d_d.txt',
 'azureml-logs/70_driver_log.txt',
 'azureml-logs/75_job_post-tvmps_f615dbca32e9bcf1081ef3d1eeb03971e13a20998c788ecc81ded120309bb99d_d.txt',
 'azureml-logs/process_info.json',
 'azureml-logs/process_status.json',
 'logs/azureml/100_azureml.log',
 'logs/azureml/job_prep_azureml.log',
 'logs/azureml/job_release_azureml.log']

In [14]:
# NOTE(review): the first argument is the string literal 'best_model', so this
# writes a file named "best_model" containing that pickled string. No fitted
# estimator from best_run is saved by this line. Presumably the trained model
# was meant to be persisted here; confirm which run artifact holds it (TODO).
joblib.dump('best_model', filename="best_model")

['best_model']

In [15]:
# Register the best run's files (model_path='.') under the model name 'shivam',
# then download the registered model into the local "outputs" directory.
best_model = best_run.register_model(
    model_name='shivam',
    model_path='.',
)
best_model.download(target_dir="outputs", exist_ok=True)

'outputs'

In [17]:
def clean_data(data):
    """Convert a bank-marketing dataset into a feature frame and a binary label.

    Parameters
    ----------
    data : object exposing ``to_pandas_dataframe()``
        Source dataset. Rows containing missing values are dropped.

    Returns
    -------
    tuple
        ``(x_df, y_df)``. In ``x_df`` the ``job``, ``contact`` and ``education``
        columns are replaced by one-hot columns appended on the right (in that
        order); ``marital``, ``default``, ``housing``, ``loan`` and ``poutcome``
        become 0/1 flags; ``month`` and ``day_of_week`` are mapped to numbers
        through lookup tables. ``y_df`` is the ``y`` column encoded as 1 for
        "yes" and 0 otherwise.
    """
    # Lookup tables: three-letter abbreviations -> 1-based ordinal.
    month_numbers = {
        name: number
        for number, name in enumerate(
            ("jan", "feb", "mar", "apr", "may", "jun",
             "jul", "aug", "sep", "oct", "nov", "dec"),
            start=1,
        )
    }
    weekday_numbers = {
        name: number
        for number, name in enumerate(
            ("mon", "tue", "wed", "thu", "fri", "sat", "sun"),
            start=1,
        )
    }

    def as_flag(series, positive):
        # 1 where the value equals `positive`, 0 everywhere else.
        return series.apply(lambda s: 1 if s == positive else 0)

    features = data.to_pandas_dataframe().dropna()

    # One-hot encode the categorical columns; each set of indicator columns is
    # appended after the existing ones, so the loop order fixes the column order.
    for column in ("job", "contact", "education"):
        indicators = pd.get_dummies(features[column], prefix=column)
        features = features.drop(columns=column).join(indicators)

    # Collapse the binary-like text columns to 0/1 in place.
    flag_rules = (
        ("marital", "married"),
        ("default", "yes"),
        ("housing", "yes"),
        ("loan", "yes"),
        ("poutcome", "success"),
    )
    for column, positive in flag_rules:
        features[column] = as_flag(features[column], positive)

    features["month"] = features["month"].map(month_numbers)
    features["day_of_week"] = features["day_of_week"].map(weekday_numbers)

    # Split the label off the feature frame.
    target = as_flag(features.pop("y"), "yes")
    return features, target

from azureml.data.dataset_factory import TabularDatasetFactory
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


# Score the downloaded HyperDrive model on the bank-marketing test CSV.
factory = TabularDatasetFactory()
test_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_test.csv"
test_ds = factory.from_delimited_files(test_data_path)
X_test, y_test = clean_data(test_ds)

# NOTE(review): joblib.load unpickles the file; only load artifacts you trust.
logit_model = joblib.load('outputs/bankmarketing-logit-model.joblib')

# Predict once and reuse the result for both reports (previously predict()
# was called separately for each report on the same input).
y_pred = logit_model.predict(X_test)

print(logit_model.score(X_test, y_test))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.9111650485436893
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      3636
           1       0.71      0.41      0.52       484

    accuracy                           0.91      4120
   macro avg       0.82      0.69      0.73      4120
weighted avg       0.90      0.91      0.90      4120

[[3557   79]
 [ 287  197]]


The sklearn.linear_model.logistic module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.linear_model. Anything that cannot be imported from sklearn.linear_model is now part of the private API.
Trying to unpickle estimator LogisticRegression from version 0.20.3 when using version 0.22.2.post1. This might lead to breaking code or invalid results. Use at your own risk.


## AutoML Experiment

In [18]:
# Separate experiment that will hold the AutoML runs.
exp_automl = Experiment(workspace=ws, name="udacity-automl")

# Echo the workspace the experiment is attached to.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Resource group: ' + ws.resource_group,
]))

Workspace name: quick-starts-ws-136269
Azure region: southcentralus
Resource group: aml-quickstarts-136269


In [19]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Build tabular datasets for the train / validate / test CSVs published under
# the automl-sample-notebook-data blob container.
datastore = ws.get_default_datastore()
factory = TabularDatasetFactory()

data_path_train = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
data_path_valid = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv"
data_path_test = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_test.csv"

# One dataset per split, created in train, validate, test order.
ds_train, ds_valid, ds_test = (
    factory.from_delimited_files(split_path)
    for split_path in (data_path_train, data_path_valid, data_path_test)
)

In [35]:
import logging
from azureml.train.automl import AutoMLConfig

# Name of the target column in the training data.
label = "y"

# Keyword settings forwarded to AutoMLConfig below.
automl_settings = dict(
    n_cross_validations=3,
    primary_metric='accuracy',
    enable_early_stopping=True,
    experiment_timeout_hours=1.0,
    max_concurrent_iterations=4,
    max_cores_per_iteration=-1,
    verbosity=logging.INFO,
)

# Classification task trained on ds_train using the compute cluster.
automl_config = AutoMLConfig(
    task='classification',
    compute_target=aml_compute,
    training_data=ds_train,
    label_column_name=label,
    **automl_settings
)


In [36]:
# Launch the AutoML run in its experiment without streaming its output here.
remote_run = exp_automl.submit(automl_config, show_output=False)

Running on remote.


In [37]:
# Render the run-details widget for the AutoML run.
automl_widget = RunDetails(remote_run)
automl_widget.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [40]:
# get_output() yields a pair: the best child run and its fitted model.
automl_output = remote_run.get_output()
best_run_aml, fitted_model_aml = automl_output

# The model name is read from the best run's properties.
model_name = best_run_aml.properties['model_name']

In [41]:
# (file name in the run, local destination) pairs to fetch from the best AutoML run.
artifact_downloads = (
    ('outputs/model.pkl', 'outputs/bankmarketing-aml-model.pkl'),
    ('outputs/scoring_file_v_1_0_0.py', 'outputs/score_aml.py'),
    ('automl_driver.py', 'outputs/automl_driver.py'),
)
for run_file, local_file in artifact_downloads:
    best_run_aml.download_file(run_file, local_file)

In [42]:
import pickle
# Load the downloaded AutoML model from disk. The context manager closes the
# handle even if unpickling raises (the manual open()/close() pair did not).
# NOTE(review): pickle.load can execute arbitrary code; only unpickle files
# that come from a trusted source.
with open("outputs/bankmarketing-aml-model.pkl", 'rb') as file:
    aml_model = pickle.load(file)

In [44]:
from azureml.data.dataset_factory import TabularDatasetFactory
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Read the test CSV straight into pandas and keep the label column aside
# as a single-column frame.
factory = TabularDatasetFactory()
test_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_test.csv"
test_ds = pd.read_csv(filepath_or_buffer=test_data_path)
y_test = test_ds[['y']]

In [45]:
# Build the feature frame and predict once; all three reports reuse the same
# predictions (previously the frame was rebuilt and predict() re-run per report).
X_test_aml = test_ds.drop(columns=['y'])
y_pred_aml = aml_model.predict(X_test_aml)

# Arguments follow scikit-learn's (y_true, y_pred) order. The original call had
# them swapped, which happens to give the same accuracy value.
print(accuracy_score(y_test, y_pred_aml))
print(classification_report(y_test, y_pred_aml))
print(confusion_matrix(y_test, y_pred_aml))

0.9169902912621359
              precision    recall  f1-score   support

          no       0.94      0.97      0.95      3636
         yes       0.68      0.55      0.61       484

    accuracy                           0.92      4120
   macro avg       0.81      0.76      0.78      4120
weighted avg       0.91      0.92      0.91      4120

[[3510  126]
 [ 216  268]]


## Cleaning the allocated resources

In [46]:
# Tear down the compute cluster; report when the target is reported as missing.
try:
    aml_compute.delete()
except ComputeTargetException:
    print('Computetarget not found')
else:
    print('Computetarget deleted')

Computetarget deleted
Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

