In [1]:

from azureml.core import Workspace, Experiment

# Look up the workspace by name and bind the experiment used for this notebook.
ws = Workspace.get(name="capstone-ml")
exp = Experiment(workspace=ws, name="uda-cap")

# Report the workspace identity, one attribute per line.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

# Open an interactive logging run on the experiment.
run = exp.start_logging()

Workspace name: capstone-ml
Azure region: eastus
Subscription id: b09642af-8e07-4efe-80f5-7f7e59fb2cf2
Resource group: uda-cap


In [2]:
# Rebind the workspace via Workspace.from_config() and create an experiment
# handle named after the capstone project.
experiment_name = 'capstone-ml'
ws = Workspace.from_config()
experiment = Experiment(workspace=ws, name=experiment_name)

In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute


from azureml.core.compute_target import ComputeTargetException

cluster_name = "cap-cluster"

try:
    # Reuse the cluster if one with this name is already attached to the workspace.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('I found the existing cluster, So I am using it.')
except ComputeTargetException:
    # No cluster with that name: provision a low-priority one that can scale up
    # to 4 nodes. min_nodes is not set here, so the SDK default applies.
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           vm_priority='lowpriority',
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    # min_node_count should not exceed the minimum node count the cluster was
    # provisioned with; waiting for 1 node on an autoscaled cluster that idles
    # below that only runs into the timeout. None waits for provisioning itself.
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=10)

I found the existing cluster, So I am using it.


## Import Libraries

In [4]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os
import pandas as pd
from train import clean_data
from azureml.data.dataset_factory import TabularDatasetFactory
from sklearn.model_selection import train_test_split
import logging
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## Import Dataset

In [5]:
# Location of the training CSV in blob storage.
url = 'https://cap.blob.core.windows.net/cap/train.csv'

# Build the dataset from the delimited file at that location.
ds = TabularDatasetFactory.from_delimited_files(path=url)


#### Clean the dataset with the clean_data function, imported from the training script

In [6]:
# clean_data is imported from the training script and returns features and labels.
x, y = clean_data(ds)
# Drop the first label row before splitting.
# NOTE(review): presumably this realigns y with x (clean_data appears to return
# one more label than feature rows) - confirm against train.py.
y = y.drop(y.index[0])
# Fixed seed so the train/test split is identical on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [7]:
# Preview the first five rows of the cleaned features.
x.head(n=5)

Unnamed: 0,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
2,1,0,1,1,3000,0.0,66.0,360.0,1.0,2
3,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2
4,0,0,1,0,6000,0.0,141.0,360.0,1.0,2
5,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2
6,1,0,0,0,2333,1516.0,95.0,360.0,1.0,2


In [8]:
# Put the training labels next to the training features, column-wise,
# so the label is a column of the training frame.
df_train = pd.concat(objs=[x_train, y_train], axis="columns")

In [9]:
# Preview the first five rows of the combined training frame.
df_train.head(n=5)

Unnamed: 0,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
329,0,0,1,0,2500,0.0,67.0,360.0,1.0,2,1.0
54,1,1,1,1,11500,0.0,286.0,360.0,0.0,2,0.0
230,1,1,1,0,2491,2054.0,104.0,360.0,1.0,1,1.0
274,1,2,1,0,3900,0.0,90.0,360.0,1.0,1,1.0
210,0,0,1,0,10000,0.0,214.0,360.0,1.0,1,0.0


In [10]:
# Preview the first five labels.
y.head(n=5)

Unnamed: 0,Loan_Status
2,1.0
3,1.0
4,1.0
5,1.0
6,1.0


## AutoML Configuration

#### AutoML Settings

In [32]:

# Shared AutoML options, intended to be unpacked as keyword arguments
# (**automl_settings) into an AutoMLConfig.
automl_settings = dict(
    featurization="auto",
    n_cross_validations=4,
    experiment_timeout_minutes=30,
    enable_early_stopping=True,
    verbosity=logging.INFO,
)

### AutoML Configuration

In [11]:
# Disabled earlier draft of the AutoML configuration, kept for reference only.
# NOTE(review): label_column_name=y passes the label data itself rather than a
# column name (the name of the label column in df_train is "Loan_Status"), and
# this draft optimises AUC_weighted while unpacking automl_settings.
# automl_config = AutoMLConfig(
#     task="classification",
#     training_data=df_train,
#     label_column_name=y,
#     primary_metric="AUC_weighted",
#     **automl_settings
# )

In [12]:
# Classification on the "Loan_Status" column of df_train, scored by accuracy
# with 5-fold cross-validation and a 30-minute experiment timeout.
automl_config = AutoMLConfig(
    task="classification",
    training_data=df_train,
    label_column_name="Loan_Status",
    primary_metric="accuracy",
    n_cross_validations=5,
    experiment_timeout_minutes=30,
)

## Submit Experiment

In [13]:
# Submit the AutoML configuration to the experiment and stream progress to the cell output.
automl_run = exp.submit(config=automl_config, show_output=True)
# Optional notebook widget for monitoring the run:
# RunDetails(automl_run).show()


Running on local machine
Parent Run ID: AutoML_461e7c11-28bd-464d-8215-67062724e3b1

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing f