# Automated ML



In [3]:
#TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.
from azureml.core.workspace import Workspace, Dataset
from azureml.core import Experiment, Model, Webservice
from azureml.widgets import RunDetails
from azureml.train.automl import AutoMLConfig
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

## Dataset

### Overview
This is the clasical titanic dataset. The dataset can be found in kaggle [Titanic](https://www.kaggle.com/c/titanic/data).

The attributes are as below
Variable
- survival	: Survival	(0 = No, 1 = Yes)
- pclass : Ticket class	(1 = 1st, 2 = 2nd, 3 = 3rd)
- sex		
- Age : Age in years	
- sibsp	: # of siblings / spouses aboard the Titanic	
- parch	: # of parents / children aboard the Titanic	
- ticket : Ticket number	
- fare	: Passenger fare	
- cabin	: Cabin number	
- embarked	: Port of Embarkation(C = Cherbourg, Q = Queenstown, S = Southampton)

### Objective
Our objective is to build a classification model on the same

In [4]:
ws = Workspace.from_config()
experiment_name = 'automl_experiment'

experiment=Experiment(ws, experiment_name)

dataset = Dataset.get_by_name(ws, name='titanic-dataset')
dataset.to_pandas_dataframe()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
dataset = dataset.to_pandas_dataframe()
print(dataset.shape)
dataset.head(2)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [6]:
dataset['Age'].fillna(dataset['Age'].median(), inplace = True)
dataset['Embarked'].fillna(dataset['Embarked'].mode()[0], inplace = True)
dataset['Fare'].fillna(dataset['Fare'].median(), inplace = True)

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [9]:
drop_column = ['PassengerId','Cabin', 'Ticket','Name']
dataset.drop(drop_column, axis=1, inplace = True)

In [12]:
dataset['FamilySize'] = dataset ['SibSp'] + dataset['Parch'] + 1

In [14]:
#Here we encode Sex in Rank
dataset.loc[dataset['Sex'] == "female", 'Sex'] = 1
dataset.loc[dataset['Sex'] == "male", 'Sex'] = 0

In [15]:
#Here we encode Embarked in Rank
dataset.loc[dataset['Embarked'] == "C", 'Embarked'] = 2
dataset.loc[dataset['Embarked'] == "Q", 'Embarked'] = 1
dataset.loc[dataset['Embarked'] == "S", 'Embarked'] = 0

In [16]:
dataset.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize
0,0,3,0,22.0,1,0,7.25,0,2
1,1,1,1,38.0,1,0,71.2833,2,2


In [17]:
cpu_cluster_name = "cpu-cluster"

# Check if the cluster exists. If there is an error, create the cluster
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2',
                                                            max_nodes=10)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
    

cpu_cluster.wait_for_completion(show_output=True)

Creating...
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [18]:

# split data into training and test data
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(dataset,test_size=0.25)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize
703,0,3,0,25.0,0,0,7.7417,1,1
256,1,1,1,28.0,0,0,79.2,2,1
81,1,3,0,29.0,0,0,9.5,0,1
64,0,1,0,28.0,0,0,27.7208,2,1
707,1,1,0,42.0,0,0,26.2875,0,1


In [19]:

# Save training data to a csv file 
train_data.to_csv("titanic_traindata.csv")

In [20]:
data_store = ws.get_default_datastore()
data_store.upload(src_dir='./',target_path='./')
ftrain_data = Dataset.Tabular.from_delimited_files(path = [(data_store, './titanic_traindata.csv')])

Uploading an estimated of 7 files
Uploading ./.amlignore
Uploaded ./.amlignore, 1 files out of an estimated total of 7
Uploading ./.amlignore.amltmp
Uploaded ./.amlignore.amltmp, 2 files out of an estimated total of 7
Uploading ./automl.ipynb
Uploaded ./automl.ipynb, 3 files out of an estimated total of 7
Uploading ./automl.ipynb.amltmp
Uploaded ./automl.ipynb.amltmp, 4 files out of an estimated total of 7
Uploading ./hyperparameter_tuning.ipynb
Uploaded ./hyperparameter_tuning.ipynb, 5 files out of an estimated total of 7
Uploading ./titanic_traindata.csv
Uploaded ./titanic_traindata.csv, 6 files out of an estimated total of 7
Uploading ./.ipynb_aml_checkpoints/automl-checkpoint2021-4-16-17-38-13.ipynb
Uploaded ./.ipynb_aml_checkpoints/automl-checkpoint2021-4-16-17-38-13.ipynb, 7 files out of an estimated total of 7
Uploaded 7 files


In [21]:

temp = ftrain_data.drop_columns('Column1')

In [24]:
temp

{
  "source": [
    "('workspaceblobstore', './titanic_traindata.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes",
    "DropColumns"
  ]
}

## AutoML Configuration

We set the AutoML settings and configuration. Since I am using the class lab provided, I only have a limited amount of time to wait for a run and limited resources, so we set the max_concurrent_iterations to 5 and the timeout to 30 minutes.

We also define a 'classification' task, auto featurization from AutoML, our primary metric as 'accuracy', and specify the dataset and the 'Survive' column from the dataset as the label column.

In [25]:
# TODO: Put your automl settings here
automl_settings =  {
    "experiment_timeout_minutes": 30,
    "max_concurrent_iterations": 5,
    "primary_metric" : 'accuracy',
    "n_cross_validations": 3,
    "enable_early_stopping": True,
    "featurization": 'auto'
}
# TODO: Put your automl config here
automl_config = AutoMLConfig(
                    task="classification",
                    training_data=temp,
                    label_column_name="Survived",
                    compute_target=cpu_cluster,
                    **automl_settings)

In [26]:
# TODO: Submit your experiment
remote_run = experiment.submit(automl_config, show_output = True)

Submitting remote run.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
automl_experiment,AutoML_d0d8bb1f-c6d5-48c2-9de9-4acdea4dedf0,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

******************************************************************

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?



In [27]:
#TODO: In the cell below, use the `RunDetails` widget to show the different experiments.
from azureml.widgets import RunDetails
RunDetails(remote_run).show()
remote_run.wait_for_completion(show_output=True)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

Experiment,Id,Type,Status,Details Page,Docs Page
automl_experiment,AutoML_d0d8bb1f-c6d5-48c2-9de9-4acdea4dedf0,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation




****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.
              Learn more abo

{'runId': 'AutoML_d0d8bb1f-c6d5-48c2-9de9-4acdea4dedf0',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-05-16T18:01:29.331628Z',
 'endTimeUtc': '2021-05-16T18:21:05.201505Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '3',
  'target': 'cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"eec1bbcc-b815-4a1a-8e20-b7c607c485cf\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.27.0", "azureml-train": "1.27.0", "azureml-train-restclients-hyperdrive": "1.27.0", "azureml-train-core": "1.27.0", "azureml-train-automl": "1.27.0", "azureml-train-automl-runtime": "1.27.0", "azureml-train-automl-client": "1.27.0", "azu

## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [28]:
best_automl, best_fit_model = remote_run.get_output()
print(best_fit_model)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                               reg_alpha=0.20833333333333334,
                                                                                               reg_lambda=0.9375,
                                                                                               scale_pos_weight=1,
                                     

In [29]:

print(best_automl)

Run(Experiment: automl_experiment,
Id: AutoML_d0d8bb1f-c6d5-48c2-9de9-4acdea4dedf0_38,
Type: azureml.scriptrun,
Status: Completed)


In [31]:

#TODO: Save the best model
import joblib, pickle
joblib.dump(best_fit_model, 'best_fit_automl_model.pkl')

['best_fit_automl_model.pkl']

## Model Deployment

Remember you have to deploy only one of the two models you trained.. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [32]:
# Register the Model
from azureml.core.model import Model
model = Model.register(workspace = ws, model_name = 'best_fit_automl_model', model_path = 'best_fit_automl_model.pkl')
print(model.name, model.id, model.version, sep='\t')

Registering model best_fit_automl_model
best_fit_automl_model	best_fit_automl_model:1	1


TODO: In the cell below, send a request to the web service you deployed to test it.

In [34]:
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig
from azureml.core.webservice import LocalWebservice, Webservice, AciWebservice
from azureml.core.conda_dependencies import CondaDependencies
import azureml.train.automl

# Create the environment
env = best_automl.get_environment()
conda_dep = CondaDependencies()

inference_config = InferenceConfig(entry_script='score.py', environment=env)

deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=4, enable_app_insights=True)
service = Model.deploy(ws, "customerservice", [model], inference_config, deployment_config)
service.wait_for_deployment(show_output = True)

print(service.state)
print(service.scoring_uri)
print(service.swagger_uri)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-05-16 18:41:57+00:00 Creating Container Registry if not exists..
2021-05-16 18:42:08+00:00 Registering the environment..
2021-05-16 18:42:09+00:00 Use the existing image.
2021-05-16 18:42:09+00:00 Generating deployment configuration.
2021-05-16 18:42:10+00:00 Submitting deployment to compute.
2021-05-16 18:42:13+00:00 Checking the status of deployment customerservice..
2021-05-16 18:46:31+00:00 Checking the status of inference endpoint customerservice.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy
http://3d2e87c7-7675-4031-b273-8cdbf535577a.southcentralus.azurecontainer.io/score
http://3d2e87c7-7675-4031-b273-8cdbf535577a.southcentralus.azurecontainer.io/swagger.json


TODO: In the cell below, print the logs of the web service and delete the service

In [37]:
import requests
import json

# URL for the web service
scoring_uri = 'http://3d2e87c7-7675-4031-b273-8cdbf535577a.southcentralus.azurecontainer.io/score'

# Set the content type
headers = {'Content-Type': 'application/json'}

# Two sets of data to score, so we get two results back
data = {"data":
        [{
                "Pclass" :3,
                "Sex":0,
                "Age" :30.0,
                "SibSp" :0,
                "Parch":0,
                "Fare" :9.500,
                "Embarked" :2,
                "FamilySize" :1 }
        ]
        }
# Convert to JSON string
input_data = json.dumps(data)

# Make the request and display the response
resp = requests.post(scoring_uri, input_data, headers=headers)

print("Response Code : ", resp.status_code)
print("Predicted Value : ",resp.text)

Response Code :  200
Predicted Value :  [0]


In [38]:
# Web Service Logs
print(service.get_logs())

2021-05-16T18:46:23,648957400+00:00 - iot-server/run 
2021-05-16T18:46:23,649254300+00:00 - rsyslog/run 
2021-05-16T18:46:23,667598800+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_98cae94c606e3ceb655a787040a8a93c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_98cae94c606e3ceb655a787040a8a93c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_98cae94c606e3ceb655a787040a8a93c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_98cae94c606e3ceb655a787040a8a93c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
2021-05-16T18:46:23,673543300+00:00 - gunicorn/run 
/usr/sbin/nginx: /azureml-envs/azureml_98cae94c606e3ceb655a787040a8a93c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
rsyslogd

In [39]:
service.delete()