# Coronary Heart Disease Prediction: Model Training MLOps Script

Section 8 in this notebook will be leveraged as a .py script in the DevOps pipeline.

### 1. Declare variables

In [None]:
# Azure subscription that hosts the resources (replace the placeholder before running)
subscription_id = "<enter this>"

# Resource group
resource_group = "mlops-rg"

# Name and Azure region of the Azure Machine Learning workspace
workspace_name = "mlops-aml-ws"
workspace_region = "eastus2"

# Other variables
experiment_name = 'chd-prediction'
project_dir = './chd'  # directory the training script is written to and submitted from
deployment_dir = './deploy'
model_name = 'chd-predictor'
# Attached as the "description" tag when the model is registered
model_description = 'Model to predict coronary heart disease'

# AML managed compute to be spun up for training
vm_name = "chd-temp-compute"

### 2. Load necessary packages

In [None]:
import os
import logging

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.compute import ComputeTarget
from azureml.core.model import Model
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.run import AutoMLRun
from azureml.core import Workspace
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.widgets import RunDetails

### 3. Azure Machine Learning Workspace configuration

In [None]:
# Get a handle on the AML workspace: reuse it when present, create it otherwise
workspace_settings = {
    "name": workspace_name,
    "subscription_id": subscription_id,
    "resource_group": resource_group,
    "location": workspace_region,
    "exist_ok": True,  # leverage the existing workspace instead of failing
}
ws = Workspace.create(**workspace_settings)

# Write out the workspace configuration
ws.write_config()
print('Workspace configuration succeeded')

### 4. Experiment configuration

In [None]:
# Experiment handle in the AML workspace; the training run is submitted under it
experiment = Experiment(workspace=ws, name=experiment_name)

### 5. Project directory configuration

In [None]:
# Create the project directory that will hold the training script.
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(project_dir, exist_ok=True)

### 6. Azure Machine Learning Managed Compute configuration & provisioning

In [None]:
# Reuse the AML managed compute when it already exists, otherwise provision it
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

try:
    compute_target = ComputeTarget(workspace=ws, name=vm_name)
except ComputeTargetException:
    # Lookup by name failed, so build the compute target from scratch
    print('Creating a new compute target...')
    # min_nodes == max_nodes == 1 pins the cluster at exactly one node
    provisioning_settings = {
        'vm_size': 'STANDARD_D12_V2',
        'min_nodes': 1,
        'max_nodes': 1,
    }
    compute_config = AmlCompute.provisioning_configuration(**provisioning_settings)

    # Create the cluster and wait for provisioning, showing its output
    compute_target = ComputeTarget.create(ws, vm_name, compute_config)
    compute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target.')

### 7. Environment configuration

In [None]:
# Docker-based environment used to execute the training script
training_venv = Environment("training_venv")

training_venv.docker.enabled = True
# The training script imports pandas and numpy directly in addition to
# scikit-learn, so declare all three explicitly rather than relying on them
# being present in the base image or pulled in transitively.
training_venv.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['scikit-learn', 'pandas', 'numpy'])

### 8. Create training script

In [None]:
%%writefile $project_dir/train.py

# Training script for the coronary heart disease (CHD) classifier.
# Steps: read the dataset, fill in missing values, fit a shallow random
# forest to pick the important features, train the final random forest on
# those features only, and pickle it under ./outputs/model.

# Load necessary packages
import pandas as pd
import numpy as np
import pickle
import os

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Read training dataset into pandas dataframe
dataset_url = ('https://mlopssa.blob.core.windows.net/chd-dataset/framingham.csv')
df = pd.read_csv(dataset_url)

# Create a boolean mask of current smokers
smoke = (df['currentSmoker']==1)
# Replace NaNs in cigsPerDay with the mean computed over smokers only
df.loc[smoke,'cigsPerDay'] = df.loc[smoke,'cigsPerDay'].fillna(df.loc[smoke,'cigsPerDay'].mean())

# Fill out missing values.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form operates on an intermediate object and
# does not update df under pandas copy-on-write.
df['BPMeds'] = df['BPMeds'].fillna(0)
df['glucose'] = df['glucose'].fillna(df['glucose'].mean())
df['totChol'] = df['totChol'].fillna(df['totChol'].mean())
df['education'] = df['education'].fillna(1)
df['BMI'] = df['BMI'].fillna(df['BMI'].mean())
df['heartRate'] = df['heartRate'].fillna(df['heartRate'].mean())

# Features and label
features = df.iloc[:,:-1]
result = df.iloc[:,-1] # the last column is what we are about to forecast

# Train & Test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(features, result, test_size = 0.2, random_state = 14)

# Shallow RandomForest classifier, used only to rank the features
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_train, y_train)

# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.12
sfm = SelectFromModel(clf, threshold=0.12)

# Train the selector
sfm.fit(X_train, y_train)

# Features selected
feat_labels = list(features.columns.values) # creating a list with features' names
for feature_list_index in sfm.get_support(indices=True):
    print(feat_labels[feature_list_index])

# Feature importance, listed from most to least important
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for rank, feature_index in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, feature_index, importances[feature_index]))

# With only important features. Can check X_important_train.shape[1]
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

# Final model, trained on the selected features only
clf_important = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
clf_important.fit(X_important_train, y_train)

# Save the model to disk
os.makedirs('./outputs/model', exist_ok=True)

filename = './outputs/model/chd-rf-model'
# Use a context manager so the file is flushed and closed before the script exits
with open(filename, 'wb') as model_file:
    pickle.dump(clf_important, model_file)
print("model saved in ././outputs/model/chd-rf-model folder")
print("Saving model completed")

### 9. Start the training experiment using compute and script from earlier

In [None]:
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

# Describe the run: execute train.py out of the project directory
src = ScriptRunConfig(script='train.py', source_directory=project_dir)

# Point the run at the training environment and the compute target set up earlier
run_settings = src.run_config
run_settings.environment = training_venv
run_settings.target = compute_target.name

# Submit the run to the experiment; the trailing expression displays it in the notebook
run = experiment.submit(config=src)
run

### 10. Poll for training experiment completion

In [None]:
%%time
# Wait for the submitted training run to complete, showing its output on stdout
run.wait_for_completion(show_output=True)

### 11. Training experiment status

In [None]:
# Show the azureml.widgets RunDetails widget for the training run
RunDetails(run).show()

### 12. Check for success; Register model to model registry

In [None]:
# Register the trained model only when the run completed; otherwise stop with
# an error so a pipeline executing this step does not carry on after a failure.
run_status = run.get_status()
if run_status == 'Completed':
    print("Training completed successfully!")
    model_run = run.register_model(
        model_name=model_name,
        # Path of the pickled model relative to the run's root, i.e. the file
        # train.py wrote to ./outputs/model/chd-rf-model
        model_path="outputs/model/chd-rf-model",
        tags={"type": "classification", "description": model_description, "run_id": run.id})
    print("Model registered with version number: ", model_run.version)
else:
    print("Training failed!")
    # Raise (not just construct) the error so the failure is not silently ignored
    raise RuntimeError("Training failed! Run status: {}".format(run_status))
    